# Hydra runtime settings: output directory, working-directory handling, and job log format.
hydra:
  run:
    # Output directory for a single (non-multirun) run.
    # NOTE(review): the path says "Aircraft" while `dataset` further down is "cifar100" — confirm this is intended.
    dir: ./experiments/Aircraft
  job:
    # Change the process working directory to the run directory at startup,
    # so relative paths such as `log_path` presumably resolve inside it — verify.
    chdir: true

  # Logging configuration for the job (Python logging dictConfig schema, version 1).
  job_logging:
    version: 1
    formatters:
      simple:
        # Emit only the message text (no timestamp, level, or logger name).
        format: '%(message)s'

# General run settings.
# Empty-string values are unset placeholders (presumably filled in by command-line overrides — confirm).
class_order: ""
dataset_root: ""
workdir: ""
log_path: "metrics.json"
model_name: "ViT-B/16"
prompt_template: "a bad photo of a {}."  # `{}` is the substitution slot (presumably the class name — confirm)
zero_shot_eval: true
pre_task_zero_shot_eval: false
# Indices of MTIL datasets (0..24) to evaluate in zero-shot; excludes any selected as downstream train_dataset.
zs_mtil_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
# zs_mtil_indices: [1]  # alternative: evaluate only MTIL index 1


# Batch size, class-increment sizes, scenario, and dataset.
batch_size: 128
# OmegaConf interpolation: resolves to the value of `initial_increment` unless overridden.
increment: ${initial_increment}
initial_increment: 10  # NOTE(review): presumably the number of classes in the first task — confirm
scenario: "class"
dataset: "cifar100"

# method: "lwf"
# lr: 7.1e-6
# ls: 0.2
# we: 
# avg_freq:  
# ref_dataset:
# ref_sentences: 

# Regularisation / loss-variant switches.
weight_decay: 0.0
l2: 0
ce_method: 0

# Active method and its optimiser settings.
method: "MoE-Adapters"
# Written with an explicit decimal point in the mantissa: YAML 1.1 loaders such as
# plain PyYAML read `1e-3` as the string "1e-3" rather than a float.
lr: 1.0e-3
ls: 0.0  # NOTE(review): presumably label smoothing — confirm
#we:
#avg_freq:
#ref_dataset:
#ref_sentences: random

# ---------------------------------------------------------------
# DFA (two-expert + router) hyperparameters
# ---------------------------------------------------------------

# Gate mode. Choices: 'ood_confidence' (default), 'router' (trainable),
# or 'debug' (manual weights).
gate_mode: router

# Manually specified weights; only used when gate_mode=debug.
# Pair layout is [w1, w2]. This pair is for seen/CIL evaluation
# (w2 larger for in-distribution).
debug_seen_weights: [0, 1]
# 25x2 matrix, one [w1, w2] row per MTIL index 0..24.
# Downstream CIL dataset(s) will be skipped automatically in ZS.
debug_zs_weights:
  - [0, 1]  # 0  FGVCAircraft
  - [0, 1]  # 1  Caltech101
  - [0, 1]  # 2  CIFAR100
  - [0, 1]  # 3  DescribableTextures
  - [0, 1]  # 4  EuroSAT
  - [0, 1]  # 5  OxfordFlowers
  - [0, 1]  # 6  Food101
  - [0, 1]  # 7  MNIST
  - [0, 1]  # 8  OxfordPets
  - [0, 1]  # 9  StanfordCars
  - [0, 1]  # 10 SUN397
  - [0, 1]  # 11 Country211
  - [0, 1]  # 12 SST2
  - [0, 1]  # 13 HatefulMemes
  - [0, 1]  # 14 GTSRB
  - [0, 1]  # 15 RESISC45
  - [0, 1]  # 16 FER2013
  - [0, 1]  # 17 UCF101
  - [0, 1]  # 18 CIFAR10
  - [0, 1]  # 19 STL10
  - [0, 1]  # 20 VOC2007
  - [0, 1]  # 21 ImageNetR
  - [0, 1]  # 22 KittiDistance
  - [0, 1]  # 23 PCam
  - [0, 1]  # 24 CLEVRCount

# Architecture
dfa_r1: 16              # Expert1 bottleneck dimension
dfa_r2: 8               # Expert2 bottleneck dimension
dfa_router_hidden: 0    # Router hidden size (0 = no hidden layer)
dfa_beta: 1.0           # Fusion scale beta
# NOTE(review): a similarly named key, `block_single_router_3way`, is also defined
# further down in this file — confirm which one the code actually reads.
single_router_3way: false
# NOTE(review): the set of accepted values is not documented here; presumably selects
# where the adapters are injected — confirm.
dfa_inject_mode: block

# Training schedule
epochs_a: 1             # epochs for Stage A (Expert1 contrastive); used if skip_stage_a=false
epochs_b: 1             # epochs for Stage B (Expert2 + Router)
use_amp: true           # enable AMP in training (Stage A/B); evaluation AMP is set separately by `eval_use_amp`
e2_router_mode: moe_w   # task expert router mode ('moe_w' or 'mlp')
# NOTE(review): `num_task_experts` is also 2 in this file, so top-2 presumably selects every task expert — confirm.
e2_top_k: 2             # top-k for E2 router (2 matches moe-w behavior)

# Stage A (InfoNCE)
skip_stage_a: false     # if true, skip Expert1 training (keep at initialization for ablation)
# lr_e1: 1.0e-5           # learning rate for Expert1 (disabled; presumably superseded by the lr_e1_* keys below — confirm)

# NOTE(review): presumably the Expert1 learning rates for the initial, subsequent, and final tasks — confirm against the trainer.
lr_e1_ini: 1.0e-5
lr_e1_next: 1.0e-5
lr_e1_final: 5.0e-6
tau_con: 0.15           # contrastive temperature for InfoNCE
use_moco_queue: true    # enable momentum queue for more negatives
moco_queue_size: 128    # queue size (the stated default is 4096; a larger queue provides more negative samples)
moco_momentum: 0.999    # momentum coefficient for queue update

# Generic prompt templates; `{}` is the substitution slot in each entry.
# NOTE(review): presumably used for Expert1 / Stage A (per the `e1_` prefix) with the class name substituted — confirm.
# NOTE(review): unlike `prompt_template` earlier in this file, none of these end with a period — confirm this is intended.
e1_generic_templates:
  - "a photo of a {}"
  - "a clean photo of a {}"
  - "a bad photo of the {}"
  - "a professional photo of a {}"
  - "a studio shot of a {}"
  # NOTE(review): probably missing "in" ("a photo of a {} in a realistic setting"); left unchanged because editing it alters the prompt text.
  - "a photo of a {} a realistic setting"
  - "an image of a {}"
  - "a photo of a {} in a photo book"
  - "an image of a {} by a professional photographer"
  # NOTE(review): "using in an experiment" reads like a typo — confirm the intended wording before changing.
  - "a photo of a {} using in an experiment"
  - "a photo of a {} by a camera"
  - "a photo of a nice {}"
  - "a photo of the nice {}"
  - "a photo of many {}"
  - "a photo of the cool {}"
  - "a photo of the {}"
  - "art of the {}"
  - "a blurry photo of the {}"
  - "a photo of the {} in a photo book"
  - "a pixelated photo of the {}"

# Stage B (Task expert + Router)
lr_e2: 5.0e-4           # learning rate for Expert2 + Router
lambda_kd: 0.0          # KD weight (0.0 here, so the KD term presumably contributes nothing — confirm)
tau_kd: 2.0             # KD temperature
lambda_reg: 0.0         # L2 regularization on Expert2
adapter_dropout: 0.1    # adapter dropout, aligned with moe-w
adapter_scalar: 0.1     # residual adapter scalar, aligned with moe-w

# NOTE(review): presumably an optional separate router-only phase ("B2"); if so, the epochs/lr
# below only matter when split_b2_router is true — confirm.
split_b2_router: false
epochs_b2_router: 1
lr_b2_router: 5.0e-5

# Lightweight replay buffer (few exemplars per class)
replay_per_class: 0     # max exemplars per abs class (0 presumably disables replay — confirm)
replay_batch: 0         # replay batch size per step in Stage B (KD only)

# Router behavior statistics
router_stats_samples: 50      # samples per group (seen + each ZS dataset)
router_hist_bins: 10          # histogram bins for w1/w2

# Evaluation
eval_use_amp: true            # AMP during evaluation (training AMP is set separately by `use_amp`)


# Downstream dataset index mapping for train_one_dataset (MTIL_order_2=false):
# 0: Aircraft
# 1: Caltech101
# 2: CIFAR100
# 3: DTD
# 4: EuroSAT
# 5: OxfordFlowers
# 6: Food101
# 7: MNIST
# 8: OxfordPets
# 9: StanfordCars
# 10: SUN397
# 11: Country211
# 12: SST2
# 13: Hatefulmemes
# 14: GTSRB
# 15: RESISC45
# 16: FER2013
# 17: UCF101
# 18: Cifar10
# 19: STL10
# 20: VOC2007
# 21: ImagenetR
# 22: KittiDistance
# 23: PCam
# 24: CLEVRCount

 
# Text-side DFA settings and router learning rates.
# text_dfa_last_k: 2
# NOTE(review): presumably applies DFA to the whole text encoder rather than only the
# last k layers (see the commented-out key above) — confirm.
text_dfa_full: true
text_dfa_r2: 8          # same value as `dfa_r2`

text_lr_e2: 1.0e-5

joint_train_routers: true

lr_e2_router: 1.0e-3
lr_top_router: 1.0e-5

text_dfa_r1: 16         # same value as `dfa_r1`

# NOTE(review): near-duplicate of `single_router_3way` defined earlier in this file —
# confirm which key the code actually reads.
block_single_router_3way: false

moco_post_enqueue_mode: fifo


# NOTE(review): with the weight at 0, the three queue_logits_* settings below
# presumably have no effect — confirm.
lambda_queue_logits: 0
queue_logits_sample: 128
queue_logits_tau: 2.0
queue_logits_on_text: false

# NOTE(review): by analogy with `tau_con`, presumably the weight and temperature of a
# contrastive term in Stage B — confirm.
lambda_b_con: 0.001
tau_b_con: 0.19

unified_stage: false

num_task_experts: 2     # same value as `e2_top_k` earlier in this file

seed: 43                # NOTE(review): presumably the global RNG seed — confirm which libraries it seeds