# Presumably the config format version — confirm against the consuming code.
version: 2

# Annotation files. train_file is a list of paths; val_file and test_file are
# single paths.
# NOTE(review): the '//' inside the train_file path is preserved exactly as
# found. The other paths use a single '/'; confirm whether it is a typo.
# NOTE(review): val_file and test_file point at the same JSON file — confirm
# this is intentional.
train_file:
  - './storage/10//downstream-finetuning-json/flickr30k_train.json'
val_file: './storage/10/downstream-finetuning-json/flickr30k_test.json'
test_file: './storage/10/downstream-finetuning-json/flickr30k_test.json'

# Image directory and output location (judging by the key names, save_sims is
# where similarity outputs are written — confirm).
image_root: './storage/10/flickr30k/'
save_sims: './storage/10/albef-sims'

# Model and evaluation hyperparameters.
# NOTE(review): the exact meaning and units of each key are defined by the
# consuming code, which is not visible from this file; confirm there before
# changing any value.
image_res: 256  # 384 is the alternative value noted by the original author
embed_dim: 256
temp: 0.07
batch_size_train: 32
batch_size_test: 4
k_test: 128

alpha: 0.4

# Optimizer settings.
# Scientific-notation floats are written with an explicit decimal point
# (1.0e-5, not 1e-5): YAML 1.1 loaders such as PyYAML resolve a bare `1e-5`
# to the string "1e-5" instead of a float, while `1.0e-5` is a float under
# both YAML 1.1 and YAML 1.2 loaders.
optimizer:
  opt: adamW
  lr: 1.0e-5
  weight_decay: 0.02

# Learning-rate schedule settings.
# The key spelling `schedular` is kept unchanged because the consuming code
# presumably looks it up by this exact name — do not "fix" it here alone.
schedular:
  sched: cosine
  lr: 1.0e-5
  epochs: 10
  min_lr: 1.0e-6
  decay_rate: 1
  warmup_lr: 1.0e-5
  warmup_epochs: 1
  cooldown_epochs: 0

# Encoder selection and adapter placement.
# NOTE(review): `base` presumably selects a base-size encoder variant —
# confirm against the consuming code.
text_encoder: base
vision_encoder: base
adapter_append: false

# Dotted path to the model class.
# NOTE(review): presumably imported dynamically by the loader — confirm.
model_config:
  import_path: models.clip_adjustable_cross.CLIP

# Training-time options. Per the original author's note, these have no effect
# during evaluation, but the keys must still be present because the consuming
# code requires them.
pretrained_text: true
pretrained_vision: true
freeze_text_encoder: false
freeze_vision_encoder: false
unlock_layernorm: false
limit_num_samples: false
add_adapter: false
freeze_proj: false
unlock_dense: false
unlock_attn: false
unlock_random: false
bitfit: false
# Both lists are empty: no entries are listed for either encoder.
always_freeze:
  visual_encoder: []
  text_encoder: []

# The settings below should match how the model was trained; otherwise the
# evaluation results will not make sense.
# NOTE(review): this file does not say exactly which keys the note covers —
# presumably everything from here to the end of the file; confirm.

add_cross_adapter: true
fp16: true


# Conventional adapter options; `insert: false` leaves it switched off here.
conventional_adapter:
  insert: false
  reduction_factor: 4