## Dataset
# Annotation files per split. train_file is a list (one entry here);
# val_file and test_file are single paths.
train_file:
  - 'data/finetune/flickr30k_train.json'
val_file: 'data/finetune/flickr30k_val.json'
test_file: 'data/finetune/flickr30k_test.json'
# Presumably the directory that image paths in the annotation files are
# resolved against — confirm with the dataset loader.
image_root: 'images/'


## Vision Encoder
# Path to the vision encoder's JSON config file.
vision_config: 'models/xvlm_model/config_swinB_384.json'

# CLIP-ViT flag is off. The commented-out values below look like the
# settings that would accompany it — TODO confirm before enabling.
use_clip_vit: false
# image_res: 384
# patch_size: 16

# Swin flag is on; image_res / patch_size below are the active values.
use_swin: true
image_res: 384
patch_size: 32


## Text Encoder
use_roberta: false
# Text encoder JSON config. Alternatives noted by the original author:
#   ['configs/config_bert.json', 'configs/config_roberta.json']
text_config: 'models/xvlm_model/config_bert.json'
# Text encoder name or path. Alternatives noted by the original author:
#   ['data/bert-base-uncased', 'data/roberta-base']
text_encoder: 'bert-base-uncased'


## Training
# Batch sizes. batch_size_test_text is presumably the batch size used when
# encoding text alone at test time — confirm against the evaluation code.
batch_size_train: 20
batch_size_test: 12
batch_size_test_text: 64

# Model / loss hyperparameters (meanings inferred from the key names —
# confirm): token length cap, embedding size, and a temperature value.
max_tokens: 40
embed_dim: 256
temp: 0.07

# Presumably the number of top-k candidates considered at evaluation —
# confirm against the evaluation code.
k_test: 128


## Other Settings
# Optimizer settings.
# lr is written with an explicit mantissa dot: YAML 1.1 loaders such as
# PyYAML resolve a bare `3e-5` as the string '3e-5' rather than a float.
optimizer:
  opt: adamW
  lr: 3.0e-5
  weight_decay: 0.01
  lr_mult: 2

# NOTE(review): the key is spelled `schedular`; it is kept as-is because the
# consuming code presumably looks it up under this exact name.
schedular:
  sched: linear
  lr: 3.0e-5
  epochs: 10
  # Fractional value — presumably a ratio of total training steps rather
  # than an absolute step count; confirm against the scheduler code.
  num_warmup_steps: 0.1







