# Dataset locations. The path values look like placeholders
# ('datapath_to_...'); replace them with real local paths before use.
vqa_root: 'datapath_to_coco/coco/'
vg_root: 'datapath_to_vg'
# Names of the training splits to load.
# NOTE(review): presumably resolved relative to ann_root — confirm in loader.
train_files:
  - 'vqa_train'
  - 'vqa_val'
  - 'vg_qa'
ann_root: 'annotation'

dataset: 'vqa'

# NOTE(review): presumably the checkpoint the model is initialised from —
# confirm against the training script.
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_vqa.pth'


# Size of the ViT model: 'base' or 'large'.
vit: 'base'
batch_size_train: 16
batch_size_test: 32
# NOTE(review): presumably toggles gradient checkpointing in the ViT, with
# vit_ckpt_layer selecting where it applies — confirm against the model code.
vit_grad_ckpt: false
vit_ckpt_layer: 0
# Initial learning rate. Written with an explicit mantissa dot on purpose:
# YAML 1.1 loaders (e.g. PyYAML) resolve a bare `1e-4` as the *string*
# '1e-4', whereas `1.0e-4` is a float under both YAML 1.1 and 1.2.
init_lr: 1.0e-4

image_size: 384

# Input modality selected for this config.
modality: 'image'

# Video-related switches (booleans); every one is disabled in this config.
video_semantic_adapter: false
video_key_adapter: false
video_key_adapter_dim: 64
temporal_embed: false
temporal_attention: false

# Fine-tuning scope flags; both are off here.
# NOTE(review): how the trainer behaves when both are false is not visible
# from this file — confirm against the training script.
finetune_head: false
finetune_all: false

# Inference settings.
# NOTE(review): k_test is presumably the number of candidates considered
# when inference is 'rank' — confirm against the evaluation code.
k_test: 128
inference: 'rank'

# Optimizer regularisation and schedule bounds.
weight_decay: 0.05
min_lr: 0
max_epoch: 5
