vqa_root: 'dataset/train2014' # VQA image root; image paths in the annotations (e.g. train2014/...) are appended to this
vg_root: 'dataset/images'  # Visual Genome image root; image paths in the annotations (e.g. image/...) are appended to this
#train_files: ['vqa_train','vqa_val','vg_qa']
train_files: ['vqa_train']
ann_root: 'annotation'
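# A minimal sketch of loading this file (assumes PyYAML; the path
# 'configs/vqa.yaml' is hypothetical):
#   import yaml
#   with open('configs/vqa.yaml') as f:
#       config = yaml.safe_load(f)
#   print(config['train_files'])  # -> ['vqa_train']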

dataset: 'vqa'
# set 'pretrained' to a local file path or a URL
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth'
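# A hedged sketch of resolving this field (assumes PyTorch; 'config' comes from the
# loading sketch above, and the {'model': state_dict} checkpoint layout is an
# assumption, not confirmed by this file):
#   import os, torch
#   src = config['pretrained']
#   if os.path.isfile(src):
#       ckpt = torch.load(src, map_location='cpu')
#   else:
#       ckpt = torch.hub.load_state_dict_from_url(src, map_location='cpu')
#   state_dict = ckpt.get('model', ckpt)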


# size of the ViT backbone: 'base' or 'large'
vit: 'base'
batch_size_train: 4
batch_size_test: 4
vit_grad_ckpt: False
vit_ckpt_layer: 0
init_lr: 2.0e-5

image_size: 224
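# A sketch of the resizing implied by image_size (torchvision assumed; the exact
# training-time augmentation is not specified here):
#   from torchvision import transforms
#   preprocess = transforms.Compose([
#       transforms.Resize((config['image_size'], config['image_size'])),
#       transforms.ToTensor(),
#   ])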


modality: image  # 'image' or 'video'

# adapter
adapter: True
adapter_textual: True
adapter_visual: True
adapter_multimodal: True

adapter_textual_dim: 128
adapter_visual_dim: 128
adapter_multi_modal_dim: 128
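# A minimal bottleneck-adapter sketch; the down-project / nonlinearity / up-project
# + residual design is an assumption about what the adapter_*_dim fields size:
#   import torch.nn as nn
#   class Adapter(nn.Module):
#       def __init__(self, d_model, bottleneck=128):  # bottleneck = adapter_*_dim
#           super().__init__()
#           self.down = nn.Linear(d_model, bottleneck)
#           self.up = nn.Linear(bottleneck, d_model)
#           self.act = nn.GELU()
#       def forward(self, x):
#           return x + self.up(self.act(self.down(x)))  # residual connection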

video_semantic_adapter: False  # True or False
video_key_adapter: False
video_key_adapter_dim: 64

temporal_embed: False

temporal_attention: False
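# A sketch of what temporal_embed typically adds for video inputs (learnable
# per-frame position embeddings; the sizes below are hypothetical):
#   import torch, torch.nn as nn
#   num_frames, d_model = 8, 768
#   temporal_embed = nn.Parameter(torch.zeros(1, num_frames, d_model))
#   frames = torch.randn(2, num_frames, d_model) + temporal_embed  # broadcast over batch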

finetune_head: False
finetune_all: False
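# A sketch of the freezing these flags suggest (matching parameters by the
# substrings 'adapter' and 'head' is an assumption; 'model' is a loaded nn.Module):
#   for name, p in model.named_parameters():
#       if config['finetune_all']:
#           p.requires_grad = True
#       else:
#           p.requires_grad = 'adapter' in name or (config['finetune_head'] and 'head' in name)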


k_test: 128  # number of answer candidates scored when inference is 'rank'
inference: 'rank'  # 'rank' (score candidate answers) or 'generate' (free-form decoding)

# optimizer
weight_decay: 0.05
min_lr: 0
max_epoch: 5
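# A sketch of the optimizer these fields usually feed (AdamW with cosine decay from
# init_lr to min_lr over max_epoch is an assumption, not confirmed by this file):
#   import torch
#   optimizer = torch.optim.AdamW(model.parameters(),
#                                 lr=config['init_lr'],
#                                 weight_decay=config['weight_decay'])
#   scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
#       optimizer, T_max=config['max_epoch'], eta_min=config['min_lr'])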
