# Configuration for the 'msrvtt-qa' dataset (video modality).
dataset: 'msrvtt-qa'

# URL of the pretrained checkpoint.
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_vqa.pth'

# Size of the ViT model: 'base' or 'large'.
vit: 'base'
batch_size_train: 8
batch_size_test: 32

vit_grad_ckpt: false
vit_ckpt_layer: 0
# Written with an explicit decimal point on purpose: YAML 1.1 loaders such as
# PyYAML resolve a bare `1e-4` as the string '1e-4' instead of a float.
init_lr: 1.0e-4

image_size: 224

modality: video

# Adapter and temporal options; every boolean toggle below (true or false)
# is switched off in this configuration.
video_semantic_adapter: false
video_key_adapter: false
video_key_adapter_dim: 64
temporal_embed: false
temporal_attention: false

finetune_head: false  # linear probe
finetune_all: false

k_test: 128
inference: 'generate'

weight_decay: 0.05
min_lr: 0
max_epoch: 5
