# Dataset locations. The 'datapath_to_flickr30k' prefix looks like a
# placeholder -- presumably to be replaced with a real local path before use.
image_root: 'datapath_to_flickr30k/flickr30k/images'
ann_root: 'datapath_to_flickr30k/flickr30k'
# Dataset identifier; how the consumer interprets it is not shown in this file.
dataset: 'flickr'


# Pretrained checkpoint, given as a URL to BLIP's model_base.pth.
# NOTE(review): presumably fetched/loaded by the consuming script -- confirm.
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth'


# Vision backbone size.
# NOTE(review): presumably selects the ViT variant -- confirm in the model code.
vit: 'base'

# Per-step batch sizes for training and for evaluation.
batch_size_train: 16
batch_size_test: 32
# NOTE(review): presumably toggles gradient checkpointing inside the ViT, with
# vit_ckpt_layer controlling which layers it applies to -- confirm in the
# model code.
vit_grad_ckpt: false
vit_ckpt_layer: 4
# Written with an explicit mantissa dot on purpose: YAML 1.1 loaders such as
# PyYAML only resolve scientific notation as a float when it contains a '.',
# so a bare `1e-4` is loaded as the string '1e-4' there. `1.0e-4` resolves to
# the same float under both YAML 1.1 and YAML 1.2 loaders.
init_lr: 1.0e-4


# Input modality; listed options: image, video.
modality: image

# Temporal / video-adapter switches, all disabled in this configuration.
# NOTE(review): names suggest these only matter for video input -- confirm.
frame_aware_attention: false  # true | false
video_key_adapter: false
video_key_adapter_dim: 64
temporal_embed: false
temporal_attention: false

# Fine-tuning settings (both switches are off here).
finetune_head: false
finetune_all: false

# Input resolution.
# NOTE(review): presumably the square side length in pixels -- confirm.
image_size: 384
# NOTE(review): queue_size, alpha and beta look like loss/queue
# hyperparameters; their exact roles are not visible from this file.
queue_size: 57600
alpha: 0.4
beta: 0
# NOTE(review): presumably the number of top candidates re-ranked at
# evaluation time -- confirm against the evaluation code.
k_test: 128
negative_all_rank: false

# Optimizer and schedule settings.
weight_decay: 0.05
# NOTE(review): presumably the floor the learning-rate schedule decays to --
# confirm against the scheduler code.
min_lr: 0
# Number of training epochs (upper bound, going by the key name).
max_epoch: 4

