model:
  arch: mlp_contrastive_fusion
  model_type: base

  # Embedding dimension of the specific visual model used here.
  vis_embed_dim: 1536
  # Embedding dimension of the specific text model used here.
  text_embed_dim: 1024

  # Remaining hyperparameters passed to the selected arch.
  proj_embed_dim: 512
  num_layers_vis: 4
  num_layers_text: 4
  expansion_factor: 4
  dropout: 0.6

# Datasets used by this run. Each entry names the vis/text processors to
# apply per split.
datasets:
  # Entries that declare train processors. All use "feat_train" with
  # noise_std: 0 (presumably no noise is applied to the features -- confirm
  # against the processor implementation).
  coco_caption_feat_dinov2_vitg14_bge_large:
    vis_processor:
      train:
        name: "feat_train"
        noise_std: 0
    text_processor:
      train:
        name: "feat_train"
        noise_std: 0
  vg_caption_feat_dinov2_vitg14_bge_large:
    vis_processor:
      train:
        name: "feat_train"
        noise_std: 0
    text_processor:
      train:
        name: "feat_train"
        noise_std: 0
  sbu_caption_feat_dinov2_vitg14_bge_large:
    vis_processor:
      train:
        name: "feat_train"
        noise_std: 0
    text_processor:
      train:
        name: "feat_train"
        noise_std: 0
  # Currently disabled; uncomment the whole entry to add it back.
  # conceptual_caption_3m_feat_dinov2_vitg14_bge_large:
  #   vis_processor:
  #     train:
  #       name: "feat_train"
  #       noise_std: 0
  #   text_processor:
  #     train:
  #       name: "feat_train"
  #       noise_std: 0
  # Only eval processors are declared for this dataset.
  flickr30k_feat_dinov2_vitg14_bge_large:
    vis_processor:
      eval:
        name: "feat_eval"
    text_processor:
      eval:
        name: "feat_eval"

# Runtime settings: optimisation schedule, data loading, mixup, splits,
# output location and device/distribution options.
run:
  task: visual_text_pretrain

  # optimizing
  # Exponent-form floats are written with an explicit ".0" mantissa so that
  # YAML 1.1 loaders (e.g. PyYAML) resolve them as floats, not strings.
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1.0e-3
  min_lr: 1.0e-6
  warmup_lr: 1.0e-6
  warmup_steps: 100
  max_epoch: 500
  weight_decay: 0.1
  amp: false

  # dataloading
  num_workers: 8

  batch_size_train: 10240
  batch_size_eval: 1024

  # mixup
  # init and max alpha are equal, so the configured range is a single value.
  mixup_alpha_sched: "cosine_mixup_alpha"
  init_mixup_alpha: 1.0
  max_mixup_alpha: 1.0

  evaluate: false
  train_splits: ["train"]
  # Validation and test both point at the "test" split.
  valid_splits: ["test"]
  test_splits: ["test"]

  # misc
  seed: 42
  # NOTE(review): the directory name mentions "cc3m" and "40960bs", but the
  # conceptual_caption_3m dataset entry is commented out in this file and
  # batch_size_train is 10240 with world_size 1. Confirm the name still
  # describes this run (40960 may be an effective multi-process batch size).
  output_dir: "output/fusion/mlp_contrastive_fusion/pretrain/dinov2_vitg14_bge_large_coco_vg_sbu_cap_cc3m/512proj_4vlayer_4tlayer_4exp_06drop_0noise_1e6warm100_1e3lr_cosine_500ep_01wd_40960bs_mixup_1"

  # distribution
  device: "cuda"
  world_size: 1
  distributed: false
