

model:
  # Architecture identifier for this configuration.
  arch: MVAE_pretrain

  # Pretrained-checkpoint keys, currently commented out, so this file sets
  # neither of them.
  # load_pretrained: True
  # pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"

  # ViT encoder
  vit_type: "base"
  # 224 is the alternative value noted by the original author. The same value
  # (128) is used for preprocess.vis_processor.train.image_size in this file.
  image_size: 128
  vit_ckpt_layer: 0
  vit_drop_path_rate: 0
  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML) read a
  # bare `1e-6` as the string "1e-6" instead of a float.
  vit_layer_norm_epsilon: 1.0e-6
  vit_grad_ckpt: false

  # BERT config
  med_config_path: "configs/models/med_config.json"
  mlm_mask_prob: 0.15

  # The trailing space is part of the value, so the quotes are required.
  prompt: "a picture of "

  # VAE image decoder config
  latent_dim: 500
  hidden_dims: [16, 32, 64, 128, 256, 512]

  # NOTE(review): these look like ALBEF-style projection / momentum /
  # distillation-weight / temperature settings -- confirm against the model code.
  embed_dim: 256
  momentum: 0.995
  alpha: 0.4
  temp: 0.07

  max_txt_len: 30

preprocess:
  # Only `train` processors are configured in this file.
  vis_processor:
    train:
      name: "MVAE_image_train"
      # Same value as model.image_size in this file.
      # NOTE(review): presumably the two must stay equal -- confirm against the
      # processor and model code.
      image_size: 128
  text_processor:
    train:
      name: "MVAE_caption"
