
# Task configuration for `image_text_pretrain`.
task:
  _name: image_text_pretrain

  # Input locations. Every path is relative (note the leading `../../`).
  # NOTE(review): the directory they resolve against is not visible in this
  # file -- confirm against the launch script.
  bpe_dir: ../../utils/BPE
  data: ../../dataset/mscoco/train.tsv
  valid_data: ../../dataset/mscoco/val_new.tsv
  valid_file: ../../dataset/mscoco/val_texts.json

  # Comma-separated column names, kept as one plain string scalar.
  selected_cols: image_id,image,caption

  # Source length and patch image size settings.
  max_src_length: 70
  patch_image_size: 256

  # Mask ratios: the plain pair first, then the `vl_*` pair.
  image_mask_ratio: 0.75
  text_mask_ratio: 0.15
  vl_image_mask_ratio: 0.6875
  vl_text_mask_ratio: 0.4

  min_scale: 0.9

# Loss configuration for `image_text_pretrain_loss`.
criterion:
  _name: image_text_pretrain_loss

  # `dcl_*` alpha values: image is 1.0, the other three are 0.5.
  dcl_image_alpha: 1.0
  dcl_text_alpha: 0.5
  dcl_vl_image_alpha: 0.5
  dcl_vl_text_alpha: 0.5

  dcl_logit_scale: 2.5
  label_smoothing: 0.1

# Optimizer configuration for `adjust_adam`.
optimizer:
  _name: adjust_adam
  # Quoted so this tuple-like value is unambiguously a string scalar.
  # NOTE(review): presumably parsed into a pair of floats by the optimizer --
  # confirm against its implementation.
  adam_betas: '(0.9,0.98)'
  # Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
  # resolve a dot-less `1e-08` as a string rather than a float.
  adam_eps: 1.0e-08
  weight_decay: 0.05
  use_distributed_fused_adam: true

# Learning-rate scheduler configuration for `adjust_cosine`.
lr_scheduler:
  _name: adjust_cosine
  warmup_updates: 3000
  # Same value as `optimization.max_update` (200000).
  # NOTE(review): presumably the two must be kept in sync -- confirm.
  max_update: 200000
  # Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
  # resolve a dot-less `1e-6` as a string rather than a float.
  min_lr: 1.0e-6

# Optimization settings.
optimization:
  # Same value as `lr_scheduler.max_update` (200000).
  max_update: 200000
  clip_norm: 3.0
  skip_remainder_batch: false

  # Both of these are single-element lists.
  lr:
    - 0.0005
  update_freq:
    - 1

# Data loading and validation batching settings.
dataset:
  num_workers: 6

  # Batch sizes: the validation value is half the training value (16 vs 32).
  batch_size: 32
  batch_size_valid: 16
  ensure_equal_batch: true

  validate_interval: 1
  # Same value as `common.seed` (3407).
  fixed_validation_seed: 3407

# Run-wide settings: precision, seeding, user module location and logging.
common:
  # Precision mode in effect: bf16 on, fp16 off.
  bf16: true
  memory_efficient_bf16: true
  fp16: false
  memory_efficient_fp16: false

  # Alternative precision mode: fp16. To switch, replace the four values
  # above with the ones below.
  # fp16: true
  # memory_efficient_fp16: true
  # bf16: false
  # memory_efficient_bf16: false

  # Same value as `dataset.fixed_validation_seed` (3407).
  seed: 3407
  no_bias_decay: true
  disable_iterator_cache: true
  user_dir: ../../user_module

  # Logging.
  log_format: simple
  log_interval: 10
  # `${...}` interpolation of `checkpoint.save_dir`, a key this file does not
  # set. NOTE(review): presumably supplied by defaults or a command-line
  # override -- confirm.
  tensorboard_logdir: ${checkpoint.save_dir}

# Checkpoint saving, retention and best-checkpoint selection.
checkpoint:
  save_interval: 1
  keep_last_epochs: 10
  no_save_optimizer_state: true
  load_checkpoint_on_all_dp_ranks: true

  # Best checkpoint is tracked on `txt_r1`; the maximize flag is true, so a
  # higher value is presumably treated as better -- confirm.
  best_checkpoint_metric: txt_r1
  maximize_best_checkpoint_metric: true

# Distributed training settings; only the DDP backend is set in this file.
distributed_training:
  ddp_backend: legacy_ddp

# Model configuration for `one_peace_pretrain`: an `encoder` and a `decoder`
# sub-config that share the same key layout but differ in size and in a few
# switches (attention bias, vision encoder type, drop path, layer scale).
model:
  _name: one_peace_pretrain

  # Encoder: 40 layers, embed_dim 1536, 24 attention heads.
  encoder:
    checkpoint_activations: true
    text_adapter:
      bucket_size: 256
      layernorm_embedding: false
      add_type_embedding: false
      shrink_alpha: 1.0
      dropout: 0.0
      use_attn_bias: true

    image_adapter:
      bucket_size: 16
      rel_bucket_size: 16
      vision_encoder_type: hmlp
      layernorm_embedding: false
      add_type_embedding: false
      shrink_alpha: 1.0
      dropout: 0.0
      use_attn_bias: true

    embed_dim: 1536
    # 6144 is 4x embed_dim.
    ffn_embed_dim: 6144
    layers: 40
    attention_heads: 24
    normalize_before: true
    learned_pos: true
    drop_path_rate: 0.4
    # Text and image MoE flags are on; the audio one is off.
    use_text_moe: true
    use_image_moe: true
    use_audio_moe: false
    attention_dropout: 0.0
    dropout: 0.0
    activation_fn: gelu
    magneto_scale_attn: true
    scale_attn: false
    scale_fc: true
    scale_heads: false
    use_layer_scale: true
    # Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
    # resolve a dot-less `1e-6` as a string rather than a float.
    layer_scale_init_value: 1.0e-6

  # Decoder: 2 layers, embed_dim 768, 12 attention heads. Unlike the encoder,
  # both adapters disable attention bias, the image adapter uses
  # `vision_encoder_type: none`, and drop path is 0.0.
  decoder:
    checkpoint_activations: true
    text_adapter:
      bucket_size: 256
      layernorm_embedding: false
      add_type_embedding: false
      shrink_alpha: 1.0
      dropout: 0.0
      use_attn_bias: false

    image_adapter:
      bucket_size: 16
      rel_bucket_size: 16
      vision_encoder_type: none
      layernorm_embedding: false
      add_type_embedding: false
      shrink_alpha: 1.0
      dropout: 0.0
      use_attn_bias: false

    embed_dim: 768
    ffn_embed_dim: 2048
    layers: 2
    attention_heads: 12
    normalize_before: true
    learned_pos: true
    drop_path_rate: 0.0
    # Text and image MoE flags are on; the audio one is off.
    use_text_moe: true
    use_image_moe: true
    use_audio_moe: false
    attention_dropout: 0.0
    dropout: 0.0
    activation_fn: gelu
    magneto_scale_attn: true
    scale_attn: false
    scale_fc: true
    scale_heads: false
    use_layer_scale: false
    # Written with an explicit decimal point so YAML 1.1 loaders such as
    # PyYAML resolve it as a float. `use_layer_scale` is false in this
    # section, so the value is presumably unused -- confirm.
    layer_scale_init_value: 1.0e-6