
# Task section: selects the `audio_text_pretrain` task and its data inputs.
# All paths below are relative; the directory they are resolved against is
# not defined in this file.
task:
  _name: audio_text_pretrain
  # Training and validation TSV files. NOTE(review): the training file name
  # suggests a merge of Clotho, AudioCaps and MACS -- confirm against the
  # data preparation scripts.
  data: ../../dataset/macs/merge_clotho_audiocaps_macs.tsv
  valid_data: ../../dataset/audiocaps/audiocaps_val_new.tsv
  # Comma-separated list; presumably the TSV columns to read -- TODO confirm.
  selected_cols: uniq_id,audio,text,duration
  bpe_dir: ../../utils/BPE

  # Input length limits. Units are not stated in this file (presumably
  # text tokens and seconds of audio -- TODO confirm).
  max_src_length: 70
  max_duration: 15

  # NOTE(review): distinct from `valid_data` above; points at an ESC-50 label
  # JSON. How the task uses it is not visible from this file.
  valid_file: ../../dataset/esc50/esc50_label.json
  # Masking ratios. NOTE(review): the `al_` prefix presumably refers to
  # paired audio-language inputs -- confirm against the task code.
  audio_mask_ratio: 0.55
  al_text_mask_ratio: 0.4
  al_audio_mask_ratio: 0.45

  # Further audio-masking parameters; their exact semantics are defined by
  # the task implementation, not by this file.
  audio_mask_prob_adjust: 0.1
  audio_mask_length: 5

# Loss section: selects the `audio_text_pretrain_loss` criterion.
criterion:
  _name: audio_text_pretrain_loss
  # `dcl_*` settings: three `*_alpha` values and one logit scale.
  # NOTE(review): presumably per-term loss weights; what "dcl" stands for is
  # not defined in this file -- confirm against the criterion implementation.
  dcl_audio_alpha: 1.0
  dcl_al_text_alpha: 0.5
  dcl_al_audio_alpha: 0.5
  dcl_logit_scale: 2.5
  label_smoothing: 0.1

# Optimizer section: selects the `adjust_adam` optimizer.
optimizer:
  _name: adjust_adam
  # Quoted so the tuple-like text is explicitly a string scalar; the parsed
  # value is the same text as before.
  adam_betas: '(0.9,0.98)'
  # Written with a mantissa dot: YAML 1.1 loaders (e.g. PyYAML) resolve
  # exponent notation as a float only when the mantissa contains a `.`, so a
  # bare `1e-08` would load as a string there.
  adam_eps: 1.0e-08
  weight_decay: 0.05
  use_distributed_fused_adam: true

# Learning-rate schedule: selects the `adjust_cosine` scheduler.
lr_scheduler:
  _name: adjust_cosine
  # NOTE(review): presumably the fraction of training used for warmup --
  # confirm against the scheduler implementation.
  warmup_ratio: 0.1
  # Written with a mantissa dot so YAML 1.1 loaders (e.g. PyYAML) also
  # resolve it as a float; a bare `1e-6` would load as a string there.
  min_lr: 1.0e-6

# Training-loop settings: epoch budget, learning rate, gradient clipping.
optimization:
  max_epoch: 10
  # `lr` and `update_freq` are given as one-element lists.
  # NOTE(review): presumably the trainer accepts a per-epoch schedule here
  # -- confirm before adding more entries.
  lr: [0.0002]
  update_freq: [1]
  clip_norm: 3.0
  skip_remainder_batch: false

# Data loading and validation cadence.
dataset:
  num_workers: 6
  batch_size: 16
  # Same value as `common.seed` in this file.
  fixed_validation_seed: 3407
  validate_interval: 1
  # Same size as the training `batch_size` above.
  batch_size_valid: 16
  ensure_equal_batch: true

# Settings shared across the run: numeric precision, logging, seeding.
common:

  # Active precision setup: bf16 is enabled and fp16 is disabled.
  fp16: false
  memory_efficient_fp16: false
  bf16: true
  memory_efficient_bf16: true

  # Alternative fp16 setup: to use it, replace the four values above with
  # the four commented-out values below.
  #  fp16: true
  #  memory_efficient_fp16: true
  #  bf16: false
  #  memory_efficient_bf16: false

  no_bias_decay: true
  log_format: simple
  log_interval: 10
  user_dir: ../../user_module
  disable_iterator_cache: true
  seed: 3407
  # Interpolation of `checkpoint.save_dir`. NOTE(review): `save_dir` is not
  # set in this file's `checkpoint` section, so it has to be supplied from
  # elsewhere (defaults or command-line overrides) -- confirm.
  tensorboard_logdir: ${checkpoint.save_dir}

# Checkpoint saving and best-model selection.
checkpoint:
  # Same number as `optimization.max_epoch` in this file.
  keep_last_epochs: 10
  save_interval: 1
  # The best checkpoint is selected by `txt_r1`, treated as
  # higher-is-better (see `maximize_best_checkpoint_metric`).
  # NOTE(review): presumably a text-retrieval recall@1 -- confirm where the
  # metric is produced.
  best_checkpoint_metric: txt_r1
  maximize_best_checkpoint_metric: true
  no_save_optimizer_state: true
  load_checkpoint_on_all_dp_ranks: true

# Distributed training: selects the `legacy_ddp` backend.
distributed_training:
  ddp_backend: legacy_ddp

# Model section: selects the `one_peace_pretrain` model. The nested
# `encoder` and `decoder` mappings configure the two stacks.
model:
  _name: one_peace_pretrain
  # NOTE(review): both flags are interpreted by the model implementation;
  # their exact effect is not visible from this file.
  reset_logit_scale: true
  stage2_pretrain: true

  # Encoder stack: 40 layers, embedding dim 1536, 24 attention heads, with
  # per-modality adapters for text and audio.
  encoder:
    checkpoint_activations: true
    text_adapter:
      bucket_size: 256
      layernorm_embedding: false
      add_type_embedding: false
      shrink_alpha: 1.0
      dropout: 0.0
      use_attn_bias: true

    audio_adapter:
      feature_embed_dim: 512
      # String holding a Python-style list expression.
      # NOTE(review): presumably evaluated by the model into per-layer
      # (dim, kernel, stride) tuples -- confirm against the adapter code.
      feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
      abs_pos_type: conv
      conv_pos_depth: 5
      conv_pos_width: 95
      conv_pos_groups: 16
      conv_pos_pre_ln: false
      bucket_size: 512
      layernorm_embedding: false
      add_type_embedding: false
      shrink_alpha: 1.0
      dropout: 0.0
      use_attn_bias: true

    embed_dim: 1536
    ffn_embed_dim: 6144
    layers: 40
    attention_heads: 24
    normalize_before: true
    learned_pos: true
    drop_path_rate: 0.4
    # Text and audio MoE flags are on; the image flag is off.
    use_text_moe: true
    use_image_moe: false
    use_audio_moe: true
    attention_dropout: 0.0
    dropout: 0.0
    activation_fn: gelu
    magneto_scale_attn: true
    scale_attn: false
    scale_fc: true
    scale_heads: false
    use_layer_scale: true
    # Written with a mantissa dot so YAML 1.1 loaders (e.g. PyYAML) also
    # resolve it as a float; a bare `1e-6` would load as a string there.
    layer_scale_init_value: 1.0e-6

  # Decoder stack: 2 layers, embedding dim 768, 12 attention heads, with
  # per-modality adapters for text and audio.
  decoder:
    checkpoint_activations: true
    text_adapter:
      bucket_size: 256
      layernorm_embedding: false
      add_type_embedding: false
      shrink_alpha: 1.0
      dropout: 0.0
      use_attn_bias: false

    audio_adapter:
      # Explicit null (previously a bare empty value, which also parses as
      # null). NOTE(review): presumably means the decoder adapter has no
      # convolutional feature encoder -- confirm against the adapter code.
      feature_encoder_spec: null
      abs_pos_type: fixed
      bucket_size: 256
      layernorm_embedding: false
      add_type_embedding: false
      shrink_alpha: 1.0
      dropout: 0.0
      use_attn_bias: false

    embed_dim: 768
    ffn_embed_dim: 2048
    layers: 2
    attention_heads: 12
    normalize_before: true
    learned_pos: true
    drop_path_rate: 0.0
    # Text and audio MoE flags are on; the image flag is off.
    use_text_moe: true
    use_image_moe: false
    use_audio_moe: true
    attention_dropout: 0.0
    dropout: 0.0
    activation_fn: gelu
    magneto_scale_attn: true
    scale_attn: false
    scale_fc: true
    scale_heads: false
    # NOTE(review): layer scale is disabled here, so the init value below is
    # presumably unused for the decoder -- confirm.
    use_layer_scale: false
    # Written with a mantissa dot so YAML 1.1 loaders (e.g. PyYAML) also
    # resolve it as a float; a bare `1e-6` would load as a string there.
    layer_scale_init_value: 1.0e-6