# @package _group_

# Run-wide settings: fp16 flags, logging, random seed and the user module dir.
common:
  fp16: true
  log_format: json
  log_interval: 200
  # Relative path; presumably resolved against the run's working directory
  # (see hydra.run.dir) — confirm.
  tensorboard_logdir: tblog
  seed: 1337
  fp16_no_flatten_grads: true
  # `???` is OmegaConf's marker for a mandatory value: no default is given
  # here, so the caller has to supply it (e.g. as a command-line override).
  user_dir: ???

# Checkpoint saving and best-model selection.
checkpoint:
  # Same interval value as dataset.validate_interval (2).
  save_interval: 2
  # NOTE(review): no `save_interval_updates` is set in this section, so it is
  # unclear whether update-based checkpoints exist for this limit to apply
  # to — confirm it is still needed.
  keep_interval_updates: 1
  no_epoch_checkpoints: false
  # The best checkpoint is picked on `accuracy`, higher being better;
  # criterion.report_accuracy is enabled in this file.
  best_checkpoint_metric: accuracy
  maximize_best_checkpoint_metric: true

# Distributed launch settings. World size and processes-per-node are both 4.
distributed_training:
  # Process layout.
  distributed_world_size: 4
  nprocs_per_node: 4
  # Rendezvous. The init method stays quoted because it contains `:`.
  distributed_init_method: 'env://'
  distributed_port: 29671
  # Communication / DDP backends.
  distributed_backend: nccl
  ddp_backend: c10d
  find_unused_parameters: true

# Task definition. The `av_hubert_pretraining` task is used with
# `fine_tuning: true`. Entries set to `???` are mandatory (OmegaConf missing
# values) and must be supplied by the caller.
task:
  _name: av_hubert_pretraining
  is_s2s: true
  data: ???
  label_dir: ???
  tokenizer_bpe_model: ???
  normalize: true  # must be consistent with pre-training
  labels: ["wrd"]
  single_target: true
  fine_tuning: true
  stack_order_audio: 4
  tokenizer_bpe_name: sentencepiece
  max_sample_size: 500
  # Both input streams are listed.
  modalities: ["video", "audio"]
  image_aug: true
  pad_audio: true
  random_crop: false
  # NOTE(review): noise_prob is 0.0, yet noise_wav is still a mandatory
  # value — confirm whether a noise manifest is really required here.
  noise_prob: 0.0
  noise_snr: 0
  noise_wav: ???

# Data loading, batching and validation schedule.
dataset:
  num_workers: 6
  max_tokens: 1000
  validate_after_updates: 0
  # Same interval value as checkpoint.save_interval (2).
  validate_interval: 2
  # NOTE(review): the training split is named `pretrain` although
  # task.fine_tuning is true — confirm this is the intended manifest name.
  train_subset: pretrain
  valid_subset: valid

# Training criterion. `my_label_smoothed_cross_entropy` is not defined in this
# file; presumably it is registered by the module under common.user_dir —
# confirm.
criterion:
  _name: my_label_smoothed_cross_entropy
  # checkpoint.best_checkpoint_metric in this file is `accuracy`.
  report_accuracy: true
  label_smoothing: 0.1

# Global optimization schedule.
optimization:
  # Equals warmup + hold + decay (20000 + 0 + 40000) of every per-group
  # tri_stage scheduler under optimizer.groups.
  max_update: 60000
  sentence_avg: true
  update_freq: [1]
  # Same value as the `lr` of every group under optimizer.groups.
  lr: [0.001]

# Composite optimizer: one Adam optimizer plus one tri_stage scheduler per
# parameter group. All four groups currently carry identical settings.
# Group names are presumably matched against tags assigned by the model
# code — verify before renaming any of them.
# In every group: 20000 warmup + 0 hold + 40000 decay = 60000 steps, which
# equals optimization.max_update.
optimizer:
  _name: composite
  groups:
    down_param:
      lr: [0.001]
      lr_float: null
      optimizer:
        _name: adam
        # Plain YAML string, not a sequence; presumably parsed by the
        # optimizer code — confirm before reformatting.
        adam_betas: (0.9,0.98)
        adam_eps: 1e-08
      lr_scheduler:
        _name: tri_stage
        warmup_steps: 20000
        hold_steps: 0
        decay_steps: 40000
        final_lr_scale: 0.05
    adapter_layer_weights:
      lr: [0.001]
      lr_float: null
      optimizer:
        _name: adam
        adam_betas: (0.9,0.98)
        adam_eps: 1e-08
      lr_scheduler:
        _name: tri_stage
        warmup_steps: 20000
        hold_steps: 0
        decay_steps: 40000
        final_lr_scale: 0.05
    layer_norm:
      lr: [0.001]
      lr_float: null
      optimizer:
        _name: adam
        adam_betas: (0.9,0.98)
        adam_eps: 1e-08
      lr_scheduler:
        _name: tri_stage
        warmup_steps: 20000
        hold_steps: 0
        decay_steps: 40000
        final_lr_scale: 0.05
    p-tuning:
      lr: [0.001]
      lr_float: null
      optimizer:
        _name: adam
        adam_betas: (0.9,0.98)
        adam_eps: 1e-08
      lr_scheduler:
        _name: tri_stage
        warmup_steps: 20000
        hold_steps: 0
        decay_steps: 40000
        final_lr_scale: 0.05

# Top-level scheduler entry, given as a bare name. The tri_stage schedules
# themselves are configured per group under optimizer.groups.
lr_scheduler: pass_through

# Seq2seq model built on a pre-trained checkpoint (`w2v_path`, mandatory).
model:
  _name: av_hubert_seq2seq
  # Mandatory value (`???`): must be supplied by the caller.
  w2v_path: ???
  # NOTE(review): apply_mask is false while the mask_* values below (and the
  # mask_*_audio / mask_*_image values at the end of this section) are still
  # set; presumably they are unused in this setup — confirm.
  apply_mask: false
  mask_selection: static
  mask_length: 10
  mask_other: 0
  mask_prob: 0.75
  mask_channel_selection: static
  mask_channel_length: 64
  mask_channel_other: 0
  mask_channel_prob: 0.5
  layer_norm_first: true
  layerdrop: 0.0
  dropout: 0.0
  activation_dropout: 0.1
  attention_dropout: 0.0
  feature_grad_mult: 1.0
  total_encoder_layers: 24
  # Decoder settings.
  decoder_layers: 9
  decoder_dropout: 0.1
  decoder_attention_dropout: 0.0
  decoder_activation_dropout: 0.1
  # 48000 of the 60000 updates in optimization.max_update.
  freeze_finetune_updates: 48000
  share_decoder_input_output_embed: true
  decoder_normalize_before: true
  # The FFN width (4096) is 4x the embedding width (1024).
  decoder_embed_dim: 1024
  decoder_ffn_embed_dim: 4096
  decoder_attention_heads: 8
  # Modality handling; task.modalities lists both video and audio.
  modality_dropout: 0.00
  audio_dropout: 0.25
  modality_fuse: fuse
  mask_length_audio: 10
  mask_prob_audio: 0.60
  mask_length_image: 5
  mask_prob_image: 0.30

# Hydra job naming and output directories.
hydra:
  job:
    config:
      # Formatting of `hydra.job.override_dirname`, which is used in
      # sweep.subdir below: key and value are joined with `-`, items
      # with `__`.
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        # Overrides of these keys are left out of the generated name.
        # NOTE(review): criterion.wer_kenlm_model and criterion.wer_lexicon
        # are not set in the criterion section of this file — possibly
        # leftovers; confirm.
        exclude_keys:
          - run
          - task.data
          - task.label_dir
          - model.w2v_path
          - dataset.train_subset
          - dataset.valid_subset
          - criterion.wer_kenlm_model
          - criterion.wer_lexicon
  # Both output directories are mandatory (`???`) and must be supplied by
  # the caller.
  run:
    dir: ???
  sweep:
    dir: ???
    # Sub-directory name: config name + `__` + override dirname.
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
