# Root directory of the experiment. Left null here; presumably supplied per run
# (e.g. as a command-line override) -- confirm against the launch scripts.
exp_dir: null
hydra:
  run:
    # Force Hydra output (logs, etc.) into the experiment dir, in a
    # subdirectory named after the run's date and time.
    dir: ${exp_dir}/logs/${now:%Y-%m-%d}/${now:%H-%M-%S}

model:
  # Model name; check auto.model_registry.py to see all available models.
  name: 'zipformer-tta'
  # Optional: path to a config.json to load for the model. When null, the
  # default values defined by the model type's config dataclass are used.
  config_path: null
  # One of: 'base', 'large'.
  config_preset: 'large'
  config:
    use_attention_decoder: true
    translate_mode: 'src2tgt'
    # Listed in the same order as before; the order may matter to the
    # consumer, so keep it stable when adding entries.
    special_tokens:
      - '<transcribe>'
      - '<translate>'
      - '<zh>'
      - '<ja>'
      - '<ko>'
      - '<en>'
      - '<fr>'
      - '<es>'
      - '<pt>'
      - '<vi>'
      - '<id>'
      - '<ru>'
      - '<translate_zh>'
      - '<translate_ja>'
      - '<translate_ko>'
      - '<translate_en>'
      - '<translate_fr>'
      - '<translate_es>'
      - '<translate_pt>'
      - '<translate_vi>'
      - '<translate_id>'
      - '<translate_ru>'
    use_s2t_alignment: true
    text_embed_model: 'bert-base-multilingual-uncased'
    # Any model parameter can be passed here to override the model setting,
    # e.g. model.config.dim = 500
  # The final model.config will be saved as 'config.json' to exp_dir.

tokenizer:
  # NOTE(review): presumably selects a SentencePiece-based ASR tokenizer --
  # confirm against the tokenizer registry.
  tokenizer_type: 'asr-spm'
  # Absolute, site-specific path; override it for your own environment.
  source: /apdcephfs_cq10/share_1603164/user/jamelynli/workspace/Auden/egs/st/assets/lang_cc100_mdl_unigram_merged_20k_new_decay

trainer:
  optimizer: 'scaled_adam'
  base_lr: 0.035
  scheduler: 'eden'
  # Number of epochs that affects how rapidly the learning rate decreases.
  lr_epochs: 3.5
  # Number of steps that affects how rapidly the learning rate decreases.
  # We suggest not changing this.
  lr_batches: 7500
  warmup_batches: 500  # lr warmup steps
  rnnt_warm_step: 2000
  # Recommended to adjust when data.use_infinite_dataset is true, so that the
  # learning rate schedule matches the usual epoch-based one: set it close to
  # your estimated number of steps per epoch.
  lr_steps_per_epoch: 0
  num_epochs: 30
  start_epoch: 1
  start_batch: 0
  # Prune range for the rnnt loss: how many symbols (context) are used to
  # compute the loss.
  prune_range: 5
  # Scale to smooth the loss with the lm (output of prediction network) part.
  lm_scale: 0.25
  # Scale to smooth the loss with the am (output of encoder network) part.
  am_scale: 0
  simple_loss_scale: 0.5
  attention_loss_scale: 1.0
  s2t_align_loss_scale: 0.1
  s2t_align_warm_step: 20000
  # Reference batch duration for purposes of adjusting batch counts for
  # setting various schedules inside the model.
  ref_duration: 600
  keep_last_k: 30  # keep the last k checkpoints on disk
  use_averaged_model: true
  log_interval: 50
  # How rapidly the averaged_model is averaged and saved.
  average_period: 200
  reset_interval: 200  # moving average interval for info tracker
  valid_interval: 1000
  save_every_n: 4  # save a checkpoint every (n * valid_interval) steps
  use_fp16: true
  initialization:
    # Initialize from another pretrained checkpoint (path to a .pt file).
    # Only the modules listed in init_modules are used.
    checkpoint: null
    # e.g. ['encoder_embed', 'encoder'] (these two are for audio encoders)
    init_modules: null
    strict: true
  tensorboard: true
  # Names of modules to be frozen during training.
  freeze_modules: null

data:
  train_data_config: configs/asr/train_data_config.yaml
  valid_data_config: configs/asr/valid_data_config.yaml
  enable_spec_aug: true
  enable_s2t_dataset: true
  s2t_translate_ratio: 0.5
  enable_musan: false
  # Absolute, site-specific manifest path; override it for your environment.
  musan: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/musan/musan_cuts.jsonl.gz
  enable_speed_perturb: false
  bucketing_sampler: true
  num_buckets: 10
  max_duration: 250  # total seconds of speech within a minibatch
  on_the_fly_feats: true
  shuffle: true
  drop_last: true
  num_workers: 8
  enable_data_mixing: false
  text_normalization: true
  # The iterator of each dataset is never exhausted, so there are only steps
  # and no epochs.
  use_infinite_dataset: true