# Experiment directory. Left unset in this file; hydra.run.dir below
# interpolates it, so supply a value (e.g. as an override) when launching.
exp_dir: null
hydra:
  run:
    # Force Hydra output (logs, etc.) into the experiment dir, under a
    # subdirectory named after the current date and time.
    dir: '${exp_dir}/logs/${now:%Y-%m-%d}/${now:%H-%M-%S}'

model:
  # See auto.model_registry.py for the list of available models.
  name: 'zipformer-captioning'
  # Preset name: 'base', 'large'.
  config_preset: null
  # Optional path to a config.json to load for the model. When unset, the
  # default values defined by the model_type's config dataclass are used.
  config_path: null
  # Any model parameter can be overridden under this key,
  # e.g. model.config.dim = 500.
  # The final model.config is saved as 'config.json' in exp_dir.
  config:
    num_decoder_layers: 6  # number of decoder layers
    decoder_dropout: 0.0

trainer:
  optimizer: 'scaled_adam'
  base_lr: 0.045
  scheduler: 'eden'
  # Number of epochs that affects how rapidly the learning rate decreases.
  lr_epochs: 3.5
  # Number of steps that affects how rapidly the learning rate decreases.
  # Changing this is not recommended.
  lr_batches: 7500
  # Number of learning-rate warmup steps.
  warmup_batches: 500
  # When use_infinite_dataset is true, set this close to the estimated number
  # of steps per epoch to get the same learning-rate schedule as usual.
  lr_steps_per_epoch: 0
  num_epochs: 30
  start_epoch: 1
  start_batch: 0
  # Reference batch duration used to adjust batch counts when setting the
  # various schedules inside the model.
  ref_duration: 600
  # Number of most recent checkpoints kept on disk.
  keep_last_k: 30
  use_averaged_model: true
  log_interval: 100
  # How rapidly the averaged_model is averaged and saved.
  average_period: 200
  # Moving-average interval for the info tracker.
  reset_interval: 200
  valid_interval: 1000
  # A checkpoint is saved every (save_every_n * valid_interval) steps.
  save_every_n: 4
  use_fp16: false
  initialization:
    # Path to a pretrained checkpoint (.pt file) to initialize from. Only the
    # modules listed in init_modules are used.
    checkpoint: null
    # These two modules are the audio encoder parts.
    init_modules:
      - 'encoder_embed'
      - 'encoder'
  tensorboard: true
  # Names of modules to be frozen during training.
  freeze_modules: null

data:
  train_data_config: configs/train_data_config.yaml
  valid_sets:
    - '/apdcephfs_cq12/share_302080740/data/audio_test_data/audiocaps/audiocaps_val.jsonl.gz'
  valid_data_path_prefix: false
  enable_spec_aug: true
  input_strategy: null  # PrecomputedFeatures
  enable_musan: false
  musan: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/musan/musan_cuts.jsonl.gz
  enable_speed_perturb: false
  bucketing_sampler: true
  num_buckets: 30
  # Total seconds of speech within a minibatch.
  max_duration: 600
  on_the_fly_feats: true
  shuffle: true
  drop_last: true
  num_workers: 12
  # When true, the iterator of each dataset is never exhausted, so training
  # proceeds in steps only, with no epochs.
  use_infinite_dataset: true
  truncate_long_utt: true
  utt_max_duration: 32