# Run identity. `name` is the single source of truth for the run label: the
# two output directories derive from it through ${...} interpolation (the same
# mechanism this file already uses for ${num_tokens}, ${data_dir}, ...), so
# renaming the run cannot leave the paths pointing at another run's directory.
# The resolved values are unchanged:
#   save_dir       -> result/conformer-ctc-m-bn-128-b480-share8-v3
#   score_save_dir -> outputs/conformer-ctc-m-bn-128-b480-share8-v3
# NOTE(review): assumes the config loader resolves ${...} embedded inside a
# longer string (OmegaConf does) — confirm if a custom loader is used.
project: "LayerWiseAttnReuse"
name: "conformer-ctc-m-bn-128-b480-share8-v3"
run_type: "train"
save_dir: "result/${name}"
score_save_dir: "outputs/${name}"
gpus: 4

# ---------------------------------------------------------------- #
# Experiment-tracking settings. The key name suggests Weights & Biases; the
# code that consumes these values is not visible in this file.
wandb:
  mode: "online"
  notes: null  # no free-form run notes set
  id: null  # no run id pinned; presumably a fresh one is generated — confirm

# Checkpoint-resume settings. With from_scratch: true and checkpoint: null this
# run does not point at any checkpoint.
# NOTE(review): the remaining keys presumably only take effect when a
# checkpoint is supplied — confirm against the loader.
resume:
  from_scratch: true
  checkpoint: null
  model_only: false
  strict: true
  load_keys: null
  ignore_keys: null

# Console logging.
logging:
  # NOTE(review): presumably chooses between printing from every process and
  # printing from the main process only in multi-GPU runs — confirm.
  stdout_all: false

# ---------------------------------------------------------------- #
# Shared constants. Other sections pull these in through ${...} references
# (e.g. model.encoder.out_dim, model.subsampling.feature_dim, the datasets'
# data_dir, the transforms' sample_rate / n_mels, loss.blank_idx), so change
# them here rather than at the point of use.
data_dir: "/data/librispeech"
# Lives under data_dir but is spelled out in full; update both when relocating.
vocab_model: "/data/librispeech/libri_128_bpe_cleaned.model"

# NOTE(review): 128 matches the "128" in the vocab_model file name and in the
# run name; presumably all three must agree — confirm.
num_tokens: 128
sample_rate: 16000
num_mels: 80
blank_idx: 0
# Not referenced through ${...} anywhere else in this file.
loss_type: "ctc"

# ---------------------------------------------------------------- #
# Network definition: encoder stack, front-end subsampling, and decoder.
model:
  encoder:
    num_layers: 16
    hidden_dim: 256
    num_heads: 4
    conv_kernel_size: 31
    feedforward_dim: 1024  # 4 x hidden_dim
    embed_drop_prob: 0.1
    attn_drop_prob: 0.1
    proj_drop_prob: 0.1
    feedforward_drop_prob: 0.1
    # model.subsampling reuses eps and momentum through ${...} references.
    eps: 1.0e-5
    momentum: 0.005
    attn_first: true
    rel_attn: true
    attn_bias: true
    share_query_bias: false
    conv_norm_type: "bn"
    conv_sync_bn: true  # also referenced by model.subsampling.sync_bn
    # NOTE(review): presumably only relevant for a group-norm conv_norm_type,
    # i.e. unused while conv_norm_type is "bn" — confirm.
    conv_gn_groups: 4
    conv_partial: true
    was: true
    was_gamma: 0.5
    x_scaling: true
    out_dim: ${num_tokens}
    # -1 on both sides; presumably means "unlimited context" — confirm.
    left_context_length: -1
    right_context_length: -1
    memory_efficient: false
    # 16 entries (= num_layers), laid out eight per row. `true` appears at
    # indices 0 and 8 only, in line with "share8" in the run name.
    # NOTE(review): presumably one flag per encoder layer; the exact meaning
    # of a `true` flag is defined by the encoder code, not visible here.
    share_pattern: [
      true, false, false, false, false, false, false, false,
      true, false, false, false, false, false, false, false ]

  subsampling:
    type: "stride"
    feature_dim: ${num_mels}
    out_dim: ${model.encoder.hidden_dim}
    num_layers: 2
    num_channels: [ 256, 256 ]  # 2 entries, matching num_layers above
    kernel_size: 3
    drop_prob: 0.0
    bn: true
    # The next two values and sync_bn mirror the encoder's settings.
    eps: ${model.encoder.eps}
    momentum: ${model.encoder.momentum}
    act_type: "relu"
    out_norm: false
    sync_bn: ${model.encoder.conv_sync_bn}

  decoder:
    blank_idx: ${blank_idx}
    temperature: 1.0

# ---------------------------------------------------------------- #
# Miscellaneous training / evaluation switches.
additional:
  sorta_grad: false
  # Beam width per phase: 1 for train and valid, 32 for test.
  beam_width:
    train: 1
    valid: 1
    test: 32
  # Verbose metric output is off for train, on for valid and test.
  verbose_metric:
    train: false
    valid: true
    test: true
  freeze_bn: false
  # NOTE(review): an integer next to a *_std float; presumably the iteration
  # at which variational noise switches on — confirm.
  variational_noise: 10000
  variational_noise_std: 0.02

# ---------------------------------------------------------------- #
# Dataset splits, all rooted at ${data_dir}. `train` is a list of three
# subsets (train-clean-100, train-clean-360, train-other-500) that share the
# same max_seconds value; `valid` and `test` are single mappings, not lists,
# and set no max_seconds.
datasets:
  train:
    - name: "PadoLibriSpeech"
      mode: "train-clean-100"
      data_dir: ${data_dir}
      # NOTE(review): presumably an upper bound on utterance length in
      # seconds — confirm how longer utterances are handled.
      max_seconds: 16.7
    - name: "PadoLibriSpeech"
      mode: "train-clean-360"
      data_dir: ${data_dir}
      max_seconds: 16.7
    - name: "PadoLibriSpeech"
      mode: "train-other-500"
      data_dir: ${data_dir}
      max_seconds: 16.7
  valid:
    name: "PadoLibriSpeech"
    mode: "dev-clean"
    data_dir: ${data_dir}
  test:
    name: "PadoLibriSpeech"
    mode: "test-clean"
    data_dir: ${data_dir}

# Data-loader settings.
# NOTE(review): `datasets` defines train / valid / test, but only train and
# test appear here; presumably validation reuses the test settings — confirm.
dataloaders:
  train:
    # 40 x gpus (4) x trainer.acc_num_batches (3) = 480, which matches "b480"
    # in the run name and the 480 in the optimizer lr comment. Presumably 40
    # is therefore the per-GPU batch size — confirm.
    batch_size: 40
    shuffle: true
    num_workers: 2
    drop_last: true
  test:
    batch_size: 40
    shuffle: false
    num_workers: 2
    drop_last: false

# ---------------------------------------------------------------- #
# Input feature pipelines (presumably applied in list order). The train and
# test MelFilterBank entries are identical except for dither (1.0e-5 vs 0.0);
# keep the two copies in sync when editing either one.
transforms:
  # Train: speed perturbation -> mel filter bank -> SpecAugment.
  train:
    - name: "SpeedPerturbation"
      sample_rate: ${sample_rate}
      speeds: [ 0.9, 0.95, 1.0, 1.05, 1.1 ]
    - name: "MelFilterBank"
      sample_rate: ${sample_rate}
      # If these are sample counts, 400 / 160 at sample_rate 16000 is a 25 ms
      # window with a 10 ms hop — TODO confirm units.
      win_length: 400
      hop_length: 160
      n_mels: ${num_mels}
      window_type: "hann"
      pre_emphasize: 0.97
      f_min: 125
      f_max: 7600
      log: true
      dither: 1.0e-5
      pad_end: 0
    - name: "SpecAugment"
      num_freq_mask: 2
      freq_mask_width: 27
      num_time_mask: 5
      # NOTE(review): fractional, unlike the integer freq_mask_width;
      # presumably a fraction of the utterance length — confirm.
      time_mask_width: 0.05
  # Test: mel filter bank only, with no augmentation entries and dither 0.0.
  test:
    - name: "MelFilterBank"
      sample_rate: ${sample_rate}
      win_length: 400
      hop_length: 160
      n_mels: ${num_mels}
      window_type: "hann"
      pre_emphasize: 0.97
      f_min: 125
      f_max: 7600
      log: true
      dither: 0.0
      pad_end: 0

# Transcript tokenization. Both phases load the same ${vocab_model} and add
# neither BOS nor EOS tokens.
target_transforms:
  train: # SentencePieceTokenizer
    vocab_model: ${vocab_model}
    add_bos: false
    add_eos: false
    enable_sampling: false
    # Set for train only; the test entry has no alpha key.
    # NOTE(review): presumably ignored while enable_sampling is false — confirm.
    alpha: 0.0
  test: # SentencePieceTokenizer
    vocab_model: ${vocab_model}
    add_bos: false
    add_eos: false
    enable_sampling: false

# ---------------------------------------------------------------- #
# Optimizer settings.
optimizer:
  name: "Adam"
  # Square-root scaling of the default learning rate from batch 2048 to 480:
  #   0.003125 (default) / sqrt(2048 / 480) ~= 0.001513
  # 480 = dataloaders.train.batch_size (40) x gpus (4) x acc_num_batches (3).
  # Re-derive this value if any of those three change.
  lr: 0.001512
  betas: [ 0.9, 0.99 ]
  weight_decay: 1.0e-5
  eps: 1.0e-8
  # NOTE(review): presumably selects decoupled (AdamW-style) weight decay —
  # confirm against the optimizer implementation.
  decoupled: true
  centralize: false

# Learning-rate schedule.
# NOTE(review): presumably warm up for warmup_iters, hold for keep_iters, then
# decay with the inverse square root of the step down to min_lr — confirm.
# warmup_iters + keep_iters = 100000, half of trainer.max_iters (200000).
scheduler:
  name: "InvSqrtLR"
  warmup_iters: 5000
  keep_iters: 95000
  min_lr: 1.0e-6
  mode: "min"

# Loss settings; blank_idx follows the shared top-level ${blank_idx}.
loss: # CTC Loss
  blank_idx: ${blank_idx}
  # Summed rather than averaged. NOTE(review): the gradient scale then depends
  # on batch content; presumably normalized elsewhere — confirm.
  reduction: "sum"
  zero_infinity: false

# Evaluation-metric settings: the special symbols used when scoring.
metric: # Edit Distance
  space_symbol: " "
  blank_symbol: "<b>"
  unk_symbol: "<unk>"
  # NOTE(review): presumably drops unk_symbol before scoring — confirm.
  ignore_unk: true

# ---------------------------------------------------------------- #
# Runtime / device settings: fixed seed, CPU mode off, fp16 off.
accelerator:
  seed: 1234
  cpu: false
  fp16: false

# Training-loop settings.
trainer:
  max_iters: 200000
  print_interval_iters: 25
  # Fractional value. NOTE(review): presumably validation runs roughly every
  # quarter of an epoch — confirm how the fraction is rounded.
  valid_interval_epochs: 0.251
  clip_grad: 20.0
  clip_grad_method: "norm"
  # NOTE(review): presumably the number of batches accumulated per optimizer
  # step; 40 x 4 GPUs x 3 = 480 matches "b480" in the run name.
  acc_num_batches: 3
  final_test_mode: "best"
  # 25000 iterations before max_iters.
  ckpt_swa_start_iter: 175000
  verbose: true

# Evaluation-loop settings; prints every 5 iterations, versus every 25 in the
# trainer.
evaluator:
  print_interval_iters: 5
  verbose: true