# Reproducibility: the seed is handed to torch.manual_seed through the
# `!apply` tag below.
seed: 1003
__set_seed: !apply:torch.manual_seed
  - !ref <seed>

# Experiment layout. NOTE: `root` already ends with "/", so every path built
# as "<root>/..." below contains a doubled slash.
root: /share/workhorse3/hippo/biologically_inspired_models/icassp_logs/librispeech-0.0/
model_name: LibrispeechTDNN4L768UAsrCTC
output_folder: !ref <root>/trainings/<model_name>
# Paths derived from <output_folder>.
wer_file: !ref <output_folder>/wer.txt
save_folder: !ref <output_folder>
train_log: !ref <output_folder>/train_log.txt

# Not referenced anywhere else in the visible part of this file.
pretrained_path: !ref <root>/models/<model_name>

# sp_tokenizer_path: /home/hippo/workhorse3/librispeech/ls_train-char.model
# Text encoder instance, declared with no constructor arguments.
tokenizer: !new:speechbrain.dataio.encoder.CTCTextEncoder
# The keys below are separate top-level hyperparameters, NOT arguments of the
# tokenizer above (they sit at the same indentation level as `tokenizer`).
# In the visible part of this file only blank_index is referenced (by ctc_cost).
# NOTE(review): bos/eos/unk all share index 0 with blank — confirm intended.
blank_index: 0
bos_index: 0
eos_index: 0
unk_index: 0

# Language model (LM) pretraining
# NB: To avoid mismatch, the speech recognizer must be trained with the same
# tokenizer used for LM training. Here, we download everything from the
# speechbrain HuggingFace repository. However, a local path pointing to a
# directory containing the lm.ckpt and tokenizer.ckpt may also be specified
# instead, e.g., if you want to use your own LM / tokenizer.

# Entry points consumed by the training script. `!name:` entries are passed as
# callables/classes rather than being instantiated when the file is loaded.
# No attack is configured for this run.
attack_class: null
# presumably the Brain subclass that runs CTC training — confirm in robust_speech
brain_class: !name:robust_speech.models.ctc.CTCASR
dataset_prepare_fct: !name:robust_speech.data.librispeech.prepare_librispeech
dataio_prepare_fct: !name:robust_speech.data.dataio.dataio_prepare

# Data files
# NOTE: data_folder ends with "/", so the "<data_folder>/..." paths below
# contain a doubled slash.
data_folder: /home/hippo/workhorse3/librispeech/  # e.g., /localscratch/LibriSpeech
csv_folder: !ref <data_folder>
# Noise/RIR datasets will be downloaded automatically
data_folder_rirs: !ref <data_folder>

# Split names; the CSV paths below use the same base names.
train_splits: ["train"]
dev_splits: ["val"]
test_splits: ["test_clean"]
# presumably skips dataset_prepare_fct when set — confirm in the training script
skip_prep: True
ckpt_interval_minutes: 15  # save checkpoint every N min
train_csv: !ref <data_folder>/train.csv
valid_csv: !ref <data_folder>/val.csv
# A list, so more than one test set can be given.
test_csv:
  - !ref <data_folder>/test_clean.csv
# Utterance length bounds — presumably in seconds; confirm against
# dataio_prepare_fct.
avoid_if_longer_than: 17.0
avoid_if_shorter_than: 1.0
# Training parameters
# number_of_epochs is the limit given to epoch_counter.
number_of_epochs: 50
# batch_size feeds the train and valid dataloaders; test_batch_size feeds the
# test dataloader.
batch_size: 64
test_batch_size: 64
# Shared by opt_class (as `lr`) and lr_annealing (as `initial_value`).
lr: 0.05
# The three keys below are not referenced elsewhere in the visible part of
# this file; presumably they are read by the training/data-preparation code.
sorting: random
dynamic_batching: False
gradient_accumulation: 1

# dynamic batching parameters, if used
# (dynamic_batching is False in this file, so these are presumably inactive.)
dynamic_batch_sampler:
  # NOTE(review): unit of max_batch_len is not visible here — confirm against
  # the sampler implementation.
  max_batch_len: 12000000
  shuffle_ex: True
  batch_ordering: random
  num_buckets: 128

# Optimizer factory: torch.optim.SGD with `lr` and `momentum` pre-bound via
# `!name:`; the remaining arguments are supplied by the caller.
opt_class: !name:torch.optim.SGD
  lr: !ref <lr>
  momentum: 0.9

# Learning-rate scheduler instance; starts from the same <lr> the optimizer
# uses. Its state is listed in the checkpointer's recoverables as `scheduler`.
lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: !ref <lr>
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  # `patient` is the keyword name as written here — TODO confirm it matches the
  # scheduler's signature in the installed SpeechBrain version.
  patient: 0

# Dataloader options for the three stages. Train and valid share <batch_size>;
# test uses <test_batch_size>. All three use 10 workers.
# NOTE(review): no `shuffle` key is set for training although `sorting` is
# `random` — confirm that the training loader shuffles as intended.
train_dataloader_opts:
  batch_size: !ref <batch_size>
  num_workers: 10

valid_dataloader_opts:
  batch_size: !ref <batch_size>
  num_workers: 10

test_dataloader_opts:
  batch_size: !ref <test_batch_size>
  num_workers: 10

# Feature parameters
# sample_rate, n_fft and n_mels are passed to compute_features; sample_rate is
# also passed to augmentation and appears in common_params1.input_size.
sample_rate: 16000
n_fft: 400
n_mels: 40

# Model sizes: n_hidden is the out_channels of every conv layer and the input
# size of ctc_lin; vocab_size is the number of ctc_lin outputs.
n_hidden: 768
dropout_p: 0.25
vocab_size: 31

# Input normalization with norm_type `global`. Registered in `modules` under
# the key `normalize` and listed in the checkpointer's recoverables.
normalizer: !new:speechbrain.processing.features.InputNormalization
  norm_type: global

# Filterbank feature extractor, configured from the feature parameters
# (<sample_rate>, <n_fft>, <n_mels>) declared earlier in this file.
compute_features: !new:speechbrain.lobes.features.Fbank
  sample_rate: !ref <sample_rate>
  n_fft: !ref <n_fft>
  n_mels: !ref <n_mels>

# Disabled environment corruption (noise only: babble and reverb probabilities
# are 0.0). Kept for reference; the matching entry in `modules` is commented
# out as well.
# env_corrupt: !new:speechbrain.lobes.augment.EnvCorrupt
#   babble_prob: 0.0
#   reverb_prob: 0.0
#   noise_prob: 1.0
#   noise_snr_low: 0
#   noise_snr_high: 15

# Time-domain augmentation instance.
augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: !ref <sample_rate>
  # presumably speed-perturbation factors in percent — confirm in SpeechBrain
  speeds: [95, 100, 105]

# Convolution settings for the four encoder layers; enc_params passes them, in
# this order, as `conv_params`. Every layer has <n_hidden> output channels and
# padding equal to (kernel_size - 1) / 2. Strides are 2, 2, 1, 1.

# Layer 1: kernel 13, stride 2.
layer1_params: !new:adversarialML.biologically_inspired_models.src.Interspeech23.models.ConvParams
  out_channels: !ref <n_hidden>
  kernel_size: 13
  stride: 2
  padding: 6

# Layer 2: kernel 17, stride 2.
layer2_params: !new:adversarialML.biologically_inspired_models.src.Interspeech23.models.ConvParams
  out_channels: !ref <n_hidden>
  kernel_size: 17
  stride: 2
  padding: 8

# Layer 3: kernel 21, stride 1.
layer3_params: !new:adversarialML.biologically_inspired_models.src.Interspeech23.models.ConvParams
  out_channels: !ref <n_hidden>
  kernel_size: 21
  stride: 1
  padding: 10

# Layer 4: kernel 25, stride 1.
layer4_params: !new:adversarialML.biologically_inspired_models.src.Interspeech23.models.ConvParams
  out_channels: !ref <n_hidden>
  kernel_size: 25
  stride: 1
  padding: 12

# Parameters shared across the encoder, handed to enc_params as
# `common_params`.
common_params1: !new:adversarialML.biologically_inspired_models.src.models.CommonModelParams
  # NOTE(review): the second entry is the audio sample rate (16000), not a
  # frame count — confirm this is what CommonModelParams expects.
  input_size:
    - !ref <n_mels>
    - !ref <sample_rate>
  dropout_p: !ref <dropout_p>

# Encoder parameter object, produced by calling Conv1dEncoder.get_params with
# the keyword arguments below (`!apply` calls the function).
enc_params: !apply:adversarialML.biologically_inspired_models.src.Interspeech23.models.Conv1dEncoder.get_params
  common_params: !ref <common_params1>
  # One entry per conv layer, in layer order.
  conv_params:
    - !ref <layer1_params>
    - !ref <layer2_params>
    - !ref <layer3_params>
    - !ref <layer4_params>
  group_norm: True
  transpose_chan_and_time_dims: True

# Encoder instance built from <enc_params>. Registered in `modules` and in the
# `model` list.
enc: !new:adversarialML.biologically_inspired_models.src.Interspeech23.models.Conv1dEncoder
  params: !ref <enc_params>

# CTC output layer: linear projection from <n_hidden> inputs to <vocab_size>
# outputs.
ctc_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <n_hidden>
  n_neurons: !ref <vocab_size>

# Softmax module configured with apply_log enabled.
log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: True

# CTC loss function with blank_index pre-bound to <blank_index> via `!name:`.
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: !ref <blank_index>

# Modules handed to the Brain class. Note that the normalizer is exposed under
# the key `normalize`. compute_features, augmentation and log_softmax are not
# listed here.
modules:
  enc: !ref <enc>
  ctc_lin: !ref <ctc_lin>
  normalize: !ref <normalizer>
  # env_corrupt: !ref <env_corrupt>

# ModuleList built from a single positional argument: the list [enc, ctc_lin].
# It is the `model` entry of the checkpointer's recoverables.
model: !new:torch.nn.ModuleList
  - [!ref <enc>, !ref <ctc_lin>]

# Epoch counter limited to <number_of_epochs>; listed in the checkpointer's
# recoverables as `counter`.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: !ref <number_of_epochs>

# File logger writing to <train_log>.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: !ref <train_log>

# Metric factories (`!name:`, so each is instantiated by the caller).
# error_rate_computer and cer_computer use the same class; cer_computer
# additionally sets split_tokens.
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: True
acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats

# Checkpointer rooted at <save_folder> (which resolves to <output_folder>).
# `recoverables` lists the objects registered with it: the model, the LR
# scheduler, the normalizer, the epoch counter and the tokenizer.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: !ref <save_folder>
  recoverables:
    model: !ref <model>
    scheduler: !ref <lr_annealing>
    normalizer: !ref <normalizer>
    counter: !ref <epoch_counter>
    tokenizer: !ref <tokenizer>

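# The pretrainer allows a mapping between pretrained files and instances that
# are declared in the yaml. It is disabled here; re-enabling it also requires
# defining `sp_tokenizer_path`, which is currently commented out above.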
# pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
#     loadables:
#         tokenizer: !ref <tokenizer>
#     paths:
#       tokenizer: !ref <sp_tokenizer_path>
#     collect_in: !ref <output_folder>/tokenizers