# ################################
# Script for training an ASR model evaluating an SSL representation
# model on one language from the CommonVoice dataset. A SentencePiece tokenizer
# with number of tokens equal to <output_neurons> is learned in a first phase
# on the considered language.
#
# ################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]
language: cy # use 'cy' for Welsh  and 'eu' for Basque
output_folder: !ref results/CommonVoice/<language>/linear/encodec/<seed>
test_wer_file: !ref <output_folder>/wer_test.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt


# Data files
data_folder: !PLACEHOLDER  # e.g, /local/cv-corpus-11.0-2022-09-21/<language>
train_tsv_file: !ref <data_folder>/train.tsv  # Standard CommonVoice .tsv files
dev_tsv_file: !ref <data_folder>/dev.tsv  # Standard CommonVoice .tsv files
test_tsv_file: !ref <data_folder>/test.tsv  # Standard CommonVoice .tsv files
accented_letters: True
train_csv: !ref <save_folder>/train.csv
valid_csv: !ref <save_folder>/dev.csv
test_csv: !ref <save_folder>/test.csv
skip_prep: False # Skip data preparation

avoid_if_longer_than: 10.0

### Config for Tokenizer
# EnCodec parameters
# sample_rate: [24000, 24000, 24000, 24000]
# vocab_size: [1024, 1024, 1024, 1024]
# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0]
# num_codebooks: [2, 4, 8, 16, 32]
vocab_size: 1024
bandwidth: 1.5
num_codebooks: 2
sample_rate: 24000
# Feature parameters
encoder_dim: 1024
# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128.
init_embedding: False
freeze_embedding: False


# Training parameters
number_of_epochs: 20
lr: 0.0002
sorting: ascending
precision: fp32
token_type: bpe  # ["unigram", "bpe", "char"]
character_coverage: 1.0


# With data_parallel batch_size is split into N jobs
# With DDP batch_size is multiplied by N jobs
# Must be 3 per GPU to fit 32GB of VRAM
batch_size: 4
test_batch_size: 1

# Dataloader options
train_dataloader_opts:
   batch_size: !ref <batch_size>
dataloader_options:
   batch_size: !ref <batch_size>
   num_workers: 4
test_dataloader_options:
   batch_size: !ref <test_batch_size>
   num_workers: 4


valid_dataloader_opts:
   batch_size: !ref <batch_size>

# Model parameters
activation: !name:torch.nn.Sigmoid
dnn_layers: 1
dnn_neurons: 2048
freeze_encoder: True

# Outputs
output_neurons: 100  # BPE size, index(blank/eos/bos) = 0

# Decoding parameters
blank_index: 0
unk_index: 1

test_beam_search:
   beam_size: 143
   topk: 1
   blank_index: !ref <blank_index>
   space_token: ' ' # make sure this is the same as the one used in the tokenizer
   beam_prune_logp: -12.0
   token_prune_min_logp: -1.2
   prune_history: True
   alpha: 0.8
   beta: 1.2
   # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM
   # It can either be a .bin or .arpa ; note: .arpa is much slower at loading
   # If you don't want to use an LM, comment it out or set it to null
   kenlm_model_path: null

# Functions and classes
#
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
   limit: !ref <number_of_epochs>

# Modules
# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec)
codec: !new:speechbrain.lobes.models.huggingface_transformers.encodec.Encodec
   source: facebook/encodec_24khz  # Only the 24kHz version supports mono audio
   save_path: !ref <save_folder>
   sample_rate: !ref <sample_rate>
   bandwidth: !ref <bandwidth>
   flat_embeddings: False
   freeze: True
   renorm_embeddings: False

discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer
   num_codebooks: !ref <num_codebooks>
   vocab_size: !ref <vocab_size>
   emb_dim: !ref <encoder_dim>
   freeze: !ref <freeze_embedding>
   init: !ref <init_embedding>

attention_mlp: !new:custom_model.AttentionMLP
   input_dim: !ref <encoder_dim>
   hidden_dim: !ref <encoder_dim>


enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
   input_shape: [null, null, !ref <encoder_dim>]
   activation: !ref <activation>
   dnn_blocks: !ref <dnn_layers>
   dnn_neurons: !ref <dnn_neurons>


ctc_lin: !new:speechbrain.nnet.linear.Linear
   input_size: 2048
   n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
   apply_log: True

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
   blank_index: !ref <blank_index>

modules:
   enc: !ref <enc>
   ctc_lin: !ref <ctc_lin>
   attention_mlp: !ref <attention_mlp>
   codec: !ref <codec>
   discrete_embedding_layer: !ref <discrete_embedding_layer>

model: !new:torch.nn.ModuleList
   - [!ref <enc>, !ref <ctc_lin>, !ref <discrete_embedding_layer>, !ref <attention_mlp>]

model_opt_class: !name:torch.optim.Adam
   lr: !ref <lr>

lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
   initial_value: !ref <lr>
   improvement_threshold: 0.0025
   annealing_factor: 0.8
   patient: 0


label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
   checkpoints_dir: !ref <save_folder>
   recoverables:
      model: !ref <model>
      scheduler_model: !ref <lr_annealing_model>
      attention_mlp: !ref <attention_mlp>
      codec: !ref <codec>
      discrete_embedding_layer: !ref <discrete_embedding_layer>
      counter: !ref <epoch_counter>
      tokenizer: !ref <label_encoder>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
   save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
   split_tokens: True
