# ################################
# Script for training an ASR model evaluating an SSL representation
# model on one language from the CommonVoice dataset. A SentencePiece tokenizer
# with number of tokens equal to <output_neurons> is learned in a first phase
# on the considered language.
# ################################

# Seed zseed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]
language: cy # use 'cy' for Welsh  and 'eu' for Basque
output_folder: !ref results/CommonVoice/discrete_ssl/<language>/<seed>
test_wer_file: !ref <output_folder>/wer_test.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data files
data_folder: !PLACEHOLDER # e.g, /local/cv-corpus-11.0-2022-09-21/<language>
train_tsv_file: !ref <data_folder>/train.tsv  # Standard CommonVoice .tsv files
dev_tsv_file: !ref <data_folder>/dev.tsv  # Standard CommonVoice .tsv files
test_tsv_file: !ref <data_folder>/test.tsv  # Standard CommonVoice .tsv files
accented_letters: True
train_csv: !ref <save_folder>/train.csv
valid_csv: !ref <save_folder>/dev.csv
test_csv: !ref <save_folder>/test.csv
skip_prep: False # Skip data preparation

avoid_if_longer_than: 10.0

# Training parameters
number_of_epochs: 20
lr: 0.0002
lr_weights: 0.01
sorting: ascending
precision: fp32
token_type: bpe  # ["unigram", "bpe", "char"]
character_coverage: 1.0


# With data_parallel batch_size is split into N jobs
# With DDP batch_size is multiplied by N jobs
# Must be 3 per GPU to fit 32GB of VRAM
batch_size: 4
test_batch_size: 1

# Dataloader options
train_dataloader_opts:
   batch_size: !ref <batch_size>
dataloader_options:
   batch_size: !ref <batch_size>
   num_workers: 4
test_dataloader_options:
   batch_size: !ref <test_batch_size>
   num_workers: 4


valid_dataloader_opts:
   batch_size: !ref <batch_size>

# Model parameters
activation: !name:torch.nn.Sigmoid

# Outputs
output_neurons: 100  # BPE size, index(blank/eos/bos) = 0

# Decoding parameters
blank_index: 0
unk_index: 1

test_beam_search:
   beam_size: 143
   topk: 1
   blank_index: !ref <blank_index>
   space_token: ' ' # make sure this is the same as the one used in the tokenizer
   beam_prune_logp: -12.0
   token_prune_min_logp: -1.2
   prune_history: True
   alpha: 0.8
   beta: 1.2
   # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM
   # It can either be a .bin or .arpa ; note: .arpa is much slower at loading
   # If you don't want to use an LM, comment it out or set it to null
   kenlm_model_path: null

### Configuration for  discrete SSL model
# ssl_model_type: hubert, wavlm, wav2vec2
# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large,  facebook/wav2vec2-large
ssl_model_type: hubert # hubert, wavml or wav2vec2
ssl_hub: facebook/hubert-large-ll60k
ssl_folder: !ref <save_folder>/ssl_checkpoint
kmeans_repo_id: speechbrain/SSL_Quantization
kmeans_cache_dir: !ref <save_folder>/kmeans_checkpoint
kmeans_dataset: LibriSpeech-100-360-500
freeze_ssl: True
freeze_feature_extractor: True
num_clusters: 1000


### Config for Tokenizer
# Layer number should be among the supported layers for discrete SSL models(kmenas  model should be available for that layer)
# ssl_layer_num: [3, 7, 12, 23]
# deduplicate: [False, False, False, False]
# bpe_tokenizer_path: [null , null,  null, null]
ssl_layer_num: [1, 3, 7, 12, 18, 23]
num_codebooks: 6
deduplicate: [False, False, False, False, False, False]
bpe_tokenizer_path: [null, null, null, null, null, null]
sample_rate: 16000

# Feature parameters
encoder_dim: 1024

# Functions and classes
#
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
   limit: !ref <number_of_epochs>

# Modules
tokenizer_config:
   SSL_layers: !ref <ssl_layer_num>
   deduplicates: !ref <deduplicate>
   bpe_tokenizers: !ref <bpe_tokenizer_path>

ssl_model: !apply:speechbrain.utils.hparams.choice
   value: !ref <ssl_model_type>
   choices:
      wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM
         source: !ref <ssl_hub>
         output_norm: False
         freeze: !ref <freeze_ssl>
         freeze_feature_extractor: !ref <freeze_feature_extractor>
         output_all_hiddens: True
         save_path: !ref <ssl_folder>
      hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
         source: !ref <ssl_hub>
         output_norm: False
         freeze: !ref <freeze_ssl>
         freeze_feature_extractor: !ref <freeze_feature_extractor>
         output_all_hiddens: True
         save_path: !ref <ssl_folder>
      wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
         source: !ref <ssl_hub>
         output_norm: False
         freeze: !ref <freeze_ssl>
         freeze_feature_extractor: !ref <freeze_feature_extractor>
         output_all_hiddens: True
         save_path: !ref <ssl_folder>

codec: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL
   save_path: !ref <kmeans_cache_dir>
   ssl_model: !ref <ssl_model>
   kmeans_dataset: !ref <kmeans_dataset>
   kmeans_repo_id: !ref <kmeans_repo_id>
   num_clusters: !ref <num_clusters>

discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer
   num_codebooks: !ref <num_codebooks>
   vocab_size: !ref <num_clusters>
   emb_dim: !ref <encoder_dim>

attention_mlp: !new:custom_model.AttentionMLP
   input_dim: !ref <encoder_dim>
   hidden_dim: !ref <encoder_dim>


enc: !new:speechbrain.nnet.RNN.LSTM
   input_shape: [Null, Null, !ref <encoder_dim>]
   num_layers: 2
   bidirectional: True
   dropout: 0.2
   hidden_size: 1024

ctc_lin: !new:speechbrain.nnet.linear.Linear
   input_size: 2048
   n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
   apply_log: True

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
   blank_index: !ref <blank_index>

modules:
   enc: !ref <enc>
   ctc_lin: !ref <ctc_lin>
   attention_mlp: !ref <attention_mlp>
   codec: !ref <codec>
   discrete_embedding_layer: !ref <discrete_embedding_layer>

model: !new:torch.nn.ModuleList
   - [!ref <enc>, !ref <ctc_lin>, !ref <discrete_embedding_layer>, !ref <attention_mlp>]

model_opt_class: !name:torch.optim.Adam
   lr: !ref <lr>


lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
   initial_value: !ref <lr>
   improvement_threshold: 0.0025
   annealing_factor: 0.8
   patient: 0

label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
   checkpoints_dir: !ref <save_folder>
   recoverables:
      model: !ref <model>
      scheduler_model: !ref <lr_annealing_model>
      attention_mlp: !ref <attention_mlp>
      codec: !ref <codec>
      discrete_embedding_layer: !ref <discrete_embedding_layer>
      counter: !ref <epoch_counter>
      tokenizer: !ref <label_encoder>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
   save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
   split_tokens: True
