# ###########################################################################################
# Model: CRDNN with EnCodec audio representations
# ###########################################################################################

# Used to build <output_folder> (results/<experiment_name>/<seed>)
experiment_name: encodec

# Seed needs to be set at top of YAML
# `__set_seed` applies torch.manual_seed to <seed>; keep both entries before any module definition
seed: 0
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Data preparation
# Dataset root; has no default and must be supplied by the user
data_folder: !PLACEHOLDER
splits: [trainset_28spk_wav, validset_wav, testset_wav]
num_valid_speakers: 2
# One CSV manifest per split, all located in <save_folder>
train_csv: !ref <save_folder>/trainset_28spk_wav.csv
valid_csv: !ref <save_folder>/validset_wav.csv
test_csv: !ref <save_folder>/testset_wav.csv

# Output folders
# Root folder of a run; the training log is written directly inside it
output_folder: !ref results/<experiment_name>/<seed>
# Holds the CSV manifests and the checkpoints
save_folder: !ref <output_folder>/save
# Taken from the `huggingface_hub` package constant instead of a hard-coded path
cache_folder: !name:huggingface_hub.constants.HUGGINGFACE_HUB_CACHE

# Save options
# Not referenced elsewhere in this file; read by the training script
compute_metrics: True
save_audios: False

# Preprocessing parameters
# Per-split duration thresholds
# NOTE(review): presumably utterances longer than the threshold are discarded - confirm in the data pipeline
train_remove_if_longer: 60.0  # Seconds
valid_remove_if_longer: 60.0  # Seconds
test_remove_if_longer: 60.0  # Seconds
# Compared against the string "random" to derive `shuffle` in <train_dataloader_kwargs>
sorting: random
use_cache: True

# Training parameters
num_epochs: 50
precision: fp32

# Batching and data loading
train_batch_size: 1
valid_batch_size: 1
test_batch_size: 1
grad_accumulation_factor: 16
dataloader_workers: 4

# Gradient safety
max_grad_norm: 5.0
nonfinite_patience: 10

# Checkpointing
ckpt_interval_minutes: 6000
keep_checkpoints: 1

# Augmentation switch and the probability forwarded to <augmentation>
augment: False
augment_prob: 0.75

# Optimizer parameters
# Forwarded to AdamW through <opt_class>; <lr> is also the scheduler's initial value
lr: 0.0005
weight_decay: 0.01
# Forwarded to NewBobScheduler through <scheduler>
improvement_threshold: 0.0025
annealing_factor: 0.9
patient: 1

# EnCodec parameters
# Alternative settings, one column per configuration:
# sample_rate: [24000, 24000, 24000, 24000, 24000]
# vocab_size: [1024, 1024, 1024, 1024, 1024]
# num_codebooks: [2, 4, 8, 16, 32]
# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0]
sample_rate: 24000
vocab_size: 1024
num_codebooks: 2
# Derived from <num_codebooks> (0.75 per codebook), so 2 codebooks -> 1.5 as in the table above
bandwidth: !ref <num_codebooks> * 75 / 100

# Embedding parameters
# Shared by <embedding>, <attention_mlp> and the last dimension of the <encoder> input shape
embedding_dim: 1024
pretrain_embedding: False  # If True, <embedding_dim> must match the codec's embedding size (128)
freeze_embedding: False

# Encoder parameters (all forwarded to the CRDNN <encoder>)
activation: !name:torch.nn.LeakyReLU
dropout: 0.1

# Convolutional blocks
cnn_blocks: 2
cnn_channels: (16, 16)
cnn_kernelsize: (3, 3)

# Pooling
inter_layer_pooling_size: (2, 2)
time_pooling_size: 1

# Recurrent layers
rnn_class: !name:speechbrain.nnet.RNN.LSTM
rnn_layers: 4
rnn_neurons: 256
rnn_bidirectional: True

# Fully-connected blocks; <dnn_neurons> is also the input size of <head>
dnn_blocks: 2
dnn_neurons: 256

# Augmentation
# Frequency-band dropout. Per the DropFreq API docs, the frequency bounds and the band width
# are expressed as fractions of the Nyquist frequency (sample rate / 2), not as probabilities.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: 0  # Lower bound of the frequencies that can be dropped
    drop_freq_high: 1  # Upper bound of the frequencies that can be dropped
    drop_freq_count_low: 1  # Min number of frequency bands to drop
    drop_freq_count_high: 3  # Max number of frequency bands to drop
    drop_freq_width: 0.05  # Width of frequency bands to drop

# Time-chunk dropout: drops between 1 and 5 chunks, each 1000 to 2000 samples long.
# `drop_length_*` bounds the length of every dropped chunk (in samples, per the DropChunk API),
# while `drop_count_*` bounds how many chunks are dropped.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: 1000  # Min length of audio chunks to drop
    drop_length_high: 2000  # Max length of audio chunks to drop
    drop_count_low: 1  # Min number of audio chunks to drop
    drop_count_high: 5  # Max number of audio chunks to drop

# Chains the two time-domain augmentations defined in this file.
# With min_augmentations == max_augmentations == 2 and two entries in `augmentations`,
# both are selected whenever augmentation is applied.
augmentation: !new:speechbrain.augment.augmenter.Augmenter
    augment_prob: !ref <augment_prob>
    min_augmentations: 2
    max_augmentations: 2
    shuffle_augmentations: False
    repeat_augment: 1
    parallel_augment: False
    concat_original: False
    augmentations: [!ref <drop_freq>, !ref <drop_chunk>]

# Modules
# Pretrained EnCodec model, kept frozen; it is listed in neither <modules> nor <model>
codec: !new:speechbrain.lobes.models.huggingface_transformers.encodec.Encodec
    source: facebook/encodec_24khz  # Only the 24kHz version supports mono audio
    save_path: !ref <cache_folder>
    freeze: True
    sample_rate: !ref <sample_rate>
    bandwidth: !ref <bandwidth>
    flat_embeddings: False
    renorm_embeddings: False

# Embedding layer for the discrete codec tokens (project-local class from `custom_model`)
embedding: !new:custom_model.Discrete_EmbeddingLayer
    num_codebooks: !ref <num_codebooks>
    vocab_size: !ref <vocab_size>
    emb_dim: !ref <embedding_dim>
    freeze: !ref <freeze_embedding>

# Attention MLP (project-local class from `custom_model`); input and hidden sizes both equal <embedding_dim>
# NOTE(review): presumably used to weight the per-codebook embeddings - confirm in custom_model
attention_mlp: !new:custom_model.AttentionMLP
    input_dim: !ref <embedding_dim>
    hidden_dim: !ref <embedding_dim>

# CRDNN encoder; the last input dimension is <embedding_dim>
encoder: !new:speechbrain.lobes.models.CRDNN.CRDNN
    input_shape: [null, null, !ref <embedding_dim>]
    activation: !ref <activation>
    dropout: !ref <dropout>
    # Convolutional blocks
    cnn_blocks: !ref <cnn_blocks>
    cnn_channels: !ref <cnn_channels>
    cnn_kernelsize: !ref <cnn_kernelsize>
    # Pooling
    using_2d_pooling: False
    inter_layer_pooling_size: !ref <inter_layer_pooling_size>
    time_pooling: True
    time_pooling_size: !ref <time_pooling_size>
    # Recurrent layers
    rnn_class: !ref <rnn_class>
    rnn_layers: !ref <rnn_layers>
    rnn_neurons: !ref <rnn_neurons>
    rnn_bidirectional: !ref <rnn_bidirectional>
    rnn_re_init: True
    use_rnnp: False
    # Fully-connected blocks
    dnn_blocks: !ref <dnn_blocks>
    dnn_neurons: !ref <dnn_neurons>

# Output projection from the encoder's DNN width to <num_codebooks> * <vocab_size> values
# (2 * 1024 = 2048 with the defaults above)
head: !new:torch.nn.Linear
    in_features: !ref <dnn_neurons>
    out_features: !ref <num_codebooks> * <vocab_size>

# Trainable modules by name; <codec> is intentionally absent from this mapping
modules:
    embedding: !ref <embedding>
    attention_mlp: !ref <attention_mlp>
    encoder: !ref <encoder>
    head: !ref <head>

# The same four modules wrapped in a single ModuleList, which is what <checkpointer> saves as `model`.
# The outer list holds the positional arguments; the inner list is the ModuleList content.
model: !new:torch.nn.ModuleList
    [[!ref <embedding>,
      !ref <attention_mlp>,
      !ref <encoder>,
      !ref <head>]]

# Loss functions
# NOTE(review): the key is named `ce_loss` but it binds `nll_loss`; per the SpeechBrain docs that
# function takes log-probabilities - confirm the training script applies log-softmax to the head output
ce_loss: !name:speechbrain.nnet.losses.nll_loss
    reduction: mean
    label_smoothing: 0.0
    allowed_len_diff: 0

# Optimizers
# AdamW with its hyperparameters pre-bound; the parameters to optimize are supplied later by the caller
opt_class: !name:torch.optim.AdamW
    lr: !ref <lr>
    weight_decay: !ref <weight_decay>
    betas: (0.9, 0.98)
    eps: 1.e-8

# Schedulers
# NewBob learning-rate scheduler starting from <lr>; its state is saved by <checkpointer>
scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: !ref <improvement_threshold>
    annealing_factor: !ref <annealing_factor>
    patient: !ref <patient>

# Dataloaders
# Keyword arguments for the per-split data loaders; only the training split sets `shuffle`
train_dataloader_kwargs:
    batch_size: !ref <train_batch_size>
    num_workers: !ref <dataloader_workers>
    pin_memory: True
    # Evaluates str.__eq__(<sorting>, "random"): True with the default `sorting: random`
    shuffle: !apply:str.__eq__ [!ref <sorting>, random]

valid_dataloader_kwargs:
    batch_size: !ref <valid_batch_size>
    num_workers: !ref <dataloader_workers>
    pin_memory: True

test_dataloader_kwargs:
    batch_size: !ref <test_batch_size>
    num_workers: !ref <dataloader_workers>
    pin_memory: True

# Performance metrics
# MetricStats factory bound to the classification-error metric with `reduction: batch`
# NOTE(review): "ter" presumably stands for token error rate - confirm in the training script
ter_computer: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

# Audio metrics from the project-local `metrics` package, all bound to <sample_rate>.
# Those that load a pretrained model name the hub model and the folder it is stored in.
dnsmos_computer: !name:metrics.dnsmos.DNSMOS
    sample_rate: !ref <sample_rate>

dwer_computer: !name:metrics.dwer.DWER
    sample_rate: !ref <sample_rate>
    model_hub: openai/whisper-small
    save_path: !ref <cache_folder>

wavlm_sim_computer: !name:metrics.spk_sim.SpkSimWavLM
    sample_rate: !ref <sample_rate>
    model_hub: microsoft/wavlm-base-sv
    save_path: !ref <cache_folder>

# This one stores its model in a dedicated sub-folder of <cache_folder>
ecapatdnn_sim_computer: !name:metrics.spk_sim.SpkSimECAPATDNN
    sample_rate: !ref <sample_rate>
    model_hub: speechbrain/spkrec-ecapa-voxceleb
    save_path: !apply:os.path.join [!ref <cache_folder>, models--speechbrain--spkrec-ecapa-voxceleb]

# Counters, checkpointers, loggers, etc.
# Epoch counter limited to <num_epochs>
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <num_epochs>

# Checkpoints go to <save_folder> and cover the model, the scheduler and the epoch counter
# (the frozen <codec> is not part of the recoverables)
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        scheduler: !ref <scheduler>
        counter: !ref <epoch_counter>

# Plain-text training log written at the root of <output_folder>
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <output_folder>/train_log.txt
