################################
# Recipe for Training K-Means Clustering on CommonVoice Data
# Using Self-Supervised Model-Based Representations
#
# It is used for creating discrete audio representations from CommonVoice data.
################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref results/CommonVoice/clustering/hubert/<seed>
save_folder: !ref <output_folder>/save

# Data files
data_folder: !PLACEHOLDER  # e.g, /localscratch/cv-corpus-5.1-2020-06-22/fr
train_tsv_file: !ref <data_folder>/train.tsv  # Standard CommonVoice .tsv files
accented_letters: False
language: en # use 'it' for Italian, 'rw' for Kinyarwanda, 'en' for english
train_csv: !ref <save_folder>/train.csv
skip_prep: False # Skip data preparation
sample_rate: 16000

# We remove utterance slonger than 10s in the train/dev/test sets as
# longer sentences certainly correspond to "open microphones".
avoid_if_longer_than: 10.0

# ssl_model_type: hubert, wavlm, wav2vec2
# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large,  facebook/wav2vec2-large
ssl_model_type: hubert # hubert, wavml or wav2vec2
ssl_hub: facebook/hubert-large-ll60k
freeze_feature_extractor: True
freeze_ssl: True
ssl_folder: !ref <save_folder>/ssl_checkpoint
ssl_layer_num: 7
batch_size: 128 # batch_size for loading and extracting features. It is different from kmeans_batch_size.
dataloader_num_workers: 8
sorting: ascending
checkpoint_interval: 100


# Dataloader options
dataloader_options:
    batch_size: !ref <batch_size>
    num_workers: !ref <dataloader_num_workers>
    drop_last: True

ssl_model: !apply:speechbrain.utils.hparams.choice
    value: !ref <ssl_model_type>
    choices:
        wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM
            source: !ref <ssl_hub>
            output_norm: False
            freeze: !ref <freeze_ssl>
            freeze_feature_extractor: !ref <freeze_feature_extractor>
            output_all_hiddens: True
            save_path: !ref <ssl_folder>
        hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
            source: !ref <ssl_hub>
            output_norm: False
            freeze: !ref <freeze_ssl>
            freeze_feature_extractor: !ref <freeze_feature_extractor>
            output_all_hiddens: True
            save_path: !ref <ssl_folder>
        wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
            source: !ref <ssl_hub>
            output_norm: False
            freeze: !ref <freeze_ssl>
            freeze_feature_extractor: !ref <freeze_feature_extractor>
            output_all_hiddens: True
            save_path: !ref <ssl_folder>


####################
# Model Parameters #
####################
num_clusters: 128
init: k-means++
max_iter: 100
kmeans_batch_size: 1000 # should be >= num_clusters
tol: 0.0
max_no_improvement: 100
n_init: 20
reassignment_ratio: 0.0
