################################
# Recipe for Training BPE tokenizer on discrete SSL tokens
################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref results/LibriSpeech/subwording/discrete-ssl-bpe/<seed>
save_folder: !ref <output_folder>/save


# Data files
data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech
train_splits: ["train-clean-100"]
dev_splits: []
test_splits: []
skip_prep: False
ckpt_interval_minutes: 25 # save checkpoint every N min
train_csv: !ref <output_folder>/train.csv
sample_rate: 16000
tokenized_train: !ref <output_folder>/tokenized_train.csv
vocab_size: 1000
unk_id: 1
pad_id: 0

# ssl_model_type: hubert, wavlm, wav2vec2
# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large,  facebook/wav2vec2-large
ssl_model_type: hubert # hubert, wavml or wav2vec2
ssl_hub: facebook/hubert-large-ll60k
ssl_folder: !ref <save_folder>/ssl_checkpoint
kmeans_repo_id: speechbrain/SSL_Quantization
kmeans_cache_dir: !ref <save_folder>/kmeans_checkpoint
kmeans_dataset: LibriSpeech-100-360-500
num_clusters: 800
freeze_ssl: True
freeze_feature_extractor: True
# Layer number should be among the supported layers for discrete SSL models(kmenas  model should be available for that layer)
ssl_layer_num: [7, 23]
deduplicate: [False, False]
bpe_tokenizer_path: [null, null]


tokenizer_config:
    SSL_layers: !ref <ssl_layer_num>
    deduplicates: !ref <deduplicate>
    bpe_tokenizers: !ref <bpe_tokenizer_path>


ssl_model: !apply:speechbrain.utils.hparams.choice
    value: !ref <ssl_model_type>
    choices:
        wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM
            source: !ref <ssl_hub>
            output_norm: False
            freeze: !ref <freeze_ssl>
            freeze_feature_extractor: !ref <freeze_feature_extractor>
            output_all_hiddens: True
            save_path: !ref <ssl_folder>
        hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
            source: !ref <ssl_hub>
            output_norm: False
            freeze: !ref <freeze_ssl>
            freeze_feature_extractor: !ref <freeze_feature_extractor>
            output_all_hiddens: True
            save_path: !ref <ssl_folder>
        wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
            source: !ref <ssl_hub>
            output_norm: False
            freeze: !ref <freeze_ssl>
            freeze_feature_extractor: !ref <freeze_feature_extractor>
            output_all_hiddens: True
            save_path: !ref <ssl_folder>

discrete_ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL
    save_path: !ref <kmeans_cache_dir>
    ssl_model: !ref <ssl_model>
    kmeans_dataset: !ref <kmeans_dataset>
    kmeans_repo_id: !ref <kmeans_repo_id>
    num_clusters: !ref <num_clusters>
