# Config example for training double-ended model: CNN-SA-Align-Fuse-SA-AP (SA: Self-Attention, AP: Attention-Pooling)

# Run name and paths

# Name of the current training run.
name: training_run_name
# Main input directory holding the dataset samples and csv files.
data_dir: 'C:/Users/Name/Downloads/NISQA_Corpus'
# Output directory; a new subfolder is created for the current run, holding
# the yaml, the results csv, and the stored model weights.
output_dir: 'C:/Users/Name/Downloads/trained_models'
# NOTE(review): this key was undocumented; presumably false means no
# pretrained weights are loaded -- confirm against the training script.
pretrained_model: false

# Dataset options

# Csv file with the MOS labels and file paths of all datasets. It must be
# placed in 'data_dir' and must contain the columns 'mos', 'noi', 'dis',
# 'col', 'loud' with the overall and dimension quality ratings.
csv_file: NISQA_corpus_file.csv
# Csv file with per-condition MOS used for evaluation (optional).
csv_con: null
# Csv column name holding the file path of the degraded speech sample; the
# path must be relative to 'data_dir'.
csv_deg: filepath_deg
# Csv column name holding the file path of the reference speech sample; the
# path must be relative to 'data_dir'.
csv_ref: filepath_ref
# Csv column name of the target training value (usually MOS).
csv_mos_train: mos
# Csv column name of the target validation value (usually MOS).
csv_mos_val: mos
# Dataset names of the training sets; the names must appear in the 'db'
# column of the csv file.
csv_db_train:
  - NISQA_TRAIN_SIM
  - NISQA_TRAIN_LIVE
# Dataset names of the validation sets; the names must appear in the 'db'
# column of the csv file.
csv_db_val:
  - NISQA_VAL_SIM
  - NISQA_VAL_LIVE

# Training options

# Maximum number of training epochs.
tr_epochs: 500
# Stop training if neither the validation RMSE nor the correlation 'r_p'
# improves for 'tr_early_stop' epochs.
tr_early_stop: 20
# Training mini-batch size (should be increased to 100-200 if enough GPU
# memory is available).
tr_bs: 40
# Validation mini-batch size (should be increased to 100-200 if enough GPU
# memory is available).
tr_bs_val: 40
# Learning rate of the ADAM optimiser.
tr_lr: 0.001
# Learning-rate patience: the learning rate is decreased if the loss does not
# improve for 'tr_lr_patience' epochs.
tr_lr_patience: 15
# Number of workers used by the PyTorch Dataloader (may cause problems on
# Windows machines -> set to 0).
tr_num_workers: 4
# Use PyTorch DataParallel for training on multiple GPUs.
# Booleans are written in canonical lowercase form (true/false).
tr_parallel: true
# Load the dataset into CPU RAM before training starts (increases speed on
# some systems; 'tr_num_workers' should then be set to 0 or 1).
tr_ds_to_memory: false
# Number of workers used for loading the data into CPU RAM (experimental).
tr_ds_to_memory_workers: 0
# Train on 'cpu' or 'cuda'; if null, 'cuda' is used if available.
tr_device: null
# Checkpointing mode:
#   every_epoch - store the model weights at each training epoch
#   best_only   - store only the weights with the best validation correlation
#   null        - store only the results but no model weights
tr_checkpoint: every_epoch
# Verbosity level:
#   0 - only basic results after each epoch
#   1 - more detailed results and bias loss information
#   2 - additionally shows a progress bar
tr_verbose: 2

# Bias loss options (optional)

# Set to 'first_order' if the bias loss should be applied, otherwise null.
tr_bias_mapping: null
# Minimum correlation threshold that must be reached before the bias is
# estimated (e.g. 0.7); set to null if no bias loss should be applied.
tr_bias_min_r: null
# Name of the anchor dataset (optional).
tr_bias_anchor_db: null

# Mel-Specs options

# Resample the speech signal to 'ms_sr' (usually not needed, because the
# window length 'ms_win_length' is adjusted automatically for different
# sample frequencies).
ms_sr: null
# Maximum considered Mel-band frequency (in Hz); set to 20k for fullband
# speech samples.
ms_fmax: 20000
# Padded fft window length (in bins); 4096 still fits a 40 ms window length,
# even for a 96 kHz sample rate.
ms_n_fft: 4096
# Hop length of the fft windowing (in seconds).
ms_hop_length: 0.01
# Fft window length (in seconds); it is zero-padded to match 'ms_n_fft'.
ms_win_length: 0.02
# Number of Mel bands.
ms_n_mels: 48
# Width of the extracted Mel-spec segments (in bins).
ms_seg_length: 15
# Hop length of the segments (in bins). Decreasing this may improve
# performance but increases memory usage and runtime.
# Segment hop length in seconds: ms_hop_length*ms_seg_hop_length.
ms_seg_hop_length: 4
# Maximum number of segments per sample. If samples of different duration are
# used they will be padded. One segment corresponds to 40 ms
# -> 0.04*1300 = 52 sec max sample duration. Increase this value if you apply
# the model to longer samples.
ms_max_segments: 1300
# Audio channel in case of a stereo file (0 -> left, 1 -> right); if null, a
# mono mix is used.
ms_channel: null

# Main model

# Model type:
#   NISQA     - single-ended
#   NISQA_DIM - single-ended with multidimension prediction
#   NISQA_DE  - double-ended
model: NISQA_DE

# Framewise options (usually CNN)

# Framewise model:
#   adapt    - CNN with adaptive maxpooling
#   standard - standard CNN
#   dff      - deep feed-forward network
#   skip     - skip framewise modelling
cnn_model: adapt
# Number of output channels of the first convolutional layer.
cnn_c_out_1: 16
# Number of output channels of the second convolutional layer.
cnn_c_out_2: 32
# Number of output channels of the last four convolutional layers.
cnn_c_out_3: 64
# NOTE(review): undocumented in the original; presumably the convolution
# kernel size. The python-specific tuple tag is rejected by safe YAML
# loaders -- confirm the loader used by the consumer before changing it.
cnn_kernel_size: !!python/tuple [3, 3]
# NOTE(review): undocumented in the original; presumably the dropout rate of
# the framewise model -- confirm.
cnn_dropout: 0.2
# Length of the CNN output feature vector; if null, the last fully connected
# layer is omitted.
cnn_fc_out_h: null
# Output dimensions of the first adaptive pooling ('adapt' CNN only).
cnn_pool_1: [24, 7]
# Output dimensions of the second adaptive pooling ('adapt' CNN only).
cnn_pool_2: [12, 5]
# Output dimensions of the third adaptive pooling ('adapt' CNN only).
cnn_pool_3: [6, 3]

# Time-Dependency options

# Time-dependency model:
#   self_att - Transformer based Self-Attention network
#              (td_lstm_.. options will be ignored)
#   lstm     - LSTM network (td_sa_.. options will be ignored)
td: self_att
# Attention network dimension.
td_sa_d_model: 64
# Number of heads.
td_sa_nhead: 1
# Apply positional encoding (no improvement in previous experiments).
td_sa_pos_enc: false
# Self-attention depth.
td_sa_num_layers: 2
# Hidden units of the self-attention feedforward network.
td_sa_h: 64
# Self-attention dropout.
td_sa_dropout: 0.1
# Number of LSTM hidden units.
td_lstm_h: null
# LSTM depth.
td_lstm_num_layers: null
# NOTE(review): undocumented in the original; presumably the LSTM dropout.
td_lstm_dropout: null
# Use a bidirectional LSTM -> hidden units x 2.
td_lstm_bidirectional: null

# Second Time-Dependency options (optional, for example for an
# LSTM-Self-Attention network)

# Time-dependency model:
#   self_att - Transformer based Self-Attention network
#              (LSTM options will be ignored)
#   lstm     - LSTM network (self-attention options will be ignored)
td_2: self_att
# Attention network dimension.
td_2_sa_d_model: 64
# Number of heads.
td_2_sa_nhead: 1
# Apply positional encoding (no improvement in previous experiments).
td_2_sa_pos_enc: false
# NOTE(review): undocumented in the original; presumably the self-attention
# depth, as for 'td_sa_num_layers'.
td_2_sa_num_layers: 2
# Hidden units of the self-attention feedforward network.
td_2_sa_h: 64
# Self-attention dropout.
td_2_sa_dropout: 0.1
# Number of LSTM hidden units.
td_2_lstm_h: null
# LSTM depth.
td_2_lstm_num_layers: null
# NOTE(review): undocumented in the original; presumably the LSTM dropout.
td_2_lstm_dropout: null
# Use a bidirectional LSTM -> hidden units x 2.
td_2_lstm_bidirectional: null

# Pooling options

# Pooling method:
#   att          - Attention-Pooling
#   avg          - average-pooling
#   max          - max-pooling
#   last_step    - last-step pooling
#   last_step_bi - last-step pooling with BiLSTMs
pool: att
# Number of hidden units of the attention-pooling feedforward network.
pool_att_h: 128
# Attention pooling dropout.
pool_att_dropout: 0

# Double-Ended options

# Attention mechanism for alignment:
#   'bahd' | 'luong' | 'dot' | 'cosine' | 'distance' | 'none'
de_align: cosine
# Attention application method for alignment: 'soft' | 'hard'
de_align_apply: hard
# Fusing mechanism: 'x/y/-' | '+/-' | 'x/y'
de_fuse: x/y/-
# Apply a fully connected layer to change the fuse output dimension; if null,
# the fully connected layer is skipped.
de_fuse_dim: null




