# task
# Path to a base configuration file.
# NOTE(review): presumably the keys below are layered on top of that file's
# values — confirm against the config loader.
base_config: configs/config_base.yaml

#############
# dataset
#############
# Data locations; both directories are left empty in this file.
data_dir: ''
raw_data_dir: ''
dict_file: dict
loud_norm: false
endless_ds: true

# Size limits.
test_num: 100
max_frames: 1550
max_input_tokens: 200

# Audio analysis settings.
audio_num_mel_bins: 80
audio_sample_rate: 22050
# Hop length in samples: 256 / 22050 Hz ~= 11.6 ms.
# (For comparison, 275 samples ~= 12.5 ms, i.e. 0.0125 * sample_rate.)
hop_size: 256
# Window length in samples: 1024 / 22050 Hz ~= 46.4 ms.
# (For comparison, 1100 samples ~= 50 ms, i.e. 0.05 * sample_rate.)
# If set to None, win_size falls back to n_fft.
win_size: 1024
# Minimum frequency. Suggested starting points: 55 for a male speaker,
# 95 for a female speaker (helps remove noise); test against the dataset.
# Pitch info: male ~[65, 260], female ~[100, 525].
fmin: 80
# Maximum frequency; increase or reduce depending on the data.
fmax: 7600
# FFT size. A window shorter than n_fft is zero-padded up to this size;
# win_size equals n_fft in this file.
n_fft: 1024
min_level_db: -100
ref_level_db: 20
power: 1
magnitude_power: 1
griffin_lim_iters: 60

# Speaker count and waveform generation.
num_spk: 1
vocoder: pwg
gen_wav_denoise: false

#########
# model
#########
# Network shape and regularization.
dropout: 0.1
# NOTE(review): eight entries, which equals enc_layers + dec_layers (4 + 4);
# presumably one operator id per layer — confirm against the model code.
arch: '8 8 8 8 13 13 13 13'
enc_layers: 4
dec_layers: 4
hidden_size: 384
prenet_hidden_size: 32
stop_token_weight: 5.0

# Optimization.
# NOTE(review): 2.0 is far larger than a typical raw Adam step size;
# presumably the scheduler rescales it (see warmup_updates) — confirm.
lr: 2.0
warmup_updates: 8000
optimizer_adam_beta1: 0.9
optimizer_adam_beta2: 0.98
weight_decay: 0
clip_grad_norm: 1

# Attention and feed-forward options.
attn_constraint: true
enc_ffn_kernel_size: 9
dec_ffn_kernel_size: 9
ffn_padding: SAME
use_new_ffn: true

# Reference encoder and speaker embedding.
# All four use_* switches are off in this file.
use_spk_embed: false
use_spk_id: false
use_ref_enc: false
use_ref_tfm: false
ref_norm_layer: bn_nonpad
latent_size: 32
# Each entry is a comma-separated pair stored as a string
# (presumably '<hidden>,<stride>', per the key name — confirm against the
# consumer). The entries are quoted so that no YAML loader can read the comma
# as a digit separator and turn e.g. 128,2 into the integer 1282.
ref_hidden_and_stride:
  - '128,2'
  - '128,2'
  - '128,1'
  - '256,1'
  - '256,1'
  - '256,1'

###########
# train and eval
###########
# Batch limits for training.
max_tokens: 20000
# NOTE(review): -1 presumably means "no per-batch sentence limit" — confirm.
max_sentences: -1

# Batch limits for evaluation.
max_eval_tokens: 60000
max_eval_sentences: 1

# Names of the dataset splits.
train_set_name: train
valid_set_name: valid
test_set_name: test

# Inference and output options; the vocoder checkpoint is left empty here.
vocoder_ckpt: ''
profile_infer: false
out_wav_norm: false

# Training length.
max_updates: 200010