# Parent configuration files that this config builds on.
# NOTE(review): presumably loaded in list order by the project's config
# loader, with keys defined in this file taking precedence — confirm.
base_config:
  - configs/tts/base.yaml
  - configs/tts/base_zh.yaml


# Dataset selection and held-out split settings. Everything is empty/zero
# in this file.
# NOTE(review): presumably placeholders meant to be overridden by a
# dataset-specific config that lists this file as a base — confirm.
datasets: []
test_prefixes: []
test_num: 0
valid_num: 0

# Dotted import paths of the pre-alignment and binarizer implementations
# under data_gen.singing.
pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
binarizer_cls: data_gen.singing.binarize.SingingBinarizer
# Options nested under pre_align_args.
# NOTE(review): presumably passed to the pre-align class above; "mfa"
# presumably selects the Montreal Forced Aligner and use_sox presumably
# enables SoX-based audio processing — confirm against the consumer.
pre_align_args:
  use_tone: false # for ZH
  forced_align: mfa
  use_sox: true
# Spectrogram / mel feature settings.
# NOTE(review): hop/FFT/window sizes are presumably in samples and
# fmin/fmax in Hz; the sample rate is not set in this file — confirm it in
# the base configs.
hop_size: 128            # Hop size.
fft_size: 512           # FFT size.
win_size: 512           # Window size.
max_frames: 8000
fmin: 50                 # Minimum freq in mel basis calculation.
fmax: 11025               # Maximum frequency in mel basis calculation.
pitch_type: frame

# Model size and loss settings.
hidden_size: 256
# NOTE(review): looks like "|"-separated name:weight pairs (ssim and l1 at
# 0.5 each) — confirm the format against the code that parses mel_loss.
mel_loss: "ssim:0.5|l1:0.5"
# Every lambda_* value and predictor_grad is 0.0 in this config.
# NOTE(review): presumably these are loss weights / a gradient scale for the
# pitch, energy and duration predictors, so 0.0 disables them — confirm.
lambda_f0: 0.0
lambda_uv: 0.0
lambda_energy: 0.0
lambda_ph_dur: 0.0
lambda_sent_dur: 0.0
lambda_word_dur: 0.0
predictor_grad: 0.0
# Speaker conditioning flags: the embedding flag is on, the ID flag is off.
use_spk_embed: true
use_spk_id: false

# Training budget, speaker count and ground-truth flags.
# NOTE(review): max_tokens is presumably a per-batch cap and max_updates the
# total number of training steps — confirm against the trainer.
max_tokens: 20000
max_updates: 400000
num_spk: 100
save_f0: true
# NOTE(review): presumably makes the model consume ground-truth durations
# and F0 instead of predicted ones — confirm.
use_gt_dur: true
use_gt_f0: true
