xvector: false  # Whether to use x-vectors for speaker modeling.

perform_reflow: false  # If true, noise_scp must also be specified.

# Training settings: epoch count, batch size, learning rate, RNG seed and
# save interval.
train:
    test_size: 4  # NOTE(review): presumably the number of held-out test utterances; confirm
    n_epochs: 3000
    batch_size: 24
    # The explicit !!float tag forces a float: YAML 1.1 loaders (e.g. PyYAML)
    # resolve a bare 5e-5 (no decimal point) as a string.
    learning_rate: !!float 5e-5
    seed: 37
    save_every: 1000  # NOTE(review): unit (epochs vs. steps) is not visible here; confirm
    use_gt_dur: true  # presumably "use ground-truth durations" (cf. utt2phn_duration under data); confirm

# Acoustic-feature parameters and the locations of the dataset files.
data:
    sampling_rate: 16000  # presumably Hz; confirm
    n_mel_channels: 80
    add_blank: false
    hop_length: 200  # presumably in samples (12.5 ms at 16 kHz); confirm

    # NOTE(review): this path is rooted at "data/libri/", unlike the
    # "./VoiceFlow-TTS/..." dataset paths below; confirm that both resolve
    # from the same working directory.
    phn2id: "data/libri/phones.txt"

    # Training set: every path points at the "member" split.
    train:
        utts: "./VoiceFlow-TTS/voiceflow-dataset/LibriTTS/member/utts.list"
        utt2phns: "./VoiceFlow-TTS/voiceflow-dataset/LibriTTS/member/text"
        utt2phn_duration: "./VoiceFlow-TTS/voiceflow-dataset/LibriTTS/member/phn_duration"
        feats_scp: "./VoiceFlow-TTS/voiceflow-dataset/LibriTTS/feat-member/normed_fbank/feats.scp"
        utt2num_frames: "./VoiceFlow-TTS/voiceflow-dataset/LibriTTS/feat-member/normed_fbank/utt2num_frames"
        utt2spk: "./VoiceFlow-TTS/voiceflow-dataset/LibriTTS/member/utt2spk_id.json"

    # Validation set: same keys as `train`, but every path points at the
    # "nonmember" split.
    val:
        utts: "./VoiceFlow-TTS/voiceflow-dataset/LibriTTS/nonmember/utts.list"
        utt2phns: "./VoiceFlow-TTS/voiceflow-dataset/LibriTTS/nonmember/text"
        utt2phn_duration: "./VoiceFlow-TTS/voiceflow-dataset/LibriTTS/nonmember/phn_duration"
        feats_scp: "./VoiceFlow-TTS/voiceflow-dataset/LibriTTS/feat-nonmember/normed_fbank/feats.scp"
        utt2num_frames: "./VoiceFlow-TTS/voiceflow-dataset/LibriTTS/feat-nonmember/normed_fbank/utt2num_frames"
        utt2spk: "./VoiceFlow-TTS/voiceflow-dataset/LibriTTS/nonmember/utt2spk_id.json"

# Model architecture hyperparameters.
model:
    n_vocab: 148  # NOTE(review): presumably must match the number of entries in data.phn2id; confirm
    n_spks: 8976  # NOTE(review): presumably must cover every speaker id in the utt2spk files; confirm
    spk_emb_dim: 64
    n_enc_channels: 192
    filter_channels: 768
    filter_channels_dp: 256
    n_enc_layers: 6
    enc_kernel: 3
    enc_dropout: 0.1
    n_heads: 2
    window_size: 4
    dec_dim: 128
    pe_scale: 1000
    fm_type: "CFM"  # one of: "FM", "CFM"
    fm_net_type: "unet"  # one of: "unet", "diffsinger"
    shift_by_mu: false
    condition_by_mu: true