# Hyperparameters grouped under `transformer`.
# NOTE(review): keys use both `encoder_*` and `conv_*` prefixes; which module
# reads each one is not visible in this file - confirm against the model code.
transformer:
  encoder_layer: 3
  encoder_hidden: 512
  encoder_kernel_size: 5
  conv_filter_size: 1024
  # Two kernel sizes; presumably one per stacked convolution - TODO confirm.
  conv_kernel_size: [9, 1]
  # Both values lie in [0, 1]; presumably probabilities - TODO confirm.
  encoder_dropout: 0.5
  encoder_zoneout: 0.1

# Settings grouped under `variance_predictor`.
variance_predictor:
  # Same value as `transformer.encoder_hidden` (512).
  # NOTE(review): presumably the two widths must match - confirm before
  # changing either one.
  variance_hidden: 512

# Settings grouped under `wavegrad`.
# The `upsampling_*` lists each have 5 entries and the `downsampling_*` lists
# each have 4; presumably one entry per stage, matched by position - confirm
# against the model code before changing the length of any single list.
wavegrad:
  upsampling_rate: 300
  # 5 * 5 * 3 * 2 * 2 = 300, i.e. the product equals `upsampling_rate`.
  # NOTE(review): presumably these two must stay in sync - confirm.
  factors: [5, 5, 3, 2, 2]
  upsampling_preconv_out_channels: 768
  upsampling_out_channels: [512, 512, 256, 128, 128]
  # Alternative dilation schedule, kept for reference (disabled):
  # upsampling_dilations:
  #   - [1, 2, 1, 2]
  #   - [1, 2, 1, 2]
  #   - [1, 2, 4, 8]
  #   - [1, 2, 4, 8]
  #   - [1, 2, 4, 8]
  upsampling_dilations:
    - [1, 2, 4, 8]
    - [1, 2, 4, 8]
    - [1, 2, 4, 8]
    - [1, 2, 1, 2]
    - [1, 2, 1, 2]
  downsampling_preconv_out_channels: 32
  downsampling_out_channels: [128, 128, 256, 512]
  downsampling_dilations:
    - [1, 2, 4]
    - [1, 2, 4]
    - [1, 2, 4]
    - [1, 2, 4]

# Boolean toggle. Written as lowercase `false`, the canonical spelling that
# resolves to a boolean under YAML 1.1 and every YAML 1.2 schema.
multi_speaker: false

# Presumably an upper bound on sequence length.
# NOTE(review): the unit (tokens, frames, ...) and where the limit is enforced
# are not visible in this file - confirm against the consumer.
max_seq_len: 1000
