# Quantizer settings.
# NOTE(review): the keys under init_args are presumably passed to the
# quantizer's constructor; confirm against the code that loads this file.
quantizer:
  init_args:
    # Presumably the number of entries in the codebook - TODO confirm.
    codebook_size: 1024
    # Presumably the channel width the quantizer works in - TODO confirm.
    channels: 256
# wav2vec2 model settings. Per the original author's note, this section
# covers both the encoder and the decoder.
# NOTE(review): how these values are split between the two is not visible
# in this file; confirm against the model definition.
wav2vec2:
  init_args:
    # 768 / 8 (num_heads below) = 96, so the width divides evenly across
    # heads if the model splits embed_dim per attention head.
    embed_dim: 768
    dropout_input: 0.1
    # Presumably the kernel size and group count of a convolutional
    # positional embedding - TODO confirm.
    pos_conv_kernel: 128
    pos_conv_groups: 16
    num_layers: 12
    num_heads: 8
    attention_dropout: 0.1
    # Presumably the hidden width of each feed-forward sub-layer and the
    # dropout applied inside it - TODO confirm.
    ff_interm_features: 4096
    ff_interm_dropout: 0.1
    dropout: 0.1
    # Presumably the probability of skipping a whole layer during
    # training - TODO confirm.
    layer_drop: 0.05
# Contrastive loss settings.
contrast_loss:
  init_args:
    # Presumably the number of negative samples drawn per target -
    # TODO confirm.
    num_distractors: 100
    # Presumably the divisor applied to similarity scores before the
    # softmax - TODO confirm.
    temperature: 0.1
# Optimizer settings. The scheduler section below refers back to `lr` as
# its initial learning rate.
optimizer:
  # Keep both the decimal point and the signed exponent: YAML 1.1 loaders
  # such as PyYAML read a bare `5e-4` as a string rather than a float.
  lr: 5.0e-4
  weight_decay: 0.0
  # NOTE(review): a two-element betas list suggests an Adam-family
  # optimizer; the optimizer class is not named in this file - confirm.
  betas: [0.9, 0.99]
# Learning-rate scheduler settings.
scheduler:
  # initial_lr is deliberately not set here; it is taken from optimizer.lr.
  # warmup_iters + decay_iters = 400000 iterations in total.
  warmup_iters: 32000
  decay_iters: 368000
  # NOTE(review): the meaning of this constant is not documented in this
  # file (possibly a factor relating the minimum lr to the initial lr);
  # confirm against the scheduler implementation.
  min_lr_constant: 100
# Top-level run settings.
batch_size: 128
num_epochs: 128
# NOTE(review): units are not stated here (presumably an upper bound on
# input length, e.g. in audio samples) - confirm against the data loader.
max_length: 250000