# Model definitions. Two entries are configured: `codecsep_params`
# (name: CodecSep) and `codec_params` (name: SDCodec). Both carry the same
# latent_dim, track list and DAC encoder/decoder settings.
model:
  codecsep_params:
    name: CodecSep
    latent_dim: 1024
    tracks: [speech, music, sfx]
    enc_params:
      name: DACEncoder
      d_model: 64
      # dec_params.strides below is this list reversed.
      strides: [2, 4, 5, 8]
    dec_params:
      name: DACDecoder
      d_model: 1536
      strides: [8, 5, 4, 2]
    transformer_params:
      name: Transformer
      d_model: 256
      nhead: 8
      dim_feedforward: 1024
      dropout: 0.1
      batch_first: true
      num_layers: 16
      norm_first: true
    separator_params:
      name: Separator
      # Same count as the number of entries in `tracks`.
      num_spks: 3
      # Same values as latent_dim (1024) and transformer_params.d_model
      # (256); presumably these must match -- confirm against the model code.
      channels: 1024
      block_channels: 256
      # NOTE(review): `pretrain` is nested under separator_params here and
      # also appears directly under `model`; confirm both levels are intended.
      pretrain: {}
  codec_params:
    name: SDCodec
    latent_dim: 1024
    tracks: [speech, music, sfx]
    enc_params:
      name: DACEncoder
      d_model: 64
      strides: [2, 4, 5, 8]
    dec_params:
      name: DACDecoder
      d_model: 1536
      strides: [8, 5, 4, 2]
    quant_params:
      name: MultiSourceRVQ
      # The list-valued settings below have three entries each; presumably
      # one per track, in `tracks` order -- confirm against the quantizer.
      n_codebooks: [12, 12, 12]
      codebook_size: [1024, 1024, 1024]
      codebook_dim: [8, 8, 8]
      quantizer_dropout: 0.0
      code_jit_prob: [0.0, 0.0, 0.0]
      code_jit_size: [3, 5, 3]
      shared_codebooks: 0
  # Empty mapping: no model-level pretrain entries are configured.
  pretrain: {}
# Dataset settings for the train / validation / test splits.
dataset:
  # NOTE(review): unlike valset_cfg and testset_cfg, no tsv_filepath is given
  # for the training set; presumably its sources are configured elsewhere.
  trainset_cfg:
    n_examples: 10000000
    # chunk_size differs per split (2.0 / 5.0 / 10.0); presumably a duration
    # in seconds -- TODO confirm the unit against the dataset code.
    chunk_size: 2.0
    trim_silence: false
    use_background: true
  valset_cfg:
    # Path has no file extension; presumably resolved by the loader -- verify.
    tsv_filepath: datasets/audiocaps_valid
    chunk_size: 5.0
  testset_cfg:
    tsv_filepath: datasets/audiocaps_test
    chunk_size: 10.0
# Training settings: step schedule, augmentation transform, optimizer,
# LR scheduler, loss configuration and dataloader sizes.
# NOTE(review): the step counts are very small (100 total steps, eval every
# 3 steps) -- this looks like a smoke-test configuration; confirm before
# reusing it for a full run.
training:
  total_steps: 100
  warmup_steps: 5
  print_steps: 1
  eval_steps: 3
  vis_steps: 2
  test_steps: 2
  early_stop: 50
  grad_clip: 10.0
  save_iters: [10, 15]
  vis_idx: [10, 20, 30]
  seed: 42
  transform:
    # Per-key loudness values; presumably LUFS targets in dB with `var` as a
    # random spread around them -- confirm against the transform code.
    lufs_norm_db:
      speech: -17
      music: -24
      sfx: -21
      mix: -27
      var: 2
    peak_norm_db: -0.5
    # Three weights summing to 1.0; presumably probabilities over the number
    # of active sources -- confirm.
    random_num_sources: [0.2, 0.2, 0.6]
    random_swap_prob: 0.5
  optimizer:
    name: AdamW
    lr: 0.0001
    betas: [0.8, 0.99]
  scheduler:
    name: ExponentialLRScheduler
    # Same values as training.total_steps / training.warmup_steps above;
    # presumably they must be kept in sync -- confirm.
    total_steps: 100
    warmup_steps: 5
    lr_min_ratio: 0.0
    gamma: 0.999
  loss:
    MultiScaleSTFTLoss:
      window_lengths: [2048, 512]
    MelSpectrogramLoss:
      # Four lists of seven entries each; presumably index-aligned, one entry
      # per mel resolution -- confirm against the loss implementation.
      n_mels: [5, 10, 20, 40, 80, 160, 320]
      window_lengths: [32, 64, 128, 256, 512, 1024, 2048]
      mel_fmin: [0, 0, 0, 0, 0, 0, 0]
      mel_fmax: [null, null, null, null, null, null, null]
      pow: 1.0
      clamp_eps: 1.0e-05
      mag_weight: 0.0
    # Weights keyed by loss-term name.
    # NOTE(review): adv/* and vq/* terms are weighted here although no
    # adversarial loss is configured in this section and the top-level
    # `use_codec_loss` is false; confirm which of these keys are consumed.
    lambdas:
      mel/loss: 15.0
      adv/feat_loss: 2.0
      adv/gen_loss: 1.0
      vq/commitment_loss: 0.25
      vq/codebook_loss: 1.0
  dataloader:
    num_workers: 8
    train_bs: 2
    eval_bs: 16
# Top-level run flags.
use_codec_loss: false
# Presumably in Hz -- TODO confirm against the audio loading code.
sampling_rate: 16000
# NOTE(review): resume is enabled while resume_dir is null; presumably the
# trainer falls back to a default directory -- verify.
resume: true
resume_dir: null
backup_code: true
