---
# Model definition. The outer keys configure the CodecFormer network;
# `codec_params` nests a second, self-contained SDCodec configuration.
model:
  name: CodecFormer
  latent_dim: 1024
  tracks: [speech, music, sfx]

  # Encoder and decoder stride lists are reverses of each other
  # (2 * 4 * 5 * 8 = 320 overall).
  enc_params:
    name: DACEncoder
    d_model: 64
    strides: [2, 4, 5, 8]
  dec_params:
    name: DACDecoder
    d_model: 1536
    strides: [8, 5, 4, 2]

  transformer_params:
    name: Transformer
    d_model: 256
    nhead: 8
    dim_feedforward: 1024
    dropout: 0.1
    batch_first: true
    num_layers: 16
    norm_first: true

  # `channels` equals `latent_dim` above and `block_channels` equals the
  # transformer `d_model`; `num_spks` equals the number of tracks.
  # NOTE(review): presumably these must be kept in agreement - confirm.
  separator_params:
    name: Separator
    num_spks: 3
    channels: 1024
    block_channels: 256
  pretrain: {}

  # Nested codec configuration. Its latent_dim, tracks, enc_params and
  # dec_params repeat the values given at the top of `model`.
  # NOTE(review): presumably the two copies must stay in sync - confirm.
  codec_params:
    name: SDCodec
    latent_dim: 1024
    tracks: [speech, music, sfx]
    enc_params:
      name: DACEncoder
      d_model: 64
      strides: [2, 4, 5, 8]
    dec_params:
      name: DACDecoder
      d_model: 1536
      strides: [8, 5, 4, 2]
    # Every list below has three entries.
    # NOTE(review): presumably one entry per track, in `tracks` order -
    # confirm against the quantizer implementation.
    quant_params:
      name: MultiSourceRVQ
      n_codebooks: [12, 12, 12]
      codebook_size: [1024, 1024, 1024]
      codebook_dim: [8, 8, 8]
      quantizer_dropout: 0.0
      code_jit_prob: [0.0, 0.0, 0.0]
      code_jit_size: [3, 5, 3]
      shared_codebooks: 0
    pretrain: {}
# Discriminator configuration.
discriminator:
  name: Discriminator
  # Empty list (not null).
  rates: []
  periods: [2, 3, 5, 7, 11]
  fft_sizes: [2048, 1024, 512]
  # Five contiguous [low, high] pairs that together cover 0.0 to 1.0.
  # NOTE(review): presumably fractions of the frequency axis - confirm.
  bands:
  - [0.0, 0.1]
  - [0.1, 0.25]
  - [0.25, 0.5]
  - [0.5, 0.75]
  - [0.75, 1.0]
# Dataset manifests and chunking for the train / validation / test splits.
dataset:
  # Training uses one list of manifest files per track.
  trainset_cfg:
    speech: [manifest/speech_dnr.csv]
    music: [manifest/music_dnr.csv]
    sfx: [manifest/sfx_dnr.csv]
    n_examples: 10000000
    # NOTE(review): chunk_size is presumably in seconds - confirm.
    chunk_size: 1.5
    trim_silence: false
  # Validation and test each read a single manifest path and use longer
  # chunks (5.0 and 10.0) than training (1.5).
  valset_cfg:
    tsv_filepath: manifest/val_dnr.csv
    chunk_size: 5.0
  testset_cfg:
    tsv_filepath: manifest/test_dnr.csv
    chunk_size: 10.0
# Training schedule, augmentation, optimisation and loss settings.
training:
  # Step-based schedule.
  total_steps: 400000
  warmup_steps: 10000
  print_steps: 500
  eval_steps: 5000
  vis_steps: 10000
  test_steps: 30000
  early_stop: 15
  grad_clip: 10.0
  # Listed every 50000 steps up to 350000; total_steps (400000) itself
  # is not in the list.
  save_iters: [50000, 100000, 150000, 200000, 250000, 300000, 350000]
  vis_idx: [100, 200, 300]
  seed: 42

  transform:
    # One loudness value per track plus `mix` and `var`.
    # NOTE(review): `var` is presumably a random +/- range around each
    # target - confirm.
    lufs_norm_db:
      speech: -17
      music: -24
      sfx: -21
      mix: -27
      var: 2
    peak_norm_db: -0.5
    # Three weights that sum to 1.0.
    random_num_sources: [0.6, 0.2, 0.2]
    random_swap_prob: 0.5

  optimizer:
    name: AdamW
    lr: 0.00015
    betas: [0.8, 0.99]

  # NOTE(review): lr * factor = 0.000075 is below min_lr (0.0001), so
  # under standard PyTorch ReduceLROnPlateau semantics a single reduction
  # would already reach the floor - confirm this is intended.
  scheduler:
    name: ReduceLROnPlateau
    mode: min
    factor: 0.5
    patience: 2
    threshold: 0.0001
    threshold_mode: rel
    cooldown: 0
    min_lr: 0.0001

  loss:
    MultiScaleSTFTLoss:
      window_lengths: [2048, 512]
    # The four lists below each have seven entries.
    # NOTE(review): presumably index-aligned, one entry per mel scale -
    # confirm.
    MelSpectrogramLoss:
      n_mels: [5, 10, 20, 40, 80, 160, 320]
      window_lengths: [32, 64, 128, 256, 512, 1024, 2048]
      mel_fmin: [0, 0, 0, 0, 0, 0, 0]
      mel_fmax: [null, null, null, null, null, null, null]
      pow: 1.0
      clamp_eps: 1.0e-05
      mag_weight: 0.0
    # Per-term loss weights.
    # NOTE(review): there is no STFT key here although MultiScaleSTFTLoss
    # is configured above - confirm that term is meant to be unweighted.
    lambdas:
      mel/loss: 15.0
      adv/feat_loss: 2.0
      adv/gen_loss: 1.0
      vq/commitment_loss: 0.25
      vq/codebook_loss: 1.0

  dataloader:
    num_workers: 8
    train_bs: 8
    eval_bs: 32
# Run-level settings.
# NOTE(review): presumably the audio sample rate in Hz - confirm.
sampling_rate: 16000
# Resuming is enabled while resume_dir is null.
# NOTE(review): presumably a null resume_dir falls back to a default
# checkpoint location - confirm against the training script.
resume: true
resume_dir: null
backup_code: true
