# Model definition. Each `target` is a dotted class path and the sibling
# `params` mapping holds its arguments (presumably passed as constructor
# keyword arguments by the config loader — confirm against the loader).
model_type: soundstorm
model:
  target: nn_ss.sound_synthesis2.modeling.controlspeech_model.ControlSpeech
  params:
    content_info: {key: audio}
    condition_info: {key: text}
    # n_q is repeated at three nesting levels in this block (all 6 here);
    # presumably the values must agree — confirm before changing only one.
    n_q: 6  # number of EnCodec codebooks
    diffusion_config:
      target: nn_ss.sound_synthesis2.modeling.transformers.jsp1_controlspeech_mdn.DiffusionTransformer
      params:
        transformer_config:
          target: nn_ss.sound_synthesis2.modeling.transformers.jsp2_controlspeech_mdn3.Text2ImageTransformer
          params:
            n_q: 6
            n_layer: 8  # 16 may be used for a larger model
            n_embd: 512  # embedding dimension
            n_head: 16
            semantic_token_nums: 500
            acoustic_token_nums: 1024  # 1024-2048

            content_emb_config:
              target: nn_ss.sound_synthesis2.modeling.embeddings.dalle_mask_wav_embedding1d_3code.DalleMaskImageEmbedding
              params:
                # Original note: "should be quantize_number 1036-2050".
                # NOTE(review): 1026 lies outside that stated range and is
                # acoustic_token_nums + 2 — confirm which is intended.
                num_embed: 1026
                max_size: 24000
                n_q: 6
                embed_dim: 512  # position-embedding dimension
                trainable: true
                pos_emb_type: embedding


# Dataset settings. Training and validation point at the same corpus
# directory and differ only in the stage name ('train' vs 'val'); presumably
# the stage selects the split inside that directory — confirm in the dataset
# code. The commented-out entries are an earlier corpus location.
# train_corpus: /home/disk2/gongxuefei/DATA/phone_encodec_24k/file_compression
train_corpus: /home/disk2/nips/Data/2024nips/0509
train_stage: 'train'
# Same value as model.params.n_q (6).
train_stage_nq: 6
# valid_corpus: /home/disk2/gongxuefei/DATA/phone_encodec_24k/file_compression
valid_corpus: /home/disk2/nips/Data/2024nips/0509
valid_stage: 'val'
valid_stage_nq: 6
# Presumably an upper bound on the total number of frames per batch
# (frame-based dynamic batching) — confirm against the data loader.
max_frames_in_batch: 3000


# Output directories for checkpoints and TensorBoard logs. The two
# commented-out entries below are an earlier output location.
# checkpoint_dir: /home/disk2/gongxuefei/Project/1_vocder_upgrade/AudioLM/StyleTrolNet/exp_output/model
# tensorboard_dir: /home/disk2/gongxuefei/Project/1_vocder_upgrade/AudioLM/StyleTrolNet/exp_output/tensorborad

checkpoint_dir: /home/disk2/nips/Result/controlspeech/train/model_styletrolnet
# NOTE(review): "tensorborad" looks like a misspelling of "tensorboard". The
# path is left unchanged because it is a runtime value and existing logs may
# already live there.
tensorboard_dir: /home/disk2/nips/Result/controlspeech/train/tensorborad_styletrolnet

# Training-loop settings.
# NOTE(review): start_epoch is 0 while pretrained_checkpoint (end of file)
# points at a file named soundstorm_4.pt — confirm whether the epoch counter
# is meant to restart when loading that checkpoint.
start_epoch: 0
max_epoch: 100
grad_clip: 5.0
accum_grad: 8 # gradient accumulation
# Presumably both intervals are counted in training steps — confirm.
log_interval: 800
save_steps: 4000
# Checkpoint retention / averaging knobs; exact semantics are defined by the
# trainer and are not visible here.
keep_last_k_ckpt: 10
avg_ckpt_num: 5
keep_topk_dir: false

# Optimizer: class name plus the arguments in optim_conf (presumably passed
# through as keyword arguments — confirm in the trainer).
optim: AdamW
optim_conf:
  lr: 0.0005
  betas: [0.9, 0.95]
  weight_decay: 0.01
# Learning-rate schedule, early stopping and top-k checkpoint selection.
scheduler_conf:
  # Presumably: stop after this many epochs without improvement — confirm.
  early_stop_patient_n_epochs: 10
  # Presumably: a lower value of the monitored metric is better — confirm.
  lower_better: true
  warmup_n_steps: 1800
  noam: true
  save_checkpoints_topk: 5

# Paths to a previously saved model checkpoint and optimizer state. Both live
# under model_styletrolnet_final1, which is a different directory from
# checkpoint_dir in this file.
pretrained_checkpoint: /home/disk2/nips/Result/controlspeech/train/model_styletrolnet_final1/soundstorm_4.pt
pretrained_optimizer: /home/disk2/nips/Result/controlspeech/train/model_styletrolnet_final1/optimizer_4.pt