# @package _group_

# Global run settings: precision, logging, seeding and plugin location.
common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  # `???` is OmegaConf's mandatory-value marker: no default is given here, so
  # a value has to be supplied when the run is launched.
  tensorboard_logdir: ???
  # Relative path — presumably resolved against the launch directory; confirm.
  user_dir: examples/celsds
  # NOTE(review): "innder_emo" looks like a typo for "inner_emo" (compare the
  # criterion name cosyvoice2_inneremo). Left unchanged because renaming would
  # send runs to a different W&B project — confirm before fixing.
  wandb_project: innder_emo

# Checkpoint writing and retention.
checkpoint:
  # `???` marks a mandatory value (OmegaConf): must be provided at launch.
  save_dir: ???
  # Presumably: write a checkpoint every 3000 updates and retain only the most
  # recent interval checkpoint — confirm against fairseq's checkpoint options.
  save_interval_updates: 3000
  keep_interval_updates: 1
  no_epoch_checkpoints: true

# Distributed-training settings. A world size of 1 presumably means a single
# process/device — confirm, and raise it when launching on several devices.
distributed_training:
  ddp_backend: legacy_ddp
  distributed_world_size: 1

# Task selection plus the data and pretrained-model locations it reads.
# All paths below are relative — presumably to the launch directory; confirm.
task:
  _name: cosyvoice_emosft_dataset
  data: ./datas/wesc/supervision/infos/
  # Sample-size limits; the unit (tokens, frames, ...) is not visible in this
  # file — TODO confirm against the task implementation.
  max_sample_size: 1000
  min_sample_size: 30
  max_valid_sample_size: 300
  pretrained_home: ./pretrained_models/CosyVoice2-0.5B/
  prompt_json: ./datas/grouped_emo_speaker_data.json
  # Both values are quoted on purpose so they stay strings: unquoted,
  # 0_1_2_3_4_5_6 would be read as a number by YAML 1.1 loaders.
  # Presumably underscore-separated lists parsed by the task — confirm.
  attention_bias: "0_1_2_3_4_5_6"
  attention_range: "1.0_5.0"

# Data-loading, batching and validation-schedule settings.
dataset:
  train_subset: train
  valid_subset: dev
  num_workers: 2
  max_tokens: 1500
  # Same numeric value as task.max_valid_sample_size (300) — presumably kept
  # in sync deliberately; confirm.
  max_tokens_valid: 300
  # NOTE(review): both intervals are very large (validate_interval_updates is
  # far above optimization.max_update = 600000), which presumably disables
  # periodic validation in practice — confirm that this is intended.
  validate_interval: 100000
  validate_interval_updates: 20000000000

# Training criterion, selected by its registered name. The name is not a
# built-in one, so it is presumably registered under common.user_dir — confirm.
criterion:
  _name: cosyvoice2_inneremo
     
# Optimisation schedule: update budget, learning rate and gradient clipping.
optimization:
  max_update: 600000
  # Single-element list, i.e. one learning rate of 1e-6.
  lr: [0.000001]
  clip_norm: 1.0

# Optimizer selection and Adam hyper-parameters.
optimizer:
  _name: adam
  # Plain scalar, so YAML loads this as the string "(0.9,0.99)"; presumably
  # the optimizer parses it into a pair of betas — confirm.
  adam_betas: (0.9,0.99)
  # Written with an explicit decimal point so that every YAML loader reads a
  # float: the exponent-only form "1e-06" is loaded as a string by YAML 1.1
  # loaders such as plain PyYAML.
  adam_eps: 1.0e-06

# Learning-rate scheduler, selected by name. "fixed" presumably keeps the
# rate from optimization.lr constant — confirm against the scheduler's docs.
lr_scheduler:
  _name: fixed

# Model architecture selection and its size / pretrained-weight settings.
model:
  _name: cosyvoice2_inneremo_causalalign_fixattb
  text_encoder_input_size: 512
  # LLM input and output widths are set to the same value (896).
  llm_input_size: 896
  llm_output_size: 896
  # Vocabulary sizes for the text and speech token streams — presumably these
  # must match the pretrained checkpoint's tokenizers; confirm.
  text_token_size: 51866
  speech_token_size: 6561
  # Lowercase canonical boolean, consistent with the other flags in this file.
  length_normalized_loss: true
  # Presumably the label-smoothing weight (0 = disabled) — confirm.
  lsm_weight: 0
  spk_embed_dim: 192
  # Relative path — presumably resolved against the launch directory; confirm.
  qwen_pretrained_path: ./pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN
  aligner_layer: 2
  tts_freeze: false