# Input data settings.
data:
  # NOTE(review): presumably the per-process batch size, since
  # train.distributed is enabled — confirm against the data loader.
  batch_size: 5
  # Same value as model.sample_rate; keep the two in sync.
  sampling_rate: 16000
  seg_len: 5  # sec
  # NOTE(review): presumably selects filterbank features as the input
  # representation — confirm against the dataset code.
  feature_type: fbank
  # No frontend path is configured.
  frontend_path: null

# Codec model settings: core architecture, semantic model, bottleneck
# transformer, and similarity-threshold options.
model:
  type: FlexiCodec
  # Same value as data.sampling_rate; keep the two in sync.
  sample_rate: 16000
  encoder_dim: 32
  # The five rates multiply to 1280, and decoder_rates is the same list
  # reversed.
  # NOTE(review): if these are per-stage strides, 16000 / 1280 gives
  # 12.5 frames per second — confirm.
  encoder_rates: [4, 4, 5, 8, 2]
  decoder_rates: [2, 8, 5, 4, 4]
  n_codebooks: 24
  quantizer_dropout: 1.0
  codebook_size: 4096
  semantic_codebook_size: 32768
  is_causal: false
  use_similarity_alignment: true
  # NOTE(review): use_dynamic_similarity_threshold is enabled below; it is
  # not visible from this file whether this fixed value is still used.
  similarity_threshold: 0.91
  # NOTE(review): 1.33333 looks like a truncated 4/3 — confirm the
  # intended precision with the consuming code.
  semantic_downsample_factor: 1.33333
  skip_normalize: true

  # SenseVoice-specific parameters
  semantic_model_type: "sensevoice"
  sensevoice_prepend_inputs: true
  latent_dim: 512
  ssl_dim: 512
  use_bottleneck_transformer: true
  transformer_num_layers: 32
  transformer_dim_feedforward: 2048
  transformer_num_heads: 8
  transformer_causal: false
  transformer_context_frames: 16
  # Relative path.
  # NOTE(review): presumably resolved against the working directory —
  # confirm.
  semantic_model_path: ./SenseVoiceSmall
  # No checkpoint is configured to resume from.
  resume_ckpt: null
  use_query_token_aggregator: true

  # Second transformer parameters
  # NOTE(review): transformer_2_num_layers is presumably ignored while
  # use_second_decoder_transformer is false — confirm.
  use_second_decoder_transformer: false
  transformer_2_num_layers: 24
  insert_query_before_downsample: false

  # Dynamic similarity threshold parameters
  use_dynamic_similarity_threshold: true
  similarity_threshold_lower: 0.7
  similarity_threshold_upper: 1.0
  use_fsq_for_semantic_vq: true

 
# Training settings: run-level flags plus one optimizer group each for the
# generator and the discriminator.
train:
  distributed: true
  use_hinge_loss: false
  reset_lr_schedulers: true
  # Generator and discriminator currently use identical values.
  # NOTE(review): betas and gamma are presumably the Adam betas and a
  # per-step exponential learning-rate decay factor — confirm against the
  # trainer.
  # Keep the `1.0e-4` form (decimal point and signed exponent): YAML 1.1
  # loaders such as PyYAML load `1e-4` as a string rather than a float.
  generator:
    lr: 1.0e-4
    betas: [0.8, 0.99]
    gamma: 0.999998
  discriminator:
    lr: 1.0e-4
    betas: [0.8, 0.99]
    gamma: 0.999998