# @package __global__

# Classifier-free guidance settings: one value used for training
# (`training_dropout`) and one used for inference (`inference_coef`).
# NOTE(review): going by the key names, presumably a condition-dropout
# probability and a guidance scale — confirm against the consuming code.
classifier_free_guidance:
  training_dropout: 0.3
  inference_coef: 3.0

# Attribute dropout, grouped under `text` and `wav`. Both groups are empty
# mappings here, so this file configures no per-attribute entry.
attribute_dropout:
  text: {}
  wav: {}

# Fuser settings. `sum`, `prepend`, `cross` and `input_interpolate` each hold
# a list of condition names; only `cross` is populated, with `description`
# (the key declared under `conditioners` in this file).
# NOTE(review): presumably each list name is the fusion method applied to the
# conditions it contains — confirm against the fuser implementation.
fuser:
  cross_attention_pos_emb: false
  cross_attention_pos_emb_scale: 1
  sum: []
  prepend: []
  cross:
    - description
  input_interpolate: []

# Conditioner definitions, keyed by condition name. A single `description`
# conditioner is declared: `model` selects `clap`, and the nested `clap`
# mapping carries the settings for that model.
conditioners:
  description:
    model: clap
    clap:
      # Checkpoint location, written with a leading `//reference/` prefix.
      # NOTE(review): presumably resolved by the loading code — confirm.
      checkpoint: //reference/clap/music_audioset_epoch_15_esc_90.14.pt
      model_arch: 'HTSAT-base'
      enable_fusion: false
      # Audio input settings. NOTE(review): units are not stated in this file
      # (presumably Hz for `sample_rate`, seconds for the other two) — confirm.
      sample_rate: 48000
      max_audio_length: 10
      audio_stride: 1
      dim: 512
      attribute: description
      normalize: true
      quantize: true  # use RVQ quantization
      # NOTE(review): `n_q`, `bins` and `kmeans_iters` presumably parameterize
      # the RVQ quantizer enabled by `quantize` — confirm.
      n_q: 12
      bins: 1024
      kmeans_iters: 50
      text_p: 0.0  # probability of using text embed at train time
      cache_path: null

# Dataset settings. `joint_embed_attributes` lists `description`, the same
# name used under `conditioners` and `fuser.cross` in this file.
dataset:
  joint_embed_attributes:
    - description
  # Settings under `train` only.
  # NOTE(review): the `_p` suffix suggests these are probabilities in [0, 1]
  # — confirm against the dataset code.
  train:
    merge_text_p: 0.25
    drop_desc_p: 0.5
    drop_other_p: 0.5
