# Dataset settings for the training split.
train:
  # Dotted path of the dataset class. Presumably consumed by a Hydra-style
  # `_target_` instantiation, with the sibling keys passed as arguments.
  # TODO confirm against the code that loads this config.
  _target_: data_module.dataset.TextToAudioDataset
  # Paths to the training split's JSONL files (caption file and audio file).
  content: data/audiocaps_v2/train/caption.jsonl
  audio: data/audiocaps_v2/train/audio.jsonl
  # `${sample_rate}` is an interpolation placeholder; the `sample_rate` key is
  # not defined in this section and must be supplied by the enclosing config.
  target_sr: ${sample_rate}
  use_h5_cache: false
  # Placeholder filled from the `instruction_embedding` key, which is likewise
  # defined outside this section.
  task_instruction: ${instruction_embedding}
# Dataset settings for the validation split. Uses the same keys as `train`;
# only the two data paths differ.
val:
  # Dotted path of the dataset class. Presumably consumed by a Hydra-style
  # `_target_` instantiation. TODO confirm against the config loader.
  _target_: data_module.dataset.TextToAudioDataset
  # Paths to the validation split's JSONL files (caption file and audio file).
  content: data/audiocaps_v2/val/caption.jsonl
  audio: data/audiocaps_v2/val/audio.jsonl
  # Interpolation placeholder; `sample_rate` is defined outside this section.
  target_sr: ${sample_rate}
  use_h5_cache: false
  # Interpolation placeholder; `instruction_embedding` is defined outside this
  # section.
  task_instruction: ${instruction_embedding}
# Dataset settings for the test split.
# NOTE(review): unlike `train` and `val`, this section sets no `audio`,
# `target_sr`, or `use_h5_cache` keys. Presumably the dataset class supplies
# defaults for them; confirm this is intentional.
test:
  # Dotted path of the dataset class. Presumably consumed by a Hydra-style
  # `_target_` instantiation. TODO confirm against the config loader.
  _target_: data_module.dataset.TextToAudioDataset
  # Disabled alternative caption file from the `audiocaps_v2_kqq` directory;
  # uncomment it (and comment out the active `content` line) to switch.
  # content: data/audiocaps_v2_kqq/test/caption.jsonl
  content: data/audiocaps_v2/test/caption.jsonl
  # Interpolation placeholder; `instruction_embedding` is defined outside this
  # section.
  task_instruction: ${instruction_embedding}
  # NOTE(review): the meaning of index 1 is not visible here. Presumably it
  # selects one of several instructions; verify in the dataset class.
  instruction_idx: 1
  # Interpolation placeholder; `max_test_samples` is defined outside this
  # section.
  max_samples: ${max_test_samples}