# Training split of the video-to-audio dataset.
# NOTE(review): `_target_` looks like a Hydra-style instantiation key, with the
# remaining entries passed to the target as keyword arguments - confirm.
train:
  _target_: 'data_module.dataset.VideoToAudioDataset'

  # JSONL manifests for this split.
  audio: 'data/visual_sound/clip/train/audio.jsonl'
  content: 'data/visual_sound/clip/train/content.jsonl'

  # Fixed video frame rate; audio sample rate comes from the `sample_rate`
  # interpolation, which is defined outside this section.
  video_fps: 10
  target_sr: '${sample_rate}'

  # Resolved from `instruction_embedding`, defined outside this section.
  task_instruction: '${instruction_embedding}'
# Validation split of the video-to-audio dataset.
# NOTE(review): `_target_` looks like a Hydra-style instantiation key, with the
# remaining entries passed to the target as keyword arguments - confirm.
val:
  _target_: 'data_module.dataset.VideoToAudioDataset'

  # JSONL manifests for this split.
  audio: 'data/visual_sound/clip/val/audio.jsonl'
  content: 'data/visual_sound/clip/val/content.jsonl'

  # Fixed video frame rate; audio sample rate comes from the `sample_rate`
  # interpolation, which is defined outside this section.
  video_fps: 10
  target_sr: '${sample_rate}'

  # Resolved from `instruction_embedding`, defined outside this section.
  task_instruction: '${instruction_embedding}'

  # Value comes from `max_val_samples`, defined outside this section;
  # presumably an upper bound on the number of validation items - verify
  # against the dataset class.
  max_samples: '${max_val_samples}'
# Test split of the video-to-audio dataset.
# NOTE(review): unlike the train/val sections of this file, no `audio`
# manifest and no `target_sr` are set here - confirm this is intentional
# (e.g. the dataset class supplies defaults when they are absent).
test:
  _target_: 'data_module.dataset.VideoToAudioDataset'

  # JSONL manifest for this split.
  content: 'data/visual_sound/clip/test/content.jsonl'

  # Fixed video frame rate.
  video_fps: 10

  # Resolved from `instruction_embedding`, defined outside this section.
  task_instruction: '${instruction_embedding}'
  # Only this split sets `instruction_idx`; presumably it selects one
  # instruction by index - verify against the dataset class.
  instruction_idx: 1

  # Value comes from `max_test_samples`, defined outside this section.
  max_samples: '${max_test_samples}'