# Training split. `_target_` names the dataset class; presumably the mapping
# is turned into an object by Hydra-style instantiation -- confirm in the
# loading code.
train:
  _target_: data_module.dataset.SpeechEnhancementDataset

  # Dataset roots and metadata manifests. Each value goes through the `env`
  # resolver: environment variable name first, then a quoted fallback. The
  # "/path/to/..." fallbacks are placeholders, not real locations.
  # NOTE(review): recent OmegaConf releases ship this resolver as `oc.env`;
  # confirm that a resolver named `env` is registered by the project.
  base_content_path: ${env:VCTK_WHAM_BASE,"/path/to/VCTK+Wham"}
  base_audio_path: ${env:VCTK_WHAM_BASE,"/path/to/VCTK+Wham"}
  content: ${env:VCTK_WHAM_CAPTION_TRAIN,"/path/to/VCTK+Wham/train/metadata_caption.jsonl"}
  audio: ${env:VCTK_WHAM_AUDIO_TRAIN,"/path/to/VCTK+Wham/train/metadata_audio.jsonl"}

  # Interpolated from keys that are defined outside this section.
  target_sr: ${sample_rate}
  downsampling_ratio: ${downsampling_ratio}
  task_instruction: ${instruction_embedding}

  # Literal settings for this split.
  max_duration: 5.0  # presumably seconds -- TODO confirm
  use_h5_cache: false
# Validation split. Uses the same `_target_` class as the training section;
# presumably built by Hydra-style instantiation -- confirm in the loading code.
val:
  _target_: data_module.dataset.SpeechEnhancementDataset

  # Dataset roots and metadata manifests. Each value goes through the `env`
  # resolver: environment variable name first, then a quoted fallback. The
  # "/path/to/..." fallbacks are placeholders, not real locations.
  # NOTE(review): recent OmegaConf releases ship this resolver as `oc.env`;
  # confirm that a resolver named `env` is registered by the project.
  base_content_path: ${env:VCTK_WHAM_BASE,"/path/to/VCTK+Wham"}
  base_audio_path: ${env:VCTK_WHAM_BASE,"/path/to/VCTK+Wham"}
  content: ${env:VCTK_WHAM_CAPTION_VAL,"/path/to/VCTK+Wham/val/metadata_caption.jsonl"}
  audio: ${env:VCTK_WHAM_AUDIO_VAL,"/path/to/VCTK+Wham/val/metadata_audio.jsonl"}

  # Interpolated from keys that are defined outside this section.
  target_sr: ${sample_rate}
  downsampling_ratio: ${downsampling_ratio}
  task_instruction: ${instruction_embedding}
  max_samples: ${max_val_samples}

  # Literal settings for this split. `random_crop` is switched off explicitly
  # here (presumably to keep validation deterministic -- TODO confirm).
  max_duration: 5.0  # presumably seconds -- TODO confirm
  random_crop: false
  use_h5_cache: false
# Test split. Uses the same `_target_` class as the other splits; presumably
# built by Hydra-style instantiation -- confirm in the loading code.
# NOTE(review): unlike `train` and `val`, this section sets no `audio`,
# `base_audio_path` or `max_duration` key -- confirm the omission is intended.
test:
  _target_: data_module.dataset.SpeechEnhancementDataset

  # Dataset root and caption manifest. Each value goes through the `env`
  # resolver: environment variable name first, then a quoted fallback. The
  # "/path/to/..." fallbacks are placeholders, not real locations.
  # NOTE(review): recent OmegaConf releases ship this resolver as `oc.env`;
  # confirm that a resolver named `env` is registered by the project.
  base_content_path: ${env:VCTK_WHAM_BASE,"/path/to/VCTK+Wham"}
  content: ${env:VCTK_WHAM_CAPTION_TEST,"/path/to/VCTK+Wham/test/metadata_caption.jsonl"}

  # Interpolated from keys that are defined outside this section.
  target_sr: ${sample_rate}
  downsampling_ratio: ${downsampling_ratio}
  task_instruction: ${instruction_embedding}

  # Literal settings for this split.
  instruction_idx: 1  # meaning of the index is not visible here -- TODO confirm
  use_h5_cache: false