# Datasets under `id` -- presumably the in-distribution data, given the
# sibling `ood` key (confirm against the code that consumes this config).
# Both entries share the same `_target_`, `path`, `name` and `cache_dir`
# and differ only in `split`.
# `${oc.env:WMT_ROOT}` is OmegaConf environment-variable interpolation; no
# default is supplied, so WMT_ROOT has to be set when the value is resolved.
id:
  # train split
  - _target_: text_ood.dataset.wmt.WMTDataset
    cache_dir: '${oc.env:WMT_ROOT}'
    name: fr-en
    path: wmt/wmt15
    split: train

  # test split
  - _target_: text_ood.dataset.wmt.WMTDataset
    cache_dir: '${oc.env:WMT_ROOT}'
    name: fr-en
    path: wmt/wmt15
    split: test

# Datasets under `aux`.
# NOTE(review): presumably auxiliary data; its exact role is decided by the
# code that reads this key -- confirm there.
aux:
  # ParaCrawl en -> fr, read from a single file under $PARACRAWL_ROOT.
  - _target_: text_ood.dataset.paracrawl.ParacrawlDataset
    file_path: '${oc.env:PARACRAWL_ROOT}/en-fr.txt'
    src_lang: en
    trg_lang: fr
    path: paracrawl/ParaCrawl
    split: train

# Datasets under `ood` -- presumably the out-of-distribution sets (confirm
# against the code that consumes this config). Every entry is en -> fr.
ood:
  # newstest2014: SGM files located under $WMT_DEV_ROOT.
  # NOTE(review): trg_path points at the `-src.fr.sgm` file, whereas the two
  # newsdiscuss entries use a `-ref.fr.sgm` file for the target side, and no
  # `document_tag` is set here. Confirm both differences are intentional.
  - _target_: text_ood.dataset.wmt15.WMT15Dataset
    src_lang: en
    src_path: '${oc.env:WMT_DEV_ROOT}/newstest2014-fren-src.en.sgm'
    trg_lang: fr
    trg_path: '${oc.env:WMT_DEV_ROOT}/newstest2014-fren-src.fr.sgm'
    # The next two keys are for saving.
    path: wmt/wmt15-newstest14
    split: test

  # newsdiscussdev2015: SGM source/reference pair under $WMT_DEV_ROOT.
  - _target_: text_ood.dataset.wmt15.WMT15Dataset
    src_lang: en
    src_path: '${oc.env:WMT_DEV_ROOT}/newsdiscussdev2015-enfr-src.en.sgm'
    trg_lang: fr
    trg_path: '${oc.env:WMT_DEV_ROOT}/newsdiscussdev2015-enfr-ref.fr.sgm'
    document_tag: p
    # The next two keys are for saving.
    path: wmt/wmt15-newsdiscussdev15
    split: dev

  # newsdiscusstest2015: SGM source/reference pair. Unlike the dev-set
  # entries, these files are looked up under $WMT_TEST_ROOT.
  - _target_: text_ood.dataset.wmt15.WMT15Dataset
    src_lang: en
    src_path: '${oc.env:WMT_TEST_ROOT}/newsdiscusstest2015-enfr-src.en.sgm'
    trg_lang: fr
    trg_path: '${oc.env:WMT_TEST_ROOT}/newsdiscusstest2015-enfr-ref.fr.sgm'
    document_tag: p
    # The next two keys are for saving.
    path: wmt/wmt15-newsdiscusstest15
    split: test


  # law: ELRC-EUIPO_law en/fr file pair under $OPUS_ROOT/law.
  - _target_: text_ood.dataset.opus.OpusDataset
    src_lang: en
    src_path: '${oc.env:OPUS_ROOT}/law/ELRC-EUIPO_law.en-fr.en'
    trg_lang: fr
    trg_path: '${oc.env:OPUS_ROOT}/law/ELRC-EUIPO_law.en-fr.fr'
    path: opus/law
    split: train

  # medical: EMEA en/fr file pair under $OPUS_ROOT/medical.
  - _target_: text_ood.dataset.opus.OpusDataset
    src_lang: en
    src_path: '${oc.env:OPUS_ROOT}/medical/EMEA.en-fr.en'
    trg_lang: fr
    trg_path: '${oc.env:OPUS_ROOT}/medical/EMEA.en-fr.fr'
    path: opus/medical
    split: train

  # Koran: Tanzil en/fr file pair under $OPUS_ROOT/Koran.
  - _target_: text_ood.dataset.opus.OpusDataset
    src_lang: en
    src_path: '${oc.env:OPUS_ROOT}/Koran/Tanzil.en-fr.en'
    trg_lang: fr
    trg_path: '${oc.env:OPUS_ROOT}/Koran/Tanzil.en-fr.fr'
    path: opus/Koran
    split: train

  # IT: Ubuntu en/fr file pair under $OPUS_ROOT/it.
  # Note the casing: the on-disk directory is lower-case `it`, while `path`
  # below is `opus/IT`. Both are kept exactly as they were.
  - _target_: text_ood.dataset.opus.OpusDataset
    src_lang: en
    src_path: '${oc.env:OPUS_ROOT}/it/Ubuntu.en-fr.en'
    trg_lang: fr
    trg_path: '${oc.env:OPUS_ROOT}/it/Ubuntu.en-fr.fr'
    path: opus/IT
    split: train

  # subtitles: OpenSubtitles en/fr file pair under $OPUS_ROOT/subtitles.
  - _target_: text_ood.dataset.opus.OpusDataset
    src_lang: en
    src_path: '${oc.env:OPUS_ROOT}/subtitles/OpenSubtitles.en-fr.en'
    trg_lang: fr
    trg_path: '${oc.env:OPUS_ROOT}/subtitles/OpenSubtitles.en-fr.fr'
    path: opus/subtitles
    split: train
