# Presumably the in-distribution ("id") datasets — TODO confirm with the consumer.
# The same dataset, EdinburghNLP/xsum, is loaded once per split directly through
# `datasets.load_dataset`; these entries carry no `rename` list.
id:
  # train split
  - _target_: datasets.load_dataset
    path: EdinburghNLP/xsum
    split: train
  # validation split
  - _target_: datasets.load_dataset
    path: EdinburghNLP/xsum
    split: validation
  # test split
  - _target_: datasets.load_dataset
    path: EdinburghNLP/xsum
    split: test


# Single entry: the `en` configuration of allenai/c4, train split.
# NOTE(review): the role of "aux" is not visible from this file — confirm with
# the code that consumes this config.
aux:
  - _target_: text_ood.utils.dataset_util.load_dataset_preprocess
    path: allenai/c4
    name: en
    split: train
    # Taken from the COMMON_CRAWL_ROOT environment variable (OmegaConf `oc.env`).
    cache_dir: ${oc.env:COMMON_CRAWL_ROOT}
    # Presumably renames the `text` column to `document`; verify in
    # text_ood.utils.dataset_util.load_dataset_preprocess.
    rename:
      - original_column_name: text
        new_column_name: document


# Presumably the out-of-distribution ("ood") datasets — TODO confirm with the consumer.
# Every entry goes through text_ood.utils.dataset_util.load_dataset_preprocess
# and carries a `rename` list whose target column is always `document`
# (presumably so that all datasets expose their input text under one column
# name; verify in load_dataset_preprocess).
ood:
  # CNN/DailyMail, configuration "1.0.0" (quoted so it stays a string), test split.
  - _target_: text_ood.utils.dataset_util.load_dataset_preprocess
    path: abisee/cnn_dailymail
    name: "1.0.0"
    split: test
    rename:
      - original_column_name: article
        new_column_name: document

  # Newsroom, test split; `data_dir` is taken from the NEWSROOM_ROOT
  # environment variable (OmegaConf `oc.env`).
  - _target_: text_ood.utils.dataset_util.load_dataset_preprocess
    path: lil-lab/newsroom
    data_dir: ${oc.env:NEWSROOM_ROOT}
    split: test
    rename:
      - original_column_name: text
        new_column_name: document

  # Reddit TIFU, `long` configuration.
  - _target_: text_ood.utils.dataset_util.load_dataset_preprocess
    path: ctr4si/reddit_tifu
    name: long
    split: train  # there is no test split
    rename:
      - original_column_name: documents
        new_column_name: document

  # SAMSum; the `dialogue` column is the one mapped to `document`.
  - _target_: text_ood.utils.dataset_util.load_dataset_preprocess
    path: Samsung/samsum
    split: train  # this is the only split with more than 10k samples
    rename:
      - original_column_name: dialogue
        new_column_name: document

  # The ForumSum dataset appears to have been taken offline, so it is not listed here.
