datasets:
  # reddit_full: paired objective over the "title" and "body" columns.
  # No `bucket` key is set on this entry.
  - name: "reddit_full"
    query_prefix: "Identify the topic or theme of Reddit posts based on the titles."
    objective:
      type: "paired"
      columns:
        - "title"
        - "body"

  # amazon_reviews_full: paired objective over the "query" and "document"
  # columns, flagged symmetric. No `bucket` key is set on this entry.
  - name: "amazon_reviews_full"
    is_symmetric: true
    query_prefix: "Classify the given Amazon review into its appropriate rating category."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # paq_full: paired objective over the "query" and "document" columns.
  # The bucket pattern spans shards 00000..00538.
  - name: "paq_full"
    bucket: "s3://contrastive-index-filtered/paq_full/shard-{00000..00538}.jsonl.gz"
    query_prefix: "Given a question, retrieve Wikipedia passages that answer the question."
    objective:
      type: "paired"
      columns: ["query", "document"]

  # s2orc_citation_title_full: paired objective over the "query" and "pos"
  # columns. No `bucket` key is set on this entry.
  # NOTE(review): both prefixes describe retrieving the paper's own abstract,
  # which reads like a title/abstract task rather than a citation task, and
  # this is the only entry visible in this list that sets `document_prefix`
  # (identical to `query_prefix`) - confirm both are intended.
  - name: "s2orc_citation_title_full"
    objective:
      type: "paired"
      columns:
        - "query"
        - "pos"
    query_prefix: "Given a scientific paper title, retrieve the abstract of the given paper."
    document_prefix: "Given a scientific paper title, retrieve the abstract of the given paper."

  # s2orc_title_abstract_full: paired objective over the "query" and
  # "document" columns. No `bucket` key is set on this entry.
  # NOTE(review): the prefix talks about abstracts *cited by* the paper, while
  # the entry name suggests a title/abstract pairing - confirm the wording
  # was not meant for a citation dataset.
  - name: "s2orc_title_abstract_full"
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"
    query_prefix: "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper."

  # s2orc_abstract_citation_full: paired objective over the "query" and "pos"
  # columns. No `bucket` key is set on this entry.
  - name: "s2orc_abstract_citation_full"
    query_prefix: "Given a scientific paper abstract, retrieve paper abstracts that are cited by the given paper."
    objective:
      type: "paired"
      columns:
        - "query"
        - "pos"

  # s2orc_abstract_body_index_filtered: paired objective over the "query" and
  # "document" columns. No `bucket` key is set on this entry.
  - name: "s2orc_abstract_body_index_filtered"
    query_prefix: "Given a scientific paper abstract, retrieve the body of the given paper."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # wikianswers_full: sets of duplicate questions. Paired objective over the
  # "query" and "document" columns, flagged symmetric.
  # The bucket pattern spans shards 00000..00100.
  - name: "wikianswers_full"
    bucket: "s3://contrastive-index-filtered/wikianswers_full/shard-{00000..00100}.jsonl.gz"
    query_prefix: "Retrieve semantically similar questions."
    is_symmetric: true
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # wiki_title_body_full: paired objective over the "title" and "text" columns.
  # The shard range upper bound is written with the same 5-digit zero padding
  # as every other bucket pattern in this list (it was "0061").
  - name: "wiki_title_body_full"
    bucket: "s3://contrastive-index-filtered/wiki_title_body_full/shard-{00000..00061}.jsonl.gz"
    query_prefix: "Given the title of an article, retrieve Wikipedia articles that answer the question."
    objective:
      type: "paired"
      columns: ["title", "text"]

  # gooaq_full: paired objective over the "query" and "document" columns.
  # The bucket pattern spans shards 00000..00012.
  - name: "gooaq_full"
    bucket: "s3://contrastive-index-filtered/gooaq_full/shard-{00000..00012}.jsonl.gz"
    query_prefix: "Given a question from Google, retrieve its answer."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # codesearch_full: paired objective over the "query" and "document" columns.
  # The bucket pattern spans shards 00000..00008.
  - name: "codesearch_full"
    bucket: "s3://contrastive-index-filtered/codesearch_full/shard-{00000..00008}.jsonl.gz"
    query_prefix: "Given the comment from a piece of code, retrieve the code snippet."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # yahoo_title_answer_full: paired objective over the "query" and "document"
  # columns. The bucket pattern spans shards 00000..00002.
  - name: "yahoo_title_answer_full"
    bucket: "s3://contrastive-index-filtered/yahoo_title_answer_full/shard-{00000..00002}.jsonl.gz"
    query_prefix: "Given the Yahoo Answers question, retrieve its top answer."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # agnews_full: paired objective over the "query" and "document" columns.
  # The bucket pattern spans shards 00000..00004.
  - name: "agnews_full"
    bucket: "s3://contrastive-index-filtered/agnews_full/shard-{00000..00004}.jsonl.gz"
    query_prefix: "Given the title of the news article, retrieve its description."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # amazon_qa_full: objective over the "query" and "pos" columns.
  # The bucket pattern spans shards 00000..00002.
  # NOTE(review): the objective type here is "paired_full", whereas nearly
  # every other entry in this list uses "paired" - confirm the loader accepts
  # this value and that it is not a typo.
  - name: "amazon_qa_full"
    bucket: "s3://contrastive-index-filtered/amazon_qa_full/shard-{00000..00002}.jsonl.gz"
    query_prefix: "Given the question about an Amazon product, retrieve the answer."
    objective:
      type: "paired_full"
      columns:
        - "query"
        - "pos"

  # yahoo_qa_full: objective over the "query" and "document" columns.
  # The bucket pattern spans shards 00000..00001.
  # NOTE(review): the objective type here is "paired_full", whereas nearly
  # every other entry in this list uses "paired" - confirm the loader accepts
  # this value and that it is not a typo.
  - name: "yahoo_qa_full"
    bucket: "s3://contrastive-index-filtered/yahoo_qa_full/shard-{00000..00001}.jsonl.gz"
    query_prefix: "Given the question from Yahoo Answers, retrieve the answer."
    objective:
      type: "paired_full"
      columns:
        - "query"
        - "document"

  # yahoo_title_question_full: paired objective over the "query" and
  # "document" columns. The bucket pattern spans shards 00000..00002.
  - name: "yahoo_title_question_full"
    bucket: "s3://contrastive-index-filtered/yahoo_title_question_full/shard-{00000..00002}.jsonl.gz"
    query_prefix: "Given the title of a question from Yahoo Answers, retrieve the question."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # ccnews_full: paired objective over the "query" and "document" columns.
  # The bucket pattern spans shards 00000..00003.
  - name: "ccnews_full"
    bucket: "s3://contrastive-index-filtered/ccnews_full/shard-{00000..00003}.jsonl.gz"
    query_prefix: "Given the title of a news article, retrieve the article."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # npr_full: paired objective over the "query" and "document" columns.
  # The bucket pattern spans shards 00000..00003.
  - name: "npr_full"
    bucket: "s3://contrastive-index-filtered/npr_full/shard-{00000..00003}.jsonl.gz"
    query_prefix: "Given the headline of an NPR article, retrieve the article."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # eli5_full: paired objective over the "query" and "document" columns.
  # The bucket pattern spans shards 00000..00001.
  - name: "eli5_full"
    bucket: "s3://contrastive-index-filtered/eli5_full/shard-{00000..00001}.jsonl.gz"
    query_prefix: "Given the question from the ELI5 subreddit, retrieve its top answer."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # cnn_full: paired objective over the "query" and "document" columns.
  # The bucket pattern spans shards 00000..00002.
  - name: "cnn_full"
    bucket: "s3://contrastive-index-filtered/cnn_full/shard-{00000..00002}.jsonl.gz"
    query_prefix: "Given the summary of a CNN article, retrieve the full article."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # stackexchange_question_question_full: paired objective over the "query"
  # and "document" columns, flagged symmetric. Single shard (00000).
  - name: "stackexchange_question_question_full"
    bucket: "s3://contrastive-index-filtered/stackexchange_question_question_full/shard-00000.jsonl.gz"
    query_prefix: "Retrieve semantically similar questions."
    is_symmetric: true
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # stackexchange_title_body_full: paired objective over the "query" and
  # "document" columns. Single shard (00000).
  # The prefix previously read "total" where "title" was meant.
  - name: "stackexchange_title_body_full"
    bucket: "s3://contrastive-index-filtered/stackexchange_title_body_full/shard-00000.jsonl.gz"
    query_prefix: "Given the title of a StackExchange answer, retrieve the full answer."
    objective:
      type: "paired"
      columns: ["query", "document"]

  # stackexchange_body_body_full: paired objective over the "query" and
  # "document" columns, flagged symmetric. Single shard (00000).
  - name: "stackexchange_body_body_full"
    bucket: "s3://contrastive-index-filtered/stackexchange_body_body_full/shard-00000.jsonl.gz"
    query_prefix: "Retrieve semantically similar questions."
    is_symmetric: true
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # sentence_compression_full: paired objective over the "query" and
  # "document" columns. The bucket pattern spans shards 00000..00001.
  - name: "sentence_compression_full"
    bucket: "s3://contrastive-index-filtered/sentence_compression_full/shard-{00000..00001}.jsonl.gz"
    query_prefix: "Given the summary of a sentence, retrieve the full sentence."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # wikihow_full: paired objective over the "query" and "document" columns.
  # Single shard (00000).
  - name: "wikihow_full"
    bucket: "s3://contrastive-index-filtered/wikihow_full/shard-00000.jsonl.gz"
    query_prefix: "Given the summary of a WikiHow article, retrieve its full text."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # altlex_full: paired objective over the "query" and "document" columns,
  # flagged symmetric. Single shard (00000).
  - name: "altlex_full"
    bucket: "s3://contrastive-index-filtered/altlex_full/shard-00000.jsonl.gz"
    query_prefix: "Retrieve semantically similar questions."
    is_symmetric: true
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # quora_full: paired objective over the "query" and "document" columns,
  # flagged symmetric. Single shard (00000).
  - name: "quora_full"
    bucket: "s3://contrastive-index-filtered/quora_full/shard-00000.jsonl.gz"
    query_prefix: "Retrieve semantically similar questions."
    is_symmetric: true
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # simplewiki_full: paired objective over the "query" and "document" columns.
  # Single shard (00000).
  - name: "simplewiki_full"
    bucket: "s3://contrastive-index-filtered/simplewiki_full/shard-00000.jsonl.gz"
    query_prefix: "Given the simplification of a Simple English Wikipedia article, retrieve the full article."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"

  # squad_full: paired objective over the "query" and "document" columns.
  # Single shard (00000).
  - name: "squad_full"
    bucket: "s3://contrastive-index-filtered/squad_full/shard-00000.jsonl.gz"
    query_prefix: "Given the question from SQUAD dataset, retrieve the correct answer."
    objective:
      type: "paired"
      columns:
        - "query"
        - "document"