
# NOTE(review): the consumer of `zookeeper` is not visible in this file;
# confirm what this value selects before changing it.
zookeeper: fennec

# Benchmark identifiers. Every entry listed here has a same-named top-level
# section further down in this file.
bench_candidates:
  - autoj_bench
  - pandalm_bench
  - mt_bench
  - correction_bench
  - system_rank
  - metatool_bench
  - fennec_bench_v2


# Auto-J benchmark section (see `description` and `url` for provenance).
# Each entry under `datasets` declares a `type` and three relative paths:
# `raw_file`, `format_file` and `db_file`.
# NOTE(review): presumably raw input -> normalized file -> sqlite store;
# confirm against the loader, which is not visible in this file.
autoj_bench:
  name: 'autoj_bench'
  domain: >-
    Chatbot Arena Conversations, MTBench, OpenAI Summarization, OpenAI WebGPT,
    Stanford SHP, Synthetic GPT-J, and PKU-SafeRLHF
  description: >-
    1,392 testing samples, each with two responses generated by different
    LLMs and a human-annotated preference label. We refer to this test set
    as Eval-P, with the distribution on Win/Tie/Lose being 520/373/499.
  url: 'https://github.com/GAIR-NLP/auto-j/tree/main/data'
  datasets:
    # Pairwise data stored under data/fennec_train_data.
    pairwise_train:
      type: pairwise
      raw_file: data/fennec_train_data/raw/autoj/pairwise_traindata.jsonl
      format_file: data/fennec_train_data/format/autoj/pairwise_traindata.jsonl
      db_file: data/db_file/dialogue/autoj_bench_pairwise_traindata.sqlite
    # Pairwise data stored under data/fennec_eval_data (auto_j_eval_p).
    pairwise_test:
      type: pairwise
      raw_file: data/fennec_eval_data/raw/auto_j_eval_p/testdata_pairwise.jsonl
      format_file: data/fennec_eval_data/format/auto_j_eval_p/testdata_pairwise.jsonl
      db_file: data/db_file/dialogue/autoj_bench_testdata_pairwise.sqlite
    # NOTE(review): the two entries below use fennec_bench_v2 paths and a
    # `fennec_bench_v2_` db_file prefix although they are registered under
    # autoj_bench; confirm this placement is intentional.
    fennec_v2:
      type: pairwise
      raw_file: data/fennec_train_data/raw/fennec_bench_v2/pairwise_data_0324_selected.jsonl
      format_file: data/fennec_train_data/format/fennec_bench_v2/pairwise_data_0324_selected.jsonl
      db_file: data/db_file/dialogue/fennec_bench_v2_pairwise_data_0324_selected.sqlite
    fennec_v2_0331:
      type: pairwise
      raw_file: data/fennec_train_data/raw/fennec_bench_v2/pairwise_data_0331_selected.jsonl
      format_file: data/fennec_train_data/format/fennec_bench_v2/pairwise_data_0331_selected.jsonl
      db_file: data/db_file/dialogue/fennec_bench_v2_pairwise_data_0331_selected.sqlite

# PandaLM benchmark section: a single pairwise test dataset.
# `domain` is intentionally kept as an explicit empty string.
pandalm_bench:
  name: "pandalm_bench"
  domain: ""
  # Folded scalar: parses to one line with single spaces, no trailing newline.
  # The last model name previously read "Pythia-6.9" (missing the "B" suffix
  # that every other model in the list carries).
  description: >-
    The instructions and inputs are sampled from Alpaca 52K data and the
    response pairs are provided by LLaMA-7B, Bloom-7B, Cerebras-GPT-6.7B,
    OPT-7B and Pythia-6.9B
  datasets:
    # Note: raw_file is a `.json` file while format_file is `.jsonl`.
    pairwise_test:
      type: pairwise
      raw_file: data/fennec_eval_data/raw/pandalm/testdata_pairwise.json
      format_file: data/fennec_eval_data/format/pandalm/testdata_pairwise.jsonl
      db_file: data/db_file/dialogue/pandalm_bench_testdata_pairwise.sqlite

# MT-Bench section (see `description`).
# Besides `datasets`, this section carries a `question` entry whose
# `raw_file` points at the question file.
mt_bench:
  name: "mt_bench"
  domain: "writing, coding, reasoning, and mathematics"
  description: "MT-bench is a set of challenging multi-turn open-ended questions for evaluating chat assistants."
  question:
    # Trailing whitespace after the path has been removed.
    raw_file: data/fennec_eval_data/raw/mt_bench/question.jsonl
  datasets:
    pairwise_test:
      type: pairwise
      raw_file: data/fennec_eval_data/raw/mt_bench/testdata_pairwise.jsonl
      format_file: data/fennec_eval_data/format/mt_bench/testdata_pairwise.jsonl
      db_file: data/db_file/dialogue/mt_bench_testdata_pairwise.sqlite
    # Shares its raw_file with `pairwise_test`; only format_file and db_file
    # differ (`_turn0` suffix).
    # NOTE(review): presumably a first-turn-only view of the same data; confirm.
    pairwise_test_turn0:
      type: pairwise
      raw_file: data/fennec_eval_data/raw/mt_bench/testdata_pairwise.jsonl
      format_file: data/fennec_eval_data/format/mt_bench/testdata_pairwise_turn0.jsonl
      db_file: data/db_file/dialogue/mt_bench_testdata_pairwise_turn0.sqlite

# Correction benchmark section: three pairwise test datasets.
# `domain` and `description` are explicit empty strings.
# Dataset keys are indented with the same 2-space step used by the other
# sections of this file (they were previously offset by 3 spaces).
correction_bench:
  name: "correction_bench"
  domain: ""
  description: ""
  datasets:
    # NOTE(review): raw_file here is a directory path (ends with `/`), unlike
    # `autoj_test` below which names a file; confirm the consumer accepts this.
    llama2_7bchat_test:
      type: pairwise
      raw_file: data/fennec_eval_data/raw/correction/
      format_file: data/fennec_eval_data/format/correction/llama2_7bchat_testdata.json
      db_file: data/db_file/dialogue/llama2_7bchat_correction_testdata.sqlite
    # NOTE(review): raw_file is the same directory path as above; confirm.
    alpaca13b_test:
      type: pairwise
      raw_file: data/fennec_eval_data/raw/correction/
      format_file: data/fennec_eval_data/format/correction/alpaca13b_testdata.json
      db_file: data/db_file/dialogue/alpaca13b_correction_testdata.sqlite
    autoj_test:
      type: pairwise
      raw_file: data/fennec_eval_data/raw/correction/autoj_correction.json
      format_file: data/fennec_eval_data/format/correction/testdata_autoj_correction.json
      db_file: data/db_file/dialogue/testdata_autoj_correction.sqlite

# System-rank section: a single pairwise test dataset.
# `domain` and `description` are explicit empty strings.
# Dataset keys use the file's regular 2-space indentation step
# (previously offset by 3 spaces).
system_rank:
  name: "system_rank"
  domain: ""
  description: ""
  datasets:
    pairwise_test:
      type: pairwise
      raw_file: data/fennec_eval_data/raw/system_rank/testdata_pairwise.jsonl
      format_file: data/fennec_eval_data/format/system_rank/testdata_pairwise.jsonl
      db_file: data/db_file/dialogue/system_rank_testdata_pairwise.sqlite

# MetaTool section: one dataset of type `single` (the sections above that
# define test data use `pairwise`).
# `domain` and `description` are explicit empty strings.
# Dataset keys use the file's regular 2-space indentation step
# (previously offset by 3 spaces).
metatool_bench:
  name: "metatool_bench"
  domain: ""
  description: ""
  datasets:
    tool_awareness:
      type: single
      raw_file: data/fennec_eval_data/raw/metatool/Task1.json
      format_file: data/fennec_eval_data/format/metatool/testdata_tool_awareness.jsonl
      db_file: data/db_file/dialogue/tool_awareness_testdata_single.sqlite

# Fennec bench v2 section: one dataset of type `single`, with raw and
# formatted files stored under data/fennec_train_data.
# `domain` and `description` are explicit empty strings.
# Dataset keys use the file's regular 2-space indentation step
# (previously offset by 3 spaces).
fennec_bench_v2:
  name: "fennec_bench_v2"
  domain: ""
  description: ""
  datasets:
    dialogue_gen:
      type: single
      raw_file: data/fennec_train_data/raw/fennec_bench_v2/selected_data_0314.jsonl
      format_file: data/fennec_train_data/format/fennec_bench_v2/traindata_0314.jsonl
      db_file: data/db_file/dialogue/fennec_bench_v2_traindata_0314_single.sqlite
