# Evaluation gauntlet configuration: global scoring options, named averages
# over categories, and the benchmark membership of each category.
eval_gauntlet:
  # NOTE(review): presumably selects how benchmarks are weighted inside a
  # category (EQUAL = plain unweighted mean) - confirm against the consumer.
  weighting: EQUAL
  # NOTE(review): presumably each benchmark's `random_baseline` is subtracted
  # from its raw accuracy before aggregation - confirm against the consumer.
  subtract_random_baseline: true
  # NOTE(review): presumably rescales the baseline-adjusted accuracy; the exact
  # formula is not visible in this file - confirm against the consumer.
  rescale_accuracy: true
  # Named composite scores. Each entry lists, by `name`, the categories
  # declared under `categories` that are combined into that average.
  # NOTE(review): the consumer presumably looks every listed name up among the
  # defined categories, so a name without a matching category would fail or
  # skew the average - confirm. Names with no category defined in this file
  # are therefore commented out below instead of being left dangling.
  averages:
    core_average:
    - world_knowledge
    - commonsense_reasoning
    - language_understanding
    - symbolic_problem_solving
    - reading_comprehension
    # `programming` is omitted: its category definition is commented out under
    # `categories`. Restore this entry together with that definition.
    # - programming
    lm_task_average:
    - world_knowledge_lm_task_subscore
    # `commonsense_reasoning_lm_task_subscore` is omitted: no category with
    # that name is defined in this file.
    # - commonsense_reasoning_lm_task_subscore
    - language_understanding_lm_task_subscore
    - symbolic_problem_solving_lm_task_subscore
    - reading_comprehension_lm_task_subscore
    lite_average:
    - world_knowledge_lite
    - commonsense_reasoning_lite
    - language_understanding_lite
    - symbolic_problem_solving_lite
    - reading_comprehension_lite
    # `programming_lite` is omitted: no category with that name is defined in
    # this file. Restore this entry if such a category is added.
    # - programming_lite
  # Category definitions. Each item has a `name` (referenced from `averages`)
  # and a list of benchmarks, each with `num_fewshot` and `random_baseline`.
  # NOTE(review): `random_baseline` is presumably the chance-level accuracy of
  # the benchmark (0 where guessing cannot score) - confirm against the consumer.
  categories:
  # world_knowledge: six benchmarks, all configured with num_fewshot 10.
  # Listed in `averages.core_average`.
  - name: world_knowledge
    benchmarks:
    - name: jeopardy
      num_fewshot: 10
      random_baseline: 0
    - name: bigbench_qa_wikidata
      num_fewshot: 10
      random_baseline: 0
    - name: arc_easy
      num_fewshot: 10
      random_baseline: 0.25
    - name: arc_challenge
      num_fewshot: 10
      random_baseline: 0.25
    - name: mmlu
      num_fewshot: 10
      random_baseline: 0.25
    - name: bigbench_misconceptions
      num_fewshot: 10
      random_baseline: 0.5
  # commonsense_reasoning: six benchmarks; copa and openbook_qa use
  # num_fewshot 0, the rest use 10. Every benchmark here has a non-zero
  # random_baseline. Listed in `averages.core_average`.
  - name: commonsense_reasoning
    benchmarks:
    - name: copa
      num_fewshot: 0
      random_baseline: 0.5
    - name: piqa
      num_fewshot: 10
      random_baseline: 0.5
    - name: openbook_qa
      num_fewshot: 0
      random_baseline: 0.25
    - name: bigbench_novel_concepts
      num_fewshot: 10
      random_baseline: 0.25
    - name: bigbench_strange_stories
      num_fewshot: 10
      random_baseline: 0.5
    - name: bigbench_strategy_qa
      num_fewshot: 10
      random_baseline: 0.5
  # language_understanding: seven benchmarks; hellaswag,
  # bigbench_language_identification and bigbench_conceptual_combinations use
  # num_fewshot 10, the other four use 0. Listed in `averages.core_average`.
  - name: language_understanding
    benchmarks:
    - name: lambada_openai
      num_fewshot: 0
      random_baseline: 0.0
    - name: hellaswag
      num_fewshot: 10
      random_baseline: 0.25
    - name: winograd
      num_fewshot: 0
      random_baseline: 0.5
    - name: winogrande
      num_fewshot: 0
      random_baseline: 0.5
    - name: bigbench_conlang_translation
      num_fewshot: 0
      random_baseline: 0.0
    - name: bigbench_language_identification
      num_fewshot: 10
      random_baseline: 0.25
    - name: bigbench_conceptual_combinations
      num_fewshot: 10
      random_baseline: 0.25
  # symbolic_problem_solving: ten benchmarks, all configured with
  # num_fewshot 10. Six have a zero random_baseline; the other four use 0.25.
  # Listed in `averages.core_average`.
  - name: symbolic_problem_solving
    benchmarks:
    - name: bigbench_elementary_math_qa
      num_fewshot: 10
      random_baseline: 0.25
    - name: bigbench_dyck_languages
      num_fewshot: 10
      random_baseline: 0
    - name: bigbench_cs_algorithms
      num_fewshot: 10
      random_baseline: 0
    - name: bigbench_logical_deduction
      num_fewshot: 10
      random_baseline: 0.25
    - name: bigbench_operators
      num_fewshot: 10
      random_baseline: 0.0
    - name: bigbench_repeat_copy_logic
      num_fewshot: 10
      random_baseline: 0.0
    - name: simple_arithmetic_withspaces
      num_fewshot: 10
      random_baseline: 0.0
    - name: simple_arithmetic_nospaces
      num_fewshot: 10
      random_baseline: 0.0
    - name: math_qa
      num_fewshot: 10
      random_baseline: 0.25
    - name: logi_qa
      num_fewshot: 10
      random_baseline: 0.25
  # reading_comprehension: four benchmarks, all configured with
  # num_fewshot 10. Listed in `averages.core_average`.
  - name: reading_comprehension
    benchmarks:
    - name: pubmed_qa_labeled
      num_fewshot: 10
      random_baseline: 0.0
    - name: squad
      num_fewshot: 10
      random_baseline: 0
    - name: bigbench_understanding_fables
      num_fewshot: 10
      random_baseline: 0.25
    - name: boolq
      num_fewshot: 10
      random_baseline: 0.5
  # NOTE: the `programming` category (human_eval variants) is disabled below.
  # Keep `averages` in sync with it: an average should list `programming`
  # only while this definition is enabled.
  # - name: programming
  #   benchmarks:
  #   - name: human_eval
  #     num_fewshot: 0
  #     random_baseline: 0.0
  #   - name: human_eval_cpp
  #     num_fewshot: 0
  #     random_baseline: 0.0
  #   - name: human_eval_js
  #     num_fewshot: 0
  #     random_baseline: 0.0
  #   - name: human_eval_return_simple
  #     num_fewshot: 0
  #     random_baseline: 0.0
  #   - name: human_eval_return_complex
  #     num_fewshot: 0
  #     random_baseline: 0.0
  #   - name: human_eval_25
  #     num_fewshot: 0
  #     random_baseline: 0.0
  #   - name: human_eval_50
  #     num_fewshot: 0
  #     random_baseline: 0.0
  #   - name: human_eval_75
  #     num_fewshot: 0
  #     random_baseline: 0.0
  # world_knowledge_lm_task_subscore: the two `world_knowledge` benchmarks
  # whose random_baseline is 0, with the same settings as in that category.
  # Listed in `averages.lm_task_average`.
  - name: world_knowledge_lm_task_subscore
    benchmarks:
    - name: jeopardy
      num_fewshot: 10
      random_baseline: 0
    - name: bigbench_qa_wikidata
      num_fewshot: 10
      random_baseline: 0
  # language_understanding_lm_task_subscore: the two `language_understanding`
  # benchmarks whose random_baseline is 0.0, with the same settings as in that
  # category. Listed in `averages.lm_task_average`.
  - name: language_understanding_lm_task_subscore
    benchmarks:
    - name: lambada_openai
      num_fewshot: 0
      random_baseline: 0.0
    - name: bigbench_conlang_translation
      num_fewshot: 0
      random_baseline: 0.0
  # symbolic_problem_solving_lm_task_subscore: the six
  # `symbolic_problem_solving` benchmarks whose random_baseline is zero, with
  # the same settings as in that category.
  # Listed in `averages.lm_task_average`.
  - name: symbolic_problem_solving_lm_task_subscore
    benchmarks:
    - name: bigbench_dyck_languages
      num_fewshot: 10
      random_baseline: 0
    - name: bigbench_cs_algorithms
      num_fewshot: 10
      random_baseline: 0
    - name: bigbench_operators
      num_fewshot: 10
      random_baseline: 0.0
    - name: bigbench_repeat_copy_logic
      num_fewshot: 10
      random_baseline: 0.0
    - name: simple_arithmetic_withspaces
      num_fewshot: 10
      random_baseline: 0.0
    - name: simple_arithmetic_nospaces
      num_fewshot: 10
      random_baseline: 0.0
  # reading_comprehension_lm_task_subscore: the two `reading_comprehension`
  # benchmarks whose random_baseline is zero, with the same settings as in
  # that category. Listed in `averages.lm_task_average`.
  - name: reading_comprehension_lm_task_subscore
    benchmarks:
    - name: pubmed_qa_labeled
      num_fewshot: 10
      random_baseline: 0.0
    - name: squad
      num_fewshot: 10
      random_baseline: 0
  # world_knowledge_lite: two-benchmark subset of `world_knowledge`
  # (jeopardy, arc_challenge) with the same settings as in that category.
  # Listed in `averages.lite_average`.
  - name: world_knowledge_lite
    benchmarks:
    - name: jeopardy
      num_fewshot: 10
      random_baseline: 0
    - name: arc_challenge
      num_fewshot: 10
      random_baseline: 0.25
  # commonsense_reasoning_lite: two-benchmark subset of
  # `commonsense_reasoning` (copa, piqa) with the same settings as in that
  # category. Listed in `averages.lite_average`.
  - name: commonsense_reasoning_lite
    benchmarks:
    - name: copa
      num_fewshot: 0
      random_baseline: 0.5
    - name: piqa
      num_fewshot: 10
      random_baseline: 0.5
  # language_understanding_lite: three-benchmark subset of
  # `language_understanding` (lambada_openai, hellaswag, winograd) with the
  # same settings as in that category. Listed in `averages.lite_average`.
  - name: language_understanding_lite
    benchmarks:
    - name: lambada_openai
      num_fewshot: 0
      random_baseline: 0.0
    - name: hellaswag
      num_fewshot: 10
      random_baseline: 0.25
    - name: winograd
      num_fewshot: 0
      random_baseline: 0.5
  # symbolic_problem_solving_lite: six-benchmark subset of
  # `symbolic_problem_solving` with the same settings as in that category;
  # all entries use num_fewshot 10. Listed in `averages.lite_average`.
  - name: symbolic_problem_solving_lite
    benchmarks:
    - name: bigbench_elementary_math_qa
      num_fewshot: 10
      random_baseline: 0.25
    - name: bigbench_dyck_languages
      num_fewshot: 10
      random_baseline: 0
    - name: bigbench_operators
      num_fewshot: 10
      random_baseline: 0.0
    - name: bigbench_repeat_copy_logic
      num_fewshot: 10
      random_baseline: 0.0
    - name: simple_arithmetic_withspaces
      num_fewshot: 10
      random_baseline: 0.0
    - name: simple_arithmetic_nospaces
      num_fewshot: 10
      random_baseline: 0.0
  # reading_comprehension_lite: two-benchmark subset of
  # `reading_comprehension` (pubmed_qa_labeled, squad) with the same settings
  # as in that category. Listed in `averages.lite_average`.
  - name: reading_comprehension_lite
    benchmarks:
    - name: pubmed_qa_labeled
      num_fewshot: 10
      random_baseline: 0.0
    - name: squad
      num_fewshot: 10
      random_baseline: 0
