dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
test_split: test
fewshot_split: dev
fewshot_config:
  sampler: first_n
output_type: generate_until
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA:"
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
filter_list:
  - name: "strict-match"
    filter:
      - function: "take_first"
  - name: "flexible-extract"
    filter:
      - function: "multi_choice_regex"
        group_select: 0
        regex_pattern: "(\\([A-Z]\\))"
        ignore_case: true
        ignore_punctuation: true
      - function: "take_first"
generation_kwargs:
  until:
    - "</s>"
    - "Q:"
    - "<|im_end|>"
    - "\n"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
metadata:
  version: 3.0
dataset_kwargs:
  trust_remote_code: true
