# Dataset: a copy of `cais/mmlu` with no auxiliary_train split.
dataset_path: hails/mmlu_no_train
# Scored documents come from `test`; few-shot examples are drawn from `dev`.
test_split: test
fewshot_split: dev
fewshot_config:
  # Sampler name is `first_n` -- presumably takes the first n documents of the
  # few-shot split in order; confirm against the harness sampler registry.
  sampler: first_n
# The model produces free-form text until a stop string (see generation_kwargs).
output_type: generate_until
# Prompt template. Literal block scalar: backslashes are literal here, and the
# three blank lines after the instruction are intentional (the instruction is
# followed by four newlines before the question). `|-` strips the trailing
# newline so the prompt ends right after choice D.
doc_to_text: |-
  Please reason step by step, and format your final answer as \boxed{LETTER}, where LETTER is one of A, B, C, or D.



  {{question.strip()}}
  A) {{choices[0]}}
  B) {{choices[1]}}
  C) {{choices[2]}}
  D) {{choices[3]}}
# Gold target: maps the `answer` index (0-3) onto the letter A-D.
doc_to_target: "{{['A', 'B', 'C', 'D'][answer]}}"
generation_kwargs:
  # Stop strings: generation ends at the first occurrence of any of these.
  # They look like end-of-sequence / end-of-turn markers used by different
  # model families -- confirm the list covers the models being evaluated.
  until:
    - '</s>'
    - '<|eot_id|>'
    - '<｜end▁of▁sentence｜>'
    - '<|im_end|>'
  # Sampling is disabled and temperature is 0.0 -- presumably deterministic
  # decoding; confirm for the inference backend in use.
  do_sample: false
  temperature: 0.0
  # NOTE(review): no generation-length cap is set here, so the harness default
  # applies. The prompt asks for step-by-step reasoning -- confirm the default
  # is long enough that the final \boxed{...} answer is not truncated.
# Scoring: a single exact-match metric between the filtered response and the
# target letter produced by doc_to_target.
metric_list:
  - metric: exact_match
    # Per-document scores are averaged; a larger value is better.
    aggregation: mean
    higher_is_better: true
    # Comparison options: punctuation and letter case are ignored.
    ignore_punctuation: true
    ignore_case: true
# Post-processing applied to the raw generation before scoring.
filter_list:
  - name: get_response
    filter:
      # The commented-out entries below are a disabled regex-based extraction
      # chain, kept for reference only. The sole active step is `r1_mcqa` at
      # the end of this list.
      # # Filter everything before the last "Answer:"
      # - function: "regex"
      #   regex_pattern: "^(?:[\\s\\S]*Answer:\\s*)([\\s\\S]*)$"
      # # Filter everything after the first break line
      # - function: "regex"
      #   regex_pattern: "^(.*?)(?=\\n|$)"
      # # Remove leading white spaces
      # - function: remove_whitespace
      # # function to ignore right white spaces or line breaks
      # - function: "regex"
      #   regex_pattern: "^(.*?)\\s*$"
      # - function: "regex"
      #   regex_pattern: "\\\\boxed\\{\\s*(.*?)\\s*\\}"
      # - function: take_first
      # Active filter. `r1_mcqa` is not defined in this file -- presumably it
      # extracts the chosen answer letter from the model's reasoning output;
      # confirm its behavior in the harness filter registry.
      - function: "r1_mcqa"

# Task metadata. `version` identifies this revision of the task definition --
# presumably it should be bumped whenever the prompt, filters, or metrics
# change in a way that affects scores; confirm the project convention.
metadata:
  version: 3.0
# Extra options for loading the dataset.
# NOTE(review): `trust_remote_code: true` presumably allows the dataset's own
# loading script to execute locally -- confirm this is still required for
# `hails/mmlu_no_train` and that the source is trusted.
dataset_kwargs:
  trust_remote_code: true
