# Task identity and the dataset it draws from. Only a test split is
# declared; the training split is explicitly null.
task: math500_r1
dataset_name: default
dataset_path: HuggingFaceH4/MATH-500
test_split: test
training_split: null
output_type: generate_until

# Document pre-processing and scoring hooks, referenced through the
# `!function` tag from `utils`. Their behavior is defined outside this file.
process_docs: !function utils.process_docs
process_results: !function utils.process_results

# Prompt template: the `problem` field, a blank line, then an instruction to
# reason step by step and put the final answer in \boxed{}. The block scalar
# (`|-`) keeps the backslash literal and adds no trailing newline.
doc_to_text: |-
  Problem: {{problem}}

  Please reason step by step, and put your final answer within \boxed{}.

# Target template: the `answer` field.
doc_to_target: '{{answer}}'
# Sampling and stopping settings for the `generate_until` request.
generation_kwargs:
  # Stop strings, one per line so each can be reviewed on its own.
  # NOTE(review): "Problem:" could also appear inside a long reasoning trace
  # and end the generation early — confirm this stop string is intended for
  # zero-shot, long-output runs.
  until:
    - "Problem:"
    - "</s>"
    - "<|im_end|>"
    - "<|endoftext|>"
    # ASCII-pipe spelling, kept unchanged for backward compatibility.
    - "<|end▁of▁sentence|>"
    # DeepSeek tokenizers spell this special token with fullwidth vertical
    # bars (U+FF5C), not ASCII pipes, so the entry above cannot match the
    # token's literal text. The bars are written as escapes because the two
    # characters are nearly indistinguishable on screen.
    - "<\uFF5Cend▁of▁sentence\uFF5C>"
  # Nucleus sampling at a moderate temperature (sampling is enabled).
  do_sample: true
  temperature: 0.6
  top_p: 0.95
  # Upper bound on generated tokens per response.
  max_gen_toks: 25000
# No few-shot examples are included with the prompt.
num_fewshot: 0

# A single metric, `exact_match`, averaged over documents; larger is better.
metric_list:
  - metric: exact_match
    higher_is_better: true
    aggregation: mean

# Descriptive metadata for this task configuration.
metadata:
  description: 'MATH-500 (OpenAI/DeepSeek subset) with R1-style reasoning'
  version: 1.0
