# HMMT combined — concatenates Feb 2024 + Feb 2025 + Nov 2025 + Feb 2026 (123 problems).
# The larger N reduces the variance of the pass@32 coverage estimate compared with a single competition (30-33 problems each).
# Loaded as parquet via anon-neurips26/hmmt_combined (see scripts/build_hmmt_combined.py).
# Task name under which this configuration is registered.
task: hmmt_combined_pass32
# Data source: the `parquet` dataset path combined with an explicit data file
# that is exposed as the `test` split.
# NOTE(review): the data file path is relative — presumably resolved against
# the working directory of the evaluation run; confirm before relocating.
dataset_path: parquet
dataset_kwargs:
  data_files:
    test: data/hmmt_combined/test.parquet
test_split: test
output_type: generate_until
# Prompt template: the document's `problem` field followed by a blank line and
# the instruction to put the final answer in \boxed{}. `|-` keeps the inner
# newlines and drops the trailing one.
doc_to_text: |-
  Problem: {{problem}}

  Please reason step by step, and put your final answer within \boxed{}.
# Reference answer comes from the document's `answer` field.
doc_to_target: '{{answer}}'
# Scoring is delegated to `process_results` in the sibling `utils` module.
process_results: !function utils.process_results
# Decoding settings applied to every generation of this task.
generation_kwargs:
  # Stop sequences for generation.
  until:
    # Same prefix that opens the prompt in doc_to_text.
    # NOTE(review): a long reasoning trace that itself writes "Problem:" would
    # presumably be truncated at that point — confirm this stop is wanted.
    - "Problem:"
    # End-of-sequence markers of several model families.
    - "</s>"
    - "<|im_end|>"
    - "<|endoftext|>"
    # DeepSeek spells its EOS token with fullwidth bars (U+FF5C), not ASCII
    # pipes, so the ASCII spelling alone never matches the decoded token text.
    # The ASCII variant is kept so existing behaviour is a strict subset.
    - "<｜end▁of▁sentence｜>"
    - "<|end▁of▁sentence|>"
  # Stochastic decoding, so that repeated generations of a problem differ.
  do_sample: true
  temperature: 0.6
  top_p: 0.95
  # Upper bound on generated tokens per sample.
  max_gen_toks: 25000
# Number of generations requested per problem (the "32" in pass@32).
repeats: 32
# NOTE(review): no filter_list is declared in this file. Stock
# lm-evaluation-harness then falls back to a take_first filter, so scoring
# would see only one of the 32 samples — confirm that pass@32 is computed
# elsewhere (e.g. from logged samples) or that repeats are handled upstream.
metric_list:
  # Per-problem value reported under the key `exact_match` (presumably emitted
  # by utils.process_results — verify), averaged over all problems.
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
# Zero-shot: no in-context examples are prepended to the prompt.
num_fewshot: 0
metadata:
  version: 1.0
  description: "HMMT combined (Feb 2024 + Feb 2025 + Nov 2025 + Feb 2026 = 123 problems) pass@32."
