# Task config: Countdown with 4 numbers, 32 sampled generations per document.
task: countdown_4num_pass32

# Data: a single local parquet file exposed as the `train` split, which is
# also the split evaluated (`test_split`). No few-shot examples are prepended.
dataset_path: parquet
dataset_kwargs:
  data_files:
    train: data/countdown_4num/lm_eval_hf/train/data.parquet
test_split: train
num_fewshot: 0

# Prompting and scoring: free-form generation. Prompt construction and result
# scoring are delegated to functions in the `utils` module; the target string
# is intentionally empty.
output_type: generate_until
doc_to_text: !function utils.doc_to_text
doc_to_target: ''
process_results: !function utils.process_results

# Decoding: sampling enabled at temperature 1.0 / top_p 1.0, at most 1024
# generated tokens, stopping at any of the listed end-of-sequence markers.
generation_kwargs:
  do_sample: true
  temperature: 1.0
  top_p: 1.0
  max_gen_toks: 1024
  until: ['</s>', '<|im_end|>', '<|endoftext|>']

# Number of generations requested per document.
# NOTE(review): no `filter_list` is defined here. In the lm-evaluation-harness
# versions I know, the default filter is `take_first`, which would hand only
# the first of the 32 samples to `process_results` -- confirm that pass@32 is
# derived elsewhere (e.g. from logged samples) or that a filter is not needed.
repeats: 32

# Reported metric: per-document `exact_match`, averaged over documents.
# Presumably `utils.process_results` returns a dict keyed by `exact_match`;
# verify against that function.
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true

metadata:
  version: 1.0
  description: 'Countdown 4-number pass@32 (200 test × 32 samples, temp=1.0)'
