# Task identity: the name this configuration is registered under and the
# tag(s) it is grouped by.
task: aime_nofigures_pass16
tag:
  - math_word_problems

# Data source. `train` is the split used for evaluation (test_split).
dataset_path: simplescaling/aime_nofigures
dataset_name: default
test_split: train

# Python hooks referenced through the `!function` tag; all three live in a
# module named `utils` (expected to sit alongside this file).
process_docs: !function utils.process_docs
doc_to_text: !function utils.doc_to_text
process_results: !function utils.process_results

# Free-form generation; the reference is taken from `answer`
# (presumably a field of each processed doc -- confirm in utils.process_docs).
output_type: generate_until
doc_to_target: answer
# Decoding settings applied to every generation.
# `repeats` below asks for 16 generations per problem. The pass@k / cov@k /
# maj@k metrics only carry information when those generations can differ, so
# sampling has to be enabled: with `do_sample: false`, a backend that honours
# the flag decodes greedily, produces 16 identical outputs and disregards
# `temperature: 1`.
generation_kwargs:
  until: []  # empty list: no extra stop sequences are configured
  do_sample: true
  temperature: 1
  max_gen_toks: 32768
repeats: 16  # Number of generations per problem to support pass@k, cov@k, maj@k
# One filter pipeline, named `all`. Its only step is `take_first_k` with
# k: 16, the same number as `repeats`.
# NOTE(review): by its name this keeps the first k generations per problem,
# i.e. all of them here -- confirm against the harness filter documentation.
filter_list:
  - name: all
    filter:
      - function: take_first_k
        k: 16
# Metrics reported for this task. Every entry except the last two is
# aggregated with `mean`; the last two use `bypass`.
# NOTE(review): the metric names are presumably the keys returned by
# utils.process_results -- confirm they match exactly.
metric_list:
  # Plain exact-match score.
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
  # pass@k family, k = 16, 8, 4, 2.
  - metric: pass@16
    aggregation: mean
    higher_is_better: true
  - metric: pass@8
    aggregation: mean
    higher_is_better: true
  - metric: pass@4
    aggregation: mean
    higher_is_better: true
  - metric: pass@2
    aggregation: mean
    higher_is_better: true
  # cov@k family, k = 16, 8, 4, 2.
  - metric: cov@16
    aggregation: mean
    higher_is_better: true
  - metric: cov@8
    aggregation: mean
    higher_is_better: true
  - metric: cov@4
    aggregation: mean
    higher_is_better: true
  - metric: cov@2
    aggregation: mean
    higher_is_better: true
  # maj@k family, k = 16, 8, 4, 2.
  - metric: maj@16
    aggregation: mean
    higher_is_better: true
  - metric: maj@8
    aggregation: mean
    higher_is_better: true
  - metric: maj@4
    aggregation: mean
    higher_is_better: true
  - metric: maj@2
    aggregation: mean
    higher_is_better: true
  # Entries with `aggregation: bypass` instead of `mean`
  # (presumably raw per-problem records rather than scores -- confirm).
  - metric: extracted_answers
    aggregation: bypass
    higher_is_better: true
  - metric: exact_matches
    aggregation: bypass
    higher_is_better: true
# Version of this task configuration. The value is unquoted, so YAML parses
# it as the float 1.0 rather than a string.
metadata:
  version: 1.0
