# OMEGA Explorative — test_out only (N=134, the actual OOD eval split).
# Designed for cross-checkpoint comparison on the harder OOD problems.
# Per the OMEGA repo, "test_out" = "Test Out-of-Distribution: higher complexity
# problems from the same domain/template" — this is the eval split the OMEGA
# paper designed. Our previous omega_explorative_pass32 task bundled test_in
# (saturated easy split) with test_out, diluting recipe signal.
#
# Mapping from subset doc_id (0..133) -> full-410 doc_id (0..409) lives in
# data/omega/explorative_test_out_134_manifest.json.
# --- Task identity and data source -----------------------------------------
# A single local parquet file is loaded and exposed as the `test` split,
# which is also the split that gets evaluated.
task: omega_explorative_test_out_pass32
dataset_path: parquet
dataset_kwargs:
  data_files:
    test: data/omega/explorative_test_out_134.parquet
test_split: test
output_type: generate_until

# --- Prompt ----------------------------------------------------------------
# Zero-shot; the prompt asks for the final answer inside \boxed{}. The prompt
# string is kept on one line, byte-for-byte, so results stay comparable with
# earlier runs.
num_fewshot: 0
doc_to_text: "Problem: {{question}}\n\nPlease reason step by step, and put your final answer within \\boxed{}."
doc_to_target: "{{answer}}"

# --- Scoring ---------------------------------------------------------------
# Per-sample scoring is delegated to utils.process_results; the mean of its
# `exact_match` values is the reported number.
process_results: !function utils.process_results
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true

# --- Sampling --------------------------------------------------------------
# 32 sampled generations are requested per problem.
# NOTE(review): no `filter_list` is configured here. In upstream
# lm-evaluation-harness the default filter keeps only the first of the
# repeated responses before process_results runs — confirm that pass@32 is
# computed from the logged samples or by a patched harness.
repeats: 32
generation_kwargs:
  do_sample: true
  temperature: 0.6
  top_p: 0.95
  max_gen_toks: 25000
  # Generation is cut at the first occurrence of any of these strings.
  # NOTE(review): "Problem:" will also cut a reasoning trace that restates
  # the problem — confirm this is intended with a 25000-token budget.
  until:
    - "Problem:"
    - "</s>"
    - "<|im_end|>"
    - "<|endoftext|>"
    - "<|end▁of▁sentence|>"
    # Same marker spelled with fullwidth bars (U+FF5C), which is how DeepSeek
    # tokenizers write their EOS token; the ASCII-bar entry above cannot match
    # that spelling. Written with escapes so the difference is visible.
    - "<\uFF5Cend\u2581of\u2581sentence\uFF5C>"

metadata:
  version: 1.0
  description: >-
    OMEGA Explorative test_out only (N=134) pass@32. The actual OOD eval
    split — discriminating between checkpoints (vs test_in which saturates).
    Strict boxed-only scoring (utils.py post-2026-05-05 patch).
