# OMEGA Explorative — 100-problem subset (~2-3 per config × 41 configs).
# Designed for apples-to-apples cross-checkpoint comparison: base + novelty_s15
# were eval'd on the full 410; for sft_v2_ep5 + gspo_v2_s20 we run only this
# subset, then filter the full-410 JSONLs down to these 100 for analysis.
# Mapping from subset doc_id (0..99) -> full doc_id (0..409) lives in
# data/omega/explorative_test_sub100_manifest.json (`original_doc_ids_in_full_parquet`).
# Task identity and request type.
task: omega_explorative_pass32_sub100
output_type: generate_until

# Evaluation data: a local parquet file exposed as the `test` split.
dataset_path: parquet
dataset_kwargs:
  data_files:
    test: data/omega/explorative_test_sub100.parquet
test_split: test

# Prompt template rendered from the `question` field. The literal block below
# parses to the same string as the one-line escaped form: "Problem: ...",
# a blank line, then the instruction, with no trailing newline.
doc_to_text: |-
  Problem: {{question}}

  Please reason step by step, and put your final answer within \boxed{}.
# The reference answer is read from the `answer` field.
doc_to_target: "{{answer}}"
# Scoring is delegated to `process_results` in the `utils` module named by the
# !function tag (module not shown here).
process_results: !function utils.process_results
# Decoding settings. Keep these identical to the full-410 task so that the
# subset results stay comparable with the filtered full-410 runs.
generation_kwargs:
  # Stochastic decoding: sampling enabled, temperature 0.6, nucleus 0.95.
  do_sample: true
  temperature: 0.6
  top_p: 0.95
  # Upper bound on generated tokens per sample.
  max_gen_toks: 25000
  # Stop strings: the start of another "Problem:" block, plus what look like
  # end-of-sequence markers for several tokenizer families (presumably so one
  # config can serve different checkpoints; confirm against the models used).
  until:
    - 'Problem:'
    - '</s>'
    - '<|im_end|>'
    - '<|endoftext|>'
    - '<|end▁of▁sentence|>'
# Samples drawn per problem; presumably the 32 behind "pass32" in the task name.
repeats: 32
# No in-context examples are prepended to the prompt.
num_fewshot: 0

# Reported metric: mean exact_match, where higher is better.
# NOTE(review): no filter_list is visible in this file; confirm how the 32
# repeats are reduced before they reach process_results.
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true

metadata:
  version: 1.0
  # Folded scalar: line breaks below fold to single spaces, giving the same
  # one-line description string as before.
  description: >-
    OMEGA Explorative 100-problem subset pass@32 (stratified across 41
    configs, 2-3 per config). Apples-to-apples with the full-410
    base/novelty_s15 runs after JSONL filter.
