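# NOTE: The commented-out section below is a disabled AIME 2023 export that is
# kept for reference only; none of it runs.
# import json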
# from datasets import load_dataset, concatenate_datasets

# ds = concatenate_datasets([
#     load_dataset("MathArena/aime_2023_I", split="train"),
#     load_dataset("MathArena/aime_2023_II", split="train"),
# ])

# output_data = []

# for row in ds:
#     output_data.append({
#         "sol" : [row.get("answer")],
#         "question": row.get("problem"),
#         "cot_type": ["math"],
#         "source_type": "AIME23",
        
#     })

# # Save to JSON
# output_path = "aime23.json"
# with open(output_path, "w", encoding="utf-8") as f:
#     json.dump(output_data, f, indent=2, ensure_ascii=False)

# print(f"Saved {len(output_data)} rows to {output_path}")

from datasets import load_dataset
import json
from collections import defaultdict

# Load dataset
ds = load_dataset("HuggingFaceH4/MATH-500", split="test")

# Group examples by level
level_data = defaultdict(list)

for ex in ds:
    level = ex["level"]  # levels are integers 1–5

    formatted_ex = {
        "question": ex["problem"],
        "sol": ex["solution"],
        "cot_type": "math",
        "source": "HuggingFaceH4/MATH-500",
        "thinking_trajectories": "solution"
    }

    level_data[level].append(formatted_ex)

# Write one JSON file per level
for level, examples in level_data.items():
    filename = f"math_500_level_{level}.json"
    with open(filename, "w") as f:
        json.dump(examples, f, indent=2)

    print(f"Saved {len(examples)} examples to {filename}")
