from datasets import load_dataset
import json

# ProcessBench subsets, loaded from the same repository in a fixed order and
# bound to one module-level name per split.
gsm8k, math, olympiadbench, omnimath = [
    load_dataset("Qwen/ProcessBench", split=subset)
    for subset in ("gsm8k", "math", "olympiadbench", "omnimath")
]

# Reference datasets: the lookup tables built further down read the
# reference answers from these.
# gsm8k: "main" configuration, "test" split selected from the returned dict.
gsm8k_ref = load_dataset("openai/gsm8k", "main")["test"]
# math: "original" configuration, "test" split selected from the returned dict.
math_ref = load_dataset("jeggers/competition_math", "original")["test"]
# olympiadbench: the "test_en" split is requested directly.
olympiadbench_ref = load_dataset("lmms-lab/OlympiadBench", split="test_en")
# omnimath: the "test" split is requested directly.
omnimath_ref = load_dataset("KbsdJames/Omni-MATH", split="test")

# Accumulates one list of output records per benchmark name.
final_result = {}

# Create lookup dictionaries for reference datasets.
# Each maps a problem statement to the reference answer recorded for it.

# gsm8k: keep only the segment right after the first "####" marker in the
# reference answer, with surrounding whitespace removed.
gsm8k_ref_dict = {
    row['question']: row['answer'].split("####")[1].strip()
    for row in gsm8k_ref
}

# math: the reference row's 'extracted_solution' field is used as-is.
math_ref_dict = {row['problem']: row['extracted_solution'] for row in math_ref}

# olympiadbench: keys are the question text with every "$\quad$" removed and
# outer whitespace stripped. The pattern is a raw string because "\q" is not
# a valid escape sequence; a plain literal triggers a SyntaxWarning on recent
# Python versions. Rows whose 'final_answer' is None or empty are skipped so
# that taking the first answer can never raise IndexError.
olympiadbench_ref_dict = {
    row['question'].replace(r"$\quad$", "").strip(): row['final_answer'][0]
    for row in olympiadbench_ref
    if row['final_answer']
}

# omnimath: the reference row's 'answer' field is used as-is.
omnimath_ref_dict = {row['problem']: row['answer'] for row in omnimath_ref}

# Process gsm8k: keep every item whose problem text has an entry in the
# reference lookup, recording its id, its problem and the looked-up answer.
# Items without a match are left out.
final_result['gsm8k'] = [
    {
        "id": item['id'],
        "problem": item['problem'],
        "final_answer": gsm8k_ref_dict[item['problem']],
    }
    for item in gsm8k
    if item['problem'] in gsm8k_ref_dict
]

# Process math: keep every item whose problem text has an entry in the
# reference lookup, recording its id, its problem and the looked-up answer.
# Items without a match are left out.
final_result['math'] = [
    {
        "id": item['id'],
        "problem": item['problem'],
        "final_answer": math_ref_dict[item['problem']],
    }
    for item in math
    if item['problem'] in math_ref_dict
]

# Process olympiadbench: same matching as the other benchmarks, except that
# every problem with no reference entry is echoed to stdout between two
# separator lines so it can be inspected.
final_result['olympiadbench'] = []
separator = "#"*100
for item in olympiadbench:
    statement = item['problem']
    if statement not in olympiadbench_ref_dict:
        # No reference answer for this problem: report it and move on.
        print(separator)
        print(statement)
        print(separator)
        continue
    final_result['olympiadbench'].append({
        "id": item['id'],
        "problem": statement,
        "final_answer": olympiadbench_ref_dict[statement],
    })

# Process omnimath: keep every item whose problem text has an entry in the
# reference lookup, recording its id, its problem and the looked-up answer.
# Items without a match are left out.
final_result['omnimath'] = [
    {
        "id": item['id'],
        "problem": item['problem'],
        "final_answer": omnimath_ref_dict[item['problem']],
    }
    for item in omnimath
    if item['problem'] in omnimath_ref_dict
]

# Write one "<benchmark>.json" file per benchmark into the current working
# directory, each holding that benchmark's records as 4-space-indented JSON.
for benchmark in ("gsm8k", "math", "olympiadbench", "omnimath"):
    serialized = json.dumps(final_result[benchmark], indent=4)
    with open(f"{benchmark}.json", "w") as out_file:
        out_file.write(serialized)