from transformers import AutoTokenizer
import json
import re
from collections import defaultdict
from tqdm import tqdm
import pandas as pd

def extract_last_num(text: str) -> float:
    r"""Return the last number found in *text* as a float.

    A comma sitting directly between two digits is removed first, so
    "123,456" is read as "123456". Numbers that immediately follow
    "\boxed{" take priority; when there are none, every number in the
    text is considered. Only unsigned digits with an optional decimal
    part are matched, so a leading minus sign is not captured.

    Returns 0.0 when the text contains no number at all.
    """
    # Strip thousands separators, e.g. "123,456" -> "123456".
    cleaned = re.sub(r"(\d),(\d)", r"\g<1>\g<2>", text)

    # Prefer boxed values, e.g. "\boxed{123456.789".
    matches = re.findall(r"\\boxed\{(\d+(\.\d+)?)", cleaned)
    if not matches:
        # No boxed value: fall back to any number, e.g. "123456.789".
        matches = re.findall(r"(\d+(\.\d+)?)", cleaned)

    if not matches:
        return 0.0

    # findall yields (whole number, fractional part) tuples because the
    # pattern has two groups; the first group is the complete number.
    number_text, _fraction = matches[-1]
    return float(number_text)

# ---- Run configuration ------------------------------------------------------
# NOTE(review): model_name is empty here, so both file names below start with
# "-"; presumably it is filled in before the script is run -- confirm.
model_name = ""
dataset = "generate_paraphrase_question-aug_32_5"
input_path = f"../results/{model_name}-{dataset}.jsonl"
output_path = f"../../math_training/data/{model_name}-{dataset}_para_32.parquet"

# ---- Load the JSONL input: one JSON object per line -------------------------
data = []
# Decode explicitly as UTF-8 so reading does not depend on the platform's
# default locale encoding.
with open(input_path, 'r', encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a stray empty line at the end of the file),
        # which json.loads would reject.
        if line.strip():
            data.append(json.loads(line))


# ---- Build one training record per generated paraphrase ---------------------
# Case-insensitive, multi-line pattern: a line that starts (after optional
# whitespace) with "Rephrase ... question:" plus the whitespace following it.
pattern = r'(?im)^\s*Rephrase .*? question:\s*'
# Compiled once because it is applied to every generated text.
rephrase_header = re.compile(pattern)
results = []
for item in data:
    # Only the first 16 generated texts of each item are used.
    # NOTE(review): the output file name says "_para_32" -- confirm that 16 is
    # the intended cap.
    for generated_text in item["generated_texts"][:16]:
        results.append({
            "data_source": f"{item['id']}-paraphrase_question",
            "prompt": [
                {"role": "system", "content": item["system"]},
                # The user turn is the generated text with the
                # "Rephrase ... question:" header removed.
                {"role": "user", "content": rephrase_header.sub('', generated_text)}
            ],
            "reward_model": {
                "style": "rule",
                "ground_truth": str(item["answer"])
            },
            "extra_info": {
                "id": item["id"],
                "max_fluct": item["max_fluct"],
                "code": item["code"]
            }
        })

# ---- Write the records out as a parquet file --------------------------------
df = pd.DataFrame(results)
df.to_parquet(output_path, engine="pyarrow", index=False)