import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import os
from tqdm import tqdm

def generate_output(tokenizer, model, input_texts, max_new_tokens=20000):
    """Greedily generate a completion for every prompt in *input_texts*.

    The prompts are tokenized together as one padded batch, moved to the
    device holding the model's first parameter, and passed to
    ``model.generate`` with sampling disabled.

    Args:
        tokenizer: Callable tokenizer that also provides ``decode``,
            ``encode`` and ``eos_token_id``.
        model: Model exposing ``parameters()`` and ``generate()``.
        input_texts: List of prompt strings forming one batch.
        max_new_tokens: Upper bound on newly generated tokens per prompt.
            Defaults to 20000, the value that was previously hard-coded.

    Returns:
        A ``(texts, lengths)`` tuple of two lists aligned with
        *input_texts*: the decoded completions (special tokens removed) and,
        for each, the number of tokens obtained by re-encoding that decoded
        text without special tokens. Both lists are empty when
        *input_texts* is empty.
    """
    # Nothing to generate: return early instead of handing an empty batch
    # to the tokenizer.
    if not input_texts:
        return [], []

    device = next(model.parameters()).device
    inputs = tokenizer(
        input_texts,
        return_tensors="pt",
        padding=True,
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # The EOS id doubles as the padding id for finished sequences.
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False,
        )

    all_outputs = []
    all_lengths = []
    for prompt_ids, sequence in zip(inputs["input_ids"], outputs):
        # Drop the leading prompt-length slice so only the continuation is
        # decoded. NOTE(review): this presumes the returned sequence starts
        # with the (left-padded) prompt -- confirm the tokenizer's padding
        # side at the call site.
        completion_ids = sequence[len(prompt_ids):]
        text = tokenizer.decode(completion_ids, skip_special_tokens=True)
        # Length is measured on the re-encoded decoded text, so padding/EOS
        # tokens stripped by the decode step are not counted.
        token_count = len(tokenizer.encode(text, add_special_tokens=False))
        all_outputs.append(text)
        all_lengths.append(token_count)
    return all_outputs, all_lengths


def process_jsonl(input_file, output_dir, tokenizer, model):
    """Run the model on every record of a JSONL file, one JSON file per record.

    Each non-blank line of *input_file* is parsed as a JSON object whose
    ``"question"`` and ``"answer"`` fields are read. The question is wrapped
    in the chat prompt template, sent to ``generate_output`` as a batch of
    one, and the result is written to
    ``<output_dir>/<stem>-<line number>.json``, where ``<stem>`` is the input
    file's base name up to its first dot and line numbers start at 1.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_dir: Directory for the per-record JSON files; created if it
            does not exist.
        tokenizer: Tokenizer forwarded to ``generate_output``.
        model: Model forwarded to ``generate_output``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Loop-invariant, so computed once. It is also kept out of the f-string
    # below: reusing the enclosing quote character inside an f-string
    # expression is a syntax error on Python versions before 3.12.
    stem = os.path.basename(input_file).split(".")[0]

    with open(input_file, 'r', encoding='utf-8') as infile:
        # idx is the 1-based line number, so output names stay traceable to
        # the source line even when blank lines are skipped.
        for idx, line in enumerate(tqdm(infile), start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue

            data = json.loads(line)
            problem = data.get("question")
            correct_answer = data.get("answer")

            input_text = f"<｜User｜>{problem}<｜Assistant｜><think>"

            model_answers, lengths = generate_output(tokenizer, model, [input_text])
            # Guard both lists the same way so an empty generation result
            # yields an empty answer of length 0 rather than an IndexError.
            model_answer = model_answers[0] if model_answers else ""
            answer_length = lengths[0] if lengths else 0

            result = {
                "Problem": problem,
                "ModelAnswer": model_answer,
                "AnswerLength": answer_length,
                "CorrectAnswer": correct_answer,
            }

            output_path = os.path.join(output_dir, f"{stem}-{idx}.json")
            with open(output_path, 'w', encoding='utf-8') as outfile:
                json.dump(result, outfile, ensure_ascii=False, indent=2)


# --- Run configuration -------------------------------------------------------
model_path = "path/to/model/DeepSeek-R1-Distill-Qwen-14B"

# Alternative input file, kept for reference; uncomment to use it instead.
# input_jsonl_file = "path/to/LongCoT/AIME2025/aime2025-I.jsonl"
input_jsonl_file = "path/to/LongCoT/AIME2025/aime2025-II.jsonl"
output_directory = "path/to/LongCoT/AIME2025/aime_2025_answers"

# --- Model loading -----------------------------------------------------------
# Left padding is requested because generate_output slices each completion
# off the end of the padded prompt.
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left")

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    local_files_only=True,
    low_cpu_mem_usage=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# --- Inference ---------------------------------------------------------------
process_jsonl(
    input_file=input_jsonl_file,
    output_dir=output_directory,
    tokenizer=tokenizer,
    model=model,
)

