from transformers import AutoTokenizer
import json
import subprocess
from tqdm import tqdm

# --- Run configuration -------------------------------------------------------
# Name of the dataset variant; it is interpolated into both file paths below.
dataset = "orca_30k_train-disturbed_[0.5, 1, 5, 10]_512"
# Tag of the generating model, also interpolated into both file paths.
# NOTE(review): "]" looks like a placeholder/redacted value — confirm before running.
model_name = "]"

# JSONL read by the conversion loop (one JSON object per line).
input_path = f"../data/synthetic_data/{model_name}-generate_template_and_code-{dataset}.jsonl"
# JSONL written by the conversion loop (one record per disturbed sample).
output_path = f"../data/process_data/generate_cot_with_code-{model_name}-{dataset}.jsonl"

# Tokenizer whose chat template is used to render the prompts.
# NOTE(review): the checkpoint identifier is an empty string — presumably a
# placeholder that must be filled in with a real model path; confirm.
tokenizer = AutoTokenizer.from_pretrained("")

# Task description placed at the start of every user message.
# It is a raw string so the backslash in "\boxed{}" is kept literally, and it
# is never passed through str.format, so the braces need no escaping.
prompt = r"""
Your task is to provide a clear chain-of-thought (COT) explanation that answers the user's question. A Python script may be provided as part of the input, but it is not mandatory to follow it closely. If the provided code doesn't align with the real-world scenario or if the values and logic in the code are incorrect or irrelevant to the problem, feel free to disregard the script. Instead, focus on reasoning through the problem using your own judgment and logic.
Interpret the question clearly and begin by understanding the problem. If the Python script can offer guidance, you may refer to it, but it's not a requirement. If the provided code does not match the context or contains errors, you are free to work through the solution from scratch without referring to it.
Explicitly state the final answer after completing your reasoning, enclosed in \boxed{}.
"""

# Per-sample template appended to `prompt`; filled positionally via
# str.format(query, code). The section header previously read "### Resonse:"
# (typo) and has been corrected to "### Response:".
instruction = """
### Query:
{}

### Python Code:
{}

### Response:
"""

# Count the input lines up front so tqdm can show a meaningful total.
# This is done in Python (binary mode, so nothing is decoded) instead of
# shelling out to `wc -l`, which is not available on every platform and does
# not count a final line that lacks a trailing newline.
with open(input_path, "rb") as counter:
    line_count = sum(1 for _ in counter)

# Convert every "disturbed" sample of every input record into one output
# record containing the rendered chat prompt plus the sample's metadata.
# Both files are opened explicitly as UTF-8: records are serialized with
# ensure_ascii=False, so relying on the locale's default encoding could fail
# or corrupt non-ASCII text.
with open(input_path, "r", encoding="utf-8") as f, \
        open(output_path, "w", encoding="utf-8") as o:
    for line in tqdm(f, total=line_count):
        # Tolerate blank lines (e.g. a trailing empty line) instead of
        # crashing in json.loads.
        if not line.strip():
            continue

        item = json.loads(line)
        for inner_idx, inner_item in enumerate(item["disturbed"]):
            query = inner_item["new_query"]
            code = inner_item["new_code"]

            messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt + instruction.format(query, code)},
            ]

            record = {
                "id": item["id"],
                "inner_id": inner_idx,
                # Rendered as text (not token ids) with the generation prompt
                # appended; enable_thinking is forwarded to the chat template.
                "prompt": tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True,
                    enable_thinking=False,
                ),
                "max_fluct": inner_item["max_fluct"],
                "instruction": query,
                "code": code,
                # NOTE(review): this "system" field differs from the system
                # message used inside the rendered prompt above — confirm
                # that this is intentional.
                "system": "Please reason step by step, and put your final answer within \\boxed{}.",
                "input": "",
                "history": [],
                "answer": inner_item["new_ans"],
            }
            o.write(json.dumps(record, ensure_ascii=False) + "\n")
            