

import json
from transformers import AutoTokenizer
import random

# Tokenizer whose chat template is used to render each prompt.
# NOTE(review): the checkpoint id is an empty string here — presumably a
# placeholder that must be filled in before the script can run; confirm.
tokenizer = AutoTokenizer.from_pretrained("")

# Dataset tag interpolated into both the input and the output file names.
dataset_name = "orca"
input_path = "../data/raw_data/{}-train.json".format(dataset_name)
output_path = (
    "../data/process_data/"
    f"generate_template_and_code-{dataset_name}_30k_train.json"
)


# Task instruction plus one worked example, prepended to every user message.
# It is a raw string so the LaTeX backslashes in the example response are
# kept literally.
#
# The example's Python code rounds the cube root: 1331 ** (1/3) evaluates to
# 10.999999999999998 in floating point, which would make the demonstrated
# program print a value that disagrees with the example response (A = 11).
instruction = r"""Task Description:
You are given a natural language query and its chain-of-thought response. Your task is to:
Generate a Query Template by abstracting specific values into variables.
Generate Python Code that executes the logic described in the COT response using the abstracted variables.

Input Format:
Query: Original query with specific values
Response: Chain-of-thought reasoning that leads to the answer

Output Requirements:
Query Template:
Replace only concrete values in the query with angle-bracketed placeholders like <variable_name>.
Do not replace names or general nouns (e.g., do not change “Jungkook” to <person_name>).
Preserve the original wording and structure of the query as much as possible.
Python Code:
Begin by defining variables that correspond to the placeholders in the template.
Translate the logic in the response into executable Python code.
The code should end with a print() statement that prints only the final result.
Do not include comments with explanations or reasoning.
Use the same variable names as in the template for consistency.

=== START EXAMPLE ===
### Query:
Find A that satisfies 32×A×A×A=42592

### Response:
To find the value of A that satisfies the equation 32×A×A×A=42592, we can rewrite the equation as:
\(32A^3 = 42592\)
Now, we need to isolate A by dividing both sides of the equation by 32:
\(A^3 = \frac{42592}{32}\)
\(A^3 = 1331\)
Now, we take the cube root of both sides to solve for A:
\(A = \sqrt[3]{1331}\)
\(A = 11\)

### Template:
Find A that satisfies <coefficient>×A×A×A=<result>

### Python Code:
```python
# Variable definitions
coefficient = 32
result = 42592

# Calculation
A_cubed = result / coefficient
A = round(A_cubed ** (1/3))

# Output
print(A)
```
=== END EXAMPLE ===
"""

# Per-example suffix appended after the instruction text. The two `{}` slots
# are filled, in order, with the item's query and its chain-of-thought
# response via str.format.
prompt = (
    "\n"
    "Instruction:\n"
    "### Query:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}\n"
    "\n"
)


# Load the raw examples. The file is decoded explicitly as UTF-8 so the
# result does not depend on the platform's locale encoding.
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Shuffle in place and keep at most the first 30000 examples.
# NOTE(review): no random seed is set, so the selected subset differs from
# run to run — confirm whether reproducibility is required.
random.shuffle(data)
data = data[:30000]

# Build one output record per sampled example: the chat-formatted prompt
# string plus the original query, chosen response and answer, with a
# sequential id taken from the example's position after shuffling.
results = []
for idx, item in enumerate(data):
    # Instruction text followed by this example's query/response pair.
    user_content = instruction + prompt.format(item["query"], item["chosen"])
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_content},
    ]

    # tokenize=False asks for the rendered text rather than token ids;
    # add_generation_prompt=True is passed so the template appends the
    # assistant-turn opener.
    chat_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    record = {
        "id": idx,
        "query": item["query"],
        "prompt": chat_prompt,
        "chosen": item["chosen"],
        "answer": item["answer"],
    }
    results.append(record)

# Write all records as pretty-printed JSON. ensure_ascii=False emits
# non-ASCII characters verbatim instead of \uXXXX escapes, so the file is
# opened with an explicit UTF-8 encoding rather than the locale default.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)