import json
import os

def load_json(file):
    """Read the UTF-8 encoded JSON document at ``file`` and return the parsed value.

    Args:
        file: Path of the JSON file to read.

    Returns:
        Whatever Python object the JSON text decodes to.
    """
    with open(file, mode="r", encoding="utf8") as source:
        raw_text = source.read()
    return json.loads(raw_text)
    
def write_json(file, dict):
    """Write ``dict`` to ``file`` as UTF-8 JSON, indented by 4 spaces.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).

    Args:
        file: Path of the file to create or overwrite.
        dict: Any JSON-serializable object. The name shadows the builtin but
            is kept so existing keyword callers continue to work.

    Raises:
        TypeError: If the object is not JSON-serializable. The target file is
            left untouched in that case.
    """
    # Serialize before opening: opening in "w" mode truncates the target, so
    # encoding first keeps an existing file intact if serialization fails.
    payload = json.dumps(dict, indent=4, ensure_ascii=False)
    with open(file, "w", encoding="utf8") as f:
        f.write(payload)


# Placeholders: set both paths before running the script.
input_file = ''
output_dir = ''

if not input_file or not output_dir:
    # Fail fast with an actionable message; otherwise os.makedirs('') raises
    # an opaque FileNotFoundError.
    raise SystemExit("Set input_file and output_dir at the top of this script before running it.")

os.makedirs(output_dir, exist_ok=True)

# The output file is named after the part of the input *file name* that follows
# the last '__' (the whole file name when there is no '__'). Taking the
# basename first matters: os.path.join discards output_dir when its second
# argument is an absolute path, which would make the script overwrite its own
# input, and a '__' inside a directory name would otherwise leak path
# components into the output name.
output_path = os.path.join(output_dir, os.path.basename(input_file).split('__')[-1])


data = load_json(input_file)

print(len(data))



# Instruction template placed in the "instruction" field of every SFT example.
# The literal token "[PROMPT]" is a placeholder; the conversion loop below
# substitutes each record's question for it with str.replace.
# NOTE: the text is emitted verbatim at runtime, including the line break after
# "external" and the trailing space after "imports." -- do not reflow it.
# NOTE: this is not a raw string, so "\\boxed" becomes "\boxed" at runtime.
inst_prompt: str= '''Solve the following problem step by step. You can selectively use Python with z3 (for logic) and sympy (for calculations) to verify reasoning. Python code will run in an external
sandbox, returning output as <interpreter>output</interpreter>. The python code should be complete scripts, including necessary imports. 

Revise reasoning if sandbox returns 'disproved' or fix code if execution errors occur.

Code Format:
Each code snippet is wrapped with
<code>
```python
code snippet
```
</code>

Response must end exactly as:
<answer>
[Summary of all reasoning steps]
\\boxed{[Final answer]}
</answer>

[PROMPT]
'''

# Convert every loaded record into one SFT example and write the result.
sft_data = []
skipped = 0
for record in data:
    # Use "formalized_cot" when 'consist' is 'Yes' or absent; otherwise fall
    # back to "verified_cot".
    # NOTE(review): presumably 'consist' marks whether the formalized trace
    # agreed with the reference -- confirm against the data producer.
    if record.get('consist', "Yes") == 'Yes':
        thinking = record.get("formalized_cot")
    else:
        thinking = record.get("verified_cot")

    if thinking is None:
        # Without this guard the f-string below would embed the literal text
        # "None" as the reasoning of a training example.
        skipped += 1
        continue

    answer = record["summary"]
    formal_answer = f"{thinking}\n\n<answer>\n{answer}\n</answer>"

    item = {
        'id': f"TIGER-Lab/WebInstruct-verified-{record['id']}",
        "instruction": inst_prompt.replace("[PROMPT]", record['question'].strip()),
        "input": "",
        "output": formal_answer,
        "ground_truth": record['answer'],
        "answer_type": record['answer_type'],
        "category": record['category'],
        "difficulty": record['difficulty'],
    }

    sft_data.append(item)


print("Total: ", len(sft_data))
if skipped:
    # Only reported when something was dropped, so normal runs print the same
    # output as before.
    print("Skipped (missing reasoning trace): ", skipped)
write_json(output_path, sft_data)