import json
import pandas as pd


# Read the source parquet shard (path is relative to the working directory)
# into a DataFrame; every row is later turned into one prompt record.
data = pd.read_parquet(
    path="openai/gsm8k/main/train-00000-of-00001.parquet",
)

# Few-shot prompt template used as the "instruction" of every output record.
# It describes the required <think>/<answer> response format, shows two worked
# examples, and ends with an open "A:" for the model to complete.
# `{{question}}` is a literal placeholder filled in with str.replace (not
# str.format), so the double braces are part of the marker itself.
# The trailing .strip() drops the newline after the opening quotes and the
# one before the closing quotes.
prompt = '''
Given the following problem, reason and give a final answer to the problem.
You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>
...
</think>
<answer>
...
</answer>.

Here are examples.

Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?

  A: <think>
There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.
</think>
<answer>
6
</answer>

Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?

  A: <think>
There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.
</think>
<answer>
5
</answer>

Q: {{question}}

  A:
'''.strip()


# Pair each raw row with the column names so fields can be read by name.
rows = (dict(zip(data.columns, values)) for values in data.values)

# One record per row: the prompt with the row's question substituted in,
# empty "input"/"output" fields, and the reference answer reduced to the
# text that follows the last "####" marker (the whole string if none exists).
res = [
    {
        "instruction": prompt.replace("{{question}}", row["question"]),
        "input": "",
        "output": "",
        "answer": row["answer"].rpartition("####")[2].strip(),
    }
    for row in rows
]

# Write all records as pretty-printed JSON; ensure_ascii=False keeps any
# non-ASCII characters readable instead of \u-escaping them.
with open("openai_gsm8k_main_train.json", "w", encoding="utf-8") as out_file:
    json.dump(res, out_file, ensure_ascii=False, indent=4)
