import json
import random
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of the file to read; one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline) carry no record; skipping
        # them avoids a spurious JSONDecodeError.
        return [json.loads(line) for line in f if line.strip()]


def make_record(dataset, inp, out, instruction=""):
    """Build one merged-corpus record in the unified output schema.

    Args:
        dataset: Name stored under the "dataset" key.
        inp: Text stored as the single turn's "input".
        out: Text stored as the single turn's "output".
        instruction: Text stored under "instruction" (empty by default).

    Returns:
        A dict with the keys "corpus", "dataset", "instruction" and
        "conversation", in that order. "corpus" is always the empty string.
    """
    return {
        "corpus": "",
        "dataset": dataset,
        "instruction": instruction,
        "conversation": [{"input": inp, "output": out}],
    }


def convert_slimorca(data):
    """Convert one slimorca record to the unified output schema.

    A record with exactly three entries in "conversations" is mapped as
    (instruction, input, output). Any other length uses the first two
    entries as (input, output) with an empty instruction.

    NOTE(review): presumably the first of three turns is a system prompt,
    and records never hold fewer than two turns -- confirm against the data.

    Args:
        data: A dict with a "conversations" list whose items have a "value".

    Returns:
        The record produced by ``make_record`` for dataset "slimorca".
    """
    turns = data["conversations"]
    if len(turns) == 3:
        return make_record(
            "slimorca",
            turns[1]["value"],
            turns[2]["value"],
            instruction=turns[0]["value"],
        )
    return make_record("slimorca", turns[0]["value"], turns[1]["value"])


def save_json(records, path):
    """Write *records* to *path* as indented, non-ASCII-escaped JSON.

    Args:
        records: JSON-serialisable object to dump.
        path: Destination file path; overwritten if it exists.
    """
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly or the write can fail on other locales.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4, ensure_ascii=False)


def main():
    """Sample the three source datasets and write the merged 500k corpus.

    Draws 300000 slimorca, 100000 code-evol and 100000 meta-math records
    with a fixed seed and writes them, in that order, to
    ``data/all-500k.json``.

    Raises:
        ValueError: From ``random.sample`` if a source holds fewer records
            than its sample size.
    """
    # The seed and the order of the sample() calls together determine which
    # records are drawn; keep both unchanged for reproducible output.
    random.seed(42)

    merge_result = []

    slimorca = random.sample(load_jsonl("data/slimorca.jsonl"), 300000)
    merge_result.extend(convert_slimorca(data) for data in slimorca)

    code = random.sample(
        load_jsonl("data/data-evol_instruct-decontaminated.jsonl"), 100000
    )
    merge_result.extend(
        make_record("code-evol", data["instruction"], data["response"])
        for data in code
    )

    # MetaMathQA is read with json.load as one JSON document, not JSONL.
    with open("data/MetaMathQA-395K.json", "r", encoding="utf-8") as f:
        math_data = json.load(f)
    math_data = random.sample(math_data, 100000)
    merge_result.extend(
        make_record("meta-math", data["query"], data["response"])
        for data in math_data
    )

    save_json(merge_result, "data/all-500k.json")


if __name__ == "__main__":
    main()
    


