import json
import random

# Split a JSONL dataset into train/test files, keeping runs of consecutive
# records that share the same "x" value together on one side of the split.

SEED = 42
TEST_FRACTION = 0.05
GROUP_KEY = "x"
SOURCE_PATH = "data/processed/sft/sft.jsonl"
TEST_PATH = "data/processed/sft/sft_test.jsonl"
TRAIN_PATH = "data/processed/sft/sft_train.jsonl"


def parse_jsonl(lines):
    """Decode an iterable of JSONL lines into a list of objects.

    Blank / whitespace-only lines (e.g. a stray trailing newline) are
    skipped instead of raising ``json.JSONDecodeError``.
    """
    return [json.loads(line) for line in lines if line.strip()]


def load_jsonl(path):
    """Read the JSONL file at *path* and return its decoded records."""
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        return parse_jsonl(f)


def group_consecutive(records, key=GROUP_KEY):
    """Group indices of adjacent records whose ``record[key]`` is equal.

    Returns a list of index lists in input order. Each record is compared
    with the first record of the current run. Only *adjacent* records are
    merged: if the same value shows up again later, it starts a new group.
    NOTE(review): if the input is not already ordered by ``key``, equal
    values may therefore end up in both train and test -- confirm upstream.

    An empty ``records`` yields an empty list.
    """
    groups = []
    for idx, record in enumerate(records):
        if groups and records[groups[-1][0]][key] == record[key]:
            groups[-1].append(idx)
        else:
            groups.append([idx])
    return groups


def split_groups(groups, test_fraction=TEST_FRACTION, rng=None):
    """Shuffle *groups* and return ``(test_groups, train_groups)``.

    The test side receives ``int(test_fraction * len(groups))`` groups
    (rounded down, so small inputs may yield an empty test set). *rng* is
    any object with a ``shuffle`` method; it defaults to the global
    ``random`` module. The input list is not mutated.
    """
    shuffler = random if rng is None else rng
    shuffled = list(groups)
    shuffler.shuffle(shuffled)
    test_size = int(test_fraction * len(shuffled))
    return shuffled[:test_size], shuffled[test_size:]


def write_jsonl(path, records):
    """Write *records* to *path*, one JSON document per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)


def main():
    """Run the split using the module-level paths, seed and fraction."""
    # Seed the global generator so the split is reproducible across runs.
    random.seed(SEED)

    records = load_jsonl(SOURCE_PATH)
    test_groups, train_groups = split_groups(group_consecutive(records))

    write_jsonl(TEST_PATH, (records[i] for group in test_groups for i in group))
    write_jsonl(TRAIN_PATH, (records[i] for group in train_groups for i in group))


if __name__ == "__main__":
    main()
