#!/usr/bin/env python3
import json
import random
import os

# Split the full dataset into train/test JSON files (95% / 5%).

# Fixed seed so repeated runs over the same input produce the same split.
random.seed(42)

# read data
input_file = "/root/Lean-Mutation/datasets/output/total_data.json"
print(f"read data: {input_file}")

# Keep the try body limited to the calls that can fail on I/O or parsing.
# UnicodeDecodeError and json.JSONDecodeError are both ValueError subclasses.
try:
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
except (OSError, ValueError) as e:
    print(f"read file error: {e}")
    # Raise SystemExit directly: the `exit` helper is injected by the `site`
    # module and is unavailable under `python -S` or in frozen interpreters.
    raise SystemExit(1) from None

# Shuffling and slicing below require a JSON array; fail with a clear message
# instead of an obscure traceback from random.shuffle on a dict/str/number.
if not isinstance(data, list):
    print(f"read file error: expected a JSON array, got {type(data).__name__}")
    raise SystemExit(1)
print(f"total data: {len(data)}")

# shuffle data in place (order is determined by the seed set above)
random.shuffle(data)

# calculate split point; int() truncates, so any remainder goes to the test set
total_count = len(data)
train_count = int(total_count * 0.95)
test_count = total_count - train_count

print(f"train data: {train_count} records (95%)")
print(f"test data: {test_count} records (5%)")

# split data: first train_count shuffled records for training, rest for test
train_data = data[:train_count]
test_data = data[train_count:]

# create output directory (no error if it already exists)
output_dir = "/root/Lean-Mutation/datasets/output"
os.makedirs(output_dir, exist_ok=True)

# save train data; ensure_ascii=False keeps non-ASCII text readable on disk
train_file = os.path.join(output_dir, "train_data.json")
with open(train_file, 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)
print(f"train data saved to: {train_file}")

# save test data
test_file = os.path.join(output_dir, "test_data.json")
with open(test_file, 'w', encoding='utf-8') as f:
    json.dump(test_data, f, ensure_ascii=False, indent=2)
print(f"test data saved to: {test_file}")

print("data split done!")
