# This script combines all the JSON array files in the input directory
# and splits the combined records into train and test sets.

import os
import json
import random

def combine_json_arrays(input_dir):
    """Load every ``.json`` file in *input_dir* and concatenate their arrays.

    Files are visited in sorted filename order so the combined list is the
    same on every run; ``os.listdir`` itself gives no ordering guarantee.
    Only the directory's immediate entries are scanned (no recursion).
    Files whose top-level JSON value is not an array are reported and skipped.

    Args:
        input_dir: Path of the directory to scan.

    Returns:
        A list holding the elements of every JSON array found, in sorted
        filename order.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    all_data = []
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(input_dir, filename)
        # A sub-directory may also be named "*.json"; open() would fail on it.
        if not os.path.isfile(file_path):
            continue
        print(f"Loading {file_path} ...")
        # Keep the file open only for as long as parsing needs it.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, list):
            all_data.extend(data)
        else:
            print(f"Warning: {filename} is not a JSON array, skipping.")
    print(f"Total records combined: {len(all_data)}")
    return all_data

def split_and_save(data, train_path, test_path, train_ratio=0.95, seed=None):
    """Shuffle *data*, split it into train/test parts and write both as JSON.

    The input list is left untouched; a shuffled copy is split instead.

    Args:
        data: List of JSON-serialisable records.
        train_path: Destination file for the training portion.
        test_path: Destination file for the test portion.
        train_ratio: Fraction of records (between 0 and 1 inclusive) placed
            in the training set; the count is rounded down.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used, so callers that seed
            it globally keep getting the same behaviour as before.

    Raises:
        ValueError: If *train_ratio* is outside the range [0, 1].
        OSError: If either output file cannot be written.
    """
    # Out-of-range ratios would otherwise yield silently wrong slices
    # (a negative count slices from the end of the list).
    if not 0 <= train_ratio <= 1:
        raise ValueError(
            f"train_ratio must be between 0 and 1, got {train_ratio!r}"
        )

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(data)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    n_train = int(len(shuffled) * train_ratio)
    train_data = shuffled[:n_train]
    test_data = shuffled[n_train:]

    with open(train_path, 'w', encoding='utf-8') as f:
        json.dump(train_data, f, ensure_ascii=False, indent=2)
    with open(test_path, 'w', encoding='utf-8') as f:
        json.dump(test_data, f, ensure_ascii=False, indent=2)
    print(f"Train set: {len(train_data)} samples saved to {train_path}")
    print(f"Test set: {len(test_data)} samples saved to {test_path}")

if __name__ == '__main__':
    # Input and output locations are all resolved relative to this script.
    script_dir = os.path.dirname(__file__)
    source_dir = os.path.join(script_dir, 'original_data')
    train_file = os.path.join(script_dir, 'train_data.json')
    test_file = os.path.join(script_dir, 'test_data.json')

    # Merge every JSON array from the input directory, then write a 95/5 split.
    records = combine_json_arrays(source_dir)
    split_and_save(records, train_file, test_file, train_ratio=0.95)
