import os
import random

def _read_unique_words(file_path, encoding):
    """Return the distinct, stripped, non-blank lines of *file_path*, sorted."""
    with open(file_path, 'r', encoding=encoding) as f:
        stripped = (line.strip() for line in f)
        # Sorting puts the words in a canonical order.  A bare ``list(set(...))``
        # order depends on per-process string hash randomization, which would
        # make the seeded shuffle give different splits on different runs.
        return sorted({word for word in stripped if word})


def _write_words(file_path, words, encoding):
    """Write *words* to *file_path*, one word per line."""
    with open(file_path, 'w', encoding=encoding) as f:
        f.writelines(word + '\n' for word in words)


def split_toxic_words(
    source_dir,
    embedding_method,
    split_info,
    seed=42,
    *,
    encoding='utf-8',
    output_root='',
):
    """Split per-category word lists into disjoint train and test files.

    For every category in ``split_info`` the file ``<source_dir>/<category>.txt``
    is read, blank lines and duplicate words are dropped, the remaining words
    are shuffled, and the first ``train_count`` words go to the train file and
    the next ``test_count`` words go to the test file.  Output is written to
    ``train_<embedding_method>/original_words/<category>.txt`` and
    ``test_<embedding_method>/original_words/<category>.txt`` under
    ``output_root``.

    All categories are read and validated before any directory or file is
    created, so a failure leaves no partial output behind.

    Args:
        source_dir: Directory containing one ``<category>.txt`` per category.
        embedding_method: Label embedded in the output directory names.
        split_info: Mapping ``category -> (train_count, test_count)``.
        seed: Seed for the shuffle.  The module-level ``random`` generator is
            reseeded with it, as in the original implementation.
        encoding: Text encoding used for reading and writing the word files.
        output_root: Directory under which the output folders are created.
            The default (empty string) keeps them relative to the current
            working directory.

    Raises:
        ValueError: If a requested train or test count is negative.
        AssertionError: If a category has fewer unique words than
            ``train_count + test_count``.
        OSError: If a source file cannot be read or an output cannot be
            written (for example ``FileNotFoundError`` for a missing category).
    """
    # Output directory names
    train_dir = os.path.join(output_root, f'train_{embedding_method}/original_words')
    test_dir = os.path.join(output_root, f'test_{embedding_method}/original_words')

    # Set random seed
    random.seed(seed)

    # Phase 1: read, validate and split everything in memory.
    splits = {}
    for category, (train_count, test_count) in split_info.items():
        if train_count < 0 or test_count < 0:
            raise ValueError(
                f"[Error] Negative split size for category '{category}': "
                f"({train_count}, {test_count})."
            )

        words = _read_unique_words(
            os.path.join(source_dir, f'{category}.txt'), encoding
        )

        if len(words) < train_count + test_count:
            # Raised explicitly rather than via `assert` so the check is not
            # stripped under `python -O`; the exception type is kept so that
            # existing callers observe the same error.
            raise AssertionError(
                f"[Error] Not enough words in category '{category}' to split."
            )

        random.shuffle(words)
        splits[category] = (
            words[:train_count],
            words[train_count:train_count + test_count],
        )

    # Phase 2: every category is valid, so it is now safe to write output.
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    print(f"[Info] Created folders: {train_dir}, {test_dir}")

    for category, (train_words, test_words) in splits.items():
        _write_words(os.path.join(train_dir, f'{category}.txt'), train_words, encoding)
        _write_words(os.path.join(test_dir, f'{category}.txt'), test_words, encoding)

        print(f"[Done] {category}: Train = {len(train_words)}, Test = {len(test_words)} saved.")

    print("\n[Summary]")
    for category, (train_words, test_words) in splits.items():
        print(f"{category}: Train = {len(train_words)}, Test = {len(test_words)}")

    print("\n[Complete] Train/Test split finished.")


# Example execution
if __name__ == "__main__":
    # (train, test) word counts requested for each category file in the pool.
    split_vocab_info = dict(
        sexual=(507, 127),
        insult=(468, 118),
        hate=(130, 33),
        drug=(32, 9),
        crime=(36, 10),
    )

    split_toxic_words(
        source_dir='VIPER/toxic_word_pool',
        # Label used in the output folder names, e.g. easyocr, paddleocr, viper.
        embedding_method='easyocr',
        split_info=split_vocab_info,
    )