import random
import shutil
from pathlib import Path

import typer


def main(
    base_dir: Path = Path("data/scrapped_pdfs"),
    output_dir: Path = Path("data/scrapped_pdfs_split"),
    train_size: int = 900,
    test_size: int = 100,
) -> None:
    """
    Split the PDFs in each subfolder into training and testing sets.

    For every subfolder of ``base_dir`` the ``*.pdf`` files are shuffled with
    a fixed seed; the first ``train_size`` files are copied to
    ``output_dir/<subfolder>_train`` and the next ``test_size`` files to
    ``output_dir/<subfolder>_test``. Source files are copied, not moved.
    A subfolder is skipped when either of its output directories already
    exists, so existing splits are never modified.

    Args:
        base_dir: Directory containing one subfolder of PDFs per theme.
        output_dir: Directory receiving the ``*_train`` / ``*_test`` folders;
            created if missing.
        train_size: Maximum number of PDFs per subfolder in the training set.
        test_size: Maximum number of PDFs per subfolder in the testing set.

    Raises:
        ValueError: If ``train_size`` or ``test_size`` is negative.
        FileNotFoundError: If ``base_dir`` does not exist.
    """
    # Negative sizes would silently turn the slices below into
    # "everything except the last N", which is never what is wanted.
    if train_size < 0 or test_size < 0:
        raise ValueError("train_size and test_size must be non-negative.")

    # Set the seed for reproducibility
    random.seed(42)

    base_dir = Path(base_dir)
    if not base_dir.exists():
        raise FileNotFoundError(f"Directory {base_dir} not found.")

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # iterdir() yields entries in arbitrary order; sort so the seeded
    # shuffles below see the same sequence on every run and machine.
    subfolders = sorted(f for f in base_dir.iterdir() if f.is_dir())

    for subfolder in subfolders:
        # glob() order is filesystem-dependent as well; a seeded shuffle is
        # only reproducible when it starts from a deterministic order.
        pdf_files = sorted(subfolder.glob("*.pdf"))

        # Shuffle before the skip check on purpose: the random stream then
        # advances the same way whether or not this subfolder is skipped, so
        # the split of later subfolders does not depend on existing outputs.
        random.shuffle(pdf_files)

        # Split the PDFs into training and testing sets
        train_pdfs = pdf_files[:train_size]
        test_pdfs = pdf_files[train_size:train_size + test_size]

        train_dir = output_dir / f"{subfolder.name}_train"
        test_dir = output_dir / f"{subfolder.name}_test"

        # Decide whether to skip before creating anything, so a leftover
        # directory never causes an empty sibling directory to be created.
        if train_dir.exists() or test_dir.exists():
            print(f"Skipping {subfolder.name}: output directory already exists")
            continue

        if len(pdf_files) < train_size + test_size:
            print(
                f"Warning: {subfolder.name} has only {len(pdf_files)} PDFs "
                f"(requested {train_size} train + {test_size} test); "
                "the split will be smaller than requested."
            )

        train_dir.mkdir()
        test_dir.mkdir()

        # Copy the selected PDFs to their respective directories
        for pdf in train_pdfs:
            shutil.copyfile(pdf, train_dir / pdf.name)

        for pdf in test_pdfs:
            shutil.copyfile(pdf, test_dir / pdf.name)

        print(
            f"Processed {subfolder.name}: {len(train_pdfs)} train files, "
            f"{len(test_pdfs)} test files"
        )

    print("Finished processing all subfolders.")


# Script entry point: only when executed directly (not on import), hand
# main() to typer, which runs it as a command-line application.
if __name__ == "__main__":
    typer.run(main)
