import os
import argparse
import random
import shutil
import re
from typing import List, Tuple

import pandas as pd


def get_subdirectories(raw_dataset_path: str) -> List[str]:
    """Return the names of the immediate subdirectories of a directory.

    The names are returned in sorted order. ``os.listdir`` yields entries in
    an arbitrary, filesystem-dependent order, so sorting gives callers that
    sample from this list with a seeded RNG the same input on every machine.

    Args:
        raw_dataset_path: Directory whose direct children are inspected.

    Returns:
        Sorted list of entry names (not full paths) that are directories.

    Raises:
        ValueError: If ``raw_dataset_path`` does not exist.
    """
    if not os.path.exists(raw_dataset_path):
        raise ValueError(f"Raw dataset path does not exist: {raw_dataset_path}")

    return sorted(
        entry
        for entry in os.listdir(raw_dataset_path)
        if os.path.isdir(os.path.join(raw_dataset_path, entry))
    )


def parse_instruction_file(instruction_path: str) -> Tuple[str, List[int]]:
    """Extract the prompt and the answer coordinates from an instruction file.

    The file must contain at least three lines. The first line is ignored,
    the second line is the prompt, and the third line must contain a
    bracketed, comma-separated list of non-negative integers such as
    ``[1, 4, 7]``. Empty tokens (for example from a trailing comma) are
    skipped.

    Args:
        instruction_path: Path to the UTF-8 encoded instruction file.

    Returns:
        A ``(prompt, coordinates)`` tuple, where ``prompt`` is the stripped
        second line and ``coordinates`` is the list of integers found inside
        the brackets on the third line.

    Raises:
        ValueError: If the file does not exist, has fewer than three lines,
            or its third line holds no parseable bracketed integer list.
    """
    if not os.path.exists(instruction_path):
        raise ValueError(f"Instruction file does not exist: {instruction_path}")

    with open(instruction_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    if len(lines) < 3:
        raise ValueError(
            f"Instruction file must have at least 3 lines: {instruction_path}"
        )

    prompt = lines[1].strip()
    answer_line = lines[2].strip()
    match = re.search(r"\[([0-9,\s]+)\]", answer_line)
    if not match:
        raise ValueError(f"Could not parse coordinates from: {answer_line}")

    # The pattern also admits stray separators ("[1, 2,]" or "[ , ]"), which
    # produce empty tokens; drop them instead of failing inside int('').
    tokens = (token.strip() for token in match.group(1).split(","))
    try:
        coordinates = [int(token) for token in tokens if token]
    except ValueError as err:
        # e.g. "[1 2]": digits separated by whitespace only.
        raise ValueError(
            f"Could not parse coordinates from: {answer_line}"
        ) from err

    if not coordinates:
        raise ValueError(f"Could not parse coordinates from: {answer_line}")

    return prompt, coordinates


def copy_image_file(source_path: str, dest_path: str) -> None:
    """Copy a file to ``dest_path``, creating the destination directory first.

    Args:
        source_path: Path of the file to copy.
        dest_path: Full path (including file name) of the copy.
    """
    dest_dir = os.path.dirname(dest_path)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError; in that case there is no directory to create.
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)
    shutil.copy2(source_path, dest_path)


def create_dataset(
    raw_dataset_path: str, output_dataset_path: str, num_instances: int
) -> None:
    """Sample raw instances and write them out as images plus ``dataset.csv``.

    Each subdirectory of ``raw_dataset_path`` is treated as one instance and
    is expected to contain ``collage.jpg`` and ``instruction.txt``. A random
    sample of instances is drawn with the module-level ``random`` generator.
    For every sampled instance the collage is copied to
    ``<output_dataset_path>/images/<instance>.jpg`` and a row with the columns
    ``ID``, ``SceneName``, ``ImagePath``, ``Prompt`` and ``Correct`` is
    collected. Instances with a missing file, or whose processing raises, are
    reported on stdout and skipped. The CSV is only written when at least one
    instance was processed successfully.

    Args:
        raw_dataset_path: Directory containing one subdirectory per instance.
        output_dataset_path: Directory receiving ``images/`` and
            ``dataset.csv``; created if necessary.
        num_instances: Number of instances to sample. Values larger than the
            number of available instances are clamped with a warning.

    Raises:
        ValueError: If ``num_instances`` is negative. Errors raised by
            ``get_subdirectories`` for the raw dataset path also propagate.
    """
    # Reject a negative count up front: it would slip past the upper-bound
    # clamp below and only fail inside random.sample with a generic message.
    if num_instances < 0:
        raise ValueError(
            f"num_instances must be non-negative, got {num_instances}"
        )

    subdirs = get_subdirectories(raw_dataset_path)
    print(f"Found {len(subdirs)} instances in raw dataset")

    if num_instances > len(subdirs):
        print(
            f"Warning: Requested {num_instances} instances but only {len(subdirs)} available"
        )
        num_instances = len(subdirs)

    selected_instances = random.sample(subdirs, num_instances)
    print(f"Selected {len(selected_instances)} instances: {selected_instances}")

    output_images_dir = os.path.join(output_dataset_path, "images")
    os.makedirs(output_images_dir, exist_ok=True)

    dataset_rows = []

    for instance_name in selected_instances:
        instance_path = os.path.join(raw_dataset_path, instance_name)

        collage_path = os.path.join(instance_path, "collage.jpg")
        instruction_path = os.path.join(instance_path, "instruction.txt")

        if not os.path.exists(collage_path):
            print(f"Warning: collage.jpg not found in {instance_name}, skipping")
            continue

        if not os.path.exists(instruction_path):
            print(f"Warning: instruction.txt not found in {instance_name}, skipping")
            continue

        try:
            # Parse before copying so a bad instruction file never leaves an
            # image in the output without a matching CSV row.
            prompt, coordinates = parse_instruction_file(instruction_path)

            output_image_path = os.path.join(output_images_dir, f"{instance_name}.jpg")
            copy_image_file(collage_path, output_image_path)

            dataset_rows.append(
                {
                    "ID": instance_name,
                    "SceneName": "recaptcha",
                    # Stored relative to the output directory.
                    "ImagePath": f"images/{instance_name}.jpg",
                    "Prompt": prompt,
                    "Correct": coordinates,
                }
            )

            print(f"Processed {instance_name}: {len(coordinates)} coordinates")

        except Exception as e:
            # Deliberate best-effort boundary: one bad instance is reported
            # and skipped rather than aborting the whole run.
            print(f"Error processing {instance_name}: {str(e)}")
            continue

    if dataset_rows:
        df = pd.DataFrame(dataset_rows)
        csv_path = os.path.join(output_dataset_path, "dataset.csv")
        df.to_csv(csv_path, index=False)
        print(f"Dataset saved to {csv_path} with {len(dataset_rows)} entries")
    else:
        print("No valid instances processed")


def main():
    """Command-line entry point.

    Parses ``--raw-dataset``, ``--output-dataset``, ``--num-instances`` and
    ``--seed``, seeds the ``random`` module, resolves both paths to absolute
    form, echoes the configuration and then calls ``create_dataset``.
    """
    cli = argparse.ArgumentParser(description="Create dataset from raw captcha data")

    # Options are registered in this order, which is also the order shown
    # in the generated --help output.
    option_specs = (
        (
            "--raw-dataset",
            {"type": str, "required": True, "help": "Path to raw dataset directory"},
        ),
        (
            "--output-dataset",
            {
                "type": str,
                "required": True,
                "help": "Path to output dataset directory",
            },
        ),
        (
            "--num-instances",
            {
                "type": int,
                "required": True,
                "help": "Number of instances to randomly select",
            },
        ),
        (
            "--seed",
            {
                "type": int,
                "default": 42,
                "help": "Random seed for reproducible selection",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    options = cli.parse_args()

    # Seed the module-level generator before any sampling takes place.
    random.seed(options.seed)

    raw_root = os.path.abspath(options.raw_dataset)
    output_root = os.path.abspath(options.output_dataset)
    requested = options.num_instances

    print(f"Raw dataset path: {raw_root}")
    print(f"Output dataset path: {output_root}")
    print(f"Number of instances: {requested}")

    create_dataset(
        raw_dataset_path=raw_root,
        output_dataset_path=output_root,
        num_instances=requested,
    )


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
