import json
import os
import typer
from loguru import logger

def clean_vitonhd(
    data_dir: str = typer.Option("DATA/VITON-HD", help="The directory where VITON-HD dataset is extracted."),
) -> None:
    """Removes duplicate and leaked image files from the specified VITON-HD dataset directories.

    We computed phash of images using `imagehash` library https://github.com/JohannesBuchner/imagehash and detected same
    images. See `tryoffdiff/notebooks/vitonhd_duplicates.ipynb` for details.

    The filenames to delete are read from `configs/vitonhd_duplicate_filenames.json`, resolved relative to the current
    working directory. Each top-level key is a folder under `data_dir` and maps to an object with optional
    `duplicates` and `leaked` filename lists. Every listed filename is deleted from both the `cloth` and `image`
    subfolders of that folder; paths that are not existing regular files are logged and skipped.
    """

    # Explicit encoding so that parsing does not depend on the platform's locale default.
    with open("configs/vitonhd_duplicate_filenames.json", encoding="utf-8") as f:
        files_to_remove = json.load(f)

    # Iterate over each folder and remove files
    for folder, file_types in files_to_remove.items():
        # Both categories are optional: a folder may list only duplicates or only leaked files.
        listed = file_types.get("duplicates", []) + file_types.get("leaked", [])
        # Drop repeated names (order preserved) so a file present in both lists is not
        # reported as removed and then immediately reported as missing.
        for filename in dict.fromkeys(listed):
            for subfolder in ("cloth", "image"):
                file_path = os.path.join(data_dir, folder, subfolder, filename)
                if os.path.isfile(file_path):
                    os.remove(file_path)
                    logger.info(f"Removed: {file_path}")
                else:
                    logger.info(f"File not found, skipped: {file_path}")

if __name__ == "__main__":
    # Expose `clean_vitonhd` as a command-line program when this file is executed directly.
    # Typer derives the option name from the `data_dir` parameter as `--data-dir`.
    # Example: python datasets/cp_dataset_clean.py --data-dir DATA/VITON-HD
    typer.run(clean_vitonhd)