import glob
import json
import os
from pathlib import Path
import re
import shutil
import sys
import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../.")))
from utils.file_utils import contains_file, count_files, remove_empty_dirs, is_experiments_dir


def should_delete_dir(
    dir_path: str,
    min_traces: int = 3,
    trace_starts_with: str = None,
    trace_ends_with: str = None,
    contain_files: list[str] = None,
    contains_any: bool = True,
):
    """Decide whether `dir_path` qualifies for deletion.

    The checks run in this order:

    1. A path that is not an existing directory is never deleted (a message is printed).
    2. If a trace filter is given (`trace_starts_with` and/or `trace_ends_with`) and
       `count_files` reports fewer than `min_traces` matching files, delete.
    3. If `contain_files` is empty or None, keep.
    4. Otherwise delete when `contains_file` finds none of the names (`contains_any=True`)
       or misses at least one of them (`contains_any=False`).

    Args:
        dir_path (str): Directory to evaluate.
        min_traces (int): Minimum number of trace files required. Only used when a trace filter is given.
        trace_starts_with (str, optional): Prefix forwarded to `count_files` as `starts_with`.
        trace_ends_with (str, optional): Suffix forwarded to `count_files` as `ends_with`.
        contain_files (list[str], optional): File names to look for. A single string is also accepted.
        contains_any (bool): If True, one present file is enough to keep the directory.
                             If False, every file must be present to keep it.

    Returns:
        bool: True if the directory should be deleted, False otherwise.
    """
    if not os.path.isdir(dir_path):
        print(f"Directory {dir_path} does not exist. Returning False.")
        return False

    # The trace-count criterion only applies when the caller says what a trace looks like
    filters_by_trace = trace_starts_with or trace_ends_with
    if filters_by_trace:
        found = count_files(dir_path, starts_with=trace_starts_with, ends_with=trace_ends_with)
        if found < min_traces:
            return True

    # Nothing to look for: the directory is kept
    if not contain_files:
        return False

    # A bare string is treated as a one-element list
    wanted = [contain_files] if isinstance(contain_files, str) else contain_files

    # `any` -> keep if at least one file is found; `all` -> keep only if every file is found
    combine = any if contains_any else all
    return not combine(contains_file(dir_path, name, num_files=1) for name in wanted)


def delete_dirs(
    base_path,
    min_traces,
    exclude_dirs: set = None,
    trace_starts_with=None,
    trace_ends_with=None,
    contain_files: list = None,
    contains_any: bool = True,
    exper_identifier_files: list[str] = None,
):
    """Delete all nested directories in base_path not containing a minimum number of traces or not containing files in `contain_files` / `contains_any`.
    Obs.: includes not only the immediate subdirs, but also the subdirs of the subdirs, etc.

    Only directories recognised by `is_experiments_dir` are evaluated; the decision itself is
    delegated to `should_delete_dir`.

    Args:
        base_path (str): Path to the base directory containing the subdirs to delete.
        min_traces (int): Minimum number of traces required in a directory.
        exclude_dirs (set[str], optional): Names of subdirs to not consider for deletion. Each entry is
                                        matched as a substring of the full path of an immediate subdir
                                        of `base_path` (so it can also match part of `base_path` itself).
                                        Defaults to None (nothing excluded).
        trace_starts_with (str, optional): Start of the trace file name. Defaults to None.
        trace_ends_with (str, optional): End of the trace file name. Defaults to None.
        contain_files (list, optional): List of files to look for in the directory to use as criteria for deletion.
        contains_any (bool, optional): If True and any of the files in `contain_files` present, don't delete.
                                        If False and any of the files in `contain_files` not present, delete.
        exper_identifier_files (list[str], optional): File names forwarded to `is_experiments_dir` as
                                        `identifier_files`. Defaults to ["args.json", "log_files.txt"].
    """
    # None sentinels instead of mutable default arguments
    if exclude_dirs is None:
        exclude_dirs = ()
    if exper_identifier_files is None:
        exper_identifier_files = ["args.json", "log_files.txt"]

    # iterate over immediate subdirs
    for topmost_dir in Path(base_path).iterdir():  # base_path = "./results"
        # Substring match against the whole path string, not just the directory name
        if any(d in str(topmost_dir) for d in exclude_dirs):
            continue

        if not topmost_dir.is_dir():
            continue

        # Bottom-up walk: children are visited before their parents, so removing `root`
        # never removes a directory that is still waiting to be visited.
        for root, _dirs, _files in os.walk(topmost_dir, topdown=False):
            if not is_experiments_dir(root, identifier_files=exper_identifier_files):
                continue

            if should_delete_dir(
                dir_path=root,
                min_traces=min_traces,
                trace_starts_with=trace_starts_with,
                trace_ends_with=trace_ends_with,
                contain_files=contain_files,
                contains_any=contains_any,
            ):
                print(f"Deleting {root}.")
                shutil.rmtree(root)

        # Remove any directories with no files (including if only empty subdirectories).
        # `topmost_dir` already includes `base_path` (it comes from `Path(base_path).iterdir()`),
        # so it must not be joined with `base_path` again: for a relative base such as
        # "./results" that would produce "./results/results/<name>".
        remove_empty_dirs(str(topmost_dir))


def delete_htmls(results_dir, save_dir=None, exclude_dirs: set = None):
    """
    Delete HTML files that do not have a corresponding task in the 'summary_data.csv' or
    'task_scores_per_round.json' file of their experiment directory.

    Every directory under `results_dir` that `is_experiments_dir` recognises (identifier file
    "args.json") and that holds at least one of the two files above is processed. Inside it,
    recursively, each `*<task_id>.html` file whose trailing number is not among the known task ids
    is removed. HTML files whose name does not end in digits are left untouched.

    Args:
        results_dir (str): Root directory to walk.
        save_dir (optional): Currently unused; kept so existing callers keep working.
        exclude_dirs (set[str], optional): Directory paths to skip. Compared for exact equality with
            the paths produced by `os.walk`. Defaults to None (nothing excluded).
    """
    # None sentinel instead of a mutable default argument
    if exclude_dirs is None:
        exclude_dirs = ()

    # Trailing digits right before the ".html" extension are the task id
    task_id_pattern = re.compile(r"(\d+)\.html$")

    # 1. Iterate through each directory in results_dir
    for root, _dirs, _files in os.walk(results_dir, topdown=False):
        if not is_experiments_dir(root, identifier_files=["args.json"]):
            continue

        if root in exclude_dirs:
            continue

        # Files that inform about the tasks whose execution did not fail
        csv_path = os.path.join(root, "summary_data.csv")
        json_path = os.path.join(root, "task_scores_per_round.json")
        has_csv = os.path.exists(csv_path)
        has_json = os.path.exists(json_path)

        # Without either file there is no way to tell which tasks ran; delete nothing
        if not has_csv and not has_json:
            continue

        # Get a list of tasks whose execution did not fail from the `csv` file
        executed_tasks = set()
        if has_csv:
            df = pd.read_csv(csv_path)
            executed_tasks.update(df["task_id"].astype(str))

        # If a `task_scores_per_round.json` file exists, include the tasks there as non-failed
        if has_json:
            with open(json_path, "r") as f:
                data = json.load(f)
            # Top-level keys of the JSON object are used as task ids
            executed_tasks.update(str(task_id) for task_id in data)

        # Get all HTML files in the current directory and subdirectories.
        # `root` is escaped so that glob metacharacters in directory names are taken literally.
        html_files = glob.glob(os.path.join(glob.escape(root), "**/*.html"), recursive=True)

        # Delete all *<task_id>.html files not in the list of tasks executed
        for html_file in html_files:
            match = task_id_pattern.search(html_file)
            if match is None:
                # No task id in the name: not a per-task file, so it is not ours to delete
                continue
            if match.group(1) not in executed_tasks:
                os.remove(html_file)
                print(f"Deleted: {html_file}")
    print(f"Cleaning of HTML files of {results_dir} completed.")


def remove_empty_files(dir_path: str):
    """Delete every zero-byte file found under `dir_path`, at any depth.

    Directories are never removed, and files with content are left alone.

    Args:
        dir_path (str): Root directory to scan recursively.
    """
    for entry in Path(dir_path).rglob("*"):
        # Skip directories and anything else that is not a regular file
        if not entry.is_file():
            continue
        # Files with content are kept
        if entry.stat().st_size != 0:
            continue
        entry.unlink()


# Direct usage example
if __name__ == "__main__":
    # Root of the tree to clean. Swap in the prompt below to choose it interactively.
    # input_dir = input("Enter the dir containing the subdirs to delete:")
    input_dir = "../experiments"

    # Disabled example: prune experiment dirs with fewer than 2 "render*html" traces
    # or without any of the two summary files.
    # NOTE(review): delete_dirs matches `exclude_dirs` entries as substrings of the full
    # subdir path, so "experiments" would exclude every subdir of "../experiments" —
    # confirm before re-enabling.
    # delete_dirs(
    #     base_path=input_dir,
    #     min_traces=2,
    #     exclude_dirs={"debug", "prompt_tuning_tests", "zOld", "experiments"},
    #     trace_starts_with="render",
    #     trace_ends_with="html",
    #     contain_files=["summary_data.csv", "task_scores_per_round.json"],
    #     contains_any=True,
    # )

    # Empty files go first, then empty directories — presumably so that directories
    # left holding nothing after the first pass can be removed by the second.
    for cleanup in (remove_empty_files, remove_empty_dirs):
        cleanup(input_dir)
