import time
from sklearn.model_selection import train_test_split
import numpy as np
from utils.metrics import get_metrics_preds
from utils.data import get_files_in_dir, build_train_label
from .vcbod import get_model
import pandas as pd


def run_experiment(dir: str, args):
    """Run the benchmark on the dataset files found in ``dir`` and export a CSV.

    Every dataset that passes the filter is evaluated without dependency
    scores (stored under tree depth ``0``). When ``args.dependency`` is set
    or ``args.max_depth > 1`` the tree-based results of ``run_trees`` are
    added as well. For each evaluated depth one ``"mean"`` row and one
    ``"std"`` row are written.

    Args:
        dir: Directory handed to ``get_files_in_dir`` to list dataset files.
        args: Experiment configuration. This function itself reads
            ``dataset``, ``dependency`` and ``max_depth``; the object is also
            forwarded to ``run_trees`` and ``run_without_dep``.

    Side effects:
        Prints progress to stdout and writes
        ``results/results_<dataset>.csv`` (or ``results/results_all_sets.csv``
        when ``args.dataset`` is ``None``), creating ``results/`` if needed.
    """
    import os  # local import: only needed to create the output directory

    files = get_files_in_dir(dir, None)
    columns = [
        "dataset",
        "measure_type",
        "feature_size",
        "dim",
        "tree_depth",
        "contamination",
        "roc_auc_score",
        "f1_ratio",
        "train_time",
        "inference_time",
    ]
    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; appending to a DataFrame row by row copies it on every call.
    rows = []
    print("Start experiment")

    for path in files:
        # Skip when a dataset filter is given and does not match, always skip
        # "kdd", and only run "fraud" / "mullcross" when requested explicitly.
        skip = (
            (args.dataset is not None and args.dataset not in path)
            or "kdd" in path
            or ("fraud" in path and args.dataset != "fraud")
            or ("mullcross" in path and args.dataset != "mullcross")
        )
        if skip:
            continue

        res = build_train_label(path)
        if res is None:
            print(f"Could not process {path}")
            continue
        data, label = res

        label = label.reshape(-1)
        print(f"Start processing {path}")

        # File name without directory and without anything after the first dot.
        name = path.split("/")[-1].split(".")[0]
        contamination = len(label[label == 1]) / len(label)
        dim = data.shape[1]  # feature dimension

        # Gather results from both tree based and non-tree based experiments.
        if args.dependency or args.max_depth > 1:
            try:
                all_results = run_trees(data, label, args)
                # Depth 0 is the key used for the run without dependency scores.
                all_results[0] = run_without_dep(data, label, args)
            except Exception as e:
                # Best effort: report the failure and go on with the next file.
                print(f"Could not process {path}")
                print(e)
                continue
        else:
            all_results = {0: run_without_dep(data, label, args)}

        for depth, (mean_metric, std_metric) in all_results.items():
            for measure_type, metric in (("mean", mean_metric), ("std", std_metric)):
                print(metric)
                rows.append(
                    {
                        "dataset": name,
                        "measure_type": measure_type,
                        "feature_size": len(label),
                        "dim": dim,
                        "tree_depth": depth,
                        "contamination": contamination,
                        "roc_auc_score": metric["roc_auc"],
                        "f1_ratio": metric["f1_ratio"],
                        "train_time": metric["train_time"],
                        "inference_time": metric["inference_time"],
                    }
                )

    # Passing ``columns`` keeps the header and its order even without any rows.
    df = pd.DataFrame(rows, columns=columns)

    # Export the DataFrame to a CSV file
    if args.dataset is not None:
        output_path = f"results/results_{args.dataset}.csv"
    else:
        output_path = "results/results_all_sets.csv"
    # Make sure the target directory exists so finished runs are not lost.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Results have been saved to {output_path}")


def run_trees(data: np.ndarray, label: np.ndarray, args):
    """Evaluate the dependency-based model at the maximum depth and at depth 1.

    Args:
        data: Sample matrix; ``data.shape[1]`` is used as an upper bound for
            the tree depth.
        label: Labels belonging to ``data``, forwarded to the evaluation.
        args: Configuration; this function reads ``max_depth``, ``discount``,
            ``discret_threshold`` and ``cores`` and forwards the object to
            ``apply_data_on_model_seeds``.

    Returns:
        Dict mapping each evaluated depth to a ``(mean, std)`` tuple as
        returned by ``apply_data_on_model_seeds``.
    """
    # The depth can never exceed the number of columns in ``data``.
    max_depth = min(args.max_depth, data.shape[1])

    results = {}
    # Only two depths are evaluated: the (capped) maximum and 1.
    # ``dict.fromkeys`` keeps the order but drops the duplicate when the
    # maximum is 1, so the same configuration is not trained twice.
    for depth in dict.fromkeys((max_depth, 1)):
        model = get_model(
            depth,
            dependence=True,
            discount=args.discount,
            desc_thresh=args.discret_threshold,
            cores=args.cores,
            use_dep_scores_only=True,
        )
        results[depth] = apply_data_on_model_seeds(data, label, model, args)
    return results


def run_without_dep(data: np.array, label: np.array, args):
    """Evaluate the model built with ``dependence=False`` over all seeds.

    The model is requested from ``get_model`` with depth ``1`` and
    ``use_dep_scores_only=False``; ``discount``, ``discret_threshold`` and
    ``cores`` are taken from ``args``.

    Returns:
        The ``(mean, std)`` pair produced by ``apply_data_on_model_seeds``.
    """
    model_options = {
        "dependence": False,
        "discount": args.discount,
        "desc_thresh": args.discret_threshold,
        "cores": args.cores,
        "use_dep_scores_only": False,
    }
    baseline_model = get_model(1, **model_options)
    mean_metrics, std_metrics = apply_data_on_model_seeds(
        data, label, baseline_model, args
    )
    return mean_metrics, std_metrics


def apply_data_on_model_seeds(data: np.ndarray, label: np.ndarray, model, args):
    """Evaluate ``model`` once per seed and aggregate the metrics.

    The seeds are ``0 .. args.amount_seeds - 1``. Each run goes through
    ``apply_data_on_model`` and is scored with ``get_metrics_preds``.

    Args:
        data: Sample matrix passed on to ``apply_data_on_model``.
        label: Labels belonging to ``data``.
        model: Model object passed on to ``apply_data_on_model``.
        args: Configuration; ``amount_seeds`` is read here, the object is
            also forwarded to ``apply_data_on_model``.

    Returns:
        Tuple ``(mean_scores, std_scores)``. Both are dicts holding every key
        returned by ``get_metrics_preds`` plus ``"train_time"`` and
        ``"inference_time"``, aggregated over the seeds with ``np.mean`` and
        ``np.std`` respectively.

    Raises:
        ValueError: If ``args.amount_seeds`` is smaller than 1, because there
            would be nothing to aggregate.
    """
    if args.amount_seeds < 1:
        raise ValueError(
            f"amount_seeds must be at least 1, got {args.amount_seeds}"
        )

    scores_seed = []
    train_time_seed = []
    inference_time_seed = []
    for seed in range(args.amount_seeds):
        # The loop index doubles as the random seed of the split.
        pred, new_label, train_time, inference_time = apply_data_on_model(
            data, label, model, seed=seed, args=args
        )
        metrics = get_metrics_preds(new_label, pred)
        print(f"Metrics for seed {seed}: {metrics}")
        scores_seed.append(metrics)
        train_time_seed.append(train_time)
        inference_time_seed.append(inference_time)

    # The metric names are taken from the first run.
    metric_names = list(scores_seed[0])
    mean_scores = {}
    std_scores = {}
    for key in metric_names:
        per_seed = [score[key] for score in scores_seed]
        mean_scores[key] = np.mean(per_seed)
        std_scores[key] = np.std(per_seed)

    mean_scores["train_time"] = np.mean(train_time_seed)
    mean_scores["inference_time"] = np.mean(inference_time_seed)
    std_scores["train_time"] = np.std(train_time_seed)
    std_scores["inference_time"] = np.std(inference_time_seed)

    return mean_scores, std_scores


def apply_data_on_model(
    data: np.ndarray, label: np.ndarray, model, seed: int = 42, args=None
):
    """Fit ``model`` on normal samples and score held-out normals plus outliers.

    Rows with ``label == 0`` are split into a training and a test part. The
    model is fitted on the training part only. The test set is the held-out
    part followed by all rows with ``label == 1``.

    Args:
        data: Sample matrix, indexed by the boolean masks built from ``label``.
        label: Array with ``0`` for normal samples and ``1`` for outliers.
        model: Object providing ``fit`` and ``predict_point``; ``fit`` is
            expected to return the fitted model.
        seed: Random state of the train/test split.
        args: Configuration providing ``train_size`` and ``test_size``. When
            ``None``, both are left to the defaults of ``train_test_split``.

    Returns:
        Tuple ``(scores, test_labels, train_time, inference_time)`` where
        ``scores`` is the result of ``model.predict_point`` on the test set,
        ``test_labels`` holds ``0`` for held-out normals and ``1`` for
        outliers, and both times are durations in seconds.
    """
    train_size = None if args is None else args.train_size
    test_size = None if args is None else args.test_size

    real, outlier = data[label == 0], data[label == 1]
    # Each size goes to the keyword of the same name; the two used to be
    # swapped, which trained on the fraction configured for testing.
    real_train, real_test = train_test_split(
        real,
        train_size=train_size,
        test_size=test_size,
        random_state=seed,
        shuffle=True,
    )

    # perf_counter is monotonic and meant for measuring short durations.
    start = time.perf_counter()
    model = model.fit(real_train)
    train_time = time.perf_counter() - start

    test = np.concatenate((real_test, outlier), axis=0)
    test_labels = np.concatenate(
        (np.zeros(len(real_test)), np.ones(len(outlier))), axis=0
    )

    start = time.perf_counter()
    scores = model.predict_point(test)
    inference_time = time.perf_counter() - start
    return scores, test_labels, train_time, inference_time
