


import time
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    matthews_corrcoef, f1_score, precision_score, recall_score,
    accuracy_score, roc_auc_score, average_precision_score,
)
from pyod.models.ae1svm import AE1SVM
from pyod.models.devnet import DevNet
from pyod.models.lunar import LUNAR
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.iforest import IForest
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.deep_svdd import DeepSVDD
from pyod.models.hbos import HBOS
from pyod.models.pca import PCA
from pyod.models.sod import SOD
from pyod.models.cof import COF
from pyod.models.loda import LODA
from pyod.models.alad import ALAD
from pyod.models.ecod import ECOD
from pyod.models.copod import COPOD
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.suod import SUOD
from pyod.models.anogan import AnoGAN
from pyod.models.vae import VAE
from pyod.models.dif import DIF
from pyod.models.so_gaal import SO_GAAL
from Model.KNFST import KNFST
from Model.PMKFN import PMKFN
from Model.DRLAD import DRLAD
from Model.NeuTraLAD import NeuTraLAD, CustomDataset
from Model.DRROC import DROCCTrainer
from torch.utils.data import DataLoader

# Directory holding this file; falls back to the current working directory
# when ``__file__`` is not defined (e.g. in an interactive session).
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path.cwd()

# Project root: the nearest directory, starting at BASE_DIR and walking
# upwards, that contains a "process_tabular" folder. If no ancestor has one,
# the filesystem root is used.
ROOT_DIR = next(
    (
        candidate
        for candidate in (BASE_DIR, *BASE_DIR.parents)
        if (candidate / "process_tabular").is_dir()
    ),
    Path(BASE_DIR.anchor),
)

# Maps the ``dataset_type`` argument of ``run_experiment`` to the folder that
# holds that family's ``<name>.npz`` files. The keys must match the strings
# passed by ``main`` ("CV" and "NLP").
DATASET_LINKS = {
    "NLP": ROOT_DIR / "Data" / "NLP_by_BERT",
    "CV": ROOT_DIR / "Data" / "CV_by_ResNet18",
}

# Names of the computer-vision datasets. ``main`` passes this list to
# ``run_experiment`` with dataset_type "CV", which loads each entry as
# ``<name>.npz`` from the matching DATASET_LINKS folder.
# NOTE(review): every line below starts with a bare comma, so the first entry
# of each line appears to have been lost (the literal is a syntax error as
# written). The missing names cannot be determined from this file -- restore
# them from the dataset folder or version control.
DATASET_NAME_CV = [
    , "CIFAR10_1", "CIFAR10_2", "CIFAR10_3", "CIFAR10_4",
    , "CIFAR10_6", "CIFAR10_7", "CIFAR10_8", "CIFAR10_9",
    , "FashionMNIST_1", "FashionMNIST_2", "FashionMNIST_3",
    , "FashionMNIST_5", "FashionMNIST_6", "FashionMNIST_7",
    , "FashionMNIST_9",
    , "MNIST-C_brightness", "MNIST-C_canny_edges",
    , "MNIST-C_fog", "MNIST-C_glass_blur",
    , "MNIST-C_motion_blur", "MNIST-C_rotate",
    , "MNIST-C_shear", "MNIST-C_shot_noise",
    , "MNIST-C_stripe", "MNIST-C_translate",
    ,
    , "MVTec-AD_cable", "MVTec-AD_capsule",
    , "MVTec-AD_grid", "MVTec-AD_hazelnut",
    , "MVTec-AD_metal_nut", "MVTec-AD_pill",
    , "MVTec-AD_tile", "MVTec-AD_toothbrush",
    , "MVTec-AD_wood", "MVTec-AD_zipper",
    , "SVHN_1", "SVHN_2", "SVHN_3", "SVHN_4",
    , "SVHN_6", "SVHN_7", "SVHN_8", "SVHN_9",
]

# Names of the NLP datasets. ``main`` passes this list to ``run_experiment``
# with dataset_type "NLP", which loads each entry as ``<name>.npz`` from the
# matching DATASET_LINKS folder.
# NOTE(review): every line below starts with a bare comma, so the first entry
# of each line appears to be missing (syntax error as written). The missing
# names cannot be determined from this file -- restore them before running.
DATASET_NAME_NLP = [
    , "20news_1", "20news_2",
    , "20news_4", "20news_5",
    , "agnews_1", "agnews_2", "agnews_3",
    , "imdb", "yelp",
]

# Names of the models to benchmark; ``run_experiment`` iterates this list for
# every dataset. "KNFST" and "PMKFN" are built by their own init helpers there,
# every other name is resolved through ``get_model``.
# NOTE(review): all entries are missing -- each line holds only a comma, which
# is a syntax error. Presumably there was one model name per line (28 lines);
# restore them so that they match the keys expected by ``get_model`` and the
# special cases in ``run_experiment``.
MODELS_LIST = [
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
    ,
]

# All results go to one CSV under <ROOT_DIR>/process_nlp_cv/processed_results.
# NOTE: the directory is created as a side effect of importing this module.
OUTPUT_DIR = ROOT_DIR / "process_nlp_cv" / "processed_results"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_FILE = OUTPUT_DIR / "Baseline_result_nlp_cv_10_datasets.csv"
# Header of the results CSV. ``run_experiment`` writes each row as
# [dataset name, model name] + evaluate_model(...) + [train_time, test_time],
# i.e. 11 values.
# NOTE(review): both lines of the literal start with a bare comma, so the first
# entry of each line appears to be missing (syntax error as written). Given the
# row layout these are presumably the dataset-name column and "Precision" --
# confirm the exact header text and restore it.
COLUMNS = [
    , "Model", "AUCROC", "AUCPR", "Accuracy", "MCC", "F1 Score",
    , "Recall", "Time Train", "Time Test",
]

# Write the header row only when the file does not exist yet; result rows are
# appended later without a header.
if not OUTPUT_FILE.exists():
    pd.DataFrame(columns=COLUMNS).to_csv(OUTPUT_FILE, index=False)


def preprocess_data(train_data: pd.DataFrame, test_data: pd.DataFrame):
    """Split labelled frames into arrays, keeping only label-0 rows for training.

    Both frames are expected to carry the label in their last column and the
    features in all preceding columns.

    Args:
        train_data: Training frame (features + label column).
        test_data: Test frame (features + label column).

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as NumPy arrays, where the
        training arrays contain only the rows whose label equals 0 and the
        test arrays contain every test row.
    """

    def split_features_and_labels(frame):
        # Last column is the label, everything before it is a feature.
        return frame.iloc[:, :-1].to_numpy(), frame.iloc[:, -1].to_numpy()

    print("..............................Data Overview................................")
    print("Train Data Shape:", train_data.shape)
    print("Test Data Shape:", test_data.shape)

    all_features, all_labels = split_features_and_labels(train_data)
    is_normal = all_labels == 0
    X_train, y_train = all_features[is_normal], all_labels[is_normal]

    print("Train Data Labels [0]:", np.unique(y_train))

    X_test, y_test = split_features_and_labels(test_data)

    sample_count, feature_count = X_train.shape
    print("Number of samples:", sample_count)
    print("Number of features:", feature_count)

    return X_train, y_train, X_test, y_test


def evaluate_model(y_true, y_pred, y_probabilities=None):
    """Compute, print and return the benchmark metrics for one model run.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted binary labels.
        y_probabilities: Optional scores for the AUC metrics. Either a 1-D
            array of positive-class scores or a 2-D array whose column 1
            holds the positive-class scores. When ``None`` both AUC values
            are reported as ``None``.

    Returns:
        ``[auc_roc, auc_pr, accuracy, mcc, f1, precision, recall]`` where the
        two AUC values and the accuracy are expressed in percent.
    """
    print("..............................Evaluation Metrics...............................")

    mcc = matthews_corrcoef(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)

    auc_roc_pct, auc_pr_pct = None, None
    if y_probabilities is not None:
        probs = np.asarray(y_probabilities)
        # A 1-D input already is the positive-class score vector.
        positive_scores = probs if probs.ndim == 1 else probs[:, 1]
        auc_roc_pct = roc_auc_score(y_true, positive_scores) * 100
        auc_pr_pct = average_precision_score(y_true, positive_scores) * 100

    # Compare against None explicitly: an AUC of exactly 0.0 is a valid
    # (if terrible) result and must not be reported as missing.
    print(f"AUCROC: {auc_roc_pct if auc_roc_pct is not None else 'N/A'}")
    print(f"AUCPR: {auc_pr_pct if auc_pr_pct is not None else 'N/A'}")
    print(f"Accuracy: {accuracy * 100:.2f}")
    print(f"MCC: {mcc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    return [
        auc_roc_pct,
        auc_pr_pct,
        accuracy * 100,
        mcc,
        f1,
        precision,
        recall,
    ]

def scores_to_probabilities(scores):
    """Min-max scale anomaly scores into a two-column pseudo-probability array.

    Args:
        scores: Array-like of raw scores; it is flattened to 1-D.

    Returns:
        Array of shape ``(n, 2)``. Column 1 is the score rescaled to
        ``[0, 1]`` and column 0 is its complement. When all scores are equal
        column 1 is all zeros. An empty input yields an empty ``(0, 2)``
        array instead of raising.
    """
    scores = np.asarray(scores, dtype=float).reshape(-1)
    if scores.size == 0:
        # min()/max() raise on empty arrays; there is nothing to scale anyway.
        return np.empty((0, 2), dtype=float)

    min_score = scores.min()
    max_score = scores.max()
    if max_score == min_score:
        # Constant scores carry no ranking information; avoid dividing by zero.
        norm_scores = np.zeros_like(scores)
    else:
        norm_scores = (scores - min_score) / (max_score - min_score)
    return np.stack([1 - norm_scores, norm_scores], axis=1)


def init_knfst_model():
    """Create the KNFST detector with the fixed settings used by this script."""
    settings = {
        "gamma": "scale",
        "proxy_strategy": "max_corner",
        "proxy_margin": 0.1,
        "max_train_size": 3000,
        "random_state": 42,
    }
    return KNFST(**settings)


def make_pmkfn_views(X_train, X_test, n_views=3):
    """Derive several views of the data through fixed random linear projections.

    Each view multiplies the inputs by its own Gaussian matrix of shape
    ``(n_features, max(2, n_features // 2))``. The generator is seeded with
    42, so the projections are identical on every call, and the same matrix
    is applied to the train and test data of a view.

    Args:
        X_train: Training matrix of shape ``(n_samples, n_features)``.
        X_test: Test matrix with the same number of features.
        n_views: Number of projected views to create.

    Returns:
        ``(train_views, test_views)``: two lists of length ``n_views``.
    """
    generator = np.random.RandomState(42)
    n_features = X_train.shape[1]
    view_width = max(2, n_features // 2)

    train_views, test_views = [], []
    for _ in range(n_views):
        # One draw per view, in view order, keeps the random stream stable.
        weights = generator.randn(n_features, view_width)
        train_views.append(X_train @ weights)
        test_views.append(X_test @ weights)

    return train_views, test_views


def init_pmkfn_model(n_views=3):
    """Create a PMKFN model configured with one "scale" gamma per view.

    Args:
        n_views: Number of views the model will be fitted on.
    """
    per_view_gamma = ["scale" for _ in range(n_views)]
    return PMKFN(p=2.0, gamma=per_view_gamma, delta="auto")


def get_model(name, **kwargs):
    """Instantiate the detector registered under ``name``.

    The name is looked up in a local name-to-class table. "NeuTraLAD",
    "DROCC" and "DRLAD" are built with an explicit set of constructor
    arguments, each read from ``kwargs`` with a fallback default, plus the
    selected torch device. "DeepSVDD" receives only ``n_features`` (taken
    from ``kwargs["in_features"]``, default 10); any other keyword argument
    is ignored for it. Every other model is constructed as
    ``model_class(**kwargs)``.

    Args:
        name: Model name to look up.
        **kwargs: Hyper-parameters; which ones are used depends on the model
            (see above).

    Returns:
        A new, unfitted model instance.

    Raises:
        ValueError: If ``name`` is not present in the lookup table.
    """
    # Use the GPU when torch reports one, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # NOTE(review): every entry below has lost its key (": Class" is a syntax
    # error). The keys are presumably the model-name strings used in
    # MODELS_LIST; the branches further down show that "NeuTraLAD", "DROCC",
    # "DRLAD" and "DeepSVDD" must be among them -- restore all keys.
    model_dict = {
        : CBLOF,
        : KNN,
        : IForest,
        : OCSVM,
        : LOF,
        : DeepSVDD,
        : HBOS,
        : PCA,
        : SOD,
        : COF,
        : LODA,
        : ECOD,
        : COPOD,
        : AutoEncoder,
        : DevNet,
        : LUNAR,
        : AE1SVM,
        : ALAD,
        : SUOD,
        : AnoGAN,
        : VAE,
        : DIF,
        : SO_GAAL,
        : DRLAD,
        : NeuTraLAD,
        : DROCCTrainer,
    }

    model_class = model_dict.get(name)
    if model_class is None:
        raise ValueError(f"Model {name} not found.")

    if name == "NeuTraLAD":
        return model_class(
            in_features=kwargs.get("in_features", 10),
            fc_1_out=kwargs.get("fc_1_out", 128),
            fc_last_out=kwargs.get("fc_last_out", 32),
            compression_unit=kwargs.get("compression_unit", 16),
            n_transforms=kwargs.get("n_transforms", 4),
            n_layers=kwargs.get("n_layers", 3),
            trans_type=kwargs.get("trans_type", "mlp"),
            temperature=kwargs.get("temperature", 0.07),
            device=device,
        )
    if name == "DROCC":
        # The caller-facing "in_features" key feeds the trainer's ``in_dim``.
        return model_class(
            in_dim=kwargs.get("in_features", 10),
            hid_dim=kwargs.get("hid_dim", 128),
            lamda=kwargs.get("lamda", 1.0),
            radius=kwargs.get("radius", 0.2),
            gamma=kwargs.get("gamma", 2.0),
            lr=kwargs.get("lr", 0.001),
            batch_size=kwargs.get("batch_size", 128),
            num_epochs=kwargs.get("num_epochs", 50),
            device=device,
        )
    if name == "DRLAD":
        # "num_epochs" and "lr" are mapped onto DRLAD's ``epochs`` and
        # ``learning_rate`` arguments.
        return model_class(
            in_features=kwargs.get("in_features", 10),
            hidden_dim=kwargs.get("hidden_dim", 128),
            en_nlayers=kwargs.get("en_nlayers", 3),
            de_nlayers=kwargs.get("de_nlayers", 3),
            basis_vector_num=kwargs.get("basis_vector_num", 5),
            diversity=kwargs.get("diversity", True),
            plearn=kwargs.get("plearn", False),
            input_info=kwargs.get("input_info", True),
            input_info_ratio=kwargs.get("input_info_ratio", 0.1),
            cl=kwargs.get("cl", True),
            cl_ratio=kwargs.get("cl_ratio", 0.06),
            epochs=kwargs.get("num_epochs", 200),
            learning_rate=kwargs.get("lr", 0.05),
            sche_gamma=kwargs.get("sche_gamma", 0.98),
            batch_size=kwargs.get("batch_size", 512),
            weight_decay=kwargs.get("weight_decay", 1e-5),
            device=device,
        )
    if name == "DeepSVDD":
        # Only the feature count is forwarded; other kwargs are dropped here.
        return model_class(n_features=kwargs.get("in_features", 10))

    # Generic path: pass the caller's keyword arguments straight through.
    return model_class(**kwargs)


def _load_split(dataset_path):
    """Load one ``.npz`` dataset and return ``(X_train, y_train, X_test, y_test)``.

    Datasets with more than 10000 rows are subsampled to 10000 rows before
    the 70/30 train/test split; both splits use ``random_state=42``.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code -- only use trusted dataset files.
    # The context manager closes the archive once both arrays are in memory.
    with np.load(dataset_path, allow_pickle=True) as data:
        X, y = pd.DataFrame(data["X"]), pd.DataFrame(data["y"])

    if len(y) > 10000:
        print("Reducing data size to 10000")
        _, X, _, y = train_test_split(X, y, test_size=10000, random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    train_data = pd.concat([X_train, y_train], axis=1, ignore_index=True)
    test_data = pd.concat([X_test, y_test], axis=1, ignore_index=True)
    return preprocess_data(train_data, test_data)


def _fit_model(model_name, X_train, y_train, X_test):
    """Build and fit ``model_name``; return ``(model, X_test_input)``.

    ``X_test_input`` is ``X_test`` in the form the fitted model expects at
    prediction time (a list of projected views for "PMKFN", the unchanged
    matrix for every other model).
    """
    n_features = X_train.shape[1]

    if model_name == "KNFST":
        model = init_knfst_model()
        model.fit(X_train)
        return model, X_test

    if model_name == "PMKFN":
        X_train_views, X_test_views = make_pmkfn_views(X_train, X_test)
        model = init_pmkfn_model(n_views=len(X_train_views))
        model.fit(X_train_views)
        return model, X_test_views

    if model_name == "DRLAD":
        model = get_model(
            model_name,
            in_features=n_features,
            hidden_dim=128,
            batch_size=512,
            num_epochs=200,
            lr=0.05,
        )
        model.fit(X_train, y_train)
        return model, X_test

    if model_name in ("NeuTraLAD", "DROCC"):
        model = get_model(
            model_name,
            in_features=n_features,
            batch_size=128,
            num_epochs=50,
            lr=0.001,
        )
        model.fit(X_train, y_train)
        return model, X_test

    if model_name == "DeepSVDD":
        model = get_model(model_name, in_features=n_features)
        model.fit(X_train)
        return model, X_test

    model = get_model(model_name)
    if model_name == "DevNet":
        # DevNet is the only generic model that is fitted with labels.
        model.fit(X_train, y_train)
    else:
        model.fit(X_train)
    return model, X_test


def _predict(model_name, model, X_test_input):
    """Return ``(y_pred, y_probabilities)`` for a fitted model.

    ``y_pred`` is a 1-D array of 0/1 labels. ``y_probabilities`` is an
    ``(n, 2)`` array, or ``None`` when the model exposes neither
    ``predict_proba`` nor ``decision_function`` and already predicts 0/1.
    """
    if model_name == "DRLAD":
        # DRLAD is scored through its decision function and thresholded at 0.5.
        scores = model.decision_function(X_test_input)
        y_probabilities = scores_to_probabilities(scores)
        return (y_probabilities[:, 1] >= 0.5).astype(int), y_probabilities

    y_probabilities = None
    y_pred = np.asarray(model.predict(X_test_input)).reshape(-1)
    if hasattr(model, "predict_proba"):
        y_probabilities = np.asarray(model.predict_proba(X_test_input))
        if y_probabilities.ndim == 1:
            y_probabilities = scores_to_probabilities(y_probabilities)
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_test_input)
        y_probabilities = scores_to_probabilities(scores)

    # predict() returned raw scores rather than labels: derive the labels
    # from the probabilities (built from those scores if nothing better exists).
    if not np.isin(y_pred, [0, 1]).all():
        if y_probabilities is None:
            y_probabilities = scores_to_probabilities(y_pred)
        y_pred = (y_probabilities[:, 1] >= 0.5).astype(int)
    return y_pred, y_probabilities


def run_experiment(dataset_names, dataset_type):
    """Benchmark every model in ``MODELS_LIST`` on each of the given datasets.

    Each dataset is loaded and split once, then every model is fitted on the
    training part, evaluated on the test part, and one row with the metrics
    and the train/test durations is appended to ``OUTPUT_FILE``. Failures
    are printed and do not stop the remaining datasets or models.

    Args:
        dataset_names: Iterable of dataset names (file stems of ``.npz`` files).
        dataset_type: Key into ``DATASET_LINKS`` selecting the dataset folder.
    """
    for name in dataset_names:
        # Loading and splitting does not depend on the model, so do it once
        # per dataset instead of once per (dataset, model) pair.
        try:
            dataset_path = DATASET_LINKS[dataset_type] / f"{name}.npz"
            if not dataset_path.exists():
                print(f"Dataset {name} not found, skipping.")
                continue
            X_train, y_train, X_test, y_test = _load_split(dataset_path)
        except Exception as exc:
            print(f"Error with dataset {name}: {exc}")
            continue

        for model_name in MODELS_LIST:
            try:
                print(f"\nRunning dataset {name} with model {model_name}")

                # Hand each model private copies so that an estimator which
                # modifies its inputs in place cannot affect later models.
                X_fit, y_fit, X_eval = X_train.copy(), y_train.copy(), X_test.copy()

                start_time = time.time()
                model, X_test_input = _fit_model(model_name, X_fit, y_fit, X_eval)
                train_time = time.time() - start_time

                start_time = time.time()
                y_pred, y_probabilities = _predict(model_name, model, X_test_input)
                test_time = time.time() - start_time

                metrics = evaluate_model(y_test, y_pred, y_probabilities=y_probabilities)
                result = [name, model_name] + metrics + [train_time, test_time]
                pd.DataFrame([result], columns=COLUMNS).to_csv(
                    OUTPUT_FILE, mode="a", header=False, index=False
                )

                print(f"Results saved for {name} with model {model_name}")

            except Exception as exc:
                # Best-effort benchmark: report the failure and keep going.
                print(f"Error with dataset {name}, model {model_name}: {exc}")


def main():
    """Run the benchmark on the CV datasets first, then on the NLP datasets."""
    suites = (
        (DATASET_NAME_CV, "CV"),
        (DATASET_NAME_NLP, "NLP"),
    )
    for dataset_names, dataset_type in suites:
        run_experiment(dataset_names, dataset_type)


if __name__ == "__main__":
    main()
