


import pandas as pd
import numpy as np
import time
import os
from pathlib import Path
from Model.DVM_AD import DVM_AD
from sklearn.metrics import (
    roc_auc_score, precision_score, average_precision_score, recall_score,
    f1_score, accuracy_score, matthews_corrcoef)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    StandardScaler, QuantileTransformer, RobustScaler,
    MinMaxScaler, MaxAbsScaler, Normalizer)
from scipy.spatial.distance import cdist

# Directory holding this script; falls back to the current working directory
# when __file__ is not defined (e.g. in an interactive session).
BASE_DIR = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()
# Project root: the nearest directory, starting at BASE_DIR and walking up
# towards the filesystem root, that contains a "process_tabular" folder.
# If no ancestor has that folder, stay at BASE_DIR rather than silently
# resolving to the filesystem root.
ROOT_DIR = next(
    (
        candidate
        for candidate in (BASE_DIR, *BASE_DIR.parents)
        if (candidate / "process_tabular").is_dir()
    ),
    BASE_DIR,
)

def Get_Scaler(name):
    """Return a new, unfitted scaler for the configuration called *name*.

    Recognised names are "StandardScaler", "MinMaxScaler", "RobustScaler",
    "Normalizer", "QuantileTransformer" (normal output distribution,
    random_state=42) and "MaxAbsScaler". Any other value yields ``None``.
    """
    # Factories are wrapped in lambdas so that only the requested scaler is
    # ever instantiated.
    factories = (
        ("StandardScaler", lambda: StandardScaler()),
        ("MinMaxScaler", lambda: MinMaxScaler()),
        ("RobustScaler", lambda: RobustScaler()),
        ("Normalizer", lambda: Normalizer()),
        ("QuantileTransformer",
         lambda: QuantileTransformer(output_distribution="normal", random_state=42)),
        ("MaxAbsScaler", lambda: MaxAbsScaler()),
    )
    for known_name, build in factories:
        if name == known_name:
            return build()
    return None




def distance_vector(point_X, point_Y):
    """Return the pairwise Euclidean distances between rows of two 2-D arrays.

    Entry ``[i, j]`` of the result is the distance between ``point_Y[i]`` and
    ``point_X[j]``, so the output has one row per row of *point_Y* and one
    column per row of *point_X*.
    """
    # Expand ||y - x||^2 as ||y||^2 + ||x||^2 - 2 * <y, x>.
    sq_len_x = np.square(point_X).sum(axis=1)
    sq_len_y = np.square(point_Y).sum(axis=1)
    cross_terms = np.dot(point_Y, point_X.T)
    squared = np.add.outer(sq_len_y, sq_len_x) - 2 * cross_terms
    # Round-off can push the expansion slightly below zero; the absolute value
    # keeps the square root real.
    return np.sqrt(np.abs(squared))

def preprocess_data_OC(train_data, test_data):
    """Split two label-last frames into arrays for one-class training.

    Both frames carry the features in every column but the last and the label
    in the last column. Only training rows whose label equals 0 are kept; the
    test frame is returned in full.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as NumPy arrays.
    """
    def _features_and_labels(frame):
        return frame.iloc[:, :-1].to_numpy(), frame.iloc[:, -1].to_numpy()

    all_train_X, all_train_y = _features_and_labels(train_data)
    keep = all_train_y == 0  # rows labelled 0 are the only ones used for fitting
    X_test, y_test = _features_and_labels(test_data)
    return all_train_X[keep], all_train_y[keep], X_test, y_test

def Model_evaluating(y_true, y_predict, y_scores):
    """Compute the classification metrics reported for one run.

    Args:
        y_true: ground-truth labels.
        y_predict: hard label predictions.
        y_scores: two-column score array; column 1 is used as the score of the
            positive class for the ranking metrics.

    Returns:
        ``[AUCROC, AUCPR, accuracy, MCC, F1, precision, recall]`` where the
        first three values are scaled to percentages.
    """
    # Metrics that only need the hard predictions, evaluated in a fixed order.
    mcc_value, f1_value, precision_value, recall_value, accuracy_value = (
        metric(y_true, y_predict)
        for metric in (
            matthews_corrcoef, f1_score, precision_score, recall_score, accuracy_score
        )
    )
    # Ranking metrics use the positive-class score column.
    positive_scores = y_scores[:, 1]
    auc_roc = roc_auc_score(y_true, positive_scores)
    auc_pr = average_precision_score(y_true, positive_scores)
    return [
        auc_roc * 100,
        auc_pr * 100,
        accuracy_value * 100,
        mcc_value,
        f1_value,
        precision_value,
        recall_value,
    ]

def load_and_process_dataset(name, domain):
    """Load ``<name>.npz`` from the directory registered for *domain*.

    The directory is looked up in the module-level ``dataset_links`` mapping.
    Any failure (unknown domain, missing or unreadable file, ...) is reported
    on stdout and turned into a ``None`` return value.

    Returns:
        Whatever ``np.load`` returns for the archive, or ``None`` on error.
    """
    try:
        archive_path = os.path.join(dataset_links[domain], f"{name}.npz")
        # NOTE(review): allow_pickle=True permits unpickling arbitrary objects
        # stored in the archive; only use with trusted dataset files.
        archive = np.load(archive_path, allow_pickle=True)
    except Exception as err:
        print(f"❌ Error loading {domain} dataset {name}: {err}")
        return None
    return archive





# Names of the CV datasets; each one is loaded as "<name>.npz".
# NOTE(review): the first entry of every row of this literal had been lost
# (each row started with a bare comma, which does not parse). The missing
# names were reconstructed from the index pattern of the surviving entries and
# from the ADBench file naming. The numbered families are unambiguous; the
# exact position of "MNIST-C_identity" is a best guess. Confirm against the
# files actually present in the data directory.
dataset_name_cv = [
    "CIFAR10_0", "CIFAR10_1", "CIFAR10_2", "CIFAR10_3", "CIFAR10_4",
    "CIFAR10_5", "CIFAR10_6", "CIFAR10_7", "CIFAR10_8", "CIFAR10_9",

    "FashionMNIST_0", "FashionMNIST_1", "FashionMNIST_2", "FashionMNIST_3",
    "FashionMNIST_4", "FashionMNIST_5", "FashionMNIST_6", "FashionMNIST_7",
    "FashionMNIST_8", "FashionMNIST_9",

    "MNIST-C_identity", "MNIST-C_brightness", "MNIST-C_canny_edges",
    "MNIST-C_dotted_line", "MNIST-C_fog", "MNIST-C_glass_blur",
    "MNIST-C_impulse_noise", "MNIST-C_motion_blur", "MNIST-C_rotate",
    "MNIST-C_scale", "MNIST-C_shear", "MNIST-C_shot_noise",
    "MNIST-C_spatter", "MNIST-C_stripe", "MNIST-C_translate",
    "MNIST-C_zigzag",

    "MVTec-AD_bottle", "MVTec-AD_cable", "MVTec-AD_capsule",
    "MVTec-AD_carpet", "MVTec-AD_grid", "MVTec-AD_hazelnut",
    "MVTec-AD_leather", "MVTec-AD_metal_nut", "MVTec-AD_pill",
    "MVTec-AD_screw", "MVTec-AD_tile", "MVTec-AD_toothbrush",
    "MVTec-AD_transistor", "MVTec-AD_wood", "MVTec-AD_zipper",

    "SVHN_0", "SVHN_1", "SVHN_2", "SVHN_3", "SVHN_4",
    "SVHN_5", "SVHN_6", "SVHN_7", "SVHN_8", "SVHN_9",
]

# Names of the NLP datasets; each one is loaded as "<name>.npz".
# NOTE(review): the first entry of every row of this literal had been lost
# (each row started with a bare comma, which does not parse). "20news_0",
# "20news_3" and "agnews_0" follow from the numbering of the surviving
# entries; "amazon" is inferred from the ADBench NLP file names. Confirm
# against the files actually present in the data directory.
dataset_name_nlp = [
    "20news_0", "20news_1", "20news_2",
    "20news_3", "20news_4", "20news_5",

    "agnews_0", "agnews_1", "agnews_2", "agnews_3",

    "amazon", "imdb", "yelp",
]

# Directory that holds the .npz files of each domain. The keys are the domain
# labels passed to load_and_process_dataset ('CV' and 'NLP' in the main loop).
# The keys were missing from the literal, which made it unparseable.
dataset_links = {
    'NLP': str(ROOT_DIR / 'Data' / 'NLP_by_BERT'),
    'CV': str(ROOT_DIR / 'Data' / 'CV_by_ResNet18'),
}




# Output location: one CSV that receives a row per (dataset, scaler) run.
result_dir = ROOT_DIR / "process_nlp_cv" / "raw_data"
# parents=True so a missing "process_nlp_cv" folder does not abort the script.
result_dir.mkdir(parents=True, exist_ok=True)
performance_file = result_dir / "dvmad_result_realworld_data.csv"
# Header of the CSV. Each appended row is
# [dataset name, scaler name] + 7 metrics + [train time, test time], i.e. 11
# fields, with the metrics ordered as Model_evaluating returns them.
# NOTE(review): the first label and the label between "F1 score" and
# "TPR (Recall)" were missing from the literal; "Dataset" and
# "PPV (Precision)" are reconstructions, so confirm the wording against any
# existing result files.
performance_columns = [
    "Dataset", "Scaler", "AUCROC", "AUCPR", "Accuracy", "MCC", "F1 score",
    "PPV (Precision)", "TPR (Recall)", "Time train", "Time test"]

# Write the header only once; later runs append rows without a header.
if not performance_file.exists():
    pd.DataFrame(columns=performance_columns).to_csv(performance_file, index=False)




# Scaler configurations evaluated for every dataset, in this order.
# "None" matches no branch in Get_Scaler, which then returns None, and the
# main loop uses the features unscaled in that case.
__SCALER = ["None", "QuantileTransformer", "StandardScaler", "Normalizer", "RobustScaler", "MinMaxScaler", "MaxAbsScaler"]







def _evaluate_scaler(name, scaler_name, X_train, X_test, y_train, y_test):
    """Fit and evaluate DVM_AD on one train/test split under one scaler.

    Args:
        name: dataset name, copied into the result row.
        scaler_name: configuration name understood by Get_Scaler.
        X_train, X_test: feature arrays of the two splits.
        y_train, y_test: label arrays of the two splits.

    Returns:
        ``[name, scaler_name] + metrics + [train_seconds, test_seconds]``,
        where ``metrics`` is the list returned by Model_evaluating.
    """
    scaler = Get_Scaler(scaler_name)
    if scaler is None:
        # No scaler is registered under this name (e.g. "None"): raw features.
        X_train_scaled, X_test_scaled = X_train, X_test
    else:
        # The scaler is fitted on the training split only.
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)

    # preprocess_data_OC expects frames whose last column is the label.
    Train_data = pd.DataFrame(np.hstack([X_train_scaled, y_train.reshape(-1, 1)]))
    Test_data = pd.DataFrame(np.hstack([X_test_scaled, y_test.reshape(-1, 1)]))
    X_train_proc, y_train_proc, X_test_proc, y_test_proc = preprocess_data_OC(Train_data, Test_data)

    # Only model construction and fitting count towards "Time train";
    # prediction belongs to "Time test".
    train_start = time.perf_counter()
    dvm_ad = DVM_AD()
    dvm_ad.fit(X_train_proc, y_train_proc)
    train_end = time.perf_counter()

    scores = dvm_ad.predict(X_test_proc)

    # Calibration: distance from every transformed training point to its
    # nearest other training point. The diagonal (distance to itself) is
    # overwritten with a large value so it is never picked as the minimum.
    # NOTE(review): presumably dvm_ad.transform returns one embedding row per
    # sample and predict returns distances on the same scale - confirm.
    y_train_score = dvm_ad.transform(X_train_proc)
    train_score_tmp = distance_vector(y_train_score, y_train_score)
    np.fill_diagonal(train_score_tmp, 1e9)
    train_score = np.amin(train_score_tmp, axis=1)

    # Normalise test scores by the largest training nearest-neighbour distance.
    # A non-positive (or NaN) normaliser would give inf/NaN scores, so fall
    # back to machine epsilon in that degenerate case.
    scale = np.max(train_score)
    if not scale > 0:
        scale = np.finfo(float).eps

    y_proba = np.zeros((len(scores), 2))
    y_proba[:, 1] = np.minimum(scores / scale, 1)
    y_proba[:, 0] = 1 - y_proba[:, 1]
    y_predict = (y_proba[:, 1] > 0.5).astype(int)

    v = Model_evaluating(y_test_proc, y_predict, y_proba)
    test_end = time.perf_counter()

    return [name, scaler_name] + v + [train_end - train_start, test_end - train_end]


if __name__ == "__main__":

    for dataset_array, domain in zip([dataset_name_cv, dataset_name_nlp], ['CV', 'NLP']):
        for name in dataset_array:
            data = load_and_process_dataset(name, domain)
            if data is None:
                continue
            try:
                # Pull the arrays into memory, then release the archive handle.
                with data:
                    X, y = data['X'], data['y']
                print(f"\n>>> Dataset: {name} | Size: {len(y)}")
                if len(y) > 10000:
                    # Keep a fixed-seed random subset of 10000 samples.
                    _, X, _, y = train_test_split(X, y, test_size=10000, random_state=42)
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
            except Exception as e:
                print(f"❌ Error processing dataset {name}: {e}")
                continue

            for scaler_name in __SCALER:
                # Errors are isolated per scaler so that one failing
                # configuration does not skip the remaining ones.
                try:
                    result = _evaluate_scaler(name, scaler_name, X_train, X_test, y_train, y_test)
                    pd.DataFrame([result], columns=performance_columns).to_csv(performance_file, mode='a', header=False, index=False)
                    print(f"✅ Finished {name} | Scaler={scaler_name}")
                except Exception as e:
                    print(f"❌ Error processing dataset {name} | Scaler={scaler_name}: {e}")
