


import pandas as pd
import numpy as np
import time
import os 
from pathlib import Path

from Model.DVM_AD import DVM_AD

from sklearn.metrics import (roc_auc_score, precision_score, average_precision_score,
                             recall_score, f1_score, accuracy_score, matthews_corrcoef)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, StandardScaler, QuantileTransformer, RobustScaler, MinMaxScaler, MaxAbsScaler

BASE_DIR = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()
ROOT_DIR = BASE_DIR
while ROOT_DIR != ROOT_DIR.parent and not (ROOT_DIR / "process_tabular").is_dir():
    ROOT_DIR = ROOT_DIR.parent





def load_and_process_dataset_X(dataset_name, dataset_links):
    """Read the feature table of a dataset from ``<dataset_links><dataset_name>_X.csv``.

    ``dataset_links`` is used as a plain string prefix, so it has to end with
    a path separator already.

    Returns the parsed ``DataFrame``, or ``None`` when anything goes wrong
    while building the path or reading the file (the error is printed).
    """
    features = None
    try:
        features = pd.read_csv(f"{dataset_links}{dataset_name}_X.csv")
    except Exception as err:
        # Best effort: report the problem and let the caller skip this dataset.
        print(f"Error loading X dataset {dataset_name}: {err}")
    return features

def load_and_process_dataset_y(dataset_name, dataset_links):
    """Read the label table of a dataset from ``<dataset_links><dataset_name>_y.csv``.

    ``dataset_links`` is used as a plain string prefix, so it has to end with
    a path separator already.

    Returns the parsed ``DataFrame``, or ``None`` when anything goes wrong
    while building the path or reading the file (the error is printed).
    """
    labels = None
    try:
        labels = pd.read_csv(f"{dataset_links}{dataset_name}_y.csv")
    except Exception as err:
        # Best effort: report the problem and let the caller skip this dataset.
        print(f"Error loading y dataset {dataset_name}: {err}")
    return labels






def distance_vector(null_point_X, null_point_Y):
    """Pairwise Euclidean distances between the rows of two 2-D arrays.

    The squared distances are obtained from the expansion
    ``|y - x|^2 = |y|^2 + |x|^2 - 2 * y.x`` so that a single matrix product
    does the heavy lifting.

    Returns an array of shape ``(len(null_point_Y), len(null_point_X))`` whose
    entry ``[i, j]`` is the distance between ``null_point_Y[i]`` and
    ``null_point_X[j]``.
    """
    sq_norm_x = (null_point_X ** 2).sum(axis=1)
    sq_norm_y = (null_point_Y ** 2).sum(axis=1)
    cross_terms = np.dot(null_point_Y, null_point_X.T)

    sq_dist = sq_norm_y.reshape(-1, 1) + sq_norm_x.reshape(1, -1) - 2 * cross_terms
    # Round-off can leave a tiny negative value where the true squared
    # distance is zero; the absolute value keeps the square root real.
    return np.sqrt(np.abs(sq_dist))

def preprocess_data_OC(train_data, test_data):
    """Turn train/test frames into arrays for one-class training.

    Both frames carry the label in their last column and the features in all
    preceding columns. Only training rows whose label equals 0 are kept; the
    test split is returned in full.

    Returns ``(X_train, y_train, X_test, y_test)`` as NumPy arrays.
    """
    print("..............................Data Overview................................")
    print("Train Data Shape:", train_data.shape)
    print("Test Data Shape:", test_data.shape)

    def split_features_labels(frame):
        # Convert features and label separately so each keeps its own dtype.
        return frame.iloc[:, :-1].to_numpy(), frame.iloc[:, -1].to_numpy()

    all_train_features, all_train_labels = split_features_labels(train_data)
    is_normal = all_train_labels == 0
    X_train = all_train_features[is_normal]
    y_train = all_train_labels[is_normal]

    X_test, y_test = split_features_labels(test_data)

    print("Train Data Labels [0]:", np.unique(y_train))
    return X_train, y_train, X_test, y_test


def Model_evaluating(y_true, y_predict, y_scores):
    """Print and return the evaluation metrics for one run.

    ``y_predict`` holds the hard labels used for the threshold-based metrics;
    column 1 of ``y_scores`` holds the score used for the two ranking metrics.

    Returns ``[AUCROC, AUCPR, Accuracy, MCC, F1, PPV, TPR]`` where the first
    three entries are scaled to percentages.
    """
    print("..............................Report Parameter...............................")

    # Threshold-based metrics, computed from the hard predictions.
    mcc = matthews_corrcoef(y_true, y_predict)
    f1 = f1_score(y_true, y_predict)
    ppv = precision_score(y_true, y_predict)
    tpr = recall_score(y_true, y_predict)
    accuracy_pct = accuracy_score(y_true, y_predict) * 100

    # Ranking metrics, computed from the second score column.
    positive_scores = y_scores[:, 1]
    auc_pct = roc_auc_score(y_true, positive_scores) * 100
    aucpr_pct = average_precision_score(y_true, positive_scores) * 100

    # One table drives both the printout and the returned list.
    report = (
        ("AUCROC:", auc_pct),
        ("AUCPR:", aucpr_pct),
        ("Accuracy:", accuracy_pct),
        ("MCC:", mcc),
        ("F1 score:", f1),
        ("PPV (Precision):", ppv),
        ("TPR (Recall):", tpr),
    )
    for label, value in report:
        print(label, value)

    return [value for _, value in report]


def Get_Scaler(name):
    """Create a fresh, unfitted scaler for the given name.

    Recognised names are "StandardScaler", "MinMaxScaler", "RobustScaler",
    "Normalizer", "QuantileTransformer" (normal output distribution, fixed
    random state) and "MaxAbsScaler". The name "None" and any unrecognised
    name return ``None``, meaning "do not scale".
    """
    if name == "None":
        return None

    # (name, zero-argument factory) pairs; a new instance is built per call.
    factories = (
        ("StandardScaler", StandardScaler),
        ("MinMaxScaler", MinMaxScaler),
        ("RobustScaler", RobustScaler),
        ("Normalizer", Normalizer),
        (
            "QuantileTransformer",
            lambda: QuantileTransformer(output_distribution="normal", random_state=42),
        ),
        ("MaxAbsScaler", MaxAbsScaler),
    )
    for known_name, build in factories:
        if name == known_name:
            return build()
    return None





# Datasets to evaluate. For each name the files "<name>_X.csv" and
# "<name>_y.csv" are loaded from the data directory of the current outlier type.
# NOTE(review): every row of this literal used to start with a stray comma
# (a SyntaxError) where its first entry had been lost. The missing entries
# (1_ALOI, 7_Cardiotocography, 13_fraud, 19_landsat, 24_mnist, 30_satellite,
# 36_speech, 42_WBC) were restored from the gaps in the numeric prefixes using
# the ADBench naming scheme -- confirm them against the files on disk.
dataset_names = np.array([
    '1_ALOI', '2_annthyroid', '3_backdoor', '4_breastw', '5_campaign', '6_cardio',
    '7_Cardiotocography', '8_celeba', '9_census', '10_cover', '11_donors', '12_fault',
    '13_fraud', '14_glass', '15_Hepatitis', '16_http', '17_InternetAds', '18_Ionosphere',
    '19_landsat', '20_letter', '21_Lymphography', '22_magic.gamma', '23_mammography',
    '24_mnist', '25_musk', '26_optdigits', '27_PageBlocks', '28_pendigits', '29_Pima',
    '30_satellite', '31_satimage-2', '32_shuttle', '33_skin', '34_smtp', '35_SpamBase',
    '36_speech', '37_Stamps', '38_thyroid', '39_vertebral', '40_vowels', '41_Waveform',
    '42_WBC', '43_WDBC', '44_Wilt', '45_wine', '46_WPBC', '47_yeast',
])

# Synthetic outlier types. Each one selects the input directory
# "<type>_outliers_datasets" and gets its own results CSV.
__TYPE = ["cluster", "global", "dependency", "local"]

# Scaler names passed to Get_Scaler; "None" means the data is left unscaled.
__SCALER = ["None", "QuantileTransformer", "StandardScaler", "Normalizer", "RobustScaler", "MinMaxScaler", "MaxAbsScaler"]

# Header of the results CSV. It must line up with the 12-value row written by
# the main loop: dataset name, scaler name, the seven metrics returned by
# Model_evaluating (in its return order), then train time, test time, threshold.
# NOTE(review): the first entry of each row had been lost, leaving stray commas
# (a SyntaxError) and only 10 names for 12 values. "PPV (Precision)" matches
# the label Model_evaluating prints; "Dataset" is an assumed name for the
# dataset column -- confirm against existing result files.
columns = [
    "Dataset", "Scaler", "AUCROC", "AUCPR", "Accuracy", "MCC", "F1 score",
    "PPV (Precision)", "TPR (Recall)", "Time train", "Time test", "Threshold",
]







def _nearest_neighbor_distances(points):
    """Return, for every row of *points*, the distance to its closest other row."""
    pairwise = distance_vector(points, points)
    # Overwrite the self-distances on the diagonal with a huge value so that a
    # point is never selected as its own nearest neighbour.
    np.fill_diagonal(pairwise, 1e9)
    return np.amin(pairwise, axis=1)


def _evaluate_dataset(X, y, dataset_name, scaler_name):
    """Train and score DVM_AD on one dataset with one scaler; return the CSV row.

    The row is ``[dataset_name, scaler_name]`` + the metrics returned by
    ``Model_evaluating`` + ``[train time, test time, threshold]``. Exceptions
    are not handled here; the caller decides what to do with a failing dataset.
    """
    print("Original data size:", len(y))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # preprocess_data_OC expects the label as the last column of each frame.
    train_frame = pd.concat([X_train, y_train], axis=1, ignore_index=True)
    test_frame = pd.concat([X_test, y_test], axis=1, ignore_index=True)

    # The training time covers preprocessing, scaling and model fitting.
    t0 = time.time()

    X_train, y_train, X_test, y_test = preprocess_data_OC(train_frame, test_frame)

    scaler = Get_Scaler(scaler_name)
    if scaler is not None:
        print(f"Scaler = {scaler_name} → Applying scaling")
        # Fit on the training split only, then apply to both splits.
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
    else:
        print(f"Scaler = None → No scaling is applied for {dataset_name}")

    dvm_ad = DVM_AD()
    dvm_ad.fit(X_train, y_train)
    t1 = time.time()

    # NOTE(review): predict() is used as a per-sample anomaly score here;
    # confirm against Model.DVM_AD.
    y_score = dvm_ad.predict(X_test)
    t2 = time.time()

    # Reference scale: the largest nearest-neighbour distance among the
    # transformed training points.
    train_score = _nearest_neighbor_distances(dvm_ad.transform(X_train))
    max_train_score = np.max(train_score)

    # Column 1 is the test score relative to that scale, capped at 1;
    # column 0 is its complement.
    y_proba = np.zeros((len(y_score), 2))
    y_proba[:, 1] = np.minimum(y_score / max_train_score, 1)
    y_proba[:, 0] = 1 - y_proba[:, 1]

    # A capped score above 0.5 is the same as a raw score above `threshold`.
    y_predict = (y_proba[:, 1] > 0.5).astype(int)
    threshold = 0.5 * max_train_score

    metrics = Model_evaluating(y_test, y_predict, y_proba)

    return [dataset_name, scaler_name] + metrics + [t1 - t0, t2 - t1, threshold]


if __name__ == "__main__":

    # The output directory is the same for every outlier type: create it once.
    output_dir = ROOT_DIR / "process_tabular" / "raw_data"
    output_dir.mkdir(parents=True, exist_ok=True)

    for type_ in __TYPE:

        output_file = output_dir / f"dvmad_result_{type_}_synthetic_data.csv"
        # The loaders concatenate this prefix with the file name, so it must
        # end with a separator.
        dataset_links = ROOT_DIR / "Data" / "Synthetic_Datasets" / f"{type_}_outliers_datasets"
        dataset_links = str(dataset_links) + "/"

        # Write the header only once; rows from later runs are appended below.
        if not output_file.exists():
            pd.DataFrame(columns=columns).to_csv(output_file, index=False)

        for scaler_name in __SCALER:
            for dataset_name in dataset_names:

                X = load_and_process_dataset_X(dataset_name, dataset_links)
                y = load_and_process_dataset_y(dataset_name, dataset_links)

                # The loaders already printed the reason; skip this dataset.
                if X is None or y is None:
                    continue

                try:
                    result = _evaluate_dataset(X, y, dataset_name, scaler_name)
                    pd.DataFrame([result], columns=columns).to_csv(
                        output_file, mode="a", header=False, index=False
                    )

                    print(f"Result appended to {output_file}")

                except Exception as e:
                    # Per-dataset boundary: one failing dataset must not abort
                    # the whole benchmark run.
                    print(f"Error processing dataset {dataset_name}: {e}")
