import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from copulas.multivariate import VineCopula
from copulas.univariate import GaussianKDE

# Directory containing this script, or the current working directory when
# ``__file__`` is not defined (e.g. in an interactive session).
BASE_DIR = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()

# Project root: the nearest directory, starting at BASE_DIR and walking
# upwards, that contains a "process_tabular" sub-directory. If no ancestor
# qualifies, fall back to BASE_DIR rather than the filesystem root, so the
# "Data" tree is never created directly under "/" (or a drive root).
ROOT_DIR = next(
    (
        candidate
        for candidate in (BASE_DIR, *BASE_DIR.parents)
        if (candidate / "process_tabular").is_dir()
    ),
    BASE_DIR,
)

# Output directory for each anomaly type. The keys must match the entries of
# ANOMALY_TYPES because main() indexes this mapping with the mode name.
OUTPUT_DIRS = {
    "cluster": ROOT_DIR / "Data" / "Synthetic_Datasets" / "cluster_outliers_datasets",
    "local": ROOT_DIR / "Data" / "Synthetic_Datasets" / "local_outliers_datasets",
    "global": ROOT_DIR / "Data" / "Synthetic_Datasets" / "global_outliers_datasets",
    "dependency": ROOT_DIR / "Data" / "Synthetic_Datasets" / "dependency_outliers_datasets",
}

# Create every output directory up front (at import time) so that the CSV
# writes in main() never hit a missing directory.
for path in OUTPUT_DIRS.values():
    path.mkdir(parents=True, exist_ok=True)

# Input directory for each dataset domain. load_and_process_dataset() looks
# the domain up here; main() only requests the "Classical" domain.
DATASET_LINKS = {
    "Classical": ROOT_DIR / "Data" / "Classical",
}


# File stems of the datasets to process; "<name>.npz" is loaded for each one.
# NOTE(review): the first entry of every row (indices 1, 6, 11, ..., 41) was
# missing, which made this literal a syntax error. The names below were
# restored following the ADBench "Classical" dataset naming -- verify them
# against the actual files in Data/Classical.
DATASET_NAMES = np.array([
    "1_ALOI", "2_annthyroid", "3_backdoor", "4_breastw", "5_campaign",
    "6_cardio", "7_Cardiotocography", "8_celeba", "9_census", "10_cover",
    "11_donors", "12_fault", "13_fraud", "14_glass", "15_Hepatitis",
    "16_http", "17_InternetAds", "18_Ionosphere", "19_landsat", "20_letter",
    "21_Lymphography", "22_magic.gamma", "23_mammography", "24_mnist", "25_musk",
    "26_optdigits", "27_PageBlocks", "28_pendigits", "29_Pima", "30_satellite",
    "31_satimage-2", "32_shuttle", "33_skin", "34_smtp", "35_SpamBase",
    "36_speech", "37_Stamps", "38_thyroid", "39_vertebral", "40_vowels",
    "41_Waveform", "42_WBC", "43_WDBC", "44_Wilt", "45_wine", "46_WPBC", "47_yeast",
])

# Supported anomaly-generation modes, in the order main() processes them.
ANOMALY_TYPES = [
    "cluster",
    "global",
    "dependency",
    "local",
]


def _fit_best_gaussian_mixture(X_normal, seed, max_components=9):
    """Fit mixtures with 1..max_components components and return the lowest-BIC one.

    The sweep is capped at the number of available rows because a Gaussian
    mixture cannot be fitted with more components than samples.
    """
    upper = min(max_components, len(X_normal))
    models = [
        GaussianMixture(n_components=k, random_state=seed).fit(X_normal)
        for k in range(1, upper + 1)
    ]
    bic_scores = [model.bic(X_normal) for model in models]
    # Fitting is deterministic for a fixed integer seed, so the winning model
    # can be reused directly instead of being refitted.
    return models[int(np.argmin(bic_scores))]


def generate_realistic_synthetic(
    X: pd.DataFrame,
    y: pd.Series,
    realistic_synthetic_mode: str,
    alpha: float,
    percentage: float,
    seed: int = 42,
):
    """Build a synthetic dataset of normal points plus one kind of anomaly.

    A generative model is fitted on the rows of ``X`` labelled 0. As many
    synthetic normal points as there are label-0 rows are sampled from it, and
    as many synthetic anomalies as there are label-1 rows are then produced
    according to ``realistic_synthetic_mode``:

    * ``"local"``: sampled from the fitted Gaussian mixture after multiplying
      its covariances by ``alpha``.
    * ``"cluster"``: sampled from the fitted Gaussian mixture after
      multiplying its means by ``alpha``.
    * ``"global"``: drawn uniformly, per feature, between the minimum and the
      maximum of the synthetic normal points, each multiplied by
      ``1 + percentage``.
    * ``"dependency"``: normal points come from a vine copula fitted on at
      most 2000 normal rows (and at most 50 randomly chosen features);
      anomalies are sampled feature by feature from independent univariate
      Gaussian KDEs fitted on the real normal rows.

    Args:
        X: Feature matrix, one row per sample.
        y: Labels aligned with ``X`` by position; 0 marks normal rows and 1
            marks anomalies.
        realistic_synthetic_mode: One of ``ANOMALY_TYPES``.
        alpha: Scaling factor used by the ``"local"`` and ``"cluster"`` modes.
        percentage: Range expansion factor used by the ``"global"`` mode.
        seed: Seed for the Gaussian mixtures, the feature/row subsampling and
            the uniform draws of the ``"global"`` mode.

    Returns:
        A tuple ``(X_combined, y_combined)``: the synthetic normal points
        followed by the synthetic anomalies, and the matching 0/1 label array.

    Raises:
        NotImplementedError: If the mode is not one of ``ANOMALY_TYPES``.
        ValueError: If ``y`` contains no label-0 rows to fit a model on.
    """
    if realistic_synthetic_mode not in ANOMALY_TYPES:
        raise NotImplementedError(f"Mode {realistic_synthetic_mode} is not implemented.")

    # Flatten the labels and drop any index so that row selection below is
    # purely positional.
    y = pd.Series(np.asarray(y).reshape(-1))
    X = pd.DataFrame(X)

    pts_n = int((y == 0).sum())
    pts_a = int((y == 1).sum())
    if pts_n == 0:
        raise ValueError("No normal samples (label 0) available to fit the generative model.")

    X_normal = X[y.values == 0]

    # --- synthetic normal points -------------------------------------------
    if realistic_synthetic_mode in ("local", "cluster", "global"):
        gm = _fit_best_gaussian_mixture(X_normal, seed)
        X_synthetic_normal = gm.sample(pts_n)[0]

    elif realistic_synthetic_mode == "dependency":
        # Keep the copula fit tractable: at most 50 features ...
        if X_normal.shape[1] > 50:
            idx = np.random.RandomState(seed).choice(np.arange(X_normal.shape[1]), 50, replace=False)
            X_normal = X_normal.iloc[:, idx]

        # ... and at most 2000 rows.
        # NOTE(review): no seed is passed to the copula or the KDEs below, so
        # this mode may not be reproducible -- confirm against the copulas API.
        copula = VineCopula("center")
        X_sampled = X_normal.sample(n=min(2000, len(X_normal)), random_state=seed)
        copula.fit(pd.DataFrame(X_sampled))
        X_synthetic_normal = copula.sample(pts_n).values
    else:
        raise NotImplementedError(f"Mode {realistic_synthetic_mode} is not implemented.")

    # --- synthetic anomalies -----------------------------------------------
    if realistic_synthetic_mode == "local":
        # Same components, inflated spread.
        gm.covariances_ = alpha * gm.covariances_
        X_synthetic_anomalies = gm.sample(pts_a)[0]

    elif realistic_synthetic_mode == "cluster":
        # Same spread, shifted component centres.
        gm.means_ = alpha * gm.means_
        X_synthetic_anomalies = gm.sample(pts_a)[0]

    elif realistic_synthetic_mode == "dependency":
        # Sampling each feature independently keeps the marginals but
        # discards the dependence between features.
        X_synthetic_anomalies = np.zeros((pts_a, X_normal.shape[1]))
        for i in range(X_normal.shape[1]):
            kde = GaussianKDE()
            kde.fit(X_normal.iloc[:, i])
            X_synthetic_anomalies[:, i] = kde.sample(pts_a)

    elif realistic_synthetic_mode == "global":
        # Use a generator seeded from ``seed`` so this mode is reproducible
        # like the mixture-based ones (the global NumPy RNG is never seeded).
        rng = np.random.RandomState(seed)
        # NOTE(review): both bounds are scaled by (1 + percentage), which only
        # widens the range on the side pointing away from zero; kept as-is.
        low = X_synthetic_normal.min(axis=0) * (1 + percentage)
        high = X_synthetic_normal.max(axis=0) * (1 + percentage)
        X_synthetic_anomalies = rng.uniform(
            low=low, high=high, size=(pts_a, X_synthetic_normal.shape[1])
        )

    X_combined = np.concatenate((X_synthetic_normal, X_synthetic_anomalies), axis=0)
    y_combined = np.append(
        np.repeat(0, X_synthetic_normal.shape[0]),
        np.repeat(1, X_synthetic_anomalies.shape[0]),
    )

    return X_combined, y_combined


def load_and_process_dataset(name: str, domain: str):
    """Load ``<name>.npz`` from the directory registered for ``domain``.

    Args:
        name: Dataset file stem (without the ``.npz`` extension).
        domain: Key into ``DATASET_LINKS`` selecting the input directory.

    Returns:
        Whatever ``np.load`` returns for the archive, or ``None`` when the
        lookup or the load fails (the error is printed, not raised).
    """
    try:
        # The domain lookup stays inside the ``try`` so that an unknown domain
        # is reported the same way as a missing or unreadable file.
        # NOTE(review): allow_pickle=True can execute arbitrary code when the
        # archive holds pickled objects -- only use with trusted files.
        archive = np.load(DATASET_LINKS[domain] / f"{name}.npz", allow_pickle=True)
    except Exception as exc:
        print(f"Error loading {domain} dataset {name}: {exc}")
        return None
    return archive


def main():
    """Generate and save synthetic datasets for every dataset and anomaly type.

    For each name in ``DATASET_NAMES`` the "Classical" archive is loaded,
    subsampled to 10000 rows when larger, and passed to
    ``generate_realistic_synthetic`` once per entry of ``ANOMALY_TYPES``. The
    results are written as ``<name>_X.csv`` / ``<name>_y.csv`` into the
    matching ``OUTPUT_DIRS`` directory. Failures are printed and skipped: a
    dataset that cannot be loaded or prepared is skipped entirely, while a
    failure in one mode does not prevent the remaining modes from running.
    """
    # ``alpha`` per mode; ``percentage`` is 0.1 for all of them.
    alpha_by_mode = {"global": 1.1, "cluster": 5, "local": 5, "dependency": 1.1}

    for name in DATASET_NAMES:
        data = load_and_process_dataset(name, "Classical")
        if data is None:
            continue

        try:
            # Read both arrays into memory, then close the archive so its
            # file handle is not kept open for the rest of the run.
            with data:
                X, y = pd.DataFrame(data["X"]), pd.Series(data["y"].reshape(-1))
            print(f"Original data size: {len(y)}")

            if len(y) > 10000:
                print("Reducing data size to 10000")
                # Keep the "test" part of the split, i.e. exactly 10000 rows.
                _, X, _, y = train_test_split(X, y, test_size=10000, random_state=42)
        except Exception as exc:
            print(f"Error processing dataset {name}: {exc}")
            continue

        for mode in ANOMALY_TYPES:
            if mode not in alpha_by_mode:
                continue

            # Each mode is isolated so that one failing generator does not
            # discard the modes that come after it for this dataset.
            try:
                X_gen, y_gen = generate_realistic_synthetic(
                    X, y, mode, alpha=alpha_by_mode[mode], percentage=0.1
                )

                output_dir = OUTPUT_DIRS[mode]
                pd.DataFrame(X_gen).to_csv(output_dir / f"{name}_X.csv", index=False)
                pd.DataFrame(y_gen).to_csv(output_dir / f"{name}_y.csv", index=False)
            except Exception as exc:
                print(f"Error processing dataset {name} ({mode} mode): {exc}")
                continue

            print(f"Saved synthetic data for {name} in {output_dir}")


if __name__ == "__main__":
    main()
