import pandas as pd
import arff
from sklearn import datasets
import zipfile
from gemclus.data import celeux_one
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Names of every dataset this module can load. Each entry ``<name>`` is served
# by the module-level function ``get_<name>`` (see ``get_data``).
all_datasets = [
    "avila",
    "breast_cancer",
    "car_evaluation",
    "celeux_s1",
    "celeux_s2",
    "celeux_s3",
    "celeux_s4",
    "celeux_s5",
    "congressional_votes",
    "digits",
    "haberman_survival",
    "iris",
    "mice_protein",
    "poker_hand",
    "sensorless",
    "vowel",
    "wine",
]


def get_df_from_zip(zip_name, file_name, **read_csv_kwargs):
    """Read one CSV-like member of a zip archive into a DataFrame.

    Parameters
    ----------
    zip_name : str or path-like
        Path to the zip archive.
    file_name : str
        Name of the member inside the archive to parse.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pandas.read_csv`` (for example
        ``delimiter=" "``). ``header`` defaults to ``None``, i.e. the member
        is assumed to have no header row unless the caller says otherwise.

    Returns
    -------
    pandas.DataFrame
        The parsed content of the archive member.
    """
    # Keep the historical behaviour (no header row) unless explicitly overridden.
    read_csv_kwargs.setdefault("header", None)

    # Local names deliberately avoid shadowing the ``zip`` builtin.
    with zipfile.ZipFile(zip_name) as archive:
        with archive.open(file_name, "r") as member:
            return pd.read_csv(member, **read_csv_kwargs)


def get_celeux_s1(path_to_data):
    """Return the result of ``celeux_one`` called with n=30, p=20, mu=0.6.

    ``path_to_data`` is not used; it is accepted so that this loader has the
    same signature as the file-based loaders dispatched by ``get_data``.
    """
    settings = {"n": 30, "p": 20, "mu": 0.6}
    return celeux_one(**settings)


def get_celeux_s2(path_to_data):
    """Return the result of ``celeux_one`` called with n=30, p=20, mu=1.7.

    ``path_to_data`` is not used; it is accepted so that this loader has the
    same signature as the file-based loaders dispatched by ``get_data``.
    """
    settings = {"n": 30, "p": 20, "mu": 1.7}
    return celeux_one(**settings)


def get_celeux_s3(path_to_data):
    """Return the result of ``celeux_one`` called with n=300, p=20, mu=0.6.

    ``path_to_data`` is not used; it is accepted so that this loader has the
    same signature as the file-based loaders dispatched by ``get_data``.
    """
    settings = {"n": 300, "p": 20, "mu": 0.6}
    return celeux_one(**settings)


def get_celeux_s4(path_to_data):
    """Return the result of ``celeux_one`` called with n=300, p=20, mu=1.7.

    ``path_to_data`` is not used; it is accepted so that this loader has the
    same signature as the file-based loaders dispatched by ``get_data``.
    """
    settings = {"n": 300, "p": 20, "mu": 1.7}
    return celeux_one(**settings)


def get_celeux_s5(path_to_data):
    """Return the result of ``celeux_one`` called with n=300, p=95, mu=1.7.

    ``path_to_data`` is not used; it is accepted so that this loader has the
    same signature as the file-based loaders dispatched by ``get_data``.
    """
    settings = {"n": 300, "p": 95, "mu": 1.7}
    return celeux_one(**settings)


def get_avila(path_to_data):
    """Load the Avila dataset from ``<path_to_data>/avila.arff``.

    Parameters
    ----------
    path_to_data : str
        Directory containing ``avila.arff``.

    Returns
    -------
    X : numpy.ndarray
        Every ARFF attribute except the columns ``"10"``, ``"train"`` and
        ``"test"``.
    y : numpy.ndarray
        Integer-encoded values of column ``"10"`` (the labels).
    """
    # The context manager guarantees the file handle is closed; the frame is
    # built while the file is still open in case rows are decoded lazily.
    with open(f"{path_to_data}/avila.arff", "r") as arff_file:
        dataset = arff.load(arff_file)

        # Put to dataframe format
        df = pd.DataFrame(dataset['data'], columns=[x[0] for x in dataset['attributes']])

    # Extract labels
    y = LabelEncoder().fit_transform(df["10"])

    # Drop columns "10", "train" and "test" so only the features remain
    X = df.drop(columns=["10", "train", "test"]).to_numpy()

    return X, y


def get_vowel(path_to_data):
    """Load the vowel dataset from ``<path_to_data>/vowel.arff``.

    Parameters
    ----------
    path_to_data : str
        Directory containing ``vowel.arff``.

    Returns
    -------
    X : numpy.ndarray
        The ten columns ``"Feature 0"`` .. ``"Feature 9"``.
    y : numpy.ndarray
        Integer-encoded values of the ``"binaryClass"`` column.
    """
    # The context manager guarantees the file handle is closed; the frame is
    # built while the file is still open in case rows are decoded lazily.
    with open(f"{path_to_data}/vowel.arff", "r") as arff_file:
        dataset = arff.load(arff_file)

        # Put to dataframe format
        df = pd.DataFrame(dataset['data'], columns=[x[0] for x in dataset['attributes']])

    # Get the label
    y = LabelEncoder().fit_transform(df["binaryClass"])

    # Keep only the features
    feature_names = [f"Feature {i}" for i in range(10)]
    X = df[feature_names].to_numpy()

    return X, y



def get_poker_hand(path_to_data):
    """Load the poker hand training set from ``<path_to_data>/poker_hand.zip``.

    Parameters
    ----------
    path_to_data : str
        Directory containing ``poker_hand.zip``.

    Returns
    -------
    X : numpy.ndarray
        All columns of ``poker-hand-training-true.data`` except the last one.
    y : numpy.ndarray
        The last column, used as labels. It is returned as a NumPy array
        (not a pandas Series) so that the output type matches the other
        loaders in this module.
    """
    df = get_df_from_zip(f"{path_to_data}/poker_hand.zip", "poker-hand-training-true.data")

    label_column = df.columns[-1]
    y = df[label_column].to_numpy()
    X = df.drop(columns=[label_column]).to_numpy()

    return X, y


def get_haberman_survival(path_to_data):
    """Load Haberman's survival data from ``<path_to_data>/haberman_survival.zip``.

    Returns
    -------
    features : numpy.ndarray
        All columns of ``haberman.data`` except the last one.
    targets : numpy.ndarray
        The last column shifted down by one (presumably mapping {1, 2} to
        {0, 1}; the shift itself is unconditional).
    """
    frame = get_df_from_zip(f"{path_to_data}/haberman_survival.zip", "haberman.data")
    target_column = frame.columns[-1]

    # Targets are stored in the last column; subtract one to re-base them.
    targets = frame[target_column].to_numpy() - 1

    # Whatever remains once the target column is removed is the feature matrix.
    features = frame.drop(columns=[target_column]).to_numpy()

    return features, targets


def get_breast_cancer(path_to_data):
    """Load the Wisconsin breast cancer data from ``<path_to_data>/breast_cancer.zip``.

    Rows containing a ``"?"`` placeholder are discarded.

    Parameters
    ----------
    path_to_data : str
        Directory containing ``breast_cancer.zip``.

    Returns
    -------
    X : numpy.ndarray of float
        All columns except the first (ID) and the last (label).
    y : numpy.ndarray
        Labels recoded from {2, 4} to {0, 1}.
    """
    df = get_df_from_zip(f"{path_to_data}/breast_cancer.zip", "breast-cancer-wisconsin.data")

    # Drop first column (ID)
    df = df.drop(columns=[0])

    # Drop samples that have nan. "?" is replaced by an explicit NaN rather
    # than ``None``: depending on the pandas version, ``replace("?", None)``
    # may forward-fill from the previous row instead of inserting a missing
    # value, which would leave nothing for ``dropna`` to remove.
    df = df.replace("?", float("nan"))
    df = df.dropna(axis=0, how="any")

    # Extract labels
    label_column = df.columns[-1]
    y = df[label_column].to_numpy()
    df = df.drop(columns=[label_column])

    # Pass from {2,4} encoding to {0,1}
    y = y // 2 - 1

    return df.to_numpy().astype(float), y


def get_congressional_votes(path_to_data):
    """Load the 1984 congressional voting records.

    Reads ``house-votes-84.data`` from
    ``<path_to_data>/congressional_voting_records.zip``.

    Parameters
    ----------
    path_to_data : str
        Directory containing ``congressional_voting_records.zip``.

    Returns
    -------
    X : numpy.ndarray
        Votes encoded as ``n`` -> -1, ``?`` -> 0, ``y`` -> 1.
    y : numpy.ndarray
        Integer-encoded values of the first column (the labels).

    Raises
    ------
    ValueError
        If a vote cell holds something other than ``"n"``, ``"?"`` or ``"y"``.
    """
    df = get_df_from_zip(f"{path_to_data}/congressional_voting_records.zip", "house-votes-84.data")

    # Extract label and encode
    y = LabelEncoder().fit_transform(df[0])
    votes = df.drop(columns=[0])

    # Convert {n, ?, y} to {-1,0,1}. Each column is mapped with ``Series.map``
    # instead of ``DataFrame.applymap``, which is deprecated in recent pandas.
    vote_levels = ["n", "?", "y"]
    X = votes.apply(
        lambda column: column.map(lambda vote: vote_levels.index(vote) - 1)
    ).to_numpy()

    return X, y


def get_car_evaluation(path_to_data):
    """Load the car evaluation data from ``<path_to_data>/car_evaluation.zip``.

    Returns
    -------
    encoded_features : numpy.ndarray
        Dense one-hot encoding of every column of ``car.data`` except the last.
    targets : numpy.ndarray
        Integer-encoded values of the last column.
    """
    raw = get_df_from_zip(f"{path_to_data}/car_evaluation.zip", "car.data")
    target_column = raw.columns[-1]

    # The targets live in the last column; encode them before removing it.
    targets = LabelEncoder().fit_transform(raw[target_column])
    categorical = raw.drop(columns=[target_column])

    # Every remaining feature is expanded into one-hot indicator columns.
    encoder = OneHotEncoder(sparse_output=False)
    encoded_features = encoder.fit_transform(categorical)

    return encoded_features, targets


def get_mice_protein(path_to_data):
    """Load the mice protein expression data.

    Reads ``Data_Cortex_Nuclear.xls`` from
    ``<path_to_data>/mice_protein_expression.zip`` and discards every row
    that has at least one missing value.

    Parameters
    ----------
    path_to_data : str
        Directory containing ``mice_protein_expression.zip``.

    Returns
    -------
    X : numpy.ndarray
        All columns except ``Genotype``, ``Treatment``, ``Behavior``,
        ``MouseID`` and ``class``.
    y : numpy.ndarray
        Integer-encoded values of the ``class`` column.
    """
    # Local names deliberately avoid shadowing the ``zip`` builtin.
    with zipfile.ZipFile(f"{path_to_data}/mice_protein_expression.zip") as archive:
        with archive.open("Data_Cortex_Nuclear.xls", "r") as member:
            df = pd.read_excel(member)

    df = df.drop(columns=["Genotype", "Treatment", "Behavior", "MouseID"])

    # Incomplete rows are removed BEFORE the labels are encoded so that X and
    # y keep the same number of rows.
    df = df.dropna(axis=0, how="any")
    y = LabelEncoder().fit_transform(df["class"])
    X = df.drop(columns=["class"]).to_numpy()

    return X, y


def get_sensorless(path_to_data):
    """Load the sensorless drive diagnosis data.

    Reads the space-delimited ``Sensorless_drive_diagnosis.txt`` from
    ``<path_to_data>/sensorless_drive_diagnosis.zip``.

    Parameters
    ----------
    path_to_data : str
        Directory containing ``sensorless_drive_diagnosis.zip``.

    Returns
    -------
    X : numpy.ndarray
        All columns except the last one.
    y : numpy.ndarray
        The last column, used as labels. It is returned as a NumPy array
        (not a pandas Series) so that the output type matches the other
        loaders in this module.
    """
    # Local names deliberately avoid shadowing the ``zip`` builtin.
    with zipfile.ZipFile(f"{path_to_data}/sensorless_drive_diagnosis.zip") as archive:
        with archive.open("Sensorless_drive_diagnosis.txt", "r") as member:
            df = pd.read_csv(member, delimiter=" ", header=None)

    label_column = df.columns[-1]
    y = df[label_column].to_numpy()
    X = df.drop(columns=[label_column]).to_numpy()

    return X, y


def get_iris(path_to_data):
    """Return scikit-learn's bundled iris dataset as an ``(X, y)`` pair.

    ``path_to_data`` is not used; it is accepted so that this loader has the
    same signature as the file-based loaders dispatched by ``get_data``.
    """
    features, targets = datasets.load_iris(return_X_y=True)
    return features, targets


def get_digits(path_to_data):
    """Return scikit-learn's bundled digits dataset as an ``(X, y)`` pair.

    ``path_to_data`` is not used; it is accepted so that this loader has the
    same signature as the file-based loaders dispatched by ``get_data``.
    """
    features, targets = datasets.load_digits(return_X_y=True)
    return features, targets

def get_wine(path_to_data):
    """Return scikit-learn's bundled wine dataset as an ``(X, y)`` pair.

    ``path_to_data`` is not used; it is accepted so that this loader has the
    same signature as the file-based loaders dispatched by ``get_data``.
    """
    features, targets = datasets.load_wine(return_X_y=True)
    return features, targets

def get_data(name, path_to_data):
    """Load the dataset called ``name`` by dispatching to ``get_<name>``.

    Parameters
    ----------
    name : str
        One of the entries of ``all_datasets``.
    path_to_data : str
        Directory holding the dataset files; forwarded unchanged to the
        selected loader.

    Returns
    -------
    Whatever the selected ``get_<name>`` loader returns.

    Raises
    ------
    AssertionError
        If ``name`` is not listed in ``all_datasets``.
    """
    # Raised explicitly (rather than via ``assert``) so the validation still
    # runs under ``python -O``; the exception type is kept for existing callers.
    if name not in all_datasets:
        raise AssertionError(
            f"Unknown dataset {name!r}; expected one of {all_datasets}"
        )

    loader = globals()[f"get_{name}"]
    return loader(path_to_data)


if __name__ == "__main__":
    # Smoke run: load every known dataset from the local "datasets" folder and
    # report its shape, its number of distinct labels and 80% of its size.
    kept_fraction = 0.8
    for dataset in all_datasets:
        print(f"Dataset {dataset}")

        X, y = get_data(dataset, "datasets")
        print(f"\tX = {X.shape} / y = {len(pd.unique(y))}")

        # int() truncates towards zero, so the size is rounded down.
        new_size = int(len(X) * kept_fraction)
        print(f"\tNew size will be: {new_size}")
