import numpy as np
import os
import scipy.io as spio

import pandas as pd
from scipy.io import arff


def read_csv_to_numpy(filepath):
    """Load a comma-separated file into a NumPy array.

    The first line of the file is treated as a header and skipped. Values
    are parsed with ``np.genfromtxt``'s default dtype (float).

    Args:
        filepath: Location of the CSV file to read.

    Returns:
        The array produced by ``np.genfromtxt`` for the remaining rows.
    """
    return np.genfromtxt(filepath, skip_header=1, delimiter=",")


def read_nat_to_numpy(filepath):
    """Read a MATLAB ``.mat`` file via ``scipy.io.loadmat``.

    Args:
        filepath: Location of the ``.mat`` file.

    Returns:
        Whatever ``spio.loadmat`` returns for the file (a mapping from
        variable names to the loaded values).
    """
    contents = spio.loadmat(filepath)
    return contents


def get_files_in_dir(dir, file_type=None):
    """Recursively collect file paths found beneath a directory.

    Every file name encountered during the walk is printed, whether or not
    it ends up in the result.

    Args:
        dir: Root directory handed to ``os.walk``.
        file_type: Optional suffix; when truthy, only files whose name ends
            with it are kept. When falsy, every file is kept.

    Returns:
        A list of paths (walk root joined with the file name) in the order
        ``os.walk`` yields them.
    """
    collected = []
    for parent, _subdirs, names in os.walk(dir):
        for name in names:
            print(name)
            # Skip non-matching names only when a suffix filter was given.
            if file_type and not name.endswith(file_type):
                continue
            collected.append(os.path.join(parent, name))
    return collected


def _remap_values(series, mapping):
    """Return *series* with values found in *mapping* swapped for their targets.

    Values that are not keys of *mapping* pass through unchanged. A callable
    ``Series.map`` is used rather than ``Series.replace`` so the result dtype
    does not rely on ``replace``'s deprecated implicit downcasting.
    """
    return series.map(lambda value: mapping.get(value, value))


def _load_arff(path):
    """Load an ARFF file; the last attribute is the label column."""
    records, _ = arff.loadarff(path)
    dataset = pd.DataFrame(records)

    # Nominal ARFF values arrive as bytes; map the known class names to 0/1.
    labels = _remap_values(
        dataset.iloc[:, -1],
        {b"0": 0, b"1": 1, b"Normal": 0, b"Anomaly": 1},
    ).to_numpy()

    # One-hot encode any non-numeric feature columns, then force float64.
    features = pd.get_dummies(dataset.iloc[:, :-1]).to_numpy()
    features = features.astype(np.float64)
    return (features, labels)


def _load_abalone(path):
    """Load the headerless abalone CSV; column 8 becomes the label."""
    data = pd.read_csv(path, header=None, sep=",")
    data = data.rename(columns={8: "y"})

    # Assign the results back instead of chaining ``inplace=True`` on a
    # temporary: the chained form silently does nothing under pandas
    # Copy-on-Write.
    data["y"] = data["y"].replace([8, 9, 10], 0).replace([3, 21], 1)

    # Encode the first column (M / F / I) as integers.
    first_column = data.columns[0]
    data[first_column] = _remap_values(
        data[first_column], {"M": 0, "F": 1, "I": 2}
    )

    labels = np.array(data["y"]).astype(np.int32).reshape(-1)
    features = data.drop(columns="y").to_numpy()
    return (features, labels)


def _load_ecoli(path):
    """Load the whitespace-separated ecoli file, normals first then anomalies."""
    # Raw string: "\s" is an invalid escape sequence in a normal literal.
    dataset = pd.read_csv(path, header=None, sep=r"\s+")
    dataset = dataset.iloc[:, 1:]  # drop the leading identifier column

    classes = dataset.iloc[:, 7]
    anomalies = np.array(dataset[classes.isin(["omL", "imL", "imS"])])[:, :-1]
    normals = np.array(
        dataset[classes.isin(["cp", "im", "pp", "imU", "om"])]
    )[:, :-1]

    # np.concatenate works on every NumPy release; np.concat needs >= 2.0.
    labels = np.concatenate(
        (np.zeros(normals.shape[0]), np.ones(anomalies.shape[0])), axis=0
    )
    data = np.concatenate((normals, anomalies), axis=0)
    data = data.astype(np.float64)
    return (data, labels)


def build_train_label(path):
    """Load a dataset file and split it into features and labels.

    The loader is selected from the file extension and, for some formats,
    from a keyword that must appear somewhere in *path*:

    * ``.mat``: variables ``X`` and ``y`` of the MATLAB file.
    * ``.arf`` / ``.arff``: last attribute is the label; ``0``/``Normal``
      map to 0 and ``1``/``Anomaly`` map to 1; features are one-hot encoded.
    * ``.data`` with ``abalone`` in the path: column 8 is the label
      (8, 9, 10 -> 0; 3, 21 -> 1; other values are kept as they are).
    * ``.data`` with ``ecoli`` in the path: classes ``cp``, ``im``, ``pp``,
      ``imU``, ``om`` are labelled 0 and ``omL``, ``imL``, ``imS`` are
      labelled 1; rows are returned normals first, then anomalies.
    * ``.npz`` with ``fraud``, ``backdoor``, ``campaign`` or ``progan`` in
      the path: arrays ``X`` and ``y`` of the archive.
    * ``.npy`` with ``separable`` in the path: last column is the label.

    Args:
        path: Path of the dataset file, as a string.

    Returns:
        A ``(data, labels)`` tuple, or ``None`` when *path* matches none of
        the supported patterns.
    """
    if path.endswith(".mat"):
        mat = read_nat_to_numpy(path)
        return (mat["X"], mat["y"].reshape(-1))

    if path.endswith((".arf", ".arff")):
        return _load_arff(path)

    if path.endswith(".data") and "abalone" in path:
        return _load_abalone(path)

    if path.endswith(".data") and "ecoli" in path:
        return _load_ecoli(path)

    if path.endswith(".npz") and any(
        tag in path for tag in ("fraud", "backdoor", "campaign", "progan")
    ):
        # NOTE(review): allow_pickle=True can execute arbitrary code when the
        # archive contains pickled objects; only load trusted files.
        with np.load(path, allow_pickle=True) as archive:
            data = archive["X"]
            labels = archive["y"].astype(np.int32)
        return (data, labels)

    if path.endswith(".npy") and "separable" in path:
        data = np.load(path)
        return (data[:, :-1], data[:, -1])

    # Unsupported file: keep the historical "no result" behaviour.
    return None
