################################################################################################
################################################################################################
# based on https://github.com/RoelBouman/outlierdetection/tree/master, adapted by first author
################################################################################################
################################################################################################

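#Current procedure does the following:
#Convert every raw dataset into a feature matrix X and an outlier label vector y and save it.
#Variance filtering and duplicate removal are currently disabled, so the data is stored unfiltered.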

#%% setup
import numpy as np
from scipy.io import loadmat
from scipy.io import arff
from scipy.stats import mode
#from scipy.stats import hypergeom
#from scipy.stats import binom
import h5py
import pandas as pd
import os
import pickle
import re
import json
import torch

# Input and output locations, relative to the current working directory.
raw_dir = "raw_data"
target_dir = "formatted_data"

#output format can be either pickle or npz
output_format = "npz" 


# exist_ok makes this idempotent and avoids the check-then-create race of
# "if not exists: mkdir".
os.makedirs(target_dir, exist_ok=True)

# One summary dict per processed dataset; turned into a table at the end.
dataset_summaries = []

#%% Define filtering function

def get_variance_filter_index(X, max_mode_samples):
    """Return a boolean mask of the columns of X that vary enough to keep.

    A column is kept (True) when its most frequent value occurs fewer than
    ``max_mode_samples`` times.

    Parameters:
        X: 2-D array-like; the mode is taken along axis 0 (per column).
        max_mode_samples: exclusive upper bound on the mode count.

    Returns:
        1-D numpy bool array with one entry per column of X.
    """
    _, X_mode_count = mode(X, axis=0)

    # Flatten instead of squeeze: with a single column, squeeze yields a 0-d
    # array, which cannot be iterated. ravel also covers both the old (1, n)
    # and the new (n,) count shapes returned by scipy.
    mode_counts = np.ravel(X_mode_count)

    variance_filter = np.asarray(mode_counts < max_mode_samples, dtype=bool)

    return variance_filter
   
    
def preprocess_data(X,y):
    """Bundle a feature matrix and its labels into the dict that gets saved.

    Duplicate removal and variance-based column filtering used to live here
    but are disabled; X and y are passed through untouched.

    Returns:
        dict with the keys "X" and "y" holding the given objects.
    """
    data_dict = dict(X=X, y=y)
    return data_dict

def make_dataset_summary(dataset_name, data_dict, origin):
    """Describe one dataset as a flat dict (one row of the overview table).

    Parameters:
        dataset_name: value stored under "Name".
        data_dict: dict with "X" (2-D, provides .shape) and "y" (labels whose
            sum is the number of outliers).
        origin: value stored under "Origin".

    Returns:
        dict with the keys "Name", "Origin", "#samples", "#features",
        "#outliers" and "%outliers", in that order.
    """
    features = data_dict["X"]
    labels = data_dict["y"]

    n_samples, n_variables = features.shape
    n_outliers = int(np.sum(labels))
    # Share of outliers in percent, rounded to two decimals.
    outlier_percentage = round(n_outliers / n_samples * 100, 2)

    summary = {}
    summary["Name"] = dataset_name
    summary["Origin"] = origin
    summary["#samples"] = n_samples
    summary["#features"] = n_variables
    summary["#outliers"] = n_outliers
    summary["%outliers"] = f"({outlier_percentage}%)"

    return summary
#%% ODDS
# Locations of the raw ODDS data: MATLAB files, and everything else (arff/.data).
data_dir = os.path.join(raw_dir, "ODDS_data_raw", "matfile_data")
nonmat_data_dir = os.path.join(raw_dir, "ODDS_data_raw", "other_data")


matfile_names = os.listdir(data_dir)

HDFlist = [] #use MATLAB 7.3 file format (need HDF reader)
# NOTE(review): an older comment here said ecoli, lympho and breastw were
# excluded, but only the .gitkeep placeholder is actually skipped.
exclude_list = [".gitkeep"]

origin = "ODDS"

#%%
# Regular mat files

for file_name in [f for f in matfile_names if f not in HDFlist and f not in exclude_list]:
    
    full_path_filename = os.path.join(data_dir, file_name)
    mat_file = loadmat(full_path_filename)
    print("----------------------------------------------------")
    print("Processing: " + file_name)
    print("----------------------------------------------------")

    X = mat_file["X"].astype(np.float64) 
    # hstack concatenates the rows of y, giving a flat label vector.
    y = np.hstack(mat_file["y"].astype(np.float64))
    if file_name == "wbc.mat":
        # NOTE(review): presumably renamed to avoid clashing with another
        # dataset called "wbc" -- confirm.
        dataset_name = "wbc2"
    else:
        # Raw string: '\.' is an invalid escape sequence in a normal literal.
        dataset_name = re.search(r'(.+?)\.mat', file_name).group(1)
    
    data_dict = preprocess_data(X, y)
    
    dataset_summary = make_dataset_summary(dataset_name, data_dict, origin)
    dataset_summaries.append(dataset_summary)
    
    # Write through a context manager so the file handle is always closed.
    if output_format == "pickle":
        target_file_name =  dataset_name + ".pickle"
        target_file_name_with_dir = os.path.join(target_dir, target_file_name)
        with open(target_file_name_with_dir, "wb") as out_file:
            pickle.dump(data_dict, out_file)
    elif output_format == "npz":
        target_file_name =  dataset_name + ".npz"
        target_file_name_with_dir = os.path.join(target_dir, target_file_name)
        with open(target_file_name_with_dir, "wb") as out_file:
            np.savez(out_file, **data_dict)
#%%
# HDF mat files
    
# Files listed in HDFlist are read with h5py instead of scipy's loadmat.
for file_name in [f for f in HDFlist if f not in exclude_list]:
    print("----------------------------------------------------")
    print("Processing: " + file_name)
    print("----------------------------------------------------")
    full_path_filename = os.path.join(data_dir, file_name)

    # [()] reads the whole dataset into memory, so the HDF file can be closed
    # as soon as X and y have been extracted.
    with h5py.File(full_path_filename, 'r') as mat_file:
        X = mat_file["X"][()].T.astype(np.float64)
        # Flattened so y is a plain label vector, as in the regular .mat loop.
        y = np.ravel(mat_file["y"][()].T.astype(np.float64))
    
    # Raw string: '\.' is an invalid escape sequence in a normal literal.
    dataset_name = re.search(r'(.+?)\.mat', file_name).group(1)
    
    data_dict = preprocess_data(X, y)
    
    dataset_summary = make_dataset_summary(dataset_name, data_dict, origin)
    dataset_summaries.append(dataset_summary)
    
    # Write through a context manager so the file handle is always closed.
    if output_format == "pickle":
        target_file_name =  dataset_name + ".pickle"
        target_file_name_with_dir = os.path.join(target_dir, target_file_name)
        with open(target_file_name_with_dir, "wb") as out_file:
            pickle.dump(data_dict, out_file)
    elif output_format == "npz":
        target_file_name =  dataset_name + ".npz"
        target_file_name_with_dir = os.path.join(target_dir, target_file_name)
        with open(target_file_name_with_dir, "wb") as out_file:
            np.savez(out_file, **data_dict)
    
#%%
#seismic arff files
    
#What should be done with the data in order to acquire the proper 11/19 attributes is unknown. I suppose that the first 11 are used, and the nbumps are all omitted.
# NOTE(review): despite the remark above, the code keeps every attribute.
file_name = "seismic-bumps.arff"   # used for logging and for the output name
file_name_load = "seismic.arff"    # file that is actually read from disk
print("----------------------------------------------------")
print("Processing: " + file_name)
print("----------------------------------------------------")
seismic = arff.loadarff(os.path.join(nonmat_data_dir, file_name_load))

seismic_data = pd.DataFrame(seismic[0])

# One-hot encode the nominal attributes, including the class label.
cat_columns = ["seismic", "seismoacoustic", "shift", "ghazard", "class"]
seismic_data_numerical = pd.get_dummies(seismic_data, prefix_sep="_", columns=cat_columns)

# get_dummies produces one indicator column per class value. These columns are
# complementary, so the last one serves as y and ALL of them must be removed
# from X; keeping the other indicator would leak the label into the features.
class_columns = [c for c in seismic_data_numerical.columns if str(c).startswith("class_")]
X = seismic_data_numerical.drop(columns=class_columns).values.astype(np.float64)
y = seismic_data_numerical[class_columns[-1]].values.astype(np.float64)

# Raw string: '\.' is an invalid escape sequence in a normal literal.
dataset_name = re.search(r'(.+?)\.arff', file_name).group(1)

data_dict = preprocess_data(X, y)

dataset_summary = make_dataset_summary(dataset_name, data_dict, origin)
dataset_summaries.append(dataset_summary)

# Write through a context manager so the file handle is always closed.
if output_format == "pickle":
    target_file_name =  dataset_name + ".pickle"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        pickle.dump(data_dict, out_file)
elif output_format == "npz":
    target_file_name =  dataset_name + ".npz"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        np.savez(out_file, **data_dict)

#%%
#mulcross arff files
    
# Mulcross: every column except the last is a feature; column 4 holds the
# nominal label, where 'Anomaly' marks an outlier.
file_name = "mulcross.arff"
print("----------------------------------------------------")
print("Processing: " + file_name)
print("----------------------------------------------------")
mulcross = arff.loadarff(os.path.join(nonmat_data_dir, file_name))

mulcross_data = pd.DataFrame(mulcross[0])

X = mulcross_data.values[:,:-1].astype(np.float64)

# loadarff returns nominal values as bytes. Decode them explicitly rather than
# relying on astype(str), which could yield "b'Anomaly'" and then label every
# sample as normal.
y_raw = [v.decode("utf-8") if isinstance(v, bytes) else str(v)
         for v in mulcross_data.iloc[:, 4]]
y = np.array([1 if v == 'Anomaly' else 0 for v in y_raw], dtype=np.float64)

# Raw string: '\.' is an invalid escape sequence in a normal literal.
dataset_name = re.search(r'(.+?)\.arff', file_name).group(1)

data_dict = preprocess_data(X, y)

dataset_summary = make_dataset_summary(dataset_name, data_dict, origin)
dataset_summaries.append(dataset_summary)

# Write through a context manager so the file handle is always closed.
if output_format == "pickle":
    target_file_name =  dataset_name + ".pickle"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        pickle.dump(data_dict, out_file)
elif output_format == "npz":
    target_file_name =  dataset_name + ".npz"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        np.savez(out_file, **data_dict)
   

#%% Yeast (stored as "yeast6"): class EXC is treated as the outlier class
# NOTE(review): the previous header said this data was commented out, but the
# cell is active.
# .data/csv files

file_name = "yeast.data"
print("----------------------------------------------------")
print("Processing: " + file_name)
print("----------------------------------------------------")
# sep=r"\s+" replaces the deprecated delim_whitespace=True.
yeast = pd.read_csv(os.path.join(nonmat_data_dir, file_name), sep=r"\s+", header=None)

# Columns 1..8 are used as features; column 9 is the class label.
X = yeast.iloc[:,1:9].values.astype(np.float64)
y = pd.get_dummies(yeast.iloc[:,9])[["EXC"]].sum(axis=1).values.astype(np.float64)

dataset_name = "yeast6"

data_dict = preprocess_data(X, y)

dataset_summary = make_dataset_summary(dataset_name, data_dict, origin="EOAD")
dataset_summaries.append(dataset_summary)


# Write through a context manager so the file handle is always closed.
if output_format == "pickle":
    target_file_name =  dataset_name + ".pickle"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        pickle.dump(data_dict, out_file)
elif output_format == "npz":
    target_file_name =  dataset_name + ".npz"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        np.savez(out_file, **data_dict)


#%% Goldstein CSV data


data_dir = os.path.join(raw_dir, "Goldstein_data_raw")

# Idempotent and free of the check-then-create race.
os.makedirs(target_dir, exist_ok=True)

csv_file_names = os.listdir(data_dir)

# Dataset names (without the "-unsupervised-ad.csv" suffix) to skip.
exclude_list =  []
exclude_list = [f+"-unsupervised-ad.csv" for f in exclude_list]
exclude_list += [".gitkeep"]
origin="Goldstein"

#%% Write Goldstein CSVs to pickles
for file_name in [f for f in csv_file_names if f not in exclude_list]:
    
    full_path_filename = os.path.join(data_dir, file_name)
    csv_file = pd.read_csv(full_path_filename)
    print("----------------------------------------------------")
    print("Processing: " + file_name)
    print("----------------------------------------------------")
    # The last column is the label ('o' marks an outlier); the rest are features.
    X = csv_file.iloc[:,:-1].values.astype(np.float64) 
    y_raw = csv_file.iloc[:,-1]
    y = np.array([1 if v == 'o' else 0 for v in y_raw], dtype=np.float64)
    # Raw string: '\.' is an invalid escape sequence in a normal literal.
    dataset_name = re.search(r'(.+?)-unsupervised-ad\.csv', file_name).group(1)
    print(dataset_name)

    categorical_variables = []
    print("no categorical variables")
    
    data_dict = preprocess_data(X, y)
    
    dataset_summary = make_dataset_summary(dataset_name, data_dict, origin)
    dataset_summaries.append(dataset_summary)
    
    # Write through a context manager so the file handle is always closed.
    if output_format == "pickle":
        target_file_name =  dataset_name + ".pickle"
        target_file_name_with_dir = os.path.join(target_dir, target_file_name)
        with open(target_file_name_with_dir, "wb") as out_file:
            pickle.dump(data_dict, out_file)
    elif output_format == "npz":
        target_file_name =  dataset_name + ".npz"
        target_file_name_with_dir = os.path.join(target_dir, target_file_name)
        with open(target_file_name_with_dir, "wb") as out_file:
            np.savez(out_file, **data_dict)


#%% Write ELKI data

data_dir = os.path.join(raw_dir, "ELKI_data_raw")

# Idempotent and free of the check-then-create race.
os.makedirs(target_dir, exist_ok=True)

arff_file_folders = os.listdir(data_dir)

exclude_list = [] 
file_name = "Parkinson_withoutdupl_norm_75.arff"

origin = "ELKI"

#%% Write Campos paper ARFF to pickles:
# Only the single Parkinson file named above is processed.

full_path_filename = os.path.join(data_dir, file_name)
arff_data = arff.loadarff(full_path_filename)
print("----------------------------------------------------")
print("Processing: " + file_name)
print("----------------------------------------------------")
arff_df = pd.DataFrame(arff_data[0])
# The last two columns are not features; the "outlier" column is the label.
X = arff_df.iloc[:,:-2].values.astype(np.float64)
y_raw = arff_df["outlier"]
# loadarff returns nominal values as bytes; anything but b'no' is an outlier.
y = np.array([0 if v == b'no' else 1 for v in y_raw], dtype=np.float64)
dataset_name = "Parkinson"
print(dataset_name)

data_dict = preprocess_data(X, y)

dataset_summary = make_dataset_summary(dataset_name, data_dict, origin)
dataset_summaries.append(dataset_summary)

# Write through a context manager so the file handle is always closed.
if output_format == "pickle":
    target_file_name =  dataset_name + ".pickle"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        pickle.dump(data_dict, out_file)
elif output_format == "npz":
    target_file_name =  dataset_name + ".npz"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        np.savez(out_file, **data_dict)

#%% write extended AE:

data_dir = os.path.join(raw_dir, "extended_AE_data_raw")

# Idempotent and free of the check-then-create race.
os.makedirs(target_dir, exist_ok=True)

csv_file_names = os.listdir(data_dir)

exclude_list = [] 

origin = "ex-AE"

#%% Write extended AE paper CSVs to pickles:
    
#%% nasa:
file_name = "nasa.csv"

full_path_filename = os.path.join(data_dir, file_name)
csv_file = pd.read_csv(full_path_filename)
print("----------------------------------------------------")
print("Processing: " + file_name)
print("----------------------------------------------------")
# Every column except the ones listed here is used as a feature; "Hazardous"
# is the label. columns.difference returns the remaining names sorted.
X = csv_file[csv_file.columns.difference(["Neo Reference ID", "Name", "Close Approach Date", "Epoch Date Close Approach", "Orbiting Body", "Orbit Determination Date", "Equinox", "Hazardous"])].values.astype(np.float64) 
y = csv_file["Hazardous"].values.astype(np.float64)
dataset_name = file_name.lower()[:-4]
print(dataset_name)

categorical_variables = []
print("no categorical variables")

data_dict = preprocess_data(X, y)

dataset_summary = make_dataset_summary(dataset_name, data_dict, origin)
dataset_summaries.append(dataset_summary)

# Write through a context manager so the file handle is always closed.
if output_format == "pickle":
    target_file_name =  dataset_name + ".pickle"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        pickle.dump(data_dict, out_file)
elif output_format == "npz":
    target_file_name =  dataset_name + ".npz"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        np.savez(out_file, **data_dict)
            
#%% https://www.kaggle.com/datasets/inIT-OWL/high-storage-system-data-for-energy-optimization

file_name = "HRSS_anomalous_optimized.csv"

full_path_filename = os.path.join(data_dir, file_name)
csv_file = pd.read_csv(full_path_filename)
print("----------------------------------------------------")
print("Processing: " + file_name)
print("----------------------------------------------------")
# "Labels" is the target and "Timestamp" is dropped; the remaining columns
# (sorted by name through columns.difference) are the features.
X = csv_file[csv_file.columns.difference(["Timestamp", "Labels"])].values.astype(np.float64) 
y = csv_file["Labels"].values.astype(np.float64)
# Dataset name = lower-cased file name without the ".csv" suffix.
dataset_name = file_name.lower()[:-4]
print(dataset_name)

categorical_variables = []
print("no categorical variables")

data_dict = preprocess_data(X, y)

dataset_summary = make_dataset_summary(dataset_name, data_dict, origin)
dataset_summaries.append(dataset_summary)

# Write through a context manager so the file handle is always closed.
if output_format == "pickle":
    target_file_name =  dataset_name + ".pickle"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        pickle.dump(data_dict, out_file)
elif output_format == "npz":
    target_file_name =  dataset_name + ".npz"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        np.savez(out_file, **data_dict)

file_name = "HRSS_anomalous_standard.csv"

full_path_filename = os.path.join(data_dir, file_name)
csv_file = pd.read_csv(full_path_filename)
print("----------------------------------------------------")
print("Processing: " + file_name)
print("----------------------------------------------------")
# "Labels" is the target and "Timestamp" is dropped; the remaining columns
# (sorted by name through columns.difference) are the features.
X = csv_file[csv_file.columns.difference(["Timestamp", "Labels"])].values.astype(np.float64) 
y = csv_file["Labels"].values.astype(np.float64)
# Dataset name = lower-cased file name without the ".csv" suffix.
dataset_name = file_name.lower()[:-4]
print(dataset_name)

categorical_variables = []
print("no categorical variables")

data_dict = preprocess_data(X, y)

dataset_summary = make_dataset_summary(dataset_name, data_dict, origin)
dataset_summaries.append(dataset_summary)

# Write through a context manager so the file handle is always closed.
if output_format == "pickle":
    target_file_name =  dataset_name + ".pickle"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        pickle.dump(data_dict, out_file)
elif output_format == "npz":
    target_file_name =  dataset_name + ".npz"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        np.savez(out_file, **data_dict)

#%% Ecoli: classes omL, imL and imS are the outliers; cp, im, pp, imU and om are normal
# NOTE(review): this cell previously carried a copy-pasted "Yeast data is
# commented out" header that did not describe it.
# .data/csv files

file_name = "ecoli.data"
print("----------------------------------------------------")
print("Processing: " + file_name)
print("----------------------------------------------------")

dataset_name = "ecoli"

# Raw string: '\s' is an invalid escape sequence in a normal literal.
dataset = pd.read_csv(os.path.join(nonmat_data_dir, file_name), header=None, sep=r'\s+')
# Drop the first column; afterwards position 7 holds the class label.
dataset = dataset.iloc[:, 1:]
class_labels = dataset.iloc[:, 7]
anomalies = np.array(dataset[class_labels.isin(['omL', 'imL', 'imS'])])[:, :-1]
normals = np.array(dataset[class_labels.isin(['cp', 'im', 'pp', 'imU', 'om'])])[:, :-1]
normals = torch.tensor(normals.astype('double'))
anomalies = torch.tensor(anomalies.astype('double'))
# Append the label as the last column: 0 for normal, 1 for anomaly.
normals = torch.cat((normals, torch.zeros(normals.shape[0], 1,dtype=torch.double)), dim=1)
anomalies = torch.cat((anomalies, torch.ones(anomalies.shape[0], 1,dtype=torch.double)), dim=1)
# NOTE(review): no random seed is set, so the row order within each group
# differs between runs.
normals = normals[torch.randperm(normals.shape[0])]
anomalies = anomalies[torch.randperm(anomalies.shape[0])]

# Normal rows first, then all anomalies.
data = torch.cat((normals, anomalies))

# If the tensor is on the GPU, move it to the CPU first
if data.is_cuda:
    data = data.cpu()

# Convert the torch tensor to a numpy array
data = data.numpy()

y = data[:, -1]
X = data[:, :-1]

data_dict = preprocess_data(X, y)

dataset_summary = make_dataset_summary(dataset_name, data_dict, origin="ICL")
dataset_summaries.append(dataset_summary)

# Write through a context manager so the file handle is always closed.
if output_format == "pickle":
    target_file_name =  dataset_name + ".pickle"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        pickle.dump(data_dict, out_file)
elif output_format == "npz":
    target_file_name =  dataset_name + ".npz"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        np.savez(out_file, **data_dict)

#%% Abalone: ring counts 3 and 21 are the outliers; 8, 9 and 10 are normal
# NOTE(review): this cell previously carried a copy-pasted "Yeast data is
# commented out" header that did not describe it.
# .data/csv files

file_name = "abalone.data"
print("----------------------------------------------------")
print("Processing: " + file_name)
print("----------------------------------------------------")

dataset_name = "abalone"

data = pd.read_csv(os.path.join(nonmat_data_dir, file_name), header=None, sep=',')
# Column 0 is the sex ('M'/'F'/'I'), column 8 the ring count.
data = data.rename(columns={0: 's', 8: 'y'})
# Encode the sex column numerically.
data['s'] = data['s'].replace({'M': 0, 'F': 1, 'I': 2})

# Select the classes from the ORIGINAL ring counts. Overwriting the rings with
# 0/1 first and then filtering on y == 1 would also pick up any sample whose
# original ring count is 1 and label it as an anomaly.
rings = data['y']
test = data[rings.isin([3, 21])].assign(y=1)
# sample(frac=1) shuffles the normal rows.
# NOTE(review): no random seed is set, so the order differs between runs.
normal = data[rings.isin([8, 9, 10])].assign(y=0).sample(frac=1)

print("test", test.shape)
print("normal", normal.shape)

# Normal rows first, then the anomalies; the label is the last column.
data_sort = np.concatenate((normal.astype('double'), test.astype('double')), axis = 0)
print("data_sort", data_sort.shape)

y = data_sort[:, -1]
X = data_sort[:, :-1]

data_dict = preprocess_data(X, y)

dataset_summary = make_dataset_summary(dataset_name, data_dict, origin="ICL")
dataset_summaries.append(dataset_summary)

# Write through a context manager so the file handle is always closed.
if output_format == "pickle":
    target_file_name =  dataset_name + ".pickle"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        pickle.dump(data_dict, out_file)
elif output_format == "npz":
    target_file_name =  dataset_name + ".npz"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        np.savez(out_file, **data_dict)

#%% MI-F/V
CNC_file_folder = "CNC-kaggle"


# experiment_01.csv ... experiment_18.csv
CNC_files = ["experiment_{:02d}.csv".format(i) for i in range(1,19)]

#%% MI-F
file_name = "MI-F.csv"

print("----------------------------------------------------")
print("Processing: " + file_name)
print("----------------------------------------------------")


# Experiments whose rows are labelled as anomalies for MI-F.
CNC_1_files = ["experiment_{:02d}.csv".format(i) for i in [4,5,7,16]]
# Filter the ordered list instead of going through a set: set iteration order
# for strings varies between interpreter runs, which would make the row order
# of the saved dataset non-reproducible.
CNC_0_files = [f for f in CNC_files if f not in CNC_1_files]

CNC_dfs = []
# Anomalous experiments first, then the normal ones.
for label_value, CNC_file_group in ((1, CNC_1_files), (0, CNC_0_files)):
    for CNC_file in CNC_file_group:
        full_path_filename = os.path.join(data_dir, CNC_file_folder, CNC_file)
        # Drop the last three columns; assign() returns a new frame, so no
        # column is set on a slice of the frame that was read.
        csv_file = pd.read_csv(full_path_filename).iloc[:,:-3].assign(label=label_value)
        CNC_dfs.append(csv_file)

data = pd.concat(CNC_dfs)
X = data[data.columns.difference(["label"])].values.astype(np.float64) 
y = data["label"].values.astype(np.float64)
dataset_name = "mif"
print(dataset_name)

categorical_variables = []
print("no categorical variables")

data_dict = preprocess_data(X, y)

dataset_summary = make_dataset_summary(dataset_name, data_dict, origin)
dataset_summaries.append(dataset_summary)

# Write through a context manager so the file handle is always closed.
if output_format == "pickle":
    target_file_name =  dataset_name + ".pickle"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        pickle.dump(data_dict, out_file)
elif output_format == "npz":
    target_file_name =  dataset_name + ".npz"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        np.savez(out_file, **data_dict)

#%% MI-V
file_name = "MI-V.csv"

print("----------------------------------------------------")
print("Processing: " + file_name)
print("----------------------------------------------------")

# Experiments left out of MI-V entirely (the MI-F anomaly experiments).
CNC_skip_files = ["experiment_{:02d}.csv".format(i) for i in [4,5,7,16]]
# Experiments whose rows are labelled as anomalies for MI-V.
CNC_1_files = ["experiment_{:02d}.csv".format(i) for i in [6,8,9,10]]
# NOTE(review): CNC_skip_files used to be defined but never applied, so the
# skipped experiments ended up as normal samples. They are now excluded, which
# is presumably what was intended -- confirm against the dataset definition.
# Filtering the ordered list (rather than a set difference) also keeps the
# file order, and hence the row order, reproducible between runs.
CNC_0_files = [f for f in CNC_files
               if f not in CNC_1_files and f not in CNC_skip_files]

CNC_dfs = []
# Anomalous experiments first, then the normal ones.
for label_value, CNC_file_group in ((1, CNC_1_files), (0, CNC_0_files)):
    for CNC_file in CNC_file_group:
        full_path_filename = os.path.join(data_dir, CNC_file_folder, CNC_file)
        # Drop the last three columns; assign() returns a new frame, so no
        # column is set on a slice of the frame that was read.
        csv_file = pd.read_csv(full_path_filename).iloc[:,:-3].assign(label=label_value)
        CNC_dfs.append(csv_file)

data = pd.concat(CNC_dfs)
X = data[data.columns.difference(["label"])].values.astype(np.float64) 
y = data["label"].values.astype(np.float64)
dataset_name = "miv"
print(dataset_name)

categorical_variables = []
print("no categorical variables")

data_dict = preprocess_data(X, y)

dataset_summary = make_dataset_summary(dataset_name, data_dict, origin)
dataset_summaries.append(dataset_summary)

# Write through a context manager so the file handle is always closed.
if output_format == "pickle":
    target_file_name =  dataset_name + ".pickle"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        pickle.dump(data_dict, out_file)
elif output_format == "npz":
    target_file_name =  dataset_name + ".npz"
    target_file_name_with_dir = os.path.join(target_dir, target_file_name)
    with open(target_file_name_with_dir, "wb") as out_file:
        np.savez(out_file, **data_dict)



#filter names:
#filter_rows = ["hrss_anomalous_standard", "speech", "vertebral"]
# NOTE(review): filter_rows is not applied anywhere; the drop/rename steps that
# would use it are disabled.
filter_rows = ["backdoor", "celeba", "fraud"]

# One row per dataset, sorted and indexed by dataset name.
summaries_df = pd.DataFrame(dataset_summaries).sort_values("Name")
summaries_df.set_index("Name", inplace=True)

# Both overview files are written next to the raw data (raw_dir).
file_name = "datasets_summaries.csv"
csv_path = os.path.join(raw_dir, file_name)

summaries_df.to_csv(csv_path, index=True)

file_name = "datasets_table.tex"
tab_path = os.path.join(raw_dir, file_name)

# The context manager closes the file even if to_latex raises.
with open(tab_path,"w") as table_file:
    summaries_df.to_latex(table_file)