__author__ = ''
__date__ = '2023/09/01'

'''
    Data preprocessing for the Adult and KDD datasets.
'''
from os import path as osp
import sys
sys.path.append(osp.dirname(osp.dirname(osp.abspath(__file__))))
import numpy as np
import pandas as pd
import numpy as np
from scipy.io import loadmat


from configs import DATA_DIR



def Adult_preprocess():
    """Clean and numerically encode the Adult dataset, then save it as ``.npy``.

    Reads ``Adult/adult.data`` under ``DATA_DIR``, drops every row that
    contains the missing-value marker ``' ?'``, encodes the label and all
    non-numeric columns as numbers, reorders the columns so that ``sex`` comes
    first and ``y`` last, and writes the result as a float32 array to
    ``Adult/complete_data.npy`` under ``DATA_DIR``.

    Encoding:
        * ``y``: ``' >50K'`` -> 1, ``' <=50K'`` -> 0
        * ``sex``: ``' Female'`` -> 1, ``' Male'`` -> -1
        * remaining categorical columns: 1-based codes assigned in sorted
          order of the distinct values, so the encoding is identical on
          every run.

    Raises:
        ValueError: if no row survives cleaning, or if ``y`` / ``sex``
            contain a value outside the mappings above.
    """
    # https://archive.ics.uci.edu/dataset/2/adult
    data_path = osp.join(DATA_DIR, 'Adult/adult.data')

    # read data
    column_names = ['age', 'workclass', 'fnlwgt', 'education',
                    'education-num', 'marital-status', 'occupation',
                    'relationship', 'race', 'sex', 'capital-gain',
                    'capital-loss',
                    'hours-per-week', 'native-country', 'y']
    df_data = pd.read_csv(data_path, header=None, names=column_names)
    print(f'all data: {len(df_data)}')

    # data cleaning: one vectorized mask instead of growing a frame row by row.
    # Cells are compared by their string form so numeric columns never match.
    has_missing = (df_data.astype(str) == ' ?').any(axis=1)
    # Copy so the assignments below modify an independent frame, not a view.
    df_data_clean = df_data.loc[~has_missing].copy()
    if df_data_clean.empty:
        raise ValueError(f'no rows without missing values found in {data_path}')

    print(f'clean data: {df_data_clean.shape}')

    binary_encodings = {
        'y': {' >50K': 1, ' <=50K': 0},
        'sex': {' Female': 1, ' Male': -1},
    }
    for column, mapping in binary_encodings.items():
        encoded = df_data_clean[column].map(mapping)
        unmapped = encoded.isna()
        if unmapped.any():
            # Fail here with the offending values rather than later during the
            # float conversion.
            unknown = sorted(df_data_clean.loc[unmapped, column].astype(str).unique())
            raise ValueError(f'unexpected values in column {column!r}: {unknown}')
        df_data_clean[column] = encoded.astype(np.int64)

    print(f'a instance before preprocess: {df_data_clean.values[0]}')
    categorical_columns = ['workclass', 'marital-status', 'occupation', 'education',
                           'relationship', 'race', 'native-country']
    for column in categorical_columns:
        # Sorted order makes the codes reproducible; iterating a set of strings
        # yields a different order from one interpreter run to the next.
        categories = sorted(df_data_clean[column].dropna().unique())
        codes = {value: code for code, value in enumerate(categories, start=1)}
        df_data_clean[column] = df_data_clean[column].map(codes)

    df_data_clean = df_data_clean[['sex', 'age', 'native-country', 'race', 'workclass', 'fnlwgt',
                                   'education', 'education-num', 'marital-status', 'occupation',
                                   'relationship', 'capital-gain', 'capital-loss',
                                   'hours-per-week', 'y']]
    print(f'a instance after preprocess: {df_data_clean.values[0]}')

    # saving complete data
    complete_data = df_data_clean.to_numpy(dtype=np.float32)

    print(f'complete_data: {type(complete_data), complete_data.shape}')
    save_path = osp.join(DATA_DIR, 'Adult/complete_data.npy')
    np.save(save_path, complete_data)
    print('Saving complete data successfully!')


def KDD_preprocess(num_normal=0, num_abnormal=0):
    """One-hot encode the KDD data and save it with a binary label column.

    Reads the feature names/types from ``KDD/kddcup.names`` and the samples
    from ``KDD/kddcup.data_10_percent.gz`` (both under ``DATA_DIR``), one-hot
    encodes every symbolic feature, appends a label column (1 where
    ``status == 'normal.'``, 0 otherwise) and writes the resulting array to
    ``KDD/complete_data.npy``.

    Args:
        num_normal: Not used by this function; kept so existing callers
            keep working.
        num_abnormal: Not used by this function; kept so existing callers
            keep working.
    """
    path = osp.join(DATA_DIR, 'KDD')
    data_file = osp.join(path, "kddcup.data_10_percent.gz")
    names_file = osp.join(path, "kddcup.names")

    # The first line of the names file is skipped; the remaining lines are
    # parsed as "<name>:<type>" pairs.
    column_name = pd.read_csv(names_file, skiprows=1, sep=':', names=['f_names', 'f_types'])
    # The label column is not listed in the names file, so append it by hand.
    column_name.loc[column_name.shape[0]] = ['status', ' symbolic.']
    data = pd.read_csv(data_file, header=None, names=column_name['f_names'].values)

    # Literal match: as a regex the trailing "." would match any character.
    is_symbolic = column_name['f_types'].str.contains('symbolic.', regex=False, na=False)
    # Drop the last symbolic entry ('status'): it is the label, not a feature.
    symbolic_features = column_name.loc[is_symbolic, 'f_names'].iloc[:-1].tolist()

    # Explicit integer dtype: with pandas >= 2.0 the default dummy dtype is
    # bool, which together with the numeric columns turns the final array into
    # an object array.
    samples = pd.get_dummies(data.iloc[:, :-1], columns=symbolic_features, dtype=np.uint8)

    labels = np.where(data['status'] == 'normal.', 1, 0).reshape(-1, 1)

    complete_data = np.concatenate((samples.to_numpy(), labels), axis=1)
    print(f'data: {type(samples), samples.shape}')
    print(f'labels: {type(labels), labels.shape}')
    print(f'complete data: {complete_data.shape}')

    # saving complete data
    np.save(osp.join(path, 'complete_data.npy'), complete_data)
    print('Saving complete data successfully!')
