import pandas as pd
import numpy as np
import h5py
from scipy.io import loadmat, arff
import time

# preprocess data and load as DataFrame
def load_data(data_path):
    """Load an outlier-detection dataset as a DataFrame plus integer labels.

    The reader is selected from substrings of ``data_path``:

    * contains ``'http'`` or ``'smtp'``: the file is opened with ``h5py`` and
      the datasets ``"X"`` and ``"y"`` are read and transposed;
    * contains ``'arff'``: the file is read with ``scipy.io.arff.loadarff``;
      the ``"outlier"`` column provides the label (0 when its decoded text
      contains ``'no'``, otherwise 1) and the ``"id"`` and ``"outlier"``
      columns are removed from the features;
    * anything else: the file is read with ``scipy.io.loadmat`` using the
      keys ``"X"`` and ``"y"``.

    Args:
        data_path: Path of the dataset file.

    Returns:
        A tuple ``(init_data, init_label, contamination)``:

        * ``init_data`` -- ``pandas.DataFrame`` of the features with columns
          named ``col0``, ``col1``, ...
        * ``init_label`` -- flat ``list`` of Python ``int`` labels.
        * ``contamination`` -- ``sum(init_label) / len(init_label)``, or
          ``0.0`` when there are no labels.
    """
    start = time.time()
    if 'http' in data_path or 'smtp' in data_path:
        # Open read-only and release the handle as soon as the arrays are in
        # memory; ``[()]`` reads the full dataset before the file is closed.
        with h5py.File(data_path, 'r') as h5py_data:
            init_data = np.transpose(h5py_data["X"][()])
            init_label = np.transpose(h5py_data["y"][()])
    elif 'arff' in data_path:
        arff_frame = pd.DataFrame(arff.loadarff(data_path)[0])
        # loadarff yields nominal values as bytes; decode before matching.
        outlier_text = arff_frame["outlier"].apply(lambda x: x.decode('utf-8'))
        init_label = outlier_text.apply(lambda x: 0 if 'no' in x else 1).values.tolist()
        init_data = arff_frame.drop(columns=["id", "outlier"]).values
    else:
        mat_data = loadmat(data_path)
        init_data = mat_data["X"]
        init_label = mat_data["y"]

    init_key = ["col" + str(index) for index in range(init_data.shape[1])]
    init_data = pd.DataFrame(init_data, columns=init_key)

    # Labels may arrive as a list, an (n, 1) column or a (1, n) row; flatten
    # first so each entry is a true scalar instead of relying on implicit
    # 1-element-array -> int conversion.
    init_label = np.asarray(init_label).ravel().astype(int).tolist()
    contamination = sum(init_label) / len(init_label) if init_label else 0.0
    print("-"*10 + data_path.split('/')[-1].split('.')[0] + " dataset loaded. Cost " + str(time.time() - start) + "s. " + "-"*10)

    return init_data, init_label, contamination

# generate negative samples
def generate_negative_samples(x, neg_rate, neg_min, neg_max, if_neg_every_feature):
    """Draw uniformly distributed negative (noise) samples shaped like ``x``.

    Args:
        x: Two-dimensional feature data (``numpy`` array or ``DataFrame``);
            rows are samples and the last axis holds the features.
        neg_rate: Ratio of generated samples to rows of ``x``; the count is
            ``int(x.shape[0] * neg_rate)``.
        neg_min: Lower bound used for every feature when
            ``if_neg_every_feature`` is false.
        neg_max: Upper bound used for every feature when
            ``if_neg_every_feature`` is false.
        if_neg_every_feature: When true, each feature is sampled between that
            feature's own minimum and maximum in ``x`` and ``neg_min`` /
            ``neg_max`` are ignored.

    Returns:
        A tuple ``(neg_x, neg_y)`` of ``float32`` arrays: ``neg_x`` has shape
        ``(n_samples, n_features)`` and ``neg_y`` is a vector of ones with
        one entry per generated sample.
    """
    start = time.time()

    # Work on a plain ndarray so the per-feature bounds below are indexed by
    # position; min()/max() on a DataFrame return label-indexed Series, for
    # which integer lookups are ambiguous.
    x = np.asarray(x)
    n_samples = int(x.shape[0] * neg_rate)  # number of negative samples
    n_dim = x.shape[-1]  # number of features
    if not if_neg_every_feature:
        neg_x = np.random.uniform(neg_min, neg_max, (n_samples, n_dim))
    else:
        min_feature, max_feature = x.min(axis=0), x.max(axis=0)
        neg_x = np.zeros((n_samples, n_dim))
        # Sample column by column (rather than one broadcast call) so the
        # order in which random numbers are consumed stays the same.
        for i in range(n_dim):
            neg_x[:, i] = np.random.uniform(min_feature[i], max_feature[i], n_samples)
    neg_y = np.ones(len(neg_x))

    print("-"*10 + "noise generated. Cost " + str(time.time() - start) + "s. " + "-"*10)
    return neg_x.astype('float32'), neg_y.astype('float32')

# 1. X = X - X.min() + 1  
# 2. ln(X) 
def ln_x(init_data):
    """Shift the data so its global minimum becomes 1, then take the natural log.

    Args:
        init_data: Array-like numeric data (anything ``numpy.array`` accepts).

    Returns:
        A new ``pandas.DataFrame`` built from the transformed array; it gets
        default integer row and column labels.
    """
    values = np.array(init_data)
    # One minimum over all entries (not per column): the smallest value maps
    # to 1, so the logarithm is defined everywhere and bottoms out at 0.
    lowest = np.min(values)
    shifted = values - lowest + 1
    return pd.DataFrame(np.log(shifted))