!pip install pulp
import os
import numpy as np
import sys
# sys.path.append("/data/")
import scipy.stats
from os import path
from sklearn.datasets import load_svmlight_file


def loadData(datasetName):
    """Load one of the supported datasets by name.

    Parameters
    ----------
    datasetName : str
        Case-sensitive dataset key: "synthetic", "Boston", "Diabetes",
        "California", "TomBlog", "Soil", "indoor", "Fruit", "Meat",
        "Parkinsons", "superconductor", "onlineNewsPopularity", "Twitter",
        "Blog", "ghg", "CTslice" or "malware".

    Returns
    -------
    The array returned by the loader registered for ``datasetName``.

    Raises
    ------
    ValueError
        If ``datasetName`` is not one of the keys above.
    """
    # Every call is wrapped in a lambda so that a loader is only looked up
    # and executed when its dataset is actually requested.  The integer
    # argument is the row cap handed to the loader (-1 means "no cap").
    loaders = {
        "synthetic": lambda: loadsynthetic(),
        "Boston": lambda: loadBoston(),
        "Diabetes": lambda: loadDiabetes(),
        "California": lambda: loadCalifornia(),
        "TomBlog": lambda: loadTomBlog(-1),
        "Soil": lambda: loadSoil(-1),
        "indoor": lambda: loadIndoor(-1),
        "Fruit": lambda: loadFruit(-1),
        "Meat": lambda: loadMeat(-1),
        "Parkinsons": lambda: loadParkinsons(-1),
        "superconductor": lambda: loadSuperconductor(-1),
        "onlineNewsPopularity": lambda: loadOnlineNewsPopularity(-1),
        "Twitter": lambda: loadTwitter(50000),
        "Blog": lambda: loadBlog(10000),
        "ghg": lambda: load_ghg(-1),
        "CTslice": lambda: loadCTslice(-1),
        "malware": lambda: load_malware(10000),
    }

    loader = loaders.get(datasetName)
    if loader is None:
        # Previously an unknown name surfaced as an UnboundLocalError on
        # the final return statement.
        raise ValueError(
            "Unknown dataset name {!r}; expected one of: {}".format(
                datasetName, ", ".join(sorted(loaders))))
    return loader()

def loadsynthetic(Np=-1):
    """Generate a reproducible noisy 1-D linear regression dataset.

    The inputs are evenly spaced on [-1, 1]; the line's slope and intercept
    are drawn uniformly from [-1, 1] and Gaussian noise (scale 0.1) is added
    to the targets.  The random seed is fixed, so repeated calls with the
    same ``Np`` return identical data.

    Parameters
    ----------
    Np : int, optional
        Number of points to generate.  Any non-positive value (the default
        is -1) produces 200 points.  The parameter is optional so the
        function can be called both with and without a row-count argument.

    Returns
    -------
    numpy.ndarray
        Array of shape (N, 3) whose columns are x, a constant 1, and y.
    """
    np.random.seed(0) # this random seed determines the dataset
    N = Np if Np > 0 else 200 # number of points
    x = np.linspace(-1, 1, N)
    model = np.random.uniform(-1, 1, 2)
    y = model[0]*x + model[1] + np.random.normal(scale=0.1, size=N)
    print("y = "+str(model[0])+"x + "+str(model[1]))
    ones = np.ones_like(x)
    dataset = np.vstack((x, ones, y))
    print (dataset.shape)
    dataset = np.transpose(dataset)
    print (x.shape)
    print (y.shape)
    return dataset

def loadBoston():
    """Load the Boston dataset from scikit-learn, normalized and shuffled.

    Features and target are centered and then divided by their largest
    absolute value, so every value lies in [-1, 1].  A column of ones is
    inserted between the features and the target, and the rows are
    shuffled with a fixed seed.

    NOTE: ``sklearn.datasets.load_boston`` was removed in scikit-learn 1.2,
    so this loader needs an older scikit-learn release.

    Returns
    -------
    numpy.ndarray
        Array with columns ``[features..., 1, target]``.
    """
    np.random.seed(0) # this random seed determines the dataset
    from sklearn.datasets import load_boston
    bunch = load_boston()
    x = bunch['data']
    y = bunch['target']

    # centering
    x -= np.average(x, axis=0)
    y -= np.average(y, axis=0)

    # putting in a ball of 1
    sx = np.max(abs(x), axis=0)
    sx[sx == 0] = 1  # a constant column is already all zero; avoid 0/0 -> NaN
    sy = np.max(abs(y))
    if sy == 0:
        sy = 1  # constant target: keep zeros instead of NaN
    x /= sx
    y /= sy

    ones = np.ones(x.shape[0])
    dataset = np.c_[x, ones, y]
    np.random.shuffle(dataset)
    return dataset

def loadDiabetes():
    """Load the scikit-learn diabetes dataset, normalized and shuffled.

    Features and target are centered and then divided by their largest
    absolute value, so every value lies in [-1, 1].  The rows are shuffled
    with a fixed seed.

    Returns
    -------
    numpy.ndarray
        Array with columns ``[features..., target]``.
    """
    np.random.seed(0) # this random seed determines the dataset
    from sklearn.datasets import load_diabetes
    bunch = load_diabetes()
    x = bunch['data']
    y = bunch['target']

    # centering
    x -= np.average(x, axis=0)
    y -= np.average(y, axis=0)

    # putting in a ball of 1
    sx = np.max(abs(x), axis=0)
    sx[sx == 0] = 1  # a constant column is already all zero; avoid 0/0 -> NaN
    sy = np.max(abs(y))
    if sy == 0:
        sy = 1  # constant target: keep zeros instead of NaN
    x /= sx
    y /= sy

    dataset = np.c_[x, y]
    np.random.shuffle(dataset)
    return dataset

def loadCalifornia():
    """Load the California housing dataset, normalized and shuffled.

    The data comes from scikit-learn's ``fetch_california_housing``.
    Features and target are centered and then divided by their largest
    absolute value, so every value lies in [-1, 1].  The rows are shuffled
    with a fixed seed.

    Returns
    -------
    numpy.ndarray
        Array with columns ``[features..., target]``.
    """
    np.random.seed(0) # this random seed determines the dataset
    from sklearn.datasets import fetch_california_housing
    bunch = fetch_california_housing()
    x = bunch['data']
    y = bunch['target']

    # centering
    x -= np.average(x, axis=0)
    y -= np.average(y, axis=0)

    # putting in a ball of 1
    sx = np.max(abs(x), axis=0)
    sx[sx == 0] = 1  # a constant column is already all zero; avoid 0/0 -> NaN
    sy = np.max(abs(y))
    if sy == 0:
        sy = 1  # constant target: keep zeros instead of NaN
    x /= sx
    y /= sy

    dataset = np.c_[x, y]
    np.random.shuffle(dataset)
    return dataset

def loadTwitter(Np):
    """Read the Twitter buzz data file, shuffle the rows and center features.

    Parameters
    ----------
    Np : int
        Maximum number of rows to read from the file; -1 reads all rows.

    Returns
    -------
    numpy.ndarray
        Shuffled rows with mean-centered feature columns followed by the
        untouched last column of the file (the target).
    """
    np.random.seed(0)  # fixed seed: the shuffle is reproducible
    row_limit = None if Np == -1 else Np
    raw = np.loadtxt(
        "../data/Buzzinsocialmedia/Twitter/Twitter.data",
        delimiter=',',
        max_rows=row_limit,
    )
    np.random.shuffle(raw)

    features, target = raw[:, :-1], raw[:, -1]
    # Only the features are centered; the target keeps its original scale.
    features -= features.mean(axis=0)
    return np.column_stack((features, target))

def loadBlog(Np):
    """Read the BlogFeedback training CSV, shuffle rows and center features.

    Parameters
    ----------
    Np : int
        Maximum number of rows to read from the file; -1 reads all rows.

    Returns
    -------
    numpy.ndarray
        Shuffled rows with mean-centered feature columns followed by the
        untouched last column of the file (the target).
    """
    np.random.seed(0)  # fixed seed: the shuffle is reproducible
    row_limit = None if Np == -1 else Np
    raw = np.loadtxt(
        "../data/BlogFeedback/blogData_train.csv",
        delimiter=',',
        max_rows=row_limit,
    )
    np.random.shuffle(raw)

    features, target = raw[:, :-1], raw[:, -1]
    # Only the features are centered; the target keeps its original scale.
    features -= features.mean(axis=0)
    return np.column_stack((features, target))

def loadParkinsons(Np):
    """Load ``parkinsons.csv``, shuffle it and normalize features and target.

    Column 5 of the file is used as the target; all other columns are
    features.  Both are centered and divided by their largest absolute
    value, so every value lies in [-1, 1].

    Parameters
    ----------
    Np : int
        Maximum number of rows to read.  Any non-positive value (e.g. -1)
        reads the whole file.  Previously every value other than -1 failed
        because no data was loaded at all.

    Returns
    -------
    numpy.ndarray
        Array with columns ``[features..., target]``.
    """
    np.random.seed(0) # this random seed determines the dataset
    max_rows = Np if Np > 0 else None
    dataset = np.loadtxt("parkinsons.csv", delimiter=',', max_rows=max_rows)
    np.random.shuffle(dataset)

    # Column 5 is the target; the features are everything around it.
    x = np.hstack((dataset[:, :5], dataset[:, 6:]))
    y = dataset[:, 5]

    # centering
    x -= np.average(x, axis=0)
    y -= np.average(y, axis=0)

    # putting in a ball of 1
    sx = np.max(abs(x), axis=0)
    sx[sx == 0] = 1  # a constant column is already all zero; avoid 0/0 -> NaN
    sy = np.max(abs(y))
    if sy == 0:
        sy = 1  # constant target: keep zeros instead of NaN
    x /= sx
    y /= sy

    return np.column_stack((x, y))

def loadIndoor(Np):
    """Load ``trainingData.csv``, shuffle it and normalize features/target.

    The first line of the file is treated as a header and skipped.  The
    features are all columns except the last nine; the target is the
    eighth column from the end.  Both are centered and divided by their
    largest absolute value, so every value lies in [-1, 1].

    Parameters
    ----------
    Np : int
        Maximum number of data rows to read.  Any non-positive value
        (e.g. -1) reads the whole file.

    Returns
    -------
    numpy.ndarray
        Array with columns ``[features..., target]``.
    """
    np.random.seed(0) # this random seed determines the dataset
    max_rows = Np if Np > 0 else None
    # skip_header drops the column-name line up front, so a row cap counts
    # data rows only (the old capped path tried to parse the header as
    # numbers and failed).
    dataset = np.genfromtxt("trainingData.csv", delimiter=',',
                            skip_header=1, max_rows=max_rows)
    np.random.shuffle(dataset)

    x = dataset[:, :-9]
    y = dataset[:, -8]

    # centering
    x -= np.average(x, axis=0)
    y -= np.average(y, axis=0)

    # putting in a ball of 1
    sx = np.max(abs(x), axis=0)
    sx[sx == 0] = 1  # a constant column is already all zero; avoid 0/0 -> NaN
    sy = np.max(abs(y))
    if sy == 0:
        sy = 1  # constant target: keep zeros instead of NaN
    x /= sx
    y /= sy

    return np.column_stack((x, y))

def loadTomBlog(Np):
    """Load ``TomsHardware.txt``, shuffle it and normalize features/target.

    The last column is the target and all other columns are features.
    Both are centered and divided by their largest absolute value, so
    every value lies in [-1, 1].

    Parameters
    ----------
    Np : int
        Maximum number of rows to read.  Any non-positive value (e.g. -1)
        reads the whole file.  Previously this argument was ignored.

    Returns
    -------
    numpy.ndarray
        Array with columns ``[features..., target]``.
    """
    np.random.seed(0) # this random seed determines the dataset
    max_rows = Np if Np > 0 else None
    dataset = np.genfromtxt("TomsHardware.txt", delimiter=",", dtype=float,
                            max_rows=max_rows)
    np.random.shuffle(dataset)

    x = dataset[:, :-1]
    y = dataset[:, -1]

    # centering
    x -= np.average(x, axis=0)
    y -= np.average(y, axis=0)

    # putting in a ball of 1
    sx = np.max(abs(x), axis=0)
    sx[sx == 0] = 1  # a constant column is already all zero; avoid 0/0 -> NaN
    sy = np.max(abs(y))
    if sy == 0:
        sy = 1  # constant target: keep zeros instead of NaN
    x /= sx
    y /= sy

    return np.column_stack((x, y))

def loadSuperconductor(Np):
    """Load ``train.csv``, shuffle it and normalize features/target.

    The first line of the file is skipped as a header.  The last column is
    the target and all other columns are features.  Both are centered and
    divided by their largest absolute value, so every value lies in
    [-1, 1].

    Parameters
    ----------
    Np : int
        Maximum number of data rows to read.  Any non-positive value
        (e.g. -1) reads the whole file.  Previously this argument was
        ignored.

    Returns
    -------
    numpy.ndarray
        Array with columns ``[features..., target]``.
    """
    np.random.seed(0) # this random seed determines the dataset
    max_rows = Np if Np > 0 else None
    dataset = np.loadtxt("train.csv", delimiter=",", skiprows=1,
                         max_rows=max_rows)
    np.random.shuffle(dataset)

    x = dataset[:, :-1]
    y = dataset[:, -1]

    # centering
    x -= np.average(x, axis=0)
    y -= np.average(y, axis=0)

    # putting in a ball of 1
    sx = np.max(abs(x), axis=0)
    sx[sx == 0] = 1  # a constant column is already all zero; avoid 0/0 -> NaN
    sy = np.max(abs(y))
    if sy == 0:
        sy = 1  # constant target: keep zeros instead of NaN
    x /= sx
    y /= sy

    return np.column_stack((x, y))

def loadOnlineNewsPopularity(Np):
    """Load two columns of ``OnlineNewsPopularity.csv`` and normalize them.

    Only file columns 2 and 60 are read (the header line is skipped):
    column 2 is the single feature and column 60 is the target.  Both are
    centered and divided by their largest absolute value, so every value
    lies in [-1, 1].  Rows are shuffled with a fixed seed.

    Parameters
    ----------
    Np : int
        Maximum number of data rows to read.  Any non-positive value
        (e.g. -1) reads the whole file.  Previously this argument was
        ignored.

    Returns
    -------
    numpy.ndarray
        Array of shape (rows, 2) with columns ``[feature, target]``.
    """
    np.random.seed(0) # this random seed determines the dataset
    max_rows = Np if Np > 0 else None
    dataset = np.loadtxt("OnlineNewsPopularity.csv", delimiter=",",
                         skiprows=1, usecols=(2, 60), max_rows=max_rows)
    np.random.shuffle(dataset)

    x = dataset[:, :-1]
    y = dataset[:, -1]

    # centering
    x -= np.average(x, axis=0)
    y -= np.average(y, axis=0)

    # putting in a ball of 1
    sx = np.max(abs(x), axis=0)
    sx[sx == 0] = 1  # a constant column is already all zero; avoid 0/0 -> NaN
    sy = np.max(abs(y))
    if sy == 0:
        sy = 1  # constant target: keep zeros instead of NaN
    x /= sx
    y /= sy

    return np.column_stack((x, y))

def loadSoil(Np):
    """Load ``soil.csv``, shuffle it and normalize features/target.

    The first line of the file is skipped as a header.  The first column
    is the target and the remaining columns are features.  Both are
    centered and divided by their largest absolute value, so every value
    lies in [-1, 1].

    Parameters
    ----------
    Np : int
        Maximum number of data rows to read.  Any non-positive value
        (e.g. -1) reads the whole file.  Previously this argument was
        ignored.

    Returns
    -------
    numpy.ndarray
        Array with columns ``[features..., target]`` (the target is moved
        to the last column).
    """
    np.random.seed(0) # this random seed determines the dataset
    max_rows = Np if Np > 0 else None
    dataset = np.loadtxt('soil.csv', delimiter=',', skiprows=1,
                         max_rows=max_rows)
    np.random.shuffle(dataset)

    x = dataset[:, 1:]
    y = dataset[:, 0]

    # centering
    x -= np.average(x, axis=0)
    y -= np.average(y, axis=0)

    # putting in a ball of 1
    sx = np.max(abs(x), axis=0)
    sx[sx == 0] = 1  # a constant column is already all zero; avoid 0/0 -> NaN
    sy = np.max(abs(y))
    if sy == 0:
        sy = 1  # constant target: keep zeros instead of NaN
    x /= sx
    y /= sy

    return np.column_stack((x, y))

def loadFruit(Np):
    """Load ``fruit.csv``, transpose it and normalize features/target.

    The header line is skipped and the table is transposed, so each file
    column becomes one sample; the sample coming from the first file
    column is dropped.  After transposition the first column is the
    target and the remaining columns are features.  Both are centered and
    divided by their largest absolute value, so every value lies in
    [-1, 1].  Rows are shuffled with a fixed seed.

    Parameters
    ----------
    Np : int
        Maximum number of samples to keep, taken in file order before
        shuffling.  Any non-positive value (e.g. -1) keeps all samples.
        Previously this argument was ignored.

    Returns
    -------
    numpy.ndarray
        Array with columns ``[features..., target]``.
    """
    np.random.seed(0) # this random seed determines the dataset
    dataset = np.loadtxt('fruit.csv', delimiter=',', skiprows=1)
    # Samples are stored column-wise in the file; the first one is dropped.
    dataset = dataset.T[1:, :]
    if Np > 0:
        dataset = dataset[:Np]
    np.random.shuffle(dataset)

    x = dataset[:, 1:]
    y = dataset[:, 0]

    # centering
    x -= np.average(x, axis=0)
    y -= np.average(y, axis=0)

    # putting in a ball of 1
    sx = np.max(abs(x), axis=0)
    sx[sx == 0] = 1  # a constant column is already all zero; avoid 0/0 -> NaN
    sy = np.max(abs(y))
    if sy == 0:
        sy = 1  # constant target: keep zeros instead of NaN
    x /= sx
    y /= sy

    return np.column_stack((x, y))

def loadMeat(Np):
    """Load ``Tecator.csv``, shuffle it and normalize features/target.

    The first line of the file is skipped as a header.  The third column
    from the end is the target; the features are all columns before it
    (the last two columns are unused).  Both are centered and divided by
    their largest absolute value, so every value lies in [-1, 1].

    Parameters
    ----------
    Np : int
        Maximum number of data rows to read.  Any non-positive value
        (e.g. -1) reads the whole file.  Previously this argument was
        ignored.

    Returns
    -------
    numpy.ndarray
        Array with columns ``[features..., target]``.
    """
    np.random.seed(0) # this random seed determines the dataset
    max_rows = Np if Np > 0 else None
    dataset = np.loadtxt('Tecator.csv', delimiter=',', skiprows=1,
                         max_rows=max_rows)
    np.random.shuffle(dataset)

    # Per the original notes: column -1 = protein, -2 = fat, -3 = moisture.
    x = dataset[:, 0:-3]
    y = dataset[:, -3]

    # centering
    x -= np.average(x, axis=0)
    y -= np.average(y, axis=0)

    # putting in a ball of 1
    sx = np.max(abs(x), axis=0)
    sx[sx == 0] = 1  # a constant column is already all zero; avoid 0/0 -> NaN
    sy = np.max(abs(y))
    if sy == 0:
        sy = 1  # constant target: keep zeros instead of NaN
    x /= sx
    y /= sy

    return np.column_stack((x, y))

def load_ghg(Np):
    """Load every file in ``../data/ghg_data/`` as one sample row.

    All whitespace-separated numbers of a file are flattened, in reading
    order, into a single row.  Rows are shuffled with a fixed seed and
    optionally truncated; the feature columns are then mean-centered.  The
    last value of each row is the target and is left unchanged.

    Parameters
    ----------
    Np : int
        Number of rows to keep after shuffling; non-positive keeps all.

    Returns
    -------
    numpy.ndarray
        Array with columns ``[centered features..., target]``.

    Raises
    ------
    ValueError
        If the data directory contains no files.
    """
    np.random.seed(0) # this random seed determines the dataset
    directory = '../data/ghg_data/'

    rows = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # shuffled result depend only on the seed, not on the file system.
    for filename in sorted(os.listdir(directory)):
        with open(os.path.join(directory, filename)) as f:
            # split() without arguments tolerates repeated blanks and
            # trailing newlines; the builtin float replaces the np.float
            # alias that newer NumPy releases no longer provide.
            rows.append(np.array(f.read().split(), dtype=float))
    if not rows:
        raise ValueError("no data files found in " + directory)

    dataset = np.array(rows)
    print (len(dataset), len(dataset[0]))
    np.random.shuffle(dataset)
    if Np > 0:
        dataset = dataset[0:Np]

    x = dataset[:, :-1]
    y = dataset[:, -1]
    # centering
    x -= np.average(x, axis=0)
    return np.column_stack((x, y))

def loadCTslice(Np):
    """Read the CT slice-localization CSV, shuffle rows and center features.

    The first line of the file is skipped as a header.

    Parameters
    ----------
    Np : int
        Maximum number of data rows to read; -1 reads all rows.

    Returns
    -------
    numpy.ndarray
        Shuffled rows with mean-centered feature columns followed by the
        untouched last column of the file (the target).
    """
    np.random.seed(0)  # fixed seed: the shuffle is reproducible
    row_limit = None if Np == -1 else Np
    raw = np.loadtxt(
        "../data/sliceLocalization/slice_localization_data.csv",
        delimiter=',',
        skiprows=1,
        max_rows=row_limit,
    )
    np.random.shuffle(raw)

    features, target = raw[:, :-1], raw[:, -1]
    # Only the features are centered; the target keeps its original scale.
    features -= features.mean(axis=0)
    return np.column_stack((features, target))

def load_malware(Np):
    """Load all svmlight files in ``../data/malwareVirusShare/``.

    Every file is parsed with ``load_svmlight_file`` using 482 features.
    The samples of all files are stacked, the feature columns are
    mean-centered, the labels are appended as the last column and the rows
    are shuffled with a fixed seed.

    Parameters
    ----------
    Np : int
        Number of rows to keep after shuffling; a non-positive value keeps
        all rows.  Previously a non-positive value raised
        UnboundLocalError.

    Returns
    -------
    numpy.ndarray
        Dense array with columns ``[centered features..., label]``.

    Raises
    ------
    ValueError
        If the data directory contains no files.
    """
    np.random.seed(0) # this random seed determines the dataset
    directory = '../data/malwareVirusShare/'

    feature_parts = []
    label_parts = []
    # Sorted so the stacked order (and hence the shuffled result) does not
    # depend on the arbitrary order os.listdir returns.
    for filename in sorted(os.listdir(directory)):
        # n_features is passed by keyword: current scikit-learn releases
        # accept it as a keyword-only argument.
        features, labels = load_svmlight_file(
            os.path.join(directory, filename), n_features=482)
        # toarray() yields a plain ndarray (todense() gives np.matrix).
        feature_parts.append(features.toarray())
        label_parts.append(labels)
    if not feature_parts:
        raise ValueError("no data files found in " + directory)

    # Concatenate once instead of re-copying the stack for every file.
    x = np.concatenate(feature_parts, axis=0)
    y = np.concatenate(label_parts, axis=0)

    # centering
    x -= np.average(x, axis=0)
    dataset = np.column_stack((x, y))
    np.random.shuffle(dataset)
    if Np > 0:
        # Copy so the returned array does not keep the full set alive.
        dataset = np.array(dataset[:Np, :])
    return dataset
