#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
import os
import shutil
import pickle
import requests, zipfile, io
from ucimlrepo import fetch_ucirepo

# Load the simulation settings by executing the settings script in this
# module's namespace. The rest of this script uses `data_dir` and
# `datasets_to_use` without defining them, so they are presumably set there.
# NOTE: exec runs whatever code the file contains, so it must stay a trusted,
# project-local script.
_settings_path = "python/sim_settings.py"
# Read inside a context manager so the file handle is closed deterministically.
with open(_settings_path) as _settings_file:
    _settings_source = _settings_file.read()
# Compiling with the real filename makes tracebacks point at the settings file.
exec(compile(_settings_source, _settings_path, "exec"))

# Scratch directory into which downloaded archives are unpacked.
staging_dir = 'staging/'

# exist_ok=True creates each directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(staging_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)

# Build one standardized CSV per requested dataset: load the features X and
# the target y, z-score every column, and write the table to data_dir.
for ds in datasets_to_use:
    print(ds)
    if ds == 'kin40k':
        # The train and test parts are hosted as separate whitespace-delimited
        # files; they are downloaded and pooled into a single dataset.
        test_X_url = "https://github.com/trungngv/fgp/raw/master/data/kin40k/kin40k_test_data.asc"
        test_y_url = "https://github.com/trungngv/fgp/raw/master/data/kin40k/kin40k_test_labels.asc"
        train_X_url = "https://github.com/trungngv/fgp/raw/master/data/kin40k/kin40k_train_data.asc"
        train_y_url = "https://github.com/trungngv/fgp/raw/master/data/kin40k/kin40k_train_labels.asc"

        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        test_X = np.array(pd.read_csv(test_X_url, sep=r"\s+", header=None))
        test_y = np.array(pd.read_csv(test_y_url, sep=r"\s+", header=None)).flatten()
        train_X = np.array(pd.read_csv(train_X_url, sep=r"\s+", header=None))
        train_y = np.array(pd.read_csv(train_y_url, sep=r"\s+", header=None)).flatten()

        X = np.concatenate([train_X, test_X])
        y = np.concatenate([train_y, test_y])
    elif ds == 'supercond':
        # First 81 columns are the features, column 81 is the target.
        df = pd.read_csv("raw_data/superconduct.csv")
        X = np.array(df.iloc[:, :81])
        y = np.array(df.iloc[:, 81])
    elif ds == 'gas':
        # NOX is delivered among the features; it is pulled out as the target.
        uds = fetch_ucirepo(id=551)
        X = np.array(uds['data']['features'].drop('NOX', axis=1))
        y = np.array(uds['data']['features']['NOX'])
    elif ds == 'concrete':
        # First 8 columns are the features, the last column is the target.
        df = pd.read_excel('raw_data/Concrete_Data.xls')
        X = np.array(df.iloc[:, :8])
        y = np.array(df.iloc[:, -1]).flatten()
    elif ds == 'grid':
        uds = fetch_ucirepo(id=471)
        X = np.array(uds['data']['features'])
        # Only the first target column is used.
        y = np.array(uds['data']['targets'].iloc[:, 0])
    elif ds == 'keggu':
        zip_file_url = 'https://archive.ics.uci.edu/static/public/221/kegg+metabolic+reaction+network+undirected.zip'
        r = requests.get(zip_file_url)
        # Fail with a clear HTTP error instead of a confusing BadZipFile when
        # the download did not succeed.
        r.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            z.extractall(staging_dir)

        datstr = 'Reaction Network (Undirected).data'
        df = pd.read_csv(staging_dir+datstr, header=None)
        # Drop the first column, coerce the rest to numbers, and fill
        # missing/unparseable entries with zero in this benchmark.
        df = df.iloc[:, 1:].apply(lambda x: pd.to_numeric(x, errors='coerce'), axis=0).fillna(0)

        # Of the remaining columns, the first is the (log10) target and the
        # others are the features.
        X = np.array(df.iloc[:, 1:]).astype(float)
        # NOTE(review): a zero (or zero-filled) target value would become -inf
        # here - confirm the data contains none.
        y = np.log10(np.array(df.iloc[:, 0]).astype(float))
    elif ds == 'bike':
        uds = fetch_ucirepo(id=275)

        # The first feature column is skipped.
        X = np.array(uds['data']['features'].iloc[:, 1:])
        y = np.array(uds['data']['targets']).flatten()

    elif ds == 'obesity':
        uds = fetch_ucirepo(id=544)

        Xdf = uds['data']['features']
        # Categorical features are one-hot encoded, dropping the first level
        # of each; the remaining features are kept as they are.
        cat = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
        Xdfcat = pd.get_dummies(Xdf.loc[:, cat], drop_first=True).astype(float)
        X = np.array(pd.concat([Xdf.drop(cat, axis=1), Xdfcat], axis=1))
        # Integer code assigned to each weight-class label.
        od = {
                'Insufficient_Weight' : 0,
                'Normal_Weight' : 1,
                'Overweight_Level_I' : 2,
                'Overweight_Level_II' : 3,
                'Obesity_Type_I' : 4,
                'Obesity_Type_II' : 5,
                'Obesity_Type_III' : 6,
                }
        # Series.map (on the first target column, as in the 'grid' branch) is
        # available in every pandas version, unlike the newer DataFrame.map.
        # Indexing od directly keeps the KeyError on an unexpected label.
        y = np.array(uds['data']['targets'].iloc[:, 0].map(lambda x: od[x]))
    elif ds == 'seoul':
        uds = fetch_ucirepo(id=560)
        dat = uds['data']['features']
        # The first two feature columns are excluded from X; the target is
        # the square root of 'Rented Bike Count', taken from the same table.
        Xdf = dat.iloc[:, 2:]
        cat = ['Seasons', 'Holiday']
        Xdfcat = pd.get_dummies(Xdf.loc[:, cat], drop_first=True).astype(float)
        X = np.array(pd.concat([Xdf.drop(cat, axis=1), Xdfcat], axis=1))
        y = np.sqrt(dat['Rented Bike Count'])
    else:
        raise ValueError("Unknown Dataset!")

    # Assemble features X0..X{p-1} followed by the target column y.
    df = pd.DataFrame(X)
    df.columns = ['X'+str(i) for i in range(X.shape[1])]
    df['y'] = y
    # Cast up front so the standardized values never have to be written back
    # into integer or object columns.
    df = df.astype(float)

    # Z-score every column (features and target) with the population standard
    # deviation; eps keeps constant columns from dividing by zero.
    eps = 1e-6
    df = (df - df.mean()) / (df.std(ddof=0) + eps)

    outname = data_dir+ds+'.csv'
    df.to_csv(outname, index=False)

