print("Importing dependencies..")

import scipy
import numpy as np
from scipy.linalg import lstsq
from scipy.linalg import norm 
from sklearn import preprocessing
import pandas as pd
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import sklearn.metrics as metrics
from sklearn.base import clone
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import PolynomialFeatures
from util import RegressionGame
from util_sparse import getShapleyResiduals, getShapleyProjection
import xgboost
from sklearn.model_selection import train_test_split
import matplotlib.pylab as pl
import cProfile
import re
import sklearn
import time
import pickle
import os

sklearn.set_config(assume_finite=True)

print("  ..ok!")
print("Setting up parameters..")

# The script reads data/adult.test and writes its CSV results under data/.
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs('data', exist_ok=True)


def read_int_env(name):
    """Return environment variable *name* parsed as an int, or None.

    None is returned when the variable is unset (int(None) raises TypeError)
    or when its value is not a valid integer literal (ValueError). Any other
    exception is deliberately left to propagate.
    """
    try:
        return int(os.getenv(name))
    except (TypeError, ValueError):
        return None


# N: how many test rows get an explanation.
N = read_int_env("EXPLANATION_COUNT")
if N is None:
    print("Using default explanation count of 10 (run with EXPLANATION_COUNT=1000 for results from the paper)")
    N = 10
else:
    print("Explanation count: %s" % N)

# k: how many test rows are handed to RegressionGame as its sample set.
k = read_int_env("SHAP_SAMPLES")
if k is None:
    # The message states the fallback that is actually applied (10).
    print("Using default SHAP sample count of 10 (run with SHAP_SAMPLES=25 for results from the paper)")
    k = 10
else:
    print("SHAP sample count: %s" % k)

# Settings named by the messages above for the paper's results: N = 1000, k = 25.

print("  ..ok!")
print("Preprocessing data..")

# Setup
np.random.seed(1)  # seed NumPy's global RNG
adult_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
               'marital_status', 'occupation', 'relationship', 'race', 'sex',
               'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']
# A multi-character separator is interpreted by pandas as a regular
# expression, which only the python parser engine supports. Requesting that
# engine explicitly avoids the ParserWarning emitted on the implicit fallback.
dat = pd.read_csv("data/adult.test", names=adult_names, index_col=False,
                  skiprows=1, sep=', ', engine='python')

# subset columns
# NOTE(review): dropna() only removes rows pandas parsed as NaN; if the file
# marks missing values with a literal '?', those rows are kept -- confirm.
dat = dat.dropna()
cols = ['age', 'workclass', 'education', 'education-num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']
categorical_feats = ['workclass', 'education', 'marital_status', 'occupation',
                     'relationship', 'race', 'sex', 'native_country']
# Every selected column except the trailing 'income' label is a model feature.
feats = cols[:-1]
dat = dat[cols]

# data preprocessing functions

# One OneHotEncoder per categorical column, each fitted on that column of the
# full dataset (fit() returns the fitted encoder itself).
onehot_encoders = {
    name: preprocessing.OneHotEncoder().fit(dat[[name]])
    for name in categorical_feats
}

def preprocess(X):
    """Expand a raw feature matrix into the matrix fed to the model.

    Column ``i`` of ``X`` is taken to be the feature ``feats[i]``. Columns
    listed in ``categorical_feats`` are replaced by the dense output of the
    matching fitted encoder in ``onehot_encoders``; every other column is
    passed through unchanged.

    Parameters
    ----------
    X : 2-D array with one column per entry of ``feats``.

    Returns
    -------
    2-D array with ``X.shape[0]`` rows and the expanded columns laid out in
    ``feats`` order.
    """
    n_rows = X.shape[0]
    # Start from a zero-width float block so the result has the same dtype
    # promotion (and the same shape when there are no features) as growing
    # an empty float matrix column by column.
    pieces = [np.empty((n_rows, 0))]
    for i, feat in enumerate(feats):
        column = X[:, i].reshape((n_rows, 1))
        if feat in categorical_feats:
            # The encoder's transform output is densified with .toarray().
            column = onehot_encoders[feat].transform(column).toarray()
        pieces.append(column)
    # One concatenation instead of an np.append per feature, which would
    # re-copy the whole accumulated matrix on every iteration.
    return np.concatenate(pieces, axis=1)

# Feature matrix: every column except the 'income' label; y is the label as a
# 1-D vector with one entry per row of X.
feature_mask = dat.columns != 'income'
X = dat.loc[:, feature_mask].to_numpy()
y = dat[['income']].to_numpy().reshape((X.shape[0],))

# Hold out 20% of the rows, with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

print("  ..ok!")
print("Preprocessing data..")
pp_X_train = preprocess(X_train)
print("  ..ok!")
print("Training model..")
clf = RandomForestClassifier(n_estimators=10)
clf.fit(pp_X_train, y_train)

# Report ROC AUC on the held-out rows, scoring with the probability column
# at index 1 of predict_proba.
test_scores = clf.predict_proba(preprocess(X_test))[:, 1]
print(sklearn.metrics.roc_auc_score(y_true=y_test, y_score=test_scores))

def predict_fn(x):
    """Return column 1 of the classifier's predict_proba output for x."""
    return clf.predict_proba(x)[:, 1]


# The first k test rows are given to RegressionGame; the N rows after them
# are the ones that get explained.
obj = RegressionGame(X=X_test[0:k, :], function=predict_fn, transform=preprocess)

X_samp = X_test[k:(N + k), :]

# Empty accumulators: one column per raw feature for the Shapley values and
# partial residuals, and 2 ** n_raw_feats columns for each stored game.
n_raw_feats = X.shape[1]
shapley_values = np.empty((0, n_raw_feats))
partial_residuals = np.empty((0, n_raw_feats))
games = np.empty((0, 2 ** n_raw_feats))

print("  ..ok!")
print("Generating explanations..")

# Collect one row per explained sample in plain lists and concatenate once
# after the loop. Growing the arrays with np.append on every iteration
# re-copies everything gathered so far (the game rows are 2 ** n_features wide).
game_rows = []
shapley_rows = []
residual_rows = []

for i in range(N):
    example_row = X_samp[i, :].reshape((1, X_samp.shape[1]))
    game = obj.getKernelSHAPGame(example_row)
    game_rows.append(game.reshape((1, game.shape[0])))
    results, residualGame, origGame = getShapleyProjection(game)
    # The last entry of `results` is stored as the Shapley values; np.flip
    # reverses its order -- presumably so the entries line up with `feats`
    # (confirm against util_sparse).
    shapley_rows.append(np.array([np.flip(results[-1])]))
    # Column-wise norm of the residual game relative to the original game.
    residual_ratio = norm(residualGame, axis=0) / norm(origGame, axis=0)
    residual_rows.append(np.array([np.flip(residual_ratio)]))
    print("%s/%s samples done." % (i + 1, N))

# Prepending the existing (empty) accumulators keeps their column count and
# dtype in the result even when no samples were processed.
games = np.concatenate([games] + game_rows, axis=0)
shapley_values = np.concatenate([shapley_values] + shapley_rows, axis=0)
partial_residuals = np.concatenate([partial_residuals] + residual_rows, axis=0)

pd.DataFrame(X_samp, columns=feats).to_csv('data/adult_input.csv')
pd.DataFrame(shapley_values, columns=feats).to_csv('data/adult_shapley_values.csv')
pd.DataFrame(partial_residuals, columns=feats).to_csv('data/adult_partial_residuals.csv')

# Announce success only after every file has actually been written.
print("  ..ok Explanations saved to data/adult_*.csv!")

