import scipy
import numpy as np
from scipy.linalg import lstsq
from scipy.linalg import norm 
from sklearn import preprocessing
import pandas as pd
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import sklearn.metrics as metrics
from sklearn.base import clone
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import PolynomialFeatures
from util import RegressionGame
from util_sparse import getShapleyResiduals, getShapleyProjection
import xgboost
from sklearn.model_selection import train_test_split
import matplotlib.pylab as pl
import cProfile
import re
import sklearn
import time
import pickle
sklearn.set_config(assume_finite=True)

# Make sure the output directory exists. exist_ok=True makes this a single
# idempotent call instead of a separate existence check followed by a create,
# which can fail if the directory appears between the two steps.
os.makedirs('data', exist_ok=True)

# N: upper bound on the number of test rows that get explained further down
#    (it sizes the X_samp slice and the main loop).
# k: number of leading test rows handed to RegressionGame as its X argument.
N = 1000
k = 50

# Setup
# Seed NumPy's global random generator so repeated runs are reproducible.
np.random.seed(1)
dat = pd.read_csv("data/occupancy.csv")
feats = ['light', 'hour']

# Feature matrix with one column per entry of `feats`, and the matching
# one-dimensional target vector taken from the 'occupancy' column.
X = dat[feats].to_numpy()
y = dat['occupancy'].to_numpy()



# Hold out 25% of the rows for evaluation; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)

# Fit a depth-3 decision tree on the training split.
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X_train, y_train)

# Draw the fitted tree on the current matplotlib axes.
# NOTE(review): nothing in the visible part of this script shows or saves the
# figure -- confirm whether a plt.show()/savefig() is expected elsewhere.
sklearn.tree.plot_tree(clf)

# Test-set ROC AUC, scored with column 1 of predict_proba.
print(sklearn.metrics.roc_auc_score(y_true=y_test, y_score=(clf.predict_proba(X_test))[:, 1]))


def predict_fn(x):
    """Return column 1 of ``clf.predict_proba(x)``, one value per row of ``x``."""
    return clf.predict_proba(x)[:, 1]


# Build the game object from the first k test rows and the model's
# probability function.
obj = RegressionGame(X=X_test[0:k, :], function=predict_fn)


# Rows to explain: the (at most N) test rows that follow the first k rows.
# Slicing silently truncates when X_test has fewer than N + k rows.
X_samp = X_test[k:(N + k), :]

n_feats = X.shape[1]

# Collect one row per explained example in plain lists and concatenate once,
# instead of calling np.append inside the loop (which recopies the whole
# accumulator on every iteration). Each list is seeded with an empty array of
# the right width so the concatenated result keeps the same shape and dtype
# promotion as before, including when X_samp has no rows at all.
shapley_rows = [np.empty((0, n_feats))]
residual_rows = [np.empty((0, n_feats))]
game_rows = [np.empty((0, 2 ** n_feats))]

# Iterate over the rows that actually exist in X_samp rather than a fixed N,
# so a short test set does not raise IndexError.
for i in range(X_samp.shape[0]):
    example_row = X_samp[i, :].reshape((1, X_samp.shape[1]))
    game = obj.getKernelSHAPGame(example_row)
    # Copy so the stored row is a snapshot taken before getShapleyProjection
    # runs, exactly as np.append used to provide.
    game_rows.append(game.reshape((1, game.shape[0])).copy())
    results, residualGame, origGame = getShapleyProjection(game)
    # The last entry of `results`, reversed, is stored as this row's values.
    shapley_rows.append(np.array([np.flip(results[-1])]))
    # Column-wise norm of the residual game relative to the original game,
    # reversed in the same way as the values above.
    residual_rows.append(
        np.array([np.flip(norm(residualGame, axis=0) / norm(origGame, axis=0))]))
    if i % 100 == 0:
        # Periodic checkpoint of all games computed so far. The final state
        # is only guaranteed to be in the CSV written after the loop.
        with open('data/occupancy_small_games', "wb") as fout:
            pickle.dump(np.concatenate(game_rows, axis=0), fout)

games = np.concatenate(game_rows, axis=0)
shapley_values = np.concatenate(shapley_rows, axis=0)
partial_residuals = np.concatenate(residual_rows, axis=0)

# Write every result table to CSV. Each entry is (target path, array, column
# labels); None keeps pandas' default integer column labels for the raw games.
_csv_exports = (
    ('data/occupancy_small_games.csv', games, None),
    ('data/occupancy_small_input.csv', X_samp, feats),
    ('data/occupancy_small_shapley_values.csv', shapley_values, feats),
    ('data/occupancy_small_partial_residuals.csv', partial_residuals, feats),
)
for _csv_path, _table, _labels in _csv_exports:
    pd.DataFrame(_table, columns=_labels).to_csv(_csv_path)

