print("Importing dependencies..")

import scipy
import numpy as np
from scipy.linalg import lstsq
from scipy.linalg import norm 
import pandas as pd
from sklearn import linear_model
import sklearn.metrics as metrics
from sklearn.base import clone
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import PolynomialFeatures
from util import RegressionGame, getShapleyProjection
import shap
import xgboost
from sklearn.model_selection import train_test_split
import pickle

print("  ..ok!")
print("Setting up parameters..")

# Every output file of this script is written below ./data.  exist_ok=True
# makes the call a no-op when the directory already exists, without a separate
# existence check that could race with another process creating it.
os.makedirs('data', exist_ok=True)

# Setup
# Seed NumPy's global generator so NumPy-based randomness is repeatable.
np.random.seed(1)

# N: number of test rows that get explained (environment: EXPLANATION_COUNT).
# int(None) raises TypeError when the variable is unset and int("abc") raises
# ValueError when it is malformed; only those two cases fall back to the
# default, so Ctrl-C and genuine programming errors are no longer swallowed.
try:
    N = int(os.getenv("EXPLANATION_COUNT"))
except (TypeError, ValueError):
    print("Using default explanation count of 10 (run with EXPLANATION_COUNT=1000 for results from the paper)")
    N = 10
else:
    print("Explanation count: %s" % N)

# k: number of leading test rows handed to the game object as its data sample
# (environment: SHAP_SAMPLES).  Same fallback rules as for N.
try:
    k = int(os.getenv("SHAP_SAMPLES"))
except (TypeError, ValueError):
    print("Using default SHAP sample count of 10 (run with SHAP_SAMPLES=100 for results from the paper)")
    k = 10
else:
    print("SHAP sample count: %s" % k)

print("  ..ok!")
print("Training model..")
    
# Data
# Load the feature table X and the label vector y from shap's bundled
# `nhanesi` dataset loader, then keep only the eight columns used here.
# NOTE(review): 'Unnamed: 0' looks like a row-index column left over from the
# CSV export rather than a real covariate -- confirm it is meant to be a feature.
X,y = shap.datasets.nhanesi()
X = X[['Unnamed: 0', 'Age', 'Diastolic BP', 'Sex', 'Systolic BP', 'Poverty index', 'White blood cells', 'BMI']]

# Model training
# xgb_full wraps the complete data set; nothing in the visible part of this
# script reads it again.
xgb_full = xgboost.DMatrix(X, label=y)
# 80/20 train/test split with a fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
xgb_train = xgboost.DMatrix(X_train, label=y_train)
xgb_test = xgboost.DMatrix(X_test, label=y_test)
# Booster settings: small learning rate (eta), shallow trees, Cox
# proportional-hazards objective, and each tree fitted on half of the rows.
params = {
    "eta": 0.002,
    "max_depth": 3,
    "objective": "survival:cox",
    "subsample": 0.5
}
# Train for 5000 boosting rounds, evaluating on the held-out split and
# printing the evaluation metric every 1000 rounds.
model_train = xgboost.train(params, xgb_train, 5000, evals = [(xgb_test, "test")], verbose_eval=1000)


# Evaluation
def c_statistic_harrell(pred, labels):
    """Return a Harrell-style concordance statistic for survival predictions.

    A pair ``(i, j)`` is *comparable* when ``labels[j] > 0`` and
    ``abs(labels[i]) > labels[j]``.  A comparable pair is *concordant* when
    ``pred[j] > pred[i]``; equal predictions count as not concordant.  The
    sign handling matches a convention in which a positive label is an
    observed event time and a negative label a censored time (presumably the
    one expected by the ``survival:cox`` objective used in this script).

    Args:
        pred: Indexable sequence of predicted risk scores, aligned
            positionally with ``labels``.
        labels: Iterable of signed times.  It is read positionally, so
            containers with a non-default index are handled by position
            rather than by index label.

    Returns:
        float: concordant pairs divided by comparable pairs, in ``[0, 1]``.

    Raises:
        ZeroDivisionError: If there is no comparable pair (empty input, or
            no positive label that some other entry outlasts).  This is the
            same exception type as before, now with an explanatory message.
    """
    times = list(labels)
    # Only positive labels can take the `j` role; select them once instead of
    # re-testing `labels[j] > 0` for every (i, j) combination.
    events = [(j, t_j) for j, t_j in enumerate(times) if t_j > 0]

    total = 0
    matches = 0
    for i, t_i in enumerate(times):
        horizon = abs(t_i)  # loop-invariant for the inner scan
        for j, t_j in events:
            if horizon > t_j:
                total += 1
                if pred[j] > pred[i]:
                    matches += 1

    if total == 0:
        raise ZeroDivisionError(
            "c_statistic_harrell: no comparable pairs in labels; "
            "the concordance statistic is undefined"
        )
    return matches / total

# Concordance of the trained model on the held-out split.  The booster is
# trained for exactly 5000 rounds, so predicting with all trees (the default)
# is what `ntree_limit=5000` asked for; omitting the keyword keeps the result
# and avoids an argument that newer XGBoost releases deprecate and then drop.
print(c_statistic_harrell(model_train.predict(xgb_test), y_test))


print("  ..ok!")
print("Generating explanations..")


# Setup kernel/conditional shap object
def predict(x):
    """Return the model's raw margin output for the rows of ``x``.

    ``x`` is array-like with one column per feature, in the order of
    ``X.columns``.  It is wrapped in a DataFrame carrying those column names
    before being handed to the booster.
    """
    frame = pd.DataFrame(data=x, columns=X.columns)
    return model_train.predict(xgboost.DMatrix(frame), output_margin=True)


# The first k test rows are the data sample given to the game object; the
# rows that get explained start at index k, so the two sets do not overlap.
obj = RegressionGame(X=X_test.values[0:k, :], function=predict)

# Persist the rows that are about to be explained, in explanation order.
pd.DataFrame(X_test.values[k:(N+k),:], columns = X.columns).to_csv('data/nhanes_margin_input.csv')

# Calculate shapley values and residuals
n_features = X.shape[1]
# range(0, N) is empty for N <= 0, so clamp the row count the same way.
n_rows = max(N, 0)

# Pre-allocate one row per explained sample.  Growing the arrays with
# np.append inside the loop copies everything accumulated so far on every
# iteration; filling rows in place produces the same arrays without that.
shapley_values = np.empty((n_rows, n_features))
partial_residuals = np.empty((n_rows, n_features))
games = np.empty((n_rows, 2 ** n_features))  # one value per feature subset

# Take the ndarray view once rather than on every iteration.
test_rows = X_test.values

for i in range(0, N):
    # Explained rows start after the k rows used as the game's data sample.
    example_row = test_rows[i + k, :].reshape((1, n_features))
    game = obj.getKernelSHAPGame(example_row)
    games[i, :] = game.reshape(game.shape[0])

    results, residualGame, origGame = getShapleyProjection(game)
    # The last entry of `results` is used as the Shapley values.  It and the
    # per-column norms are flipped before being labelled with X.columns --
    # presumably the helper orders features in reverse; TODO confirm in util.
    shapley_values[i, :] = np.flip(results[-1])
    # Per-feature ratio of the residual game's column norm to the original
    # game's column norm.
    partial_residuals[i, :] = np.flip(norm(residualGame, axis=0) / norm(origGame, axis=0))

    print("%s/%s samples done." % (i+1, N))

pd.DataFrame(shapley_values, columns = X.columns).to_csv('data/nhanes_margin_shapley_values.csv')
pd.DataFrame(partial_residuals, columns = X.columns).to_csv('data/nhanes_margin_partial_residuals.csv')

print("  ..ok! Explanations saved to data/nhanes_margin_*.csv")

