# Import required modules etc
import sklearn
from sklearn import model_selection
from sklearn.model_selection import train_test_split
import pandas as pd
from scipy import special
import tensorflow_hub as hub
import os
import json
import gzip
from urllib.request import urlopen
import random
from sklearn.metrics import confusion_matrix
from datetime import datetime
import tensorflow as tf
import coral_ordinal as coral
import numpy as np

##########################
### SETTINGS
##########################

# Reproducibility: a single seed drives every random number generator used
# by this script. Alternative values left by the author: 231, 123.
seed_value = 321

# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it
# here is only inherited by processes launched from this one.
os.environ["PYTHONHASHSEED"] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Training hyperparameters.
batch_size = 32
num_epochs = 50
patient = 10  # patience handed to the EarlyStopping callback
validation_split = 0.2  # share of the training data handed to model.fit for validation
# min_delta = 0.001  # optional EarlyStopping threshold, currently disabled

# Architecture: number of ordinal classes (Amazon class number).
num_classes = 5

# Names of the two columns this script works with.
outcome_col = "overall"
text_col = "reviewText"

# The dump is gzipped JSON-lines: decode one JSON object per line.
with gzip.open('Prime_Pantry_5.json.gz') as f:
    data = [json.loads(raw_line.strip()) for raw_line in f]

# Keep only the rating and the review text.
df = pd.DataFrame(data)[[outcome_col, text_col]]

# There is a large amount of duplicate text in here (possibly paid/fraudulent
# reviews), so keep only the first occurrence of each review text.
df = df.drop_duplicates(text_col)

# Rows with missing values are removed; blank text otherwise causes an
# obscure error about floating point conversion later on.
df = df.dropna()

print(len(df))
print(df.head())

# Shift the outcomes so that the smallest observed rating becomes 0.
lowest_rating = df[outcome_col].min()
df[outcome_col] = df[outcome_col].values - lowest_rating

print("\n", df.overall.value_counts())


# Train/test split: hold out a fixed number of reviews (10000) for testing,
# seeded so the partition is the same on every run.
text_train, text_test, labels_train, labels_test = train_test_split(
    df[text_col].values,
    df[outcome_col].values,
    test_size=10000,
    random_state=seed_value,
)

# Report the size of each partition.
for caption, split_array in (
    ("Training text shape:", text_train),
    ("Training labels shape:", labels_train),
    ("Testing text shape:", text_test),
    ("Testing labels shape:", labels_test),
):
    print(caption, split_array.shape)

# Build and compile the ordinal-regression model:
#   string input -> frozen TF-Hub text encoder -> Dense(64, relu)
#   -> Dropout(0.1) -> CORAL ordinal output layer.

# Reset Keras' global state before building the model (the original note
# gives freeing GPU memory as the motivation).
tf.keras.backend.clear_session()

# Each example is a single scalar string (shape = [], dtype = tf.string).
input_text = tf.keras.layers.Input(shape = [], dtype = tf.string, name = 'input_text')

# Disabled alternative: fetch the encoder from TF-Hub instead of using the
# local copy loaded below.
#model_url = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

#base_model = hub.KerasLayer(model_url, input_shape = [],
#                            dtype = tf.string,
#                            trainable = False)

# Load the encoder from the local directory "5/" -- presumably a downloaded
# copy of universal-sentence-encoder-large/5, judging by the disabled URL
# above (TODO confirm). trainable = False keeps its weights frozen.
base_model = hub.KerasLayer("5/", input_shape = [],
                            dtype = tf.string,
                            trainable = False)

                            
# Encoder output for the input text.
embedded = base_model(input_text)

# Small trainable head on top of the frozen encoder output.
x = tf.keras.layers.Dense(64, activation = 'relu')(embedded)
x = tf.keras.layers.Dropout(0.1)(x)
# Ordinal output layer from the coral_ordinal package, configured with the
# number of classes.
output = coral.CoralOrdinal(num_classes)(x) 

model = tf.keras.Model(inputs = input_text, outputs = output)


model.summary()

# Adam with default settings; the loss and both metrics come from the
# coral_ordinal package.
model.compile(optimizer=tf.keras.optimizers.Adam(),
    loss=coral.OrdinalCrossEntropy(num_classes=num_classes),
    metrics = [coral.SparseOrdinalEarthMoversDistance(),
        coral.MeanAbsoluteErrorLabels()])


# Sanity check: run the encoder on one sample string and show the first ten
# values of its output. The result is printed explicitly because a bare
# expression is only echoed in a notebook/REPL; when this file runs as a
# script the value would otherwise be computed and silently discarded.
print(base_model(np.array(["test_string"])).numpy()[0, :10])

# Early stopping: halt once the monitored quantity (Keras' default) has not
# improved for `patient` epochs, then restore the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=patient,
    # min_delta=min_delta,
    restore_best_weights=True,
)

# Train on the training split; model.fit carves the validation data out of
# it according to `validation_split`.
history = model.fit(
    x=text_train,
    y=labels_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=validation_split,
    callbacks=[early_stopping],
)


# Save the trained model under ./model/ with a microsecond-resolution
# timestamp in the file name, so separate runs get distinct file names.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
model_filename = f"{timestamp}_model.h5"
model.save(os.path.join('./', 'model', model_filename))



# Evaluate the compiled loss and metrics on the held-out test split.
model.evaluate(text_test, labels_test)

# Raw model outputs for the test texts.
preds = model.predict(text_test)
print(preds)

# Turn the raw outputs into one probability column per class using the
# coral_ordinal helper.
class_probabilities = coral.ordinal_softmax(preds)
probs = pd.DataFrame(class_probabilities.numpy())

print(probs.head(10))
print(labels_test[:10])

# Label version 1: the class whose probability is largest.
labels_v1 = probs.idxmax(axis=1)
accuracy_v1 = np.mean(labels_v1 == labels_test)
print("Accuracy of label version 1:", accuracy_v1)

# Label version 2: pass each raw output through the logistic sigmoid and
# count how many exceed 0.5.
# (Disabled variant kept from the original: cumulative product of sigmoids.)
#cum_probs = pd.DataFrame(preds).apply(special.expit).cumprod(axis=1)
cum_probs = pd.DataFrame(special.expit(preds))
labels_v2 = (cum_probs > 0.5).sum(axis=1)
accuracy_v2 = np.mean(labels_v2 == labels_test)
print("Accuracy of label version 2:", accuracy_v2)

# Error metrics for both label-decoding variants.
# NOTE (original author): these do not correspond with what we get from the
# model evaluation. Something must be off in one of these.

# Signed label errors, computed once and reused by every metric below.
errors_v1 = labels_v1 - labels_test
errors_v2 = labels_v2 - labels_test

print("Mean absolute label error version 1:", np.mean(np.abs(errors_v1)))
print("Mean absolute label error version 2:", np.mean(np.abs(errors_v2)))

print("Root mean squared label error version 1:", np.sqrt(np.mean(np.square(errors_v1))))
print("Root mean squared label error version 2:", np.sqrt(np.mean(np.square(errors_v2))))

# Review how absolute error is calculated for ordinal labels.
# Fixes over the original table: the "pred_v2" column now really holds the
# version-2 labels (it was filled with version 1), the "abs" column holds the
# absolute error (it held the signed difference), and the table is printed
# (a bare expression shows nothing when run as a script).
print(pd.DataFrame({"true": labels_test,
                    "pred_v2": labels_v2,
                    "abs": np.abs(errors_v2)}).head())


# Share of predictions that are at most one class away from the true label.
print("Accuracy tolerance 1 of label version 1:", np.mean(np.abs(errors_v1) <= 1))

print("Accuracy tolerance 1 of label version 2:", np.mean(np.abs(errors_v2) <= 1))

def _collapse_to_binary(matrix):
    """Collapse a multi-class confusion matrix into a 2x2 matrix.

    The first row/column of ``matrix`` is kept as its own group; all
    remaining rows/columns are summed into a second group.
    """
    return np.array([
        [matrix[0, 0], matrix[0, 1:].sum()],
        [matrix[1:, 0].sum(), matrix[1:, 1:].sum()],
    ])


# Full multi-class confusion matrices (rows: true labels, columns: predicted)
# for each label-decoding variant.
conf_matrix1 = confusion_matrix(labels_test, labels_v1)
print(conf_matrix1)

conf_matrix2 = confusion_matrix(labels_test, labels_v2)
print(conf_matrix2)

# Binary view: first class versus everything else.
bin_conf_matrix1 = _collapse_to_binary(conf_matrix1)
print("Binary confusion matrix label version 1:\n",bin_conf_matrix1)

bin_conf_matrix2 = _collapse_to_binary(conf_matrix2)
print("Binary confusion matrix label version 2:\n",bin_conf_matrix2)

#pred_probs['true_label'] = labels_test
#pred_probs.to_csv('predictedProbs.csv')

