

from preprocessing.semantics import extractPreAttributionDataset, convertListOfSentences
from preprocessing.files import extractSentencesFromPDF


from statistics import median



#######  
######
# Pre-Attribution
####
###

import os.path
from joblib import load, dump
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

# Precomputed dataset splits that feed the pre-attribution classifier.
# Alternative dataset families are kept below for quick switching.
datasets = ["HAGRID_Clean_train", "HAGRID_Clean_dev"]
#datasets = ["HAGRID_train", "HAGRID_dev"]
#datasets = ["WGLM_train", "WGLM_test", "WGLM_validation"]

# Dataset family name: the first entry with its trailing "_<split>" suffix
# removed (e.g. "HAGRID_Clean_train" -> "HAGRID_Clean"). It names the cache file.
dsname = "_".join(datasets[0].split("_")[:-1])
dumpfile = "precomputed/%s_EMB.lib" % dsname

if not os.path.isfile(dumpfile):
    # No cached embeddings yet: encode every sentence once and cache the result.
    print("Calculating embeddings for preattribution with dataset:",dsname)
    # Imported lazily so the heavy dependency is only needed on a cache miss.
    from sentence_transformers import SentenceTransformer, models
    model_st = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device="cpu")
    X = []
    Y = []
    for d in datasets:
        ds = load("precomputed/%s.lib"%d)

        # Each query carries its sentence entries at index 1; for every entry,
        # element 0 is what gets encoded and element 2 is stored as its label.
        for query in ds:
            for sentence in query[1]:
                X.append( list(model_st.encode(sentence[0])) )
                Y.append( sentence[2] )

    dump( (X,Y), dumpfile)
else:
    print("Loading from dump file text features for preattribution with dataset:",dsname)
    X,Y = load(dumpfile)

# Initial 70/30 train/test split (by position) over the loaded samples.
tr_split = int(len(Y)*0.7)
X_tr = X[:tr_split]
Y_tr = Y[:tr_split]
X_te = X[tr_split:]
Y_te = Y[tr_split:]


# Number of features per sample (length of the first feature vector).
print(len(X[0]))



import random

def shuffle(A,B,random_state=42):
    """Shuffle the lists A and B in place, keeping them aligned.

    The global ``random`` generator is re-seeded with ``random_state``
    immediately before each list is shuffled, so both shuffles consume the
    same random sequence. When A and B have the same length this applies the
    same permutation to both, so element i of A still corresponds to
    element i of B afterwards.

    Args:
        A: First mutable sequence (e.g. feature vectors); modified in place.
        B: Second mutable sequence (e.g. labels); modified in place.
        random_state: Seed used for both shuffles.

    Returns:
        None. Note that the global ``random`` state is left re-seeded.
    """
    for sequence in (A, B):
        # Same seed before each shuffle -> same sequence of swaps.
        random.seed(random_state)
        random.shuffle(sequence)

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from time import time

# Classifiers available for evaluation. The trailing slice selects the subset
# that actually runs; with [3:] only "TabNet" is evaluated.
classification_methods = ["RF", "XGB", "MLP", "TabNet"][3:]
for classification_method in classification_methods:
    # Per-run metrics for this classifier (one entry per shuffle seed).
    acc_tr_30 = []
    acc_te_30 = []
    cm_tr_30 = []
    cm_te_30 = []
    for run in range(27,30):
        # Reshuffle features and labels in lockstep (seeded by the run number),
        # then re-split at the precomputed train/test boundary.
        shuffle(X, Y, run)
        X_tr = X[:tr_split]
        Y_tr = Y[:tr_split]
        X_te = X[tr_split:]
        Y_te = Y[tr_split:]

        # Collapse the raw labels to three class ids: the first element of each
        # label is looked up in "NSMU" and taken modulo 3, so "N" and "U" both
        # map to class 0, "S" to 1 and "M" to 2.
        cls_tr = ["NSMU".index(c[0]) % 3 for c in Y_tr]
        cls_te = ["NSMU".index(c[0]) % 3 for c in Y_te]

        # The XGB branch trains on the integer ids; every other model trains
        # on the string form "R<id>".
        if classification_method == "XGB":
            Y_tr_, Y_te_ = cls_tr, cls_te
        else:
            Y_tr_ = ["R%s" % k for k in cls_tr]
            Y_te_ = ["R%s" % k for k in cls_te]

        if classification_method == "RF":
            from sklearn.ensemble import RandomForestClassifier

            #model_preatt = RandomForestClassifier(max_depth=12, random_state=42, class_weight="balanced") #HAG CLEAN
            model_preatt = RandomForestClassifier(max_depth=16, random_state=42, class_weight="balanced")
            model_preatt.fit(X_tr, Y_tr_)

        elif classification_method == "XGB":
            import xgboost as xgb
            from sklearn.utils.class_weight import compute_sample_weight

            # Balance on the collapsed training targets (not the raw labels),
            # so the weights match the classes the model is actually fitted on.
            sample_weights = compute_sample_weight(class_weight="balanced", y=Y_tr_)
            #model_preatt = xgb.XGBClassifier(max_depth=6, random_state=42, n_estimators=50) #HAG CLEAN
            model_preatt = xgb.XGBClassifier(max_depth=8, random_state=42)

            model_preatt.fit(X_tr, Y_tr_, sample_weight=sample_weights)

        elif classification_method == "MLP":
            from sklearn.neural_network import MLPClassifier

            #model_preatt = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='lbfgs',max_iter=500,random_state=42) #HAG CLEAN
            model_preatt = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='lbfgs',max_iter=20000,random_state=42, learning_rate="adaptive", learning_rate_init=0.1)

            model_preatt.fit(X_tr, Y_tr_)

        elif classification_method == "TabNet":
            from pytorch_tabnet.tab_model import TabNetClassifier

            model_preatt = TabNetClassifier(n_d=8, n_a=8, n_steps=2,gamma=1.5,lambda_sparse=1e-7, mask_type='entmax', optimizer_params=dict(lr=2e-3), verbose= False)

            # Convert the list-of-lists features to 2-D arrays via a DataFrame
            # with one named column ("X0", "X1", ...) per feature.
            X_tr = pd.DataFrame(X_tr, columns = ["X%d" % x for x in range(len(X_tr[0]))]).values
            X_te = pd.DataFrame(X_te, columns = ["X%d" % x for x in range(len(X_te[0]))]).values

            #model_preatt.fit(X_tr, Y_tr_, max_epochs=200) #HAG CLEAN
            model_preatt.fit(X_tr, Y_tr_, max_epochs=5000)

        else:
            # Fail loudly instead of silently reusing a model from an earlier
            # iteration (or hitting a NameError further down).
            raise ValueError("Unknown classification method: %s" % classification_method)

        # Predict once per split and reuse the result for both metrics.
        pred_tr = model_preatt.predict(X_tr)
        pred_te = model_preatt.predict(X_te)

        acc_tr_30.append(accuracy_score(Y_tr_, pred_tr))
        acc_te_30.append(accuracy_score(Y_te_, pred_te))

        cm_tr_30.append( confusion_matrix(Y_tr_, pred_tr) )
        cm_te_30.append( confusion_matrix(Y_te_, pred_te) )


    print("CLASSIFICATION METHOD:")
    print("  Name: %s" % classification_method)
    print("  Training Accuracy on Pre-Attribution: ", acc_tr_30, "---", median(acc_tr_30))
    print("  Test     Accuracy on Pre-Attribution: ", acc_te_30, "---", median(acc_te_30))

    # Inference-time benchmark: 30 full passes over both splits, using the
    # model and data left over from the last run above.
    start = time()
    for i in range(30):
        model_preatt.predict(X_tr)
        model_preatt.predict(X_te)
    end = time()

    print("  Time taken to predict HAGRID CLEAN 30 times: %.2f" %(end-start))

    print( "CM_TR =", cm_tr_30 )
    print( "CM_TE =", cm_te_30 )






