

from preprocessing.semantics import extractPreAttributionDataset, convertListOfSentences
from preprocessing.files import extractSentencesFromPDF






#######  
######
# Pre-Attribution
####
###

import os.path
from joblib import load, dump
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

# Dataset splits pooled for the pre-attribution experiment.
# Keep exactly one of the alternatives below active.
#datasets = ["HAGRID_Clean_train", "HAGRID_Clean_dev"]
datasets = ["HAGRID_train", "HAGRID_dev"]
#datasets = ["WGLM_train", "WGLM_test", "WGLM_validation"]

# Dataset family name: the first entry without its trailing "_<split>" part
# (e.g. "HAGRID_train" -> "HAGRID").
dsname = datasets[0].rpartition("_")[0]
# Cache file for the extracted (X, Y) text features of this dataset family.
dumpfile = "precomputed/%s_TF.lib" % dsname

# Build the pre-attribution feature matrix X and label list Y, reusing the
# cached dump when it is already on disk.
if os.path.isfile(dumpfile):
    print("Loading from dump file text features for preattribution with dataset:",dsname)
    X, Y = load(dumpfile)
else:
    print("Calculating text features for preattribution with dataset:",dsname)
    X, Y = [], []
    # Pool the samples of every configured split, in the order listed.
    for split_name in datasets:
        split_data = load("precomputed/%s.lib" % split_name)
        split_X, split_Y = extractPreAttributionDataset(split_data)
        X += split_X
        Y += split_Y
    # Persist the pooled features so later runs can skip the extraction.
    dump((X, Y), dumpfile)

# Positional 70/30 hold-out split (no shuffling): the first 70% of the pooled
# samples are used for training, the remainder for testing.
tr_split = int(0.7 * len(Y))
X_tr, X_te = X[:tr_split], X[tr_split:]
Y_tr, Y_te = Y[:tr_split], Y[tr_split:]




from time import perf_counter


def _encode_preatt_labels(labels, as_int=False):
    """Collapse raw pre-attribution labels into three classes.

    The first element of each label is looked up in "NSMU" and the resulting
    index is taken modulo 3, so "U" (index 3) shares class 0 with "N".

    Args:
        labels: iterable of labels whose first element is found in "NSMU".
        as_int: when True return integer class ids (0, 1, 2); otherwise
            return the string form "R0", "R1", "R2".

    Returns:
        A list with one encoded label per input label, in input order.

    Raises:
        ValueError: if the first element of a label is not found in "NSMU".
    """
    class_ids = ["NSMU".index(c[0]) % 3 for c in labels]
    if as_int:
        return class_ids
    return ["R%s" % k for k in class_ids]


# Available pre-attribution classifiers; the slice below selects which to run.
classification_methods = ["RF", "XGB", "MLP", "TabNet"]
for classification_method in classification_methods[2:3]:

    # Per-iteration feature views. Branches that need another container
    # (TabNet) rebind these instead of overwriting the shared X_tr / X_te, so
    # every method starts from the same data.
    X_tr_, X_te_ = X_tr, X_te

    # Backend imports stay inside the branches so optional packages are only
    # required when the corresponding method is actually selected.
    if classification_method == "RF":
        from sklearn.ensemble import RandomForestClassifier

        Y_tr_ = _encode_preatt_labels(Y_tr)
        Y_te_ = _encode_preatt_labels(Y_te)
        model_preatt = RandomForestClassifier(max_depth=6, random_state=42, class_weight="balanced")
        model_preatt.fit(X_tr_, Y_tr_)

    elif classification_method == "XGB":
        import xgboost as xgb
        from sklearn.utils.class_weight import compute_sample_weight

        # The XGB branch is trained on integer class ids.
        Y_tr_ = _encode_preatt_labels(Y_tr, as_int=True)
        Y_te_ = _encode_preatt_labels(Y_te, as_int=True)

        # Balance on the collapsed training targets (not the raw labels) so
        # the weights match the classes the model is fitted on.
        sample_weights = compute_sample_weight(class_weight="balanced", y=Y_tr_)
        model_preatt = xgb.XGBClassifier(max_depth=4, random_state=42, n_estimators=50)

        model_preatt.fit(X_tr_, Y_tr_, sample_weight=sample_weights)

    elif classification_method == "MLP":
        from sklearn.neural_network import MLPClassifier

        Y_tr_ = _encode_preatt_labels(Y_tr)
        Y_te_ = _encode_preatt_labels(Y_te)

        model_preatt = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='lbfgs', max_iter=500, random_state=42)

        model_preatt.fit(X_tr_, Y_tr_)

    elif classification_method == "TabNet":
        from pytorch_tabnet.tab_model import TabNetClassifier

        Y_tr_ = _encode_preatt_labels(Y_tr)
        Y_te_ = _encode_preatt_labels(Y_te)

        model_preatt = TabNetClassifier(n_d=8, n_a=8, n_steps=2, gamma=1.5, lambda_sparse=1e-5, mask_type='entmax', optimizer_params=dict(lr=2e-2), verbose=False)

        # Convert the features to 2-D arrays for TabNet. Column names are
        # X0..X{n-1}, shared by both splits.
        feature_names = ["X%d" % i for i in range(len(X_tr[0]))]
        X_tr_ = pd.DataFrame(X_tr, columns=feature_names).values
        X_te_ = pd.DataFrame(X_te, columns=feature_names).values

        model_preatt.fit(X_tr_, Y_tr_, max_epochs=200)

    else:
        # Fail loudly instead of scoring a stale or undefined model.
        raise ValueError("Unknown classification method: %s" % classification_method)

    tr_acc = accuracy_score(Y_tr_, model_preatt.predict(X_tr_))
    te_acc = accuracy_score(Y_te_, model_preatt.predict(X_te_))

    print("CLASSIFICATION METHOD:")
    print("  Name: %s" % classification_method)
    print("  Training Accuracy on Pre-Attribution: %.2f" % (tr_acc * 100))
    print("  Test     Accuracy on Pre-Attribution: %.2f" % (te_acc * 100))

    # Inference benchmark: predict both splits ten times. perf_counter is
    # monotonic, so the measurement is unaffected by system clock changes.
    start = perf_counter()
    for _ in range(10):
        model_preatt.predict(X_tr_)
        model_preatt.predict(X_te_)
    end = perf_counter()

    # Report the dataset that was actually used rather than a fixed name.
    print("  Time taken to predict %s 10 times: %.2f" % (dsname, end - start))







