




# Dataset splits used by this script. Each name is loaded from
# "precomputed/<name>.lib"; the part before the last "_" is used as the
# dataset name when building cache file names.
datasets = ["HAGRID_Clean_train", "HAGRID_Clean_dev"]
# Alternative (non-cleaned) splits:
#datasets = ["HAGRID_train", "HAGRID_dev"]

import os.path
from joblib import load, dump
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight

# Master switch for the pre-attribution stage. When False, no text features
# are loaded and no XGBoost pre-attribution models are trained.
doingPreattribution = False

import random


if doingPreattribution:
    from preprocessing.semantics import extractPreAttributionDataset, convertListOfSentences
    from preprocessing.files import extractSentencesFromPDF

    # Cache file is shared by every split: strip the trailing "_<split>" part.
    dsname = "_".join(datasets[0].split("_")[:-1])
    dumpfile = "precomputed/%s_TF.lib" % dsname

    if not os.path.isfile(dumpfile):
        print("Calculating text features for preattribution with dataset:",dsname)
        # Build the features directly into X_pa / Y_pa so that both branches
        # leave the same names defined for the training code further down
        # (previously this branch filled X / Y and X_pa / Y_pa stayed unbound).
        X_pa = []
        Y_pa = []
        for d in datasets:
            ds = load("precomputed/%s.lib"%d)
            X_, Y_ = extractPreAttributionDataset(ds)
            X_pa.extend(X_)
            Y_pa.extend(Y_)
        dump( (X_pa,Y_pa), dumpfile)
    else:
        print("Loading from dump file text features for preattribution with dataset:",dsname)
        X_pa,Y_pa = load(dumpfile)


def shuffle(A,B,random_state=42):
    """Shuffle ``A`` and ``B`` in place, each after seeding with ``random_state``.

    The module-level ``random`` generator is re-seeded with the same value
    before each shuffle, so two sequences of equal length receive the same
    permutation and stay aligned element by element.

    Args:
        A: first mutable sequence, shuffled in place.
        B: second mutable sequence, shuffled in place.
        random_state: seed passed to ``random.seed`` before each shuffle.

    Returns:
        None. Note that the global ``random`` state is modified.
    """
    for sequence in (A, B):
        # Re-seed before every shuffle so each sequence gets the same draw order.
        random.seed(random_state)
        random.shuffle(sequence)



def train_pa_model(X, Y, random_state=42):
    """Train an XGBoost pre-attribution classifier and print its accuracy.

    The data is shuffled with ``random_state`` and split 70/30 into train and
    test parts. Each label is mapped to a class id from its first character:
    "N" -> 0, "S" -> 1, "M" -> 2 and "U" -> 0 (index in "NSMU" modulo 3).

    Args:
        X: sequence of feature vectors, one per sample.
        Y: sequence of labels aligned with ``X``; the first element of each
            label must be one of the characters N, S, M or U.
        random_state: seed controlling the shuffle (and therefore the split).

    Returns:
        The fitted ``xgb.XGBClassifier``.

    Raises:
        ValueError: if a label's first element is not found in "NSMU".
    """
    #######  
    ######
    # Pre-Attribution
    ####
    ###

    # Shuffle shallow copies so the caller's lists are left untouched and every
    # call with the same seed yields the same split, regardless of how many
    # times the function was called before on the same data.
    X = list(X)
    Y = list(Y)

    tr_split = int(len(Y)*0.7)
    shuffle(X, Y, random_state)
    X_tr = X[:tr_split]
    Y_tr = Y[:tr_split]
    X_te = X[tr_split:]
    Y_te = Y[tr_split:]

    Y_tr_ = ["NSMU".index(c[0])%3 for c in Y_tr]
    Y_te_ = ["NSMU".index(c[0])%3 for c in Y_te]

    # Balance on the class ids the model is actually trained on; the raw
    # labels distinguish more categories (e.g. N and U) than the 3 targets.
    sample_weights = compute_sample_weight(class_weight="balanced", y=Y_tr_)
    # NOTE: the model seed is fixed at 42; random_state only drives the split.
    model_preatt = xgb.XGBClassifier(max_depth=1, random_state=42, n_estimators = 20)

    model_preatt.fit(X_tr, Y_tr_, sample_weight=sample_weights)

    tr_acc = accuracy_score(Y_tr_, model_preatt.predict(X_tr))
    te_acc = accuracy_score(Y_te_, model_preatt.predict(X_te))


    print("PA CLASSIFICATION METHOD:")
    print("  Name: XGB")
    print("  Training Accuracy on Pre-Attribution: %.2f" %(tr_acc*100))
    print("  Test     Accuracy on Pre-Attribution: %.2f" %(te_acc*100))

    return model_preatt



class AlwaysOne:
    """Baseline pre-attribution strategy that always answers 1."""

    def predict(self, sample, label):
        """Return 1 for every input; ``sample`` and ``label`` are ignored."""
        return 1

class TrueLabel:
    """Oracle pre-attribution strategy that derives the answer from the label."""

    def predict(self, sample, label):
        """Map the label's first element to a class id; ``sample`` is ignored.

        The id is the position of ``label[0]`` in "NSMU" modulo 3, i.e.
        N -> 0, S -> 1, M -> 2, U -> 0. A value not found in "NSMU" raises
        ``ValueError``.
        """
        leading = label[0]
        position = "NSMU".index(leading)
        return position % 3

class UseClassifier:
    """Pre-attribution strategy that delegates to a trained classifier."""

    def __init__(self, model):
        """Store ``model``; it must expose a ``predict`` method."""
        self.model = model

    def predict(self, sample, label):
        """Return the model's prediction for a single ``sample``.

        The sample is wrapped in a one-element list, converted with
        ``convertListOfSentences`` and passed to the model; the first
        prediction is returned. ``label`` is ignored.
        """
        features = convertListOfSentences([sample])
        predictions = self.model.predict(features)
        return predictions[0]



#######  
######
# Attribution (DATASET LOADING)
####
###



# Cache file is shared by every split: strip the trailing "_<split>" part.
dsname = "_".join(datasets[0].split("_")[:-1])
dumpfile = "precomputed/%s_ATT.lib" % dsname 

if os.path.isfile(dumpfile):
    # Cache hit: restore the quotes and the per-sentence arrays as stored.
    print("Loading from dump file text features for attribution with dataset:",dsname)
    (AllQuotes, AllQuotes_emb), (X, X_emb,Y_fetch, Y, Y_searchSpace,Y_label) = load(dumpfile)
else:
    from sentence_transformers import SentenceTransformer, models
    model_st = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device="cpu")

    print("Generating attribution dataset:",dsname)
    # Global pool of quotes and their embeddings, across all queries.
    AllQuotes, AllQuotes_emb = [], []
    # Per-sentence data: text, embedding, references in global and local
    # index space, the candidate quote indices, and the label.
    X, X_emb = [], []
    Y_fetch, Y = [], []
    Y_searchSpace, Y_label = [], []

    for d in datasets:
        ds = load("precomputed/%s.lib"%d)
        for query in ds:
            quotes = query[0]
            sentences = query[1]

            # Offset that maps this query's local quote indices to global ones.
            current_index = len(AllQuotes)
            AllQuotes.extend(quotes)
            AllQuotes_emb.extend([list(model_st.encode(s)) for s in quotes])

            # Every sentence of the query searches among this query's quotes.
            ss = list(range(current_index, current_index + len(quotes)))
            for sentence in sentences:
                text = sentence[0]
                local_refs = sentence[1]
                X.append(text)
                X_emb.append(list(model_st.encode(text)))
                Y_fetch.append([v + current_index for v in local_refs])
                Y.append(local_refs)
                Y_searchSpace.append(ss)
                Y_label.append(sentence[2])

    attribution_payload = ( (AllQuotes, AllQuotes_emb), (X, X_emb,Y_fetch, Y, Y_searchSpace,Y_label))
    dump(attribution_payload, dumpfile)



#######  
######
# Attribution
####
###

# Pre-attribution strategies: constant baseline, trained classifiers
# (one per seed, only when pre-attribution is enabled) and the label oracle.
pa0 = AlwaysOne()
if doingPreattribution:
    pa1 = [UseClassifier(train_pa_model(X_pa, Y_pa, random_state=seed)) for seed in range(30)]
else:
    pa1 = None
pa2 = TrueLabel()

pa_method_name = ["Always1", "XGB", "TrueLabel"]
pa_method = [pa0, pa1, pa2]


from attribution.embeddingsONE import SelectClosestAttributor
from attribution.embeddingsTWO import SelectClosestNAttributor
from attribution.Fuzzy import Fuzzy
from attribution.BM25 import BM25
from attribution.SPLADE import SPLADE
from attribution.MMR import MMR


# Instantiate every available attributor. The position in att_method matters:
# the evaluation loop calls attribute(sample, searchSpace) for indices 2, 3
# and 4, and attribute_PC(sample_emb, searchSpace_emb) for the others.
att0 = SelectClosestAttributor(None)
att1 = SelectClosestNAttributor(None)
att2 = Fuzzy()
att3 = BM25()
att4 = SPLADE()
att5 = MMR()

# Parallel lists: att_method_name[i] is the display name of att_method[i].
att_method = [att0, att1, att2, att3, att4, att5]
att_method_name = ["Closest-1", "Closest-2", "Fuzzy", "BM25", "SPLADE", "MMR"]



# Index into pa_method. Note that entry 1 is a list of classifiers (or None
# when doingPreattribution is False) rather than a single predictor.
pa_selection = 0

# NOTE(review): `pa` is not referenced by the evaluation loop in this file,
# which uses pa0 / pa1 / pa2 directly — confirm whether it is still needed.
pa = pa_method[pa_selection]




def isAttributionCorrect(pred, y_refs, y_label):
    """Return the fraction of samples whose predicted attribution is correct.

    The rule applied depends on the first element of each label:

    * "N" / "U": correct when the prediction is empty.
    * "S": correct when exactly one quote is predicted and it is one of the
      references.
    * "M": correct when either a single quote is predicted and it is the only
      reference, or two quotes are predicted and both are references.

    Args:
        pred: sequence of predictions, each a sequence of quote indices.
        y_refs: sequence of reference index sequences, aligned with ``pred``.
        y_label: sequence of labels, aligned with ``pred``.

    Returns:
        Number of hits divided by ``len(pred)``; 0.0 when there is nothing to
        evaluate (instead of failing with a division by zero).

    Raises:
        AssertionError: if a label's first element is none of N, U, S, M.
    """
    hits = 0
    for i, label in enumerate(y_label):
        kind = label[0]
        guess = pred[i]
        refs = y_refs[i]

        if kind in "NU":
            correct = len(guess) == 0
        elif kind in "S":
            correct = len(guess) == 1 and guess[0] in refs
        elif kind in "M":
            single_match = len(guess) == 1 and len(refs) == 1 and guess[0] == refs[0]
            double_match = len(guess) == 2 and guess[0] in refs and guess[1] in refs
            correct = single_match or double_match
        else:
            # Raised explicitly so the check survives `python -O`, which
            # strips plain assert statements.
            raise AssertionError("Unknown attribution label: %r" % (label,))

        if correct:
            hits += 1

    total = len(pred)
    if total == 0:
        return 0.0
    return hits/total




# Fraction of the samples (in stored order) reported as the "training" part
# when evaluating the XGBoost-driven strategy.
# NOTE(review): train_pa_model shuffles its own data before splitting, so this
# prefix may not contain the samples the classifiers were trained on — confirm.
TRAIN_FRAC = 0.7
TR_LIM = int(len(X) * TRAIN_FRAC)



print("Using the dataset:", datasets)

# Only attributor index 4 is evaluated; iterate range(len(att_method)) instead
# to evaluate all of them.
for att_meth_i in (4,):
    att = att_method[att_meth_i]
    print("Using attributor:", att_method_name[att_meth_i])

    # Attributors 2, 3 and 4 work on raw text; the others on embeddings.
    uses_raw_text = att_meth_i in (2, 3, 4)

    # attributor predictions: pred_att[i] is later indexed by the class id
    # returned by a pre-attribution strategy.
    pred_att = []
    for sample, sample_emb, candidates in zip(X, X_emb, Y_searchSpace):
        searchSpace_emb = [AllQuotes_emb[k] for k in candidates]
        searchSpace = [AllQuotes[k] for k in candidates]
        if uses_raw_text:
            attribution = att.attribute(sample, searchSpace)
        else: # 0, 1, 5
            attribution = att.attribute_PC(sample_emb, searchSpace_emb)
        pred_att.append(attribution)


    # Method 1 - Always 1
    pred = [
        options[pa0.predict(sample, label)]
        for sample, label, options in zip(X, Y_label, pred_att)
    ]

    acc = isAttributionCorrect(pred, Y, Y_label)
    print("Accuracy on always attribute to 1 quote:", acc*100 )


    # Method 2 - use xgboost
    if doingPreattribution:
        for run in range(30):
            preattmodel = pa1[run]

            pred = [
                options[preattmodel.predict(sample, label)]
                for sample, label, options in zip(X, Y_label, pred_att)
            ]

            # Score the first TR_LIM samples and the remainder separately.
            acc_tr = isAttributionCorrect(pred[:TR_LIM], Y[:TR_LIM], Y_label[:TR_LIM])
            acc_te = isAttributionCorrect(pred[TR_LIM:], Y[TR_LIM:], Y_label[TR_LIM:])
            print("Accuracy on always attribute to the number of quotes XGBoost tells me (training):", acc_tr*100 )
            print("Accuracy on always attribute to the number of quotes XGBoost tells me (test)    :", acc_te*100 )


    # Method 3 - True label (theoretical maximum)
    pred = [
        options[pa2.predict(sample, label)]
        for sample, label, options in zip(X, Y_label, pred_att)
    ]

    acc = isAttributionCorrect(pred, Y, Y_label)
    print("Accuracy on always attribute to the correct number of quotes (ie, use the label as the prediction):", acc*100 )



