

from Conversation_GPT import Conversation_GPT

from preprocessing.semantics import extractPreAttributionDataset, convertListOfSentences
from preprocessing.files import extractSentencesFromPDF



# Index of the scenario to run; it picks one entry from each table below.
USECASE = 2
# Path prefix prepended to every PDF file name.
DOC_DIR = "documents/"
# Path prefix prepended to the cached ".lib" file names.
PRECOMP_DIR = "precomputed/"

# PDF file names for each use case (indexed by USECASE).
files = (
    ["paracetamol_en.pdf"],  # 58 sentences (search space)
    ["paper1.pdf"],          # 765
    ["paper2.pdf"],          # 1272
)[USECASE]

# Opening question for each use case, followed by the fixed pacing
# instructions that are appended to whichever question is selected.
question_base = (
    "I'm not feeling well, and my doctor said I should take a pill of paracetamol. Is it safe to take another pill? \n",
    "How can we induce hallucinations in LLM outputs?\n",
    "How can we induce hallucinations in LLM outputs?\n",
)[USECASE] + (
    "Reply one sentence at a time and wait for me to say ok before showing the next sentence. \n"
    "After answering, say DONE"
)



#######  
######
# Pre-Attribution
####
###

import os.path
from joblib import load, dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


# Train the pre-attribution classifier (model_preatt), which predicts one of
# the labels "R0", "R1" or "R2" for a feature vector.

datasets = ["HAGRID_Clean_train", "HAGRID_Clean_dev"]
# Name shared by the datasets: the first entry without its last "_"-separated
# component (here "HAGRID_Clean").
dsname = "_".join(datasets[0].split("_")[:-1])
# Cache of the extracted features. Built from PRECOMP_DIR so that it stays in
# the same directory as the other precomputed files if that constant changes.
dumpfile = PRECOMP_DIR + "%s_TF.lib" % dsname

# NOTE: joblib.load unpickles its input, so only trusted files should be
# placed in PRECOMP_DIR.
if not os.path.isfile(dumpfile):
    print("Calculating text features for preattribution with dataset:",dsname)
    X = []
    Y = []
    for ds_name in datasets:
        raw_dataset = load(PRECOMP_DIR + "%s.lib" % ds_name)
        features, labels = extractPreAttributionDataset(raw_dataset)
        X.extend(features)
        Y.extend(labels)
    # The cache stores the labels as extracted; they are relabelled below on
    # every run.
    dump((X, Y), dumpfile)
else:
    print("Loading from dump file text features for preattribution with dataset:",dsname)
    X, Y = load(dumpfile)

# Collapse labels by their first character: its position in "NSMU" modulo 3
# gives N -> "R0", S -> "R1", M -> "R2" and U -> "R0".
Y = ["R%s" % ("NSMU".index(c[0]) % 3) for c in Y]

# Ordered 70/30 split: the first 70% of the samples train the model and the
# remainder is held out. The data is not shuffled.
tr_split = int(len(Y) * 0.7)
X_tr, X_te = X[:tr_split], X[tr_split:]
Y_tr, Y_te = Y[:tr_split], Y[tr_split:]

model_preatt = RandomForestClassifier(max_depth=16, random_state=42, class_weight="balanced")
model_preatt.fit(X_tr, Y_tr)

tr_acc = accuracy_score(Y_tr, model_preatt.predict(X_tr))
te_acc = accuracy_score(Y_te, model_preatt.predict(X_te))

print("Training Accuracy on Pre-Attribution: %.2f" %(tr_acc*100))
print("Test     Accuracy on Pre-Attribution: %.2f" %(te_acc*100))






# Build the search space: everything extractSentencesFromPDF yields for each
# selected PDF, concatenated in file order. The second argument is the
# matching ".lib" path under PRECOMP_DIR.
sentences_file = [
    sentence
    for f in files
    for sentence in extractSentencesFromPDF(DOC_DIR+f, PRECOMP_DIR+f+".lib")
]
print("\n\n>>>>>>>>   The file contains %d sentences (search space)  <<<<<<<<\n\n" % len(sentences_file) )


# Attributor backend; change the index to switch between the two options.
attributor = ("SC1", "SPLADE")[0]

# Each backend is imported only when selected, so the other backend's
# dependencies are never loaded.
if attributor == "SPLADE":
    from attribution.SPLADE import SPLADE

    model_att = SPLADE(sentences = sentences_file)

elif attributor == "SC1":
    from sentence_transformers import SentenceTransformer, models
    from attribution.selectClosestAttributor import SelectClosestAttributor

    # Sentence-embedding model, loaded on the CPU.
    model_st = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device="cpu")

    model_att = SelectClosestAttributor(model_st, sentences = sentences_file)






print("\n\n\n\n")

# Conversation session created from the paths of the selected documents.
gen_model = Conversation_GPT([DOC_DIR+f for f in files])

# Interactive loop. The first turn sends the scripted question_base; every
# later turn reads the question from the user. The loop ends when the user
# types "quit" or "exit", when stdin is closed, or when an answer contains
# "DONE".
first_q = True
while True:
    if first_q:
        question = question_base
        first_q = False
        # Echo the scripted question, since the user did not type it.
        print(question)
        print()
    else:
        try:
            question = input("\nUSER: ")
        except EOFError:
            # stdin was closed (e.g. Ctrl-D or piped input ran out): end the
            # session instead of crashing with a traceback.
            break

    # Surrounding whitespace is ignored so that "exit " also quits.
    if question.strip().lower() in ("quit", "exit"):
        break

    answer = gen_model.ask_chatgpt_followup(question)
    print("\nChatGPT Answer: ", answer, "\n")

    # Classify the answer to decide which kind of attribution it needs.
    answer_preatt = convertListOfSentences([answer])
    type_of_attribution = model_preatt.predict(answer_preatt)[0]
    if type_of_attribution == "R0":
        print("System:", "No attribution needed.")
    else:
        # Compute the attribution for every non-R0 class. It used to be
        # computed only for "R1", so the fallback branch read `a` while it was
        # undefined or still held the result of an earlier turn.
        # NOTE(review): a[0] is used as an index into sentences_file; confirm
        # the exact return shape of attribute_alreadyHaveTheQuotes.
        a = model_att.attribute_alreadyHaveTheQuotes(answer)
        if type_of_attribution == "R1":
            print("System:", "This sentence is attributed to  --- \"%s\"" % sentences_file[a[0]])
        else:
            # TODO: this class should be attributed to two quotes; only the
            # first attributed sentence is shown for now.
            print("System:", "This sentence is attributed to (TODO: should be 2 quotes) --- \"%s\"" % sentences_file[a[0]])

    # question_base asks the model to say DONE once it has finished answering.
    if "DONE" in answer:
        break


print("\n\n")

