

from sentence_transformers import SentenceTransformer, models

import json 
import re
import os.path


# 
# By using this file, you are agreeing to this product's EULA
#
# This product can be obtained in https://anonymous.4open.science/r/SAFE-ICLR
#
# Copyright ©2024-2025 XXXX-1
#



# >> print(data[0].keys())
# >> dict_keys(['query_id', 'query', 'quotes', 'answers'])

# Each query contains a small list of N quotes from which it may or may not extract information when generating the answers
# The answer may or may not answer the query (that is not our concern); answers are based on a list of quotes (sub-sections of quotes 1-N)
# Our goal is to match the generated answer with sub-sections of the sentences in the quote list and verify whether our picks match the sub-sections used in the answer
#
# I.E.
# !
# !--- Query ID
# !
# !--- Query
# !
# !--- Quote #1 
# ! !
# ! !- ...
# ! !
# ! !- Quote #N
# ! 
# !--- Answer #1 [decomposed into sentences and respective ID (1-N) that associates a sentence to a quote]
#   ! 
#   !- ...
#   ! 
#   !- Answer #2
#
#
# If we give Answer #1's sentences and Quotes 1-N to our pipeline, can we correctly assign each sentence to each quote?
#
#
# If we can't... 
#    NG, NG at all
#
# If we can...  
#    This dataset has up to 5 quotes I believe; we need to test on books with hundreds or thousands of sentences, and we don't have datasets for that, so
#    we rely on GPT for answer generation and then on manual verification




def getInfo(filename):
	"""
	Prints summary statistics of a JSONL dataset whose answer sentences carry inline "[n]" citations.

	Every non-blank line of `filename` is parsed as one JSON sample. A sample is read through the keys
	"quotes" (a list of items with a "quote" string) and "answers" (a list of items with an "answer" list,
	whose entries have a "sentence" string).
	Bracketed groups such as "[1]" or "[2, 3]" are collected from each sentence as its sources and are
	removed from the text before the sentence length is measured.

	Parameters:
		filename - path to a UTF-8 encoded JSONL file

	Returns:
		None, the statistics are written to stdout

	Raises:
		ValueError if a bracketed group is not a comma separated list of integers (e.g. "[]" or "[1, ]")
	"""

	def bump(histogram, index):
		# Histograms start with 20 buckets so the printed layout stays the same for typical data,
		# and grow on demand so a larger count cannot raise IndexError
		if index >= len(histogram):
			histogram.extend([0] * (index + 1 - len(histogram)))
		histogram[index] += 1

	# Read JSONL, skipping blank lines (e.g. an empty line at the end of the file)
	with open(filename, 'r', encoding='utf-8') as file:
		data = [json.loads(line) for line in file if line.strip()]

	# Compiled once instead of once per sentence
	citation = re.compile(r'\[([0-9, ]*)\]')

	quote_lens = []
	sentence_lens = []
	num_quotes = [0] * 20
	amount_sources = [0] * 20
	amount_questions = [0] * 20

	for sample in data:

		quotes = [q["quote"] for q in sample["quotes"]]
		quote_lens.extend(len(q) for q in quotes)
		bump(num_quotes, len(quotes))

		answers = sample["answers"]
		bump(amount_questions, len(answers))
		for a in answers:
			for entry in a["answer"]:
				s = entry["sentence"]

				# Distinct quote numbers cited by this sentence
				sources = set()
				for match in citation.findall(s):
					try:
						sources.update(int(num) for num in match.split(','))
					except ValueError as err:
						# Explicit error instead of `assert False`, which disappears under `python -O`
						raise ValueError("Malformed citation [%s] in sentence: %r" % (match, s)) from err

				# Sentence text without the citation markers
				s2 = citation.sub('', s)

				sentence_lens.append(len(s2))
				bump(amount_sources, len(sources))

	# default=0 keeps the report meaningful when the file holds no quotes / no sentences
	print("Shortest quote individual length: %d" % min(quote_lens, default=0))
	print("Longest quote individual length: %d" % max(quote_lens, default=0))

	print("Shortest answer sentence length: %d" % min(sentence_lens, default=0))
	print("Longest answer sentence length: %d" % max(sentence_lens, default=0))

	print("Number of quotes in a query:               ", num_quotes)
	print("Number of answers to a query:              ", amount_questions)
	print("Number of sources in a sentence in answers:", amount_sources)

	total_sentences = sum(amount_sources)
	# The ratio is undefined for an empty dataset, report 0 instead of dividing by zero
	one_reference_pct = (amount_sources[1] * 100.0 / total_sentences) if total_sentences else 0.0

	print("Total amount of sentences: %d" % total_sentences)
	print("Percentage of sentences with <ONE> reference: %.2f" % one_reference_pct )







def openFile(filename, printStats = False):
	"""
	Opens a JSONL file and returns a dataset in the format [ sample ][ quotes_available, [ [sentence, quotes_that_back_the_sentence, type], ... ] ]
	The objective is to, given a sentence and a list of available quotes, obtain the list of quotes that back the sentence
	The target list may be empty, have 1 quote, or multiple quotes

	Parameters:
		filename   - path to a UTF-8 encoded JSONL file; every non-blank line is one sample read through the keys "quotes" and "answers"
		printStats - when True the dataset statistics are printed to stdout, when False nothing is printed

	Returns:
		A list with one [quotes, sentences] pair per sample. `quotes` is the list of quote strings and `sentences`
		has one [sentence, references, type] entry for every answer sentence whose text is not empty

	Raises:
		ValueError or SyntaxError if a "references" field is not a plain Python literal
		KeyError if the first character of a sentence "type" is not one of N, S, M, U
	"""
	# Local import keeps this function self-contained (the file already imports inside functions)
	import ast

	def bump(histogram, index):
		# Histograms start with 20 buckets so the printed layout stays the same for typical data,
		# and grow on demand so a larger count cannot raise IndexError
		if index >= len(histogram):
			histogram.extend([0] * (index + 1 - len(histogram)))
		histogram[index] += 1

	# Read JSONL, skipping blank lines (e.g. an empty line at the end of the file)
	with open(filename, 'r', encoding='utf-8') as file:
		data = [json.loads(line) for line in file if line.strip()]

	# STATS accumulators
	quote_lens = []
	sentence_lens = []
	num_quotes = [0] * 20
	amount_sources = [0] * 20
	amount_answers = [0] * 20

	# Sentence count per first letter of the "type" field
	amount_sources_mine = {"N":0,"S":0,"M":0,"U":0}

	# [length, text] of the sentences of type "M..." that have exactly one reference
	shortest = []

	ret = []
	for sample in data:

		# >> Gets quotes,
		# aka the sentences that the answer can be attributed to
		quotes = [q["quote"] for q in sample["quotes"]]

		# STATS
		quote_lens.extend(len(q) for q in quotes)
		bump(num_quotes, len(quotes))

		# >> Gets answers, already decomposed as sentences
		# Each sentence has a list of quote_IDs, telling us which
		# quotes it is attributed to
		sentences = []
		for a in sample["answers"]:
			bump(amount_answers, len(a["answer"]))  # STATS
			for s in a["answer"]:
				s2 = s["sentence"]

				# SECURITY: this used to be eval(), which executes arbitrary code stored in the data file.
				# literal_eval only accepts Python literals (such as "[0, 2]") and rejects everything else
				sources = ast.literal_eval(s["references"])
				tipo = s["type"]
				amount_sources_mine[tipo[0]] += 1

				if len(sources) == 1 and tipo[0]=="M":
					shortest.append([len(s2),s2])

				# Empty sentences are left out of the dataset but are still counted in the stats below
				if len(s2) > 0:
					sentences.append([s2, sources, tipo])

				# STATS
				sentence_lens.append(len(s2))
				bump(amount_sources, len(sources))

		ret.append([quotes, sentences])

	# The parameter used to be ignored and the report was always printed
	if printStats:
		shortest.sort()
		print(shortest[:10])

		# default=0 keeps the report meaningful when the file holds no quotes / no sentences
		print("Shortest quote individual length: %d" % min(quote_lens, default=0))
		print("Longest quote individual length: %d" % max(quote_lens, default=0))

		print("Shortest answer sentence length: %d" % min(sentence_lens, default=0))
		print("Longest answer sentence length: %d" % max(sentence_lens, default=0))

		print("Number of quotes in a query:                    ", num_quotes)
		print("Number of sentences in the answers to a query:  ", amount_answers)
		print("Number of sources in a sentence in answers:     ", amount_sources)

		total_sentences = sum(amount_sources)
		# The ratio is undefined for an empty dataset, report 0 instead of dividing by zero
		one_reference_pct = (amount_sources[1] * 100.0 / total_sentences) if total_sentences else 0.0

		print("Total amount of sentences: %d" % total_sentences)
		print("Percentage of sentences with <ONE> reference: %.2f" % one_reference_pct )

		print(amount_sources_mine)

	return ret




#modelname = "MiniLM_L6_v2"
#model_dir = '/home/jebatista/Desktop/Desktop/Attribution/Attribution_old/LLM/all-MiniLM-L6-v2.pt'
#model_st = SentenceTransformer(model_dir, device="cpu")



# Dataset splits processed when this file is executed: [0] is reported as "Training", [1] as "Test"
filename = ["datasets/HAGRID_Clean/train_clean.jsonl", "datasets/HAGRID_Clean/dev_clean.jsonl"]

# Parse each split under its own banner, asking openFile for its statistics
# (getInfo is not run here, only openFile)
for _banner, _split_path in zip(("\n>>> Training", "\n>>> Test"), filename):
	print(_banner)
	openFile(_split_path, printStats=True)




# Issues with Hagrid
#>>> {'answer': 'The movie Scott Pilgrim came out on August 13, 2010 in North America. [2] It was premiered at the San Diego Comic-Con International on July 22, 2010, after which it was released. [2] It was premiered at the San Diego Comic-Con International on July 22, 2010, after which it was released. [1]', 'answer_type': 'short', 'informative': 1, 'attributable': 1, 'sentences': [{'text': 'The movie Scott Pilgrim came out on August 13, 2010 in North America. [2]', 'answer_type': 'short', 'index': 0, 'attributable': 1, 'informative': 1}, {'text': 'It was premiered at the San Diego Comic-Con International on July 22, 2010, after which it was released. [2]', 'answer_type': 'short', 'index': 1, 'attributable': 1, 'informative': 1}, {'text': 'It was premiered at the San Diego Comic-Con International on July 22, 2010, after which it was released. [1]', 'answer_type': 'short', 'index': 2, 'attributable': 1, 'informative': 1}]}
#[1] The movie Scott Pilgrim came out on August 13, 2010 in North America. 
#[1] It was premiered at the San Diego Comic-Con International on July 22, 2010, after which it was released. 
#[0] It was premiered at the San Diego Comic-Con International on July 22, 2010, after which it was released. 


#>>> {'answer': 'Based on the given contexts, the current population of giraffes in the wild is estimated to be around 97,500 [4]. However, it should be noted that all giraffes are considered vulnerable to extinction by the IUCN [2] and the demographic studies of wild giraffes suggest that survival rates for both adult giraffes outside protected areas and calf giraffes inside protected areas are low [2]. and the demographic studies of wild giraffes suggest that survival rates for both adult giraffes outside protected areas and calf giraffes inside protected areas are low [3]. and the demographic studies of wild giraffes suggest that survival rates for both adult giraffes outside protected areas and calf giraffes inside protected areas are low [1]. and the demographic studies of wild giraffes suggest that survival rates for both adult giraffes outside protected areas and calf giraffes inside protected areas are low [4].', 'answer_type': 'long', 'informative': 1, 'attributable': 0, 'sentences': [{'text': 'Based on the given contexts, the current population of giraffes in the wild is estimated to be around 97,500 [4].', 'answer_type': 'long', 'index': 0, 'attributable': 1, 'informative': 1}, {'text': 'However, it should be noted that all giraffes are considered vulnerable to extinction by the IUCN [2]', 'answer_type': 'long', 'index': 1, 'attributable': 1, 'informative': 0}, {'text': 'and the demographic studies of wild giraffes suggest that survival rates for both adult giraffes outside protected areas and calf giraffes inside protected areas are low [2].', 'answer_type': 'long', 'index': 2, 'attributable': 1, 'informative': 0}, {'text': 'and the demographic studies of wild giraffes suggest that survival rates for both adult giraffes outside protected areas and calf giraffes inside protected areas are low [3].', 'answer_type': 'long', 'index': 3, 'attributable': 0, 'informative': 0}, {'text': 'and the demographic studies of wild giraffes suggest that survival rates 
#for both adult giraffes outside protected areas and calf giraffes inside protected areas are low [1].', 'answer_type': 'long', 'index': 4, 'attributable': 0, 'informative': 0}, {'text': 'and the demographic studies of wild giraffes suggest that survival rates for both adult giraffes outside protected areas and calf giraffes inside protected areas are low [4].', 'answer_type': 'long', 'index': 6, 'attributable': 0, 'informative': 0}]}
#[3] Based on the given contexts, the current population of giraffes in the wild is estimated to be around 97,500 .
#[1] However, it should be noted that all giraffes are considered vulnerable to extinction by the IUCN 
#[1] and the demographic studies of wild giraffes suggest that survival rates for both adult giraffes outside protected areas and calf giraffes inside protected areas are low .
#[2] and the demographic studies of wild giraffes suggest that survival rates for both adult giraffes outside protected areas and calf giraffes inside protected areas are low .
#[0] and the demographic studies of wild giraffes suggest that survival rates for both adult giraffes outside protected areas and calf giraffes inside protected areas are low .
#[3] and the demographic studies of wild giraffes suggest that survival rates for both adult giraffes outside protected areas and calf giraffes inside protected areas are low .


def generateLibFile():
	"""
	Parses the train and dev JSONL splits with openFile and writes each resulting dataset to disk with joblib.dump

	The output files are placed under SAFE_FullSystem/precomputed/; nothing is returned
	"""
	import joblib

	# (source JSONL, destination .lib) for every split
	jobs = (
		("datasets/HAGRID_Clean/train_clean.jsonl", "SAFE_FullSystem/precomputed/HAGRID_Clean_train.lib"),
		("datasets/HAGRID_Clean/dev_clean.jsonl", "SAFE_FullSystem/precomputed/HAGRID_Clean_dev.lib"),
	)
	for source, destination in jobs:
		joblib.dump(openFile(source), destination)

# Runs whenever this file is executed or imported: writes both precomputed .lib files
generateLibFile()