import os
import json
import pandas as pd
import tiktoken

# Export every problem in the filtered TSV to its own text file and report
# token-count statistics for the (documents + question) context of each row.

path = "data/problems_filtered"
data = pd.read_csv(path + ".tsv", sep="\t")

# Tokenizer used only to measure how many tokens each context occupies.
tik_enc = tiktoken.encoding_for_model("gpt-4o")

# The output directory shares the TSV's base name. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

separator = "--------------------------------------------------------------------------------------------------\n\n"

max_context_size = 0
total_context_size = 0

# Iterate the four columns in lockstep instead of materialising a full row
# Series per field via data.loc[i][...]; this also works for any index.
rows = zip(data["Question_No"], data["Question"], data["Answer"], data["Documents"])

for qno, ques, ans, docs in rows:
    # NOTE(review): the concatenation assumes both cells are strings; a
    # missing value (NaN) in either column would raise TypeError here.
    context = docs + ques
    context_size = len(tik_enc.encode(context))

    total_context_size += context_size
    max_context_size = max(max_context_size, context_size)

    # Optional filter (disabled): skip problems with long answers.
    # if len(ans.split()) > 30:
    #     continue

    cur_path = path + "/" + str(qno) + ".txt"

    # NOTE(review): append mode means re-running the script (or repeated
    # Question_No values) adds to an existing file instead of replacing it.
    # Confirm whether "w" is the intended mode.
    # The explicit encoding keeps output independent of the platform locale.
    with open(cur_path, "a", encoding="utf-8") as f:
        f.write(
            "Question No: " + str(qno) + "\n"
            + "Context Size: " + str(context_size) + "\n\n"
            + "Documents:\n\n" + str(docs) + "\n\n"
            + separator
            + "Question: " + str(ques) + "\n\n"
            + "Answer:\n" + str(ans) + "\n\n"
            + separator
        )
        # f.write("Adversarials: \n\n")

    # Adversarial section (disabled): adv_ques / adv_ans are not defined
    # anywhere in this script.
    # for j in range(len(adv_ques)):
    #     with open(cur_path, "a") as f:
    #         f.write("Adversarial Question: " + str(adv_ques[j]) + "\n\n")
    #         f.write("Adversarial Answer:\n" + str(adv_ans[j]) + "\n\n")

# Guard against an empty TSV, which would otherwise divide by zero.
avg_context_size = total_context_size / len(data) if len(data) else 0

print("Max Context Size: ", str(max_context_size))
print("Avg Context Size: ", str(avg_context_size))