import os
import json
import pandas as pd
import tiktoken

# Dataset stem: rows are read from "<path>.tsv" and the per-row report files
# are written into the directory "<path>/".
path = "data/algo_110_0_10_17k"
data = pd.read_csv(path + ".tsv", sep="\t")

# Tokenizer used to measure how many tokens each row's context + problem takes.
tik_enc = tiktoken.encoding_for_model("gpt-4o")

# exist_ok=True creates the output directory when missing and is a no-op when
# it is already there, without a separate (race-prone) existence check.
os.makedirs(path, exist_ok=True)

# Largest token count seen across all rows; printed once the export finishes.
max_context_size = 0

# Divider line written after each section of a report file.
section_break = "--------------------------------------------------------------------------------------------------\n\n"

# Write one report file per dataset row. Files are numbered by 1-based row
# position: "<path>/1.txt", "<path>/2.txt", ...
for number, (_, row) in enumerate(data.iterrows(), start=1):
	context = row["Context"]
	problem = row["Problem"]
	# The "Answer" column holds a JSON document; only its "function_def"
	# entry is written to the report.
	ans = json.loads(row["Answer"])
	test = row["Test"]

	# Context size is the token count of the codebase followed by the problem.
	main_cont = context + problem
	context_size = len(tik_enc.encode(main_cont))
	max_context_size = max(max_context_size, context_size)

	# Disabled filter, kept from the original script:
	# if len(ans["function_def"].split()) > 100:
	# 	continue

	cur_path = path + "/" + str(number) + ".txt"

	report = "".join([
		"Question No: " + str(number) + "\n",
		"Context Size: " + str(context_size) + "\n\n",
		"Codebase:\n\n" + str(context) + "\n\n",
		section_break,
		"Problem Statement: " + str(problem) + "\n\n",
		section_break,
		"Answer Code:\n" + str(ans["function_def"]) + "\n\n",
		section_break,
		"Test Code:\n" + str(test) + "\n\n",
		section_break,
	])

	# Mode "w" (not "a") so that re-running the script regenerates each report
	# instead of appending a duplicate copy to the previous run's file.
	# An explicit UTF-8 encoding keeps the output independent of the platform
	# locale and avoids encode errors on non-ASCII source text.
	with open(cur_path, "w", encoding="utf-8") as f:
		f.write(report)
		# f.write("Adversarials: \n\n")

	# Disabled adversarial section, kept from the original script. If it is
	# re-enabled it must keep mode "a" so it extends the report written above.
	# for j in range(len(adv_ques)):
	# 	with open(cur_path, "a") as f:
	# 		f.write("Adversarial Question: " + str(adv_ques[j]) + "\n\n")
	# 		f.write("Adversarial Answer:\n" + str(adv_ans[j]) + "\n\n")

print("Max Context Size: ", str(max_context_size))

