import json
import os
import sys
from openai import OpenAI
from dotenv import load_dotenv

# Model used for both the summarisation call and the generation call.
model = "gpt-4.1-nano"

# Number of ligands kept in the rolling history.  When the history already
# holds this many entries, it is cleared and the next ligand starts a new one.
HISTORY_WINDOW = 10

# Prompt fragments.  They end up verbatim in the emitted datasets, so they are
# kept exactly as originally written (including wording quirks).
SUMMARY_PROMPT_HEADER = "You are a chemistry-aware assistant that is collaborating with me on generating a ligand for a protein with high binding affinity. Below is a chronological history of past ligands you've generated. Provide a summary of changes and modifications you've made so far in regards to the ligand structure and how it impacts the binding affinity; the goal is to give context about past iterations to another agent. Be sure to explictly output the SMILES of every past ligand. Do not provide any suggestions for future generations at this time. Keep your response relatively short.\n"
GENERATION_CONTEXT = "We are collaborating on generating a ligand for a protein with high binding affinity. I will give you the output from docking software after each of your attempts. Provided below is a brief summary of past ligand modifications:\n\n"
INSTRUCTION_SUFFIX = "First describe what you have learned from the above summary. Then based on that knowledge, generate a ligand that can bind to this protein with high binding affinity. Ensure that your generation is unique and is not found within the provided data. Follow this format for your final answer: \\box{MOLECULE}, where MOLECULE is your proposed ligand in SMILES format."


def parse_ligand_line(line):
    """Split one chain-file line into ``(ligand, affinity)``.

    The first whitespace-separated field is the ligand string and the second
    is its affinity, which is rounded to two decimals.

    The original script unconditionally discarded the last character of the
    second field, which silently truncated a digit when the final line of the
    file had no trailing newline.  Here the field is parsed as-is first and a
    single trailing character is dropped only if that fails.
    NOTE(review): the fallback assumes that character is a delimiter such as a
    comma; confirm against the actual chain file format.

    Raises:
        IndexError: if the line has fewer than two fields.
        ValueError: if the second field is not a number.
    """
    fields = line.split()
    ligand = fields[0]
    token = fields[1]
    try:
        value = float(token)
    except ValueError:
        value = float(token[:-1])
    return ligand, round(value, 2)


def build_summary_prompt(history):
    """Return the summarisation prompt listing every ``(ligand, affinity)`` in *history*."""
    entries = [f"{ligand}: {affinity} kcal/mols\n" for ligand, affinity in history]
    return SUMMARY_PROMPT_HEADER + "".join(entries)


def build_generation_prompts(summary, ligand):
    """Return ``(instruction, teacher_prompt)`` for one target *ligand*.

    ``instruction`` is the text stored in the dataset.  ``teacher_prompt`` is
    what is actually sent to the model: it shares the same summary context but
    tells the model which ligand to arrive at.
    """
    context = GENERATION_CONTEXT + "Summary: " + summary + "\n\n"
    instruction = context + INSTRUCTION_SUFFIX
    teacher_prompt = context + "Based on the summary, first pretend that you are thinking about what modifications to make to create a stronger molecule. In the end, choose to generate the molecule "+ligand+". Do not mention this molecule until you've explained what decisions and specific structural modifications would lead to this new molecule. It's crucial that you pretend to have came up with this generation yourself after some brief analysis of the provided summary. Provide the final ligand explictly in this exact format, character by character: \\box{"+ligand+"}. Keep your response relatively short."
    return instruction, teacher_prompt


def build_datasets(lines, ask, progress=None):
    """Turn chain-file *lines* into the two datasets.

    Args:
        lines: iterable of text lines.  Lines before the first one containing
            ``"Chain start:"`` are ignored; each such line starts a fresh
            history.  Blank lines are skipped.
        ask: callable taking a prompt string and returning the model's text.
        progress: optional callable invoked with the 1-based line number
            before each line is processed.

    Returns:
        ``(data, summary_data)`` - two lists of dicts with the keys
        ``"instruction"``, ``"input"`` and ``"output"``.
    """
    data = []
    summary_data = []
    history = None  # None until the first "Chain start:" line is seen

    for line_no, line in enumerate(lines, start=1):
        if progress is not None:
            progress(line_no)
        if "Chain start:" in line:
            history = []
            continue
        if history is None or not line.strip():
            continue

        ligand, affinity = parse_ligand_line(line)

        # A full window is dropped; this ligand then opens the next window
        # and, like any first ligand, produces no training example itself.
        if len(history) == HISTORY_WINDOW:
            history = []

        if history:
            summary_prompt = build_summary_prompt(history)
            summary = ask(summary_prompt)
            summary_data.append({"instruction": summary_prompt, "input": "", "output": summary})

            instruction, teacher_prompt = build_generation_prompts(summary, ligand)
            answer = ask(teacher_prompt)
            data.append({"instruction": instruction, "input": "", "output": answer})

        history.append((ligand, affinity))

    return data, summary_data


def write_json(path, payload):
    """Write *payload* to *path* as indented JSON, creating parent directories."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=4)


def main(argv=None):
    """Script entry point: ``<script> <protein_file>``.

    Reads ``./chains/<protein_file>``, queries the model and writes
    ``bindingdb/data/<stem>.json`` and ``bindingdb/summary_data/<stem>.json``.
    The API key is read from the ``GPT_KEY`` environment variable after
    ``load_dotenv()`` has run.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit(f"usage: {sys.argv[0]} <protein_file>")
    protein_file = args[0]

    # Swap only the final extension; str.replace would also rewrite ".txt"
    # occurring elsewhere in the name and leave non-.txt names without ".json".
    stem = os.path.splitext(protein_file)[0]
    data_path = f"bindingdb/data/{stem}.json"
    summary_path = f"bindingdb/summary_data/{stem}.json"

    # Create output directories before any API call so a filesystem problem
    # cannot discard results that were already paid for.
    for target in (data_path, summary_path):
        os.makedirs(os.path.dirname(target), exist_ok=True)

    load_dotenv()
    client = OpenAI(api_key=os.environ["GPT_KEY"])

    def ask(prompt):
        return client.responses.create(model=model, input=prompt).output_text

    with open(f"./chains/{protein_file}", "r", encoding="utf-8") as handle:
        num_lines = sum(1 for _ in handle)
        handle.seek(0)

        def report(line_no):
            # Flushed so progress stays visible while API calls are in flight.
            print(f"{stem}: {line_no} / {num_lines}", flush=True)

        data, summary_data = build_datasets(handle, ask, progress=report)

    write_json(data_path, data)
    write_json(summary_path, summary_data)


if __name__ == "__main__":
    main()