import json
import time
import os
from openai import OpenAI
from difflib import SequenceMatcher
from tqdm import tqdm
from dotenv import load_dotenv

# Pull configuration from a local .env file. override=True asks python-dotenv
# to let .env values replace variables already set in the environment.
load_dotenv(override=True)

# OpenAI API key taken from the environment (None when the variable is unset)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

# # Function to calculate similarity between two strings
# def similarity(a, b):
#     return SequenceMatcher(None, a, b).ratio()

# Convert a multiple-choice question into standalone sentences using the OpenAI chat completions API
def convert_question_to_phrases(question, correct_answer, *, num_phrases=3, model="gpt-4o"):
    """Rewrite a question/answer pair as several standalone sentences.

    One chat-completion request is sent per sentence. Requests alternate
    between asking for the event to be mentioned before the year (even
    attempts) and the year before the event (odd attempts). From the second
    request on, the sentences generated so far are included in the prompt so
    the model is told to paraphrase differently.

    Args:
        question: Text of the question to convert.
        correct_answer: Answer to embed in the sentence; the prompt tells the
            model to use it regardless of whether it is actually correct.
        num_phrases: Number of sentences to generate (default 3).
        model: Name of the chat model to query (default "gpt-4o").

    Returns:
        A list of ``num_phrases`` stripped sentences, in generation order.

    Raises:
        RuntimeError: If the model returns a message without text content.
    """
    converted_phrases = []
    for i in range(num_phrases):
        order_hint = (
            "You must mention the event before the year."
            if i % 2 == 0
            else "You must mention the year before the event."
        )
        # Interpolate the list itself; wrapping it in another list would show
        # the model a confusing doubly-nested "[['...']]" representation.
        avoid_hint = (
            f"You must Paraphrase differently from these previous versions ({converted_phrases}). "
            if converted_phrases
            else ""
        )
        prompt = (
            "Convert the following question to a normal sentence. "
            "You must use the answer I give regardless of whether it's correct. "
            "You must specify what event the answer is about. "
            "The sentence should stand on its own. "
            "Don't give sentences that need additional context like 'the correct answer is d'. "
            f"{order_hint} \nQuestion: {question}\n Answer: {correct_answer}\n {avoid_hint}"
        )
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=500,
            n=1,
            stop=None
        )
        content = response.choices[0].message.content
        if content is None:
            # Fail with a clear message instead of an AttributeError on .strip().
            raise RuntimeError(
                f"Model returned no text content for question: {question!r}"
            )
        converted_phrases.append(content.strip())
    return converted_phrases

# Directory containing the JSONL question files
data_dir = "."
# Only "split_2*" inputs are processed. That prefix also excludes the
# "corpus_*" files this script writes into the same directory. Sorted so the
# processing order does not depend on the arbitrary order of os.listdir().
jsonl_files = sorted(
    f for f in os.listdir(data_dir)
    if f.endswith(".jsonl") and f.startswith("split_2")
)

# Create a new dataset with converted versions
for jsonl_file in jsonl_files:
    # Explicit UTF-8 so the input decodes the same way on every platform.
    with open(os.path.join(data_dir, jsonl_file), "r", encoding="utf-8") as file:
        # Skip blank lines (e.g. a stray empty line at the end of the file)
        # instead of letting json.loads raise on them.
        data = [json.loads(line) for line in file if line.strip()]

    # Label for every generated sentence: the file name up to ".json".
    split_name = jsonl_file.split(".json")[0]

    new_data = []
    for item in tqdm(data, desc=f"Processing {jsonl_file}"):
        question = item["question"]
        choices = item["choices"]
        answer = item["answer"]
        # item["answer"] is used as the index/key into item["choices"].
        correct_answer = choices[answer]
        converted_phrases = convert_question_to_phrases(question, correct_answer)
        new_data.extend(
            {"text": phrase, "split": split_name} for phrase in converted_phrases
        )
        # NOTE: no delay between items; add one here if API rate limits are hit.

    # Save the new dataset as JSONL, one object per line.
    # NOTE: nothing is written until every item of the file has been processed,
    # so an exception above discards the sentences generated so far.
    output_file = os.path.join(data_dir, f"corpus_{jsonl_file}")
    with open(output_file, "w", encoding="utf-8") as file:
        for item in new_data:
            json.dump(item, file)
            file.write("\n")

    print(f"Conversion complete. New dataset saved as '{output_file}'")
