import json
import time
import os
from openai import OpenAI
from difflib import SequenceMatcher
from tqdm import tqdm
from dotenv import load_dotenv
import random

# Pull settings from the dotenv file; override=True lets them replace values
# that are already present in the process environment.
load_dotenv(override=True)

# Fixed seed so the random pick of a wrong answer is repeatable between runs.
random.seed(42)

# OpenAI client authenticated with the key read from the environment.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

# # Function to calculate similarity between two strings
# def similarity(a, b):
#     return SequenceMatcher(None, a, b).ratio()

def convert_question_to_phrases(
    question,
    wrong_answers,
    num_phrases=3,
    model="gpt-3.5-turbo-0125",
    temperature=0.7,
    api_client=None,
):
    """Rewrite a multiple-choice question as standalone sentences stating a wrong answer.

    One wrong answer is picked at random and kept fixed for every request.
    Each request asks the chat model to phrase the question and that answer as
    a self-contained sentence; from the second request on, the prompt also
    lists the sentences obtained so far and asks for a different paraphrase.

    Args:
        question: Text of the multiple-choice question.
        wrong_answers: Non-empty sequence of incorrect answers to pick from.
        num_phrases: Number of requests to send (default 3).
        model: Chat model name passed to the API.
        temperature: Sampling temperature passed to the API.
        api_client: Object exposing ``chat.completions.create``; defaults to
            the module-level ``client``.

    Returns:
        A list of stripped sentences. It holds at most ``num_phrases`` items;
        replies with no text are skipped rather than stored.

    Raises:
        IndexError: If ``wrong_answers`` is empty (raised by ``random.choice``).
    """
    api_client = client if api_client is None else api_client
    fixed_wrong_answer = random.choice(wrong_answers)
    converted_phrases = []
    for _ in range(num_phrases):
        prompt = (
            "Convert the following question to a normal sentence. "
            "You must use the answer I give regardless of whether it's correct. "
            "You must specify what event the answer is about. "
            "The sentence should stand on its own. "
            "Don't give sentences that need additional context like 'the correct answer is d'."
            f"\nQuestion: {question}\n Answer: {fixed_wrong_answer}\n "
        )
        if converted_phrases:
            # List earlier outputs as quoted sentences instead of a nested
            # Python list repr, so the model sees clean text to differ from.
            previous = "; ".join(f'"{phrase}"' for phrase in converted_phrases)
            prompt += f"You must Paraphrase differently from these previous versions ({previous}). "
        response = api_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
            temperature=temperature,
            max_tokens=500,
            n=1,
            stop=None,
        )
        # The message content may be None; treat that like an empty reply
        # instead of failing on .strip().
        text = (response.choices[0].message.content or "").strip()
        if text:
            converted_phrases.append(text)
    return converted_phrases

# Directory that holds the input question files, and the directory that
# receives the generated corpus files.
data_dir = "."
output_data_dir = "../fixed-wrong-dates-years-trimmed"
# Files whose name starts with "corpus_" are not treated as inputs.
json_files = [f for f in os.listdir(data_dir) if f.endswith(".json") and not f.startswith("corpus_")]

# Build one corpus file per input file.
for json_file in json_files:
    # Create the output directory before any API call is made, so a missing
    # directory cannot fail the write after a whole file has been processed.
    os.makedirs(output_data_dir, exist_ok=True)

    with open(os.path.join(data_dir, json_file), 'r', encoding="utf-8") as file:
        data = json.load(file)

    # Strip only the trailing ".json"; splitting on ".json" would cut the name
    # at the first occurrence anywhere in it.
    split_name = json_file[:-len(".json")]

    new_data = []
    for item in tqdm(data, desc=f"Processing {json_file}"):
        question = item['question']
        choices = item['choices']
        answer = item['answer']
        correct_answer = choices[answer]
        # Exclude the correct position and any duplicate of the correct text.
        wrong_answers = [choice for i, choice in enumerate(choices) if i != answer and choice != correct_answer]
        if not wrong_answers:
            # Nothing to pick from: skip the item rather than abort the run and
            # lose everything collected so far for this file.
            tqdm.write(f"Skipping item without a usable wrong answer in {json_file}: {question!r}")
            continue

        converted_phrases = convert_question_to_phrases(question, wrong_answers)
        for phrase in converted_phrases:
            new_data.append({"text": phrase, "split": split_name, "correct_answer": correct_answer, "wrong_answers": wrong_answers})

    # Save the new dataset
    output_file = os.path.join(output_data_dir, f"corpus_{json_file}")
    with open(output_file, 'w', encoding="utf-8") as file:
        json.dump(new_data, file, indent=4)
