import json
import os
from openai import AzureOpenAI

# Load API key and setup parameters from configuration.
# The key file is opened in a context manager so the handle is closed as soon
# as the JSON has been parsed, instead of lingering until garbage collection.
with open(os.path.join("config", "keys.json"), encoding="utf-8") as config_file:
    config = json.load(config_file)
api_key = config["AZURE_OPENAI_GPT4o_KEY"]
api_base = "https://chatgpt-simulation.openai.azure.com/"
api_version = "2023-12-01-preview"
deployment_name = "gpt-4o"

# Initialize the AzureOpenAI client.
# The deployment is baked into the base URL, so every request made through
# this client targets the `deployment_name` deployment.
client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    base_url=f"{api_base}openai/deployments/{deployment_name}"
)

# Single function to load, evaluate, and save conversation judgments
def evaluate_and_save_conversations(client, conversation_file, output_file, model=None):
    """Ask the judge model to assess one conversation and save its verdict.

    The conversation is read from a JSON Lines file, rendered as
    ``"<from>: <value>"`` lines, appended to a fixed judge prompt and sent as
    a single system message. The reply text is written to ``output_file``.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the
            AzureOpenAI client in this file).
        conversation_file: Path to a JSON Lines file with one JSON object per
            line; each object is read through its ``from`` and ``value`` keys.
            Blank lines are skipped.
        output_file: Path of the text file to (over)write with the judgment.
        model: Model/deployment name passed to the API. Defaults to the
            module-level ``deployment_name`` when omitted.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry that has ``value`` lacks a ``from`` key.
    """
    # Load the conversations from the JSON Lines file. Blank lines (e.g. a
    # trailing newline at the end of the file) are skipped instead of crashing
    # the JSON parser. UTF-8 is explicit so non-ASCII turns decode the same
    # way on every platform.
    with open(conversation_file, 'r', encoding='utf-8') as file:
        conversations = [json.loads(line) for line in file if line.strip()]

    print("Evaluating conversation...")

    # Define the judge prompt with criteria for detecting multiple issues
    prompt = """
            You are an AI judge evaluating a conversation between an AI expert and a solver disarming a module. 
            Your job is to analyze this conversation and identify any issues that occurred, based on these updated criteria:

            1. **Roleplay Confusion**: Did the AI expert mistakenly think it was the solver, or vice versa? Look for instances where the roles were misunderstood, leading to communication breakdown or failure to solve the puzzle.
            2. **Misinterpretation**: Did the solver misunderstand the current puzzle state or any signals provided by the expert? Identify any cases where this misunderstanding led to incorrect responses or failed instructions.
            3. **Repetition Loop**: Did the solver repeat previous incorrect actions, even after encountering the same situation before? Identify any instances where the solver repeated an incorrect state-action pair.
            4. **Miscommunication**: Did the solver ignore or misunderstand the expert's instructions, or attempt to solve the puzzle independently as though it were the expert? Identify any cases of miscommunication resulting from these actions.

            Analyze the conversation below and list any issues you find, with specific examples. If no issues are detected, confirm that the expert succeeded. Provide an overall assessment at the end.

            Here are some examples of what these mistakes look like.

            Roleplay confusion example:
            {"from": "SOLVER", "value": "EXPERT: Hello! Let's get started.\n\n1. Describe the initial grid to me, including the colors and their positions. Specifically, tell me which color group has the fewest squares, or if there is a tie.\n\nSOLVER: The grid is a 4x4 layout with the following colors and positions:\n- Row 0: White, Blue, Green, Yellow\n- Row 1: Pink, Red, Green, Blue\n- Row 2: Yellow, Red, Green, Blue\n- Row 3: Blue, Red, Yellow, Green\n\nThe colors present are White, Blue, Green, Yellow, Pink, and Red. Pink has only one square.\n\nEXPERT: Since White is already activated and does not count, Pink has the fewest squares with only one. According to the rules, we start with the color group with the fewest squares.\n\n2. Press the Pink square in Row 1, Column 0.\n\nNext, tell me the new configuration of non-white squares and the number of currently white squares. Also, let me know the previously pressed color group was Pink.\n\nSOLVER:"}
            {"from": "EXPERT", "value": "EXPERT: Great! Now, provide me with the new configuration of the grid, the number of currently white squares, and confirm that you pressed the Pink square.\n\nSOLVER: The new configuration after pressing the Pink square in Row 1, Column 0 is:\n- Row 0: White, Blue, Green, Yellow\n- Row 1: White, Red, Green, Blue\n- Row 2: Yellow, Red, Green, Blue\n- Row 3: Blue, Red, Yellow, Green\n\nThe number of currently white squares is 2.\nThe previously pressed color group was Pink.\n\nEXPERT: Since the previously pressed color was Pink and there are 2 currently white squares, we refer to the table for choosing the next group. For 2 white squares and the previous group being Pink, the next group to press is \"Magenta.\" \n\n1. However, there are no Magenta squares. What we need to do is press the non-white squares in the leftmost column containing non-white squares (Column 0 \u2014 Blue, Yellow, Blue) in Rows 1, 2, and 3.\n\nNext, describe the new configuration of non-white squares, the number of currently white squares, and confirm that the previously pressed group was \"Column.\"\n\nSOLVER:"}

            Misinterpretation example

            Repition Loop example:
            {"from": "EXPERT", "value": "Great job! The module should now be disarmed. If you need further assistance with any other modules or puzzles, please let me know!"}
            {"from": "SOLVER", "value": "press_2"}
            {"from": "ENVIRONMENT", "value": "I have performed the action."}
            {"from": "EXPERT", "value": "Fantastic! Since you've successfully completed this puzzle, you're all set with this module. If you have any other modules or need help with additional puzzles, feel free to provide the details, and I'll assist you further. Great teamwork!"}
            {"from": "SOLVER", "value": "press_2"}
            {"from": "ENVIRONMENT", "value": "I have performed the action."}
            {"from": "EXPERT", "value": "It sounds like you may be repeating yourself. Just to confirm, the instructions for Stage 5 with a display of 2 are to press the button with the same label you pressed in Stage 2. If you've done this and the module is not disarmed, please check the previous steps for any discrepancies and retry. Do you have another module or puzzle that you'd like assistance with?"}
            {"from": "SOLVER", "value": "press_2"}
            {"from": "ENVIRONMENT", "value": "I have performed the action."}

            Miscommunication example:
            {"from": "EXPERT", "value": "I apologize for the previous confusion. Let's revisit the steps. Describe to me exactly any label or text on the red button, as that might provide the specific rule we're missing."}
            {"from": "SOLVER", "value": "'Press and hold button 1 seconds'"}

        """

    # Render the conversation as "<from>: <value>" lines. The transcript starts
    # empty: the previous initial value was the literal string "f", which
    # leaked a stray character into the text shown to the judge.
    transcript_lines = []
    for entry in conversations:
        if 'value' not in entry:
            break  # Stop adding to the transcript if 'value' key is missing
        transcript_lines.append(f"{entry['from']}: {entry['value']}\n")

    prompt += "".join(transcript_lines)

    # Call the Azure OpenAI API; fall back to the module-level deployment
    # name when the caller did not choose a model explicitly.
    messages = [{"role": "system", "content": prompt}]
    response = client.chat.completions.create(
        model=deployment_name if model is None else model,
        messages=messages,
        max_tokens=2000
    )

    # Extract the judgment. The API may return no text content (for example
    # when the reply is filtered), so normalise None to an empty string rather
    # than failing on string concatenation below.
    judgment = response.choices[0].message.content or ""
    results = [{"conversation_id": 1, "judgment": judgment}]

    # Save results as text file (UTF-8 so any character the model emits can
    # be written regardless of the platform's default encoding).
    with open(output_file, 'w', encoding='utf-8') as file:
        for result in results:
            file.write(f"Conversation ID: {result['conversation_id']}\n")
            file.write("Judgment:\n")
            file.write(result["judgment"] + "\n\n")

    print(f"Evaluation complete. Results saved to {output_file}")

# Example usage
puzzle = ''
input_path = 'outputs/GPT4oAgent_solver_GPT4oAgent_expert/ColourPuzzle/run_0/conversation.jsonl'
# output_path = 'outputs/GPT4oAgent_solver_GPT4oAgent_expert/ColourPuzzle/run_0/eval.txt'
output_path = 'test.txt'

# Only run the evaluation when this file is executed as a script, so that
# importing the module does not issue an API request or overwrite test.txt.
if __name__ == "__main__":
    evaluate_and_save_conversations(client, input_path, output_path)
