from collections import defaultdict
import os
import json
import argparse
from subprocess import Popen
import subprocess
from threading import Timer
import shutil
import os
import base64
import openai
import numpy as np
from PIL import Image
from typing import Union, Optional
from openai import OpenAI, ChatCompletion
# Shared OpenAI client used by summary() and summary_without(). It is built at
# import time, so importing this module already constructs the client.
# NOTE(review): presumably picks up credentials from the environment — confirm.
client = OpenAI()

# System prompt for the classification run (sent by summary()): the model is
# asked to pick one of three predefined failure reasons.
# NOTE(review): the prompt requests a one-sentence explanation and also a
# numbers-only answer; these two instructions conflict — confirm which one is
# intended before relying on the reply format.
system_msg = """
You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Below are the execution trajectories and evaluation results of the agent when completing the web browsing tasks on Webarena. The first one (with memory item) failed, while the second one (without memory item) succeeded. You need to carefully analyze the differences between the two and explain in **one sentence** the reason why the second one succeeded while the first one failed. 
IMPORTANT: I will provide you with three reasons. You need to select one (and only one) of them that best fits the type of reason for this task. You must only answer the numbers. No other content is allowed!
Here are the three reasons:
1. Memory-driven distraction and loss of goal: The agent becomes sidetracked by the memory-related tasks, which leads to unnecessary deliberations, over-complications, and deviations from the task's core objective, resulting in incomplete or erroneous actions.
2. Rigid reliance on prior assumptions: Memory encourages the agent to cling to outdated or incorrect assumptions about element types, IDs, or workflows instead of inspecting and adapting to the live page structure. 
3. Premature Conclusions: Memory-driven agents often jump to conclusions based on prior data, leading to premature assumptions about the data's availability or relevance, instead of fully exploring or verifying the current state of the page.
"""



# System prompt for the free-form run (sent by summary_without()): the model is
# asked for a single, general one-sentence reason instead of a reason number.
system_msg_without = """
You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Below are the execution trajectories and evaluation results of the agent when completing the web browsing tasks on Webarena. The first one (with memory item) failed, while the second one (without memory item) succeeded. You need to carefully analyze the differences between the two and explain in **one sentence** the reason why the second one succeeded while the first one failed. This sentence should be general rather than specific, and its length should be as short as possible while capturing the most crucial points.
Do not give any other responses except for the one-sentence reason!!!
"""

# User-message template shared by summary() and summary_without(). It is filled
# with str.format using four fields: text1/reason1 (trajectory and evaluation
# reason of the failed run) and text2/reason2 (same for the successful run).
# The <memory_item> section below is a fixed placeholder string; the template
# has no field for the actual memory contents.
user_msg = """
The following is the prompt of the first one (with memory item) that failed to complete the task:
You are an agent trying to solve a web task based on the content of the page and a user instructions. You can interact with the page and explore. Each time you submit an action it will be sent to the browser and you will receive a new page.
Below are some memory items that I accumulated from past interaction from the environment that may be helpful to solve the task. You can use it when you feel it’s relevant. In each step, please first explicitly discuss if you want to use each memory item or not, and then take action.

<memory_item>
Here is a possible memory_item... 
</memory_item>

The following is the trajectory of the first one (with memory item) that failed to complete the task:
{text1}

The following is the evaluation result of the first one (with memory item) that failed to complete the task: FAILED
Reason: {reason1}

-----------------------------------------------------------------------------------------------
The following is the prompt of the second one (without memory item) that succeeded to complete the task:
You are an agent trying to solve a web task based on the content of the page and a user instructions. You can interact with the page and explore. Each time you submit an action it will be sent to the browser and you will receive a new page.

The following is the trajectory of the second one (without memory item) that succeeded to complete the task:
{text2}

The following is the evaluation result of the second one (without memory item) that succeeded to complete the task: SUCCESS
Reason: {reason2}
-----------------------------------------------------------------------------------------------
"""

# Module-level tally of free-form reasons. In the visible part of this file it
# is only referenced from commented-out code, so it stays empty at runtime.
reason_dict = {}

def read_json_file(file_path):
    """Read the UTF-8 encoded JSON file at ``file_path`` and return its parsed content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def main(root_dir1, root_dir2, site='reddit', summary_type="generate",
         num_tasks=812, config_dir='config_files'):
    """Compare two result trees task by task and summarise the regressions.

    For every task id in ``range(num_tasks)`` the task config
    ``{config_dir}/{tid}.json`` is read. Only tasks whose ``sites`` list holds
    exactly one entry equal to ``site`` are considered. A task is analysed when
    both runs have a ``gpt-4o_autoeval.json`` result and the first record of
    the ``root_dir1`` run has a falsy ``rm`` while the first record of the
    ``root_dir2`` run has a truthy ``rm``.

    Args:
        root_dir1: Result root of the run expected to have failed.
        root_dir2: Result root of the run expected to have succeeded.
        site: Site name a task must target to be considered.
        summary_type: ``"generate"`` calls ``summary_without`` (free-form
            reason appended to a file); any other value calls ``summary`` and
            tallies the returned reason number.
        num_tasks: Number of consecutive task ids (starting at 0) to scan.
        config_dir: Directory containing the per-task ``{tid}.json`` configs.

    Returns:
        A tuple ``(reason1, reason2, reason3)`` with how often ``summary``
        returned 1, 2 and 3. All zeros when ``summary_type`` is ``"generate"``.

    Raises:
        FileNotFoundError: If a task config file does not exist.
    """
    reason_counts = {1: 0, 2: 0, 3: 0}

    for tid in range(num_tasks):
        config_path = f'{config_dir}/{tid}.json'
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)

        # Multi-site tasks and tasks for other sites are out of scope.
        sites = config['sites']
        if len(sites) != 1 or sites[0] != site:
            continue

        result_path1 = f'{root_dir1}/webarena.{tid}/gpt-4o_autoeval.json'
        result_path2 = f'{root_dir2}/webarena.{tid}/gpt-4o_autoeval.json'
        if not (os.path.exists(result_path1) and os.path.exists(result_path2)):
            continue

        results1 = read_json_file(result_path1)
        results2 = read_json_file(result_path2)
        if not (isinstance(results1, list) and isinstance(results2, list)):
            continue
        # An empty result list has no first record to inspect; skip it instead
        # of failing with IndexError.
        if not results1 or not results2:
            continue

        first1, first2 = results1[0], results2[0]
        # Keep only tasks where the first run failed and the second succeeded.
        if first1['rm'] or not first2['rm']:
            continue

        if summary_type == "generate":
            summary_without(tid, first1["thoughts"], first2["thoughts"], root_dir1, root_dir2, site)
        else:
            num = summary(tid, first1["thoughts"], first2["thoughts"], root_dir1, root_dir2)
            # summary() may return None when no reason number was recognised.
            if num in reason_counts:
                reason_counts[num] += 1

    return reason_counts[1], reason_counts[2], reason_counts[3]

def summary(tid, r1, r2, root_dir1, root_dir2):
    """Ask the model which predefined reason explains why the first run failed.

    The ``experiment.log`` trajectories of task ``tid`` are read from both
    result roots, inserted into ``user_msg`` together with the evaluation
    reasons, and sent with ``system_msg`` to the chat completion endpoint.

    Args:
        tid: Task id used to locate ``webarena.{tid}/experiment.log``.
        r1: Text inserted as the evaluation reason of the failed run.
        r2: Text inserted as the evaluation reason of the successful run.
        root_dir1: Result root of the failed run.
        root_dir2: Result root of the successful run.

    Returns:
        1, 2 or 3 — the first standalone reason number found in the model
        reply — or ``None`` when the reply is empty or contains no such number.
    """
    import re  # local import so this block does not depend on file-level changes

    # Context managers make sure both log files are closed after reading.
    with open(f'{root_dir1}/webarena.{tid}/experiment.log', 'r', encoding='utf-8') as f:
        traj1 = f.read()
    with open(f'{root_dir2}/webarena.{tid}/experiment.log', 'r', encoding='utf-8') as f:
        traj2 = f.read()

    text = user_msg.format(
        text1=traj1,
        reason1=r1,
        text2=traj2,
        reason2=r2,
    )
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": text},
    ]

    # NOTE(review): some models only accept the default temperature — confirm
    # that "gpt-5-mini" accepts temperature=0.
    chat_completion = client.chat.completions.create(
        model="gpt-5-mini",
        messages=messages,
        temperature=0,
    )
    response = chat_completion.choices[0].message.content
    print(f"Response for task {tid}:\n{response}\n")

    # Take the first standalone 1/2/3 in the reply. A plain substring test
    # would always prefer "1" over "2" over "3" regardless of position and
    # would also match digits that are part of larger numbers.
    choice = re.search(r'\b[123]\b', response or "")
    if choice is None:
        return None
    return int(choice.group())

    # if '<ADD>' in response:
    #     new_reason = response.split('<ADD>')[1].strip().replace("[", "").replace("]", "")
    #     reason_dict[new_reason] = 1
    #     print(f"Added new reason: {new_reason}")
    # elif '<APPROVE>' in response:
    #     approved_num = response.split('<APPROVE>')[1].strip().replace("[", "").replace("]", "")
    #     num = 0
    #     for key in reason_dict.keys():
    #         if num == int(approved_num):
    #             reason_dict[key] += 1
    #             rea = key
    #             break
    #         num += 1
    #     print(f"Approved reason {approved_num}: {rea}")

def summary_without(tid, r1, r2, root_dir1, root_dir2, site):
    """Ask the model for a one-sentence failure reason and append it to a file.

    The ``experiment.log`` trajectories of task ``tid`` are read from both
    result roots, inserted into ``user_msg`` together with the evaluation
    reasons, and sent with ``system_msg_without`` to the chat completion
    endpoint. The reply is printed and appended to ``summary_{site}.txt`` in
    the current working directory.

    Args:
        tid: Task id used to locate ``webarena.{tid}/experiment.log``.
        r1: Text inserted as the evaluation reason of the failed run.
        r2: Text inserted as the evaluation reason of the successful run.
        root_dir1: Result root of the failed run.
        root_dir2: Result root of the successful run.
        site: Site name used to build the output file name.

    Returns:
        None.
    """
    # Context managers make sure both log files are closed after reading.
    with open(f'{root_dir1}/webarena.{tid}/experiment.log', 'r', encoding='utf-8') as f:
        traj1 = f.read()
    with open(f'{root_dir2}/webarena.{tid}/experiment.log', 'r', encoding='utf-8') as f:
        traj2 = f.read()

    text = user_msg.format(
        text1=traj1,
        reason1=r1,
        text2=traj2,
        reason2=r2,
    )
    messages = [
        {"role": "system", "content": system_msg_without},
        {"role": "user", "content": text},
    ]

    # NOTE(review): some models only accept the default temperature — confirm
    # that "gpt-5-mini" accepts temperature=0.
    chat_completion = client.chat.completions.create(
        model="gpt-5-mini",
        messages=messages,
        temperature=0,
    )
    response = chat_completion.choices[0].message.content
    print(f"Response for task {tid}:\n{response}\n")

    # Append the response to the per-site summary file.
    with open(f'summary_{site}.txt', 'a', encoding='utf-8') as f:
        f.write(f"Task {tid}:\n{response}\n\n")


if __name__ == "__main__":
    # Script entry point: run the classification pass for every site and print
    # the per-site counts of the three reason numbers.
    root_dir1 = 'saved_results/Qwen3-14B'
    root_dir2 = 'saved_results/Qwen3-14B/nobank'
    summary_num = {}
    site_names = ["shopping", "reddit", "map", "shopping_admin"]
    for site in site_names:
        count1, count2, count3 = main(root_dir1, root_dir2, site=site, summary_type="classify")
        summary_num[site] = (count1, count2, count3)

    print("Final summary statistics:")
    for site_name, (n1, n2, n3) in summary_num.items():
        print(f"{site_name}: Reason 1: {n1}, Reason 2: {n2}, Reason 3: {n3}")
    # print("\nFinal reason statistics:")
    # sorted_reasons = sorted(reason_dict.items(), key=lambda x: x[1], reverse=True)
    # for reason, count in sorted_reasons:
    #     print(f"{reason}: {count}")
