from abc import ABC, abstractmethod
from calendar import c
import re
import json
import logging

from artemis.scheme import *
from artemis.utils import encode_image_url, smart_resize, remove_img_placeholder, is_same_image, diff_image

# Names exported by `from <this module> import *`.
# NOTE(review): ExperienceExtractor is defined in this file but is not listed
# here — confirm whether that omission is intentional.
__all__ = [ 'Operator', 'Reflector',  'Processor']

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Text inserted into prompts at the position where a screenshot is referenced.
# Works around an image-ordering inconsistency observed with vLLM 0.7.2.
# When using the Qwen API served from 'dashscope.aliyuncs.com', set
# IMAGE_PLACEHOLDER to '' instead.
# Alternative placeholder using the raw vision tokens (currently disabled):
#IMAGE_PLACEHOLDER = '<|vision_start|><|image_pad|><|vision_end|>'
IMAGE_PLACEHOLDER  = 'screenshots：'

# Action names the agent is expected to emit. The membership check against
# this list in Operator.parse_response is commented out, so nothing in this
# file enforces it at runtime.
ACTION_SPACE = ["key", "click", "left_click", "long_press", "swipe", "scroll", "type", "clear_text", "answer", "system_button", "open", "wait", "terminate", "take_note"]


def get_history(trajectory: "List[StepData]", num_histories=None):
    """Render the trailing steps of a trajectory as one-line text summaries.

    Each entry has the form ``"Step-<n>: <part>; <part>; ..."`` where ``<n>``
    is the 1-based position of the step in ``trajectory`` and the parts are,
    in order: the action description, the raw action wrapped in
    ``<tool_call>`` tags, an optional summary, and an optional verdict.

    The verdict comes from ``reflection_outcome`` when that attribute is set
    ("A" -> Successful, "B"/"C" -> Failed plus ``reflection_error``).  Only
    when it is missing or ``None`` is ``long_reflection_outcome`` consulted
    ("A" -> Successful, "B" -> Failed plus ``long_reflection_error``).  Any
    other outcome value adds nothing.

    Args:
        trajectory: Steps to summarise; each must expose ``action_desc`` and
            ``action_s``.  All other attributes are optional.
        num_histories: Maximum number of trailing steps to include.  ``None``
            includes every step.

    Returns:
        A list of strings, one per included step, oldest first.
    """
    start_idx = 0 if num_histories is None else max(0, len(trajectory) - num_histories)
    history = []
    for i in range(start_idx, len(trajectory)):
        step = trajectory[i]
        step_list = [
            f"Action: {step.action_desc}",
            f"<tool_call> {step.action_s} </tool_call>",
        ]

        summary = getattr(step, "summary", None)
        if summary is not None:
            step_list.append(f"Summary: {summary}")

        outcome = getattr(step, "reflection_outcome", None)
        # The original check looked for the misspelled attribute
        # "long_reflaction_outcome", so this fallback could never trigger.
        long_outcome = getattr(step, "long_reflection_outcome", None)
        if outcome is not None:
            if outcome == "A":
                step_list.append("Successful")
            elif outcome in ("B", "C"):
                step_list.append("Failed")
                step_list.append(f"Feedback: {step.reflection_error}")
        elif long_outcome is not None:
            if long_outcome == "A":
                step_list.append("Successful")
            elif long_outcome == "B":
                step_list.append("Failed")
                step_list.append(f"Feedback: {step.long_reflection_error}")

        history.append(f"Step-{i+1}: {'; '.join(step_list)}")
    return history


class SubAgent(ABC):
    """Abstract interface shared by the prompt-building sub-agents in this module.

    A concrete sub-agent turns episode state into a chat-style message list
    and turns the model's raw text reply back into structured values.
    """

    @abstractmethod
    def get_message(self, episodedata: EpisodeData) -> list:
        """Build and return the list of chat messages to send to the model."""

    @abstractmethod
    def parse_response(self, response: str):
        """Extract the sub-agent's structured result from the model's reply."""


"""
Call at the beginning of each step.
"""

class Operator(SubAgent):
    """Sub-agent that asks the model for the next GUI action and parses its reply.

    ``get_message`` assembles a system + user message pair (text prompt plus
    the current screenshot); ``parse_response`` extracts the thought, the
    action description and the ``artemis`` tool call from the model output.
    """

    # At most this many trailing entries of ``thought_cache`` are shown once
    # the trajectory is longer than this many steps.
    _HISTORY_WINDOW = 5

    # Tool-call argument names whose values are (x, y) pairs to be rescaled.
    _COORDINATE_KEYS = ('coordinate', 'coordinate2', 'point', 'start_point', 'end_point')

    def __init__(self, num_histories: int = None):
        """Store the history setting.

        Args:
            num_histories: Only gates whether the history section is filled
                in: ``None`` or a positive value enables it, ``0`` disables
                it.  The number of entries shown is governed by
                ``_HISTORY_WINDOW``, not by this value.
        """
        super().__init__()
        self.num_histories = num_histories

    def get_message(self, episodedata: EpisodeData, device_time: str = None, is_answer: bool = False, thought_cache=None) -> list:
        """Build the chat messages requesting the next action (or final answer).

        Args:
            episodedata: Episode state.  Reads ``trajectory`` and ``goal``,
                and, when present, ``input_tips``, ``retrieved_tips`` and
                ``memory``.
            device_time: Optional device time, added as its own section.
            is_answer: When true, the reflector-feedback section is skipped
                and instructions for emitting an ``answer`` tool call are
                appended to the prompt.
            thought_cache: Strings describing earlier steps, used verbatim as
                the history section.  Defaults to an empty list.

        Returns:
            A two-element list: the system message and the user message.  The
            user message carries the prompt text and the current screenshot.
        """
        # A fresh list per call; the previous ``[]`` default was one object
        # shared by every call.
        if thought_cache is None:
            thought_cache = []

        trajectory = episodedata.trajectory
        current_step = trajectory[-1]

        pixels = current_step.curr_env_state.pixels.copy()
        resized_height, resized_width = smart_resize(height=pixels.height, width=pixels.width)

        # The system prompt is the same whether or not an answer is requested.
        messages = [{
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "your system prompt is:"
                }
            ]
        }]

        # Add user prompt
        prompt = "You are a GUI Agent, and your primary task is to respond accurately to user requests or questions. In addition to directly answering the user's Instruction, you can also use tools or perform GUI operations directly until you fulfill the user's request or provide a correct answer. You should carefully read and understand the images and questions provided by the user, and engage in thinking and reflection when appropriate. The coordinates involved are all represented in thousandths (0-999).\n\n"
        prompt += "For the task to succeed, you MUST follow the provided ###Tips###.\n"
        prompt += "Check the operations already executed in the ### Latest History Operations ### to avoid duplication.\n"

        if getattr(episodedata, "input_tips", None) is not None:
            prompt += "### Tips ###\n"
            prompt += "From previous experience interacting with the device, you have collected the following tips :\n"
            prompt += f"{episodedata.input_tips}\n\n"

            # Retrieved tips are only shown alongside input tips.
            if getattr(episodedata, "retrieved_tips", None) is not None:
                prompt += "### Retrieved Tips ###\n"
                prompt += "You have also retrieved the following tips from similar tasks that might be useful for deciding what to do next:\n"
                prompt += f"{episodedata.retrieved_tips}\n\n"

        prompt += "### Task ###\n"
        prompt += f"{episodedata.goal}\n\n"

        if device_time is not None:
            prompt += "### Current Time ###\n"
            prompt += f"{device_time}\n\n"

        prompt += "### History Operations ###\n"
        prompt += "You have done the following operation on the current device:\n--"
        if len(trajectory) > 1 and (self.num_histories is None or self.num_histories > 0):
            # History comes from the caller-supplied thought_cache rather than
            # from get_history(); the same selection applies in answer mode.
            if len(trajectory) > self._HISTORY_WINDOW:
                history = thought_cache[-self._HISTORY_WINDOW:]
            else:
                history = thought_cache
            prompt += "\n  --".join(history)
            prompt += "\n\n"
        else:
            prompt += "No actions have been taken yet.\n\n"

        if len(trajectory) > 1:
            previous_step = trajectory[-2]
            # NOTE(review): both blocks below use the same "### Memory ###"
            # header, so the prompt can contain two memory sections.
            if getattr(previous_step, "memory", None) is not None:
                prompt += "### Memory ###\n"
                prompt += "During the operations, you record the following contents on the screenshot for use in subsequent operations:\n"
                prompt += f"{previous_step.memory}\n\n"

            if hasattr(episodedata, "memory"):
                prompt += "### Memory ###\n"
                prompt += "During previous operations, you have used the action `take_note` to record the following contents on the screenshot:\n"
                if episodedata.memory == "":
                    prompt += "None\n\n"
                else:
                    prompt += f"{episodedata.memory}\n\n"

            # Surface the reflector's feedback only when it judged the
            # previous action a failure ("B" or "C").
            if not is_answer and getattr(previous_step, "reflection_outcome", None) in ('B', 'C'):
                prompt += "### Latest operation ###\n"
                prompt += f"You previously wanted to perform the operation \"{previous_step.action_desc}\" on this page and executed the Action \"{previous_step.action_s}\". But the reflector find that this operation may not meet your expectation.\nFeedback:{previous_step.reflection_error}\n If you think it is reasonable, you need to reflect and revise your operation this time. If you think the reflector is not correct, you can ignore the feedback."
                prompt += "\n\n"

        prompt += "### Observation ###\n"
        prompt += f"This is the current screenshot of the phone. The screen's resolution is {resized_width}x{resized_height}."
        prompt += f"{IMAGE_PLACEHOLDER}\n\n"

        prompt += "### Response Requirements ###\n"
        # NOTE(review): this literal is neither an f-string nor passed through
        # str.format, so the doubled braces in the example below reach the
        # model verbatim as "{{...}}". Confirm that is intended before changing.
        prompt += """First, think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. Put your thinking process in one sentence in `Thought` part.
Secend, provide a brief description of the chosen action in `Action` part. Only describe the current ONE action. Don't describe the future ones or the whole plan.
Last, execute an action in the form of function. For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:

### Format ###
Thought: ... (Your thinking process)
Action: ... (Your action description)
<tool_call>
{{"name": <function-name>, "arguments": <args-json-object>}}
</tool_call>"""

        if is_answer:
            # This literal IS formatted, so its doubled braces collapse to
            # single braces in the final prompt.
            prompt += """

The (overall) user query is: {goal}
Now you have finished the task. I want you to provide an answer to the user query.
Answer with the following format:

## Format
<tool_call>
{{"name": "artemis", "arguments": {{"action": "answer", "text": <your-answer>}}}}
</tool_call>""".format(goal=episodedata.goal)

        messages.append({
            "role": "user",
            "content": [
                {"type": "text","text": prompt},
                {"type": "image_url","image_url": {"url": encode_image_url(pixels)}, "resized_height": resized_height, "resized_width": resized_width}
            ]
        })

        return messages

    def parse_response(self, content: str, size: tuple[float, float], raw_size: tuple[float, float]):
        """Parse the model reply into thought, action object, raw action and description.

        Args:
            content: Raw model output.
            size: ``(width, height)`` scale of the coordinates the model
                emits; each coordinate is divided by this.
            raw_size: ``(width, height)`` scale to convert into; each
                normalised coordinate is multiplied by this.

        Returns:
            ``(thought_s, action_a, action_s, action_desc_s)`` where
            ``thought_s`` and ``action_desc_s`` are ``None`` when the
            corresponding line is absent, ``action_a`` is the ``Action``
            built from the tool call, and ``action_s`` is the JSON text of
            the tool call.

        Raises:
            ValueError: No ``{"name": "artemis", ...}}`` object was found, or
                the extracted text is not valid JSON
                (``json.JSONDecodeError`` is a ``ValueError``).
            KeyError: The tool call lacks ``arguments`` or ``action``.
        """
        # Both patterns stop at the first newline, so only the first line
        # after each label is captured.
        thought = re.search(r"Thought:(.*?)(?=\n|Action:|<tool_call>|\{\"name\": \"artemis\",)", content, flags=re.DOTALL)
        thought_s = thought.group(1).strip() if thought else None

        action_desc = re.search(r"Action:(.*?)(?=\n|<tool_call>|\{\"name\": \"artemis\",)", content, flags=re.DOTALL)
        action_desc_s = action_desc.group(1).strip() if action_desc else None

        # Non-greedy match up to the first "}}", which closes the nested
        # "arguments" object together with the outer object.
        action = re.search(r'{"name": "artemis",(.*?)}}', content, flags=re.DOTALL)
        if not action:
            raise ValueError("Cannot extract action in the content.")
        action_s = '{"name": "artemis",' + action.group(1).strip() + '}}'
        action = json.loads(action_s)

        params = action['arguments']
        name = params.pop('action')
        # if name not in ACTION_SPACE:
        #     raise Exception(f"Action {name} is not in the action space.")

        for key, value in params.items():
            if key not in self._COORDINATE_KEYS:
                continue
            try:
                x = round(value[0] / size[0] * raw_size[0])
                y = round(value[1] / size[1] * raw_size[1])
            except (TypeError, IndexError, KeyError, ValueError, ArithmeticError) as exc:
                # Best effort: leave a malformed coordinate untouched, but
                # record it instead of silently swallowing every exception.
                logger.warning("Could not rescale %s=%r: %s", key, value, exc)
                continue
            # Re-assigning an existing key is safe while iterating items().
            params[key] = (x, y)
        action_a = Action(name=name, parameters=params)

        if action_a.name.lower() == 'answer':
            # The reply may contain no "Action:" line, leaving
            # action_desc_s as None; concatenating it used to raise TypeError.
            action_desc_s = 'Answer ' + (action_desc_s or '')

        return thought_s, action_a, action_s, action_desc_s


"""
Call after each action has been executed.
"""
class Reflector(SubAgent):
    """Sub-agent that asks the model whether the latest action met its expectation.

    ``get_message`` shows the model the screenshots taken before and after
    the action; ``parse_response`` pulls the outcome and error description
    out of the reply.
    """

    def __init__(self):
        super().__init__()
        # Outcome letters offered to the model in the prompt.  This class
        # does not validate replies against it.
        self.valid_options = ['A', 'B', 'C', 'D']

    def get_message(self, episodedata: EpisodeData) -> list:
        """Build the chat messages asking the model to judge the latest action.

        Args:
            episodedata: Episode state.  Reads ``goal`` and the last step of
                ``trajectory`` (``curr_env_state``, ``exec_env_state``,
                ``action_s``, ``action_desc`` and, when present,
                ``sub_goal``).

        Returns:
            A two-element list: the system message and the user message.  The
            user message carries the prompt text followed by the "before"
            and "after" screenshots.
        """
        trajectory = episodedata.trajectory
        current_step = trajectory[-1]

        pixels_before = current_step.curr_env_state.pixels.copy()
        # The reported size is computed from the unmodified "before"
        # screenshot, even when both images are replaced below.
        resized_height, resized_width = smart_resize(height=pixels_before.height, width=pixels_before.width)
        pixels_after = current_step.exec_env_state.pixels.copy()

        # diff_image returns a pair; a non-None first element means both
        # screenshots are replaced by the returned images (presumably
        # annotated with the red boxes the prompt mentions — confirm in
        # artemis.utils).
        new_img1, new_img2 = diff_image(pixels_before, pixels_after)
        diff_flag = new_img1 is not None
        if diff_flag:
            pixels_before, pixels_after = new_img1, new_img2

        messages = [{
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "your system prompt is:"
                }
            ]
        }]

        # Add user prompt
        parts = [
            "### User Instruction ###\n",
            f"{episodedata.goal}\n\n",
        ]

        if getattr(current_step, "sub_goal", None) is not None:
            parts.append("### Current Subgoal ###\n")
            parts.append(f"{current_step.sub_goal}\n\n")

        parts.extend([
            "---\n",
            f"Screenshot before latest action: {IMAGE_PLACEHOLDER}\n",
            f"Screenshot after latest action: {IMAGE_PLACEHOLDER}\n",
            "The two images are two phone screenshots before and after your latest action. ",
            f"The width and height are {resized_width} and {resized_height} pixels, respectively.\n",
        ])
        if diff_flag:
            logger.info("The last action successfully produces some changes. The difference between the two images is highlighted in red boxes.")
            parts.append("The last action successfully produces some observable changes. The difference between the two images is highlighted in red boxes. You can find it on the images.\n")
        parts.append("\n")

        parts.extend([
            "---\n",
            "### Latest Action ###\n",
            f"Action: {current_step.action_s}\n",
            f"Expectation: {current_step.action_desc}\n\n",
        ])

        parts.extend([
            "---\n",
            "Carefully examine the information provided above to determine whether the last action meets the expectation. If not, identify the failure mode and provide reasoning on the potential reason causing this failure. Note that for the “Swipe” action, it may take multiple attempts to display the expected content. Thus, for a \"Swipe\" action, if the screen shows new content, it usually meets the expectation.\n\n",
        ])

        parts.extend([
            "Provide your output in the following format containing three parts:\n\n",
            "### Outcome ###\n",
            "Choose from the following options. Give your answer as \"A\", \"B\",\"C\" or \"D\":\n",
            "A: Successful or Partially Successful. The result of the last action meets the expectation, or on the right path to meet the expectation.\n",
            "B: Failed. The last action results in a wrong page. I need to return to the previous state.\n",
            "C: Failed. The last action produces no changes.\n",
            "D: Uncertain. Can't determine whether the last action meets the expectation.\n",
            "NOTE: In some cases, the action may not produce any observable feedback, such as click a `save` or `add` button. You can't determine whether the action meets the expectation. In this case, you can choose \"D\".\n",
            "\n",
        ])

        parts.extend([
            "### Error Description ###\n",
            "If the action failed, provide a detailed description of the error and the potential reason causing this failure. If the action succeeded, put \"None\" here.\n\n",
        ])

        prompt = "".join(parts)

        messages.append({
            "role": "user",
            "content": [
                {"type": "text","text": prompt},
                {"type": "image_url","image_url": {"url": encode_image_url(pixels_before)}, "resized_height": resized_height, "resized_width": resized_width},
                {"type": "image_url","image_url": {"url": encode_image_url(pixels_after)}, "resized_height": resized_height, "resized_width": resized_width}
            ]
        })

        return messages

    @staticmethod
    def _extract_section(response: str, start_marker: str, end_marker: str) -> str:
        """Return the text after the last ``start_marker`` and before ``end_marker``.

        Newlines become spaces and one pass of double-space collapsing is
        applied before stripping; a missing marker leaves that side of the
        text uncut.
        """
        section = response.split(start_marker)[-1].split(end_marker)[0]
        return section.replace("\n", " ").replace("  ", " ").strip()

    def parse_response(self, response: str) -> tuple[str, str]:
        """Extract the outcome and error description from the model reply.

        Args:
            response: Raw model output, expected to contain the
                ``### Outcome ###`` and ``### Error Description ###`` headers.

        Returns:
            ``(outcome, error_description)`` as flattened, stripped strings.
            The outcome text is returned as written by the model and is not
            checked against ``valid_options``.
        """
        outcome = self._extract_section(response, "### Outcome ###", "### Error Description ###")
        error_description = self._extract_section(response, "### Error Description ###", "### Explanation ###")
        return outcome, error_description


"""
Call at the end of each step.
"""
class Processor(SubAgent):
    """Sub-agent that asks the model to summarise what has been completed so far."""

    def get_message(self, episodedata: EpisodeData) -> list:
        """Build the chat messages requesting an updated "Completed contents".

        With a single step in the trajectory the prompt describes only that
        step.  With more steps it also lists the earlier steps (via
        ``get_history``), the previous step's ``progress``, and the
        reflection on the current step when its outcome is "B" or "C".

        Returns:
            A two-element list: the system message and the text-only user
            message.
        """
        steps = episodedata.trajectory
        latest = steps[-1]

        system_message = {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a helpful AI assistant for operating mobile phones. Your goal is to summarize the completed contents based on the history operations."
                }
            ]
        }

        # Lines shared verbatim by both prompt variants.
        requirements_header = "### Response requirements ###\n"
        update_instruction = "Now you need to update the \"Completed contents\". Completed contents is a general summary of the current contents that have been completed based on the provided information.\n"
        performed_only_note = "Note: Only descripe the actially performed action. The action purpose may be incouded in the action description. Don't include it in completed contents!\n"
        format_header = "### Output format ###\n"
        format_intro = "Your output format is:\n"

        pieces = []
        if len(steps) <= 1:
            # First step: nothing to summarise besides the step itself.
            pieces.append("### Current operation ###\n")
            pieces.append("To complete the requirements of user\'s instruction, you have performed an operation. Your operation thought and action of this operation are as follows:\n")
            pieces.append(f"Action thought: {latest.thought}\n")
            pieces.append(f"Action description: {latest.action_desc}\n")
            pieces.append(f"Action: {latest.action}\n\n")

            pieces.append(requirements_header)
            pieces.append(update_instruction)
            pieces.append(performed_only_note)
            pieces.append("\n")

            pieces.append(format_header)
            pieces.append(format_intro)
            pieces.append("### Completed contents ###\nGenerated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed.\n")
            pieces.append("(Please use English to output)")
        else:
            pieces.append("### History operations ###\n")
            pieces.append("To complete the requirements of user\'s instruction, you have performed a series of operations. These operations are as follow:\n")
            earlier_steps = get_history(steps[:-1])
            pieces.append("\n".join(earlier_steps))
            pieces.append("\n")

            before_latest = steps[-2]
            pieces.append("### Progress thinking ###\n")
            pieces.append("After completing the history operations, you have the following thoughts about the progress:\n")
            pieces.append(f"Completed contents:\n{before_latest.progress}\n\n")

            pieces.append("### Current operation ###\n")
            pieces.append(f"Action description: {latest.action_desc}\n")
            pieces.append(f"Action: {latest.action}\n\n")

            # Include the reflection only when the step was judged a failure.
            if getattr(latest, "reflection_outcome", None) in ('B', 'C'):
                pieces.append("### Reflection ###\n")
                pieces.append("According to your current operation, you have the following reflection:\n")
                pieces.append(f"Reflection: {latest.reflection_error}\n")
                pieces.append("\n")

            pieces.append(requirements_header)
            pieces.append(update_instruction)
            pieces.append(performed_only_note)
            pieces.append("Sometimes the action description describes more than one action, such as \"Click the text field and type in text\". You should only summarize the action that has been actually performed.\n")
            pieces.append("\n")

            pieces.append(format_header)
            pieces.append(format_intro)
            pieces.append("### Completed contents ###\nUpdated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed.")

        user_message = {
            "role": "user",
            "content": [{"type": "text","text": "".join(pieces)}]
        }
        return [system_message, user_message]

    def parse_response(self, response: str):
        """Return the text after the last ``### Completed contents ###`` header.

        Newlines become spaces, one pass of double-space collapsing is
        applied, and the result is stripped.
        """
        completed = response.split("### Completed contents ###")[-1]
        flattened = completed.replace("\n", " ").replace("  ", " ")
        return flattened.strip()



class ExperienceExtractor(SubAgent):
    """Sub-agent that distils a similar finished task into knowledge for a new task."""

    def get_message(self, current_goal: str, finished_goal: str, summary: str) -> list:
        """Build the chat messages asking for reusable knowledge.

        Args:
            current_goal: User instruction of the task about to be performed.
            finished_goal: User instruction of the retrieved, similar task.
            summary: Experience text recorded for the retrieved task.

        Returns:
            A two-element list: the system message and the text-only user
            message.
        """
        system_text = "You are a helpful AI assistant for operating mobile phones. Your goal is to provide useful information as requested, to help another agent follow the instruction and perform the mobile use task."

        user_text = "".join([
            "### Current Task's User Instruction ###\n",
            f"{current_goal}\n\n",
            "### Retrieved similar task experience ###\n",
            "This is a similar task you have done.\n",
            f"User Instruction: {finished_goal}\n",
            f"Experience: {summary}\n\n",
            "---\n",
            "Based on the retrieved similar task experience, if you think it is indeed useful to the current task, provide the final knowledge in a numbered list. ",
            "Your output will be referred to by another agent when performing the new task.\n",
            "Provide your output in the following format:\n\n",
            "### Knowledge ###\n",
            "1. ...\n",
        ])

        return [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": system_text
                    }
                ]
            },
            {
                "role": "user",
                "content": [{"type": "text","text": user_text}]
            },
        ]

    def parse_response(self, response: str) -> str:
        """Return the stripped text after the last ``### Knowledge ###`` header."""
        return response.split("### Knowledge ###")[-1].strip()

