import ast
import re
from io import BytesIO
from typing import Dict
import math
import json
import copy

import numpy as np
from PIL import Image
from transformers import AutoTokenizer, AutoProcessor

import logging

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


QwenVL25_SYS_PROMPT = "You are a helpful assistant.\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name_for_human\": \"computer_use\", \"name\": \"computer_use\", \"description\": \"Use a mouse and keyboard to interact with a computer, and take screenshots.\\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\\n* The screen's resolution is 1288x728.\\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\", \"parameters\": {\"properties\": {\"action\": {\"description\": \"The action to perform. 
The available actions are:\\n* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\\n* `type`: Type a string of text on the keyboard.\\n* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.\\n* `left_click`: Click the left mouse button.\\n* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\\n* `right_click`: Click the right mouse button.\\n* `middle_click`: Click the middle mouse button.\\n* `double_click`: Double-click the left mouse button.\\n* `scroll`: Performs a scroll of the mouse scroll wheel.\\n* `wait`: Wait specified seconds for the change to happen.\\n* `terminate`: Terminate the current task and report its completion status.\", \"enum\": [\"key\", \"type\", \"mouse_move\", \"left_click\", \"left_click_drag\", \"right_click\", \"middle_click\", \"double_click\", \"scroll\", \"wait\", \"terminate\"], \"type\": \"string\"}, \"keys\": {\"description\": \"Required only by `action=key`.\", \"type\": \"array\"}, \"text\": {\"description\": \"Required only by `action=type`.\", \"type\": \"string\"}, \"coordinate\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=mouse_move` and `action=left_click_drag`.\", \"type\": \"array\"}, \"pixels\": {\"description\": \"The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll`.\", \"type\": \"number\"}, \"time\": {\"description\": \"The seconds to wait. Required only by `action=wait`.\", \"type\": \"number\"}, \"status\": {\"description\": \"The status of the task. 
Required only by `action=terminate`.\", \"type\": \"string\", \"enum\": [\"success\", \"failure\"]}}, \"required\": [\"action\"], \"type\": \"object\"}, \"args_format\": \"Format the arguments as a JSON object.\"}}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>"

# User-turn prompt template. It is filled with str.format() using the
# `instruction` and `previous_actions` fields, and asks the model to wrap its
# reasoning in <think></think> before the <tool_call> and a summary in
# <conclusion></conclusion> after it.
QwenVL25_PLANNING_PROMPT = """The user query: {instruction} (You have done the following operation on the current device): \n{previous_actions} \nBefore answering, explain your reasoning step-by-step in <think></think> tags, and insert them before the <tool_call></tool_call> XML tags.\nAfter answering, summarize your action in <conclusion></conclusion> tags, and insert them after the <tool_call></tool_call> XML tags."""


# Plain-text description of the available actions. Ultron stores it as
# `prompt_action_space` when infer_mode == "qwen2vl_user".
QWEN25_ACTION_SPACE = """
The action to perform. The available actions are:
* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order. 
* `type`: Type a string of text on the keyboard.
* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.
* `left_click`: Click the left mouse button.
* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.
* `right_click`: Click the right mouse button.
* `middle_click`: Click the middle mouse button.
* `double_click`: Double-click the left mouse button.
* `scroll`: Performs a scroll of the mouse scroll wheel.
* `wait`: Wait specified seconds for the change to happen.
* `terminate`: Terminate the current task and report its completion status.
"""

# Default logical screen size used by Ultron, ordered (width, height):
# parse_action passes index 1 as the image height and index 0 as the width.
SCREEN_LOGIC_SIZE = (1920, 1080)


# Defaults for smart_resize().
IMAGE_FACTOR = 28  # both output dimensions are made divisible by this
MIN_PIXELS = 200704  # lower bound on height * width after resizing
MAX_PIXELS = 937664  # upper bound on height * width after resizing
MAX_RATIO = 200  # largest long-side / short-side ratio smart_resize accepts

def smart_resize(
    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
) -> tuple[int, int]:
    """
    Compute a target (height, width) for an image.

    The result satisfies:

    1. Both dimensions are multiples of 'factor'.

    2. The pixel count lies within ['min_pixels', 'max_pixels'].

    3. The aspect ratio stays as close to the input's as possible.

    Raises ValueError when the long side exceeds MAX_RATIO times the short side.
    """
    aspect = max(height, width) / min(height, width)
    if aspect > MAX_RATIO:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {aspect}"
        )

    # First snap each side to the nearest multiple (never below one factor).
    snapped_h = max(factor, round_by_factor(height, factor))
    snapped_w = max(factor, round_by_factor(width, factor))
    area = snapped_h * snapped_w

    if area > max_pixels:
        # Too large: scale both sides down by the same ratio, rounding down.
        shrink = math.sqrt((height * width) / max_pixels)
        return floor_by_factor(height / shrink, factor), floor_by_factor(width / shrink, factor)

    if area < min_pixels:
        # Too small: scale both sides up by the same ratio, rounding up.
        grow = math.sqrt(min_pixels / (height * width))
        return ceil_by_factor(height * grow, factor), ceil_by_factor(width * grow, factor)

    return snapped_h, snapped_w

def round_by_factor(number: int, factor: int) -> int:
    """Snap 'number' to the nearest multiple of 'factor' (using built-in round)."""
    multiples = round(number / factor)
    return multiples * factor


def ceil_by_factor(number: int, factor: int) -> int:
    """Round 'number' up to the next multiple of 'factor' (unchanged if already one)."""
    multiples = math.ceil(number / factor)
    return multiples * factor


def floor_by_factor(number: int, factor: int) -> int:
    """Round 'number' down to the previous multiple of 'factor' (unchanged if already one)."""
    multiples = math.floor(number / factor)
    return multiples * factor
 
    
def escape_single_quotes(text):
    """Prefix every single quote in 'text' with a backslash, unless a backslash already precedes it."""
    # The lookbehind skips quotes that already follow a backslash.
    unescaped_quote = re.compile(r"(?<!\\)'")
    return unescaped_quote.sub(lambda _match: "\\'", text)

def fix_click_output(output: str) -> str:
    """Rebuild a click(...) action from the last "x, y" number pair found in 'output'.

    Brackets around the pair are ignored. Returns None when no pair is found.
    """
    pairs = re.findall(r'(\d+)\s*,\s*(\d+)', output)
    if not pairs:
        return None  # no usable coordinate anywhere in the text

    # Only the last pair is used.
    x, y = pairs[-1]
    return f"click(start_box='({x},{y})')"

def fix_drag_output(output: str) -> str:
    """Rebuild a drag(...) action from the last two "x, y" number pairs found in 'output'.

    Brackets around the pairs are ignored. Returns None when fewer than two
    pairs are found.
    """
    pairs = re.findall(r'(\d+)\s*,\s*(\d+)', output)
    if len(pairs) < 2:
        return None  # a drag needs both a start and an end point

    # The second-to-last pair is the start point, the last pair is the end point.
    (x1, y1), (x2, y2) = pairs[-2], pairs[-1]
    return f"drag(start_box='({x1},{y1})', end_box='({x2},{y2})')"

def parse_action_qwen25vl(text, factor, image_height, image_width):
    """Parse a Qwen2.5-VL style response into a list of action dicts.

    The response is expected to contain a tool call such as
    ``<tool_call>{"name": ..., "arguments": {...}}</tool_call>``, optionally
    preceded by ``<think>...</think>`` and followed by
    ``<conclusion>...</conclusion>``. Several calls may appear inside one
    ``<tool_call>`` block, separated by a blank line.

    Args:
        text: Raw model response.
        factor: Unused; kept so the signature stays compatible with callers.
        image_height: Height that the model's y pixel coordinates refer to.
        image_width: Width that the model's x pixel coordinates refer to.

    Returns:
        A list with one dict per tool call, with the keys ``thought``,
        ``action_type``, ``action_inputs`` (the call's ``arguments``, with
        ``coordinate`` / ``coordinate2`` divided by the image size),
        ``action_historty`` and ``text`` (both the raw response).

    Raises:
        IndexError: if the response has neither a ``<tool_call>`` tag nor a
            bare ``{"name": "mobile_use", "arguments":`` payload.
        ValueError, SyntaxError, KeyError, TypeError: if a payload cannot be
            decoded or lacks ``arguments`` / ``action``.
    """
    print("====in parse_action_qwen25vl!!!!")

    def _decode_call(raw):
        # Payloads arrive as strict JSON, as a Python-style dict literal
        # (single quotes), or as either of those wrapped once more in a
        # quoted string. Decode until we hold a non-string object; the
        # bound guards against pathological nesting.
        decoded = raw
        for _ in range(3):
            if not isinstance(decoded, str):
                break
            try:
                decoded = json.loads(decoded)
            except ValueError:
                decoded = ast.literal_eval(decoded)
        return decoded

    thought = ""
    think_match = re.search(r"<think>\s*(.*?)\s*</think>", text, re.DOTALL)
    if think_match:
        thought = think_match.group(1).strip()

    # Isolate the tool-call payload.
    pred = text.strip()
    if '</think>' in pred:
        pred = pred.split('</think>')[-1]
    if '<tool_call>' in pred:
        pred = pred.split('<tool_call>')[1]
    else:
        # No opening tag: fall back to locating a bare mobile_use payload.
        pred = '{"name": "mobile_use", "arguments":' + pred.split('{"name": "mobile_use", "arguments":', 1)[1]
    if '</tool_call>' in pred:
        pred = pred.split('</tool_call>')[0]
    else:
        # No closing tag: cut at the conclusion and at the last "}}".
        pred = pred.split("<conclusion>")[0]
        pred = pred.rsplit('}}', 1)[0] + '}}'

    actions = []
    for action_str in pred.strip().split("\n\n"):
        if not action_str.strip():
            continue  # tolerate extra blank lines between calls

        pred_action = _decode_call(action_str)['arguments']

        # Convert pixel coordinates to fractions of the image size. Each key
        # is handled on its own so a missing or malformed `coordinate` does
        # not prevent `coordinate2` from being normalized. Failures are
        # best-effort: the value is left exactly as the model produced it.
        for key in ("coordinate", "coordinate2"):
            try:
                point = pred_action[key]
                if point:
                    pred_action[key] = (point[0] / image_width, point[1] / image_height)
            except (KeyError, TypeError, IndexError, ZeroDivisionError):
                continue

        actions.append({
            "thought": thought,
            "action_type": pred_action['action'],
            "action_inputs": pred_action,
            # Key spelling is part of the interface other code reads.
            "action_historty": text,
            "text": text
        })
    return actions

def action_space_mapping(input_text: str) -> str:
    """Translate a "Thought: ... Action: ..." response into the do(...)/finish(...) format.

    The whole "Thought: ... Action: ..." span is replaced by the translated
    action from the first "Action: " line inside it; text before "Thought:" is
    kept. An action that matches no rule is returned as written, and input
    without that span is returned unchanged. Coordinates are divided by 1000
    and printed with three decimals.
    """
    # A quoted "(x,y)" point, optionally wrapped in <|box_start|>/<|box_end|>.
    boxed = r"'(?:<\|box_start\|>)?\(([0-9]+),([0-9]+)\)(?:<\|box_end\|>)?'"

    def point(m, x_group, y_group):
        return f"[{int(m.group(x_group))/1000:.3f}, {int(m.group(y_group))/1000:.3f}]"

    def tap(m):
        return f'do(action="Tap", element={point(m, 1, 2)})'

    def long_press(m):
        return f'do(action="Long Press", element={point(m, 1, 2)})'

    def swipe_precise(m):
        return f'do(action="Swipe Precise", start={point(m, 1, 2)}, end={point(m, 3, 4)})'

    def swipe_from(m):
        return f'do(action="Swipe", element={point(m, 1, 2)}, direction="{m.group(3)}")'

    def launch(m):
        return f'do(action="Launch", app="{m.group(1)}")'

    # (pattern, replacement) pairs, tried in order; the first match wins.
    rules = (
        # click(start_box='(x,y)')
        (r"click\(start_box=" + boxed + r"\)", tap),
        # long_press(start_box='(x,y)', time='')
        (r"long_press\(start_box=" + boxed + r", time=''?\)", long_press),
        # long_press(start_box='(x,y)')
        (r"long_press\(start_box=" + boxed + r"\)", long_press),
        # type(content='...')
        (r"type\(content='((?:\'|[^'])*?)'\)", r'do(action="Type", text="\1")'),
        # scroll(start_box='(x1,y1)', end_box='(x2,y2)')
        (r"scroll\(start_box=" + boxed + r", end_box=" + boxed + r"\)", swipe_precise),
        # scroll(direction='up')
        (r"scroll\(direction='((?:up|down|left|right))'\)", r'do(action="Swipe", direction="\1")'),
        # press_home()
        (r"press_home\(\)", r'do(action="Home")'),
        # press_back()
        (r"press_back\(\)", r'do(action="Back")'),
        # finished(content='...')
        (r"finished\(content='((?:\'|[^'])*?)'\)", r'finish(message="\1")'),
        # finished()
        (r"finished\(\)", r'finish(message="")'),
        # drag(start_box='(624,470)', end_box='(288,505)') -- no box tokens
        (r"drag\(start_box='\(([0-9]+),([0-9]+)\)', end_box='\(([0-9]+),([0-9]+)\)'\)", swipe_precise),
        # scroll(start_box='(x,y)', direction='down')
        (r"scroll\(start_box=" + boxed + r", direction='(down|up|left|right)'\)", swipe_from),
        # open_app(app_name='...')
        (r"open_app\(app_name='([^']+)'\)", launch),
    )

    def rewrite(match):
        span = match.group(1)
        found = re.search(r'Action: (.*?)(?=\n|$)', span)
        if found is None:
            return span
        action = found.group(1)
        for pattern, replacement in rules:
            if re.match(pattern, action):
                # re.sub accepts both template strings and callables.
                return re.sub(pattern, replacement, action)
        return action

    # DOTALL lets the span run across lines, from "Thought:" to the end of input.
    return re.sub(r'(Thought:.*Action:.*)', rewrite, input_text, flags=re.DOTALL)

def parsing_response_to_android_action_code(responses, image_height: int, image_width:int, input_swap:bool=True) -> str:
    """Map the "text" of each parsed response through action_space_mapping and concatenate.

    A single dict is treated as a one-element list. The image size and
    input_swap arguments are accepted but not used here.
    """
    batch = [responses] if isinstance(responses, dict) else responses
    return "".join(action_space_mapping(item["text"]) for item in batch)


def parsing_response_to_pyautogui_code(responses, image_height: int, image_width:int, input_swap:bool=True) -> str:
    '''
    Turn parsed model actions into a pyautogui script (as a string).

    Args:
        responses: One parsed action dict or a list of them. Each dict is
            shaped like::

                {
                    "action_type": "left_click",
                    "action_inputs": {"coordinate": (0.25, 0.5)},
                    "thought": "...",        # optional
                    "observation": "...",    # optional
                }

            Coordinates are fractions of the screen; they are multiplied by
            image_width / image_height here.
        image_height: Screen height in pixels used to scale y coordinates.
        image_width: Screen width in pixels used to scale x coordinates.
        input_swap: If True, text is pasted through the clipboard
            (pyperclip + ctrl+v); otherwise it is typed with pyautogui.write.

    Returns:
        The generated script, or the bare string "DONE" / "FAIL" when an
        "answer" / "terminate" action replaces the script.
    '''

    def _to_pixels(point):
        # Scale a fractional (x, y) point to screen pixels.
        x, y = point
        return round(x * image_width, 3), round(y * image_height, 3)

    def _scroll_by_pixels(pixels):
        # Only the sign of `pixels` is used; the wheel amount is fixed at 5.
        # A missing or non-numeric value emits nothing rather than raising.
        if not isinstance(pixels, (int, float)):
            return ""
        if pixels > 0:
            return "\npyautogui.scroll(5)"
        if pixels < 0:
            return "\npyautogui.scroll(-5)"
        return ""

    def _normalize_keys(raw):
        # Accept a list of key names or one string such as "ctrl" / "v ctrl" /
        # "ctrl,v". Iterating a string directly would yield single characters.
        if isinstance(raw, str):
            raw = raw.replace(",", " ").split()
        keys = []
        for key in raw:
            if key == "arrowleft":
                key = "left"
            elif key == "space":
                key = ' '
            keys.append(key)
        return keys

    def _type_code(action_inputs):
        content = escape_single_quotes(action_inputs.get("text", ""))
        if not content:
            return ""
        # A trailing newline (real, or the two characters backslash + n)
        # means "press enter afterwards". Strip whole suffixes only:
        # str.rstrip("\\n") removes a *set* of characters and would also
        # eat a trailing "n" of the text itself ("return\\n" -> "retur").
        press_enter = content.endswith("\n") or content.endswith("\\n")
        stripped = content
        while stripped.endswith("\n") or stripped.endswith("\\n"):
            stripped = stripped[:-1] if stripped.endswith("\n") else stripped[:-2]
        # A raw line break inside a single-quoted literal is a syntax error
        # in the generated script, so emit it as an escape sequence instead.
        stripped = stripped.replace("\r", "\\r").replace("\n", "\\n")

        if input_swap:
            code = "\nimport pyperclip"
            code += f"\npyperclip.copy('{stripped}')"
            code += "\npyautogui.hotkey('ctrl', 'v')"
        else:
            code = f"\npyautogui.write('{stripped}', interval=0.1)"
        code += "\ntime.sleep(0.5)\n"
        if press_enter:
            code += "\npyautogui.press('enter')"
        return code

    pyautogui_code = "import pyautogui\nimport time\n"
    if isinstance(responses, dict):
        responses = [responses]

    for response_id, response in enumerate(responses):
        observation = response.get("observation", "")
        thought = response.get("thought", "")

        if response_id == 0:
            # The first action carries the observation/thought as a docstring.
            pyautogui_code += f"'''\nObservation:\n{observation}\n\nThought:\n{thought}\n'''\n"
        else:
            pyautogui_code += "\ntime.sleep(3)\n"

        action_type = response.get("action_type")
        action_inputs = response.get("action_inputs", {})

        if action_type == "mouse_move":
            if "coordinate" in action_inputs:
                sx, sy = _to_pixels(action_inputs["coordinate"])
                pyautogui_code += f"\npyautogui.moveTo({sx}, {sy})\n"

        elif action_type == "type":
            pyautogui_code += _type_code(action_inputs)

        elif action_type == "key":
            if "keys" in action_inputs:
                hotkey = action_inputs.get("keys", "")
            else:
                hotkey = action_inputs.get("hotkey", "")
            if hotkey:
                keys = _normalize_keys(hotkey)
                pyautogui_code += f"\npyautogui.hotkey({', '.join([repr(k) for k in keys])})"

        elif action_type == "scroll":
            pyautogui_code += _scroll_by_pixels(action_inputs.get("pixels"))

        elif action_type == "swipe":
            start = action_inputs.get("coordinate")
            end = action_inputs.get("coordinate2")
            if not start or not end:
                # Without both endpoints fall back to a plain wheel scroll.
                pyautogui_code += _scroll_by_pixels(action_inputs.get("pixels"))
            else:
                x1, y1 = start
                x2, y2 = end
                sx, sy = _to_pixels(start)
                delta_x = x2 - x1
                delta_y = y2 - y1
                if abs(delta_x) > abs(delta_y):
                    direction = 'right' if delta_x > 0 else 'left'
                else:
                    direction = 'down' if delta_y > 0 else 'up'
                # Horizontal swipes currently produce no code.
                if direction == "up":
                    pyautogui_code += f"\npyautogui.scroll(5, x={sx}, y={sy})"
                elif direction == "down":
                    pyautogui_code += f"\npyautogui.scroll(-5, x={sx}, y={sy})"

        elif action_type in ["click", "left_click", "right_click", "double_click", "middle_click"]:
            start_box = action_inputs.get("coordinate")
            if start_box:
                sx, sy = _to_pixels(start_box)
                if action_type in ("left_click", "click"):
                    pyautogui_code += f"\npyautogui.click({sx}, {sy}, button='left')"
                elif action_type == "double_click":
                    pyautogui_code += f"\npyautogui.doubleClick({sx}, {sy}, button='left')"
                elif action_type == "right_click":
                    pyautogui_code += f"\npyautogui.click({sx}, {sy}, button='right')"
                else:  # middle_click
                    pyautogui_code += f"\npyautogui.click({sx}, {sy}, button='middle')"

        elif action_type in ["left_click_drag"]:
            # TODO: only the drag target is emitted here; the start point is
            # presumably wherever a preceding mouse_move left the cursor.
            start_box = action_inputs.get("coordinate")
            if start_box:
                ex, ey = _to_pixels(start_box)
                pyautogui_code += f"\npyautogui.dragTo({ex}, {ey}, duration=1.0)\n"

        elif action_type in ["answer"]:
            pyautogui_code = "DONE"

        elif action_type in ["terminate"]:
            # .get() so a terminate without a status does not raise KeyError;
            # an unknown status leaves the script untouched.
            status = action_inputs.get("status")
            if status == 'success':
                pyautogui_code = "DONE"
            elif status == 'failure':
                pyautogui_code = "FAIL"

        else:
            pyautogui_code += f"\n# Unrecognized action type: {action_type}"

    return pyautogui_code

def add_box_token(input_string):
    """Wrap start_box/end_box coordinates in <|box_start|>...<|box_end|> tokens.

    Only applies when the string contains both "Action: " and "start_box=";
    any other input is returned unchanged. The text before the first
    "Action: " is kept, each action after it is stripped and rewritten, and
    multiple actions are joined with a blank line.

    Coordinates written with whitespace after the comma, e.g. '(100, 200)',
    are tagged too and emitted without the space, i.e. '(100,200)'.
    """
    if "Action: " not in input_string or "start_box=" not in input_string:
        return input_string

    head, *actions = input_string.split("Action: ")

    # Rewrite in a single substitution so every matched box is tagged.
    # The previous find-then-str.replace approach searched for the no-space
    # spelling only and silently skipped boxes such as '(100, 200)'.
    box_pattern = re.compile(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'")
    tagged = r"\1='<|box_start|>(\2,\3)<|box_end|>'"
    processed_actions = [box_pattern.sub(tagged, action.strip()) for action in actions]

    return head + "Action: " + "\n\n".join(processed_actions)


class Ultron:
    """GUI agent wrapper around a Qwen2.5-VL style model.

    Builds the model input (chat prompt + resized screenshot) for each step,
    parses the model's response into actions, and converts those actions into
    executable code through the configured action-code mapper.
    """

    def __init__(self,
                 tokenizer_path,
                 max_trajectory_length=15,
                 history_n=5,
                 screen_size=SCREEN_LOGIC_SIZE,
                 action_space='computer',
                 infer_mode='qwen2vl_user',
                 prompt_style='qwen2vl_user',
                 input_swap=False,
                 language='Chinese',
                 ):
        """Load the tokenizer/processor and set up per-episode state.

        Args:
            tokenizer_path: Path or id passed to AutoTokenizer / AutoProcessor.
            max_trajectory_length: Once this many responses were recorded,
                parse_action returns ["FAIL"].
            history_n: Number of most recent responses included in the prompt.
            screen_size: (width, height) used to scale action coordinates.
            action_space: 'mobile' selects the Android action mapper; any
                other value selects the pyautogui mapper.
            infer_mode: 'qwen2vl_user' sets `prompt_action_space`.
            prompt_style: Stored only.
            input_swap: Forwarded to the action-code mapper.
            language: Stored only.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True, use_fast=False)
        self.processor = AutoProcessor.from_pretrained(tokenizer_path)
        self.max_trajectory_length = max_trajectory_length
        self.history_n = history_n
        self.screen_size = screen_size
        self.action_space = action_space
        self.infer_mode = infer_mode
        self.prompt_style = prompt_style
        self.input_swap = input_swap
        self.language = language
        self.resize_screen_size = (0, 0)
        # Size of the last screenshot after smart_resize; set by get_model_inputs.
        self.smart_resize_height, self.smart_resize_width = 0, 0

        self.customize_action_parser = parse_action_qwen25vl
        self.action_parse_res_factor = 1000
        # NOTE: `prompt_action_space` is only defined for this infer_mode.
        if self.infer_mode == "qwen2vl_user":
            self.prompt_action_space = QWEN25_ACTION_SPACE
        if action_space == 'mobile':
            self.action_code_mapper = parsing_response_to_android_action_code
        else:
            self.action_code_mapper = parsing_response_to_pyautogui_code

        self.reset()

    def get_model_inputs(self, instruction: str, obs: Dict,task_save_dir,step_idx):
        """Build the prompt and image input for one step.

        Args:
            instruction: The user task.
            obs: Observation dict; obs["screenshot"] holds the encoded image bytes.
            task_save_dir: Unused here.
            step_idx: Unused here.

        Returns:
            {"prompt": <chat-template text>, "multi_modal_data": {"image": [<PIL image>]}}
        """
        recent = self.history_responses[-self.history_n:]
        if recent:
            previous_actions = "\n".join(f"Step {i+1}: {action}" for i, action in enumerate(recent))
        else:
            previous_actions = "None"
        user_prompt = QwenVL25_PLANNING_PROMPT.format(
                        instruction=instruction,
                        previous_actions=previous_actions)

        messages = [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": QwenVL25_SYS_PROMPT}
                ],
            },
            {
                "role": "user",
                "content": [
                    # Placeholder entry; the actual image is supplied through multi_modal_data.
                    {"type": "image", "image": ""},
                    {"type": "text", "text": user_prompt},
                ],
            },
        ]
        prompt_text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        image = Image.open(BytesIO(obs["screenshot"]))
        print("===image size: ",image.size)

        # Resize to model-friendly dimensions and remember them: parse_action
        # hands them to the parser to normalize the predicted coordinates.
        self.smart_resize_height, self.smart_resize_width = smart_resize(image.size[1], image.size[0], factor=IMAGE_FACTOR, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
        image = image.resize((self.smart_resize_width, self.smart_resize_height))
        print("===smart_resize_width: ",self.smart_resize_width)
        print("===smart_resize_height: ",self.smart_resize_height)

        inputs = {"prompt": prompt_text, "multi_modal_data": {'image': [image]}}
        return inputs


    def parse_action(self, response: str):
        """Parse a model response and convert it to executable actions.

        Records the response in the history, parses it, and maps the parsed
        actions to code.

        Returns:
            A list holding one of: "DONE" (successful terminate, or the
            response could not be parsed), "FAIL" (failed terminate, or the
            trajectory length limit was reached), "WAIT" (wait action), or
            the generated action code. The list is empty when code
            generation raised.
        """
        # For the history keep only the part after </think> when the response
        # also has a <conclusion>; fall back to the full response if that
        # part is empty.
        history_entry = response
        if '<conclusion>' in response and '</think>' in response:
            after_think = response.split('</think>')[1]
            if after_think.strip():
                history_entry = after_think
        self.history_responses.append(history_entry)

        try:
            print("===into parse_action_qwen25vl")
            parsed_responses = self.customize_action_parser(
                response,
                self.action_parse_res_factor,
                self.smart_resize_height,
                self.smart_resize_width
            )
            # Each parsed action is recorded exactly once, here. (The
            # terminate/wait branches below used to append to
            # history_action a second time and push an empty list onto
            # self.actions.)
            for parsed_response in parsed_responses:
                self.actions.append(parsed_response)
                self.history_action.append(parsed_response["action_historty"])
        except Exception as e:
            print(f"Parsing action error: {response}, with error:\n{e}")
            return ["DONE"]

        if len(parsed_responses) == 1:
            only_response = parsed_responses[0]
            if "action_type" in only_response:
                if only_response["action_type"] == 'terminate':
                    try:
                        status = only_response["action_inputs"].get("status", None)
                    except (KeyError, AttributeError, TypeError):
                        print("===error in parsed_response!!!",only_response)
                        logging.debug(f'===error in parsed_response!!!:{only_response}')
                        return ["FAIL"]
                    if status == 'success':
                        return ["DONE"]
                    if status == 'failure':
                        return ["FAIL"]
                    # Any other status falls through to the code mapper.
                elif only_response["action_type"] == 'wait':
                    return ["WAIT"]

        actions = []
        # Used for logging only; avoids relying on the loop variable above.
        last_parsed = parsed_responses[-1] if parsed_responses else None
        try:
            pyautogui_code = self.action_code_mapper(
                parsed_responses,
                self.screen_size[1],
                self.screen_size[0],
                self.input_swap
            )
            actions.append(pyautogui_code)
            print("=====parsed_response: ",last_parsed)
            print("=====pyautogui_code: ",pyautogui_code)
            logging.debug(f'===pyautogui_code:{pyautogui_code}')
        except Exception as e:
            print(f"Parsing pyautogui code error: {last_parsed}, with error:\n{e}")

        if len(self.history_responses) >= self.max_trajectory_length:
            # Default to FAIL if exceed max steps
            actions = ["FAIL"]

        return actions

    def reset(self):
        """Clear all per-episode history."""
        self.thoughts = []
        self.actions = []
        self.history_action = []
        self.observations = []
        self.history_images = []
        self.history_responses = []


if __name__ == "__main__":
    # Manual smoke test: parse one sample response, then turn the parsed
    # actions into pyautogui code. The commented-out `response` assignments
    # below are alternative sample model outputs, kept verbatim; enable one
    # at a time to try a different shape of output.

    # Samples with an <observation> section and a mobile_use tool call:
    # response = "<observation>\nThe current screen shows a Google search results page with the query for \"Amazon.\" The top result is the official Amazon website link.\n</observation>\n<think>\nThe goal was to search for \"Amazon.\" The search results page shows that the top suggestion leads directly to Amazon.com, indicating success in finding the site.\n</think>\n<tool_call>\n{\"name\": \"mobile_use\", \"arguments\": {\"action\": \"terminate\", \"status\": \"success\"}}\n</tool_call>\n<conclusion>\nThe task is completed successfully.\n</conclusion>"
    # response = "<observation>\nThe screen displays a search bar at the top with the word 'amazon' entered in it. Below the search bar, there are suggestions related to 'amazon,' which include 'Amazon E-commerce company,' 'amazon prime,' 'amazon revenue,' and 'Amazon.com: Online Shopping.'\n</observation>\n<think>\nThe goal is to search for Amazon on this platform. The most relevant suggestion is 'Amazon E-commerce company,' which directly relates to the goal of searching for Amazon. Selecting this option will lead to information or services offered by Amazon.\n</think>\n<tool_call>\n{\"name\": \"mobile_use\", \"arguments\": {\"action\": \"click\", \"coordinate\": [201, 270]}}\n</tool_call>\n<conclusion>\nClick on the 'Amazon E-commerce company' suggestion to proceed with the search.\n</conclusion>"
    # Samples with a computer_use tool call (type / left_click):
    # response = "<think>\nTo find x, solve the equation 0 + x = 25. This simplifies to x = 25.\n</think>\n<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"type\", \"text\": \"25\"}}\n</tool_call>\n<conclusion>\nType '25' into the textbox and press Submit.\n</conclusion>"
    # response = "<think>\nTo solve 0 + x = 25, I need to determine the value of x. Since adding zero to any number does not change its value, x equals 25.\n</think>\n<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [261, 278]}}\n</tool_call>\n<conclusion>\nType '25' into the textbox for x.\n</conclusion>"
    # Sample without a <think> section:
    # response = "<tool_call>\n{\"name\": \"mobile_use\", \"arguments\": {\"action\": \"click\", \"coordinate\": [263, 134]}}\n</tool_call>\n<conclusion>\nClick on the browser's menu or options button to explore saving the webpage as a PDF.\n</conclusion>"

    # action = parse_action_qwen25vl(response,1000,1092,1932)
    # print("===action: ",action)

    # Sample whose tool call is a single-quoted, Python-style dict rather than strict JSON:
    # response = "<think>让我看看...要恢复关闭的标签页，首先得打开浏览器的主菜单。我注意到Chrome窗口右上角有个三个点的按钮，点击它就能展开更多选项了。这是第一步，接下来就可以找到恢复标签页的功能。</think>\n<tool_call>{'name': 'mobile_use', 'arguments': {'action': 'click', 'coordinate': [840.204, 156.25]}}</tool_call>"
    # action = parse_action_qwen25vl(response,1000,1092,1932)
    # print("===action: ",action)

    # Samples without a <conclusion> section (terminate / key):
    # response = '<think>太好了！我刚刚完成了所有清理工作 - 从历史记录到Cookie数据，再到缓存文件，Amazon留下的追踪信息已经被彻底删除了。现在用户可以放心地浏览网页了，这些隐私数据不会再被保存下来。</think>\n<tool_call>{"name": "mobile_use", "arguments": {"action": "terminate", "status": "success"}}</tool_call>'
    # response =  "<think>看来我刚才输入的命令没有生效，终端界面没有任何反应。这应该是由于我直接在终端里输入命令导致的 - 在Linux系统中，我们需要先按Ctrl+Shift+T来新建一个终端窗口才能正常工作。让我重新来过吧。</think>\n<tool_call>{\"name\": \"computer_use\", \"arguments\": {\"action\": \"key\", \"keys\": [\"ctrl\", \"shift\", \"t\"]}}</tool_call>"

    # Samples with two tool calls in one <tool_call> block, separated by a blank line:
    # response = "<think>我现在在网站设置页面，看到一堆权限相关的选项。不过这些选项太多了，得慢慢找。我注意到右边有个滚动条，用它来浏览应该比用鼠标滚轮更精确一些。让我拖动一下滚动条，看看下面还有什么弹窗和重定向的设置。</think>\n<tool_call>{\"name\": \"computer_use\", \"arguments\": {\"action\": \"mouse_move\", \"coordinate\": [853.347, 578.571]}}\n\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click_drag\", \"coordinate\": [854.286, 314.286]}}</tool_call>"


    # response = "<think>看来表格已经加载完成了，我需要开始填充空白单元格了。让我先用鼠标选中B1:E30这个区域，这样就能一次性把所有需要填充的单元格都包含进去。选中后就可以用工具栏上的填充功能来完成任务了。</think>\n<tool_call>{\"name\": \"computer_use\", \"arguments\": {\"action\": \"mouse_move\", \"coordinate\": [162.408, 153.393]}}\n\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click_drag\", \"coordinate\": [349.224, 550.893]}}</tool_call>"
    # Active sample: the <tool_call> payload is a quoted, JSON-encoded string
    # that wraps the JSON object (i.e. the call is double-encoded).
    response ="<think>\nI need to create a Pivot Table to summarize the total revenue for each promotion type in a new sheet (Sheet2). Looking at the current data, I can see there's sales information with various promotion types like \"None\", \"Fall Kite Event\", \"Festival of Flight\", and \"E-mail Coupon\" in column D, and corresponding revenue amounts in column G. To create a Pivot Table, I first need to select all the relevant data that includes both the promotion types and the revenue values. The logical next action would be to select all the data from A1 through G139 (or however many rows there are) to include all columns with relevant information for creating the Pivot Table. Using Ctrl+A will select all data in the spreadsheet, which is appropriate since I need comprehensive data selection for creating an effective Pivot Table.\n</think>\n<tool_call>\"{\\\"name\\\": \\\"computer_use\\\", \\\"arguments\\\": {\\\"action\\\": \\\"key\\\", \\\"keys\\\": [\\\"ctrl\\\", \\\"a\\\"]}}\"</tool_call>\n<conclusion>\nPress Ctrl+A to select all data in the spreadsheet.\n</conclusion>"
    # Arguments: response, factor, image_height, image_width.
    action = parse_action_qwen25vl(response,1000,1092,1932)
    print("===action: ",action)

    # Arguments: parsed actions, image_height, image_width, input_swap.
    pyautogui_code = parsing_response_to_pyautogui_code(
                    action,
                    1092,
                    1920,
                    False
                )
    print("===pyautogui_code: ",pyautogui_code)