"""
Qwen3-VL Agent for OfficeArena (legacy style).

Uses litellm as the unified API backend.
Based on the OSWorld Qwen3-VL implementation:
https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/qwen3vl_agent.py
"""

import base64
import json
import math
import time
from io import BytesIO
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

import litellm
from PIL import Image

from officearena.adapter.action_adapter import ActionAdapter
from officearena.agents.base import BaseAgent
from officearena.agents.config import Qwen3VLConfig


def round_by_factor(number: float, factor: int) -> int:
    """Return the multiple of ``factor`` closest to ``number``.

    Rounding is delegated to the built-in ``round``, so a quotient that sits
    exactly halfway between two integers goes to the even one.
    """
    steps = round(number / factor)
    return steps * factor


def floor_by_factor(number: float, factor: int) -> int:
    """Round ``number`` down to a multiple of ``factor``."""
    # Floor division may hand back a float (e.g. 2.0); math.floor makes it an int.
    whole_steps = math.floor(number // factor)
    return whole_steps * factor


def ceil_by_factor(number: float, factor: int) -> int:
    """Round ``number`` up to a multiple of ``factor``."""
    steps_needed = math.ceil(number / factor)
    return steps_needed * factor


def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 56 * 56,
    max_pixels: int = 14 * 14 * 4 * 1280,
    max_long_side: int = 8192,
) -> Tuple[int, int]:
    """
    Smart resize for Qwen VL models (from OSWorld implementation).

    Resizes dimensions to satisfy:
    1. Height and width are divisible by factor
    2. Total pixels are within [min_pixels, max_pixels]
    3. Longest side is within max_long_side
    4. Aspect ratio is preserved as much as possible

    Args:
        height: Source height in pixels (must be >= 2).
        width: Source width in pixels (must be >= 2).
        factor: Both returned dimensions are multiples of this value and are
            never smaller than it.
        min_pixels: Lower bound on ``height * width`` of the result.
        max_pixels: Upper bound on ``height * width`` of the result.
        max_long_side: Cap applied to the longer side before rounding.

    Returns:
        ``(resized_height, resized_width)`` -- note the height-first order.

    Raises:
        ValueError: If a side is smaller than 2 or the aspect ratio exceeds 200.
    """
    if height < 2 or width < 2:
        raise ValueError(f"height:{height} or width:{width} must be >= 2")
    if max(height, width) / min(height, width) > 200:
        raise ValueError(f"absolute aspect ratio must be smaller than 200, got {height} / {width}")

    # First, limit the longest side
    if max(height, width) > max_long_side:
        beta = max(height, width) / max_long_side
        height, width = int(height / beta), int(width / beta)

    # Round to nearest factor. Clamp to at least one factor: a side shorter than
    # factor / 2 would otherwise round to 0, which zeroes the pixel product and
    # wrongly sends an already-large-enough image down the min_pixels branch.
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))

    # Adjust if outside pixel bounds
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        # Flooring can reach 0 for a very thin image; keep one factor per side.
        h_bar = max(factor, floor_by_factor(height / beta, factor))
        w_bar = max(factor, floor_by_factor(width / beta, factor))
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)

    return h_bar, w_bar


def process_image(image_bytes: bytes) -> Tuple[str, int, int]:
    """
    Resize a screenshot for Qwen VL models and re-encode it as base64 PNG.

    The target size comes from ``smart_resize`` with ``factor=32`` and
    ``max_pixels=16 * 16 * 4 * 12800``.

    Returns:
        Tuple of (base64_encoded_image, processed_width, processed_height)
    """
    source = Image.open(BytesIO(image_bytes))
    # PIL reports (width, height); smart_resize takes and returns height first.
    source_width, source_height = source.size
    target_height, target_width = smart_resize(
        height=source_height,
        width=source_width,
        factor=32,
        max_pixels=16 * 16 * 4 * 12800,
    )

    resized = source.resize((target_width, target_height))

    png_buffer = BytesIO()
    resized.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode("utf-8")

    return encoded, target_width, target_height


class Qwen3VLActionAdapter(ActionAdapter):
    """Action adapter for Qwen3-VL agent.

    Renders ``computer_use`` actions as ``xdotool`` command lines and runs them
    through ``self.sandbox.execute_command``; ``type`` actions go through
    ``self.sandbox.write``. ``self.sandbox`` is not defined in this class --
    presumably it is provided by ``ActionAdapter`` (verify against the base).

    NOTE(review): command lines are built by string formatting from
    model-produced values (key names, coordinates). If ``execute_command`` runs
    them through a shell, consider quoting those values.
    """

    ComputerUseActionType = Literal[
        "screenshot",
        "wait",
        "mouse_move",
        "left_click",
        "right_click",
        "double_click",
        "key",
        "type",
        "left_click_drag",
        "middle_click",
        "triple_click",
        "scroll",
        "hscroll",
    ]

    ActionType = ComputerUseActionType | Literal["computer", "terminate", "finish"]

    # Argument placed after ``xdotool click`` for each click-style action.
    click_buttons = {
        "left_click": 1,
        "right_click": 3,
        "middle_click": 2,
        "double_click": "--repeat 2 --delay 10 1",
        "triple_click": "--repeat 3 --delay 10 1",
    }

    # Mouse button number that xdotool clicks for each scroll direction.
    scroll_buttons = {
        "up": 4,
        "down": 5,
        "left": 6,
        "right": 7,
    }

    # Map PyAutoGUI key names to xdotool-compatible key names
    _KEY_MAP = {
        # Basic controls
        "enter": "Return",
        "return": "Return",
        "esc": "Escape",
        "escape": "Escape",
        "backspace": "BackSpace",
        "tab": "Tab",
        "space": "space",
        # Arrows
        "left": "Left",
        "right": "Right",
        "up": "Up",
        "down": "Down",
        # Navigation
        "home": "Home",
        "end": "End",
        "insert": "Insert",
        "delete": "Delete",
        "del": "Delete",
        "pageup": "Page_Up",
        "pgup": "Page_Up",
        "pagedown": "Page_Down",
        "pgdn": "Page_Down",
        # Locks
        "capslock": "Caps_Lock",
        "numlock": "Num_Lock",
        "scrolllock": "Scroll_Lock",
        # PrintScreen and pause
        "printscreen": "Print",
        "prntscrn": "Print",
        "prtsc": "Print",
        "prtscr": "Print",
        "print": "Print",
        "pause": "Pause",
        # Modifiers (normalize left/right to generic where possible)
        "shift": "shift",
        "shiftleft": "shift",
        "shiftright": "shift",
        "ctrl": "ctrl",
        "control": "ctrl",
        "ctrlleft": "ctrl",
        "ctrlright": "ctrl",
        "alt": "alt",
        "altleft": "alt",
        "altright": "alt",
        "win": "super",
        "winleft": "super",
        "winright": "super",
        "command": "super",
        "option": "alt",
        "optionleft": "alt",
        "optionright": "alt",
        # Menu key
        "menu": "Menu",
        "apps": "Menu",
        # Volume/media (best-effort common X names; unlikely to be used anyways)
        "volumedown": "XF86AudioLowerVolume",
        "volumeup": "XF86AudioRaiseVolume",
        "volumemute": "XF86AudioMute",
        "playpause": "XF86AudioPlay",
        "nexttrack": "XF86AudioNext",
        "prevtrack": "XF86AudioPrev",
        "stop": "XF86AudioStop",
    }

    # Modifier keys that indicate a chord (require lowercase letter keys)
    _MODIFIER_KEYS = {"ctrl", "alt", "shift", "super", "meta", "win", "command", "option"}

    @staticmethod
    def _map_key_token(token: str, lowercase_letters: bool = False) -> str:
        """
        Map a single PyAutoGUI-style key token to xdotool-compatible name.

        Args:
            token: The key token to map.
            lowercase_letters: If True, lowercase single letter keys (for use in chords
                               like ctrl+v where uppercase would imply shift).

        Returns:
            ``F<n>`` for f1..f24, the ``_KEY_MAP`` entry when one exists, otherwise
            the token with surrounding whitespace and quotes removed. Non-string
            tokens are returned as ``str(token)``.
        """
        if not isinstance(token, str):
            return str(token)
        cleaned = token.strip().strip("'\"")
        t = cleaned.lower()
        # Function keys f1..f24
        if len(t) >= 2 and t[0] == "f" and t[1:].isdigit():
            try:
                num = int(t[1:])
                if 1 <= num <= 24:
                    return f"F{num}"
            except ValueError:
                # isdigit() accepts some characters that int() rejects.
                pass
        # Map via dictionary
        mapped = Qwen3VLActionAdapter._KEY_MAP.get(t)
        if mapped:
            return mapped
        # Single letter keys: lowercase if in a modifier chord
        if lowercase_letters and len(t) == 1 and t.isalpha():
            return t
        # Anything else: hand back the cleaned token with its case preserved, so
        # stray whitespace or quotes never leak into the command line.
        return cleaned

    @classmethod
    def _map_keys(cls, keys: List[str], lowercase_letters: bool = False) -> List[str]:
        """Map a list of key tokens to xdotool-compatible names."""
        return [cls._map_key_token(k, lowercase_letters=lowercase_letters) for k in keys]

    @classmethod
    def _has_modifier(cls, keys: List[str]) -> bool:
        """Return True if any token is a modifier key.

        Tokens are normalised through ``_map_key_token`` first, so aliases that
        only appear in ``_KEY_MAP`` (``control``, ``ctrlleft``, ``shiftright``,
        ...) count as modifiers too.
        """
        return any(cls._map_key_token(k).lower() in cls._MODIFIER_KEYS for k in keys)

    @classmethod
    def _map_key_chord_text(cls, text: str) -> str:
        """
        Map a chord or single key expressed as text.
        Supports forms like 'ctrl+c' or 'enter'.

        For chords with modifiers (ctrl, alt, shift, etc.), letter keys are
        lowercased to avoid unintended shift behavior in xdotool.
        """
        if not isinstance(text, str):
            return str(text)
        if "+" in text:
            parts = [p.strip() for p in text.split("+") if p.strip()]
            mapped = cls._map_keys(parts, lowercase_letters=cls._has_modifier(parts))
            return "+".join(mapped)
        return cls._map_key_token(text)

    @staticmethod
    def _sanitize_key_list(keys: List[Any]) -> List[str]:
        """Clean the ``keys`` array of a ``key`` action.

        Strips artefacts of malformed model output such as ``keys=['tab']``
        fragments, brackets and quotes; empty results are dropped and non-string
        entries are converted with ``str``.
        """
        cleaned_keys: List[str] = []
        for key in keys:
            if not isinstance(key, str):
                cleaned_keys.append(str(key))
                continue
            if key.startswith("keys=["):
                key = key[6:]
            if key.endswith("]"):
                key = key[:-1]
            if key.startswith(("['", '["')) and len(key) > 2:
                key = key[2:]
            if key.endswith(("']", '"]')) and len(key) > 2:
                key = key[:-2]
            key = key.strip().strip("'\"")
            if key:  # Only add non-empty keys
                cleaned_keys.append(key)
        return cleaned_keys

    @property
    def action_adapter_class(self) -> type[ActionAdapter]:
        """Return this adapter class."""
        return Qwen3VLActionAdapter

    def _xdotool_click(self, action_type: str, args: Dict[str, Any]) -> None:
        """Move to ``args["coordinate"]`` and click, optionally holding ``args["key"]``."""
        x, y = args["coordinate"]
        command_parts = ["xdotool", f"mousemove --sync {x} {y}"]
        # An absent, None or empty modifier means a plain click.
        modifier = args.get("key")
        keyname = self._map_key_chord_text(modifier) if modifier else ""
        if keyname:
            command_parts.append(f"keydown {keyname}")
        command_parts.append(f"click {self.click_buttons[action_type]}")
        if keyname:
            command_parts.append(f"keyup {keyname}")
        self.sandbox.execute_command(" ".join(command_parts))

    def _xdotool_scroll(
        self,
        args: Dict[str, Any],
        positive_direction: str,
        negative_direction: str,
        default_direction: str,
    ) -> None:
        """Scroll by clicking the wheel button for the requested direction.

        ``pixels`` is used as a signed click count: positive values go to
        ``positive_direction``, negative values to ``negative_direction``. When
        it is missing or 0, ``direction`` / ``scroll_direction`` and
        ``scroll_amount`` / ``amount`` are used instead.
        """
        command_parts = ["xdotool"]
        coordinate = args.get("coordinate")
        if coordinate:
            x, y = coordinate
            command_parts.append(f"mousemove --sync {x} {y}")

        pixels = args.get("pixels") or 0  # explicit null behaves like "not given"
        if pixels != 0:
            direction = positive_direction if pixels > 0 else negative_direction
            # --repeat takes a whole number of clicks; never round a request to 0.
            amount = max(1, int(round(abs(pixels))))
        else:
            # Fallback to direction-based scroll
            direction = args.get("direction") or args.get("scroll_direction", default_direction)
            amount = args.get("scroll_amount", args.get("amount", 1))

        button = self.scroll_buttons[direction]
        command_parts.append(f"click --repeat {amount} {button}")
        self.sandbox.execute_command(" ".join(command_parts))

    def _dispatch_action(self, action_type: ActionType, args: Dict[str, Any]) -> Any:
        """Execute one parsed action.

        Args:
            action_type: Action name; ``"computer"`` means the real name is in
                ``args["action"]``.
            args: Action arguments (``coordinate``, ``key``/``keys``/``text``,
                ``pixels``, ``time``, ...).

        Returns:
            Always ``None``.

        Raises:
            KeyError: If a required argument such as ``coordinate`` is missing,
                or a scroll direction is not in ``scroll_buttons``.
            ValueError: If the action type is unknown.
        """
        if action_type == "computer":
            action_type = args["action"]

        if action_type == "screenshot":
            # Screenshot is automatically added for all actions
            return None

        if action_type == "wait":
            time.sleep(args.get("time", args.get("duration", 1)))
            return None

        if action_type == "mouse_move":
            x, y = args["coordinate"]
            self.sandbox.execute_command(f"xdotool mousemove {x} {y}")
            return None

        if action_type in self.click_buttons:
            self._xdotool_click(action_type, args)
            return None

        if action_type == "left_click_drag":
            x, y = args["coordinate"]
            self.sandbox.execute_command(f"xdotool mousedown 1 mousemove --sync {x} {y} mouseup 1")
            return None

        if action_type == "scroll":
            # pixels: positive = up, negative = down
            self._xdotool_scroll(args, positive_direction="up", negative_direction="down", default_direction="down")
            return None

        if action_type == "hscroll":
            # pixels: positive = right, negative = left
            self._xdotool_scroll(args, positive_direction="right", negative_direction="left", default_direction="right")
            return None

        if action_type == "key":
            # Handle keys array format (OSWorld style)
            keys = args.get("keys", [])
            if isinstance(keys, list) and keys:
                cleaned_keys = self._sanitize_key_list(keys)
                if not cleaned_keys:
                    return None  # No valid keys to press
                # With a modifier present, letter keys are lowercased to avoid
                # an unintended Shift.
                mapped_keys = self._map_keys(cleaned_keys, lowercase_letters=self._has_modifier(cleaned_keys))
                key_text = "+".join(mapped_keys)
            elif isinstance(keys, str) and keys.strip():
                # Some outputs give the chord as a plain string instead of a list.
                key_text = self._map_key_chord_text(keys)
            else:
                key_text = self._map_key_chord_text(args.get("text", ""))

            if key_text:
                self.sandbox.execute_command(f"xdotool key {key_text}")
            return None

        if action_type == "type":
            self.sandbox.write(args["text"])
            return None

        if action_type in ["terminate", "finish"]:
            # Terminal action - handled by the task executor
            return None

        raise ValueError(f"Unknown action type: {action_type}")


def build_system_prompt(display_width: int, display_height: int, coordinate_type: str = "relative") -> str:
    """Build the system prompt with tools definition in OSWorld style.

    Args:
        display_width: Screen width; only reported when ``coordinate_type`` is
            ``"absolute"``.
        display_height: Screen height; only reported when ``coordinate_type`` is
            ``"absolute"``.
        coordinate_type: ``"absolute"`` reports the real resolution; any other
            value reports a fixed 1000x1000 grid.

    Returns:
        The system prompt, with the ``computer_use`` tool schema embedded as JSON
        inside ``<tools></tools>`` tags.
    """

    if coordinate_type == "absolute":
        screen_info = f"The screen's resolution is {display_width}x{display_height}."
    else:
        screen_info = "The screen's resolution is 1000x1000."

    description_prompt = f"""Use a mouse and keyboard to interact with a computer, and take screenshots.
* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.
* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions.
* E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.
* {screen_info}
* Whenever you intend to move the cursor to click on an element like an icon, 
you should consult a screenshot to determine the coordinates of the element before moving the cursor.
* If you tried clicking on a program or link but it failed to load even after waiting, 
try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.
* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked."""

    action_description = """
* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.
* `type`: Type a string of text on the keyboard.
* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.
* `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.
* `right_click`: Click the right mouse button at a specified (x, y) pixel coordinate on the screen.
* `middle_click`: Click the middle mouse button at a specified (x, y) pixel coordinate on the screen.
* `double_click`: Double-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `scroll`: Performs a scroll of the mouse scroll wheel.
* `hscroll`: Performs a horizontal scroll (mapped to regular scroll).
* `wait`: Wait specified seconds for the change to happen.
* `terminate`: Terminate the current task and report its completion status."""

    tools_def = {
        "type": "function",
        "function": {
            "name_for_human": "computer_use",
            "name": "computer_use",
            "description": description_prompt,
            "parameters": {
                "properties": {
                    "action": {
                        "description": action_description,
                        "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "triple_click", "scroll", "hscroll", "wait", "terminate"],
                        "type": "string",
                    },
                    "keys": {"description": "Required only by `action=key`.", "type": "array"},
                    "text": {"description": "Required only by `action=type`.", "type": "string"},
                    "coordinate": {"description": "The x,y coordinates for mouse actions.", "type": "array"},
                    # Sign convention must match the action adapter in this file,
                    # which scrolls up for positive values and down for negative.
                    "pixels": {"description": "The amount of scrolling (positive = up, negative = down).", "type": "number"},
                    "time": {"description": "The seconds to wait.", "type": "number"},
                    "status": {"description": "The status of the task.", "type": "string", "enum": ["success", "failure"]},
                },
                "required": ["action"],
                "type": "object",
            },
            "args_format": "Format the arguments as a JSON object.",
        },
    }

    # The only declared tool is computer_use, so finishing is described as its
    # terminate action rather than as a separate "finish" tool.
    system_prompt = f"""You are utilising an Ubuntu virtual machine with internet access. You are able to use the computer to solve Microsoft Office tasks.

You should avoid asking any clarification or follow-up questions--just execute the task as best you can with what you're given.
Refrain from asking any "Yes" or "No" questions about whether you should proceed--just assume the answer is always "Yes".
When you are done with the task or are unable to complete it, call computer_use with action=terminate to finish.
    
# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{json.dumps(tools_def)}
</tools>

For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{{"name": <function-name>, "arguments": <args-json-object>}}
</tool_call>

# Response format

Response format for every step:
1) Action: a short imperative describing what to do in the UI.
2) A single <tool_call>...</tool_call> block containing only the JSON: {{"name": <function-name>, "arguments": <args-json-object>}}.

Rules:
- Output exactly in the order: Action, <tool_call>.
- Be brief: one sentence for Action.
- Do not output anything else outside those parts.
- If finishing, use action=terminate in the tool call."""

    return system_prompt


class Qwen3VL(BaseAgent):
    """
    An agent that uses the Qwen3-VL model to interact with the environment.
    Uses litellm for unified API access.

    Based on the OSWorld Qwen3-VL implementation with the original action space.
    """

    def __init__(self, config: Optional[Union[Dict[str, Any], Qwen3VLConfig]] = None):
        """Create the agent from a ``Qwen3VLConfig``, a plain dict, or defaults.

        A dict is first passed to ``Qwen3VLConfig`` as keyword arguments; if that
        raises ``TypeError`` it is retried in the legacy layout, where a
        ``display_size`` mapping replaces ``display_width`` / ``display_height``.

        Raises:
            ValueError: If ``config`` has an unsupported type, or the resolved
                config has no ``model_name`` or no ``api_key``.
        """
        # Resolve the typed config plus the dict handed to the base class.
        if config is None:
            self.agent_config = Qwen3VLConfig()
            settings = self.agent_config.to_dict()
        elif isinstance(config, Qwen3VLConfig):
            config.validate()
            self.agent_config = config
            settings = config.to_dict()
        elif isinstance(config, dict):
            settings = config.copy()
            try:
                self.agent_config = Qwen3VLConfig(**settings)
                self.agent_config.validate()
            except TypeError:
                # Legacy dicts carry the screen size as one nested mapping.
                legacy_size = settings.pop("display_size", {"width": 1024, "height": 768})
                settings.update(
                    display_width=legacy_size["width"],
                    display_height=legacy_size["height"],
                )
                self.agent_config = Qwen3VLConfig(**settings)
                self.agent_config.validate()
                settings = self.agent_config.to_dict()
        else:
            raise ValueError(f"Config must be dict or Qwen3VLConfig, got {type(config)}")

        super().__init__(settings)

        # Endpoint settings
        resolved = self.agent_config
        self.model = resolved.model_name
        self.api_key = resolved.api_key
        self.base_url = resolved.base_url

        if not self.model:
            raise ValueError("model_name must be provided in the config for Qwen3VL agent.")

        if not self.api_key:
            raise ValueError("api_key must be provided for the LLM endpoint.")

        # Screen and prompting settings
        self.display_size: Dict[str, int] = resolved.display_size
        self.coordinate_type = resolved.coordinate_type
        self.history_n = resolved.history_n
        self.previous_response_id: Optional[str] = None

        # The conversation starts with the OSWorld-style system prompt.
        screen = self.display_size
        self.system_prompt = build_system_prompt(screen["width"], screen["height"], self.coordinate_type)
        self.messages: List[Dict[str, Any]] = [{"role": "system", "content": self.system_prompt}]

        # Per-episode history
        self.screenshots: List[str] = []  # base64 encoded
        self.responses: List[str] = []
        self.actions: List[str] = []

    @property
    def action_adapter_class(self) -> type[ActionAdapter]:
        """Return the adapter class used for this agent's actions."""
        adapter_type: type[ActionAdapter] = Qwen3VLActionAdapter
        return adapter_type

    def _adjust_coordinates(self, x: float, y: float, original_width: int, original_height: int, processed_width: int, processed_height: int) -> Tuple[int, int]:
        """Adjust coordinates from processed to original resolution."""
        if self.coordinate_type == "absolute":
            if processed_width and processed_height:
                x_scale = original_width / processed_width
                y_scale = original_height / processed_height
                return int(x * x_scale), int(y * y_scale)
            return int(x), int(y)
        # Relative: scale from 0..999 grid
        x_scale = original_width / 999
        y_scale = original_height / 999
        return int(x * x_scale), int(y * y_scale)

    def _parse_response(
        self,
        response_text: str,
        original_width: int,
        original_height: int,
        processed_width: int,
        processed_height: int,
    ) -> Tuple[str, str, Dict[str, Any]]:
        """
        Parse LLM response in OSWorld format and extract action details.

        Args:
            response_text: Raw LLM response text
            original_width: Original screen width
            original_height: Original screen height
            processed_width: Processed image width
            processed_height: Processed image height

        Returns:
            Tuple of (thought, action_type, action_args)
        """
        thought = ""
        action_type = ""
        action_args: Dict[str, Any] = {}

        if not response_text or not response_text.strip():
            return thought, action_type, action_args

        # Parse response lines
        lines = response_text.split("\n")
        inside_tool_call = False
        current_tool_call: List[str] = []

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Extract thought/action description
            if line.lower().startswith("action:"):
                thought = line.split("Action:")[-1].strip()
                continue

            # Handle tool call tags
            if line.startswith("<tool_call>"):
                inside_tool_call = True
                continue
            elif line.startswith("</tool_call>"):
                if current_tool_call:
                    try:
                        tool_json = "\n".join(current_tool_call)
                        tool_call = json.loads(tool_json)
                        if tool_call.get("name") == "computer_use":
                            args = tool_call.get("arguments", {})
                            action_type = args.get("action", "")

                            # Process coordinate if present
                            if "coordinate" in args and args["coordinate"]:
                                x, y = args["coordinate"]
                                adj_x, adj_y = self._adjust_coordinates(x, y, original_width, original_height, processed_width, processed_height)
                                action_args["coordinate"] = [adj_x, adj_y]

                            # Copy other args
                            for key in ["keys", "text", "pixels", "time", "status"]:
                                if key in args:
                                    action_args[key] = args[key]
                    except (json.JSONDecodeError, KeyError) as e:
                        print(f"Failed to parse tool call: {e}")
                    current_tool_call = []
                inside_tool_call = False
                continue

            if inside_tool_call:
                current_tool_call.append(line)
                continue

            # Try to parse standalone JSON (fallback)
            if line.startswith("{") and line.endswith("}"):
                try:
                    json_obj = json.loads(line)
                    if "name" in json_obj and "arguments" in json_obj:
                        args = json_obj.get("arguments", {})
                        action_type = args.get("action", "")

                        if "coordinate" in args and args["coordinate"]:
                            x, y = args["coordinate"]
                            adj_x, adj_y = self._adjust_coordinates(x, y, original_width, original_height, processed_width, processed_height)
                            action_args["coordinate"] = [adj_x, adj_y]

                        for key in ["keys", "text", "pixels", "time", "status"]:
                            if key in args:
                                action_args[key] = args[key]
                except json.JSONDecodeError:
                    pass

        # Process remaining tool call if any
        if current_tool_call:
            try:
                tool_json = "\n".join(current_tool_call)
                tool_call = json.loads(tool_json)
                if tool_call.get("name") == "computer_use":
                    args = tool_call.get("arguments", {})
                    action_type = args.get("action", "")

                    if "coordinate" in args and args["coordinate"]:
                        x, y = args["coordinate"]
                        adj_x, adj_y = self._adjust_coordinates(x, y, original_width, original_height, processed_width, processed_height)
                        action_args["coordinate"] = [adj_x, adj_y]

                    for key in ["keys", "text", "pixels", "time", "status"]:
                        if key in args:
                            action_args[key] = args[key]
            except (json.JSONDecodeError, KeyError) as e:
                print(f"Failed to parse remaining tool call: {e}")

        return thought, action_type, action_args

    def _parse_all_actions(
        self,
        response_text: str,
        original_width: int,
        original_height: int,
        processed_width: int,
        processed_height: int,
    ) -> Tuple[str, List[Tuple[str, Dict[str, Any]]]]:
        """
        Parse all tool calls from the response in order.

        Returns:
            Tuple of (thought, list of (action_type, action_args))
        """
        thought = ""
        actions_list: List[Tuple[str, Dict[str, Any]]] = []
        if not response_text or not response_text.strip():
            return thought, actions_list

        lines = response_text.split("\n")
        inside_tool_call = False
        current_tool_call: List[str] = []

        def finalize_current_tool():
            nonlocal actions_list
            if not current_tool_call:
                return
            try:
                tool_json = "\n".join(current_tool_call)
                tool_call = json.loads(tool_json)
                if tool_call.get("name") == "computer_use":
                    args = tool_call.get("arguments", {})
                    action_type = args.get("action", "")
                    action_args: Dict[str, Any] = {}
                    if "coordinate" in args and args["coordinate"]:
                        x, y = args["coordinate"]
                        adj_x, adj_y = self._adjust_coordinates(x, y, original_width, original_height, processed_width, processed_height)
                        action_args["coordinate"] = [adj_x, adj_y]
                    for key in ["keys", "text", "pixels", "time", "status", "direction", "scroll_direction", "scroll_amount", "amount"]:
                        if key in args:
                            action_args[key] = args[key]
                    actions_list.append((action_type, action_args))
            except (json.JSONDecodeError, KeyError) as e:
                print(f"Failed to parse tool call: {e}")

        for line in lines:
            line = line.strip()
            if not line:
                continue
            if line.lower().startswith("action:"):
                thought = line.split("Action:")[-1].strip()
                continue
            if line.startswith("<tool_call>"):
                inside_tool_call = True
                current_tool_call = []
                continue
            if line.startswith("</tool_call>"):
                finalize_current_tool()
                inside_tool_call = False
                current_tool_call = []
                continue
            if inside_tool_call:
                current_tool_call.append(line)
                continue
            # Fallback standalone JSON
            if line.startswith("{") and line.endswith("}"):
                try:
                    json_obj = json.loads(line)
                    if "name" in json_obj and "arguments" in json_obj:
                        args = json_obj.get("arguments", {})
                        action_type = args.get("action", "")
                        action_args: Dict[str, Any] = {}
                        if "coordinate" in args and args["coordinate"]:
                            x, y = args["coordinate"]
                            adj_x, adj_y = self._adjust_coordinates(x, y, original_width, original_height, processed_width, processed_height)
                            action_args["coordinate"] = [adj_x, adj_y]
                        for key in ["keys", "text", "pixels", "time", "status", "direction", "scroll_direction", "scroll_amount", "amount"]:
                            if key in args:
                                action_args[key] = args[key]
                        actions_list.append((action_type, action_args))
                except json.JSONDecodeError:
                    pass

        # Any remaining
        if current_tool_call:
            finalize_current_tool()

        return thought, actions_list

    def _build_instruction_prompt(self, instruction: str) -> str:
        """Build the instruction prompt with previous actions."""
        current_step = len(self.actions)
        history_start_idx = max(0, current_step - self.history_n)

        # Build previous actions summary
        previous_actions = []
        for i in range(history_start_idx):
            if i < len(self.actions):
                previous_actions.append(f"Step {i+1}: {self.actions[i]}")
        previous_actions_str = "\n".join(previous_actions) if previous_actions else "None"

        return f"""Please generate the next move according to the UI screenshot, instruction and previous actions.

Instruction: {instruction}

Previous actions:
{previous_actions_str}"""

    def step(self, screenshot: bytes, instruction: str) -> str:
        """
        Takes a step in the environment using the Qwen3-VL model.

        The conversation is rebuilt from scratch on every call: the system
        prompt, up to ``self.history_n`` previous screenshot/response pairs,
        then the current screenshot. The instruction prompt is attached to
        the first user message only.

        The screenshot and the response are recorded in the agent's history
        only after the model call has returned. If ``litellm.completion``
        raises, ``self.screenshots`` and ``self.responses`` therefore stay the
        same length and later steps keep pairing each response with the
        screenshot it was produced for. Exceptions from image processing or
        from the model call are not caught here.

        Args:
            screenshot: Current screenshot as bytes
            instruction: Task instruction

        Returns:
            JSON string with action output. ``status`` is ``"error"`` when no
            action could be parsed from the response; otherwise it is
            ``"completed"`` with a reasoning item followed by one item per
            parsed action, stopping at the first ``terminate`` action.
        """

        def user_message(image_b64: str, text: Optional[str] = None) -> Dict[str, Any]:
            """Build a user message holding one screenshot and optional text."""
            content: List[Dict[str, Any]] = [
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
            ]
            if text is not None:
                content.append({"type": "text", "text": text})
            return {"role": "user", "content": content}

        # Get original dimensions; the context manager releases the image handle.
        with Image.open(BytesIO(screenshot)) as image:
            original_width, original_height = image.size

        # Process image for Qwen VL
        processed_image, processed_width, processed_height = process_image(screenshot)

        # Build instruction prompt
        instruction_prompt = self._build_instruction_prompt(instruction)

        # Build messages with history
        current_step = len(self.responses)
        history_len = min(self.history_n, current_step)

        # Rebuild messages from scratch for multi-turn
        self.messages = [{"role": "system", "content": self.system_prompt}]

        if history_len > 0:
            history_responses = self.responses[-history_len:]
            # The current screenshot is not stored yet, so the tail of the
            # list holds exactly the screenshots of the previous steps.
            history_screenshots = self.screenshots[-history_len:]

            for idx, past_response in enumerate(history_responses):
                if idx < len(history_screenshots):
                    # First history message includes instruction
                    attached_text = instruction_prompt if idx == 0 else None
                    self.messages.append(user_message(history_screenshots[idx], attached_text))
                self.messages.append({"role": "assistant", "content": past_response})

            # Add current screenshot
            self.messages.append(user_message(processed_image))
        else:
            # First step - include instruction with screenshot
            self.messages.append(user_message(processed_image, instruction_prompt))

        # Call LLM via litellm
        completion_kwargs: Dict[str, Any] = {
            "model": self.model,
            "messages": self.messages,
            "max_tokens": self.agent_config.max_tokens,
            "temperature": self.agent_config.temperature,
            "top_p": self.agent_config.top_p,
            "api_key": self.api_key,
        }

        if self.base_url:
            completion_kwargs["base_url"] = self.base_url

        response = litellm.completion(**completion_kwargs)

        response_message = response.choices[0].message
        response_text = response_message.content or ""

        # Store screenshot and response together, only after a successful
        # call, so the two history lists never drift apart.
        self.screenshots.append(processed_image)
        self.responses.append(response_text)

        print(f"Qwen3VL Output: {response_text}")

        # Parse the response; support multiple tool calls
        thought, actions_list = self._parse_all_actions(response_text, original_width, original_height, processed_width, processed_height)

        if actions_list:
            print(f"Parsed actions: {[a[0] for a in actions_list]}")
        else:
            print("Parsed actions: []")

        # Store action description (summarize first action type if present)
        action_label = actions_list[0][0] if actions_list else ""
        self.actions.append(f"{action_label}: {thought}" if thought else action_label)

        # Build response JSON
        if not actions_list:
            action_output = {
                "status": "error",
                "output": [{"type": "reasoning", "summary": [{"text": response_text}]}, {"type": "message", "content": [{"type": "output_text", "text": "Could not parse action from response."}]}],
            }
        else:
            output_items: List[Dict[str, Any]] = [{"type": "reasoning", "summary": [{"text": thought}]}]
            for action_type, action_args in actions_list:
                if action_type == "terminate":
                    status = action_args.get("status", "success")
                    output_items.append({"type": "message", "content": [{"type": "output_text", "text": f"DONE. Task {status}."}]})
                    # Nothing after a terminate action is emitted.
                    break
                elif action_type == "wait":
                    wait_time = action_args.get("time", 5)
                    output_items.append({"type": "computer_call", "action": {"type": "wait", "duration": wait_time}})
                else:
                    action_dict = {"type": action_type}
                    action_dict.update(action_args)
                    output_items.append({"type": "computer_call", "action": action_dict})
            action_output = {"status": "completed", "output": output_items}

        return json.dumps(action_output)

    def reset(self) -> None:
        """
        Resets the agent's state for a new task.

        Runs the base-class reset first, then empties the screenshot,
        response and action histories, clears ``previous_response_id`` and
        restores ``messages`` to just the system prompt.
        """
        super().reset()
        # Per-task histories start empty.
        self.screenshots, self.responses, self.actions = [], [], []
        self.previous_response_id = None
        # The conversation starts over with only the system prompt.
        self.messages = [{"role": "system", "content": self.system_prompt}]
