
import json
from typing import Tuple, Union, Dict, Any, List, Optional


# Function-calling schema for the `mobile_use` tool.
# build_mobile_use_system_prompt() serializes this dict to JSON and embeds it
# in the system prompt, so every description string below is prompt text.
MOBILE_USE_TOOL_SCHEMA: Dict[str, Any] = {
    "type": "function",
    "function": {
        "name": "mobile_use",
        "description": (
            "Use a touchscreen to interact with a mobile device, and take screenshots.\n"
            "* This is an interface to a mobile device with touchscreen. "
            "You can perform actions like clicking, typing, swiping, etc.\n"
            "* Some applications may take time to start or process actions, "
            "so you may need to wait and take successive screenshots to see the results of your actions.\n"
            # rescale_coordinates() divides by the same 999 when mapping
            # coordinates onto a concrete width/height.
            "* The screen's resolution is 999x999.\n"
            "* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. "
            "Don't click boxes on their edges unless asked."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                # Selects which operation to perform; the remaining properties
                # are described as required only for specific actions.
                "action": {
                    "description": (
                        "The action to perform. The available actions are:\n"
                        "* `click`: Click the point on the screen with coordinate (x, y).\n"
                        "* `long_press`: Press the point on the screen with coordinate (x, y) for specified seconds.\n"
                        "* `swipe`: Swipe from the starting point with coordinate (x, y) to the end point with coordinates2 (x2, y2).\n"
                        "* `type`: Input the specified text into the activated input box.\n"
                        "* `answer`: Output the answer.\n"
                        "* `system_button`: Press the system button.\n"
                        "* `wait`: Wait specified seconds for the change to happen.\n"
                        "* `terminate`: Terminate the current task and report its completion status."
                    ),
                    "enum": [
                        "click",
                        "long_press",
                        "swipe",
                        "type",
                        "answer",
                        "system_button",
                        "wait",
                        "terminate",
                    ],
                    "type": "string",
                },
                # NOTE: declared as a bare "array"; the schema itself does not
                # constrain the element type or the number of elements.
                "coordinate": {
                    "description": (
                        "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) "
                        "coordinates to move the mouse to. Required only by `action=click`, `action=long_press`, and `action=swipe`."
                    ),
                    "type": "array",
                },
                # End point of a swipe; same unconstrained "array" declaration
                # as `coordinate`.
                "coordinate2": {
                    "description": (
                        "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) "
                        "coordinates to move the mouse to. Required only by `action=swipe`."
                    ),
                    "type": "array",
                },
                "text": {
                    "description": "Required only by `action=type` and `action=answer`.",
                    "type": "string",
                },
                "time": {
                    "description": "The seconds to wait. Required only by `action=long_press` and `action=wait`.",
                    "type": "number",
                },
                "button": {
                    "description": (
                        "Back means returning to the previous interface, Home means returning to the desktop, "
                        "Menu means opening the application background menu, and Enter means pressing the enter. "
                        "Required only by `action=system_button`"
                    ),
                    "enum": ["Back", "Home", "Menu", "Enter"],
                    "type": "string",
                },
                "status": {
                    "description": "The status of the task. Required only by `action=terminate`.",
                    "type": "string",
                    "enum": ["success", "failure"],
                },
            },
            # Only `action` is required at the schema level; the per-action
            # requirements are stated in the descriptions above, not enforced here.
            "required": ["action"],
        },
    },
}


def build_mobile_use_system_prompt() -> str:
    """Render the system prompt that advertises the ``mobile_use`` tool.

    ``MOBILE_USE_TOOL_SCHEMA`` is serialized to single-line JSON and placed
    inside ``<tools></tools>`` tags, followed by the expected ``<tool_call>``
    format and the per-step response rules (Thought, Action, ``<tool_call>``).

    Returns:
        The complete prompt text. It begins with a blank line and ends with a
        single trailing newline.
    """
    # ensure_ascii=False keeps non-ASCII characters in the schema as-is rather
    # than escaping them.
    tool_json = json.dumps(MOBILE_USE_TOOL_SCHEMA, ensure_ascii=False)
    # Braces that must appear literally in the prompt are doubled ({{ }});
    # {tool_json} is the only interpolated field. The closing triple quote
    # terminates the literal so the rest of the module is parsed as code.
    return f"""\

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{tool_json}
</tools>

For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{{"name": <function-name>, "arguments": <args-json-object>}}
</tool_call>


Response format for every step:
1) Thought: one concise sentence explaining the next move (no multi-step reasoning).
2) Action: a short imperative describing what to do in the UI.
3) A single <tool_call>...</tool_call> block containing only the JSON: {{"name": <function-name>, "arguments": <args-json-object>}}.

Rules:
- Output exactly in the order: Thought, Action, <tool_call>.
- Be brief: one sentence for Thought, one for Action.
- Do not output anything else outside those three parts.
- If finishing, use action=terminate in the tool call.
"""


# Module-level constant: the system prompt is rendered once, when this module
# is imported, by calling build_mobile_use_system_prompt().
MOBILE_USE_SYSTEM_PROMPT = build_mobile_use_system_prompt()


def build_user_query(instruction: str, action_history: Optional[List[str]] = None) -> str:
    """Compose the user-turn text from the instruction and prior steps.

    Args:
        instruction: The task the user asked for.
        action_history: Descriptions of steps already performed, in order.
            ``None`` or an empty list means no progress section is emitted.

    Returns:
        ``"The user query: <instruction>.\\n"``, followed by a
        ``"Task progress ..."`` line listing ``Step N: <entry>; `` items when
        there is any history. Newlines and double quotes are removed from
        each history entry.
    """
    previous_steps = action_history if action_history else []
    # Each entry is flattened to one line and stripped of double quotes
    # before being numbered from 1.
    progress = "".join(
        f"Step {number}: {entry.replace(chr(10), '').replace(chr(34), '')}; "
        for number, entry in enumerate(previous_steps, start=1)
    )

    query_line = f"The user query: {instruction}.\n"
    if not progress:
        return query_line
    return (
        query_line
        + f"Task progress (You have done the following operation on the current device): {progress}\n"
    )


def build_messages(
    user_query: str,
    image_path: Optional[str],
    system_prompt: str,
) -> List[Dict[str, Any]]:
    """Assemble the two-message chat payload (system, then user).

    Args:
        user_query: Text placed as the first part of the user message.
        image_path: Path to an image to attach. The image part is added only
            when the path is non-empty and exists on disk; otherwise it is
            omitted without raising.
        system_prompt: Text of the system message.

    Returns:
        A list with one ``system`` and one ``user`` message. Each message's
        ``content`` is a list of typed parts (``text`` / ``image``).
    """
    import os

    user_parts: List[Dict[str, Any]] = [{"type": "text", "text": user_query}]
    # Attach the image only if the file is actually there.
    if image_path and os.path.exists(image_path):
        user_parts.append({"type": "image", "image": image_path})

    return [
        {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
        {"role": "user", "content": user_parts},
    ]


def parse_tool_call(output_text: str) -> Optional[Dict[str, Any]]:
    """Extract and decode the first ``<tool_call>`` JSON object from model output.

    Args:
        output_text: Raw text that may contain a
            ``<tool_call>...</tool_call>`` block.

    Returns:
        The decoded JSON object as a dict, or ``None`` when the input is not
        a string, either tag is missing, the payload is not valid JSON, or
        the payload decodes to something other than a JSON object.
    """
    if not isinstance(output_text, str):
        return None
    if "<tool_call>" not in output_text or "</tool_call>" not in output_text:
        return None

    # Take the segment following the first opening tag and cut it at the
    # first closing tag found inside that segment.
    payload = output_text.split("<tool_call>")[1].split("</tool_call>")[0].strip()
    try:
        decoded = json.loads(payload)
    except json.JSONDecodeError:
        return None

    # Valid JSON that is not an object (list, string, number, ...) is not a
    # usable tool call; report it like any other unparseable output so the
    # declared Optional[Dict] return type holds.
    return decoded if isinstance(decoded, dict) else None


def parse_action_history(action_history: Any) -> List[str]:
    """Normalize an action history value into a list of step strings.

    Args:
        action_history: A list (returned as-is), a newline-separated string,
            or anything else.

    Returns:
        * the same list object when given a list;
        * for a string, one entry per non-blank line, where everything up to
          and including the first ``": "`` on the line is dropped when that
          separator is present;
        * an empty list for ``None`` and any other type.
    """
    if isinstance(action_history, list):
        return action_history
    if not isinstance(action_history, str):
        return []

    entries = []
    for raw_line in action_history.strip().split("\n"):
        stripped = raw_line.strip()
        if not stripped:
            continue
        # Keep only the part after the first ": " when a prefix is present.
        _, separator, remainder = stripped.partition(": ")
        entries.append(remainder if separator else stripped)
    return entries


def rescale_coordinates(point: list, width: int, height: int) -> list:
    """Map a point from the 999-based coordinate range onto a target size.

    Args:
        point: Sequence whose first two items are the x and y values.
        width: Target width that x is scaled to.
        height: Target height that y is scaled to.

    Returns:
        ``[x, y]`` where each value is divided by 999, multiplied by the
        corresponding target dimension, and rounded with ``round``.
    """
    source_x = point[0]
    source_y = point[1]
    scaled_x = round(source_x / 999 * width)
    scaled_y = round(source_y / 999 * height)
    return [scaled_x, scaled_y]

