from typing import Dict
import sys
import json
def _function_schema(name, description, properties=None):
    """Return a function-call schema dict.

    When *properties* is given, a ``parameters`` object is attached in which
    every listed property is marked as required, in declaration order.
    """
    schema = {"name": name, "description": description}
    if properties is not None:
        schema["parameters"] = {
            "type": "object",
            "properties": properties,
            "required": list(properties),
        }
    return schema


def _scalar_param(type_name, description):
    """Return the schema of a single scalar argument."""
    return {"type": type_name, "description": description}


def _number_array_param(description):
    """Return the schema of an argument that is an array of numbers."""
    return {
        "type": "array",
        "items": {"type": "number"},
        "description": description,
    }


# Schemas of the mobile actions.  Key order matters: these dicts are
# serialised with json.dumps further down in this module.
swipe_func = _function_schema(
    "mobile.swipe",
    "Swipe on the screen",
    {
        "from_coord": _number_array_param("The starting coordinates of the swipe"),
        "to_coord": _number_array_param("The ending coordinates of the swipe"),
    },
)

# The two button actions take no arguments, hence no "parameters" entry.
home_func = _function_schema("mobile.home", "Press the home button")

back_func = _function_schema("mobile.back", "Press the back button")

wait_func = _function_schema(
    "mobile.wait",
    "wait for the change to happen",
    {"seconds": _scalar_param("number", "The seconds to wait")},
)

long_press_func = _function_schema(
    "mobile.long_press",
    "Long press on the screen",
    {
        "x": _scalar_param("number", "The x coordinate of the long press"),
        "y": _scalar_param("number", "The y coordinate of the long press"),
    },
)

open_app_func = _function_schema(
    "mobile.open_app",
    "Open an app on the device",
    {"app_name": _scalar_param("string", "The name of the app to open")},
)


# Registry of prompt bundles keyed by preset name.  Every entry registered in
# this module is a three-item list: [system prompt, user prompt, chat template].
TEMPLATES: Dict[str, list] = {}


def get_register_template(model_name: str) -> list:
    """Return the prompt bundle registered under *model_name*.

    Args:
        model_name: Key of the preset in ``TEMPLATES``.

    Returns:
        The list stored in ``TEMPLATES`` for that key (the same object, not a
        copy).

    Raises:
        SystemExit: If no preset with that name is registered.  The exit
            message names the unknown key and lists the registered ones.
    """
    try:
        return TEMPLATES[model_name]
    except KeyError:
        sys.exit(
            f"no model named {model_name}; "
            f"registered names: {sorted(TEMPLATES)}"
        )

# Jinja-style chat templates.  The three variants share most of their text, so
# they are assembled from common fragments.

# Image/video counters and the opening of the per-message loop.
_TEMPLATE_PRELUDE = (
    "{% set image_count = namespace(value=0) %}"
    "{% set video_count = namespace(value=0) %}"
    "{% for message in messages %}"
)

# Emits a default system turn when the first message is not a system message.
_TEMPLATE_DEFAULT_SYSTEM = (
    "{% if loop.first and message['role'] != 'system' %}"
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "{% endif %}"
)

# Renders one message (plain string, or a list of image/video/text parts) and
# closes the per-message loop.
_TEMPLATE_MESSAGE = (
    "<|im_start|>{{ message['role'] }}\n"
    "{% if message['content'] is string %}"
    "{{ message['content'] }}<|im_end|>\n"
    "{% else %}"
    "{% for content in message['content'] %}"
    "{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}"
    "{% set image_count.value = image_count.value + 1 %}"
    "{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}"
    "<|vision_start|><|image_pad|><|vision_end|>"
    "{% elif content['type'] == 'video' or 'video' in content %}"
    "{% set video_count.value = video_count.value + 1 %}"
    "{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}"
    "<|vision_start|><|video_pad|><|vision_end|>"
    "{% elif 'text' in content %}{{ content['text'] }}"
    "{% endif %}{% endfor %}<|im_end|>\n"
    "{% endif %}{% endfor %}"
)

# Generation prompts appended when add_generation_prompt is set.
_TEMPLATE_ASSISTANT_CUE = (
    "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
)
_TEMPLATE_AGUVIS_CUE = (
    "{% if add_generation_prompt %}"
    "<|im_start|>assistant<|recipient|>all\nThought:"
    "{% endif %}"
)

# No implicit system turn.
qwen2vl_template = _TEMPLATE_PRELUDE + _TEMPLATE_MESSAGE + _TEMPLATE_ASSISTANT_CUE
# Implicit system turn; the assistant turn is opened with a recipient tag and "Thought:".
aguvis_template = (
    _TEMPLATE_PRELUDE
    + _TEMPLATE_DEFAULT_SYSTEM
    + _TEMPLATE_MESSAGE
    + _TEMPLATE_AGUVIS_CUE
)
# Implicit system turn; plain assistant generation prompt.
qwen25vl_template = (
    _TEMPLATE_PRELUDE
    + _TEMPLATE_DEFAULT_SYSTEM
    + _TEMPLATE_MESSAGE
    + _TEMPLATE_ASSISTANT_CUE
)



### QWEN2VL
# Preset 'QWEN2VL': a generic assistant system prompt; the agent instructions,
# output format ("Thought: ... / Action: ...") and action space live in the
# user prompt.
QWEN2VL_SYS="""You are a helpful assistant.
"""
# `{overall_goal}` and `{previous_actions}` are placeholders left in the text;
# presumably the caller substitutes them (e.g. via str.format) - verify.
# The literal keeps trailing spaces on some lines; do not strip them casually.
QWEN2VL_USER="""
You are a GUI agent. You need to perform the next action to complete the task. \n\n## Output Format\n\nThought: ...\nAction: ...\n\n\n## Action Space \nclick(coordinate='(relative_x,relative_y)')\nlong_press(coordinate='(relative_x,relative_y)')\ninput_text(content='')\nscroll(direction='down or up or right or left')\nopen_app(app_name='')\nnavigate_back()\nnavigate_home()\nwait() # Submit the task regardless of whether it succeeds or fails.\n\n## Note\n- Use English in Thought part.\n\n- Use coordinates in relative terms (from 0 to 1)\n\n- Summarize your next action (with its target element) in one sentence in Thought part.\n
Please generate the next action according to the instruction, previous actions and screenshot image. 
## Instruction: {overall_goal} 
## Previous actions: {previous_actions}
"""

# Bundle layout: [system prompt, user prompt, chat template].
TEMPLATES['QWEN2VL']=[QWEN2VL_SYS,QWEN2VL_USER,qwen2vl_template]

### AGUVIS
# Preset 'AGUVIS': the system prompt lists the available mobile functions, one
# per bullet, each rendered with json.dumps from the schema dicts defined at
# the top of this module.  Being an f-string, it is fully evaluated at import
# time.
AGUVIS_SYS=f"""You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.

You have access to the following functions:
- {json.dumps(swipe_func)}
- {json.dumps(home_func)}
- {json.dumps(back_func)}
- {json.dumps(wait_func)}
- {json.dumps(long_press_func)}
- {json.dumps(open_app_func)}
"""

# Plain (non-f) string: `{overall_goal}` and `{previous_actions}` stay in the
# text as placeholders; presumably filled in by the caller - verify.
AGUVIS_USER="""
Please generate the next move according to the ui screenshot, instruction and previous actions.

Instruction: {overall_goal} 

Previous actions: {previous_actions}
"""
# Bundle layout: [system prompt, user prompt, chat template].
TEMPLATES['AGUVIS']=[AGUVIS_SYS,AGUVIS_USER,aguvis_template]



### QWEN2VL_Llama & QWEN2VL_Llama_short
# Two presets that share the same user prompt and chat template and differ
# only in the system prompt:
#   * 'QWEN2VL_Llama'       - AGUVIS_SYS (task description + function list)
#   * 'QWEN2VL_Llama_short' - SHORT_SYS  (task description only)
# SHORT_SYS has no interpolated fields, so it is a plain string literal.
SHORT_SYS="""You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.
"""
# `{overall_goal}` and `{previous_actions}` are placeholders left in the text;
# presumably substituted by the caller - verify.
LLAMA_USER="""
Please generate the next move according to the ui screenshot, instruction and previous actions. 

Instruction: {overall_goal} 

Previous actions: {previous_actions}
"""

# Bundle layout: [system prompt, user prompt, chat template].
TEMPLATES['QWEN2VL_Llama']=[AGUVIS_SYS,LLAMA_USER,qwen2vl_template]
TEMPLATES['QWEN2VL_Llama_short']=[SHORT_SYS,LLAMA_USER,qwen2vl_template]

### QWEN2VL_Llama_prompt_l1
# Hand-written system prompt with the function list spelled out inline.
# NOTE(review): the opening delimiter is four single quotes, so the string
# value itself starts with a literal "'" character followed by a newline -
# this looks like a typo for ''' ; confirm before changing, since the text may
# be consumed elsewhere.
# NOTE(review): this constant is not the system prompt registered below
# (AGUVIS_SYS is used instead); confirm whether it is still needed.
LLAMA_Prompt_SYS_l1=''''
You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nYou have access to the following functions:\n- {\"name\": \"mobile.swipe\", \"description\": \"Swipe on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"from_coord\": {\"type\": \"array\", \"items\": {\"type\": \"number\"}, \"description\": \"The starting coordinates of the swipe\"}, \"to_coord\": {\"type\": \"array\", \"items\": {\"type\": \"number\"}, \"description\": \"The ending coordinates of the swipe\"}}, \"required\": [\"from_coord\", \"to_coord\"]}}\n- {\"name\": \"mobile.home\", \"description\": \"Press the home button\"}\n- {\"name\": \"mobile.back\", \"description\": \"Press the back button\"}\n- {\"name\": \"mobile.wait\", \"description\": \"wait for the change to happen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"seconds\": {\"type\": \"number\", \"description\": \"The seconds to wait\"}}, \"required\": [\"seconds\"]}}\n- {\"name\": \"mobile.long_press\", \"description\": \"Long press on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the long press\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the long press\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"mobile.open_app\", \"description\": \"Open an app on the device\", \"parameters\": {\"type\": \"object\", \"properties\": {\"app_name\": {\"type\": \"string\", \"description\": \"The name of the app to open\"}}, \"required\": [\"app_name\"]}}\n
'''
# User prompt asking for an 'Action: ' description followed by an
# 'Operation: ' line.  `{overall_goal}` and `{previous_actions}` are
# placeholders; presumably substituted by the caller - verify.
# NOTE(review): the closing quote after "Operation: " is missing in the prompt
# text; left as is because the wording is part of the prompt.
LLAMA_Prompt_USER_l1="""
Please generate the next move according to the ui screenshot, instruction and previous actions. 

Instruction: {overall_goal} 

Previous actions: {previous_actions}

Please generate the natural language description of the current action in 'Action: ' and the specific execution action in 'Operation: .
"""
# Bundle layout: [system prompt, user prompt, chat template].
TEMPLATES['QWEN2VL_Llama_prompt_l1']=[AGUVIS_SYS,LLAMA_Prompt_USER_l1,qwen2vl_template]




### sys2user
# Preset 'sys2user': the system prompt is a generic one-liner and the agent
# description plus the function list are moved into the user prompt.
sys2user_sys='You are a helpful assistant.'
# The JSON braces in the function list are doubled ({{ and }}), which leaves
# `{overall_goal}` and `{previous_actions}` as the only single-brace fields;
# presumably the caller applies str.format to this string - verify.
# NOTE(review): the closing quote after "Operation: " is missing in the prompt
# text; left as is because the wording is part of the prompt.
sys2user_user="""
You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.

You have access to the following functions:
- {{"name": "mobile.swipe", "description": "Swipe on the screen", "parameters": {{"type": "object", "properties": {{"from_coord": {{"type": "array", "items": {{"type": "number"}}, "description": "The starting coordinates of the swipe"}}, "to_coord": {{"type": "array", "items": {{"type": "number"}}, "description": "The ending coordinates of the swipe"}}}}, "required": ["from_coord", "to_coord"]}}}}
- {{"name": "mobile.home", "description": "Press the home button"}}
- {{"name": "mobile.back", "description": "Press the back button"}}
- {{"name": "mobile.wait", "description": "wait for the change to happen", "parameters": {{"type": "object", "properties": {{"seconds": {{"type": "number", "description": "The seconds to wait"}}}}, "required": ["seconds"]}}}}
- {{"name": "mobile.long_press", "description": "Long press on the screen", "parameters": {{"type": "object", "properties": {{"x": {{"type": "number", "description": "The x coordinate of the long press"}}, "y": {{"type": "number", "description": "The y coordinate of the long press"}}}}, "required": ["x", "y"]}}}}
- {{"name": "mobile.open_app", "description": "Open an app on the device", "parameters": {{"type": "object", "properties": {{"app_name": {{"type": "string", "description": "The name of the app to open"}}}}, "required": ["app_name"]}}}}

Please generate the next move according to the UI screenshot, instruction and previous actions. 

Instruction: {overall_goal}

Previous actions: {previous_actions}

Please generate the observation of the current screen in 'Observation: ', the thinking of the current action in 'Thought: ' and the natural language description of the current action in 'Action: '. Finally, the specific execution action is generated in 'Operation: 
"""
# Bundle layout: [system prompt, user prompt, chat template].
TEMPLATES['sys2user']=[sys2user_sys,sys2user_user,qwen2vl_template]


### QWEN2VL_Llama_Format
# Preset 'QWEN2VL_Llama_Format': same system prompt as AGUVIS (alias, not a
# copy); the user prompt asks for the answer in XML-like tags
# (<observation>, <thinking>, <action>, <tool_call>).
QWEN2VL_Llama_Format_SYS=AGUVIS_SYS
# `{overall_goal}` and `{previous_actions}` are placeholders left in the text;
# presumably substituted by the caller - verify.
QWEN2VL_Llama_Format_USER="""
Please generate the next move according to the ui screenshot, instruction and previous actions.

Instruction: {overall_goal} 

Previous actions: {previous_actions}
Please generate the observation of the current screen in <observation></observation>, the thinking of the current action in <thinking></thinking>, and the natural language description of the current action in <action></action>. Finally, the specific execution action is generated in <tool_call></tool_call>
"""
# Bundle layout: [system prompt, user prompt, chat template].
TEMPLATES['QWEN2VL_Llama_Format']=[QWEN2VL_Llama_Format_SYS,QWEN2VL_Llama_Format_USER,qwen2vl_template]


### GRPO
# Preset 'GRPO': same system prompt as AGUVIS (alias, not a copy).
GRPO_SYS=AGUVIS_SYS
# Earlier variant of the user prompt, disabled: it asked only for <action> and
# <tool_call>, without the <thinking>/<answer> wrapper used below.
# GRPO_USER="""
# Please generate the next move according to the ui screenshot, instruction and previous actions.

# Instruction: {overall_goal} 

# Previous actions: {previous_actions}
# Please generate the natural language description of the current action in <action></action> and the specific execution action in <tool_call></tool_call>
# """



# Active user prompt: requests the reasoning in <thinking> tags and the final
# result as <answer> <action></action> <tool_call></tool_call> </answer>.
# `{overall_goal}` and `{previous_actions}` are placeholders; presumably
# substituted by the caller - verify.
GRPO_USER="""
Please generate the next move according to the ui screenshot, instruction and previous actions.

Instruction: {overall_goal} 

Previous actions: {previous_actions}

Please generate the natural language description of the current action in <action></action>. Finally, the specific execution action is generated in <tool_call></tool_call>.
First output the thinking process in <thinking> </thinking> tags and then output the final answer in <answer> <action></action> <tool_call></tool_call> </answer> tags.
"""

# Bundle layout: [system prompt, user prompt, chat template].
TEMPLATES['GRPO']=[GRPO_SYS,GRPO_USER,qwen2vl_template]


### R1
# Preset 'R1': the system prompt requires the reply to be wrapped in
# <think> </think> and <answer> </answer> tags.
# NOTE(review): the system prompt talks about a LaTeX answer in \boxed{},
# while the user prompt below describes GUI actions - confirm that this
# pairing is intended.
R1_SYS="""Solve the question. The user asks a question, and you solves it. You first thinks about the reasoning process in the mind and then provides the user with the answer. The answer is in latex format and wrapped in $...$. The final answer must be wrapped using the \\boxed{} command. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> Since $1+1=2$, so the answer is $2$. </think><answer> The answer is $\\boxed{2}$ </answer>, which means assistant's output should start with <think> and end with </answer>."""
# Unlike the other presets, the user prompt here is a list of two strings:
#   [0] agent description + function list (swipe, home, back, terminate);
#       it contains literal JSON braces and no placeholders.
#   [1] the instruction part with the `{overall_goal}` and
#       `{previous_actions}` placeholders.
# How the two parts are combined is up to the caller - not visible here.
R1_USER=["""You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nYou have access to the following functions:\n- {\"name\": \"mobile.swipe\", \"description\": \"Swipe on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"from_coord\": {\"type\": \"array\", \"items\": {\"type\": \"number\"}, \"description\": \"The starting coordinates of the swipe\"}, \"to_coord\": {\"type\": \"array\", \"items\": {\"type\": \"number\"}, \"description\": \"The ending coordinates of the swipe\"}}, \"required\": [\"from_coord\", \"to_coord\"]}}\n- {\"name\": \"mobile.home\", \"description\": \"Press the home button\"}\n- {\"name\": \"mobile.back\", \"description\": \"Press the back button\"}\n- {\"name\": \"mobile.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}\n \n""",
"""Please generate the next move according to the ui screenshot, instruction and previous actions.

Instruction: {overall_goal} 

Previous actions: {previous_actions}
"""]

# Bundle layout: [system prompt, user prompt parts, chat template].
TEMPLATES['R1']=[R1_SYS,R1_USER,qwen25vl_template]


### QWEN25VL
# Preset 'QWEN25VL': the system prompt declares a single `mobile_use` tool
# inside <tools></tools> and asks for calls inside <tool_call></tool_call>.
# NOTE(review): the text contains `{weight}x{height}` for the screen
# resolution; `weight` looks like a typo for `width`, but the placeholder name
# is part of the contract with whatever fills it in - confirm before renaming.
# NOTE(review): the string also contains literal JSON braces, so str.format
# would not work on it as written; presumably the placeholders are filled by
# plain replacement - verify against the caller.
# NOTE(review): there is a raw line break inside the "coordinate" description
# ("from the left" / "edge"), which ends up in the prompt text.
QWEN25VL_SYS="""You are a helpful assistant.\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name_for_human\": \"mobile_use\", \"name\": \"mobile_use\", \"description\": \"Use a touchscreen to interact with a mobile device, and take screenshots.\\n* This is an interface to a mobile device with touchscreen. You can perform actions like clicking, typing, swiping, etc.\\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions.\\n* The screen's resolution is {weight}x{height}.\\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\", \"parameters\": {\"properties\": {\"action\": {\"description\": \"The action to perform. The available actions are:\\n* `key`: Perform a key event on the mobile device.\\n    - This supports adb's `keyevent` syntax.\\n    - Examples: \\\"volume_up\\\", \\\"volume_down\\\", \\\"power\\\", \\\"camera\\\", \\\"clear\\\".\\n* `click`: Click the point on the screen with coordinate (x, y).\\n* `long_press`: Press the point on the screen with coordinate (x, y) for specified seconds.\\n* `swipe`: Swipe from the starting point with coordinate (x, y) to the end point with coordinates2 (x2, y2).\\n* `type`: Input the specified text into the activated input box.\\n* `system_button`: Press the system button.\\n* `open`: Open an app on the device.\\n* `wait`: Wait specified seconds for the change to happen.\\n* `terminate`: Terminate the current task and report its completion status.\", \"enum\": [\"key\", \"click\", \"long_press\", \"swipe\", \"type\", \"system_button\", \"open\", \"wait\", \"terminate\"], \"type\": \"string\"}, \"coordinate\": {\"description\": \"(x, y): The x (pixels from the left 
edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=click`, `action=long_press`, and `action=swipe`.\", \"type\": \"array\"}, \"coordinate2\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=swipe`.\", \"type\": \"array\"}, \"text\": {\"description\": \"Required only by `action=key`, `action=type`, and `action=open`.\", \"type\": \"string\"}, \"time\": {\"description\": \"The seconds to wait. Required only by `action=long_press` and `action=wait`.\", \"type\": \"number\"}, \"button\": {\"description\": \"Back means returning to the previous interface, Home means returning to the desktop, Menu means opening the application background menu, and Enter means pressing the enter. Required only by `action=system_button`\", \"enum\": [\"Back\", \"Home\", \"Menu\", \"Enter\"], \"type\": \"string\"}, \"status\": {\"description\": \"The status of the task. Required only by `action=terminate`.\", \"type\": \"string\", \"enum\": [\"success\", \"failure\"]}}, \"required\": [\"action\"], \"type\": \"object\"}, \"args_format\": \"Format the arguments as a JSON object.\"}}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>"""
# User prompt: task, action history, and a request for a <conclusion> summary
# after the tool call.  `{overall_goal}` and `{previous_actions}` are
# placeholders; presumably substituted by the caller - verify.
QWEN25VL_USER="""The user query: {overall_goal} (You have done the following operation on the current device): {previous_actions} \nAfter answering, summarize your action in <conclusion></conclusion> tags, and insert them after the <tool_call></tool_call> XML tags."""

# Bundle layout: [system prompt, user prompt, chat template].
TEMPLATES['QWEN25VL']=[QWEN25VL_SYS,QWEN25VL_USER,qwen25vl_template]
### QWEN25VL-future-steps
# Preset 'QWEN25VL-future-steps': uses the same system prompt as the
# 'QWEN25VL' preset.  QWEN25VL_SYS (defined above) is reused rather than
# re-declared with the same text, so the two presets cannot drift apart.
#
# QWEN25VL_USER is rebound here: after import the module-level name refers to
# this future-steps variant, while TEMPLATES['QWEN25VL'] keeps the string that
# was registered earlier.  The prompt additionally asks for a
# <future steps></future steps> plan before the tool call.
# `{overall_goal}` and `{previous_actions}` are placeholders; presumably
# substituted by the caller - verify.
QWEN25VL_USER="""The user query: {overall_goal} (You have done the following operation on the current device): {previous_actions}. \nBefore answering, give the steps on how to proceed to the task in <future steps></future steps> tags and insert them before the <tool_call></tool_call> XML tags. After answering, summarize your work in the<conclusion></conclusion> markup and insert them after the <tool_call></tool_call> XML tags"""

# Bundle layout: [system prompt, user prompt, chat template].
TEMPLATES['QWEN25VL-future-steps']=[QWEN25VL_SYS,QWEN25VL_USER,qwen25vl_template]