
from typing import TYPE_CHECKING, Dict
import sys
import json

# Registry of prompt bundles keyed by model / prompt-variant name.
# Each value is a three-item list: [system prompt, user prompt, chat template].
# The user prompt is normally a str, but an entry may store a list of strings
# instead, so the value type is a plain list rather than str.
TEMPLATES: Dict[str, list] = {}


def get_register_template(model_name: str) -> list:
    """Return the prompt bundle registered under ``model_name``.

    Args:
        model_name: Key that was previously stored in ``TEMPLATES``.

    Returns:
        The list stored for that key (the same object, not a copy).

    Raises:
        SystemExit: If ``model_name`` is not registered. The exit message
            names the missing key and lists the registered ones.
    """
    if model_name not in TEMPLATES:
        available = ", ".join(sorted(TEMPLATES))
        sys.exit(f"no model named {model_name} (available: {available})")
    return TEMPLATES[model_name]


# Chat templates written in Jinja-style syntax. All three render each message
# as "<|im_start|>{role}\n{content}<|im_end|>\n". When the content is a list,
# image items become "<|vision_start|><|image_pad|><|vision_end|>", video items
# become "<|vision_start|><|video_pad|><|vision_end|>" (each prefixed with
# "Picture N: " / "Video N: " when `add_vision_id` is set), and text items are
# emitted verbatim. An assistant header is appended when
# `add_generation_prompt` is set.
#
# qwen2vl_template: no implicit system turn; the generation prompt is
# "<|im_start|>assistant\n".
qwen2vl_template = "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
# aguvis_template: inserts a default "You are a helpful assistant." system turn
# when the first message is not a system message; the generation prompt is
# "<|im_start|>assistant<|recipient|>all\nThought:".
aguvis_template="{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant<|recipient|>all\nThought:{% endif %}"
# qwen25vl_template: same default system turn as aguvis_template, but the
# generation prompt is the plain "<|im_start|>assistant\n".
qwen25vl_template="{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"




# TODO: AGUVIS
def _number_array(description):
    """Return a fresh schema for a parameter that is an array of numbers."""
    return {
        "type": "array",
        "items": {"type": "number"},
        "description": description,
    }


def _number(description):
    """Return a fresh schema for a single numeric parameter."""
    return {"type": "number", "description": description}


# Function descriptions that get serialized into the agent's system prompt.
# Key order matters because the dicts are dumped to JSON text verbatim.

# Actions that take no parameters.
home_func = dict(name="home", description="Go to the home screen")
back_func = dict(name="back", description="Go to the previous screen")
recent_func = dict(name="recent", description="Go to the previous APP")
complete_func = dict(
    name="complete",
    description="The sign that the task has been completed",
)

# Swipe between two coordinate arrays.
swipe_func = dict(
    name="swipe",
    description="Swipe on the screen",
    parameters={
        "type": "object",
        "properties": {
            "from_coord": _number_array("The starting coordinates of the swipe"),
            "to_coord": _number_array("The ending coordinates of the swipe"),
        },
        "required": ["from_coord", "to_coord"],
    },
)

# Long press at a single (x, y) point.
long_press_func = dict(
    name="long_press",
    description="Long press on the screen",
    parameters={
        "type": "object",
        "properties": {
            "x": _number("The x coordinate of the long press"),
            "y": _number("The y coordinate of the long press"),
        },
        "required": ["x", "y"],
    },
)

# End the task; the only status value offered is "success".
terminate_func = dict(
    name="terminate",
    description="Terminate the current task and report its completion status",
    parameters={
        "type": "object",
        "properties": {
            "status": {
                "type": "string",
                "enum": ["success"],
                "description": "The status of the task",
            },
        },
        "required": ["status"],
    },
)


# Function schemas advertised in the AGUVIS system prompt, in prompt order.
# complete_func is not part of this list.
_AGUVIS_FUNCTIONS = (
    swipe_func,
    long_press_func,
    home_func,
    back_func,
    recent_func,
    terminate_func,
)

# System prompt: fixed preamble followed by one "- <json>" bullet per schema,
# each bullet terminated by a newline.
AGUVIS_SYS = (
    "You are a GUI agent. You are given a task and a screenshot of the screen. "
    "You need to perform a series of pyautogui actions to complete the task.\n"
    "\n"
    "You have access to the following functions:\n"
    + "".join(f"- {json.dumps(schema)}\n" for schema in _AGUVIS_FUNCTIONS)
)

# User prompt with {overall_goal} and {previous_actions} placeholders.
# The trailing space after {overall_goal} is part of the original prompt text.
AGUVIS_USER = (
    "\n"
    "Please generate the next move according to the ui screenshot, instruction and previous actions.\n"
    "\n"
    "Instruction: {overall_goal} \n"
    "\n"
    "Previous actions: {previous_actions}\n"
)

TEMPLATES["AGUVIS"] = [AGUVIS_SYS, AGUVIS_USER, aguvis_template]



### QWEN2VL_Llama & QWEN2VL_Llama_short

# Short system prompt: task framing only, without the function list.
SHORT_SYS = (
    "You are a GUI agent. You are given a task and a screenshot of the screen. "
    "You need to perform a series of pyautogui actions to complete the task.\n"
)

# User prompt with {overall_goal} and {previous_actions} placeholders.
# The trailing spaces before two of the newlines are part of the prompt text.
LLAMA_USER = (
    "\n"
    "Please generate the next move according to the ui screenshot, instruction and previous actions. \n"
    "\n"
    "Instruction: {overall_goal} \n"
    "\n"
    "Previous actions: {previous_actions}\n"
)

# Both variants share the user prompt and chat template; only the system
# prompt differs (full function list vs. short framing).
TEMPLATES.update(
    {
        "QWEN2VL_Llama": [AGUVIS_SYS, LLAMA_USER, qwen2vl_template],
        "QWEN2VL_Llama_short": [SHORT_SYS, LLAMA_USER, qwen2vl_template],
    }
)

### QWEN2VL_Llama_prompt_l1
# System prompt listing the "mobile.*" functions (swipe, home, back, wait,
# long_press, open_app). The literal is opened with exactly three quotes: a
# fourth quote would become the first character of the prompt.
# NOTE(review): this constant is not referenced by the registration below,
# which uses AGUVIS_SYS instead -- confirm which system prompt is intended
# before wiring it in, since that would change the model-facing text.
LLAMA_Prompt_SYS_l1 = '''
You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nYou have access to the following functions:\n- {\"name\": \"mobile.swipe\", \"description\": \"Swipe on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"from_coord\": {\"type\": \"array\", \"items\": {\"type\": \"number\"}, \"description\": \"The starting coordinates of the swipe\"}, \"to_coord\": {\"type\": \"array\", \"items\": {\"type\": \"number\"}, \"description\": \"The ending coordinates of the swipe\"}}, \"required\": [\"from_coord\", \"to_coord\"]}}\n- {\"name\": \"mobile.home\", \"description\": \"Press the home button\"}\n- {\"name\": \"mobile.back\", \"description\": \"Press the back button\"}\n- {\"name\": \"mobile.wait\", \"description\": \"wait for the change to happen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"seconds\": {\"type\": \"number\", \"description\": \"The seconds to wait\"}}, \"required\": [\"seconds\"]}}\n- {\"name\": \"mobile.long_press\", \"description\": \"Long press on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the long press\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the long press\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"mobile.open_app\", \"description\": \"Open an app on the device\", \"parameters\": {\"type\": \"object\", \"properties\": {\"app_name\": {\"type\": \"string\", \"description\": \"The name of the app to open\"}}, \"required\": [\"app_name\"]}}\n
'''

# User prompt with {overall_goal} and {previous_actions} placeholders, plus a
# request for an 'Action: ' description and an 'Operation: ' command. The text
# is kept exactly as-is (including the unbalanced quote after "Operation: "
# and the trailing spaces) because it is model-facing.
LLAMA_Prompt_USER_l1 = (
    "\n"
    "Please generate the next move according to the ui screenshot, instruction and previous actions. \n"
    "\n"
    "Instruction: {overall_goal} \n"
    "\n"
    "Previous actions: {previous_actions}\n"
    "\n"
    "Please generate the natural language description of the current action in 'Action: ' and the specific execution action in 'Operation: .\n"
)

TEMPLATES["QWEN2VL_Llama_prompt_l1"] = [
    AGUVIS_SYS,
    LLAMA_Prompt_USER_l1,
    qwen2vl_template,
]


### QWEN2VL_Llama_Format
# Structured-output variant: reuses the AGUVIS system prompt and asks the
# model to answer in <observation>, <thinking>, <action> and <tool_call> tags.
QWEN2VL_Llama_Format_SYS = AGUVIS_SYS

# User prompt with {overall_goal} and {previous_actions} placeholders.
# There is no blank line between the previous-actions line and the format
# request; the trailing space after {overall_goal} is part of the prompt.
QWEN2VL_Llama_Format_USER = (
    "\n"
    "Please generate the next move according to the ui screenshot, instruction and previous actions.\n"
    "\n"
    "Instruction: {overall_goal} \n"
    "\n"
    "Previous actions: {previous_actions}\n"
    "Please generate the observation of the current screen in <observation></observation>, "
    "the thinking of the current action in <thinking></thinking>, "
    "and the natural language description of the current action in <action></action>. "
    "Finally, the specific execution action is generated in <tool_call></tool_call>\n"
)

TEMPLATES["QWEN2VL_Llama_Format"] = [
    QWEN2VL_Llama_Format_SYS,
    QWEN2VL_Llama_Format_USER,
    qwen2vl_template,
]


### R1

# System prompt asking for output wrapped in <think> </think> and
# <answer> </answer> tags, with the final answer in a LaTeX \boxed{} command.
# The "\\boxed" escapes in the literal yield a single backslash at runtime.
R1_SYS="""Solve the question. The user asks a question, and you solves it. You first thinks about the reasoning process in the mind and then provides the user with the answer. The answer is in latex format and wrapped in $...$. The final answer must be wrapped using the \\boxed{} command. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> Since $1+1=2$, so the answer is $2$. </think><answer> The answer is $\\boxed{2}$ </answer>, which means assistant's output should start with <think> and end with </answer>."""
# Unlike the other entries, the user slot is a two-element list:
#   [0] GUI-agent framing plus the list of "mobile.*" functions
#       (swipe, home, back, terminate);
#   [1] the per-step request with {overall_goal} / {previous_actions}
#       placeholders.
# NOTE(review): consumers of TEMPLATES['R1'] must handle a list in slot 1;
# how the two parts are combined is not visible in this file.
R1_USER=["""You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nYou have access to the following functions:\n- {\"name\": \"mobile.swipe\", \"description\": \"Swipe on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"from_coord\": {\"type\": \"array\", \"items\": {\"type\": \"number\"}, \"description\": \"The starting coordinates of the swipe\"}, \"to_coord\": {\"type\": \"array\", \"items\": {\"type\": \"number\"}, \"description\": \"The ending coordinates of the swipe\"}}, \"required\": [\"from_coord\", \"to_coord\"]}}\n- {\"name\": \"mobile.home\", \"description\": \"Press the home button\"}\n- {\"name\": \"mobile.back\", \"description\": \"Press the back button\"}\n- {\"name\": \"mobile.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}\n \n""",
"""Please generate the next move according to the ui screenshot, instruction and previous actions.

Instruction: {overall_goal} 

Previous actions: {previous_actions}
"""]

# Registered with the qwen25vl chat template (default system turn, plain
# assistant generation prompt).
TEMPLATES['R1']=[R1_SYS,R1_USER,qwen25vl_template]

### QWEN25VL
# Tool-calling system prompt: describes a single `mobile_use` function inside
# <tools></tools> and asks for calls inside <tool_call></tool_call>.
# The screen resolution is left as the placeholders {weight}x{height}.
# NOTE(review): the placeholder is spelled `weight` (presumably "width");
# callers must substitute that exact token. The literal also contains
# unescaped JSON braces, so str.format cannot be applied to it as-is --
# presumably the placeholders are filled with str.replace; confirm at the
# call site.
# NOTE(review): the literal spans two physical lines, which embeds a real
# newline between "pixels from the left " and "edge)" inside the `coordinate`
# description. This looks like an accidental hard wrap; it is left untouched
# here because the text is model-facing.
QWEN25VL_SYS="""You are a helpful assistant.\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name_for_human\": \"mobile_use\", \"name\": \"mobile_use\", \"description\": \"Use a touchscreen to interact with a mobile device, and take screenshots.\\n* This is an interface to a mobile device with touchscreen. You can perform actions like clicking, typing, swiping, etc.\\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions.\\n* The screen's resolution is {weight}x{height}.\\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\", \"parameters\": {\"properties\": {\"action\": {\"description\": \"The action to perform. The available actions are:\\n* `key`: Perform a key event on the mobile device.\\n    - This supports adb's `keyevent` syntax.\\n    - Examples: \\\"volume_up\\\", \\\"volume_down\\\", \\\"power\\\", \\\"camera\\\", \\\"clear\\\".\\n* `click`: Click the point on the screen with coordinate (x, y).\\n* `long_press`: Press the point on the screen with coordinate (x, y) for specified seconds.\\n* `swipe`: Swipe from the starting point with coordinate (x, y) to the end point with coordinates2 (x2, y2).\\n* `type`: Input the specified text into the activated input box.\\n* `system_button`: Press the system button.\\n* `open`: Open an app on the device.\\n* `wait`: Wait specified seconds for the change to happen.\\n* `terminate`: Terminate the current task and report its completion status.\", \"enum\": [\"key\", \"click\", \"long_press\", \"swipe\", \"type\", \"system_button\", \"open\", \"wait\", \"terminate\"], \"type\": \"string\"}, \"coordinate\": {\"description\": \"(x, y): The x (pixels from the left 
edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=click`, `action=long_press`, and `action=swipe`.\", \"type\": \"array\"}, \"coordinate2\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=swipe`.\", \"type\": \"array\"}, \"text\": {\"description\": \"Required only by `action=key`, `action=type`, and `action=open`.\", \"type\": \"string\"}, \"time\": {\"description\": \"The seconds to wait. Required only by `action=long_press` and `action=wait`.\", \"type\": \"number\"}, \"button\": {\"description\": \"Back means returning to the previous interface, Home means returning to the desktop, Menu means opening the application background menu, and Enter means pressing the enter. Required only by `action=system_button`\", \"enum\": [\"Back\", \"Home\", \"Menu\", \"Enter\"], \"type\": \"string\"}, \"status\": {\"description\": \"The status of the task. Required only by `action=terminate`.\", \"type\": \"string\", \"enum\": [\"success\", \"failure\"]}}, \"required\": [\"action\"], \"type\": \"object\"}, \"args_format\": \"Format the arguments as a JSON object.\"}}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>"""
# User prompt with {overall_goal} and {previous_actions} placeholders; asks
# for a <conclusion></conclusion> summary after the tool call.
# The single quotes and the leading/trailing newlines are part of the string
# value, not Python delimiters.
QWEN25VL_USER="""
'The user query: {overall_goal} (You have done the following operation on the current device): {previous_actions} \nAfter answering, summarize your action in <conclusion></conclusion> tags, and insert them after the <tool_call></tool_call> XML tags.'
"""

# The list captures the current string objects, so later rebinding of
# QWEN25VL_SYS / QWEN25VL_USER does not alter this entry.
TEMPLATES['QWEN25VL']=[QWEN25VL_SYS,QWEN25VL_USER,qwen25vl_template]
### QWEN25VL-future-steps
# Future-steps variant of the QWEN25VL prompts.
#
# The system prompt is the QWEN25VL_SYS already bound by the QWEN25VL section
# directly above. This section used to re-assign QWEN25VL_SYS to a second,
# character-for-character identical copy of that literal; the copy is dropped
# so the two variants cannot silently drift apart. QWEN25VL_SYS keeps the same
# value at module level.
#
# QWEN25VL_USER is deliberately rebound: the 'QWEN25VL' entry keeps the
# earlier string object, while this entry (and the module-level name from here
# on) uses the version that additionally requests <future steps> before the
# tool call. The prompt text is model-facing and is reproduced verbatim.
QWEN25VL_USER="""The user query: {overall_goal} (You have done the following operation on the current device): {previous_actions}. \nBefore answering, give the steps on how to proceed to the task in <future steps></future steps> tags and insert them before the <tool_call></tool_call> XML tags. After answering, summarize your work in the<conclusion></conclusion> markup and insert them after the <tool_call></tool_call> XML tags"""

TEMPLATES['QWEN25VL-future-steps']=[QWEN25VL_SYS,QWEN25VL_USER,qwen25vl_template]
