import base64
import io
import json
import time
import traceback
import demjson3

import requests
from PIL import Image

END_POINT = "http://localhost:8000/v1"  # Replace with actual endpoint

# system prompt
# The action schema is read once at import time from a fixed path. A context
# manager closes the file handle deterministically instead of leaving it open
# until garbage collection.
with open('/evaluation/agentcpm_schema.json', encoding="utf-8") as _schema_file:
    ACTION_SCHEMA = json.load(_schema_file)
# Rebuild the schema with a "required" entry spliced in at position 3 so the
# order of the surrounding keys is preserved.
items = list(ACTION_SCHEMA.items())
insert_index = 3
items.insert(insert_index, ("required", ["thought"])) # enable/disable thought by setting it to "required"/"optional"
ACTION_SCHEMA = dict(items)
# Prompt text describing the basic actions (CLICK / TYPE / SCROLL), the custom
# actions, and the expected "Thoughts" / "Actions" output sections. The
# f-string contains no placeholders, and no function visible in this file
# references SYSTEM_PROMPT.
# NOTE(review): "Custom Action 5" is named IMPOSSIBLE, but its purpose and
# format describe WAIT -- confirm which is intended before relying on it.
SYSTEM_PROMPT = f"""\nYou are a foundational action model capable of automating tasks across various digital environments, including desktop systems like Windows, macOS, and Linux, as well as mobile platforms such as Android and iOS. You also excel in web browser environments. You will interact with digital devices in a human-like manner: by reading screenshots, analyzing them, and taking appropriate actions.\n\nYour expertise covers two types of digital tasks:\n    - Grounding: Given a screenshot and a description, you assist users in locating elements mentioned. Sometimes, you must infer which elements best fit the description when they aren't explicitly stated.\n    - Executable Language Grounding: With a screenshot and task instruction, your goal is to determine the executable actions needed to complete the task.\n\n\nYou are now operating in Executable Language Grounding mode. Your goal is to help users accomplish tasks by suggesting executable actions that best fit their needs. Your skill set includes both basic and custom actions:\n\n1. Basic Actions\nBasic actions are standardized and available across all platforms. They provide essential functionality and are defined with a specific format, ensuring consistency and reliability. \nBasic Action 1: CLICK \n    - purpose: Click at the specified position.\n    - format: CLICK <point>[[x-axis, y-axis]]</point>\n    - example usage: CLICK <point>[[101, 872]]</point>\n       \nBasic Action 2: TYPE\n    - purpose: Enter specified text at the designated location.\n    - format: TYPE [input text]\n    - example usage: TYPE [Shanghai shopping mall]\n\nBasic Action 3: SCROLL\n    - purpose: Scroll in the specified direction.\n    - format: SCROLL [direction (UP/DOWN/LEFT/RIGHT)]\n    - example usage: SCROLL [UP]\n    \n2.Custom Actions\nCustom actions are unique to each user's platform and environment. They allow for flexibility and adaptability, enabling the model to support new and unseen actions defined by users. 
These actions extend the functionality of the basic set, making the model more versatile and capable of handling specific tasks.\n\n\nCustom Action 1: LONG_PRESS \n    - purpose: Long press at the specified position.\n    - format: LONG_PRESS <point>[[x-axis, y-axis]]</point>\n    - example usage: LONG_PRESS <point>[[101, 872]]</point>\n\nCustom Action 2: PRESS_BACK\n    - purpose: Press a back button to navigate to the previous screen.\n    - format: PRESS_BACK\n    - example usage: PRESS_BACK\n\nCustom Action 3: PRESS_HOME\n    - purpose: Press a home button to navigate to the home page.\n    - format: PRESS_HOME\n    - example usage: PRESS_HOME\n\nCustom Action 4: PRESS_RECENT\n    - purpose: Press the recent button to view or switch between recently used applications.\n    - format: PRESS_RECENT\n    - example usage: PRESS_RECENT\n\nCustom Action 5: IMPOSSIBLE\n    - purpose: Wait for the screen to load.\n    - format: WAIT\n    - example usage: WAIT\n\nCustom Action 6: COMPLETE\n    - purpose: Indicate the task is finished.\n    - format: COMPLETE\n    - example usage: COMPLETE\n\n\nIn most cases, task instructions are high-level and abstract. Carefully read the instruction and action history, then perform reasoning to determine the most appropriate next action. Ensure you strictly generate two sections: Thoughts and Actions.\nThoughts: Clearly outline your reasoning process for current step.\nActions: Specify the actual actions you will take based on your reasoning.\n\nYour current task instruction, action history, and associated screenshot are as follows:\nScreenshot: """

def image_to_data_url(image):
    """Encode an image as a base64 PNG ``data:`` URL.

    Args:
        image: Object exposing ``save(fp, format=...)``; ``predict`` passes
            its ``image`` argument here (presumably a PIL image -- confirm
            with callers).

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    # Only the ``io`` module is imported by this file, so the buffer class has
    # to be referenced as ``io.BytesIO``; a bare ``BytesIO`` is a NameError.
    with io.BytesIO() as buf:
        image.save(buf, format="PNG")
        b64_str = base64.b64encode(buf.getvalue()).decode()
    return f"data:image/png;base64,{b64_str}"

def build_history_actions_str(history_list):
    """Render past steps as numbered lines.

    Each entry becomes ``Step <n>: <entry>`` followed by a newline, with
    numbering starting at 1. An empty input yields an empty string.
    """
    numbered_lines = (
        f"Step {step_no}: {entry}\n"
        for step_no, entry in enumerate(history_list, start=1)
    )
    return "".join(numbered_lines)
def encode_image(image: "Image.Image") -> str:
    """Convert a PIL image to a base64-encoded JPEG string.

    Args:
        image: Image to encode. Modes the JPEG writer cannot store (for
            example ``RGBA`` or ``P``, common for PNG screenshots) are
            converted to ``RGB`` first instead of raising ``OSError``.

    Returns:
        The JPEG bytes encoded as base64 text (no ``data:`` prefix).
    """
    # Modes Pillow's JPEG encoder accepts directly; anything else (alpha
    # channels, palettes, ...) has to be flattened to RGB before saving.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if image.mode not in jpeg_modes:
        image = image.convert("RGB")
    with io.BytesIO() as in_mem_file:
        image.save(in_mem_file, format="JPEG")
        return base64.b64encode(in_mem_file.getvalue()).decode("utf-8")

def __resize__(origin_img, max_line_res=1120):
    """Return a copy of the image scaled so neither side exceeds a limit.

    The aspect ratio is kept (up to integer truncation). The height is
    limited first, then the width. The image is always passed through
    ``resize`` with LANCZOS resampling, even when no scaling is needed.

    Args:
        origin_img: Image exposing ``size`` as ``(width, height)`` and a
            ``resize`` method (a PIL image).
        max_line_res: Maximum allowed length of either side in pixels.
            ``None`` disables scaling. Defaults to 1120, the value that was
            previously hard-coded.

    Returns:
        The resized image returned by ``origin_img.resize``.
    """
    w, h = origin_img.size
    if max_line_res is not None:
        if h > max_line_res:
            # Clamp to 1 so extreme aspect ratios cannot truncate to a
            # zero-sized dimension, which resize() rejects.
            w = max(1, int(w * max_line_res / h))
            h = max_line_res
        if w > max_line_res:
            h = max(1, int(h * max_line_res / w))
            w = max_line_res
    return origin_img.resize((w, h), resample=Image.Resampling.LANCZOS)

def predict(model_name, instruction, low_instruction, history, image, *, max_retries=32768, retry_delay=2):
    """Ask an OpenAI-compatible chat endpoint for the next GUI step.

    Builds a single user message containing the screenshot (as a PNG data
    URL) and a text prompt with the instruction and action history, then
    sends it to ``END_POINT``. Failed requests are retried.

    Args:
        model_name: Model identifier forwarded to the completions API.
        instruction: High-level task instruction interpolated into the prompt.
        low_instruction: Accepted for interface compatibility; not used.
        history: Action history interpolated into the prompt as-is.
        image: Screenshot, forwarded to ``image_to_data_url``.
        max_retries: Maximum number of request attempts.
        retry_delay: Seconds to sleep after a failed attempt.

    Returns:
        The message content of the first choice, or ``None`` if every
        attempt failed.

    Raises:
        ImportError: If the ``openai`` package is not installed. This is
            raised immediately instead of being retried as a network error.
    """
    # Imported locally because the module does not import openai at file level.
    from openai import OpenAI

    a11y_tree = ""  # no accessibility tree is supplied; the prompt slot stays empty
    text = f"""You are a GUI task expert, I will provide you with a high-level instruction, an action history, a screenshot with its corresponding accessibility tree.

        High-level instruction: {instruction}
        Action history: {history}
        Accessibility tree: {a11y_tree}

        Please generate the low-level thought and action for the next step."""

    url = image_to_data_url(image)
    messages = [
        {"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": url}},
            {"type": "text", "text": text}
        ]}
    ]

    # The client and request options do not change between attempts, so they
    # are built once rather than on every retry.
    bot = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key="EMPTY",
        base_url=END_POINT,
        timeout=30
    )
    # top_k is not a standard OpenAI parameter; it is forwarded through
    # extra_body (presumably for a vLLM-style server -- confirm). top_k=1
    # restricts sampling to the single most likely token.
    kwargs = {'extra_body': {"top_k": 1}} # TODO

    for _ in range(max_retries):
        try:
            chat_completion_from_url = bot.chat.completions.create(model=model_name, messages=messages, **kwargs)
            return chat_completion_from_url.choices[0].message.content
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still stop
            # the retry loop.
            traceback.print_exc()
            print("Network Error:")
            print("Request Failed")
            time.sleep(retry_delay)
    return None

import json
import math


def os_gensis_2minicpm(action_str):
    """
    Convert a string containing low-level thoughts and action information into minicpm schema format

    Everything after the first ``"action: "`` marker is parsed with
    ``demjson3`` and mapped by its ``action_type``:

    - ``type``                          -> ``TYPE`` (the ``text`` field)
    - ``click`` / ``dismiss`` / ``get_text`` -> ``POINT`` ``[x, y]``
    - ``long_press``                    -> ``POINT`` plus ``duration`` 1000
    - ``navigate_home`` / ``navigate_back`` -> ``PRESS`` ``HOME`` / ``BACK``
    - ``scroll``                        -> ``POINT`` ``[500, 500]`` plus ``to``
    - ``open_app``                      -> ``OPEN_APP`` (the ``app_name`` field)
    - ``wait``                          -> ``duration`` 200

    Problems (missing marker, unparsable payload, unknown action type) are
    printed, not raised; the result then contains only what was filled in
    before the problem occurred.

    Args:
        action_str (str): String containing low-level thoughts and action information

    Returns:
        dict: Action dictionary in new format; always contains
        ``"STATUS": "continue"``.
    """
    result = {"STATUS": "continue"}
    marker = "action: "
    opposite_direction = {"up": "down", "down": "up", "left": "right", "right": "left"}

    try:
        # Extract the action part
        action_start = action_str.find(marker)
        if action_start == -1:
            print("Error: Cannot find action information")
            return result

        action_json_str = action_str[action_start + len(marker):].strip()
        print(action_json_str)
        action_dict = demjson3.decode(action_json_str)

        action_type = action_dict.get("action_type")

        if action_type == "type":
            result["TYPE"] = action_dict.get("text", "")
        elif action_type in ("click", "dismiss", "get_text"):
            result["POINT"] = [action_dict.get("x", 0), action_dict.get("y", 0)]
        elif action_type == "navigate_home":
            result["PRESS"] = "HOME"
        elif action_type == "navigate_back":
            result["PRESS"] = "BACK"
        elif action_type == "scroll":
            result["POINT"] = [500, 500]  # set default start point
            direction = action_dict.get("direction", "down").strip().lower()
            # USE_LOW_INSTRUCTION is not defined in this module; look it up
            # defensively so a scroll no longer dies with a NameError (which
            # silently dropped the "to" key). If it is set, reverse the
            # direction.
            if globals().get("USE_LOW_INSTRUCTION", False):
                direction = opposite_direction.get(direction, direction)
            result["to"] = direction
        elif action_type == "open_app":
            result["OPEN_APP"] = action_dict.get("app_name", "")
        elif action_type == "wait":
            result["duration"] = 200
        elif action_type == "long_press":
            result["POINT"] = [action_dict.get("x", 0), action_dict.get("y", 0)]
            result["duration"] = 1000  # set default duration
        else:
            print(f"Error, invalid action: {action_dict}")

    except demjson3.JSONDecodeError:
        # demjson3 raises its own JSONDecodeError, unrelated to
        # json.JSONDecodeError, so that is the type to catch here.
        print("Cannot parse action information as JSON")
    except Exception as e:
        print(f"Error: {e}")

    return result


if __name__ == "__main__":
    # Smoke test for the converter defined in this file. The previous demo
    # called map_action_space2qwenvl, which this module never defines, so
    # running the script ended in a NameError.
    sample_output = (
        'Low-level thought: click on the Add to Cart button\n'
        'action: {"action_type": "click", "x": 501, "y": 952}'
    )
    print(os_gensis_2minicpm(sample_output))
