import base64
import json
import os

from openai import OpenAI
from tqdm import tqdm

sys_prompt = """You are a mobile expert who excels at interacting with elements on mobile screens to complete tasks. I have a task for you, and I hope you can use your extensive knowledge to identify interactive elements on mobile screens. I will provide you with the following information:
 
1. The type of action currently being executed, which can be one of five types: CLICK, SCROLL, TYPE, PRESS_BACK, and LONG_PRESS. You need to choose an action that can interact with the current screen.
2. Analysis of the mobile screen, which corresponds to the marked boxes in the images.
 
Your task is to identify five interactive elements on the current mobile screen. The output should include four parts:
 
1. Sub-Instruction: Identify the interactive elements and generate natural language instructions for interacting with these elements. The instructions should be concise, clear, and executable, and must include critical details such as filenames, times, or other content as they appear on the screen. For example: "Scroll left to open the app drawer, displaying all installed applications on the device", "Click the chat interface, allowing the user to view and participate in conversation", "Type the username 'Agent', preparing for the next step in logging into the account".
2. Analysis: Analyze possible subsequent operations based on the current interface and action instructions. This analysis should involve step-by-step reasoning, considering potential changes on the screen and actions that can be taken after these changes. For example: "After clicking the plus button, a dropdown menu appears with an option to create a document. I can select this option to create a new document. First, I need to name the document, then enter content into the document, and finally save the document and exit".
3. High-Level Instruction: Based on the analysis results, envision a high-level task that can be completed within the current interface. There are two types of High-Level Instructions: Task-Oriented: Completing a series of operations to achieve a specific goal. Question-Oriented: Performing a series of operations and deriving an answer to a specific question. For example: Share my favorite Book \"the Queen\\'s Gambit\" to my Friend Natalie larson over her gmail address -natalie.larson1998@gmail.com from the  PocketBook app. Ensure that the High-Level Instruction is executable by including all critical specifics, such as filenames, relevant timings, or required details.
4. UI item: Based on the current page parsed result and action instructions, identify the corresponding UI item and provide the specific number.
 
You only need to return a dictionary formatted as follows:
{
  "Sub-Instruction": "xxx",
  "Analysis": "xxx",
  "High-Level-Instruction": "xxx",
  "UI item": x
}
 
Current screen analysis:
"""

# Shared OpenAI client used by main() for every request.
# The key is read from the OPENAI_API_KEY environment variable so that
# credentials never have to be written into this file. When the variable is
# unset the empty string is passed, matching the previous placeholder value.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", ""),
)


def encode_image(image_path):
    """Return the file at ``image_path`` as a base64-encoded text string.

    The file is read in binary mode in one go, base64-encoded, and the
    resulting bytes are decoded as UTF-8 so the value can be embedded in a
    data URL or a JSON payload.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

def main():
    """Query the chat model once per dataset item and save the replies.

    Loads the item list from ``android_control_test_simple.json``. For each
    item it sends the system prompt plus the item's ``ui_text``, the
    base64-encoded screenshot, and a request for several outputs to the
    chat-completions API. The reply text is stored under
    ``item["gpt_output"]`` and the item is written as one JSON object per
    line to ``android_control_test_gpt4omini.json``. The accumulated prompt
    and completion token counts are printed at the end.
    """
    data_dir = "android_control_test_simple.json"
    with open(data_dir, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # NOTE(review): the output file name mentions "gpt4omini" but the request
    # below uses model "gpt-4o" — confirm which one is intended.
    output_path = "android_control_test_gpt4omini.json"

    input_prompt_number = 0
    output_number = 0
    # The context manager closes the output file (flushing the lines already
    # written) even when a request raises part-way through the dataset.
    with open(output_path, 'w', encoding='utf-8') as fout:
        for item in tqdm(data):
            # Only the base name of the recorded path is used; screenshots
            # are looked up in the local annotated-image directory.
            img_filename = "./androidcontrol_test_is_clickable/" + item["img_filename"].split('/')[-1]
            base64_image = encode_image(img_filename)
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": 'text', "text": sys_prompt + item["ui_text"],
                        },
                        {
                            "type": 'image_url',
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                                "detail": "high"
                            },
                        },
                        {"type": 'text', "text": "Please give me 8 different outputs in the format above." },
                    ],
                }
            ]
            chat_completion = client.chat.completions.create(
                messages=messages,
                model="gpt-4o",
                max_tokens=3000,
                temperature=0.9,
                top_p=0.85
            )
            output = chat_completion.choices[0].message.content

            # Keep running totals so the overall token usage can be reported.
            input_prompt_number += chat_completion.usage.prompt_tokens
            output_number += chat_completion.usage.completion_tokens

            item["gpt_output"] = output
            # One JSON object per line; non-ASCII text is written unescaped.
            fout.write(json.dumps(item, ensure_ascii=False) + '\n')
    print(f"input token numbers: {input_prompt_number}. output token number: {output_number}")


# Run the annotation loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


