import base64
import json
import os

from openai import OpenAI
from tqdm import tqdm

# Instruction text placed at the start of every request; main() appends the
# item's "ui_text" to it before sending. It asks the model to judge whether a
# CLICK / LONG_PRESS action (marked by a red box on the screenshot) advances
# the low-level instruction, and to reply with a dict holding "Analysis" and
# "Correct".
# NOTE(review): the requested reply shows Yes/No unquoted, so replies may not
# parse as strict JSON — confirm how downstream code reads "gpt_output".
sys_prompt = """You are a mobile expert who excels at interacting with elements on mobile screens to complete tasks. I have a task for you, and I hope you can use your extensive knowledge to identify interactive elements on mobile screens. I will provide you with the following information:
 

1. A low-level instruction, which we will follow to perform actions on the current screen.
2. The type of action currently being executed, which can be one of two types: CLICK or LONG_PRESS. You need to determine whether this action can fulfill the current low-level instruction.
3. The current screen environment, with the position where the action(click and long_press) needs to be executed marked by a red box.
 
I will provide you with a screenshot, along with the low-level instructions and the action to be executed. Your task is to determine whether the current action brings us closer to achieving the low-level instruction. If the current action contributes to the realization of the low-level instruction, answer "Yes"; otherwise, answer "No".

You only need to return a dictionary formatted as follows:
{
  "Analysis": "xxx",
  "Correct": Yes/No
}
"""

# Shared API client used by main(). The key is taken from the OPENAI_API_KEY
# environment variable so that no secret has to be written into this file;
# when the variable is unset the key falls back to the empty string.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", ""),
)


def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode; the base64 bytes are then decoded to a
    ``str`` using UTF-8.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def main():
    """Ask the model to judge every benchmark item and save its answers.

    Loads the benchmark items from a JSON file. For each item it sends the
    system prompt plus the item's ``"ui_text"``, the matching screenshot
    (base64-encoded, looked up by the basename of ``"img_filename"``) and a
    closing question to the chat-completions endpoint. The reply text is
    stored on the item under ``"gpt_output"`` and the item is written to the
    output file as one JSON object per line. The accumulated prompt and
    completion token counts are printed when the loop finishes.
    """
    # input path
    data_dir = "exploreBenchmark_qwen_ValidUIComponents.json"
    with open(data_dir, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # output path
    # NOTE(review): the name says "gpt4omini" while the request below uses
    # model "gpt-4o" — confirm which one is intended.
    # NOTE(review): despite the .json extension, the content is JSON Lines
    # (one object per line), not a single JSON document.
    output_dir = "android_control_test_gpt4omini_width_checkingcorrect.json"

    input_prompt_number = 0
    output_number = 0
    # The context manager guarantees the output file is flushed and closed
    # even if a request raises part-way through the run.
    with open(output_dir, 'w', encoding='utf-8') as fout:
        for item in tqdm(data):
            # Only the basename of the recorded path is used; images are
            # expected in the local directory below.
            img_filename = "./androidcontrol_test_is_clickable/" + item["img_filename"].split('/')[-1]
            base64_image = encode_image(img_filename)
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": 'text',
                            "text": sys_prompt + item["ui_text"],
                        },
                        {
                            "type": 'image_url',
                            "image_url": {
                                # NOTE(review): the MIME type is always
                                # declared as JPEG — confirm the screenshots
                                # really are JPEG files.
                                "url": f"data:image/jpeg;base64,{base64_image}",
                                "detail": "high",
                            },
                        },
                        {
                            "type": 'text',
                            "text": "Do you think this action correctly executed the instruction requirements?",
                        },
                    ],
                }
            ]
            chat_completion = client.chat.completions.create(
                messages=messages,
                model="gpt-4o",
                max_tokens=3000,
                temperature=0.9,
                top_p=0.85,
            )
            output = chat_completion.choices[0].message.content

            # Guard against a response without usage data so a missing
            # counter does not abort the run.
            usage = chat_completion.usage
            if usage is not None:
                input_prompt_number += usage.prompt_tokens
                output_number += usage.completion_tokens

            item["gpt_output"] = output
            fout.write(json.dumps(item, ensure_ascii=False) + '\n')
            # Each record costs an API call, so push it to disk right away
            # rather than leaving it in the write buffer.
            fout.flush()
    print(f"input token numbers: {input_prompt_number}. output token number: {output_number}")


# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()


