from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import json
from tqdm import tqdm

# Checkpoint used for BOTH the model weights and the processor, so the chat
# template / image preprocessing always match the loaded model.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Input benchmark (a JSON list of items) and output predictions (JSON Lines).
INPUT_PATH = "exploreBenchmark_qwen_inference_input.json"
OUTPUT_PATH = "android_control_test_Qwen2d5_action_answer.jsonl"

# Directory prefix prepended to every screenshot path.
IMAGE_ROOT = "androidcontrol/"

# Key under which the decoded answers are stored on each item.
# NOTE: the key says "72B" although MODEL_ID is the 7B checkpoint; it is kept
# unchanged so existing consumers of the output file keep working.
OUTPUT_KEY = "Qwen2d5_72B"

# Upper bound on generated tokens per prompt.
MAX_NEW_TOKENS = 128

# Action vocabulary shown to the model; every line ends with a newline.
CANDIDATE_ACTIONS = (
    '"action_type": "click", "ui": <ui_idx>\n'
    '"action_type": "long_press", "ui": <ui_idx>\n'
    '"action_type": "type", "text": <text_input>\n'
    '"action_type": "scroll", "direction": <up, down, left, or right>\n'
    '"action_type": "navigate_home"\n'
    '"action_type": "navigate_back"\n'
    '"action_type": "open_app", "app_name": <app_name>\n'
    '"action_type": "wait"\n'
    '"action_type": "status", "goal_status": <"successful","infeasible">\n'
)


def build_question(sub_instruction, ui_item):
    """Return the text prompt for one low-level instruction.

    Args:
        sub_instruction: Value of the step's "Sub-Instruction" field.
        ui_item: Value of the step's "UI item" field.

    Returns:
        The prompt string. Trailing spaces before some newlines are part of
        the prompt and are spelled out explicitly so that editors stripping
        trailing whitespace cannot silently change the model input.
    """
    return (
        "\n"
        "You are a GUI task expert, I will provide you with a low-level "
        "instruction, an golden ui with its corresponding id.\n"
        f"Low-level instruction: {sub_instruction}\n"
        f"UI id: {ui_item}\n"
        "Please generate the action for the next step. \n"
        "Candidate Actions: \n"
        + CANDIDATE_ACTIONS
        # Plain (non-f) literals: "{ACTION}" must reach the model verbatim.
        + "You need to generate a script in the form: actions: {ACTION}\n"
        "Make sure to consider the details in the screenshot and the task "
        "requirements to create an accurate and functional script."
    )


def resolve_image_path(raw_filename):
    """Return the screenshot path under IMAGE_ROOT.

    Everything up to and including the last "./" in ``raw_filename`` is
    dropped before the prefix is added; a name without "./" is used as is.
    """
    return IMAGE_ROOT + raw_filename.split("./")[-1]


def build_messages(image_path, question):
    """Return a single-turn user conversation with one image and one text part."""
    return [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": question},
            ],
        }
    ]


def load_model_and_processor():
    """Load the model and its processor from MODEL_ID.

    Returns:
        A ``(model, processor)`` tuple. The model is loaded with
        ``torch_dtype="auto"`` and ``device_map="auto"``; the processor is
        created with ``padding_side='left'``.
    """
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID, torch_dtype="auto", device_map="auto"
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID, padding_side='left')
    return model, processor


def answer_item(item, model, processor):
    """Generate one answer per entry of ``item["gpt_output_split"]``.

    All entries of the item share the same screenshot and are sent to the
    model as a single padded batch.

    Args:
        item: Benchmark record; reads "img_filename" and "gpt_output_split"
            (each entry providing "Sub-Instruction" and "UI item").
        model: Loaded generation model.
        processor: Processor matching ``model``.

    Returns:
        A list of decoded strings, one per entry, in input order. An item
        with no entries yields an empty list without calling the model.

    Raises:
        KeyError: If a required field is missing from ``item`` or an entry.
    """
    image_path = resolve_image_path(item["img_filename"])
    texts = []
    images = []
    for step in item["gpt_output_split"]:
        question = build_question(step["Sub-Instruction"], step["UI item"])
        print(question)
        messages = build_messages(image_path, question)
        texts.append(
            processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        )
        step_images, _ = process_vision_info(messages)
        # process_vision_info returns a list; extend keeps `images` flat,
        # with one image per prompt in the same order as `texts`.
        images.extend(step_images)

    # Nothing to ask: skip the processor/model, which reject empty batches.
    if not texts:
        return []

    inputs = processor(
        text=texts,
        images=images,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device chosen by device_map="auto" instead of assuming CUDA.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
    # generate() returns prompt + completion; slice off the prompt tokens.
    completions = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    return processor.batch_decode(
        completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )


def main():
    """Run inference over INPUT_PATH and write one JSON line per item to OUTPUT_PATH."""
    model, processor = load_model_and_processor()

    with open(INPUT_PATH, "r", encoding="utf-8") as fin:
        data = json.load(fin)

    # The context manager guarantees the results file is flushed and closed,
    # even if inference fails part-way through.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as fout:
        for item in tqdm(data):
            output_text = answer_item(item, model, processor)
            item[OUTPUT_KEY] = output_text
            print(output_text)
            fout.write(json.dumps(item, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    main()