import json
import os

from PIL import Image

# Path of the JSON dataset file that dataset_gen_train() opens and parses.
# Left empty here as a placeholder; it must point at a real file before the
# generator is iterated, otherwise open() fails.
dataset_path = ""

def dataset_gen_train():
    """Yield chat-format training samples built from the JSON dataset.

    The JSON file at the module-level ``dataset_path`` is loaded once and
    iterated. For every entry a single dict is yielded, of the form::

        {"messages": [system_msg, user_msg, assistant_msg]}

    * the system message carries the fixed agent prompt,
    * the user message carries the screenshot (as a PIL image resized to
      1000x1000) plus the task and the JSON-encoded step history,
    * the assistant message carries the target output: a ``<thinking>``
      section (analysis / reasoning / instruction) followed by an
      ``<answer>`` section holding the JSON-encoded action.

    Each entry is indexed with the keys ``image_path``, ``task``,
    ``description``, ``history``, ``intention``, ``instruction``,
    ``action_type`` and either ``sam2_bbox`` (when ``action_type`` is
    ``'CLICK'``) or ``action_info`` (otherwise).

    Yields:
        dict: one sample per dataset entry, in file order.

    Raises:
        OSError: if the dataset file or a screenshot cannot be opened.
        KeyError: if an entry lacks one of the keys listed above
            (assuming entries are dicts).
    """
    # Loop-invariant, so built once. This is a raw string: every "\n" below
    # is a literal backslash + "n", and the continuation lines keep their
    # leading spaces. The text is reproduced exactly because it is part of
    # the emitted training data.
    system_prompt = r'''You are an agent that is trained to complete certain tasks on a smartphone. The input consists of task instructions, the recent actions and the current screenshot, while the output corresponds to the actions of the current step. The coordinates of the upper left corner of the entire screenshot are [0,0], and those of the lower right corner are [1000,1000].\n
            Your output should include only action part in the given format:\n
            <thinking><analysis>... (one or two sentence)</analysis><reasoning>... (one or two sentence)</reasoning><instruction>...(refers to a set of fine-grained instructions that serve as atomic decompositions of high-level instructions, providing detailed steps for executing the next action on the current page (e.g., 'Tap the search button on the keyboard').)</instruction></thinking><answer>answer here</answer>\n'''
    # Directory that the per-entry 'image_path' values are joined onto.
    image_root = './dataset/three_step_rl/'

    with open(dataset_path, 'r', encoding='utf-8') as f:
        examples = json.load(f)

    for example in examples:
        image_path = os.path.join(image_root, example['image_path'])
        task = example['task']
        screen_description = example['description']
        history = json.dumps(example['history'], ensure_ascii=False)
        intention = example['intention']
        instruction = example['instruction']
        action_type = example['action_type']

        if action_type == 'CLICK':
            # Split the flat bbox list into consecutive pairs
            # (presumably [x1, y1, x2, y2] -> [[x1, y1], [x2, y2]]).
            bbox = example['sam2_bbox']
            action_info = [bbox[i:i + 2] for i in range(0, len(bbox), 2)]
        else:
            action_info = example['action_info']

        answer_json = json.dumps(
            {"action_type": action_type, "action_info": action_info},
            ensure_ascii=False,
        )
        # str.join keeps the strictness of '+' concatenation: a non-string
        # field raises TypeError instead of being silently stringified.
        answer = "".join([
            "<thinking>",
            "<analysis>", screen_description, "</analysis>",
            "<reasoning>", intention, "</reasoning>",
            "<instruction>", instruction, "</instruction>",
            "</thinking>\n",
            "<answer>", answer_json, "</answer>",
        ])
        user_text = (
            "Your task is: " + task
            + "\nHere is a summary of the preceding steps leading to the current stage of the task: "
            + history
        )

        with Image.open(image_path) as img:
            rgb = img if img.mode == 'RGB' else img.convert('RGB')
            # resize() returns a new image, so it is still usable after the
            # file handle is closed. The 1000x1000 size presumably matches
            # the [0,0]-[1000,1000] coordinate frame named in the prompt.
            image = rgb.resize((1000, 1000))

        yield {
            "messages": [
                {'role': 'system', 'content': [{"type": "text", "text": system_prompt}]},
                {"role": "user", "content": [
                    {"type": "image_pil", "image": image},
                    {"type": "text", "text": user_text},
                ]},
                {"role": "assistant", "content": [{"type": "text", "text": answer}]},
            ]
        }
