import argparse
import os
import json
import ast
from prompt.aitzPrompt import AITZ_FOROSATLAS, AITZ_FORUITARS, AITZ_FORGUIR1, AITZ_OS_GENESIS_PROMPT, AITZHIGHACTIONPREDICTPROMPT_FOROSATLAS, AITZ_FORGPT5
import re, math
from tqdm import tqdm
import sys
sys.path.append("./")
from utils.schema.GUI_OWL.common import pil_to_base64, message_translate
from utils.logging_utils import setup_logger_to_stdout
from preprocess_base import BasePreProcess

logger = setup_logger_to_stdout()

def parse_args(args=None, namespace=None):
    """Parse the command-line options of the dataset-to-JSON conversion script.

    Args:
        args: Optional list of argument strings. ``None`` makes argparse
            fall back to ``sys.argv[1:]``.
        namespace: Optional object to populate with the parsed attributes.
            ``None`` makes argparse create a fresh ``argparse.Namespace``.

    Returns:
        The namespace holding ``dataset_name``, ``dataset_type``,
        ``dataset_path``, ``model_name`` and ``save_path``.
    """
    parser = argparse.ArgumentParser(description='Origin Dataset To Json')
    parser.add_argument('--dataset_name', type=str, default="AITZ",
                        help='dataset name')
    parser.add_argument('--dataset_type', type=str, default='all_low', help='dataset type')
    parser.add_argument('--dataset_path', type=str, default="/data3/cpz/datasets/android_in_the_zoo",
                        help='dataset path')
    parser.add_argument('--model_name', type=str, default="Aguvis",
                        help='model name')
    parser.add_argument('--save_path', type=str, default="/home/chengpengzhou/GUI_VISION/GUI-Speaker/datasets/json",
                        help='save path')
    # Forward both parameters: they were accepted by the signature but
    # previously dropped, so programmatic callers always got sys.argv.
    return parser.parse_args(args, namespace)


  
class AITZPreProcess(BasePreProcess):
    def __init__(self, dataset_type, dataset_path, dataset_name, save_path, model_name):
        """Initialise the base pre-processor and keep the AITZ specific settings.

        ``save_path`` is only forwarded to the base class; the remaining
        arguments are additionally stored on the instance.
        """
        super().__init__(dataset_path, dataset_name, save_path, model_name)
        # Stored after the base initialiser has run, as in the original order
        # of operations (base class first, local attributes second).
        self.model_name = model_name
        self.dataset_name = dataset_name
        self.dataset_path = dataset_path
        self.dataset_type = dataset_type

    def OS_ATLAS(self):
        """Convert the non-train AITZ splits into OS-Atlas formatted JSON files.

        Every step of every episode becomes one record: a deep copy of the
        template returned by ``super().OS_ATLAS()`` filled with the step's
        identifiers, screenshot path, image size, prompt and ground-truth
        action. The ``train`` split is skipped. One JSON file per remaining
        split is written under ``self.save_path``.
        """
        from copy import deepcopy

        sample = super().OS_ATLAS()
        # Action types that map to a fixed string without any argument.
        fixed_actions = {6: "PRESS_HOME", 5: "PRESS_BACK", 7: "ENTER", 10: "COMPLETE"}

        def actionMapping(action):
            """
            Map one step to one of
            ['CLICK', 'SCROLL', 'TYPE', 'PRESS_HOME', 'PRESS_BACK', 'ENTER']
            (or 'COMPLETE').

            Returns None, after logging an error, when the action type is
            unknown or a scroll description names no direction.
            """
            action_type = action['result_action_type']
            if action_type == 4:
                coat_action_desc = action['coat_action_desc'].lower()
                if coat_action_desc.startswith("scroll"):
                    # Same precedence as before: up, down, left, right.
                    for direction in ("up", "down", "left", "right"):
                        if direction in coat_action_desc:
                            return f"SCROLL [{direction.upper()}]"
                    logger.error(f"Action mapping error: {action}")
                    return None
                click = ast.literal_eval(action['result_touch_yx'])
                # result_touch_yx is stored as (y, x); both are scaled by 1000.
                y, x = click[0]*1000, click[1]*1000
                return f"CLICK <point>[[{x}, {y}]]</point>"
            if action_type == 3:
                return f"TYPE [{action['result_action_text']}]"
            fixed = fixed_actions.get(action_type)
            if fixed is None:
                logger.error(f"Action mapping error: {action}")
            return fixed

        path = self._merge_dataset_path()
        for key in path:
            if key == 'train':
                continue
            data = []
            logger.info(f"Processing the {key} samples in the {self.dataset_name}")
            for path_item in path[key]:
                logger.info(f"Processing: {path_item}")
                for episode in path[key][path_item]:
                    episode_name = str(episode.split('/')[-1])
                    episode_json = os.path.join(episode, episode_name+".json")
                    metadata = self.readJson(episode_json)
                    previous_action_history = []

                    for i, step in enumerate(metadata):
                        idx = step['step_id']
                        record = deepcopy(sample)
                        record["episode_id"] = step["episode_id"]
                        record["step_id"] = idx
                        record["images"] = [os.path.join(episode, episode_name+"_"+str(idx)+'.png')]
                        record['accessibility_trees'] = episode_json
                        record['goal'] = step['instruction']
                        image_size = self.readImage(record['images'][0])
                        record['image_size'] = [[image_size[0], image_size[1]]]
                        try:
                            mapped_action = actionMapping(step)
                        except Exception as e:
                            logger.info(f'extract action failure: {e}')
                            mapped_action = None
                        # An unmappable action keeps the template's default
                        # answer, exactly like the exception path above,
                        # instead of storing None and crashing on the
                        # string concatenation below.
                        if mapped_action is not None:
                            record['messages'][1]['content'] = mapped_action
                        record['label'] = "action:\n"+record['messages'][1]['content']
                        if 'low' in self.dataset_type:
                            record['messages'][0]['content'] = AITZ_FOROSATLAS.format(
                                finalGoal=step['instruction'],
                                actionDesc=step['coat_action_desc'],
                                SD=step['coat_screen_desc'],
                                previousActions=previous_action_history[:i],
                            )
                        else:
                            record['messages'][0]['content'] = AITZHIGHACTIONPREDICTPROMPT_FOROSATLAS.format(
                                finalGoal=step['instruction'],
                                SD=step['coat_screen_desc'],
                                previousActions=previous_action_history[:i],
                            )
                        previous_action_history.append(step['coat_action_desc'])
                        data.append(record)

            os.makedirs(self.save_path, exist_ok=True)
            self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+str(key)+"_"+self.model_name.lower()+'.json'))
            logger.info(f"transform {self.dataset_type} of {self.dataset_name} dataset to json succesfully")
        logger.info("Finished")

    def GPT_5(self):
        """Convert the non-train AITZ splits into GPT-5 formatted JSON files.

        Every mappable step becomes one record based on a deep copy of the
        template returned by ``super().GPT_5()``: the screenshot is embedded
        as a base64 data URL in the second content part and the prompt text
        is written to the first content part. The ``train`` split is skipped.
        One JSON file per remaining split is written under ``self.save_path``.
        """
        import base64
        from copy import deepcopy

        sample = super().GPT_5()
        # Action types that map to a fixed string without any argument.
        fixed_actions = {6: "PRESS_HOME", 5: "PRESS_BACK", 7: "ENTER", 10: "COMPLETE"}

        def actionMapping(action):
            """
            Map one step to one of
            ['CLICK', 'SCROLL', 'TYPE', 'PRESS_HOME', 'PRESS_BACK', 'ENTER']
            (or 'COMPLETE').

            Returns None, after logging an error, when the action type is
            unknown or a scroll description names no direction.
            """
            action_type = action['result_action_type']
            if action_type == 4:
                coat_action_desc = action['coat_action_desc'].lower()
                if coat_action_desc.startswith("scroll"):
                    # Same precedence as before: up, down, left, right.
                    for direction in ("up", "down", "left", "right"):
                        if direction in coat_action_desc:
                            return f"SCROLL [{direction.upper()}]"
                    logger.error(f"Action mapping error: {action}")
                    return None
                click = ast.literal_eval(action['result_touch_yx'])
                # result_touch_yx is stored as (y, x); both are scaled by 1000.
                y, x = click[0]*1000, click[1]*1000
                return f"CLICK <point>[[{x}, {y}]]</point>"
            if action_type == 3:
                return f"TYPE [{action['result_action_text']}]"
            fixed = fixed_actions.get(action_type)
            if fixed is None:
                logger.error(f"Action mapping error: {action}")
            return fixed

        path = self._merge_dataset_path()
        for key in path:
            if key == 'train':
                continue
            data = []
            logger.info(f"Processing the {key} samples in the {self.dataset_name}")
            for path_item in path[key]:
                logger.info(f"Processing: {path_item}")
                for episode in path[key][path_item]:
                    episode_name = str(episode.split('/')[-1])
                    episode_json = os.path.join(episode, episode_name+".json")
                    metadata = self.readJson(episode_json)
                    previous_action_history = []

                    for i, step in enumerate(metadata):
                        idx = step['step_id']
                        mapped_action = actionMapping(step)
                        if mapped_action is None:
                            # Previously "Action: " + None raised TypeError
                            # and aborted the whole conversion. Skip the
                            # sample, but keep the step in the history so
                            # later prompts still see the full trajectory.
                            logger.error(f"Skipping step {idx} of {episode}: no ground-truth action")
                            previous_action_history.append(step['coat_action_desc'])
                            continue

                        record = deepcopy(sample)
                        record["episode_id"] = step["episode_id"]
                        record["step_id"] = idx
                        record["images"] = [os.path.join(episode, episode_name+"_"+str(idx)+'.png')]
                        with open(record["images"][0], "rb") as image_file:
                            base64_image = base64.b64encode(image_file.read()).decode('utf-8')
                        record['messages'][0]['content'][1]['image_url']['url'] = f"data:image/png;base64,{base64_image}"

                        record['accessibility_trees'] = episode_json
                        record['goal'] = step['instruction']
                        image_size = self.readImage(record['images'][0])
                        record['image_size'] = [[image_size[0], image_size[1]]]
                        record['label'] = "Action: "+mapped_action
                        if 'low' in self.dataset_type:
                            prompt_text = AITZ_FORGPT5.format(
                                finalGoal=step['instruction'],
                                actionDesc=step['coat_action_desc'],
                                SD=step['coat_screen_desc'],
                                previousActions=previous_action_history[:i],
                            )
                        else:
                            prompt_text = AITZHIGHACTIONPREDICTPROMPT_FOROSATLAS.format(
                                finalGoal=step['instruction'],
                                SD=step['coat_screen_desc'],
                                previousActions=previous_action_history[:i],
                            )
                        # Only the text part is written. The high-level branch
                        # used to replace the whole content list with a string,
                        # which dropped the embedded screenshot.
                        record['messages'][0]['content'][0]['text'] = prompt_text
                        previous_action_history.append(step['coat_action_desc'])
                        data.append(record)

            os.makedirs(self.save_path, exist_ok=True)
            self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+str(key)+"_"+self.model_name.lower()+'.json'))
            logger.info(f"transform {self.dataset_type} of {self.dataset_name} dataset to json succesfully")
        logger.info("Finished")

    def GLM_4_5_V(self):
        """Convert the non-train AITZ splits into GLM-4.5V formatted JSON files.

        Every mappable step becomes one record based on a deep copy of the
        template returned by ``super().GLM_4_5_V()``: the screenshot is
        embedded as a base64 data URL in the first content part and the
        prompt built by ``get_mobile_prompt`` is written to the second one.
        The ``train`` split is skipped. One JSON file per remaining split is
        written under ``self.save_path``.
        """
        import base64
        from copy import deepcopy
        from utils.schema.GLM_4_5_V.prompt import get_mobile_prompt

        sample = super().GLM_4_5_V()
        # Action types that map to a fixed string without any argument.
        fixed_actions = {6: "PRESS_HOME", 5: "PRESS_BACK", 7: "ENTER", 10: "COMPLETE"}
        # Low-level mode passes the step description and screen description
        # to the prompt builder; otherwise both are sent as empty strings.
        low_level = 'low' in self.dataset_type

        def actionMapping(action):
            """
            Map one step to one of
            ['CLICK', 'SCROLL', 'TYPE', 'PRESS_HOME', 'PRESS_BACK', 'ENTER']
            (or 'COMPLETE').

            Returns None, after logging an error, when the action type is
            unknown or a scroll description names no direction.
            """
            action_type = action['result_action_type']
            if action_type == 4:
                coat_action_desc = action['coat_action_desc'].lower()
                if coat_action_desc.startswith("scroll"):
                    # Same precedence as before: up, down, left, right.
                    for direction in ("up", "down", "left", "right"):
                        if direction in coat_action_desc:
                            return f"SCROLL [{direction.upper()}]"
                    logger.error(f"Action mapping error: {action}")
                    return None
                click = ast.literal_eval(action['result_touch_yx'])
                # result_touch_yx is stored as (y, x); both are scaled by 1000.
                y, x = click[0]*1000, click[1]*1000
                return f"CLICK <point>[[{x}, {y}]]</point>"
            if action_type == 3:
                return f"TYPE [{action['result_action_text']}]"
            fixed = fixed_actions.get(action_type)
            if fixed is None:
                logger.error(f"Action mapping error: {action}")
            return fixed

        path = self._merge_dataset_path()
        for key in path:
            if key == 'train':
                continue
            data = []
            logger.info(f"Processing the {key} samples in the {self.dataset_name}")
            for path_item in path[key]:
                logger.info(f"Processing: {path_item}")
                for episode in path[key][path_item]:
                    episode_name = str(episode.split('/')[-1])
                    episode_json = os.path.join(episode, episode_name+".json")
                    metadata = self.readJson(episode_json)
                    previous_action_history = []
                    for i, step in enumerate(metadata):
                        idx = step['step_id']
                        mapped_action = actionMapping(step)
                        if mapped_action is None:
                            # Previously "action:\n" + None raised TypeError
                            # and aborted the whole conversion. Skip the
                            # sample, but keep the step in the history so
                            # later prompts still see the full trajectory.
                            logger.error(f"Skipping step {idx} of {episode}: no ground-truth action")
                            previous_action_history.append(step['coat_action_desc'])
                            continue

                        record = deepcopy(sample)
                        record["episode_id"] = step["episode_id"]
                        record["step_id"] = idx
                        record["images"] = [os.path.join(episode, episode_name+"_"+str(idx)+'.png')]
                        with open(record["images"][0], "rb") as image_file:
                            base64_image = base64.b64encode(image_file.read()).decode('utf-8')
                        record['messages'][0]['content'][0]['image_url']['url'] = f"data:image/png;base64,{base64_image}"
                        record['accessibility_trees'] = episode_json
                        record['goal'] = step['instruction']
                        image_size = self.readImage(record['images'][0])
                        record['image_size'] = [[image_size[0], image_size[1]]]
                        record['label'] = "action:\n"+mapped_action
                        prompt = get_mobile_prompt(
                            step['instruction'],
                            history=previous_action_history[:i],
                            actionDesc=step['coat_action_desc'] if low_level else "",
                            SD=step['coat_screen_desc'] if low_level else "",
                        )
                        record['messages'][0]['content'][1]['text'] = prompt
                        previous_action_history.append(step['coat_action_desc'])
                        data.append(record)

            os.makedirs(self.save_path, exist_ok=True)
            self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+str(key)+"_"+self.model_name.lower()+'.json'))
            logger.info(f"transform {self.dataset_type} of {self.dataset_name} dataset to json succesfully")
        logger.info("Finished")

    def UI_TARS(self):
        """Convert the non-train AITZ splits into UI-TARS chat-style JSON files.

        For every step a deep copy of the template returned by
        ``super().UI_TARS()`` is extended with the turns of up to four
        preceding steps (screenshot + "Thought/Action" answer) followed by
        the current screenshot. In low-level mode an assistant turn carrying
        the current step description as "Thought" is appended as well. The
        ``train`` split is skipped. One JSON file per remaining split is
        written under ``self.save_path``.
        """
        from copy import deepcopy

        sample = super().UI_TARS()
        # NOTE(review): the emitted direction is the opposite of the word
        # found in the description (e.g. "scroll up" -> direction='down').
        # Kept as in the original; confirm this inversion is intended.
        scroll_map = (("up", "down"), ("down", "up"), ("left", "right"), ("right", "left"))
        fixed_actions = {6: "press_home()", 5: "press_back()", 10: "finished()", 7: "enter()"}

        def actionMapping(action, image_size):
            """
            Map one step to a UI-TARS action call
            (click / scroll / type / press_home / press_back / enter / finished).

            Returns None, after logging an error, when the action type is
            unknown or a scroll description names no direction.
            """
            action_type = action['result_action_type']
            if action_type == 4:
                coat_action_desc = action['coat_action_desc'].lower()
                if coat_action_desc.startswith("scroll"):
                    for word, direction in scroll_map:
                        if word in coat_action_desc:
                            return f"scroll(direction='{direction}')"
                    logger.error(f"Action mapping error: {action}")
                    return None
                click = ast.literal_eval(action['result_touch_yx'])
                ## UI-TARS-1.5 uses absolute coordinates
                if "1.5" in self.model_name:
                    y, x = int(click[0]*image_size[1]), int(click[1]*image_size[0])
                else:
                    y, x = int(click[0]*1000), int(click[1]*1000)
                return f"click(start_box='({x},{y})')"
            if action_type == 3:
                return f"type(content='{action['result_action_text']}')"
            fixed = fixed_actions.get(action_type)
            if fixed is None:
                logger.error(f"Action mapping error: {action}")
            return fixed

        def build_history(index, new_metadata):
            """Return the user/assistant turns of up to four steps before ``index``.

            Only ``new_metadata`` is read; the original accidentally took the
            screenshot count from the enclosing loop variable instead of this
            parameter.
            """
            history = []
            first = max(index - 4, 0)
            # Never index past the available screenshots.
            last = min(index, len(new_metadata['screenshots']))
            for i in range(first, last):
                history.append({
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "image": new_metadata["screenshots"][i]
                        }
                    ]
                })
                action = new_metadata["action_traslate"][i]
                thought = new_metadata["step_instruction"][i]
                history.append({
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": f"Thought: {thought}\nAction: {action}"}
                    ]
                })
            return history

        path = self._merge_dataset_path()
        for key in path:
            if key == 'train':
                continue
            data = []
            logger.info(f"Processing the {key} samples in the {self.dataset_name}")
            for path_item in path[key]:
                logger.info(f"Processing: {path_item}")
                for episode in tqdm(path[key][path_item]):
                    episode_json = os.path.join(episode, str(episode.split('/')[-1])+".json")
                    metadata = self.readJson(episode_json)
                    new_meta_data = {}
                    new_meta_data['screenshots'] = [os.path.join(episode, item['image_path'].split('/')[-1]) for item in metadata]
                    new_meta_data['image_size'] = [self.readImage(shot) for shot in new_meta_data['screenshots']]
                    new_meta_data["action_traslate"] = [actionMapping(action, image_size) for action, image_size in zip(metadata, new_meta_data['image_size'])]
                    new_meta_data["step_instruction"] = [item['coat_action_desc'] for item in metadata]

                    for i, screenshot in enumerate(new_meta_data['screenshots']):
                        step = metadata[i]
                        record = deepcopy(sample)
                        record['messages'][1]['content'][0]['text'] = AITZ_FORUITARS.format(
                            instruction= step['instruction']
                        )
                        # Both modes start with the turns of the previous steps.
                        if i != 0:
                            record['messages'].extend(build_history(i, new_meta_data))
                        if 'low' in self.dataset_type:
                            record['messages'].extend([
                                {
                                    "role": "user",
                                    "content": [{
                                        "type": "image",
                                        "image": screenshot
                                    }]
                                },
                                {
                                    "role": "assistant",
                                    "content": [{
                                        "type": "text",
                                        "text": f"Thought: {new_meta_data['step_instruction'][i]}\n"
                                    }]
                                }
                            ])
                        else:
                            record['messages'].extend([
                                {
                                    "role": "user",
                                    "content": [
                                        {
                                            "type": "image",
                                            # NOTE(review): this prefix rewrite refers to an
                                            # "android_control" directory; kept unchanged.
                                            "image": screenshot.replace(
                                            "android_control_parsed_data_fixed/",
                                            "/data/cpz/datasets/android_control_parsed/"
                                            )
                                        }
                                    ]
                                }
                            ])
                        record['accessibility_trees'] = episode_json
                        record['label'] = f"Thought: {new_meta_data['step_instruction'][i]}\nAction: {new_meta_data['action_traslate'][i]}"
                        record["episode_id"] = step["episode_id"]
                        record["step_id"] = step['step_id']
                        record["images"] = [screenshot]
                        record['goal'] = step['instruction']
                        record['image_size'] = [new_meta_data['image_size'][i]]
                        data.append(record)

            os.makedirs(self.save_path, exist_ok=True)
            self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+str(key)+"_"+self.model_name.lower()+'.json'))
            logger.info(f"transform {self.dataset_type} of {self.dataset_name} dataset to json succesfuully")
        logger.info("Finished")

    def GUI_R1(self):
        """Convert every AITZ split into GUI-R1 formatted JSON files.

        Every step becomes one record based on a deep copy of the template
        returned by ``super().GUI_R1()``. The label wraps the stringified
        action list in ``<think></think><answer>...</answer>`` and the prompt
        is built from ``AITZ_FORGUIR1`` with the step description as goal and
        the comma-joined previous descriptions as history. One JSON file per
        split is written under ``self.save_path``.
        """
        from copy import deepcopy

        sample = super().GUI_R1()
        # Action types without a target point or text argument.
        named_actions = {6: 'press_home', 5: 'press_back', 7: 'enter', 10: 'complete'}

        def actionMapping(action):
            """Return ``str([{'action', 'point', 'input_text'}])`` for one step.

            ``point`` is ``[-100, -100]`` for everything except clicks and
            ``input_text`` is ``'no input text'`` unless the action types text
            or scrolls in a recognised direction.
            """
            t = action['result_action_type']
            # Defaults guarantee both fields are always bound; the original
            # raised UnboundLocalError for undirected scrolls and unknown types.
            input_text = 'no input text'
            point = [-100, -100]
            if t == 4:
                # Lower-cased for matching, consistent with the other converters.
                coat_action_desc = action['coat_action_desc'].lower()
                if coat_action_desc.startswith("scroll"):
                    action_name = 'scroll'
                    direction = next(
                        (d for d in ("up", "down", "left", "right") if d in coat_action_desc),
                        None,
                    )
                    if direction is None:
                        logger.error(f"Action mapping error: {action}")
                    else:
                        input_text = direction
                    # point stays at the [-100, -100] placeholder; the
                    # original [-100, 100] disagreed with every other
                    # non-click action.
                else:
                    action_name = 'click'
                    click = ast.literal_eval(action['result_touch_yx'])
                    image_size = action['image_size']
                    # result_touch_yx is (y, x); scaled with image_size[1] / image_size[0].
                    y, x = int(click[0]*image_size[1]), int(click[1]*image_size[0])
                    point = [x, y]
            elif t == 3:
                action_name = 'type'
                input_text = action['result_action_text']
            elif t in named_actions:
                action_name = named_actions[t]
            else:
                logger.error(f"Action mapping error: {action}")
                action_name = t
            formatted_action = [{
                'action': action_name,
                'point': point,
                'input_text': input_text
            }]
            return str(formatted_action)

        path = self._merge_dataset_path()
        for key in path:
            data = []
            logger.info(f"Processing the {key} samples in the {self.dataset_name}")
            for path_item in path[key]:
                logger.info(f"Processing: {path_item}")
                for episode in path[key][path_item]:
                    episode_name = str(episode.split('/')[-1])
                    episode_json = os.path.join(episode, episode_name+".json")
                    metadata = self.readJson(episode_json)
                    # Attach the screenshot size to every step (None when the
                    # step carries no image_path).
                    metadata = [
                        step | {"image_size": self.readImage(os.path.join(episode, ''.join(step["image_path"].split("/")[2:]))) if "image_path" in step else None}
                        for step in metadata
                    ]
                    action_traslate = [actionMapping(step) for step in metadata]
                    previous_action_history = []

                    for i, step in enumerate(metadata):
                        idx = step['step_id']
                        record = deepcopy(sample)
                        record["episode_id"] = step["episode_id"]
                        record["step_id"] = idx
                        record["images"] = [os.path.join(episode, episode_name+"_"+str(idx)+'.png')]
                        record['accessibility_trees'] = episode_json
                        record['goal'] = step['instruction']
                        record['low_level_goal'] = step['coat_action_desc']
                        record['image_size'] = [[step['image_size'][0], step['image_size'][1]]]
                        record['label'] = "<think></think><answer>"+action_traslate[i]+"</answer>"
                        record['messages'][0]['content'][0]['image'] = record['images'][0]
                        record['messages'][0]['content'][1]['text'] = '<image>\n' + AITZ_FORGUIR1.replace("{goal}", step['coat_action_desc']).replace("{history}", ','.join(previous_action_history[:i]))
                        previous_action_history.append(step['coat_action_desc'])
                        data.append(record)

            os.makedirs(self.save_path, exist_ok=True)
            self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+str(key)+"_"+self.model_name.lower()+'.json'))
            logger.info(f"transform {self.dataset_type} of {self.dataset_name} dataset to json succesfully")
        logger.info("Finished")
                        
    def Agent_CPM(self):
        """Convert every AITZ split into AgentCPM formatted JSON files.

        Every step becomes one record based on a deep copy of the template
        returned by ``super().Agent_CPM()``. The ``text_prompt`` placeholder
        of the first message is replaced with the step description
        (low-level mode) or the episode instruction (otherwise), and the
        system prompt with the embedded action schema is attached. One JSON
        file per split is written under ``self.save_path``.
        """
        from copy import deepcopy
        from prompt.aitzPrompt import AITZ_AGENT_CPM_SYSTEM_PROMPT

        sample = super().Agent_CPM()

        def actionMapping(action):
            """
            Map one step to the stringified AgentCPM action dict
            (POINT / POINT+to / PRESS / TYPE / STATUS).

            Returns "error" for an unknown action type or a scroll whose
            description names no direction.
            """
            t = action['result_action_type']
            if t == 4:
                # Lower-cased for matching, consistent with the other converters.
                coat_action_desc = action['coat_action_desc'].lower()
                if coat_action_desc.startswith("scroll"):
                    for direction in ("up", "down", "left", "right"):
                        if direction in coat_action_desc:
                            return str({"thought":"", "POINT": [-100, -100], "to": direction})
                    # Previously `direction` was unbound here (UnboundLocalError).
                    return "error"
                click = ast.literal_eval(action['result_touch_yx'])
                # result_touch_yx is (y, x); both are scaled by 1000.
                y, x = int(click[0]*1000), int(click[1]*1000)
                return str({"thought":"", "POINT": [x, y]})
            elif t == 6:
                return str({"thought":"", "PRESS": "HOME"})
            elif t == 5:
                return str({"thought":"", "PRESS": "BACK"})
            elif t == 3:
                return str({"thought":"", "TYPE": action['result_action_text']})
            elif t == 10:
                return str({"thought":"", 'STATUS': 'finish'})
            elif t == 7:
                return str({"thought":"", 'PRESS': 'ENTER'})
            else:
                return "error"

        # Close the schema file deterministically instead of leaking the handle.
        with open('/data1/home/chengpengzhou/GUI_VISION/GUI-Speaker/utils/schema/agentCPMSchema.json', encoding="utf-8") as schema_file:
            ACTION_SCHEMA = json.load(schema_file)
        # Insert a "required" entry as the fourth key of the schema.
        items = list(ACTION_SCHEMA.items())
        insert_index = 3
        items.insert(insert_index, ("required", ["thought"]))
        ACTION_SCHEMA = dict(items)
        AITZ_AGENT_CPM_SYSTEM_PROMPT = AITZ_AGENT_CPM_SYSTEM_PROMPT.replace("ACTION_SCHEMA", str(ACTION_SCHEMA))

        path = self._merge_dataset_path()
        for key in path:
            data = []
            logger.info(f"Processing the {key} samples in the {self.dataset_name}")
            for path_item in path[key]:
                logger.info(f"Processing: {path_item}")
                for episode in path[key][path_item]:
                    episode_name = str(episode.split('/')[-1])
                    episode_json = os.path.join(episode, episode_name+".json")
                    metadata = self.readJson(episode_json)
                    # The action mapping needs no image size here, so the
                    # extra per-step image read of the original is dropped;
                    # the size is read once below for the record itself.
                    action_traslate = [actionMapping(step) for step in metadata]

                    for i, step in enumerate(metadata):
                        idx = step['step_id']
                        record = deepcopy(sample)
                        record["episode_id"] = step["episode_id"]
                        record["step_id"] = idx
                        record["images"] = [os.path.join(episode, episode_name+"_"+str(idx)+'.png')]
                        record['accessibility_trees'] = episode_json
                        record['goal'] = step['instruction']
                        image_size = self.readImage(record['images'][0])
                        record['image_size'] = [[image_size[0], image_size[1]]]
                        record['label'] = action_traslate[i]
                        if 'low' in self.dataset_type:
                            prompt_text = step['coat_action_desc']
                        else:
                            # The episode goal is stored under 'instruction'
                            # (see record['goal'] above); the original read a
                            # 'goal' key that no other access in this file uses.
                            prompt_text = step['instruction']
                        record['messages'][0]['content'][0] = record['messages'][0]['content'][0].replace("text_prompt", prompt_text)
                        record['system_prompt'] = AITZ_AGENT_CPM_SYSTEM_PROMPT
                        data.append(record)

            os.makedirs(self.save_path, exist_ok=True)
            self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+str(key)+"_"+self.model_name.lower()+'.json'))
            logger.info(f"transform {self.dataset_type} of {self.dataset_name} dataset to json succesfuully")
        logger.info("Finished")


    def Aguvis(self):
        """Convert AITZ episodes into Aguvis-format records and save them.

        For every split returned by ``_merge_dataset_path`` except ``train``,
        the per-episode step metadata is read, each step is mapped to an
        Aguvis action string, and one record per step is collected.  The
        records of a split are written to
        ``<dataset_type>_<split>_<model_name>.json`` under ``self.save_path``.
        """
        from copy import deepcopy

        sample = super().Aguvis()
        from utils.schema.aguvisConstants import user_instruction

        # Checked in this order; the first direction word found in the step
        # description selects the scroll command.
        scroll_commands = (
            ("up", "assistantos\npyautogui.scroll(page=-0.1)"),
            ("down", "assistantos\npyautogui.scroll(page=0.1)"),
            ("left", "assistantos\npyautogui.hscroll(page=0.1)"),
            ("right", "assistantos\npyautogui.hscroll(page=-0.1)"),
        )

        def actionMapping(action):
            """
            Map one AITZ step dict to its Aguvis action string.

            Returns "error" when the step cannot be mapped: an unknown
            ``result_action_type``, or a scroll description that names no
            direction (this case used to fall through and yield None).
            """
            t = action['result_action_type']
            coat_action_desc = action['coat_action_desc']
            if t == 4:
                if coat_action_desc.startswith("scroll"):
                    for direction, command in scroll_commands:
                        if direction in coat_action_desc:
                            return command
                    return "error"
                # Only click steps need the touch point, so parse it lazily.
                # The field is a string literal ordered (y, x).
                click = ast.literal_eval(action['result_touch_yx'])
                y, x = click[0], click[1]
                return f"assistantos\npyautogui.click(x={x}, y={y})"
            if t == 6:
                return "assistantos\nmobile.home()"
            if t == 5:
                return "assistantos\nmobile.back()"
            if t == 3:
                return f"assistantos\npyautogui.write(message='{action['result_action_text']}')"
            if t == 10:
                return "assistantos\nmobile.terminate(status='success')"
            if t == 7:
                return "assistantos\npyautogui.press(key=['enter'])"
            return "error"

        path = self._merge_dataset_path()
        for key in path:
            if key == 'train':
                continue
            data = []
            logger.info(f"Processing the {key} samples in the {self.dataset_name}")
            for path_item in path[key]:
                logger.info(f"Processing: {path_item}")
                for episode in path[key][path_item]:
                    episode_name = str(episode.split('/')[-1])
                    metadata_file = os.path.join(episode, episode_name + ".json")
                    # The action mapping here never uses the image size, so the
                    # metadata is used as read; each screenshot is opened only
                    # once, below, to fill record['image_size'].
                    metadata = self.readJson(metadata_file)
                    action_traslate = [actionMapping(action) for action in metadata]
                    previous_action_history = []

                    for i, step in enumerate(metadata):
                        idx = step['step_id']
                        record = deepcopy(sample)
                        record["episode_id"] = step["episode_id"]
                        record["step_id"] = step["step_id"]
                        record["images"] = [os.path.join(episode, episode_name + "_" + str(idx) + '.png')]
                        record['accessibility_trees'] = metadata_file
                        record['goal'] = step['instruction']
                        image_size = self.readImage(record['images'][0])
                        record['image_size'] = [[image_size[0], image_size[1]]]
                        record['label'] = action_traslate[i]
                        if 'low' in self.dataset_type:
                            # Low-level mode: the step description is given to
                            # the model as an explicit instruction.
                            record['messages']['content'][1]['text'] = user_instruction.format(
                                overall_goal=record['goal'],
                                previous_actions=previous_action_history,
                                low_level_instruction=step['coat_action_desc']
                            )
                            record['is_low_level_instruction'] = True
                            record['low_level_instruction'] = step['coat_action_desc']
                        else:
                            record['messages']['content'][1]['text'] = user_instruction.format(
                                overall_goal=record['goal'],
                                previous_actions=previous_action_history,
                            )
                            record['is_low_level_instruction'] = False
                        record['mode'] = 'force-plan'
                        # History only ever contains the steps before the current one.
                        previous_action_history.append(step['coat_action_desc'])
                        data.append(record)

            os.makedirs(self.save_path, exist_ok=True)
            self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+str(key)+"_"+self.model_name.lower()+'.json'))
            logger.info(f"transform {self.dataset_type} of {self.dataset_name} dataset to json succesfuully")
        logger.info("Finished")

    def OS_Genesis(self):
        """Convert AITZ episodes into OS-Genesis-format records and save them.

        Every split returned by ``_merge_dataset_path`` is processed.  For each
        step a record is built holding the screenshot path, image size, the
        mapped action label and a prompt rendered from
        ``AITZ_OS_GENESIS_PROMPT``.  The records of a split are written to
        ``<dataset_type>_<split>_<model_name>.json`` under ``self.save_path``.
        """
        from copy import deepcopy

        sample = super().OS_Genesis()

        def get_a11_tree(metadata):
            """Build a ``{label: (center_x, center_y)}`` map for one step.

            The label is the element text for 'TEXT' elements and the element
            type otherwise.  Elements that share a label overwrite each other,
            so only the last one survives.
            """
            clickable_nodes = {}
            ui_positions = ast.literal_eval(metadata['ui_positions'])
            ui_text = ast.literal_eval(metadata['ui_text'])
            ui_type = ast.literal_eval(metadata['ui_types'])
            for i, item in enumerate(ui_positions):
                # Each position entry is unpacked as (y, x, height, width).
                y, x, h, w = item[0], item[1], item[2], item[3]
                center = (x+w/2, y+h/2)
                label = ui_text[i] if ui_type[i] == 'TEXT' else ui_type[i]
                clickable_nodes[label] = center
            return clickable_nodes

        def actionMapping(action):
            """Map one AITZ step dict to an OS-Genesis "thought + action" string.

            Returns "error" when the step cannot be mapped: an unknown
            ``result_action_type``, or a scroll description that names no
            direction (this case used to fall through and yield None).
            """
            t = action['result_action_type']
            thought = action['coat_action_desc']
            prefix = f'Low-level thought: {thought} action: '
            if t == 4:
                if thought.startswith("scroll"):
                    # First direction word found wins, in this fixed order.
                    for direction in ("up", "down", "left", "right"):
                        if direction in thought:
                            return prefix + f'{{"action_type": "scroll", "direction": "{direction}"}}'
                    return "error"
                # Only click steps need the touch point and the image size.
                click = ast.literal_eval(action['result_touch_yx'])
                image_size = action['image_size']
                # click is (y, x); y is scaled by image_size[1] and x by
                # image_size[0] -- presumably (width, height), confirm against
                # readImage.
                y, x = int(click[0]*image_size[1]), int(click[1]*image_size[0])
                return prefix + f'{{"action_type": "click", "x": {x}, "y": {y}}}'
            if t == 6:
                return prefix + '{"action_type": "navigate_home"}'
            if t == 5:
                return prefix + '{"action_type": "navigate_back"}'
            if t == 7:
                return prefix + '{"action_type": "enter"}'
            if t == 3:
                action_dict = {
                    "action_type": "type",
                    "text": action["result_action_text"],
                    "x": -100,
                    "y": -100
                }
                return prefix + json.dumps(action_dict)
            if t == 10:
                return prefix + '{"action_type": "stop"}'
            return "error"

        path = self._merge_dataset_path()
        for key in path:
            data = []
            logger.info(f"Processing the {key} samples in the {self.dataset_name}")
            for path_item in path[key]:
                logger.info(f"Processing: {path_item}")
                for episode in path[key][path_item]:
                    episode_name = str(episode.split('/')[-1])
                    metadata_file = os.path.join(episode, episode_name + ".json")
                    metadata = self.readJson(metadata_file)
                    # Attach the screenshot size to every step (None when the
                    # step carries no "image_path").
                    metadata = [
                        step | {"image_size": self.readImage(os.path.join(episode, ''.join(step["image_path"].split("/")[2:]))) if "image_path" in step else None}
                        for step in metadata
                    ]
                    action_traslate = [actionMapping(action) for action in metadata]
                    previous_action_history = []

                    for i, step in enumerate(metadata):
                        idx = step['step_id']
                        record = deepcopy(sample)
                        record["episode_id"] = step["episode_id"]
                        record["step_id"] = step["step_id"]
                        record["images"] = [os.path.join(episode, episode_name + "_" + str(idx) + '.png')]
                        record['accessibility_trees'] = metadata_file
                        record['goal'] = step['instruction']
                        record['image_size'] = [step['image_size']]
                        record['label'] = action_traslate[i]
                        record['question'] = AITZ_OS_GENESIS_PROMPT.format(instruction=record['goal'], history='\n'.join(previous_action_history), a11y_tree=str(get_a11_tree(step)),low_level_thought=step['coat_action_desc'])
                        record['low_level_instruction'] = step['coat_action_desc']
                        # History only ever contains the steps before the current one.
                        previous_action_history.append(f"Step {i}:"+str(step['coat_action_desc']))
                        data.append(record)

            os.makedirs(self.save_path, exist_ok=True)
            self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+str(key)+"_"+self.model_name.lower()+'.json'))
            logger.info(f"transform {self.dataset_type} of {self.dataset_name} dataset to json succesfuully")
        logger.info("Finished")

    def GUI_OWL(self):
        """Convert AITZ episodes into GUI-OWL-format records and save them.

        For every split returned by ``_merge_dataset_path`` except ``train``,
        one record per step is built with the mapped tool-call label and the
        system/user messages produced by the helpers returned from
        ``super().GUI_OWL()``.  The records of a split are written to
        ``<dataset_type>_<split>_<model_name>.json`` under ``self.save_path``.
        """
        from copy import deepcopy

        build_system_messages, getResizedImage, build_user_messages, sample = super().GUI_OWL()

        def tool_call(arguments):
            """Wrap an already-serialised JSON arguments object in the response envelope."""
            return f'<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {arguments}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>'

        # (direction word, start point, end point), checked in this order.
        # NOTE(review): these vectors differ from the ones emitted by
        # _gt_action_to_gui_owl for the same direction names -- confirm which
        # convention the evaluator expects. Values are kept as they were.
        swipe_vectors = (
            ("up", (0, 0), (1, 0)),
            ("down", (0, 0), (0, 1)),
            ("left", (0, 1), (0, 0)),
            ("right", (1, 0), (0, 0)),
        )

        def actionMapping(action):
            """
            Map one AITZ step dict to a GUI-OWL tool-call string.

            Returns None (after logging an error) when the step cannot be
            mapped: an unknown action type or a scroll without a direction.
            """
            action_type = action['result_action_type']
            action_text = action['result_action_text']
            coat_action_desc = action['coat_action_desc'].lower()
            if action_type == 4:
                if coat_action_desc.startswith("scroll"):
                    for direction, start, end in swipe_vectors:
                        if direction in coat_action_desc:
                            return tool_call(f'{{"action": "swipe", "coordinate": [{start[0]}, {start[1]}], "coordinate2": [{end[0]}, {end[1]}]}}')
                    logger.error(f"Action mapping error: {action}")
                    return None
                # The touch point is a string literal ordered (y, x); it is
                # scaled by 1000 before being emitted.
                click = ast.literal_eval(action['result_touch_yx'])
                y, x = click[0]*1000, click[1]*1000
                return tool_call(f'{{"action": "click", "coordinate": [{x}, {y}]}}')
            if action_type == 6:
                return tool_call('{"action": "system_button", "button": "Home"}')
            if action_type == 5:
                return tool_call('{"action": "system_button", "button": "Back"}')
            if action_type == 3:
                # Serialise the text as a JSON string so quotes, backslashes
                # and newlines cannot break the tool-call payload.
                text_json = json.dumps(str(action_text), ensure_ascii=False)
                return tool_call(f'{{"action": "type", "text": {text_json}}}')
            if action_type == 7:
                return tool_call('{"action": "system_button", "button": "Enter"}')
            if action_type == 10:
                return tool_call('{"action": "terminate", "status": "success"}')
            logger.error(f"Action mapping error: {action}")
            return None

        path = self._merge_dataset_path()
        for key in path:
            if key == 'train':
                continue
            data = []
            logger.info(f"Processing the {key} samples in the {self.dataset_name}")
            for path_item in path[key]:
                logger.info(f"Processing: {path_item}")
                for episode in path[key][path_item]:
                    episode_name = str(episode.split('/')[-1])
                    metadata_file = os.path.join(episode, episode_name + ".json")
                    metadata = self.readJson(metadata_file)
                    previous_action_history = []
                    for i, step in enumerate(metadata):
                        idx = step['step_id']
                        record = deepcopy(sample)
                        record["episode_id"] = step["episode_id"]
                        record["step_id"] = step["step_id"]
                        record["images"] = [os.path.join(episode, episode_name + "_" + str(idx) + '.png')]
                        record['accessibility_trees'] = metadata_file
                        record['goal'] = step['instruction']
                        image_size = self.readImage(record['images'][0])
                        record['image_size'] = [[image_size[0], image_size[1]]]
                        # Best effort: a step whose action cannot be extracted
                        # is still emitted, without overwriting 'label'.
                        try:
                            record['label'] = actionMapping(step)
                        except Exception as e:
                            logger.info(f'extract action failure: {e}')

                        dummy_image = getResizedImage(record['images'][0])
                        system_messages = build_system_messages(dummy_image.height, dummy_image.width)

                        # Low-level mode prompts with the step description,
                        # otherwise with the episode goal.
                        if 'low' in self.dataset_type:
                            user_messages = build_user_messages(step['coat_action_desc'], enable_think=True, history=previous_action_history[:i])
                        else:
                            user_messages = build_user_messages(record['goal'], enable_think=True, history=previous_action_history[:i])
                        user_messages['content'].append({"image": record['images'][0]})
                        messages = [system_messages, user_messages]
                        record['messages'] = message_translate(messages, to_format='qwen')
                        previous_action_history.append(step['coat_action_desc'])
                        data.append(record)

            os.makedirs(self.save_path, exist_ok=True)
            self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+str(key)+"_"+self.model_name.lower()+'.json'))
            logger.info(f"transform {self.dataset_type} of {self.dataset_name} dataset to json succesfully")
        logger.info("Finished")


    def _merge_dataset_path(self):
        dataset_types = ["general", "google_apps", "install", "web_shopping"]
        dataset_dicts = {"train": {}, "test": {}}

        if 'all' in self.dataset_type:
            for dtype in dataset_types:
                for split in ["train", "test"]:
                    base_dir = os.path.join(self.dataset_path, split, dtype)
                    if os.path.exists(base_dir):
                        subfolders = [
                            os.path.join(base_dir, f)
                            for f in os.listdir(base_dir)
                            if os.path.isdir(os.path.join(base_dir, f))
                        ]
                        dataset_dicts[split][dtype] = subfolders
                    else:
                        dataset_dicts[split][dtype] = []
        else:
            for split in ["train", "test"]:
                base_dir = os.path.join(self.dataset_path, split, self.dataset_type)
                if os.path.exists(base_dir):
                    subfolders = [
                        os.path.join(base_dir, f)
                        for f in os.listdir(base_dir)
                        if os.path.isdir(os.path.join(base_dir, f))
                    ]
                    dataset_dicts[split][self.dataset_type] = subfolders
                else:
                    dataset_dicts[split][self.dataset_type] = []

        return dataset_dicts

    def process_from_json(self, json_file_path, model_name):
        """
        Read records from a JSON file and convert them to the given model's format.

        Supported ``model_name`` values are "OS_ATLAS", "UI_TARS",
        "UI_TARS_1.5" and "GUI_OWL"; any other value is only logged as an
        error and nothing is converted.
        """
        logger.info(f"正在从JSON文件读取数据: {json_file_path}")
        data = self.readJson(json_file_path)
        logger.info(f"读取到 {len(data)} 条记录")

        if model_name == "OS_ATLAS":
            self._process_os_atlas_from_json(data)
            return
        if model_name in ("UI_TARS", "UI_TARS_1.5"):
            # Both UI-TARS variants share one converter; it receives the name
            # so it can tell them apart.
            self._process_ui_tars_from_json(data, model_name)
            return
        if model_name == "GUI_OWL":
            self._process_gui_owl_from_json(data)
            return
        logger.error(f"不支持的模型名称: {model_name}")

    def _gt_action_to_os_atlas(self, gt_action, image_size=None):
        """将gt_action转换为OS_ATLAS格式"""
        action_type = gt_action.get('action', '')
        if action_type == 'click':
            coord = gt_action.get('coordinate', [0, 0])
            if image_size:
                x = int(coord[0] / image_size[0] * 1000)
                y = int(coord[1] / image_size[1] * 1000)
            else:
                x, y = coord[0], coord[1]
            return f"CLICK <point>[[{x}, {y}]]</point>"
        elif action_type == 'system_button':
            button = gt_action.get('button', '')
            if button == 'Back':
                return "PRESS_BACK"
            elif button == 'Home':
                return "PRESS_HOME"
            elif button == 'Enter':
                return "ENTER"
        elif action_type == 'type':
            text = gt_action.get('text', '')
            return f"TYPE [{text}]"
        elif action_type == 'swipe':
            direction = gt_action.get('direction', 'UP')
            return f"SCROLL [{direction.upper()}]"
        elif action_type == 'terminate':
            return "COMPLETE"
        elif action_type == 'wait':
            return "WAIT"
        elif action_type == 'long_press':
            coord = gt_action.get('coordinate', [0, 0])
            if image_size:
                x = int(coord[0] / image_size[0] * 1000)
                y = int(coord[1] / image_size[1] * 1000)
            else:
                x, y = coord[0], coord[1]
            return f"LONG_CLICK <point>[[{x}, {y}]]</point>"
        else:
            logger.warning(f"未知的action类型: {action_type}")
            return "WAIT"

    def _gt_action_to_ui_tars(self, gt_action, image_size=None, model_name="UI_TARS"):
        """将gt_action转换为UI_TARS格式"""
        action_type = gt_action.get('action', '')
        if action_type == 'click':
            coord = gt_action.get('coordinate', [0, 0])
            if "1.5" in model_name:
                x, y = coord[0], coord[1]
            else:
                if image_size:
                    x = int(coord[0] / image_size[0] * 1000)
                    y = int(coord[1] / image_size[1] * 1000)
                else:
                    x, y = coord[0], coord[1]
            return f"click(start_box='({x},{y})')"
        elif action_type == 'system_button':
            button = gt_action.get('button', '')
            if button == 'Back':
                return "press_back()"
            elif button == 'Home':
                return "press_home()"
            elif button == 'Enter':
                return "enter()"
        elif action_type == 'type':
            text = gt_action.get('text', '')
            return f"type(content='{text}')"
        elif action_type == 'swipe':
            direction = gt_action.get('direction', 'down')
            return f"scroll(direction='{direction}')"
        elif action_type == 'terminate':
            return "finished()"
        elif action_type == 'wait':
            return "wait()"
        elif action_type == 'long_press':
            coord = gt_action.get('coordinate', [0, 0])
            if "1.5" in model_name:
                x, y = coord[0], coord[1]
            else:
                if image_size:
                    x = int(coord[0] / image_size[0] * 1000)
                    y = int(coord[1] / image_size[1] * 1000)
                else:
                    x, y = coord[0], coord[1]
            return f"long_press(start_box='({x},{y})', time='')"
        else:
            logger.warning(f"未知的action类型: {action_type}")
            return "wait()"

    def _gt_action_to_gui_owl(self, gt_action):
        """将gt_action转换为GUI_OWL格式"""
        action_type = gt_action.get('action', '')
        if action_type == 'click':
            coord = gt_action.get('coordinate', [0, 0])
            x, y = coord[0], coord[1]
            return f"""<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {{"action": "click", "coordinate": [{x}, {y}]}}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>"""
        elif action_type == 'system_button':
            button = gt_action.get('button', '')
            if button == 'Back':
                return f"""<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {{"action": "system_button", "button": "Back"}}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>"""
            elif button == 'Home':
                return f"""<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {{"action": "system_button", "button": "Home"}}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>"""
            elif button == 'Enter':
                return f"""<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {{"action": "system_button", "button": "Enter"}}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>"""
        elif action_type == 'type':
            text = gt_action.get('text', '')
            return f"""<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {{"action": "type", "text": "{text}"}}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>"""
        elif action_type == 'swipe':
            direction = gt_action.get('direction', 'UP')
            if direction.upper() == 'UP':
                return f"""<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {{"action": "swipe", "coordinate": [0, 1], "coordinate2": [0, 0]}}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>"""
            elif direction.upper() == 'DOWN':
                return f"""<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {{"action": "swipe", "coordinate": [0, 0], "coordinate2": [0, 1]}}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>"""
            elif direction.upper() == 'LEFT':
                return f"""<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {{"action": "swipe", "coordinate": [1, 0], "coordinate2": [0, 0]}}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>"""
            else:  # RIGHT
                return f"""<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {{"action": "swipe", "coordinate": [0, 0], "coordinate2": [1, 0]}}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>"""
        elif action_type == 'terminate':
            status = gt_action.get('status', 'success')
            return f"""<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {{"action": "terminate", "status": "{status}"}}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>"""
        elif action_type == 'wait':
            time = gt_action.get('time', 2)
            return f"""<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {{"action": "wait", "time": {time}}}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>"""
        elif action_type == 'long_press':
            coord = gt_action.get('coordinate', [0, 0])
            time = gt_action.get('time', 2)
            x, y = coord[0], coord[1]
            return f"""<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {{"action": "long_press", "coordinate": [{x}, {y}], "time": {time}}}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>"""
        else:
            logger.warning(f"未知的action类型: {action_type}")
            return f"""<thinking>\n""\n</thinking>\n<tool_call>\n{{"name": "mobile_use", "arguments": {{"action": "wait", "time": 2}}}}\n</tool_call>\n<conclusion>\n""\n</conclusion>"""

    def _process_os_atlas_from_json(self, data):
        """Convert JSON step records to OS_ATLAS format and save them.

        Args:
            data: iterable of dicts; the keys read are ``episode_id``,
                ``step_id``, ``images``, ``data_type`` (optional),
                ``gt_action``, ``instruction`` and ``action_history``
                (a ``;``-separated string).

        The converted records are written to
        ``test_<model_name>.json`` under ``self.save_path``.
        """
        from copy import deepcopy

        sample = super().OS_ATLAS()
        processed_data = []

        for item in tqdm(data, desc="处理OS_ATLAS数据"):
            record = deepcopy(sample)

            record["episode_id"] = item.get("episode_id", "")
            record["step_id"] = item.get("step_id", 0)
            record["images"] = item.get("images", [])
            # Carry the data_type field over when present.
            if "data_type" in item:
                record["data_type"] = item["data_type"]

            # Image size is best effort: without it coordinates are emitted
            # unscaled. Failures are logged instead of silently swallowed, and
            # KeyboardInterrupt/SystemExit are no longer caught.
            image_size = None
            if record["images"]:
                try:
                    image_size = self.readImage(record["images"][0])
                except Exception as exc:
                    logger.warning(f"Failed to read image size for {record['images'][0]}: {exc}")

            # Convert the action.
            gt_action = item.get("gt_action", {})
            action_str = self._gt_action_to_os_atlas(gt_action, image_size)
            record['messages'][1]['content'] = action_str
            record['label'] = "action:\n" + action_str

            # Build the prompt.
            instruction = item.get("instruction", "")
            action_history = item.get("action_history", "")

            # Split the history string into its non-empty steps.
            previous_actions = []
            if action_history:
                previous_actions = [
                    step.strip() for step in action_history.split(';') if step.strip()
                ]

            if 'low' in self.dataset_type:
                prompt = AITZ_FOROSATLAS.format(
                    finalGoal=instruction,
                    actionDesc=instruction,  # the instruction doubles as the action description
                    SD="",  # this data format carries no screen description
                    previousActions=previous_actions
                )
            else:
                prompt = AITZHIGHACTIONPREDICTPROMPT_FOROSATLAS.format(
                    finalGoal=instruction,
                    SD="",  # this data format carries no screen description
                    previousActions=previous_actions
                )

            record['messages'][0]['content'] = prompt

            processed_data.append(record)

        # Save the converted records.
        os.makedirs(self.save_path, exist_ok=True)
        output_file = os.path.join(self.save_path, f"test_{self.model_name.lower()}.json")
        self.saveJson(processed_data, output_file)
        logger.info(f"已保存 {len(processed_data)} 条OS_ATLAS数据到 {output_file}")

    def _process_ui_tars_from_json(self, data, model_name):
        """Convert JSON step records to UI_TARS format and save them.

        Args:
            data: iterable of dicts; the keys read are ``episode_id``,
                ``step_id``, ``images``, ``data_type`` (optional),
                ``gt_action`` and ``instruction``.
            model_name: forwarded to ``_gt_action_to_ui_tars`` to select the
                coordinate convention.

        The converted records are written to
        ``test_<model_name>.json`` under ``self.save_path`` (the file name uses
        ``self.model_name``, not the ``model_name`` argument).
        """
        from copy import deepcopy

        sample = super().UI_TARS()
        processed_data = []

        for item in tqdm(data, desc="处理UI_TARS数据"):
            record = deepcopy(sample)

            record["episode_id"] = item.get("episode_id", "")
            record["step_id"] = item.get("step_id", 0)
            record["images"] = item.get("images", [])
            # Carry the data_type field over when present.
            if "data_type" in item:
                record["data_type"] = item["data_type"]

            # Image size is best effort: without it coordinates are emitted
            # unscaled. Failures are logged instead of silently swallowed, and
            # KeyboardInterrupt/SystemExit are no longer caught.
            image_size = None
            if record["images"]:
                try:
                    image_size = self.readImage(record["images"][0])
                except Exception as exc:
                    logger.warning(f"Failed to read image size for {record['images'][0]}: {exc}")

            # Convert the action.
            gt_action = item.get("gt_action", {})
            action_str = self._gt_action_to_ui_tars(gt_action, image_size, model_name)

            instruction = item.get("instruction", "")

            # The placeholders are substituted with str.replace rather than
            # str.format.
            prompt = AITZ_FORUITARS.replace("{sd}", "").replace("{instruction}", instruction)
            record['messages'][1]['content'][0]['text'] = prompt

            # The assistant turn and the label carry the same text.
            response = f"Thought: {instruction}\nAction: {action_str}"
            first_image = record["images"][0] if record["images"] else ""

            # Append the screenshot turn and the expected assistant turn.
            record['messages'].extend([
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "image": first_image
                        }
                    ]
                },
                {
                    "role": "assistant",
                    "content": [
                        {
                            "type": "text",
                            "text": response
                        }
                    ]
                }
            ])

            record['label'] = response

            processed_data.append(record)

        # Save the converted records.
        os.makedirs(self.save_path, exist_ok=True)
        output_file = os.path.join(self.save_path, f"test_{self.model_name.lower()}.json")
        self.saveJson(processed_data, output_file)
        logger.info(f"已保存 {len(processed_data)} 条UI_TARS数据到 {output_file}")

    def _process_gui_owl_from_json(self, data):
        """Convert JSON step records into GUI_OWL-format samples and save them.

        Every item is copied onto a fresh deep copy of the sample template
        returned by ``super().GUI_OWL()``. The ground-truth action is turned
        into the label via ``self._gt_action_to_gui_owl`` and, when the item
        has at least one image, system/user messages are built from the first
        image and translated with ``message_translate(..., to_format='qwen')``.

        Items whose message construction raises are logged and skipped. Items
        without images are kept, but no messages are built for them here.

        Args:
            data: Iterable of dicts. Keys read here (all optional):
                ``episode_id``, ``step_id``, ``images``, ``data_type``,
                ``gt_action``, ``instruction`` and ``action_history``
                (a ``;``-separated string of previous steps).

        Returns:
            None. The processed records are written with ``self.saveJson`` to
            ``<save_path>/test_<model_name lower-cased>.json``.
        """
        # Imported once per call instead of once per loop iteration.
        from copy import deepcopy

        build_system_messages, getResizedImage, build_user_messages, sample = super().GUI_OWL()
        processed_data = []

        for item in tqdm(data, desc="处理GUI_OWL数据"):
            record = deepcopy(sample)

            record["episode_id"] = item.get("episode_id", "")
            record["step_id"] = item.get("step_id", 0)
            record["images"] = item.get("images", [])
            # Preserve the data_type field when the input carries one.
            if "data_type" in item:
                record["data_type"] = item["data_type"]

            # Convert the ground-truth action into the GUI_OWL label string.
            gt_action = item.get("gt_action", {})
            record['label'] = self._gt_action_to_gui_owl(gt_action)

            instruction = item.get("instruction", "")
            action_history = item.get("action_history", "")

            # Parse action_history: split on ';', trim whitespace and drop
            # empty fragments.
            history = []
            if action_history:
                history = [
                    step.strip()
                    for step in action_history.split(';')
                    if step.strip()
                ]

            # Build the messages from the first image, if there is one.
            if record["images"]:
                first_image = record["images"][0]
                try:
                    dummy_image = getResizedImage(first_image)
                    system_messages = build_system_messages(dummy_image.height, dummy_image.width)

                    # NOTE(review): the previous code branched on
                    # `'low' in self.dataset_type` but both branches made this
                    # exact call, so the branch was dead and has been merged.
                    # Re-introduce it if low/high-level data should differ.
                    user_messages = build_user_messages(instruction, enable_think=True, history=history)

                    user_messages['content'].append({"image": first_image})
                    messages = [system_messages, user_messages]
                    record['messages'] = message_translate(messages, to_format='qwen')
                except Exception as e:
                    # Best effort: a sample whose image/messages cannot be
                    # prepared is reported and left out of the output.
                    logger.warning(f"处理图片失败: {e}")
                    continue

            processed_data.append(record)

        # Save the data; exist_ok avoids the check-then-create race.
        os.makedirs(self.save_path, exist_ok=True)
        output_file = os.path.join(self.save_path, f"test_{self.model_name.lower()}.json")
        self.saveJson(processed_data, output_file)
        logger.info(f"已保存 {len(processed_data)} 条GUI_OWL数据到 {output_file}")
            
        

            
    
if __name__ == '__main__':
    # Script entry point: parse the CLI arguments, build the AITZ
    # preprocessor and run the conversion routine selected by --model_name.
    args = parse_args()
    logger.info(args)
    process = AITZPreProcess(
        dataset_type=args.dataset_type,
        dataset_path=args.dataset_path,
        dataset_name=args.dataset_name,
        save_path=args.save_path,
        model_name=args.model_name,
    )

    # Supported --model_name values mapped to the name of the method to call.
    # Names (not bound methods) are stored so that only the selected method
    # is looked up on the instance.
    method_by_model = {
        "OS_ATLAS": "OS_ATLAS",
        "UI_TARS": "UI_TARS",
        "UI_TARS_1.5": "UI_TARS",
        "GUI_R1": "GUI_R1",
        'Agent_CPM': "Agent_CPM",
        'Aguvis': "Aguvis",
        'OS_Genesis': "OS_Genesis",
        'GUI_Odyssey': "GUI_Odyssey",
        'GUI_OWL': "GUI_OWL",
        'GPT_5': "GPT_5",
        'GLM_4_5_V': "GLM_4_5_V",
    }

    method_name = method_by_model.get(args.model_name)
    if method_name is None:
        # Unrecognised model name: nothing is processed.
        logger.info("error processing")
    else:
        getattr(process, method_name)()
        
        

