import json
import math
import os
import re
import sys
import time
from typing import Any

import dashscope
import torch
from PIL import Image, ImageDraw, ImageFont, ImageColor
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info, smart_resize

from K2_agent.agents import base_agent, m3a_utils
from K2_agent.agents.iterative_improvement import (
    IterativeTrajectoryCollector, 
    IterativeImprovementController
)
from K2_agent.env import interface, json_action

# Make the parent directory of this file importable so that the top-level
# `utils` package (imported just below) can be resolved.
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
sys.path.append(parent_dir)

from utils.agent_function_call import MobileUse
from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
    NousFnCallPrompt,
    Message,
    ContentItem,
)

# DashScope API key for the hosted high-level reasoning model. Read once at
# import time from the DASHSCOPE_API_KEY environment variable; falls back to an
# empty string when the variable is unset.
API_KEY = os.getenv("DASHSCOPE_API_KEY", "")

# Task-specific prompt templates have been moved to the demonstration analysis system.
# Templates are now generated dynamically from analyzed demonstration data.

# Basic fallback template for tasks without specific prompts
DEFAULT_TASK_TEMPLATE = """
Basic mobile operation guidelines:
- Break tasks into single-step instructions
- Analyze screenshots and action history carefully  
- Use proper action types: click, long_press, swipe, type, system_button
- Ensure task completion by checking execution results
- Handle text input carefully (clear existing text when needed)
- Navigate apps by swiping up to access app drawer
"""


def translate_action_to_json(action_data: dict) -> Any:
    """Translate a model-produced action dict into a ``JSONAction``.

    Args:
        action_data: Decoded tool-call arguments. Must contain ``"action"``;
            the other keys read depend on the action type
            (``coordinate``/``coordinate2``, ``text``, ``button``,
            ``status``, ``time``).

    Returns:
        The ``json_action.JSONAction`` equivalent of ``action_data``.

    Raises:
        ValueError: If the action type is not supported, a click/long_press
            has no ``coordinate``, or a terminate status is neither
            ``success`` nor ``infeasible``.
        KeyError: If a key required by the selected action type is missing
            (including an unknown ``button`` name).
    """
    action_type = action_data["action"].lower()

    if action_type == "swipe":
        x1, y1 = action_data["coordinate"]
        x2, y2 = action_data["coordinate2"]
        return json_action.JSONAction(
            action_type="swipe",
            start_x=x1,
            start_y=y1,
            end_x=x2,
            end_y=y2
        )

    if action_type == "type":
        return json_action.JSONAction(
            action_type="input_text",
            text=action_data.get("text", "")
        )

    # Substring match is intentional: any action name containing "click"
    # is treated as a plain click.
    if "click" in action_type or action_type == "long_press":
        if "coordinate" not in action_data:
            # Previously this fell through and crashed with UnboundLocalError.
            raise ValueError(
                f"Action '{action_type}' requires a 'coordinate' field"
            )
        x, y = action_data["coordinate"][0], action_data["coordinate"][1]
        target_type = "click" if "click" in action_type else "long_press"
        return json_action.JSONAction(action_type=target_type, x=x, y=y)

    if action_type == "system_button":
        button_mapping = {
            'Back': 'navigate_back',
            'Home': 'navigate_home',
            'Enter': 'keyboard_enter'
        }
        return json_action.JSONAction(
            action_type=button_mapping[action_data["button"]]
        )

    if action_type == "terminate":
        goal_status = action_data['status'].lower()
        if goal_status not in ["success", "infeasible"]:
            raise ValueError(f"Invalid goal status: {goal_status}")
        return json_action.JSONAction(
            action_type="status",
            goal_status=goal_status
        )

    if action_type == "delete_text":
        return json_action.JSONAction(action_type="delete_text")

    if action_type == "answer":
        return json_action.JSONAction(
            action_type="answer",
            text=action_data['text']
        )

    if action_type == "wait":
        # The wait duration is carried in the JSONAction ``text`` field.
        return json_action.JSONAction(
            action_type="wait",
            text=action_data['time']
        )

    raise ValueError(f"Unsupported action type: {action_type}")


class ActionVisualizer:
    """Draw semi-transparent action markers on top of screenshots.

    Each ``draw_*`` method paints onto a transparent RGBA overlay, alpha
    composites the overlay over the input image and returns the result
    converted to RGB. The input image object is not modified.
    """

    # Fallback fill (half-transparent red) used when no usable colour name
    # is supplied.
    _DEFAULT_COLOR = (255, 0, 0, 128)
    # Alpha applied to named colours.
    _ALPHA = 128

    @staticmethod
    def _resolve_color(color):
        """Return an RGBA fill tuple for ``color``.

        Only string colour specs are honoured; anything else (including
        ``None``) yields the default red. Unknown colour names also fall back
        to the default.
        """
        if not isinstance(color, str):
            return ActionVisualizer._DEFAULT_COLOR
        try:
            parsed = ImageColor.getrgb(color)
        except ValueError:
            return ActionVisualizer._DEFAULT_COLOR
        # getrgb() may return RGBA for specs that carry alpha; keep only the
        # RGB part so the result is always a valid 4-tuple.
        return tuple(parsed[:3]) + (ActionVisualizer._ALPHA,)

    @staticmethod
    def _new_overlay(image: Image.Image):
        """Create a fully transparent overlay matching ``image`` and its drawer."""
        overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
        return overlay, ImageDraw.Draw(overlay)

    @staticmethod
    def _composite(image: Image.Image, overlay: Image.Image):
        """Blend ``overlay`` over ``image`` and return the RGB result."""
        return Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')

    @staticmethod
    def draw_point(image: Image.Image, point: list, color=None):
        """Mark ``point`` (x, y) with a filled circle.

        The circle radius is 5% of the image's shorter side.
        """
        fill = ActionVisualizer._resolve_color(color)
        overlay, overlay_draw = ActionVisualizer._new_overlay(image)

        radius = min(image.size) * 0.05
        x, y = point
        overlay_draw.ellipse(
            [(x - radius, y - radius), (x + radius, y + radius)],
            fill=fill
        )
        return ActionVisualizer._composite(image, overlay)

    @staticmethod
    def draw_swipe(image: Image.Image, start_point: list, end_point: list, color=None):
        """Draw a line with an arrow head from ``start_point`` to ``end_point``.

        A zero-length swipe returns ``image`` itself, unchanged.
        """
        x1, y1 = start_point
        x2, y2 = end_point
        dx = x2 - x1
        dy = y2 - y1
        length = math.sqrt(dx**2 + dy**2)
        if length == 0:
            # No direction to draw an arrow for; hand back the input as-is.
            return image

        fill = ActionVisualizer._resolve_color(color)
        overlay, overlay_draw = ActionVisualizer._new_overlay(image)
        overlay_draw.line([x1, y1, x2, y2], fill=fill, width=5)

        # Arrow head: a triangle whose tip is the end point, built from a
        # 10-pixel vector along the swipe direction and its perpendicular.
        ratio = 10 / length
        arrow_dx = dx * ratio
        arrow_dy = dy * ratio
        overlay_draw.polygon([
            (x2, y2),
            (x2 - arrow_dx + arrow_dy, y2 - arrow_dy - arrow_dx),
            (x2 - arrow_dx - arrow_dy, y2 - arrow_dy + arrow_dx)
        ], fill=fill)

        return ActionVisualizer._composite(image, overlay)

    @staticmethod
    def draw_text(image: Image.Image, text: str, position: list, color=None, font_size=20):
        """Render ``text`` with its top-left corner at ``position`` (x, y).

        Uses ``arial.ttf`` at ``font_size`` when it can be loaded, otherwise
        Pillow's default font.
        """
        fill = ActionVisualizer._resolve_color(color)
        overlay, overlay_draw = ActionVisualizer._new_overlay(image)

        x, y = position
        try:
            font = ImageFont.truetype("arial.ttf", font_size)
        except IOError:
            font = ImageFont.load_default()

        overlay_draw.text((x, y), text, fill=fill, font=font)
        return ActionVisualizer._composite(image, overlay)


class DualModelAgent(base_agent.EnvironmentInteractingAgent):
    """DualModelAgent - supports demonstration data analysis and specialized prompt generation"""

    def __init__(
        self,
        env: interface.AsyncEnv,
        reason_model_path: str,
        function_model_path: str,
        device: str = "cuda",
        max_new_tokens: int = 128,
        name: str = 'DualModelAgent',
        wait_after_action_seconds: float = 2.0,
        analyze_mode: bool = False,
        task_name_for_analysis: str = None,
        analysis_model_path: str = "qwen2.5-vl-72b-instruct",
        enable_iterative_improvement: bool = False,
    ):
        """Initializes a DualModelAgent.

        Loads both Qwen2.5-VL models and their processors, freezes the
        function model, prepares the screenshot directory and, depending on
        the flags, sets up iterative improvement and runs demonstration
        analysis.

        Args:
            env: Environment the agent interacts with.
            reason_model_path: Path/identifier passed to ``from_pretrained``
                for the reasoning model and its processor.
            function_model_path: Path/identifier passed to ``from_pretrained``
                for the function-call model and its processor.
            device: Device the models are moved to; also stored on the agent.
            max_new_tokens: Stored as ``self.max_new_tokens``. The generate
                calls visible in this file pass their own limit.
            name: Agent name forwarded to the base class.
            wait_after_action_seconds: Seconds to sleep after executing an
                action in ``step``.
            analyze_mode: When true and ``task_name_for_analysis`` is set,
                demonstration analysis runs during construction.
            task_name_for_analysis: Task name used for analysis and for
                prompt lookup. May be ``None``.
            analysis_model_path: Model name handed to the demonstration
                analyzer.
            enable_iterative_improvement: When true, a trajectory collector
                and an improvement controller are created.
        """
        super().__init__(env, name)
        
        # Initialize models
        # NOTE(review): both models are loaded with device_map="auto" and then
        # moved again with .to(device) - confirm this combination is intended.
        self.reason_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            reason_model_path,
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
            device_map="auto"
        ).to(device)
        self.function_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            function_model_path,
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
            device_map="auto"
        ).to(device)
        # The function model is inference-only: gradients off, eval mode on.
        self._freeze_function_model()
        self.reason_processor = AutoProcessor.from_pretrained(reason_model_path)
        self.function_processor = AutoProcessor.from_pretrained(function_model_path)
        self.max_new_tokens = max_new_tokens
        self.device = device
        self.visualizer = ActionVisualizer()
        # Per-episode history: instruction strings, serialized actions, and
        # the raw text of the last reasoning response.
        self.history_instruction = []
        self.history_action = []
        self.history_reasoning = None
        self.additional_guidelines = None
        self.wait_after_action_seconds = wait_after_action_seconds
        
        # Ensure base directory exists
        self.base_save_dir = "./save/images/test2"
        os.makedirs(self.base_save_dir, exist_ok=True)

        # Demonstration data analysis related
        self.analyze_mode = analyze_mode
        self.task_name_for_analysis = task_name_for_analysis
        self.analysis_model_path = analysis_model_path
        # Task-specific prompt; filled by analysis or loaded lazily later.
        self.generated_prompt = None
        
        # Iterative improvement related
        self.enable_iterative_improvement = enable_iterative_improvement
        self.trajectory_collector = None
        self.improvement_controller = None
        self.current_task_name = None
        self.current_goal = None
        
        if self.enable_iterative_improvement:
            print("🔄 Enabling iterative improvement mode")
            self.trajectory_collector = IterativeTrajectoryCollector()
            self.improvement_controller = IterativeImprovementController()
        
        if self.analyze_mode:
            print("🔍 Enabling demonstration data analysis mode")
            # Analysis only runs when a task name was supplied.
            if self.task_name_for_analysis:
                print(f"🎯 Will analyze task: {self.task_name_for_analysis}")
                self._analyze_demonstration_data()

    def _analyze_demonstration_data(self):
        """Analyze demonstration data and generate a specialized prompt.

        Runs the demonstration analyzer for ``self.task_name_for_analysis``.
        When it yields a non-empty ``generated_prompt``, the prompt is stored
        in ``self.generated_prompt`` and written twice under
        ``./generated_prompts``: once with a timestamp in the file name and
        once under a fixed name.

        This is best-effort: any exception is reported on stdout and
        swallowed so the agent can continue with the default template.
        """
        try:
            # Import analyzer (delayed import to avoid circular dependency)
            from . import demonstration_analyzer_v2

            print("📦 Loading analysis model...")
            # Create independent analyzer instance (using API model name)
            analysis_model_path = getattr(self, 'analysis_model_path',
                                          "qwen2.5-vl-72b-instruct")
            analyzer = demonstration_analyzer_v2.DemonstrationAnalyzer(
                model_path=analysis_model_path,
                device=self.device
            )

            print(f"🔍 Starting demonstration data analysis: {self.task_name_for_analysis}")
            analysis_result = analyzer.analyze_demonstration(self.task_name_for_analysis)

            # A result without a usable prompt is treated like a failed
            # analysis instead of being reported as success and then crashing
            # while slicing/writing None.
            generated_prompt = (
                analysis_result.get("generated_prompt") if analysis_result else None
            )
            if not generated_prompt:
                print("❌ Analysis failed or no demonstration data found, will use default template")
                return

            self.generated_prompt = generated_prompt
            print("✅ Demonstration data analysis completed, generated specialized prompt")
            print(f"📝 Generated prompt preview:\n{generated_prompt[:200]}...")

            # Save generated prompt to file (dual format: timestamp + fixed name)
            import datetime
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

            prompt_dir = "./generated_prompts"
            os.makedirs(prompt_dir, exist_ok=True)

            timestamped_prompt_file = os.path.join(
                prompt_dir, f"{self.task_name_for_analysis}_prompt_{timestamp}.txt"
            )
            # Fixed-name copy kept for compatibility with older loaders.
            fixed_prompt_file = os.path.join(
                prompt_dir, f"{self.task_name_for_analysis}_prompt.txt"
            )
            for prompt_file in (timestamped_prompt_file, fixed_prompt_file):
                with open(prompt_file, 'w', encoding='utf-8') as f:
                    f.write(generated_prompt)

            print("✅ Prompt dual format save completed:")
            print(f"   📄 Timestamped version: {os.path.basename(timestamped_prompt_file)}")
            print(f"   📄 Fixed name version: {os.path.basename(fixed_prompt_file)}")
            print("📁 Prompt save completed, using unified generated_prompts directory management")

        except Exception as e:
            print(f"⚠️ Error occurred during analysis process: {str(e)}")
            print("🔄 Will continue with default template")

    def _freeze_function_model(self):
        """Freeze the function model"""
        for param in self.function_model.parameters():
            param.requires_grad = False
        self.function_model.eval()

    def set_task_guidelines(self, task_guidelines: list[str]) -> None:
        self.additional_guidelines = task_guidelines

    def reset(self, go_home_on_reset: bool = False):
        """Reset the agent for a new episode.

        Delegates to the base class, hides the automation UI and clears all
        per-episode history.

        Args:
            go_home_on_reset: Forwarded to the base class ``reset``.
        """
        super().reset(go_home_on_reset)
        self.env.hide_automation_ui()
        self.history_instruction = []
        # history_action is read in parallel with history_instruction when the
        # reasoning prompt is built, so it has to be cleared with it; stale
        # entries would otherwise be attributed to the new episode's steps.
        self.history_action = []
        self.history_reasoning = None
        
    def get_task_specific_prompt(self, goal: str) -> str:
        """Get task-specific prompt - supports intelligent prompt management"""
        print(f"🎯 Loading task-specific prompt for: {self.task_name_for_analysis}")
        
        # Prioritize using current prompt in memory
        if self.generated_prompt:
            print("✅ Using current prompt in memory")
            return self.generated_prompt
        
        # Try loading latest version from intelligent prompt manager
        if self.task_name_for_analysis and self.improvement_controller:
            print("🔍 Trying to load latest prompt from SmartPromptManager...")
            latest_prompt = self.improvement_controller.load_prompt(self.task_name_for_analysis)
            if latest_prompt:
                print("📋 Successfully loaded latest version from SmartPromptManager")
                self.generated_prompt = latest_prompt  # Cache to memory
                return latest_prompt
            else:
                print("⚠️ SmartPromptManager did not find prompt")
        
        # Fallback: load latest timestamped file from generated_prompts directory
        if self.task_name_for_analysis:
            prompt_dir = "./generated_prompts"
            latest_prompt_file = self._find_latest_timestamped_prompt(prompt_dir, self.task_name_for_analysis)
            print(f"🔄 Fallback attempt to load: {latest_prompt_file}")
            if latest_prompt_file and os.path.exists(latest_prompt_file):
                try:
                    with open(latest_prompt_file, 'r', encoding='utf-8') as f:
                        cached_prompt = f.read().strip()
                    if cached_prompt:
                        print("📋 Successfully loaded latest prompt from generated_prompts directory")
                        self.generated_prompt = cached_prompt  # Cache to memory
                        return cached_prompt
                except Exception as e:
                    print(f"❌ Failed to read prompt: {e}")
        
        # Default return None, use basic template
        print("⚠️ No prompt found, will use default basic template")
        return None

    def start_iterative_task(self, task_name: str, goal: str):
        """Start iterative improvement task"""
        if self.enable_iterative_improvement and self.trajectory_collector:
            self.current_task_name = task_name
            self.current_goal = goal
            self.trajectory_collector.start_trajectory_collection(task_name, goal, self.name)
            
            # Load existing prompt improvements
            if self.improvement_controller:
                improved_prompt = self.improvement_controller.load_prompt(task_name)
                if improved_prompt:
                    self.generated_prompt = improved_prompt
                    print(f"🎯 Loaded existing improved prompt")

    def step(self, goal: str) -> base_agent.AgentInteractionResult:
        """Run one observe -> reason -> act cycle.

        Captures the current screen, asks the models for the next action,
        executes it and records the resulting screen. When iterative
        improvement is enabled, the step is also handed to the trajectory
        collector.

        Args:
            goal: Natural-language task the agent is working on.

        Returns:
            ``AgentInteractionResult(done, step_data)``. ``done`` is True only
            when the generated action is a ``status`` (terminate) action.

        Raises:
            RuntimeError: If action generation yields nothing.
        """
        step_data = {
            'raw_screenshot': None,
            'before_screenshot_with_som': None,
            'before_ui_elements': [],
            'after_screenshot_with_som': None,
            'action_prompt': None,
            'action_output': None,
            'action_output_json': None,
            'action_reason': None,
            'action_raw_response': None,
            'summary_prompt': None,
            'summary': None,
            'summary_raw_response': None,
        }
        print('----------step ' + str(len(self.history_instruction) + 1))
        start = time.perf_counter()
        time.sleep(1)
        state = self.get_post_transition_state()
        step_data['raw_screenshot'] = state.pixels.copy()

        # Iterative improvement: save pre-execution screenshot
        before_screenshot = state.pixels.copy() if self.enable_iterative_improvement else None

        # Generate action
        observation = {
            'image_path': self._save_screenshot(step_data['raw_screenshot']),
            'prompt': goal,
            'history': self.history_instruction,
        }
        action = self._generate_action(observation)
        # Validate before the first dereference of ``action``.
        if not action:
            raise RuntimeError('Error generating action.')

        if action['action'].action_type == 'status':
            if action['action'].goal_status == 'infeasible':
                print('Agent stopped since it thinks mission impossible.')
            step_data['summary'] = 'Agent thinks the request has been completed.'
            # history_instruction holds instruction strings that are later
            # concatenated into the reasoning prompt, so the step_data dict is
            # deliberately not appended to it.

            # Iterative improvement: collect trajectory step
            if self.enable_iterative_improvement and self.trajectory_collector and before_screenshot is not None:
                self.trajectory_collector.collect_step(
                    before_screenshot=before_screenshot,
                    reasoning=action.get('reason', ''),
                    instruction='Task completed',
                    action_raw=str(action['action']),
                    task_completed=True
                )

            return base_agent.AgentInteractionResult(True, step_data)

        end = time.perf_counter()
        print('Reasoning runtime: {} seconds'.format(end-start))
        start = time.perf_counter()

        step_data['action_output'] = action['action']
        step_data['action_reason'] = action['reason']
        step_data['action_raw_response'] = action['raw_response']
        # The action is already a converted JSONAction at this point, so no
        # further parsing (and no error handling around it) is needed.
        step_data['action_output_json'] = action['action']

        # Execute action
        try:
            self.env.execute_action(action['action'])
        except Exception as e:
            print('Failed to execute action.')
            print(str(e))
            step_data['summary'] = (
                'Can not execute the action, make sure to select the action with'
                ' the required parameters (if any) in the correct JSON format!'
            )
            return base_agent.AgentInteractionResult(False, step_data)

        # Give the UI time to react before capturing the result.
        time.sleep(self.wait_after_action_seconds)
        end = time.perf_counter()
        print('Action execution runtime: {} seconds'.format(end-start))
        start = time.perf_counter()

        state = self.env.get_state(wait_to_stabilize=False)
        after_screenshot = state.pixels.copy()
        m3a_utils.add_screenshot_label(after_screenshot, 'after')
        step_data['after_screenshot_with_som'] = after_screenshot.copy()

        # Iterative improvement: collect trajectory step
        if self.enable_iterative_improvement and self.trajectory_collector and before_screenshot is not None:
            self.trajectory_collector.collect_step(
                before_screenshot=before_screenshot,
                reasoning=action.get('reason', ''),
                instruction=self.history_instruction[-1] if self.history_instruction else '',
                action_raw=str(action['action']),
                after_screenshot=after_screenshot,
                task_completed=False
            )

        end = time.perf_counter()
        print('Cleanup work runtime: {} seconds'.format(end-start))
        return base_agent.AgentInteractionResult(False, step_data)

    def _save_screenshot(self, pixels, som=False):
        """Write ``pixels`` as a PNG under ``self.base_save_dir``.

        The file is named ``scr_<unix-seconds>.png``, or
        ``scr_<unix-seconds>_som.png`` when ``som`` is true.

        Returns:
            The path of the written file, or ``None`` if saving failed (the
            error is printed, not raised).
        """
        stamp = int(time.time())
        file_name = f'scr_{stamp}_som.png' if som else f'scr_{stamp}.png'
        target_path = os.path.join(self.base_save_dir, file_name)
        try:
            Image.fromarray(pixels).save(target_path)
        except Exception as e:
            print(f"Error saving screenshot: {e}")
            return None
        return target_path

    def _generate_answer_call(self, observation, answer):
        action_data = {'action': 'answer', 'text': answer}
        convert_action = self._parse_answer_call(observation, action_data)
        print(convert_action)
        return {
            'action': convert_action,
            'reason': 'Generated by function model',
            'raw_response': answer
        }
    
    def _generate_action(self, observation):
        """Generate action using DualModelAgent approach"""
        # Phase 1: Generate instruction
        if len(self.history_instruction) > 1 and 'Answer' in self.history_instruction[-1]:
            print("complete based on answer")
            response_mode = "instruction"
            instruction = "The task is completed."
        else:
            reasoning_messages = self._build_reasoning_messages(observation)
            response_mode, instruction = self._generate_instruction(reasoning_messages, use_api=True)
        
        if response_mode == "instruction":
            # Phase 2: Generate function call
            action = self._generate_function_call(observation, instruction)
        if response_mode == "answer":
            action = self._generate_answer_call(observation, instruction)
        
        return {
            'action': action['action'],
            'reason': action['reason'],
            'raw_response': action['raw_response']
        }

    def _build_reasoning_messages(self, observation):
        """Build reasoning phase prompt information"""
        dummy_image = Image.open(observation['image_path'])
        resized_height, resized_width = smart_resize(dummy_image.height, dummy_image.width)

        # First check if there's a specialized prompt
        task_specific_prompt = self.get_task_specific_prompt(observation["prompt"])
        if task_specific_prompt:
            template = task_specific_prompt
            print("🎯 Using analysis-generated specialized prompt")
        else:
            template = DEFAULT_TASK_TEMPLATE
            print("📋 Using default basic template")
        
        system_content = {
            "role": "system",
            "content": [{
                "type": "text",
                "text": """
You are a mobile operation Agent that performs precise screen interactions. You will handle either a task or a question by breaking it down into single-step instructions. At each step, you will reason based on the current task, the current screen screenshot, and the history of actions to determine the next action. You need to be aware that action may not have executed all of the content of the instruction. When the content of the instruction is not executed, you need to give the remaining part of the instruction.
Your output MUST STRICTLY follow this structure:<reasoning> reasoning process here </reasoning> <instruction>Instruction: ...</instruction> OR <reasoning> reasoning process here </reasoning> <answer>Answer: ...</answer>. 
Here is the last reasoning: """ + str(self.history_reasoning) + """ Here are all the history actions: """
+ str([
    'Step ' + str(i + 1) + ': ' + step_info + ' Action: ' + self.history_action[i]
    for i, step_info in enumerate(self.history_instruction)
]) +
 """ This is a guide to the software you need to use for your current task: """ + template 
            }]
        }

        user_content = [{
            "type": "image",
            "image": observation['image_path'] 
        }, {
            "type": "text",
            "text": observation['prompt'] 
        }]
        return [system_content, {"role": "user", "content": user_content}]

    def _generate_instruction(self, messages, use_api=False):
        """Generate the next instruction (or answer) with the reasoning model.

        Args:
            messages: Chat messages as produced by
                ``_build_reasoning_messages``.
            use_api: When true, the hosted DashScope model is called;
                otherwise the locally loaded reasoning model generates the
                response.

        Returns:
            ``(response_mode, text)`` as returned by ``_parse_instruction``.

        Raises:
            RuntimeError: If the DashScope request does not succeed.
            ValueError: If the response cannot be parsed at all.

        Side effects:
            Stores the raw response in ``self.history_reasoning`` and appends
            the instruction (or ``"Answer: <text>"``) to
            ``self.history_instruction``.
        """
        if use_api:
            from http import HTTPStatus

            dashscope.api_key = API_KEY
            api_model_path = "qwen2.5-vl-72b-instruct"
            api_response = dashscope.MultiModalConversation.call(model=api_model_path, messages=messages)
            # A failed request carries no usable ``output``; surface the
            # service error instead of failing later on a None subscript.
            if api_response.status_code != HTTPStatus.OK:
                raise RuntimeError(
                    "DashScope request failed: "
                    f"status={api_response.status_code}, "
                    f"code={getattr(api_response, 'code', None)}, "
                    f"message={getattr(api_response, 'message', None)}"
                )
            response = api_response.output["choices"][0]["message"]["content"][0]["text"]
        else:
            text = self.reason_processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, _ = process_vision_info(messages)

            inputs = self.reason_processor(
                text=[text],
                images=image_inputs,
                padding=True,
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                generated_ids = self.reason_model.generate(
                    **inputs,
                    max_new_tokens=2048
                )
                # Drop the prompt tokens so only newly generated text is decoded.
                generated_ids_trimmed = [
                    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
                ]
                response = self.reason_processor.batch_decode(
                    generated_ids_trimmed,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False
                )[0]
        print("goal")
        print(messages[1]["content"][1]["text"])
        print("response")
        print(response)
        print("-"*50)
        self.history_reasoning = response

        parsed = self._parse_instruction(response)
        if parsed is None:
            raise ValueError("Reasoning model response contained no instruction or answer")
        response_mode, instruction = parsed

        if response_mode == "instruction":
            self.history_instruction.append(instruction)
        if response_mode == "answer":
            # Answers are stored with an "Answer: " prefix; _generate_action
            # looks for that marker on the following step.
            current_instruction = "Answer: " + instruction
            self.history_instruction.append(current_instruction)
        return response_mode, instruction

    def _parse_instruction(self, response):
        instruction_pattern = r'<instruction>\s*(?:Instruction:\s*)?(.+?)</instruction>'
        instruction_matches = re.findall(instruction_pattern, response, re.IGNORECASE | re.DOTALL)
        if instruction_matches:
            instruction = instruction_matches[-1].strip()
            print(f"instruction: {instruction}")
            return "instruction", instruction

        # If no instruction found, try matching <answer>
        answer_pattern = r'<answer>\s*(?:Answer:\s*)?(.+?)</answer>'
        answer_matches = re.findall(answer_pattern, response, re.IGNORECASE | re.DOTALL)
        if answer_matches:
            answer = answer_matches[-1].strip()
            print(f"answer: {answer}")
            return "answer", answer
        print("parse error")
        # If neither matched, try last line
        last_line = response.split('\n')[-1].strip()
        if last_line:
            return "instruction", last_line
        print("error, No content found")

    def _generate_function_call(self, observation, instruction):
        """Turn a natural-language instruction into a concrete action.

        Builds a function-calling prompt around the ``MobileUse`` tool
        (configured with the smart-resized screenshot dimensions), runs the
        function model on the instruction plus screenshot, and converts the
        generated tool call through ``_parse_function_call``.

        Args:
            observation: Dict with ``image_path`` pointing at the screenshot.
            instruction: Single-step instruction from the reasoning phase.

        Returns:
            Dict with keys ``action`` (converted action), ``reason`` and
            ``raw_response`` (decoded model output).
        """
        dummy_image = Image.open(observation['image_path'])
        resized_height, resized_width = smart_resize(dummy_image.height, dummy_image.width)
        mobile_use = MobileUse(
            cfg={"display_width_px": resized_width, "display_height_px": resized_height}
        )

        # Action-model system prompt, spelled out piecewise so that the
        # trailing spaces on some lines stay visible.
        system_prompt = (
            '\n'
            'You are a helpful assistant. You need to operate the mobile phone based on the instruction.\n'
            '# Output Examples(You need to choose the right action according to the instruction):\n'
            '- When the instruction is related to "long press", the output MUST be like:\n'
            '<tool_call>{"name": "mobile_use", "arguments": {"action": "long_press", "coordinate": [xxx, xxx]}}</tool_call>. \n'
            '- When the instruction is related to "click" or "tap", the output should be like:\n'
            '<tool_call>{"name": "mobile_use", "arguments": {"action": "click", "coordinate": [xxx, xxx]}}</tool_call>. \n'
            '- When the instruction is related to "swipe" , the output should be like:\n'
            '<tool_call>{"name": "mobile_use", "arguments": {"action": "swipe", "coordinate": [xxx, xx], "coordinate2": [xxx, xxx]}}</tool_call>\n'
            '- When the instruction is related to "Back button" or "Home button" , the output should be like:\n'
            '<tool_call>{"name": "mobile_use", "arguments": {"action": "system_button", "button": "Back"}}</tool_call>\n'
            '- When the instruction is related to "type", the output should be like:\n'
            '<tool_call>{"name": "mobile_use", "arguments": {"action": "type", "text": "xxx"}}</tool_call>\n'
            '- When the task is completed, the output should be like:\n'
            '<tool_call>{"name": "mobile_use", "arguments": {"action": "terminate", "status": "success"}}</tool_call>\n'
        )

        user_query = 'The user query:  ' + instruction
        message = NousFnCallPrompt.preprocess_fncall_messages(
            messages=[
                Message(role="system", content=[ContentItem(text=system_prompt)]),
                Message(role="user", content=[
                    ContentItem(text=user_query),
                    ContentItem(image=observation['image_path'])
                ]),
            ],
            functions=[mobile_use.function],
            lang=None,
        )
        message = [msg.model_dump() for msg in message]

        text = self.function_processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
        # Send inputs to the configured device (the same one the models were
        # moved to) rather than a hard-coded 'cuda'.
        inputs = self.function_processor(
            text=[text], images=[dummy_image], padding=True, return_tensors="pt"
        ).to(self.device)
        with torch.no_grad():
            output_ids = self.function_model.generate(**inputs, max_new_tokens=2048)
        # Strip the prompt tokens from each generated sequence.
        generated_ids = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)
        ]
        output_text = self.function_processor.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )[0]

        print("=============================output_text=================================")
        print(output_text)
        convert_action = self._parse_function_call(observation, output_text)
        print(convert_action)
        return {
            'action': convert_action,
            'reason': 'Generated by function model',
            'raw_response': output_text
        }
        
    def _generate_visualization(self, image, action, width, height):
        """Generate visualization image based on action type.

        Args:
            image: Screenshot to annotate. It is handed unchanged to the
                matching ``self.visualizer.draw_*`` method.
            action: Parsed action arguments. ``action['action']`` selects the
                branch; ``coordinate``, ``coordinate2`` and ``text`` are read
                only by the branches that need them.
            width: Target width. Used to anchor typed text and to size the
                fallback image.
            height: Target height. Used the same way as ``width``.

        Returns:
            The value returned by the visualizer for "swipe", "type", "click"
            and "long_press" actions; for any other action type, ``image``
            resized to ``(width, height)``.

        Raises:
            KeyError: If ``action`` lacks "action" or a key required by the
                selected branch.
        """
        action_type = action['action']

        if action_type == "swipe":
            return self.visualizer.draw_swipe(
                image,
                action['coordinate'],
                action['coordinate2']
            )
        elif action_type == "type":
            # Text is anchored at 10% of the width and half of the height.
            return self.visualizer.draw_text(
                image,
                action['text'],
                [width*0.1, height*0.5],
                font_size=60
            )
        elif action_type == "click":
            return self.visualizer.draw_point(
                image,
                action['coordinate'],
                color='green'
            )
        elif action_type == "long_press":
            return self.visualizer.draw_point(
                image,
                action['coordinate'],
                color='red'
            )

        # Only the fallback needs the resized copy, so it is built here rather
        # than up front, where it would be thrown away for every drawn action.
        return image.resize((width, height))

    def _parse_answer_call(self, observation, action):
        """Parse answer call.

        Checks that the observation carries an openable screenshot, then
        converts the already-parsed ``action`` with
        ``translate_action_to_json``.

        Args:
            observation: Mapping that must provide a non-empty 'image_path'.
            action: Action dictionary passed to ``translate_action_to_json``.

        Returns:
            The value returned by ``translate_action_to_json(action)``.

        Raises:
            ValueError: If 'image_path' is missing or empty.
            Exception: Any error from opening the image or translating the
                action is printed and re-raised unchanged.
        """
        if not observation.get('image_path'):
            raise ValueError("Missing image path in observation")

        try:
            # The screenshot itself is not needed here; opening it only
            # confirms the path points at a readable image. The context
            # manager releases the file handle immediately instead of leaking
            # it until garbage collection.
            with Image.open(observation['image_path']):
                pass

            # Convert action format
            return translate_action_to_json(action)

        except Exception as e:
            print(f"Error in _parse_answer_call: {e}")
            raise

    def _parse_function_call(self, observation, action):
        """Parse function call.

        Extracts the JSON tool call from the raw model output, records its
        arguments in ``self.history_action``, saves an annotated screenshot
        under ``self.base_save_dir`` and converts the arguments with
        ``translate_action_to_json``.

        Args:
            observation: Mapping that must provide a non-empty 'image_path'.
            action: Raw model output text containing one JSON object with an
                'arguments' member (for example wrapped in <tool_call> tags).

        Returns:
            The value returned by ``translate_action_to_json`` for the parsed
            arguments.

        Raises:
            ValueError: If 'image_path' is missing or empty.
            json.JSONDecodeError: If no JSON object can be parsed from
                ``action``.
            Exception: Any other failure (missing 'arguments', image or
                file-system errors, translation errors) is printed and
                re-raised unchanged.
        """
        if not observation.get('image_path'):
            raise ValueError("Missing image path in observation")

        try:
            # Keep the screenshot open only while it is being annotated and
            # saved; the context manager closes the file handle afterwards.
            with Image.open(observation['image_path']) as dummy_image:
                resized_height, resized_width = smart_resize(dummy_image.height, dummy_image.width)

                # The JSON payload spans from the first '{' to the last '}'.
                start = action.find('{')
                end = action.rfind('}') + 1
                if start == -1 or end <= start:
                    # Same exception type json.loads would raise on the empty
                    # slice, but with a message that names the real problem.
                    raise json.JSONDecodeError(
                        "No JSON object found in model output", action, 0
                    )
                action = json.loads(action[start:end])['arguments']
                self.history_action.append(json.dumps(action))

                # Generate visualization image
                display_image = self._generate_visualization(
                    dummy_image,
                    action,
                    resized_width,
                    resized_height
                )

                # One directory per step, numbered from the instruction history.
                step_num = len(self.history_instruction) + 1
                save_dir = os.path.join(self.base_save_dir, f"step_{step_num}")
                os.makedirs(save_dir, exist_ok=True)

                save_path = os.path.join(save_dir, "action_visual.png")
                display_image.save(save_path)

            # Convert action format
            return translate_action_to_json(action)

        except Exception as e:
            print(f"Error in _parse_function_call: {e}")
            raise

    def _find_latest_timestamped_prompt(self, prompt_dir: str, task_name: str) -> "str | None":
        """Find the latest timestamped prompt file for specified task in directory.

        Candidate files are named ``{task_name}_prompt_{timestamp}.txt``.
        Files whose timestamp part contains "backup" are ignored. When no
        timestamped file exists, the legacy ``{task_name}_prompt.txt`` is used
        if present.

        Args:
            prompt_dir: Directory to search.
            task_name: Task name that prefixes the prompt file names.

        Returns:
            Path of the selected prompt file, or None when the directory does
            not exist, nothing matches, or an error occurs while searching.
        """
        try:
            if not os.path.exists(prompt_dir):
                return None

            prefix = f"{task_name}_prompt_"
            suffix = ".txt"

            # Collect (timestamp, filename) for every matching prompt file.
            candidates = []
            for filename in os.listdir(prompt_dir):
                if not (filename.startswith(prefix) and filename.endswith(suffix)):
                    continue
                # Strip exactly one prefix and one suffix; a replace-all would
                # also eat identical text inside the timestamp part.
                timestamp_part = filename[len(prefix):-len(suffix)]
                # Look for the backup marker only after the prefix, so a task
                # whose own name contains "backup" is not filtered out.
                if "backup" in timestamp_part:
                    continue
                candidates.append((timestamp_part, filename))

            if not candidates:
                # Fallback: try finding old format file without timestamp
                old_format_file = os.path.join(prompt_dir, f"{task_name}_prompt.txt")
                if os.path.exists(old_format_file):
                    print(f"📁 Found old format file: {old_format_file}")
                    return old_format_file
                return None

            # Timestamps are compared as strings, so the newest file wins only
            # if the timestamp format sorts lexicographically — TODO confirm
            # the writer uses a fixed-width format.
            _, latest_filename = max(candidates, key=lambda item: item[0])
            print(f"📁 Found latest timestamped prompt file: {latest_filename}")
            return os.path.join(prompt_dir, latest_filename)

        except Exception as e:
            # Best-effort lookup: report the problem and let the caller fall
            # back to its default instead of failing the run.
            print(f"⚠️ Error finding latest prompt file: {e}")
            return None