"""
React* GUI Agent Adapter

Independent implementation of the React* agent that conforms to the BaseGUIAgent interface.
This agent is self-contained apart from the trajectory models (ReActStepInfo, ReActTraj) it imports from autorpa.utils.models.
"""

import copy
import sys
import time
from typing import Any

from absl import flags

from gui_agents.react_star.core.models import PlannerOutput, PlannerStepData, SummarizerOutput
from gui_agents.interfaces import BaseGUIAgent

FLAGS = flags.FLAGS

from autorpa.utils.models import ReActStepInfo, ReActTraj
from .core.environment_adapter import EnvironmentAdapter, AgentAdapter
from .core.utils import (
    print_with_color,
    write_to_file,
    store_image,
    add_screenshot_label,
    record_cost_tokens,
)
from .prompts.planner_prompt import get_planner_prompt
from .prompts.summarizer_prompt import get_summarizer_prompt


class ReactStarAgent(BaseGUIAgent):
    """
    React* GUI Agent - An independent agent implementation.
    
    This agent uses a ReAct (Reasoning + Acting) loop with planning and summarization.
    It's designed to be independent of any specific framework.
    
    Key Components:
    1. Planner: Observes screen and decides next action
    2. Executor: Executes the planned action
    3. Summarizer: Summarizes execution results
    
    Note: Reflection generation is NOT part of this agent. It's handled by the
    AutoRPA pipeline's Concluder agent, which analyzes failed exploration trajectories
    and generates reflections for subsequent exploration rounds.
    """
    
    def __init__(
        self,
        agent: Any,  # Agent instance (wrapped via adapter)
        planner_llm: Any = None,
        summarizer_llm: Any = None,
        action_space_mode: str = 'index',  # 'index' or 'coordinate'
        ui_info_mode: str = 'screenshot_with_tree',  # 'screenshot_with_tree' or 'screenshot_only' or 'screenshot_only_som'
        img_resize_mode: str = 'resized',  # 'original' or 'resized'
        enable_shell_action: bool = False  # Whether to enable shell command action
    ):
        """
        Build a React* agent around an existing agent object.

        The three mode arguments are validated here; any unsupported value or
        unsupported combination raises ValueError.

        Args:
            agent: The Agent instance; it is wrapped in an AgentAdapter.
            planner_llm: LLM used for planning. Stored as given (may be None).
            summarizer_llm: LLM used for summarizing. When None, the wrapped
                agent's ``llm`` attribute is used instead.
            action_space_mode: 'index' (element indexes) or 'coordinate'
                (pixel coordinates).
            ui_info_mode: 'screenshot_with_tree', 'screenshot_only' or
                'screenshot_only_som'. 'index' mode cannot be combined with
                'screenshot_only', because that mode provides no element indexes.
            img_resize_mode: 'original' or 'resized'.
            enable_shell_action: Whether the shell command action is enabled.

        Raises:
            ValueError: If a mode value is not one of the supported options, or
                if action_space_mode='index' is paired with
                ui_info_mode='screenshot_only'.
        """
        import logging

        # Wrap the raw agent so the rest of the class talks to one interface.
        self.agent = AgentAdapter(agent)
        self._start_time = None

        # LLM selection: the summarizer falls back to the wrapped agent's LLM.
        self.planner_llm = planner_llm
        if summarizer_llm is None:
            summarizer_llm = self.agent.llm
        self.summarizer_llm = summarizer_llm

        # --- action space mode ---
        log = logging.getLogger('gui_agents.react_star.adapter')
        log.debug(f"Received action_space_mode={action_space_mode}, type={type(action_space_mode)}")
        supported_action_spaces = ('index', 'coordinate')
        if action_space_mode not in supported_action_spaces:
            raise ValueError(f"action_space_mode must be 'index' or 'coordinate', got '{action_space_mode}'")
        self.action_space_mode = action_space_mode
        log.debug(f"Set self.action_space_mode={self.action_space_mode}")

        # --- UI info mode ---
        supported_ui_modes = ('screenshot_with_tree', 'screenshot_only', 'screenshot_only_som')
        if ui_info_mode not in supported_ui_modes:
            raise ValueError(f"ui_info_mode must be 'screenshot_with_tree' or 'screenshot_only' or 'screenshot_only_som', got '{ui_info_mode}'")

        # Index-based actions need element indexes, which a bare screenshot
        # does not carry; the tree and SOM variants both do.
        index_without_indexes = (
            action_space_mode == 'index' and ui_info_mode == 'screenshot_only'
        )
        if index_without_indexes:
            raise ValueError(
                "Invalid combination: action_space_mode='index' requires ui_info_mode='screenshot_with_tree' or 'screenshot_only_som' "
                "(index mode needs ally tree or SOM markers to identify element indexes)"
            )
        self.ui_info_mode = ui_info_mode

        # --- image resize mode ---
        supported_resize_modes = ('original', 'resized')
        if img_resize_mode not in supported_resize_modes:
            raise ValueError(f"img_resize_mode must be 'original' or 'resized', got '{img_resize_mode}'")
        self.img_resize_mode = img_resize_mode

        # --- shell action switch ---
        self.enable_shell_action = enable_shell_action

        # Echo the effective configuration for debugging.
        print_with_color(
            f'React* Agent Configuration: action_space_mode={self.action_space_mode}, ui_info_mode={self.ui_info_mode}, img_resize_mode={self.img_resize_mode}, enable_shell_action={self.enable_shell_action}',
            'cyan'
        )
        
    async def explore_task(
        self,
        task: Any,
        env_op: Any,  # EnvOperation instance
        max_steps: int = 20,
        reflection: str = None,
        **kwargs
    ) -> ReActTraj:
        """
        Run one exploration round of the ReAct loop and score the outcome.

        This performs a SINGLE round. Per the class notes, a reflection for a
        failed round is produced outside this agent and can be passed back in
        through ``reflection`` on the next call.

        Args:
            task: Task object; ``task.goal`` and ``task.is_successful`` are used.
            env_op: Environment operation interface (EnvOperation).
            max_steps: Maximum number of steps for this round.
            reflection: Optional reflection text from a previous failed attempt;
                a falsy value is stored as ''.
            **kwargs: Additional parameters
                - to_init_task: Whether to initialize task environment (default True)
                - log_path: Path for logging (default '')

        Returns:
            ReActTraj: Trajectory for this round, with env_success_score,
            agent_done_bool, final_success_score and final_success_bool filled in.
        """
        log_path = kwargs.get('log_path', '')
        to_init_task = kwargs.get('to_init_task', True)

        self._start_time = time.time()

        # The loop talks to the environment through the adapter, while the
        # wrapped agent keeps a handle on the raw env_op.
        wrapped_env = EnvironmentAdapter(env_op)
        self.agent.env_op = env_op

        # Fresh environment and agent state for this round.
        env_op.reset(
            task=task,
            save_path=log_path,
            to_init_task=to_init_task,
            max_action_step=max_steps
        )
        self.agent.reset(
            task=task,
            log_task_path=log_path,
            to_init_task=to_init_task
        )

        # Goal and (optional) reflection feed the planner prompt.
        self.agent.cur_task = task.goal
        self.agent.reflection = reflection or ''

        react_traj = await self._run_react_loop(
            task=task,
            env_op=wrapped_env,
            max_steps=max_steps,
        )

        # The environment's verdict only counts when the agent itself declared
        # the task done; otherwise the final score is forced to 0.0.
        env_success_score = task.is_successful(env_op.raw_env)
        agent_done_bool = self.agent.flag_done
        if agent_done_bool:
            final_success_score = env_success_score
        else:
            final_success_score = 0.0

        react_traj.env_success_score = env_success_score
        react_traj.agent_done_bool = agent_done_bool
        react_traj.final_success_score = final_success_score
        react_traj.final_success_bool = bool(final_success_score > 0.5)

        return react_traj
    
    async def _run_react_loop(
        self,
        task: Any,
        env_op: EnvironmentAdapter,
        max_steps: int,
    ) -> ReActTraj:
        """
        Drive ReAct steps until the agent finishes, the step budget is spent,
        or the environment reports done.

        Args:
            task: Task object; only ``task.goal`` is read here.
            env_op: Adapter around the environment for this round.
            max_steps: Upper bound on the number of steps executed.

        Returns:
            ReActTraj: Trajectory for this round. The success fields are
            placeholders (0.0 / False) that the caller overwrites after
            evaluating the environment.
        """
        executed = 0

        while True:
            # Same stop conditions as the loop head: agent done or budget spent.
            if self.agent.flag_done or executed >= max_steps:
                break

            # The environment can end the round on its own.
            if env_op.done:
                print_with_color(
                    "🛑 Environment indicates done (e.g., max steps reached, rpa code indicates done). Stopping ReAct loop.",
                    'red'
                )
                break

            self._execute_react_step(env_op=env_op)
            executed += 1

        return ReActTraj(
            task=task.goal,
            reflection=self.agent.reflection or '',
            traj=self.agent.agent_traj,
            action_history=self.agent.action_history,
            env_success_score=0.0,  # placeholder, set by caller
            agent_done_bool=self.agent.flag_done,
            final_success_score=0.0,  # placeholder, set by caller
            final_success_bool=False,  # placeholder, set by caller
            conclusion='',
        )
    
    def _convert_action_code_to_env_op(self, action_code: str) -> str:
        """
        Convert action code from React_star format (without env_op prefix) to env_op format.
        
        Args:
            action_code: Action code without env_op prefix (e.g., "click(6)", "input_text('text', 1)")
            
        Returns:
            Action code with env_op prefix (e.g., "env_op.click(6)", "env_op.input_text('text', 1)")
        """
        import re
        
        # List of action names that should be prefixed with env_op
        action_names = [
            'click', 'long_press', 'input_text', 'swipe', 'drag_and_drop',
            'wait', 'keyboard_enter', 'open_app', 'go_back', 'go_home',
            'stop', 'answer', 'shell'
        ]
        
        # Pattern to match action calls at the start of the line or after whitespace
        # This handles cases like "click(6)" or "input_text('text', 1)"
        for action_name in action_names:
            # Match action_name followed by opening parenthesis
            pattern = r'(^|\s)(' + re.escape(action_name) + r')\('
            replacement = r'\1env_op.\2('
            action_code = re.sub(pattern, replacement, action_code)
        
        return action_code
    
    def _convert_resized_coordinates_to_original(self, action_code: str) -> str:
        """
        Convert coordinates from resized image scale (461*1024) to original scale (1080*2400).
        
        Only applies when:
        - action_space_mode == 'coordinate'
        - img_resize_mode == 'resized'
        
        Args:
            action_code: Action code string (e.g., "env_op.click(230, 512)")
            
        Returns:
            Action code with converted coordinates (e.g., "env_op.click(540, 1200)")
        """
        # If no conversion needed, return as-is
        if self.action_space_mode != 'coordinate' or self.img_resize_mode != 'resized':
            return action_code
        
        import re
        
        # Define scale factors
        SCALE_X = 1080 / 461  # ≈ 2.343
        SCALE_Y = 2400 / 1024  # ≈ 2.344
        
        # Check if this is a coordinate-based action
        coordinate_actions = ['click', 'long_press', 'swipe', 'drag_and_drop', 'input_text']
        is_coordinate_action = any(action in action_code for action in coordinate_actions)
        if not is_coordinate_action:
            return action_code
        
        # Handle click(x, y) and long_press(x, y)
        # Match both "env_op.click(x, y)" and "click(x, y)" formats
        for action_name in ['click', 'long_press']:
            # Pattern matches: env_op.action_name(x, y) or action_name(x, y)
            pattern = rf'(env_op\.)?({action_name}\()(\d+),\s*(\d+)\)'
            def replace_coords(match, action=action_name):
                prefix = match.group(1) or ''  # "env_op." or ""
                x = int(match.group(3))
                y = int(match.group(4))
                new_x = int(x * SCALE_X)
                new_y = int(y * SCALE_Y)
                return f'{prefix}{action}({new_x}, {new_y})'
            action_code = re.sub(pattern, replace_coords, action_code)
        
        # Handle swipe(start_x, start_y, end_x, end_y)
        # Match both "env_op.swipe(...)" and "swipe(...)" formats
        if 'swipe' in action_code:
            # Check if it's coordinate-based swipe (4 numbers) vs direction-based (string)
            pattern = r'(env_op\.)?(swipe\()(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\)'
            def replace_swipe(match):
                prefix = match.group(1) or ''  # "env_op." or ""
                start_x = int(match.group(3))
                start_y = int(match.group(4))
                end_x = int(match.group(5))
                end_y = int(match.group(6))
                new_start_x = int(start_x * SCALE_X)
                new_start_y = int(start_y * SCALE_Y)
                new_end_x = int(end_x * SCALE_X)
                new_end_y = int(end_y * SCALE_Y)
                return f'{prefix}swipe({new_start_x}, {new_start_y}, {new_end_x}, {new_end_y})'
            action_code = re.sub(pattern, replace_swipe, action_code)
        
        # Handle drag_and_drop(start_x, start_y, end_x, end_y)
        # Match both "env_op.drag_and_drop(...)" and "drag_and_drop(...)" formats
        if 'drag_and_drop' in action_code:
            pattern = r'(env_op\.)?(drag_and_drop\()(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\)'
            def replace_drag(match):
                prefix = match.group(1) or ''  # "env_op." or ""
                start_x = int(match.group(3))
                start_y = int(match.group(4))
                end_x = int(match.group(5))
                end_y = int(match.group(6))
                new_start_x = int(start_x * SCALE_X)
                new_start_y = int(start_y * SCALE_Y)
                new_end_x = int(end_x * SCALE_X)
                new_end_y = int(end_y * SCALE_Y)
                return f'{prefix}drag_and_drop({new_start_x}, {new_start_y}, {new_end_x}, {new_end_y})'
            action_code = re.sub(pattern, replace_drag, action_code)
        
        # Handle input_text(text, x, y, ...) - need to match env_op.input_text too
        if 'input_text' in action_code:
            # Pattern: env_op.input_text("text", x, y) or input_text("text", x, y) or with clear_text
            # Match the entire input_text call and extract coordinates
            pattern = r'((?:env_op\.)?input_text\([^,]+,\s*)(\d+),\s*(\d+)(,\s*[^)]+)?\)'
            def replace_input_text(match):
                prefix = match.group(1)  # "env_op.input_text("text"," or "input_text("text","
                x = int(match.group(2))
                y = int(match.group(3))
                clear_text_part = match.group(4) or ''  # ", clear_text" or ""
                new_x = int(x * SCALE_X)
                new_y = int(y * SCALE_Y)
                return f'{prefix}{new_x}, {new_y}{clear_text_part})'
            action_code = re.sub(pattern, replace_input_text, action_code)
        
        return action_code
    
    def _convert_original_coordinates_to_resized(self, action_code: str) -> str:
        """
        Convert coordinates from original scale (1080*2400) to resized image scale (461*1024).
        
        Only applies when:
        - action_space_mode == 'coordinate'
        - img_resize_mode == 'resized'
        
        Args:
            action_code: Action code string with original coordinates (e.g., "env_op.click(540, 1200)")
            
        Returns:
            Action code with converted coordinates (e.g., "env_op.click(230, 512)")
        """
        # If no conversion needed, return as-is
        if self.action_space_mode != 'coordinate' or self.img_resize_mode != 'resized':
            return action_code
        
        import re
        
        # Define scale factors (reverse of _convert_resized_coordinates_to_original)
        SCALE_X = 461 / 1080  # ≈ 0.427
        SCALE_Y = 1024 / 2400  # ≈ 0.427
        
        # Check if this is a coordinate-based action
        coordinate_actions = ['click', 'long_press', 'swipe', 'drag_and_drop', 'input_text']
        is_coordinate_action = any(action in action_code for action in coordinate_actions)
        if not is_coordinate_action:
            return action_code
        
        # Handle click(x, y) and long_press(x, y)
        # Match both "env_op.click(x, y)" and "click(x, y)" formats
        for action_name in ['click', 'long_press']:
            # Pattern matches: env_op.action_name(x, y) or action_name(x, y)
            pattern = rf'(env_op\.)?({action_name}\()(\d+),\s*(\d+)\)'
            def replace_coords(match, action=action_name):
                prefix = match.group(1) or ''  # "env_op." or ""
                x = int(match.group(3))
                y = int(match.group(4))
                new_x = int(x * SCALE_X)
                new_y = int(y * SCALE_Y)
                return f'{prefix}{action}({new_x}, {new_y})'
            action_code = re.sub(pattern, replace_coords, action_code)
        
        # Handle swipe(start_x, start_y, end_x, end_y)
        # Match both "env_op.swipe(...)" and "swipe(...)" formats
        if 'swipe' in action_code:
            # Check if it's coordinate-based swipe (4 numbers) vs direction-based (string)
            pattern = r'(env_op\.)?(swipe\()(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\)'
            def replace_swipe(match):
                prefix = match.group(1) or ''  # "env_op." or ""
                start_x = int(match.group(3))
                start_y = int(match.group(4))
                end_x = int(match.group(5))
                end_y = int(match.group(6))
                new_start_x = int(start_x * SCALE_X)
                new_start_y = int(start_y * SCALE_Y)
                new_end_x = int(end_x * SCALE_X)
                new_end_y = int(end_y * SCALE_Y)
                return f'{prefix}swipe({new_start_x}, {new_start_y}, {new_end_x}, {new_end_y})'
            action_code = re.sub(pattern, replace_swipe, action_code)
        
        # Handle drag_and_drop(start_x, start_y, end_x, end_y)
        # Match both "env_op.drag_and_drop(...)" and "drag_and_drop(...)" formats
        if 'drag_and_drop' in action_code:
            pattern = r'(env_op\.)?(drag_and_drop\()(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\)'
            def replace_drag(match):
                prefix = match.group(1) or ''  # "env_op." or ""
                start_x = int(match.group(3))
                start_y = int(match.group(4))
                end_x = int(match.group(5))
                end_y = int(match.group(6))
                new_start_x = int(start_x * SCALE_X)
                new_start_y = int(start_y * SCALE_Y)
                new_end_x = int(end_x * SCALE_X)
                new_end_y = int(end_y * SCALE_Y)
                return f'{prefix}drag_and_drop({new_start_x}, {new_start_y}, {new_end_x}, {new_end_y})'
            action_code = re.sub(pattern, replace_drag, action_code)
        
        # Handle input_text(text, x, y, ...) - need to match env_op.input_text too
        if 'input_text' in action_code:
            # Pattern: env_op.input_text("text", x, y) or input_text("text", x, y) or with clear_text
            # Match the entire input_text call and extract coordinates
            pattern = r'((?:env_op\.)?input_text\([^,]+,\s*)(\d+),\s*(\d+)(,\s*[^)]+)?\)'
            def replace_input_text(match):
                prefix = match.group(1)  # "env_op.input_text("text"," or "input_text("text","
                x = int(match.group(2))
                y = int(match.group(3))
                clear_text_part = match.group(4) or ''  # ", clear_text" or ""
                new_x = int(x * SCALE_X)
                new_y = int(y * SCALE_Y)
                return f'{prefix}{new_x}, {new_y}{clear_text_part})'
            action_code = re.sub(pattern, replace_input_text, action_code)
        
        return action_code
    
    def _execute_react_step(self, env_op: EnvironmentAdapter) -> ReActStepInfo:
        """
        Run a single ReAct step: refresh observation, plan, execute, summarize.

        The step number is derived from the length of the agent's action
        history. After execution, the agent's ``flag_done`` mirrors the
        execution result; the summarizer is skipped when the agent is done.

        Args:
            env_op: Adapter around the environment for this round.

        Returns:
            ReActStepInfo: Record of this step; it is also appended to the
            agent's trajectory.
        """
        agent = self.agent
        step_n = len(agent.action_history) + 1
        print_with_color(f'# Step {step_n}', 'magenta')

        # ===== Refresh observation =====
        # No file_prefix is passed on purpose, so update_obs picks its own
        # naming for the screenshot.
        env_op.update_obs()

        # ===== PLAN =====
        planner_result = self._planner(
            log_task_path=agent.log_task_path,
            goal=agent.cur_task,
            step_n=step_n,
            current_obs=env_op.cur_obs,
        )
        # Work on an independent copy of the planner's structured output.
        planner_output = PlannerOutput(**copy.deepcopy(planner_result.output).dict())

        # ===== EXECUTE =====
        # Add the env_op prefix first, then map coordinates back to the
        # original screen scale when the planner saw resized images.
        executable_code = self._convert_resized_coordinates_to_original(
            self._convert_action_code_to_env_op(planner_output.code)
        )
        traj, exec_result = env_op.execute_code(
            executable_code,
            vars={},
            save_path=agent.log_task_path
        )

        # Related elements of the executed step; read here but not used
        # further in this method.
        related_elements = traj[-1].related_elements
        exec_feedback = exec_result.exec_feedback
        agent.flag_done = exec_result.agent_done

        print_with_color('exec_result', 'magenta')
        print_with_color(exec_result, 'magenta')

        # ===== SUMMARIZE =====
        if not agent.flag_done:
            summary = self._summarizer(
                log_task_path=agent.log_task_path,
                goal=agent.cur_task,
                step_n=step_n,
                step_data=planner_result,
                after_obs=env_op.cur_obs,
                exec_feedback=exec_feedback,
            )
            agent.screen_changes = summary.screen_changes
            execution_summary = summary.execution_summary
        else:
            # Finished: no summarizer call, the raw feedback is the summary.
            agent.screen_changes = 'No screen changes.'
            execution_summary = exec_feedback

        # ===== Record history entry =====
        uses_resized_coordinates = (
            self.action_space_mode == 'coordinate' and self.img_resize_mode == 'resized'
        )
        if len(env_op.action_history) == 0:
            execution_info = f"No action has been performed.\nExecution Summary: {execution_summary}"
        elif uses_resized_coordinates:
            # Keep the planner's resized-scale code next to the executed action.
            execution_info = f"{env_op.action_history[-1]}\nResized action code: {planner_output.code}\nExecution Summary: {execution_summary}"
        else:
            execution_info = f"{env_op.action_history[-1]}\nExecution Summary: {execution_summary}"
        agent.action_history.append(execution_info)

        # Carry over the planner's completed-tasks text when it is non-empty.
        completed = getattr(planner_output, 'completed_tasks', None)
        if completed is not None and completed != '':
            agent.completed_tasks.append(completed)

        print_with_color('--------------------------------------------', 'magenta')
        print_with_color(f'# Completed Step {step_n}\n', 'magenta')

        react_step_info = ReActStepInfo(
            step_n=step_n,
            obs_description=planner_output.observation,
            completed_tasks='' if completed is None else completed,
            action_reason=planner_output.code_reason,
            hard_coded_action=planner_output.code,
            soft_coded_action="",
            execution_summary=execution_summary,
            exec_step_info=traj[-1],
        )
        agent.agent_traj.append(react_step_info)

        return react_step_info
    
    def _planner(
        self,
        log_task_path: str,
        goal: str,
        step_n: int,
        current_obs: Any,  # ScreenObs
    ) -> PlannerStepData:
        """
        Planner Agent: build the prompt, query the LLM and record the result.

        Side effects visible in this method: the prompt, raw response and
        parsed output are written under ``log_task_path``; token usage is
        recorded when the raw response exposes ``usage``; the agent's
        ``previous_plan`` and ``completed_tasks`` are updated.

        Args:
            log_task_path: Directory passed to write_to_file for log files.
            goal: Task goal text placed in the prompt.
            step_n: 1-based step number, used in log file names.
            current_obs: Current screen observation (ScreenObs).

        Returns:
            PlannerStepData: The observation together with the planner output.
        """
        print_with_color('============================================', 'green')
        print_with_color("Current Agent: Planner\n", 'green')

        file_stem = f'step-{step_n}_before'

        print_with_color(
            f'[Planner] Using action_space_mode: {self.action_space_mode}',
            'yellow'
        )

        # Textual UI content is only supplied in screenshot_with_tree mode;
        # the screenshot-only modes get a fixed placeholder instead.
        placeholder_by_mode = {
            'screenshot_only': 'Not available (using screenshot-only mode)',
            'screenshot_only_som': 'Not available (using screenshot-only-som mode)',
        }
        if self.ui_info_mode in placeholder_by_mode:
            ui_content = placeholder_by_mode[self.ui_info_mode]
        else:
            ui_content = current_obs.ui_content_simple_str or 'Not available'

        # Full UI dict is forwarded in every mode when present and non-empty.
        ui_content_full_dict = getattr(current_obs, 'ui_content_full_dict', None) or None

        # NOTE(review): completed_tasks[-1] assumes the list is never empty
        # at this point - confirm against the agent's reset logic.
        prompt = get_planner_prompt(
            goal=goal,
            previous_plan=self.agent.previous_plan,
            completed_plan=self.agent.completed_tasks[-1],
            action_history=self.agent.action_history,
            ui_content=ui_content,
            screen_changes=self.agent.screen_changes,
            reflection=self.agent.reflection,
            additional_guidelines=self.agent.additional_guidelines,
            action_space_mode=self.action_space_mode,
            ui_info_mode=self.ui_info_mode,
            img_resize_mode=self.img_resize_mode,
            enable_shell_action=self.enable_shell_action,
            ui_content_full_dict=ui_content_full_dict,
        )

        write_to_file(
            file_path=log_task_path,
            file_name=file_stem + '_planner_prompt.txt',
            content=f"[system]\n{prompt['system']}\n\n[user]\n{prompt['user']}"
        )

        # Dedicated planner LLM if configured, else the wrapped agent's LLM.
        llm = self.planner_llm
        if llm is None:
            llm = self.agent.llm

        # Pick candidate images for the mode, then drop the missing ones.
        want_resized = self.img_resize_mode == 'resized'
        if self.ui_info_mode == 'screenshot_only':
            candidates = [
                current_obs.screenshot_resized if want_resized else current_obs.screenshot
            ]
        elif self.ui_info_mode == 'screenshot_only_som':
            candidates = [
                current_obs.screenshot_with_som_resized
                if want_resized
                else current_obs.screenshot_with_som
            ]
        else:
            # screenshot_with_tree: always the resized pair, plain then SOM.
            candidates = [
                current_obs.screenshot_resized,
                current_obs.screenshot_with_som_resized,
            ]
        images = [image for image in candidates if image is not None]

        # Multimodal call; the response is parsed into a PlannerOutput.
        planner_output, planner_raw_response = llm.predict_mm(
            user_prompt=prompt['user'],
            images=images,
            system_prompt=prompt['system'],
            output_format=PlannerOutput
        )

        write_to_file(
            file_path=log_task_path,
            file_name=file_stem + '_planner_raw_response.txt',
            content=planner_raw_response
        )

        # Token accounting, only when the raw response carries usage data.
        if hasattr(planner_raw_response, 'usage'):
            token_record = self.agent.record_token
            token_record.step = str(step_n)
            token_record.agent = 'Planner'
            token_record.step_tokens = planner_raw_response.usage
            token_record.llm = FLAGS.planner_llm
            record_cost_tokens(self.agent.record_token)

        write_to_file(
            file_path=log_task_path,
            file_name=file_stem + '_planner_output.txt',
            content=planner_output
        )

        # Remember the plan and completed-tasks text for the next prompt.
        self.agent.previous_plan = planner_output.plan_list
        self.agent.completed_tasks.append(planner_output.completed_tasks)

        step_data_pl = PlannerStepData(
            obs=current_obs,
            output=planner_output,
        )

        # Console report of the planner's output.
        print_with_color(f'Observations:\n{planner_output.observation}\n', 'green')
        if planner_output.consider_reflection:
            print_with_color(f'Reflection Consideration:\n{planner_output.consider_reflection}\n', 'green')
        print_with_color(f'Completed Tasks:\n{planner_output.completed_tasks}\n', 'green')
        print_with_color(f'Plan Justification:\n{planner_output.plan_reason}\n', 'green')
        print_with_color(f'Plan List:\n{planner_output.plan_list}\n', 'green')
        print_with_color(f'Next Action Justification:\n{planner_output.code_reason}\n', 'green')
        print_with_color(f'Action:\n{planner_output.code}\n', 'green')
        sys.stdout.flush()

        return step_data_pl
    
    def _extract_single_point_coordinate(self, action_dict: dict) -> tuple[int | None, int | None]:
        """Extract single point coordinate from action_dict.
        
        Args:
            action_dict: Action dictionary containing coordinate information.
            
        Returns:
            Tuple of (x, y) coordinates, or (None, None) if not found.
        """
        x, y = None, None
        if 'x' in action_dict and 'y' in action_dict:
            x, y = action_dict['x'], action_dict['y']
        elif 'touch_xy' in action_dict:
            touch_xy = action_dict['touch_xy']
            if isinstance(touch_xy, (list, tuple)) and len(touch_xy) >= 2:
                x, y = touch_xy[0], touch_xy[1]
        return x, y
    
    def _mark_action_on_screenshot(
        self,
        screenshot: Any,
        action_dict: dict,
        env_op_raw: Any
    ) -> Any:
        """Annotate a screenshot with the target of an action.

        Dispatch by ``action_dict['action_type']``:
        - 'swipe' with start/end coordinates -> env_op_raw.mark_swipe_coordinates
        - 'swipe' with a direction           -> env_op_raw.mark_direction_swipe
        - 'swipe' with neither               -> plain cv2 resize to (461, 1024)
        - anything else                      -> env_op_raw.mark_target_coordinate
          with the single point found in the dict (may be None, None)

        Args:
            screenshot: Screenshot to mark.
            action_dict: Action dictionary containing action information.
            env_op_raw: Raw environment operation object providing the
                marking helpers.

        Returns:
            Whatever the chosen marking helper returns, or the resized
            screenshot for a swipe without usable information.
        """
        import cv2

        # Non-swipe actions (click, long_press, input_text, ...) mark one point.
        if action_dict.get('action_type', '') != 'swipe':
            x, y = self._extract_single_point_coordinate(action_dict)
            return env_op_raw.mark_target_coordinate(screenshot, x, y)

        # Coordinate-based swipe: needs all four endpoint values.
        endpoint_keys = ('start_x', 'start_y', 'end_x', 'end_y')
        if all(key in action_dict for key in endpoint_keys):
            return env_op_raw.mark_swipe_coordinates(
                screenshot,
                *(action_dict[key] for key in endpoint_keys),
                action_type='swipe'
            )

        # Direction-based swipe ('up', 'down', ...).
        if 'direction' in action_dict:
            return env_op_raw.mark_direction_swipe(
                screenshot,
                action_dict['direction']
            )

        # Nothing to mark: return the screenshot resized but unannotated.
        return cv2.resize(screenshot, (461, 1024))
    
    def _mark_summarizer_before_screenshot(self, step_data: PlannerStepData, env_op_raw: Any) -> Any:
        """Return the pre-action screenshot with the executed target marked."""
        if self.action_space_mode == 'index':
            # For index mode, mark the target element index. The last entry of
            # executed_element_index is used, so an empty list raises IndexError.
            return env_op_raw.mark_target_index(
                step_data.obs.screenshot,
                step_data.obs.ui_elements,
                env_op_raw.executed_element_index[-1]
            )
        # For coordinate mode - mark based on action type. A copy is passed,
        # presumably so the stored observation is not drawn on.
        return self._mark_action_on_screenshot(
            screenshot=step_data.obs.screenshot.copy(),
            action_dict=env_op_raw.action_dict,
            env_op_raw=env_op_raw
        )

    def _build_summarizer_execution_info(self, env_op_raw: Any, exec_feedback: str | None) -> str:
        """Describe the last executed action and its result for the summarizer prompt."""
        if env_op_raw.action_history:
            execution_info_raw = f"{env_op_raw.action_history[-1]}\nExecution Result: {exec_feedback}"
        else:
            execution_info_raw = f"No action has been performed.\nExecution Result: {exec_feedback}"

        # If using resized screenshots and coordinate mode, convert coordinates
        # so the text matches the images the LLM is shown.
        if self.img_resize_mode == 'resized' and self.action_space_mode == 'coordinate':
            return self._convert_original_coordinates_to_resized(execution_info_raw)
        return execution_info_raw

    def _record_summarizer_tokens(self, step_n: int, summarizer_raw_response: Any) -> None:
        """Record token usage for this summarizer call when the response reports it."""
        if not hasattr(summarizer_raw_response, 'usage'):
            return
        record_token = self.agent.record_token
        record_token.step = str(step_n)
        record_token.agent = 'Summarizer'
        record_token.step_tokens = summarizer_raw_response.usage
        record_token.llm = FLAGS.summarizer_llm
        record_cost_tokens(record_token)

    def _summarizer(
        self,
        log_task_path: str,
        goal: str,
        step_n: int,
        step_data: PlannerStepData,
        after_obs: Any,
        exec_feedback: str | None = None,
    ) -> SummarizerOutput:
        """Run the Summarizer agent on one executed step.

        Builds a "before" screenshot with the action target marked and an
        "after" screenshot, asks the summarizer LLM to compare them, and logs
        the images, prompt, raw response and parsed output under
        ``log_task_path``.

        Args:
            log_task_path: Location handed to the image and text logging helpers.
            goal: Task goal forwarded to the summarizer prompt.
            step_n: Step number, used in log file names and the token record.
            step_data: Planner data for this step; its observation supplies the
                "before" screenshot and its output supplies the action reason.
            after_obs: Observation taken after execution; must expose
                ``screenshot_resized``.
            exec_feedback: Execution result text embedded in the prompt. It is
                interpolated as-is, so ``None`` shows up as the text "None".

        Returns:
            The structured output produced by the summarizer LLM call.
        """
        print_with_color('============================================', 'green')
        print_with_color("Current Agent: Summarizer\n", 'green')

        state = f'step-{step_n}_after'
        env_op_raw = self.agent.env_op

        # Mark target on screenshot based on action_space_mode
        before_screenshot_label = self._mark_summarizer_before_screenshot(step_data, env_op_raw)

        # Saved before the 'before' caption is added below.
        store_image(
            before_screenshot_label,
            f'{step_n}_marked_target_screenshot.png',
            log_task_path
        )

        after_screenshot_label = after_obs.screenshot_resized.copy()
        # Return values are ignored, so add_screenshot_label presumably draws
        # the caption onto the image in place.
        add_screenshot_label(before_screenshot_label, 'before')
        add_screenshot_label(after_screenshot_label, 'after')

        store_image(
            before_screenshot_label,
            f'step-{step_n}_summarizer_screenshot_before.png',
            log_task_path
        )
        store_image(
            after_screenshot_label,
            f'step-{step_n}_summarizer_screenshot_after.png',
            log_task_path
        )

        # Generate summarizer prompt
        execution_info = self._build_summarizer_execution_info(env_op_raw, exec_feedback)
        summarizer_prompt_dict = get_summarizer_prompt(
            goal=goal,
            before_ui_content_full_dict=getattr(step_data.obs, "ui_content_full_dict", None),
            after_ui_content_full_dict=getattr(after_obs, "ui_content_full_dict", None),
            execution_info=execution_info,
            reason=step_data.output.code_reason,
            action_space_mode=self.action_space_mode,
        )

        write_to_file(
            file_path=log_task_path,
            file_name=state + '_summarizer_prompt.txt',
            content=f"[system]\n{summarizer_prompt_dict['system']}\n\n[user]\n{summarizer_prompt_dict['user']}"
        )

        # Use summarizer LLM
        summarizer_output, summarizer_raw_response = self.summarizer_llm.predict_mm(
            user_prompt=summarizer_prompt_dict['user'],
            images=[
                before_screenshot_label,
                after_screenshot_label
            ],
            system_prompt=summarizer_prompt_dict['system'],
            output_format=SummarizerOutput
        )

        write_to_file(
            file_path=log_task_path,
            file_name=state + '_summarizer_raw_response.txt',
            content=summarizer_raw_response
        )

        # Record tokens
        self._record_summarizer_tokens(step_n, summarizer_raw_response)

        write_to_file(
            file_path=log_task_path,
            file_name=state + '_summarizer_output.txt',
            content=summarizer_output
        )

        print_with_color(f'Screen Changes:\n{summarizer_output.screen_changes}\n', 'green')
        print_with_color(f'Execution Summary:\n{summarizer_output.execution_summary}\n', 'green')

        return summarizer_output
    
    @property
    def environment_type(self) -> str:
        """Name of the environment family this agent targets.

        Returns:
            Always the string ``"android"``.
        """
        environment = "android"
        return environment
    
    @property
    def agent_name(self) -> str:
        """Identifier under which this agent is known.

        Returns:
            Always the string ``"react_star"``.
        """
        name = "react_star"
        return name
    
    def reset(self) -> None:
        """Clear the recorded start time by setting ``_start_time`` to ``None``."""
        self._start_time = None
        return None
    
    def cleanup(self) -> None:
        """Release resources held by the agent; currently a no-op."""
        return None
