import ast
import json
import os
import pprint
import re
import time
import traceback
from copy import deepcopy
from turtle import update
from typing import Optional, Tuple, List, Any

import cv2
from absl.flags import FLAGS
from android_world.env import interface
from android_world.task_evals.miniwob import miniwob_base

from autorpa.execution import action_execution
from autorpa.execution.action_models import JSONAction

from .utils import agent_utils, models
from .utils.llm_client import get_llm_wrapper
from .utils.models import ScreenObs
from .utils.agent_utils import print_with_color
from .utils.code_validation import validate_python_syntax


class ActionExecutionError(Exception):
  """Raised when an environment action fails while generated code is running.

  The exception message embeds the failing action, the kwargs recorded for it
  and the full traceback of the underlying exception.

  Attributes:
    action_code: Source text of the action call that failed.
    error_code: Value recorded for the failing statement (the caller passes
      its ``self.kwargs``).
    original_exception: The exception that caused the failure.
    traceback_str: Formatted traceback of ``original_exception``.
  """

  def __init__(self, action_code, error_code, original_exception):
    self.action_code = action_code
    self.error_code = error_code  # self.kwargs
    self.original_exception = original_exception

    # Format the traceback from the exception object itself. Relying on
    # traceback.format_exc() only works while an exception is being handled;
    # outside an ``except`` block it returns "NoneType: None".
    if isinstance(original_exception, BaseException):
      self.traceback_str = ''.join(traceback.format_exception(
        type(original_exception), original_exception, original_exception.__traceback__))
    else:
      self.traceback_str = traceback.format_exc()

    message = (
      f"Action Execution Failed\n"
      f"{'=' * 60}\n"
      f"Action: {action_code}\n"
      f"Related kwargs: {error_code}\n"
      f"Original Error: {original_exception}\n"
      f"{'=' * 60}\n"
      f"Full Traceback:\n{self.traceback_str}"
    )
    super().__init__(message)


class ReachedMaxStepsError(Exception):
  """Raised when the number of executed actions exceeds the maximum allowed steps.

  Attributes:
    action_code: Source text of the action that exceeded the limit (kept for
      parity with ActionExecutionError).
    error_code: ``error_code`` when it is truthy, otherwise ``action_code``.
  """

  def __init__(self, action_code, error_code, message=None):
    self.action_code = action_code
    # Fall back to the action text when no kwargs were recorded for the step.
    self.error_code = error_code if error_code else action_code
    if message is None:
      message = "Maximum number of action steps reached."
    super().__init__(message)


class EnvOperation:  # create in agent.reset()
  def __init__(self, raw_env: interface.AsyncEnv):
    """Bind the operator to an environment and create its LLM wrapper.

    Args:
      raw_env: Environment that actions are executed against.
    """
    llm_name = FLAGS.default_llm
    log_llm_calls = FLAGS.enable_llm_logging

    self.raw_env = raw_env
    self.llm = get_llm_wrapper(llm_name, enable_logging=log_llm_calls)
    # Replaced with a configured instance in reset().
    self.record_token = models.RecordToken()
  
  def reset(self, task, save_path, to_init_task: bool = True, max_action_step: int = 20):
    """Prepare the operator for running ``task``.

    Args:
      task: Task object. When ``to_init_task`` is true, ``tear_down`` (if the
        task was initialized before) and ``initialize_task`` are called on it;
        its ``name`` labels the token record.
      save_path: Directory for this run's artifacts; created if missing.
      to_init_task: When true, (re)initialize the task, reset the environment
        and clear all per-episode state. When false, the task is only marked
        as initialized and existing episode state is kept.
      max_action_step: Maximum number of actions allowed for the episode.

    Raises:
      Exception: Anything raised by ``task.initialize_task`` is logged and
        re-raised.
    """
    self.save_path = save_path
    os.makedirs(self.save_path, exist_ok=True)
    self.max_action_step = max_action_step
    # if task.name.lower().startswith('miniwob'):
    if FLAGS.suite_family.startswith('miniwob'):
      self.termination_fn = miniwob_base.is_episode_terminated
    else:
      self.termination_fn = lambda env: False

    if to_init_task:
      # Only tear down if task has been initialized before
      # This prevents AttributeError for composite tasks on first run
      try:
        if hasattr(task, 'initialized') and task.initialized:
          task.tear_down(self.raw_env)
      except AttributeError as e:
        # Log the error but continue - this is expected for first-time initialization
        print_with_color(f'Warning: tear_down failed (expected on first run): {e}', 'yellow')

      # Initialize task with error handling
      try:
        task.initialize_task(self.raw_env)
      except Exception as e:
        # Log detailed error information for debugging
        error_msg = (
          f'Failed to initialize task {task.name if hasattr(task, "name") else "unknown"}: {e}\n'
          f'Error type: {type(e).__name__}\n'
          f'Traceback: {traceback.format_exc()}'
        )
        print_with_color(f'❌ Task initialization error:\n{error_msg}', 'red')
        # Re-raise the exception to prevent continuing with uninitialized task
        raise

      # to_init_task is true on this path, so the task's preference applies directly.
      self.raw_env.reset(go_home=task.start_on_home_screen)
    else:
      task.initialized = True

    # Start from a clean episode state on (re)initialization. Do the same when
    # continuing without any previous state; otherwise the get_obs() call below
    # fails with AttributeError on self.executed_actions.
    if to_init_task or not hasattr(self, 'executed_actions'):
      self.before_obs = None
      self.executed_actions = []  # Record the actually executed code.
      self.related_elements = []  # Store the elements involved in each executed code.
      self.executed_element_index = []  # Store the index of the elements involved in each executed code
      self.traj = []  # (temporary) store EnvExecStepInfo for RPAExecTraj
      self.action_history = []  # store action_info_str, used in the prompt
      self.done = False

    self.raw_env.hide_automation_ui()
    self.cur_obs = self.get_obs(file_prefix=f'step_{len(self.executed_actions)}')
    self.answer_return = None
    self.action_dict = {}

    self.record_token = models.RecordToken(file_path=FLAGS.log_folder_exp, task_type=task.name,
                                           task_num='Task 0', stage='Env OP')
  
  def execute_code(self, code: str, vars: dict, save_path: str, flag_exec_rpa: bool = False):
    """Repair, validate and execute a block of generated Python code.

    The code is first patched for a few known generation mistakes (lowercase
    ``true``/``false``, unquoted swipe directions, raw ``adb shell`` lines and
    ``find_element({...})`` calls), written to ``rpa_code.py`` under
    ``save_path``, syntax-checked and finally run with ``exec``.

    Args:
      code: Python source to run.
      vars: Extra execution inputs. If it has a truthy ``function_call`` entry,
        that text is appended after the code; otherwise every entry is exposed
        to the code as a variable.
      save_path: Directory the patched code is written to.
      flag_exec_rpa: Not used by this method.

    Returns:
      Tuple ``(self.traj, exec_result)`` where ``exec_result`` is a
      ``models.ExecResult`` describing the run.
    """

    def replace_bool_literals(source):
      """Rewrite bare ``true``/``false`` identifiers as ``True``/``False``.

      Only NAME tokens are touched, so the words are preserved inside string
      literals and comments. If the source cannot be tokenized, the plain
      word-boundary regex is used instead.
      """
      import io
      import tokenize

      replacements = {'true': 'True', 'false': 'False'}
      # io.StringIO.readline splits on '\n' only, so rows line up with split('\n').
      source_lines = source.split('\n')
      try:
        for token in tokenize.generate_tokens(io.StringIO(source).readline):
          if token.type != tokenize.NAME or token.string not in replacements:
            continue
          row, col = token.start
          end = col + len(token.string)
          text = source_lines[row - 1]
          if text[col:end] != token.string:
            raise ValueError('token offset mismatch')
          # Replacement has the same length, so later columns on the line stay valid.
          source_lines[row - 1] = text[:col] + replacements[token.string] + text[end:]
      except (tokenize.TokenError, SyntaxError, IndexError, ValueError):
        patched = re.sub(r'\btrue\b', 'True', source)
        return re.sub(r'\bfalse\b', 'False', patched)
      return '\n'.join(source_lines)

    print_with_color('============================================', 'cyan')
    print_with_color("Executing code...", 'cyan')

    self.save_path = save_path
    self.kwargs = None
    self.previous_kwargs = None
    self.previous_index = None

    # Convert lowercase true/false to Python boolean values (True/False)
    # without touching text inside string literals, e.g. input_text("true story").
    function_code = replace_bool_literals(code)

    # Fix swipe direction parameters that are missing quotes:
    # swipe(up) -> swipe("up"), swipe(up, 3) -> swipe("up", 3); the optional
    # env_op. prefix is preserved.
    for direction in ('up', 'down', 'left', 'right'):
      # Pattern 1: swipe(direction) - single argument
      pattern1 = r'((?:env_op\.)?swipe\()\b' + direction + r'\b(\))'
      replacement1 = r'\1"' + direction + r'"\2'
      function_code = re.sub(pattern1, replacement1, function_code)
      # Pattern 2: swipe(direction, index) - two arguments
      pattern2 = r'((?:env_op\.)?swipe\()\b' + direction + r'\b(,\s*\d+\))'
      replacement2 = r'\1"' + direction + r'"\2'
      function_code = re.sub(pattern2, replacement2, function_code)

    # Convert standalone "adb shell <command>" lines to env_op.shell("<command>").
    adb_shell_pattern = r'^(?:adb\s+shell\s+)(.+)$'

    def convert_adb_shell(match):
      command = match.group(1).strip()
      # Escape quotes in the command if present
      command_escaped = command.replace('"', '\\"')
      return f'env_op.shell("{command_escaped}")'

    # Work line by line so only lines that start with the command are rewritten.
    converted_lines = []
    for line in function_code.split('\n'):
      stripped = line.strip()
      if re.match(r'^adb\s+shell\s+', stripped):
        converted_line = re.sub(adb_shell_pattern, convert_adb_shell, stripped)
        # Preserve original indentation
        indent = len(line) - len(line.lstrip())
        converted_lines.append(' ' * indent + converted_line)
      else:
        converted_lines.append(line)
    function_code = '\n'.join(converted_lines)

    # Fix find_element({...}) -> find_element(**{...}); calls that already
    # unpack with ** are left alone by the negative lookahead.
    pattern = r'((?:env_op\.)?find_element\s*\(\s*)(?!\*\*)(\{)'
    function_code = re.sub(pattern, r'\1**\2', function_code)

    local_vars = {'env_op': self, 'time': time, 're': re, 'json': json, 'error_message': None}
    if vars.get("function_call"):
      function_code += f'\n\n{vars.get("function_call")}'  # function_call must go after function def to avoid errors
    else:
      local_vars.update(vars)
    print_with_color(function_code, 'cyan')
    print_with_color('--------------------------------', 'cyan')
    agent_utils.write_to_file(file_path=save_path, file_name='rpa_code.py', content=function_code)

    # Validate syntax before execution
    is_valid_syntax, syntax_error_msg = validate_python_syntax(function_code, save_path)
    if not is_valid_syntax:
      print_with_color('Skipping execution due to syntax error detected.', 'yellow')
      exec_result = models.ExecResult(
        executed_code='',
        error_statement='SyntaxError',
        error_message=syntax_error_msg,
        exec_feedback=syntax_error_msg,
        answer_return=self.answer_return,
        agent_done=False,
        done=False,
      )
      return self.traj, exec_result

    error_statement = None
    error_message = None
    executed_code = ''

    # Optional: Apply security checks if enabled
    # For research/experimental use, we keep exec() but add safety measures
    if hasattr(FLAGS, 'enable_code_security') and FLAGS.enable_code_security:
      from .utils.code_security import validate_code_safety
      is_safe, security_msg = validate_code_safety(function_code, strict=False)
      if not is_safe:
        print_with_color(f'⚠️  Security Warning: {security_msg}', 'yellow')
        # Log but don't block in research mode

    try:
      # NOTE(security): this runs model-generated code with full interpreter
      # privileges; only use it in a sandboxed/research environment.
      exec(function_code, local_vars, local_vars)  # Execute the entire code block
      print_with_color('The code was not interrupted.', 'green')
      executed_code = function_code
    except TimeoutError as e:
      print_with_color(f'Code execution timeout: {e}', 'red')
      error_statement = 'TimeoutError'
      error_message = str(e)
      executed_code = function_code
    except Exception as e:
      print_with_color('The code was interrupted.', 'red')
      error_message, error_statement, executed_code = self._handle_execution_error(e, function_code, vars)
    finally:
      # update exec.done and exec_feedback according to answer_return
      # Check if max steps reached before calling termination_fn
      max_steps_reached = len(self.executed_actions) > self.max_action_step
      self.done = self.termination_fn(self.raw_env)

      # If max steps reached, we should stop the agent loop
      # Note: agent_done should be False because agent didn't actively indicate completion
      # But done should be True to stop the loop
      if max_steps_reached:
        agent_done = False  # Agent didn't actively complete the task
        self.done = True  # But we need to stop due to step limit
        exec_feedback = error_message if error_message else "Maximum number of action steps reached."
      elif self.answer_return is None:
        agent_done = False
        exec_feedback = error_message
      else:
        print('Agent indicates task is done.')
        self.done = True
        if self.answer_return == 'N/A':
          agent_done = False
          exec_feedback = "The answer action with N/A (indicates the task unfeasible) is executed. "
        else:
          agent_done = True
          exec_feedback = "The stop/answer action with complete is executed. "

      if self.done:
        # exec_feedback is None when the episode ended without any error
        # message; avoid emitting the literal text "None The task is done.".
        exec_feedback = f"{exec_feedback} The task is done." if exec_feedback else "The task is done."

      exec_result = models.ExecResult(
        executed_code=executed_code,
        error_statement=str(error_statement),
        error_message=error_message,
        exec_feedback=exec_feedback,
        answer_return=self.answer_return,
        agent_done=agent_done,  # True means agent actively indicated completion (via answer/stop action)
        done=self.done,  # True means should stop (either agent completed OR max steps reached)
      )

      print_with_color('\n****self.executed_element_index:', 'cyan')
      print_with_color(self.executed_element_index, 'cyan')
      print_with_color('\n****self.executed_actions:', 'cyan')
      print_with_color(self.executed_actions, 'cyan')

    return self.traj, exec_result
  
  def _handle_execution_error(self, e, function_code, vars):
    """Describe an exception raised while running generated code.

    Args:
      e: The exception caught around ``exec``.
      function_code: The source that was being executed.
      vars: The ``vars`` dict given to ``execute_code``; a truthy
        ``function_call`` entry is appended to the reported executed code when
        the error happened before the last line.

    Returns:
      Tuple ``(error_message, error_statement, executed_code)``;
      ``executed_code`` is a string.
    """
    if isinstance(e, (ActionExecutionError, ReachedMaxStepsError)):
      executed_code, error_line = extract_code_before_error(function_code, e.error_code)
      error_statement = e.error_code
    else:
      frames = traceback.extract_tb(e.__traceback__)
      # exec() compiles a source string under the filename '<string>'. Prefer
      # the innermost frame of the generated code so that an error raised
      # inside a helper or library call still maps to a line of function_code;
      # the innermost frame overall would carry another file's line number.
      generated_frames = [frame for frame in frames if frame.filename == '<string>']
      if generated_frames:
        error_line = generated_frames[-1].lineno
      elif frames:
        error_line = frames[-1].lineno
      else:
        error_line = 0  # no traceback available

      code_lines = function_code.splitlines()
      if 1 <= error_line <= len(code_lines):
        error_statement = code_lines[error_line - 1]
      else:
        error_statement = "Unable to retrieve the error code"
      executed_code = code_lines[:error_line]
      function_call = vars.get("function_call")
      # Only append a real call: a None entry would make the join below raise TypeError.
      if error_line < len(code_lines) and function_call:
        executed_code.append(function_call)

    error_message = f"An error occurred! Error line: {error_line}, Error code: {error_statement}, " \
                    f"Error type: {e.__class__.__name__}, Error message: {e}"
    print_with_color(f"❌ {error_message}", 'cyan')

    executed_code = '\n'.join(executed_code) if isinstance(executed_code, list) else executed_code
    return error_message, error_statement, executed_code
  
  def _create_json_action(self, action_dict: dict) -> JSONAction:
    """Build a JSONAction from the recognised entries of ``action_dict``.

    Args:
      action_dict: Action parameters; keys outside the supported set are dropped.

    Returns:
      JSONAction constructed from the remaining entries.
    """
    allowed = frozenset((
      'action_type', 'index', 'x', 'y', 'text', 'clear_text', 'direction',
      'start_x', 'start_y', 'end_x', 'end_y', 'app_name', 'keycode',
      'touch_xy', 'lift_xy', 'activity_nickname', 'orientation', 'command', 'seconds',
    ))
    accepted = {}
    for key, value in action_dict.items():
      if key in allowed:
        accepted[key] = value
    return JSONAction(**accepted)
  
  def execute_action(self, action_dict, action_code):
    """Execute one action and record its outcome in the episode history.

    Args:
      action_dict: Action parameters; must contain ``action_type``.
      action_code: Source text of the call that triggered the action; stored
        in ``executed_actions`` and used in the action history.

    Returns:
      The value returned by ``action_execution.execute_adb_action``, or
      ``None`` when it returned nothing or the action was not sent through it.

    Raises:
      ActionExecutionError: If executing the action raised an exception.
      ReachedMaxStepsError: If the number of executed actions exceeds
        ``max_action_step``. It is raised from the ``finally`` clause, so it
        replaces an ActionExecutionError that is in flight.
    """
    print_with_color(action_dict, 'cyan')

    self.action_dict = action_dict
    self.before_obs = deepcopy(self.cur_obs)
    target_index = action_dict.get('index', None)
    action_feedback = ''
    action_return_value = None  # Store return value for shell/query actions
    action_executed = False  # Set once the action has been carried out without raising
    # Actions that are never passed to execute_adb_action.
    local_only_actions = ('stop', 'ask_mllm')

    try:
      if action_dict['action_type'] == 'swipe' and target_index is not None and target_index < 0:
        action_feedback = 'Action failed: index is -1, target element not found.'
      else:
        if action_dict['action_type'] not in local_only_actions:
          if action_dict['action_type'] == 'answer':
            self.raw_env.interaction_cache = action_dict['text']
            action_executed = True
          else:
            # Use action_execution.execute_adb_action instead of android_world
            # Convert action_dict to JSONAction
            json_action_obj = self._create_json_action(action_dict)

            # Get screen elements and size
            # execute_adb_action supports android_world UIElement format directly
            screen_elements = self.cur_obs.ui_elements  # Use android_world UIElement directly
            screen_size = self.raw_env.logical_screen_size
            # Ensure screen_size is a tuple (width, height)
            if not isinstance(screen_size, tuple) or len(screen_size) != 2:
              # Fallback: try to get from state if available
              state = self.raw_env.get_state()
              if hasattr(state, 'logical_screen_size'):
                screen_size = state.logical_screen_size
              else:
                # Last resort: use a default size (shouldn't happen in practice)
                screen_size = (1080, 2400)

            # Execute action using action_execution module
            result = action_execution.execute_adb_action(
              action=json_action_obj,
              screen_elements=screen_elements,
              screen_size=screen_size,
              env=self.raw_env
            )
            action_executed = True

            # Store return value if action returns something (e.g., shell command)
            if result is not None:
              action_return_value = result

        # Set success feedback if not already set (e.g., by error case above)
        if not action_feedback:
          action_feedback = "Action has been performed." if action_dict['action_type'] != 'stop' else 'Task has been stopped.'
    except Exception as e:
      # Set error feedback before re-raising exception
      if not action_feedback:
        action_feedback = f"Action execution failed: {str(e)}"
      # Re-raise (chained to the cause) to be caught by execute_code()
      raise ActionExecutionError(action_code, getattr(self, 'kwargs', None), e) from e
    finally:
      # Ensure these operations always execute, even if action fails
      # This keeps executed_actions and action_history in sync
      time.sleep(0.1)  # Prevent tap from becoming double-tap when execution is too fast

      # Always append to executed_actions to maintain sync with action_history
      self.executed_actions.append(action_code)

      # Record the element index; stop/ask_mllm do not target an element.
      # (The previous `!= 'stop' or != 'ask_mllm'` test was always true.)
      if action_dict['action_type'] not in local_only_actions:
        self.executed_element_index.append(target_index)
      else:
        self.executed_element_index.append(None)

      # Refresh cur_obs only when the action was carried out (or for the
      # local-only actions); after a failure the previous observation is kept.
      if action_executed or action_dict['action_type'] in local_only_actions:
        if FLAGS.agent_name == 'autorpa':
          time.sleep(2)
        self.cur_obs = self.get_obs(file_prefix=f'step_{len(self.executed_actions)}')

      # Always call get_step_info to update action_history, even if action failed
      # This ensures action_history stays in sync with executed_actions
      self.get_step_info(action_feedback, action_return_value)

      # Check max steps after updating history
      if len(self.executed_actions) > self.max_action_step:
        self.done = True
        raise ReachedMaxStepsError(action_code, getattr(self, 'kwargs', None))  # Caught by execute_code()

    # Return the value for RPA code usage
    return action_return_value
  
  def get_step_info(self, action_feedback: str, action_return_value=None):
    """Record the outcome of the most recent action.

    Runs once after each action. It appends the related element description to
    ``self.related_elements``, an ``EnvExecStepInfo`` built from the before and
    after observations to ``self.traj``, and a textual summary to
    ``self.action_history``.

    Args:
      action_feedback: Feedback message about the action.
      action_return_value: Value returned by the action, if any; appended to
        the feedback under a "Shell output" label.
    """
    target_index = self.executed_element_index[-1]
    before, after = self.before_obs, self.cur_obs

    # Describe the element the action targeted, looked up in the pre-action UI.
    element_desc = ''
    if target_index is not None and before.ui_content_full_dict:
      for candidate in before.ui_content_full_dict:
        if candidate.get('index') == target_index:
          if candidate:
            element_desc = agent_utils._generate_ui_element_description_from_dict(candidate)
          break
    self.related_elements.append(element_desc)

    # Cheap check first: compare the textual UI dumps.
    screen_changed = bool(before.ui_content_simple_str != after.ui_content_simple_str)

    # Identical text does not prove an identical screen, so double-check with
    # the screenshots. This is best effort: any failure keeps the text result.
    if not screen_changed:
      try:
        from .utils.screenshot_comparison import get_comparator
        comparator = get_comparator()

        shot_before = before.screenshot_path
        shot_after = after.screenshot_path
        if shot_before and shot_after:
          from pathlib import Path
          resolved = []
          for shot in (shot_before, shot_after):
            candidate_path = Path(shot)
            if not candidate_path.is_absolute():
              # Relative paths are taken from three directories above this file.
              candidate_path = Path(__file__).parent.parent.parent / shot
            resolved.append(candidate_path)
          path_before, path_after = resolved

          if path_before.exists() and path_after.exists():
            outcome = comparator.compare(
              path_before,
              path_after,
              crop=True,
              auto_detect_crop=True,
              save_cropped=False,  # Do not save cropped images to save space
            )
            screen_changed = not outcome['same']
      except Exception:
        pass

    feedback = action_feedback
    if action_return_value is not None:
      feedback = feedback + f'\nShell output: {action_return_value}'

    # Trajectory entry (for RPAExecTraj).
    step_info = models.EnvExecStepInfo(
      before_ui_content_full_dict=before.ui_content_full_dict,
      before_screenshot_path=before.screenshot_path,
      before_screenshot_w_som_path=before.screenshot_w_som_path,
      executed_action=self.executed_actions[-1],
      related_elements=self.related_elements[-1],
      related_target=target_index,  # int | None
      action_feedback=feedback,
      after_ui_content_full_dict=after.ui_content_full_dict,
      after_screenshot_path=after.screenshot_path,
      after_screenshot_w_som_path=after.screenshot_w_som_path,
      is_screen_changed=screen_changed,
    )
    self.traj.append(step_info)

    # Human-readable history entry, used in the prompt.
    summary = [
      f'Step-{len(self.action_history) + 1}:',
      f'Executed action: {step_info.executed_action}',
    ]
    if step_info.related_elements:
      summary.append(f'Related element: {step_info.related_elements}')
    summary.append(f'Action feedback: {step_info.action_feedback}')
    summary.append('Screen changed.' if step_info.is_screen_changed else 'No screen changes.')
    self.action_history.append('\n'.join(summary))
  
  ## -----start: action space
  # Each action method below delegates to `self.execute_action(action_dict, action_code)`
  
  def _normalize_bool(self, value):
    """Normalize boolean values from various formats to Python bool.
    
    Converts string 'true'/'false' (case-insensitive) to bool True/False.
    Returns bool values as-is.
    
    Args:
      value: Can be bool, str ('true'/'false'), or other types
      
    Returns:
      bool: Normalized boolean value
    """
    if isinstance(value, bool):
      return value
    if isinstance(value, str):
      return value.lower() in ('true', '1', 'yes', 'on')
    # For other types, convert to bool
    return bool(value)
  
  def open_app(self, app_name: str):
    action_dict = {"action_type": "open_app", "app_name": app_name}
    action_code = f'env_op.open_app("{app_name}")'.replace("\n", "\\n")
    self.execute_action(action_dict, action_code)
  
  def shell(self, command: str):
    """Execute a shell command on the Android device.
    
    Args:
      command: Shell command to execute (without 'adb shell' prefix)
      
    Returns:
      Shell command output as a string (if the command produces output)
    """
    action_dict = {"action_type": "shell", "command": command}
    action_code = f'env_op.shell("{command}")'.replace("\n", "\\n")
    result = self.execute_action(action_dict, action_code)
    
    # Parse and return the output in a clean format
    if result is not None and result.HasField('generic'):
      # output is not a HasField(); access directly
      # output is bytes in protobuf; decode to str
      output = result.generic.output
      if isinstance(output, bytes):
        # adb shell returns bytes; decode to string
        output_str = output.decode('utf-8', errors='ignore').rstrip('\n')
        print("b to str generic output:", output_str)
        return output_str
      elif output:
        # If not bytes, convert to str and strip trailing newline
        print("str generic output:", str(output).rstrip('\n'))
        return str(output).rstrip('\n')
      else:
        print("generic output: None")
        return ""
    return None
  
  def click(self, *args, **kwargs):
    """Click at element index or coordinates (x, y).
    
    Usage:
      click(index) - Click by element index
      click(x, y) - Click at coordinates
      click(index=5) - Click by element index (keyword)
      click(x=100, y=200) - Click at coordinates (keyword)
    """
    # Handle positional arguments
    if len(args) == 1:
      # Single argument: treat as index
      index = args[0]
      action_dict = {"action_type": "click", "index": index}
      action_code = f'env_op.click({index})'
    elif len(args) == 2:
      # Two arguments: treat as (x, y) coordinates
      x, y = args[0], args[1]
      action_dict = {"action_type": "click", "x": x, "y": y}
      action_code = f'env_op.click({x}, {y})'
    elif len(args) == 0:
      # Keyword arguments only
      if 'x' in kwargs and 'y' in kwargs:
        x, y = kwargs['x'], kwargs['y']
        action_dict = {"action_type": "click", "x": x, "y": y}
        action_code = f'env_op.click({x}, {y})'
      elif 'index' in kwargs:
        index = kwargs['index']
        action_dict = {"action_type": "click", "index": index}
        action_code = f'env_op.click({index})'
      else:
        raise ValueError("click() requires either index or (x, y) coordinates")
    else:
      raise ValueError(f"click() takes 1 or 2 positional arguments, got {len(args)}")
    self.execute_action(action_dict, action_code)
  
  def long_press(self, *args, **kwargs):
    """Long press at element index or coordinates (x, y).
    
    Usage:
      long_press(index) - Long press by element index
      long_press(x, y) - Long press at coordinates
      long_press(index=5) - Long press by element index (keyword)
      long_press(x=100, y=200) - Long press at coordinates (keyword)
    """
    # Handle positional arguments
    if len(args) == 1:
      # Single argument: treat as index
      index = args[0]
      action_dict = {"action_type": "long_press", "index": index}
      action_code = f'env_op.long_press({index})'.replace("\n", "\\n")
    elif len(args) == 2:
      # Two arguments: treat as (x, y) coordinates
      x, y = args[0], args[1]
      action_dict = {"action_type": "long_press", "x": x, "y": y}
      action_code = f'env_op.long_press({x}, {y})'.replace("\n", "\\n")
    elif len(args) == 0:
      # Keyword arguments only
      if 'x' in kwargs and 'y' in kwargs:
        x, y = kwargs['x'], kwargs['y']
        action_dict = {"action_type": "long_press", "x": x, "y": y}
        action_code = f'env_op.long_press({x}, {y})'.replace("\n", "\\n")
      elif 'index' in kwargs:
        index = kwargs['index']
        action_dict = {"action_type": "long_press", "index": index}
        action_code = f'env_op.long_press({index})'.replace("\n", "\\n")
      else:
        raise ValueError("long_press() requires either index or (x, y) coordinates")
    else:
      raise ValueError(f"long_press() takes 1 or 2 positional arguments, got {len(args)}")
    self.execute_action(action_dict, action_code)
  
  def input_text(self, text: str, *args, **kwargs):
    """Input text at element index, coordinates (x, y), or current cursor position.
    
    Usage:
      input_text(text) - Input text at current cursor position
      input_text(text, clear_text) - Input text at current cursor position with clear_text
      input_text(text, index) - Input text at element index
      input_text(text, index, clear_text) - Input text at element index with clear_text
      input_text(text, x, y) - Input text at coordinates
      input_text(text, x, y, clear_text) - Input text at coordinates with clear_text
      input_text(text, index=5) - Input text at element index (keyword)
      input_text(text, x=100, y=200) - Input text at coordinates (keyword)
    """
    # Normalize clear_text from kwargs (handles string 'true'/'false' to bool conversion)
    clear_text = self._normalize_bool(kwargs.get('clear_text', True))
    
    # Handle positional arguments (after text)
    if len(args) == 0:
      # No arguments: input text at current cursor position
      action_dict = {"action_type": "input_text", "text": text, "clear_text": clear_text}
      action_code = f'env_op.input_text("{text}", {clear_text})'.replace("\n", "\\n")
    elif len(args) == 1:
      # Single argument: could be clear_text (bool) or index (int)
      is_bool_value = (isinstance(args[0], bool) or 
                       (isinstance(args[0], str) and args[0].lower() in ('true', 'false', '1', '0', 'yes', 'no', 'on', 'off')))
      
      if is_bool_value:
        # Treat as clear_text: input at current cursor position
        clear_text = self._normalize_bool(args[0])
        action_dict = {"action_type": "input_text", "text": text, "clear_text": clear_text}
        action_code = f'env_op.input_text("{text}", {clear_text})'.replace("\n", "\\n")
      else:
        # Otherwise treat as index
        index = args[0]
        action_dict = {"action_type": "input_text", "text": text, "index": index, "clear_text": clear_text}
        action_code = f'env_op.input_text("{text}", {index}, {clear_text})'.replace("\n", "\\n")
    elif len(args) == 2:
      # Two arguments: could be (index, clear_text) or (x, y)
      # Check if second argument is a boolean value (bool or string boolean)
      is_bool_value = (isinstance(args[1], bool) or 
                       (isinstance(args[1], str) and args[1].lower() in ('true', 'false', '1', '0', 'yes', 'no', 'on', 'off')))
      
      if is_bool_value:
        # Treat as (index, clear_text)
        index = args[0]
        clear_text = self._normalize_bool(args[1])
        action_dict = {"action_type": "input_text", "text": text, "index": index, "clear_text": clear_text}
        action_code = f'env_op.input_text("{text}", {index}, {clear_text})'.replace("\n", "\\n")
      else:
        # Otherwise treat as (x, y) coordinates
        x, y = args[0], args[1]
        action_dict = {"action_type": "input_text", "text": text, "x": x, "y": y, "clear_text": clear_text}
        action_code = f'env_op.input_text("{text}", {x}, {y}, {clear_text})'.replace("\n", "\\n")
    elif len(args) == 3:
      # Three arguments: treat as (x, y, clear_text)
      x, y = args[0], args[1]
      clear_text = self._normalize_bool(args[2])
      action_dict = {"action_type": "input_text", "text": text, "x": x, "y": y, "clear_text": clear_text}
      action_code = f'env_op.input_text("{text}", {x}, {y}, {clear_text})'.replace("\n", "\\n")
    elif len(args) == 0:
      # Keyword arguments only
      if 'x' in kwargs and 'y' in kwargs:
        x, y = kwargs['x'], kwargs['y']
        action_dict = {"action_type": "input_text", "text": text, "x": x, "y": y, "clear_text": clear_text}
        action_code = f'env_op.input_text("{text}", {x}, {y}, {clear_text})'.replace("\n", "\\n")
      elif 'index' in kwargs:
        index = kwargs['index']
        action_dict = {"action_type": "input_text", "text": text, "index": index, "clear_text": clear_text}
        action_code = f'env_op.input_text("{text}", {index}, {clear_text})'.replace("\n", "\\n")
      else:
        raise ValueError("input_text() requires either index or (x, y) coordinates")
    else:
      raise ValueError(f"input_text() takes 1, 2, or 3 positional arguments after text, got {len(args)}")
    self.execute_action(action_dict, action_code)
  
  def keyboard_enter(self):
    action_dict = {"action_type": "keyboard_enter"}
    action_code = 'env_op.keyboard_enter()'
    self.execute_action(action_dict, action_code)
  
  def go_home(self):
    action_dict = {"action_type": "navigate_home"}
    action_code = 'env_op.go_home()'
    self.execute_action(action_dict, action_code)
  
  def go_back(self):
    action_dict = {"action_type": "navigate_back"}
    action_code = 'env_op.go_back()'
    self.execute_action(action_dict, action_code)
  
  def swipe(self, *args, **kwargs):
    """Swipe using direction+index or direct coordinates (start_x, start_y, end_x, end_y).
    
    Usage:
      swipe(direction) - Swipe in direction (legacy)
      swipe(direction, index) - Swipe element in direction (legacy)
      swipe(start_x, start_y, end_x, end_y) - Swipe from (start_x, start_y) to (end_x, end_y)
      swipe(direction="up") - Swipe in direction (keyword)
      swipe(start_x=100, start_y=200, end_x=100, end_y=400) - Swipe by coordinates (keyword)
    """
    # Handle positional arguments
    if len(args) == 4:
      # Four arguments: treat as (start_x, start_y, end_x, end_y) coordinates
      # Use start_x/start_y/end_x/end_y format for android_world JSONAction compatibility
      start_x, start_y, end_x, end_y = args[0], args[1], args[2], args[3]
      action_dict = {"action_type": "swipe", "start_x": start_x, "start_y": start_y, 
                     "end_x": end_x, "end_y": end_y}
      action_code = f'env_op.swipe({start_x}, {start_y}, {end_x}, {end_y})'.replace("\n", "\\n")
    elif len(args) == 1:
      # Single argument: treat as direction
      direction = args[0]
      action_dict = {"action_type": "swipe", "direction": direction, "index": None}
      action_code = f'env_op.swipe("{direction}")'.replace("\n", "\\n")
    elif len(args) == 2:
      # Two arguments: treat as (direction, index)
      direction, index = args[0], args[1]
      action_dict = {"action_type": "swipe", "direction": direction, "index": index}
      action_code = f'env_op.swipe("{direction}", {index})'.replace("\n", "\\n")
    elif len(args) == 0:
      # Keyword arguments only
      if 'start_x' in kwargs and 'start_y' in kwargs and 'end_x' in kwargs and 'end_y' in kwargs:
        # Coordinate-based swipe
        # Use start_x/start_y/end_x/end_y format for android_world JSONAction compatibility
        start_x = kwargs['start_x']
        start_y = kwargs['start_y']
        end_x = kwargs['end_x']
        end_y = kwargs['end_y']
        action_dict = {"action_type": "swipe", "start_x": start_x, "start_y": start_y, 
                       "end_x": end_x, "end_y": end_y}
        action_code = f'env_op.swipe({start_x}, {start_y}, {end_x}, {end_y})'.replace("\n", "\\n")
      elif 'direction' in kwargs:
        # Legacy direction-based swipe
        direction = kwargs['direction']
        index = kwargs.get('index', None)
        action_dict = {"action_type": "swipe", "direction": direction, "index": index}
        action_code = f'env_op.swipe("{direction}", {index})'.replace("\n",
                                                                      "\\n") if index is not None else f'env_op.swipe("{direction}")'.replace(
          "\n", "\\n")
      else:
        raise ValueError("swipe() requires either direction or (start_x, start_y, end_x, end_y) coordinates")
    else:
      raise ValueError(f"swipe() takes 1, 2, or 4 positional arguments, got {len(args)}")
    self.execute_action(action_dict, action_code)
  
  def drag_and_drop(self, *args, **kwargs):
    """Drag and drop from start to end, using element indices or coordinates.
    
    Usage:
      drag_and_drop(start_index, end_index) - Drag from start_index element to end_index element
      drag_and_drop(start_x, start_y, end_x, end_y) - Drag from (start_x, start_y) to (end_x, end_y)
      drag_and_drop(start_index=5, end_index=10) - Drag by element indices (keyword)
      drag_and_drop(start_x=100, start_y=200, end_x=300, end_y=400) - Drag by coordinates (keyword)
    """
    # Handle positional arguments
    if len(args) == 2:
      # Two arguments: treat as (start_index, end_index)
      start_index, end_index = args[0], args[1]
      # Convert indices to coordinates by getting element centers
      start_coords = None
      end_coords = None
      if hasattr(self, 'cur_obs') and self.cur_obs and self.cur_obs.ui_elements:
        ui_elements = self.cur_obs.ui_elements
        if 0 <= start_index < len(ui_elements) and hasattr(ui_elements[start_index], 'bbox_pixels') and ui_elements[start_index].bbox_pixels:
          try:
            start_coords = action_execution._get_bbox_center(ui_elements[start_index].bbox_pixels)
          except (ValueError, AttributeError):
            pass
        if 0 <= end_index < len(ui_elements) and hasattr(ui_elements[end_index], 'bbox_pixels') and ui_elements[end_index].bbox_pixels:
          try:
            end_coords = action_execution._get_bbox_center(ui_elements[end_index].bbox_pixels)
          except (ValueError, AttributeError):
            pass
      if start_coords is None or end_coords is None:
        raise ValueError(f"Could not get coordinates for indices: start={start_index}, end={end_index}")
      touch_xy = (start_coords[0], start_coords[1])
      lift_xy = (end_coords[0], end_coords[1])
      action_dict = {"action_type": "drag_and_drop", "touch_xy": touch_xy, "lift_xy": lift_xy}
      action_code = f'env_op.drag_and_drop({start_index}, {end_index})'.replace("\n", "\\n")
    elif len(args) == 4:
      # Four arguments: treat as (start_x, start_y, end_x, end_y) coordinates
      start_x, start_y, end_x, end_y = args[0], args[1], args[2], args[3]
      touch_xy = (start_x, start_y)
      lift_xy = (end_x, end_y)
      action_dict = {"action_type": "drag_and_drop", "touch_xy": touch_xy, "lift_xy": lift_xy}
      action_code = f'env_op.drag_and_drop({start_x}, {start_y}, {end_x}, {end_y})'.replace("\n", "\\n")
    elif len(args) == 0:
      # Keyword arguments only
      if 'start_x' in kwargs and 'start_y' in kwargs and 'end_x' in kwargs and 'end_y' in kwargs:
        # Coordinate-based drag_and_drop
        start_x = kwargs['start_x']
        start_y = kwargs['start_y']
        end_x = kwargs['end_x']
        end_y = kwargs['end_y']
        touch_xy = (start_x, start_y)
        lift_xy = (end_x, end_y)
        action_dict = {"action_type": "drag_and_drop", "touch_xy": touch_xy, "lift_xy": lift_xy}
        action_code = f'env_op.drag_and_drop({start_x}, {start_y}, {end_x}, {end_y})'.replace("\n", "\\n")
      elif 'start_index' in kwargs and 'end_index' in kwargs:
        # Index-based drag_and_drop
        start_index = kwargs['start_index']
        end_index = kwargs['end_index']
        start_coords = None
        end_coords = None
        if hasattr(self, 'cur_obs') and self.cur_obs and self.cur_obs.ui_elements:
          ui_elements = self.cur_obs.ui_elements
          if 0 <= start_index < len(ui_elements) and hasattr(ui_elements[start_index], 'bbox_pixels') and ui_elements[start_index].bbox_pixels:
            try:
              start_coords = action_execution._get_bbox_center(ui_elements[start_index].bbox_pixels)
            except (ValueError, AttributeError):
              pass
          if 0 <= end_index < len(ui_elements) and hasattr(ui_elements[end_index], 'bbox_pixels') and ui_elements[end_index].bbox_pixels:
            try:
              end_coords = action_execution._get_bbox_center(ui_elements[end_index].bbox_pixels)
            except (ValueError, AttributeError):
              pass
        if start_coords is None or end_coords is None:
          raise ValueError(f"Could not get coordinates for indices: start={start_index}, end={end_index}")
        touch_xy = (start_coords[0], start_coords[1])
        lift_xy = (end_coords[0], end_coords[1])
        action_dict = {"action_type": "drag_and_drop", "touch_xy": touch_xy, "lift_xy": lift_xy}
        action_code = f'env_op.drag_and_drop({start_index}, {end_index})'.replace("\n", "\\n")
      else:
        raise ValueError("drag_and_drop() requires either (start_index, end_index) or (start_x, start_y, end_x, end_y)")
    else:
      raise ValueError(f"drag_and_drop() takes 2 or 4 positional arguments, got {len(args)}")
    self.execute_action(action_dict, action_code)
  
  def wait(self, seconds: float = 1.0):
    action_dict = {"action_type": "wait", "seconds": seconds}
    if seconds == 1.0:
      action_code = f'env_op.wait()'
    else:
      action_code = f'env_op.wait(seconds={seconds})'
    self.execute_action(action_dict, action_code)
  
  def answer(self, text: str):
    action_dict = {"action_type": "answer", "text": text}
    action_code = f'env_op.answer(text="{text}")'.replace("\n", "\\n")
    self.answer_return = text
    self.execute_action(action_dict, action_code)
  
  def stop(self, goal_status: str):
    action_dict = {"action_type": "stop", "goal_status": goal_status}
    action_code = f'env_op.stop(goal_status="{goal_status}")'.replace("\n", "\\n")
    self.answer_return = goal_status
    self.execute_action(action_dict, action_code)
  
  # -----start: used in rpa code
  def ask_mllm(self, question):
    """Ask the multimodal LLM a question about the current observation.

    The prompt contains the execution history, the simple UI description of
    ``self.cur_obs`` and the question; the SOM-annotated resized screenshot is
    attached as the image. Prompt and output are written under
    ``self.save_path``, token usage is recorded, and an ``ask_mllm`` action is
    dispatched through ``execute_action``.

    Args:
      question: The question to ask about the current page.

    Returns:
      The ``answer`` field of the model output, unmodified.
    """
    system_prompt = ("Carefully examine the page information and answer the question.\n"
          "[Output Format]\n"
          "1. thought: output your brief thought (under 100 words).\n"
          "2. answer: output the answer to the question. Do not wrap with Markdown tags.\n"
          "In your answer, just directly return the required information and do not include any other words.\n"
          "[Example Output 1 - Simple Answer]\n"
          "thought: I need to find the number of comments that have received more downvotes than upvotes for the user who made the latest post on the current page.\n"
          "answer: 10\n"
          "[Example Output 2 - JSON Answer]\n"
          "thought: I need to find synonyms from the options list.\n"
          'answer: {"matches": ["brave", "courageous"]}\n')
    user_prompt = (f"\nExecution History:\n" + '\n'.join(self.action_history) +
              f"\n{self.cur_obs.ui_content_simple_str}\n"
              f"Question: {question}\n")
    agent_utils.write_to_file(file_path=self.save_path,
                              file_name=f'step-{len(self.executed_actions)}_ask_mllm_prompt.txt', content=f"[system]\n{system_prompt}\n\n[user]\n{user_prompt}")
    llm = get_llm_wrapper(model_name=FLAGS.ask_mllm_llm, enable_logging=FLAGS.enable_llm_logging)
    output, raw_response = llm.predict_mm(user_prompt=user_prompt, images=[self.cur_obs.screenshot_with_som_resized], system_prompt=system_prompt, output_format=models.AskMLLMOutput)
    print_with_color(f'ask_mllm response: thought={output.thought}, answer={output.answer}', 'cyan')
    agent_utils.write_to_file(file_path=self.save_path,
                              file_name=f'step-{len(self.executed_actions)}_ask_mllm_output.txt', content=output.model_dump_json())
    # The answer is returned exactly as the model produced it; no quote or
    # JSON normalization is applied here.
    output_ans = output.answer

    cost_tokens = raw_response.usage
    self.record_token.step = str(len(self.executed_actions))
    self.record_token.agent = 'ask mllm'
    self.record_token.step_tokens = cost_tokens
    self.record_token.llm = FLAGS.ask_mllm_llm
    agent_utils.record_cost_tokens(self.record_token)

    # Quote the question and escape newlines so the recorded call text is a
    # well-formed call, in the same shape answer() and stop() record theirs.
    action_code = f'env_op.ask_mllm(question="{question}")'.replace("\n", "\\n")
    self.execute_action(action_dict={"action_type": "ask_mllm", "question": question},
                        action_code=action_code)
    return output_ans
  
  def get_ui_content(self):
    """Refresh the observation and return its simple UI description as parsed JSON.

    The saved observation files are prefixed with the current step count and
    a local timestamp.
    """
    timestamp = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
    prefix = f'step_{len(self.executed_actions)}_{timestamp}_get_ui_content'
    self.update_obs(file_prefix=prefix)
    ui_text = self.cur_obs.ui_content_simple_str
    return agent_utils.parse_str_to_jsonlist(ui_text)
  
  # -----end: used in rpa code
  ## -----end: action space
  
  def find_element(self, **kwargs) -> int:
    """
    Locate a UI element in a freshly captured observation by attribute filters.

    Every keyword except ``target_description`` and ``actions`` must be present
    on the element (not None) and equal to the given value. ``actions`` is
    matched as a subset of the element's actions; an element that lists no
    actions is accepted without that check. The filters are also stored on
    ``self.kwargs``.

    Parameters:
        **kwargs: Filtering criteria, such as keyword, text, content_description, is_clickable, etc.

    Returns:
        int: The ``index`` of the match when exactly one element qualifies;
        otherwise whatever ``self.grounder`` returns for the candidate list.
    """
    print_with_color('\nfind_element(**kwargs)', 'cyan')
    print_with_color(f'kwargs: {kwargs}', 'cyan')
    self.kwargs = kwargs

    timestamp = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
    self.update_obs(file_prefix=f'step_{len(self.executed_actions)}_{timestamp}_before_find_element')

    # Keys that are not compared by plain equality.
    skipped_keys = ("target_description", "actions")
    equality_filters = {key: value for key, value in kwargs.items() if key not in skipped_keys}
    requested_actions = kwargs.get('actions')

    matches = []
    for element in agent_utils.parse_str_to_jsonlist(self.cur_obs.ui_content_simple_str):
      attributes_match = all(
        element.get(key) is not None and element.get(key) == value
        for key, value in equality_filters.items()
      )
      if not attributes_match:
        continue

      supported_actions = ast.literal_eval(str(element.get('actions') or '[]'))
      # Only elements that declare actions are subject to the actions filter.
      if supported_actions and requested_actions:
        wanted_actions = ast.literal_eval(str(requested_actions))
        if not set(wanted_actions).issubset(supported_actions):
          continue
      matches.append(element)

    if len(matches) == 1:
      index = matches[0]["index"]
      print_with_color(f'matched index: {index}', 'cyan')
      return index

    # Zero or several candidates: let the grounder decide.
    return self.grounder(kwargs, matches)
  
  def grounder(self, target_element_info: dict, candidate_elements: list) -> int:
    """
      Ask the multimodal LLM named by ``FLAGS.grounder_llm`` to pick the UI
      element that best matches the target description.

      The observation is refreshed first. Prompt and output are written under
      ``self.save_path``, token usage is recorded, and the final index is
      stored on ``self.previous_index``.

      Returns:
        int: The model's target index when its confidence score is at least 10,
        otherwise -1.
    """
    if candidate_elements:
      candidate_elements_str = f"Candidate elements (filtered by basic attribute matching):\n{candidate_elements}\n\n"
    else:
      candidate_elements_str = ''

    timestamp = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
    self.update_obs(file_prefix=f'step_{len(self.executed_actions)}_{timestamp}_before_grounder')
    ui_content = self.cur_obs.ui_content_simple_str
    screenshot_som = self.cur_obs.screenshot_with_som_resized

    system_prompt = (
      "You are a UI automation assistant. Your task is to match the user-provided description with the most appropriate UI element. "
      "Text-type attributes must match the element content exactly.\n\n"
      "[Output Format]\n"
      "Select the best-matching element and return its index with a confidence score (1–10).\n"
      "Only results with a score of 10 will be accepted. If no suitable match exists, return -1.\n\n"
      "Example Output:\n"
      "{'thought': '...', 'target_index': 0, 'confidence_score': 6}\n"
      "Only return with a brief thought, the target index, and the confidence score.\n\n"
    )
    user_prompt = (
      f"Target element info:\n{target_element_info}\n\n"
      f"UI content:\n{ui_content}\n\n"
      f"{candidate_elements_str}"
    )

    prompt_file = f'step-{len(self.executed_actions)}_{timestamp}_grounder_prompt.txt'
    agent_utils.write_to_file(self.save_path, prompt_file, f"[system]\n{system_prompt}\n\n[user]\n{user_prompt}")

    llm = get_llm_wrapper(model_name=FLAGS.grounder_llm, enable_logging=FLAGS.enable_llm_logging)
    output, raw_response = llm.predict_mm(
      user_prompt=user_prompt,
      images=[screenshot_som],
      system_prompt=system_prompt,
      output_format=models.MllmMatchTarget,
    )
    output_file = f'step-{len(self.executed_actions)}_{timestamp}_grounder_output.txt'
    agent_utils.write_to_file(self.save_path, output_file, output)

    # Token accounting for this grounder call.
    token_record = self.record_token
    token_record.step = str(len(self.executed_actions))
    token_record.agent = 'grounder'
    token_record.step_tokens = raw_response.usage
    token_record.llm = FLAGS.grounder_llm
    agent_utils.record_cost_tokens(token_record)

    proposed_index = output.target_index
    score = output.confidence_score
    print_with_color(f'mllm_match response: index={proposed_index}, confidence score={score}', 'cyan')

    # Anything short of full confidence is treated as "no match".
    index = -1 if score < 10 else proposed_index
    print('actual index:', index)
    self.previous_index = index

    return index
  
  def get_obs(self, log_task_path: str = None, file_prefix: str = '', save: bool = True) -> ScreenObs:
    """
    Capture the current screen state and package it as a ``ScreenObs``.

    Sleeps one second first so the screen can stabilize, then reads the state
    from ``self.raw_env``, builds the full and simple UI descriptions, and
    renders a copy of the screenshot with set-of-marks (SOM) boxes for every
    element that passes ``agent_utils.validate_ui_element``. The simple UI
    description is also stored on ``self.cur_ui_content``.

    Args:
      log_task_path: Directory for saved artifacts; defaults to ``self.save_path``.
      file_prefix: Prefix for every saved file name.
      save: When True, write the raw elements, the UI description and both
        resized screenshots to disk and record their paths on the observation.

    Returns:
      The assembled ``ScreenObs``.
    """
    time.sleep(1)  # Wait for the screen to stabilize
    target_dir = self.save_path if log_task_path is None else log_task_path
    resized_shape = (461, 1024)

    env = self.raw_env
    state = env.get_state()
    ui_elements = state.ui_elements
    screen_size = env.logical_screen_size
    orientation = env.orientation
    frame_boundary = env.physical_frame_boundary

    # Full projection (includes bbox) for programmatic use; simple string view for LLM prompts and logs.
    full_dict = agent_utils.project_ui_elements_to_full_dict(ui_elements, screen_size)
    simple_str = agent_utils._generate_ui_elements_description_str(ui_elements, screen_size)
    self.cur_ui_content = simple_str

    raw_shot = state.pixels.copy()
    raw_shot_resized = cv2.resize(raw_shot, resized_shape)

    # SOM view: draw a mark for each valid UI element on a separate copy.
    som_shot = raw_shot.copy()
    for position, element in enumerate(ui_elements):
      if not agent_utils.validate_ui_element(element, screen_size):
        continue
      agent_utils.add_ui_element_mark(
        som_shot,
        element,
        position,
        screen_size,
        frame_boundary,
        orientation,
      )
    som_shot_resized = cv2.resize(som_shot, resized_shape)

    obs = ScreenObs(
      screenshot=raw_shot,
      screenshot_resized=raw_shot_resized,
      screenshot_with_som=som_shot,
      screenshot_with_som_resized=som_shot_resized,
      ui_elements=ui_elements,
      ui_content_simple_str=simple_str,
      screenshot_w_som_path='',
      ui_content_full_dict=full_dict,
    )
    if not save:
      return obs

    agent_utils.write_to_file(file_path=target_dir, file_name=file_prefix + '_ui_elements_raw.txt', content=ui_elements)
    agent_utils.write_to_file(file_path=target_dir, file_name=file_prefix + '_ui_content.txt', content=simple_str)
    # Store both resized screenshots and remember where they were written.
    obs.screenshot_path = agent_utils.store_image(
      raw_shot_resized, file_prefix + '_screenshot_resized.png', file_path=target_dir)
    obs.screenshot_w_som_path = agent_utils.store_image(
      som_shot_resized, file_prefix + '_screenshot_with_som_resized.png', file_path=target_dir)
    return obs
  
  def update_obs(self, file_prefix: str = None):
    """Update current observation by taking a new screenshot.
    
    This method is called before planner to ensure we have the latest screen state.
    
    Args:
      file_prefix: Optional file prefix for saved screenshots. 
                   If None, uses 'step_{step_n}_before_planner' format.
    
    Returns:
      Updated ScreenObs object
    """
    if file_prefix is None:
      # Use step number based on executed actions (before planner, so use current count)
      step_n = len(self.executed_actions)
      file_prefix = f'step_{step_n}'
    self.cur_obs = self.get_obs(file_prefix=file_prefix)
  
  def mark_target_index(self, screenshot, ui_elements, target_index: int | None = None):
    """Mark the element at ``target_index`` on ``screenshot`` and return a resized image.

    Elements whose position does not equal ``target_index`` are left unmarked,
    so passing ``None`` (or an index that matches no element) draws nothing.
    The result of ``cv2.resize`` to (461, 1024) is returned.
    """
    env = self.raw_env
    for position, element in enumerate(ui_elements):
      if position != target_index:
        continue
      agent_utils.add_ui_element_mark(
        screenshot,
        element,
        position,
        env.logical_screen_size,
        env.physical_frame_boundary,
        env.orientation,
      )
    resized = cv2.resize(screenshot, (461, 1024))
    return resized
  
  def mark_target_coordinate(self, screenshot, x: int | None = None, y: int | None = None):
    """Mark a target coordinate on the screenshot and return a resized image.

    The mark is drawn only when both ``x`` and ``y`` are given.

    Args:
      screenshot: The screenshot as a numpy ndarray.
      x: X coordinate in logical coordinates.
      y: Y coordinate in logical coordinates.

    Returns:
      The screenshot resized with ``cv2.resize`` to (461, 1024).
    """
    point_missing = x is None or y is None
    if not point_missing:
      env = self.raw_env
      agent_utils.add_coordinate_mark(
        screenshot,
        x,
        y,
        env.logical_screen_size,
        env.physical_frame_boundary,
        env.orientation,
      )
    resized = cv2.resize(screenshot, (461, 1024))
    return resized
  
  def mark_swipe_coordinates(
    self,
    screenshot,
    start_x: int,
    start_y: int,
    end_x: int,
    end_y: int,
    action_type: str = 'swipe'
  ):
    """Draw a swipe/drag mark on the screenshot and return a resized image.

    Delegates the drawing to ``agent_utils.add_swipe_mark`` using the screen
    geometry of ``self.raw_env``.

    Args:
      screenshot: The screenshot as a numpy ndarray.
      start_x: Start X coordinate in logical coordinates.
      start_y: Start Y coordinate in logical coordinates.
      end_x: End X coordinate in logical coordinates.
      end_y: End Y coordinate in logical coordinates.
      action_type: Type of action ('swipe' or 'drag_and_drop').

    Returns:
      The screenshot resized with ``cv2.resize`` to (461, 1024).
    """
    env = self.raw_env
    agent_utils.add_swipe_mark(
      screenshot,
      start_x,
      start_y,
      end_x,
      end_y,
      env.logical_screen_size,
      env.physical_frame_boundary,
      env.orientation,
      action_type=action_type,
    )
    resized = cv2.resize(screenshot, (461, 1024))
    return resized
  
  def mark_direction_swipe(self, screenshot, direction: str):
    """Draw a direction arrow for a directional swipe and return a resized image.

    Delegates the drawing to ``agent_utils.add_direction_arrow`` using the
    screen geometry of ``self.raw_env``.

    Args:
      screenshot: The screenshot as a numpy ndarray.
      direction: Direction of swipe ('up', 'down', 'left', 'right').

    Returns:
      The screenshot resized with ``cv2.resize`` to (461, 1024).
    """
    env = self.raw_env
    agent_utils.add_direction_arrow(
      screenshot,
      direction,
      env.logical_screen_size,
      env.physical_frame_boundary,
      env.orientation,
    )
    resized = cv2.resize(screenshot, (461, 1024))
    return resized
  
  def hide_automation_ui(self):
    self.raw_env.hide_automation_ui()

# **Extract code up to the point where the error occurred**
def extract_code_before_error(code_str: str, error_obj) -> tuple[str, int]:
  """
  Extract the code from the start of code_str up to and including the statement matching error_obj.

  - If error_obj is a string that parses as a function call, the calls in
    code_str are searched in source order: a structurally identical call
    (same function and arguments) wins; otherwise the first call to a function
    of the same name is used. The extracted code runs through the last line of
    the matched call, so multi-line calls are kept whole.
  - If that fails, the first line containing the stripped error string is used.
  - If error_obj is not a string, it is treated as a dict and matched against
    ``name = {...}`` assignments; the assignment with the most matching
    key/value pairs wins (a list value matches when it is a subset).

  Args:
    code_str: The source code that was being executed.
    error_obj: The failing call as a string, or the kwargs dict involved.

  Returns:
    (executed_code, error_line_num), where error_line_num is the 1-based line
    on which the matched statement starts. When nothing matches, a short
    explanatory message and -1 are returned instead.
  """
  lines = code_str.splitlines()

  def called_name(call: ast.Call):
    # Bare or attribute name of the called function, or None for other callees.
    func = call.func
    if isinstance(func, ast.Name):
      return func.id
    if isinstance(func, ast.Attribute):
      return func.attr
    return None

  # ---------- Handle string-form error (e.g., "env_op.swipe(...)") ----------
  if isinstance(error_obj, str):
    needle = error_obj.strip()
    try:
      tree = ast.parse(code_str)
      target = ast.parse(needle, mode='eval').body
      if isinstance(target, ast.Call):
        # ast.walk yields nodes in no specified order, so sort by position to
        # make "first match" mean first in the source text.
        calls = sorted(
          (node for node in ast.walk(tree) if isinstance(node, ast.Call)),
          key=lambda node: (node.lineno, node.col_offset),
        )
        target_dump = ast.dump(target)
        target_name = called_name(target)
        match = next((call for call in calls if ast.dump(call) == target_dump), None)
        if match is None and target_name:
          # The error text may hold evaluated arguments that differ from the
          # source expression; fall back to the function name alone.
          match = next((call for call in calls if called_name(call) == target_name), None)
        if match is not None:
          last_line = getattr(match, 'end_lineno', None) or match.lineno
          return '\n'.join(lines[:last_line]), match.lineno
    except Exception:
      # Best effort: unparsable code or error text falls through to the text search.
      pass

    # Fallback: raw string search (an empty needle would match every line).
    if needle:
      for line_num, line in enumerate(lines, start=1):
        if needle in line:
          return '\n'.join(lines[:line_num]), line_num

    return "Error string not found in code.", -1

  # ---------- Handle dict-form error (e.g., kwargs = {...}) ----------
  try:
    tree = ast.parse(code_str)
  except (SyntaxError, ValueError):
    return "Invalid code syntax.", -1

  def dict_match_score(dict_node: ast.Dict) -> int:
    # Count the literal key/value pairs of dict_node that agree with error_obj.
    score = 0
    for k_ast, v_ast in zip(dict_node.keys, dict_node.values):
      try:
        key = ast.literal_eval(k_ast)
        val = ast.literal_eval(v_ast)
        if key in error_obj and error_obj[key] == val:
          score += 1
        elif key in error_obj and isinstance(error_obj[key], list) and isinstance(val, list):
          if set(error_obj[key]).issubset(set(val)):
            score += 1
      except Exception:
        # Non-literal or unhashable entries simply do not contribute.
        continue
    return score

  best_node = None
  best_score = 0
  for node in ast.walk(tree):
    if isinstance(node, ast.Assign) and isinstance(node.value, ast.Dict):
      score = dict_match_score(node.value)
      if score > best_score:
        best_score = score
        best_node = node

  if best_node:
    start = best_node.lineno
    end = getattr(best_node, 'end_lineno', start)
    return '\n'.join(lines[:end]), start

  return "Relevant code block not found.", -1
