"""
Input step info per step:
obs, reason for code, code, execution of code
"""
import ast
import os
import re
from typing import Optional, Any, Tuple

from absl import flags

from ..prompts.builder_prompt import get_rpa_builder_prompt
from ..utils import agent_utils, models
from ..utils.llm_client import OpenAIWrapper
from ..utils.agent_utils import print_with_color
from ..utils.code_validation import validate_python_syntax

FLAGS = flags.FLAGS

# Maximum number of syntax error retries
MAX_SYNTAX_RETRY = 2


def get_syntax_error_feedback(error_msg: str) -> str:
  """
  Build the corrective note appended to the builder prompt after a syntax failure.

  The note starts with a blank line so it can be concatenated directly onto
  the existing user prompt, and every line (including the last) ends with a
  newline.

  Args:
    error_msg: The error message from syntax validation

  Returns:
    Feedback string to append to the prompt
  """
  feedback_lines = [
    "[CRITICAL - Syntax Error Detected]",
    "The previously generated RPA code has a syntax error. Please fix it and regenerate:",
    f"{error_msg}",
    "Common fixes:",
    "- Escape special characters in strings properly",
    "- Use single quotes for strings containing double quotes",
    "- Ensure all brackets and parentheses are matched",
    "- Check that all function calls have matching parentheses",
    "- Verify proper indentation",
    "Please regenerate the RPA code with the syntax error fixed.",
  ]
  return "\n\n" + "\n".join(feedback_lines) + "\n"


class RPA_Builder_Agent:
  def __init__(
    self,
    default_llm: OpenAIWrapper,
    enable_shell_action: bool = False,
  ):
    """
    Set up the builder agent.

    Args:
      default_llm: LLM client used for every builder call (its predict_mm and
        model_name are used by this class)
      enable_shell_action: Forwarded to the prompt builder as enable_shell_action
    """
    self.rpa_builder_conclusion = ''  # Lives for the duration of each task type
    self.default_llm = default_llm
    self.reflection = None
    self.record_token = models.RecordToken()
    self.enable_shell_action = enable_shell_action
    # Trajectory state read by fetch_info(). generate_rpa_code() overwrites both
    # on every call; defining them here keeps fetch_info() from raising
    # AttributeError when it is invoked before any generation has happened.
    self.react_trajs = []
    self.pre_rpa_exec_traj = None
  
  def _print_rpa_info(self, builder_output, rpa_info):
    """Print RPA builder output information."""
    print(f'Thought:\n{builder_output.thought}\n')
    if hasattr(builder_output, 'info_to_clarify'):
      print(f'info_to_clarify:\n{builder_output.info_to_clarify}\n')
    print(f'Task Type:\n{rpa_info.task_type}\n')
    print(f'Params:\n{rpa_info.parameters}\n')
    print(f'RPA Description:\n{rpa_info.rpa_description}\n')
    print(f'Code:\n{rpa_info.rpa_code}\n')
    print(f'Example Usage:\n{rpa_info.example_usage}\n')
    print(f'Conclusion:\n{rpa_info.conclusion}\n')
  
  def _generate_with_syntax_retry(self, base_prompt: dict[str, str], images: list, log_task_path: str, 
                                   round_name: str, task_type: str, task_template: str,
                                   list_react_traj: list, pre_rpa_exec_traj, 
                                   fetched_info: dict, encountered_task_goals: list,
                                   agent_name: str = None) -> tuple:
    """
    Ask the LLM for RPA code, re-asking when the returned code fails syntax validation.

    Up to MAX_SYNTAX_RETRY + 1 attempts are made. Each attempt logs its prompt
    and output under log_task_path, records token usage, prints the result and
    validates rpa_code. After a failed attempt the validator's message is
    turned into feedback that is appended to the user prompt of the next one.
    When every attempt fails, the last (still invalid) result is returned.

    Args:
      base_prompt: Dict with 'system' and 'user' prompt texts for the first attempt
      images: Images passed to the LLM on every attempt
      log_task_path: Directory that receives the prompt/output log files
      round_name: Tag used in log file names and in the token record label
      task_type, task_template, list_react_traj, pre_rpa_exec_traj,
      fetched_info, encountered_task_goals, agent_name: Forwarded to
        get_rpa_builder_prompt when the prompt is rebuilt for a retry

    Returns:
      tuple: (builder_output, rpa_info, total_token_cost)
    """
    feedback_suffix = ""
    token_sum = 0

    for attempt in range(MAX_SYNTAX_RETRY + 1):
      is_retry = attempt > 0

      # Assemble this attempt's prompt; feedback (if any) goes on the user side
      system_text = base_prompt['system']
      user_text = base_prompt['user'] + feedback_suffix

      # Persist the prompt before calling the model
      file_tag = f'_retry{attempt}' if is_retry else ''
      agent_utils.write_to_file(
        file_path=log_task_path,
        file_name=f'rpa_builder_prompt_{round_name}{file_tag}.txt',
        content=f"[system]\n{system_text}\n\n[user]\n{user_text}",
      )

      # Query the model
      builder_output, raw_response = self.default_llm.predict_mm(
        user_prompt=user_text,
        images=images,
        system_prompt=system_text,
        output_format=models.RPABuilderOutput,
      )

      # Persist the parsed output
      agent_utils.write_to_file(
        file_path=log_task_path,
        file_name=f'rpa_builder_output_{round_name}{file_tag}.txt',
        content=builder_output,
      )

      # Token bookkeeping; usage objects without total_tokens count as 0
      usage = raw_response.usage
      token_sum += getattr(usage, 'total_tokens', 0)
      agent_label = f'RPA Builder {round_name}'
      if is_retry:
        agent_label = f'RPA Builder {round_name} (retry {attempt})'
      self.record_token.step = '-'
      self.record_token.agent = agent_label
      self.record_token.step_tokens = usage
      self.record_token.llm = FLAGS.builder_llm
      agent_utils.record_cost_tokens(self.record_token)

      # Show what the builder produced
      rpa_info = builder_output.output
      self._print_rpa_info(builder_output, rpa_info)

      # Done as soon as the generated code is syntactically valid
      is_valid, error_msg = validate_python_syntax(rpa_info.rpa_code, log_task_path)
      if is_valid:
        return builder_output, rpa_info, token_sum

      # Out of retries: fall through and hand back the invalid result
      if attempt == MAX_SYNTAX_RETRY:
        print_with_color(f'⚠️  Max syntax retries reached. Proceeding with code that has syntax errors.', 'yellow')
        break

      print_with_color(f'🔄 Syntax error detected, retrying... ({attempt + 1}/{MAX_SYNTAX_RETRY})', 'yellow')
      feedback_suffix = get_syntax_error_feedback(error_msg)
      # The next attempt always uses a freshly built, tool-free prompt
      base_prompt = get_rpa_builder_prompt(
        task_type=task_type, task_template=task_template,
        react_trajs=list_react_traj,
        pre_rpa_exec_traj=pre_rpa_exec_traj,
        encountered_task_goals=encountered_task_goals,
        fetched_info=fetched_info, use_tool=False,
        rpa_builder_conclusion=self.rpa_builder_conclusion,
        agent_name=agent_name,
        enable_shell_action=self.enable_shell_action,
      )

    return builder_output, rpa_info, token_sum
  
  def generate_rpa_code(
    self,
    log_task_path: str,
    task_type: str,
    task_template: str,
    list_react_traj: Optional[list[models.ReActTraj]] = None,
    pre_rpa_exec_traj: Optional[models.RPAExecTraj] = None,
    encountered_task_goals: Optional[list[str]] = None,
  ) -> tuple[models.RPAInfo, int]:
    """
    Generate RPA code from trajectories.

    When FLAGS.use_fetch_info is off, a single generation round (with syntax
    retry) is run. Otherwise the builder may answer with a fetch_info tool
    call for up to MAX_FETCH_total rounds; each tool call loads a screenshot
    and UI dump that are fed into the next round. If the last allowed round
    is still a tool call, one final tool-free generation is forced.

    Args:
      log_task_path: Path to save logs (created if missing)
      task_type: Task type; also written onto the returned RPAInfo
      task_template: Task template
      list_react_traj: (Legacy) ReAct trajectories
      pre_rpa_exec_traj: Previous RPA execution trajectory
      encountered_task_goals: Task goals forwarded to the prompt builder

    Returns:
      Tuple of (RPAInfo, fetch_info_count)
    """
    print('============================================')
    print("Current Agent: RPA_Builder_Agent\n")
    print(f'model: {self.default_llm.model_name}\n')
    os.makedirs(log_task_path, exist_ok=True)

    # Get agent_name from FLAGS.gui_agent_type for action space selection
    agent_name = getattr(FLAGS, 'gui_agent_type', None)

    print_with_color("ℹ️  Using legacy ReActTraj format", 'yellow')
    # fetch_info() calls len() on this, so never leave it as None.
    self.react_trajs = list_react_traj if list_react_traj is not None else []
    self.pre_rpa_exec_traj = pre_rpa_exec_traj

    cnt_fetch_info = 0
    MAX_FETCH_force = 0
    MAX_FETCH_optional = 3 if FLAGS.use_fetch_info else 0
    MAX_FETCH_total = MAX_FETCH_optional + MAX_FETCH_force
    fetched_info = {}
    fetched_screenshot = []

    def build_prompt(**extra):
      # Single place that assembles the builder prompt, so every round passes
      # the same keyword names. `fetched_info` is read at call time and thus
      # reflects the most recent fetch.
      return get_rpa_builder_prompt(task_type=task_type, task_template=task_template,
                                    react_trajs=list_react_traj,
                                    encountered_task_goals=encountered_task_goals,
                                    pre_rpa_exec_traj=pre_rpa_exec_traj,
                                    fetched_info=fetched_info,
                                    rpa_builder_conclusion=self.rpa_builder_conclusion,
                                    agent_name=agent_name,
                                    enable_shell_action=self.enable_shell_action,
                                    **extra)

    def generate(base_prompt, images, round_name):
      # Tool-free generation with automatic syntax retry; token total is unused here.
      output, info, _ = self._generate_with_syntax_retry(
        base_prompt=base_prompt,
        images=images,
        log_task_path=log_task_path,
        round_name=round_name,
        task_type=task_type,
        task_template=task_template,
        list_react_traj=list_react_traj,
        pre_rpa_exec_traj=pre_rpa_exec_traj,
        fetched_info=fetched_info,
        encountered_task_goals=encountered_task_goals,
        agent_name=agent_name
      )
      return output, info

    if MAX_FETCH_total == 0:
      # No fetch rounds allowed: generate directly.
      print_with_color(f'\nBuilder Round 0\n', 'blue')
      builder_output, rpa_info = generate(build_prompt(), [], '0')
      agent_utils.write_to_file(file_path=log_task_path, file_name='rpa_builder_output.txt', content=builder_output)

    for fetch_cnt in range(MAX_FETCH_total):
      print_with_color(f'\nBuilder Round {fetch_cnt}\n', 'blue')
      output_format = models.RPABuilderOutput_optional
      if fetch_cnt < MAX_FETCH_force:
        output_format = models.RPABuilderOutput_tool
      rpa_builder_prompt = build_prompt(use_tool=True)
      agent_utils.write_to_file(file_path=log_task_path, file_name=f'rpa_builder_prompt_{fetch_cnt}.txt',
                                content=f"[system]\n{rpa_builder_prompt['system']}\n\n[user]\n{rpa_builder_prompt['user']}")
      builder_output, raw_response = self.default_llm.predict_mm(
        user_prompt=rpa_builder_prompt['user'],
        images=fetched_screenshot,
        system_prompt=rpa_builder_prompt['system'],
        output_format=output_format
      )
      agent_utils.write_to_file(file_path=log_task_path, file_name=f'rpa_builder_output_{fetch_cnt}.txt',
                                content=builder_output)
      agent_utils.write_to_file(file_path=log_task_path, file_name=f'rpa_builder_raw_response_{fetch_cnt}.txt',
                                content=raw_response)
      cost_tokens = raw_response.usage
      self.record_token.step = '-'
      self.record_token.agent = f'RPA Builder {fetch_cnt}'
      self.record_token.step_tokens = cost_tokens
      self.record_token.llm = FLAGS.builder_llm
      agent_utils.record_cost_tokens(self.record_token)

      if isinstance(builder_output.output, models.FetchInfoTool):
        # count fetch times
        cnt_fetch_info += 1

        print(f'Thought:\n{builder_output.thought}\n')
        print(f'info_to_clarify:\n{builder_output.info_to_clarify}\n')
        print(f'output:\n{builder_output.output}\n')

        fetched_info = self.fetch_info(builder_output.output.traj_id, builder_output.output.step_n)  # Local screenshot + UI extraction
        if fetched_info['screenshot'] is not None:
          fetched_screenshot = [fetched_info['screenshot']]
          agent_utils.store_image(fetched_info['screenshot'], f'screenshot_{fetch_cnt}.png', log_task_path)
          agent_utils.write_to_file(log_task_path, f'ui_content_{fetch_cnt}.txt', content=fetched_info['ui_content_full_dict'])
        else:
          # Keep the previously fetched screenshot (if any) and report why this fetch failed.
          err = fetched_info.get('screenshot_error') or 'unknown_error'
          print(f"================================================ fetch_info failed: {err}")
      else:
        # RPA function generation (not fetch_info tool call).
        # The first attempt is already done; only fall back to the retry
        # helper when that attempt does not pass syntax validation.
        rpa_info = builder_output.output
        self._print_rpa_info(builder_output, rpa_info)

        is_valid, error_msg = validate_python_syntax(rpa_info.rpa_code, log_task_path)

        if not is_valid:
          print_with_color(f'🔄 Syntax error detected in initial generation, retrying...', 'yellow')
          builder_output, rpa_info = generate(
            build_prompt(use_tool=False), fetched_screenshot, f'{fetch_cnt}_syntax')

        break  # No more tool needed, exit loop

      # If we reach here, all attempts so far were tool calls
      if fetch_cnt == MAX_FETCH_total - 1:
        print_with_color(f'\nBuilder Round {fetch_cnt + 1} (Final, force generation)\n', 'blue')
        builder_output, rpa_info = generate(build_prompt(), fetched_screenshot, f'{fetch_cnt + 1}')
        agent_utils.write_to_file(file_path=log_task_path, file_name='rpa_builder_output.txt', content=builder_output)

    # Store conclusion and ensure task_type is set
    if hasattr(rpa_info, 'conclusion'):
      self.rpa_builder_conclusion = rpa_info.conclusion
    rpa_info.task_type = task_type

    return rpa_info, cnt_fetch_info
  
  def fetch_info(self, traj_id: str | int, step_n: int) -> dict:
    """
    Fetch additional information from a trajectory step.
    
    Args:
      traj_id: Trajectory ID (string like 'successful_react_traj', 'pre_rpa_exec_traj', 
               'failed_react_traj', 'fix_react_traj', or integer index)
      step_n: Step number (1-based index)
      
    Returns:
      Dict with:
        - screenshot: numpy array or None
        - screenshot_error: str (empty if ok)
        - ui_content_full_dict: list[dict]
    """
    screenshot_error = ""
    ui_content_full_dict = ""
    screenshot = None
    # Handle string traj_id (e.g., 'successful_react_traj', 'pre_rpa_exec_traj')
    if isinstance(traj_id, str):
      if traj_id == 'pre_rpa_exec_traj':
        # Pre RPA execution trajectory
        if self.pre_rpa_exec_traj is None:
          screenshot_error = f"fetch_info target not found: traj_id={traj_id}, step_n={step_n}, reason=no_pre_rpa_exec_traj"
          return {
            'screenshot': None,
            'screenshot_error': screenshot_error,
            'ui_content_full_dict': ui_content_full_dict,
            'traj_id': traj_id,
            'step_n': step_n
          }
        # Pre RPA exec traj uses traj (list of EnvExecStepInfo)
        # Convert to 0-based index
        step_idx = step_n - 1 if step_n > 0 else 0
        if step_idx >= len(self.pre_rpa_exec_traj.traj):
          screenshot_error = f"fetch_info target not found: traj_id={traj_id}, step_n={step_n}, reason=step_out_of_range, max_steps={len(self.pre_rpa_exec_traj.traj)}"
          return {
            'screenshot': None,
            'screenshot_error': screenshot_error,
            'ui_content_full_dict': ui_content_full_dict,
            'traj_id': traj_id,
            'step_n': step_n
          }
        target_step = self.pre_rpa_exec_traj.traj[step_idx]
        screenshot_path = target_step.before_screenshot_w_som_path
        # Load image data if path is provided
        # Note: load_image_as_ndarray handles relative paths (relative to project root) and error checking internally
        if screenshot_path:
          try:
            screenshot = agent_utils.load_image_as_ndarray(screenshot_path)
          except Exception as e:
            screenshot_error = f"fetch_info failed to load screenshot: traj_id={traj_id}, step_n={step_n}, path={screenshot_path}, error={e}"
        # V2 EnvExecStepInfo: no before_ui_content string; use structured UI.
        ui_content_full_dict = target_step.before_ui_content_full_dict
      elif traj_id == 'successful_react_traj':
        # Find successful trajectory in react_trajs list
        target_traj = self.react_trajs[-1] if len(self.react_trajs) > 0 else None
        if target_traj is None:
          screenshot_error = f"fetch_info target not found: traj_id={traj_id}, step_n={step_n}, reason=no_successful_traj"
          return {
            'screenshot': None,
            'screenshot_error': screenshot_error,
            'ui_content_full_dict': ui_content_full_dict,
            'traj_id': traj_id,
            'step_n': step_n
          }
        # Find the step by matching step_n (don't assume step_n starts from 1 or is continuous)
        # Legacy format with traj (list of ReActStepInfo)
        target_step_info = None
        for step_info in target_traj.traj:
          if step_info.step_n == step_n:
            target_step_info = step_info
            break
        if target_step_info is None:
          available_steps = [s.step_n for s in target_traj.traj]
          screenshot_error = f"fetch_info target not found: traj_id={traj_id}, step_n={step_n}, reason=step_not_found, available_steps={available_steps}"
          return {
            'screenshot': None,
            'screenshot_error': screenshot_error,
            'ui_content_full_dict': ui_content_full_dict,
            'traj_id': traj_id,
            'step_n': step_n
          }
        screenshot_path = target_step_info.exec_step_info.before_screenshot_w_som_path if target_step_info.exec_step_info else None
        # Load image data if path is provided
        # Note: load_image_as_ndarray handles relative paths (relative to project root) and error checking internally
        if screenshot_path:
          try:
            screenshot = agent_utils.load_image_as_ndarray(screenshot_path)
          except Exception as e:
            screenshot_error = f"fetch_info failed to load screenshot: traj_id={traj_id}, step_n={step_n}, path={screenshot_path}, error={e}"
        # V2 ReActStepInfo: UI is stored in exec_step_info.before_ui_content_full_dict (structured).
        ui_content_full_dict = target_step_info.exec_step_info.before_ui_content_full_dict if (hasattr(target_step_info, 'exec_step_info') and target_step_info.exec_step_info) else ''
      elif traj_id == 'failed_react_traj':
        target_traj = self.react_trajs[-2] if len(self.react_trajs) > 1 else None
        if target_traj is None:
          screenshot_error = f"fetch_info target not found: traj_id={traj_id}, step_n={step_n}, reason=no_failed_traj"
          return {
            'screenshot': None,
            'screenshot_error': screenshot_error,
            'ui_content_full_dict': ui_content_full_dict,
            'traj_id': traj_id,
            'step_n': step_n
          }
        # Find the step by matching step_n (don't assume step_n starts from 1 or is continuous)
        target_step_info = None
        for step_info in target_traj.traj:
          if step_info.step_n == step_n:
            target_step_info = step_info
            break
        if target_step_info is None:
          available_steps = [s.step_n for s in target_traj.traj]
          screenshot_error = f"fetch_info target not found: traj_id={traj_id}, step_n={step_n}, reason=step_not_found, available_steps={available_steps}"
          return {
            'screenshot': None,
            'screenshot_error': screenshot_error,
            'ui_content_full_dict': ui_content_full_dict,
            'traj_id': traj_id,
            'step_n': step_n
          }
        screenshot_path = target_step_info.exec_step_info.before_screenshot_w_som_path if target_step_info.exec_step_info else None
        # Load image data if path is provided
        # Note: load_image_as_ndarray handles relative paths (relative to project root) and error checking internally
        if screenshot_path:
          try:
            screenshot = agent_utils.load_image_as_ndarray(screenshot_path)
          except Exception as e:
            screenshot_error = f"fetch_info failed to load screenshot: traj_id={traj_id}, step_n={step_n}, path={screenshot_path}, error={e}"
        ui_content_full_dict = target_step_info.exec_step_info.before_ui_content_full_dict if (hasattr(target_step_info, 'exec_step_info') and target_step_info.exec_step_info) else ''
      elif traj_id == 'fix_react_traj':
        # Fix react_star trajectory (usually the last one if exists)
        if len(self.react_trajs) == 0:
          screenshot_error = f"fetch_info target not found: traj_id={traj_id}, step_n={step_n}, reason=no_trajs"
          return {
            'screenshot': None,
            'screenshot_error': screenshot_error,
            'ui_content_full_dict': ui_content_full_dict,
            'traj_id': traj_id,
            'step_n': step_n
          }
        # Fix trajectory is usually the last one
        target_traj = self.react_trajs[-1]
        
        # Find the step by matching step_n (don't assume step_n starts from 1)
        # Find step with matching step_n
        target_step_info = None
        for step_info in target_traj.traj:
          if step_info.step_n == step_n:
            target_step_info = step_info
            break
        
        if target_step_info is None:
          available_steps = [s.step_n for s in target_traj.traj]
          screenshot_error = f"fetch_info target not found: traj_id={traj_id}, step_n={step_n}, reason=step_not_found, available_steps={available_steps}"
          return {
            'screenshot': None,
            'screenshot_error': screenshot_error,
            'ui_content_full_dict': ui_content_full_dict,
            'traj_id': traj_id,
            'step_n': step_n
          }
        
        screenshot_path = target_step_info.exec_step_info.before_screenshot_w_som_path if target_step_info.exec_step_info else None
        # Load image data if path is provided
        # Note: load_image_as_ndarray handles relative paths (relative to project root) and error checking internally
        if screenshot_path:
          try:
            screenshot = agent_utils.load_image_as_ndarray(screenshot_path)
          except Exception as e:
            screenshot_error = f"fetch_info failed to load screenshot: traj_id={traj_id}, step_n={step_n}, path={screenshot_path}, error={e}"
        ui_content_full_dict = target_step_info.exec_step_info.before_ui_content_full_dict if (hasattr(target_step_info, 'exec_step_info') and target_step_info.exec_step_info) else ''
      else:
        screenshot_error = f"fetch_info target not found: traj_id={traj_id}, step_n={step_n}, reason=unknown_traj_id"
        return {
          'screenshot': None,
          'screenshot_error': screenshot_error,
          'ui_content_full_dict': ui_content_full_dict,
          'traj_id': traj_id,
          'step_n': step_n
        }
    else:
      # Handle integer traj_id (legacy format)
      traj_idx = traj_id if isinstance(traj_id, int) else int(traj_id)
      if traj_idx >= len(self.react_trajs):
        screenshot_error = f"fetch_info target not found: traj_id={traj_id}, step_n={step_n}, reason=traj_out_of_range, max_trajs={len(self.react_trajs)}"
        return {
          'screenshot': None,
          'screenshot_error': screenshot_error,
          'ui_content_full_dict': ui_content_full_dict,
          'traj_id': traj_id,
          'step_n': step_n
        }
      target_traj = self.react_trajs[traj_idx]
      step_idx = step_n - 1 if step_n > 0 else 0
      if step_idx >= len(target_traj.traj):
        screenshot_error = f"fetch_info target not found: traj_id={traj_id}, step_n={step_n}, reason=step_out_of_range, max_steps={len(target_traj.traj)}"
        return {
          'screenshot': None,
          'screenshot_error': screenshot_error,
          'ui_content_full_dict': ui_content_full_dict,
          'traj_id': traj_id,
          'step_n': step_n
        }
      target_step_info = target_traj.traj[step_idx]
      screenshot_path = target_step_info.exec_step_info.before_screenshot_w_som_path if target_step_info.exec_step_info else None
      if screenshot_path:
        try:
          screenshot = agent_utils.load_image_as_ndarray(screenshot_path)
        except Exception as e:
          screenshot_error = f"fetch_info failed to load screenshot: traj_id={traj_id}, step_n={step_n}, path={screenshot_path}, error={e}"
      ui_content_full_dict = target_step_info.exec_step_info.before_ui_content_full_dict if (hasattr(target_step_info, 'exec_step_info') and target_step_info.exec_step_info) else ''
    
    ui_len = len(ui_content_full_dict) if isinstance(ui_content_full_dict, list) else (1 if ui_content_full_dict else 0)
    if screenshot is not None and screenshot_error == "":
      print_with_color(
        f"✅ fetch_info successfully: traj_id={traj_id}, step_n={step_n}, screenshot=Y, ui_items={ui_len}",
        "green",
      )
    else:
      # failed/incomplete: try to print the reason for easier debugging
      reason = screenshot_error or "no_screenshot"
      print_with_color(
        f"⚠️  fetch_info failed: traj_id={traj_id}, step_n={step_n}, screenshot={'Y' if screenshot is not None else 'N'}, reason={reason}",
        "yellow",
      )

    return {
      'screenshot': screenshot, 
      'screenshot_error': screenshot_error,
      'ui_content_full_dict': ui_content_full_dict,
      'traj_id': traj_id,
      'step_n': step_n
    }
  
  def _is_hardcoded_action(self, action: Any) -> bool:
    """
    Check if an action is hardcoded (uses index or coordinates) and needs translation.
    
    Hardcoded patterns:
    - env_op.xxx(5)  # AutoRPA ReAct (index mode)
    - env_op.xxx(100, 200)  # AutoRPA ReAct (coordinate mode)
    - click(19)      # DroidRun
    - type(5, "text")  # DroidRun
    - Any action with numeric index or coordinate parameters
    
    Args:
      action: RawAction object
      
    Returns:
      True if hardcoded, False otherwise
    """
    # Special handling for shell actions
    if hasattr(action, 'action_type') and action.action_type == 'shell':
      return self._shell_command_needs_translation(action)
    
    # Check agent_specific_data for original code
    if hasattr(action, 'agent_specific_data') and action.agent_specific_data:
      # Check DroidRun format
      if 'droidrun' in action.agent_specific_data:
        original_code = action.agent_specific_data['droidrun'].get('original_code', '')
        # Match patterns like: click(19), type(5, "text"), tap(3)
        if re.search(r'\w+\(\s*\d+', original_code):
          return True
      
      # Check AutoRPA ReAct format
      if 'autorpa' in action.agent_specific_data:
        original_code = action.agent_specific_data['autorpa'].get('original_action', '')
        # Match patterns like: 
        # - env_op.click(5) - index mode
        # - env_op.click(100, 200) - coordinate mode
        # - env_op.swipe(100, 200, 300, 400) - coordinate swipe
        if re.search(r'env_op\.\w+\(\s*\d+', original_code):
          # Check if it's coordinate mode (two or four numbers)
          # Pattern: env_op.xxx(num1, num2) or env_op.swipe(num1, num2, num3, num4)
          coord_pattern = r'env_op\.(click|long_press|input_text|swipe)\(\s*(\d+)\s*,\s*(\d+)'
          if re.search(coord_pattern, original_code):
            return True  # Coordinate mode
          # Check for index mode: env_op.xxx(single_number)
          index_pattern = r'env_op\.(click|long_press|input_text)\(\s*(\d+)\s*\)'
          if re.search(index_pattern, original_code):
            return True  # Index mode
          # Check for swipe with 4 coordinates
          swipe_coord_pattern = r'env_op\.swipe\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)'
          if re.search(swipe_coord_pattern, original_code):
            return True  # Coordinate swipe
          # Check for drag_and_drop with 4 coordinates or 2 indices
          drag_coord_pattern = r'env_op\.drag_and_drop\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)'
          drag_index_pattern = r'env_op\.drag_and_drop\(\s*(\d+)\s*,\s*(\d+)'
          if re.search(drag_coord_pattern, original_code) or re.search(drag_index_pattern, original_code):
            return True  # drag_and_drop
    
    # Check if parameters contain numeric index
    if hasattr(action, 'parameters') and isinstance(action.parameters, dict):
      if 'index' in action.parameters and isinstance(action.parameters['index'], int):
        return True
      # Check for coordinate parameters
      if ('x' in action.parameters and 'y' in action.parameters and 
          isinstance(action.parameters.get('x'), int) and isinstance(action.parameters.get('y'), int)):
        return True
      if ('start_x' in action.parameters and 'start_y' in action.parameters and
          'end_x' in action.parameters and 'end_y' in action.parameters):
        return True
      # Check for drag_and_drop parameters
      if ('touch_xy' in action.parameters and 'lift_xy' in action.parameters):
        return True
      if ('start_index' in action.parameters and 'end_index' in action.parameters):
        return True
    
    # Check action types that typically use indices or coordinates
    if hasattr(action, 'action_type'):
      if action.action_type in ['click', 'type', 'long_press', 'swipe', 'drag_and_drop'] and action.parameters:
        if any(isinstance(v, int) for v in action.parameters.values()):
          return True
    
    return False
  
  def _shell_command_needs_translation(self, action: Any) -> bool:
    """
    Determine if a shell command contains hardcoded values that need translation.
    
    This function only checks the action content itself, regardless of enable_shell_action flag.
    The flag only affects whether shell guidance is shown in builder prompt, not translation logic.
    
    Args:
      action: RawAction object with action_type='shell'
      
    Returns:
      True if shell command needs translation, False otherwise
    """
    # Extract shell command
    command = ''
    if hasattr(action, 'parameters') and isinstance(action.parameters, dict):
      command = action.parameters.get('command', '')
    elif hasattr(action, 'agent_specific_data') and action.agent_specific_data:
      if 'autorpa' in action.agent_specific_data:
        original_code = action.agent_specific_data['autorpa'].get('original_action', '')
        # Extract command from env_op.shell("...")
        match = re.search(r'env_op\.shell\([\'"](.+?)[\'"]\)', original_code)
        if match:
          command = match.group(1)
    
    if not command:
      return False  # No command found, skip translation
    
    # Commands that DON'T need translation (pure queries/info)
    safe_commands_prefixes = [
      'pm list packages',          # List all packages (no parameters)
      'dumpsys battery',           # Battery info
      'dumpsys window',            # Window info
      'getprop',                   # Get system property
      'wm size',                   # Screen size
      'wm density',                # Screen density
      'settings list',             # List settings
      'settings get',              # Get setting value (query only)
    ]
    
    for prefix in safe_commands_prefixes:
      if command.strip().startswith(prefix):
        return False  # Safe command, no translation needed
    
    # Commands that NEED translation (contain hardcoded parameters)
    needs_translation_patterns = [
      r'pm uninstall\s+\S+',       # pm uninstall <package>
      r'pm clear\s+\S+',           # pm clear <package>
      r'pm path\s+\S+',            # pm path <package>
      r'am start\s+-n\s+\S+',      # am start -n <package>/<activity>
      r'am force-stop\s+\S+',      # am force-stop <package>
      r'settings put\s+\S+\s+\S+', # settings put <namespace> <key> <value>
      r'rm\s+/\S+',                # rm <file_path>
      r'cat\s+/\S+',               # cat <file_path>
      r'mkdir\s+/\S+',             # mkdir <dir_path>
      r'ls\s+/\S+',                # ls <dir_path> (with specific path)
      r'input tap\s+\d+\s+\d+',    # input tap <x> <y> (coordinate)
      r'input text\s+',            # input text <text>
      r'input swipe\s+\d+',        # input swipe <coords>
    ]
    
    for pattern in needs_translation_patterns:
      if re.search(pattern, command):
        return True  # Found hardcoded parameter, needs translation
    
    # Default: conservative - assume needs translation for safety
    # (Better to unnecessarily translate than to miss hardcoded values)
    return True
  
  def _prepare_action_translator_input(
    self,
    raw_step: Any,
    task_goal: str
  ) -> dict:
    """
    Prepare input for ActionTranslator from a RawStep.
    
    Args:
      raw_step: RawStep object
      task_goal: Task goal
      
    Returns:
      Dict with ActionTranslator input fields
    """
    action = raw_step.action
    obs_before = raw_step.observation_before
    
    # Extract original action code
    original_code = ""
    comment = ""
    reasoning = ""
    
    if hasattr(action, 'agent_specific_data') and action.agent_specific_data:
      if 'droidrun' in action.agent_specific_data:
        droidrun_data = action.agent_specific_data['droidrun']
        original_code = droidrun_data.get('original_code', '')
        comment = droidrun_data.get('comment', '')
        reasoning = droidrun_data.get('reasoning', '')
      elif 'autorpa' in action.agent_specific_data:
        autorpa_data = action.agent_specific_data['autorpa']
        original_code = autorpa_data.get('original_action', '')
        comment = autorpa_data.get('related_element', '')
        reasoning = autorpa_data.get('code_reason', '')
    
    # If no original code found, construct from action
    if not original_code:
      if action.action_type and action.parameters:
        params_str = ", ".join(str(v) for v in action.parameters.values())
        original_code = f"{action.action_type}({params_str})"
    
    # Format UI information
    ui_info_str = self._format_ui_elements_for_translator(obs_before)
    
    # Handle coordinate-based actions: find element at coordinate and add to related_element
    related_element_info = comment or (str(action.target_element) if action.target_element else '')
    
    # Check if action uses coordinates
    coordinates_info = self._extract_coordinate_info(action, original_code, obs_before)
    if coordinates_info:
      related_element_info = f"{related_element_info}\n\n[Coordinate Information]\n{coordinates_info}".strip()
    
    return {
      'goal': task_goal,
      'obs_analysis': getattr(obs_before, 'description', '') or (str(obs_before.ui_elements[:3]) if (hasattr(obs_before, 'ui_elements') and obs_before.ui_elements) else ''),
      'action_reason': reasoning or action.thought or '',
      'action': original_code,
      'related_element': related_element_info,
      'ui_info_str': ui_info_str
    }
  
  def _extract_coordinate_info(
    self,
    action: Any,
    original_code: str,
    obs_before: Any
  ) -> Optional[str]:
    """
    Extract coordinate information from action and find corresponding UI element.
    
    Args:
      action: RawAction object
      original_code: Original action code string
      obs_before: Observation before action
      
    Returns:
      String describing coordinate and corresponding element, or None if not coordinate-based
    """
    import re
    
    # Get UI elements list - prefer ui_content_full_dict, fallback to converting ui_elements
    ui_content_full_dict = None
    if hasattr(obs_before, 'ui_content_full_dict') and obs_before.ui_content_full_dict:
      ui_content_full_dict = obs_before.ui_content_full_dict
    elif hasattr(obs_before, 'ui_elements') and obs_before.ui_elements:
      # Fallback: convert UIElement objects to dict format using project_ui_elements_to_full_dict
      # This handles old data or cases where ui_content_full_dict wasn't set
      # Try to infer screen_size from ui_elements' bbox, or use default
      screen_size = None
      # Try to infer from bbox_pixels in ui_elements
      if obs_before.ui_elements:
        max_x, max_y = 0, 0
        for elem in obs_before.ui_elements:
          bbox = getattr(elem, 'bbox_pixels', None)
          if bbox:
            x_max = getattr(bbox, 'x_max', 0)
            y_max = getattr(bbox, 'y_max', 0)
            max_x = max(max_x, x_max)
            max_y = max(max_y, y_max)
        if max_x > 0 and max_y > 0:
          screen_size = (max_x, max_y)
      
      # Fallback to default if couldn't infer
      if not screen_size:
        screen_size = (1080, 2400)  # Default fallback
      
      try:
        ui_content_full_dict = agent_utils.project_ui_elements_to_full_dict(
          obs_before.ui_elements, screen_size
        )
      except Exception as e:
        # If conversion fails, return None (coordinate matching won't work)
        print_with_color(f"Warning: Failed to convert ui_elements to dict: {e}", 'yellow')
        return None
    
    if not ui_content_full_dict:
      return None
    
    # Extract coordinates from parameters or original_code
    x, y = None, None
    start_x, start_y, end_x, end_y = None, None, None, None
    
    # Try to extract from parameters first
    if hasattr(action, 'parameters') and isinstance(action.parameters, dict):
      if 'x' in action.parameters and 'y' in action.parameters:
        x = action.parameters.get('x')
        y = action.parameters.get('y')
      elif 'start_x' in action.parameters and 'start_y' in action.parameters:
        start_x = action.parameters.get('start_x')
        start_y = action.parameters.get('start_y')
        end_x = action.parameters.get('end_x')
        end_y = action.parameters.get('end_y')
      elif 'touch_xy' in action.parameters and 'lift_xy' in action.parameters:
        # drag_and_drop uses touch_xy and lift_xy
        touch_xy = action.parameters.get('touch_xy')
        lift_xy = action.parameters.get('lift_xy')
        if isinstance(touch_xy, (list, tuple)) and len(touch_xy) >= 2:
          start_x = touch_xy[0]
          start_y = touch_xy[1]
        if isinstance(lift_xy, (list, tuple)) and len(lift_xy) >= 2:
          end_x = lift_xy[0]
          end_y = lift_xy[1]
    
    # If not in parameters, try to extract from original_code
    if x is None and original_code:
      # Pattern: env_op.click(x, y) or env_op.long_press(x, y)
      coord_match = re.search(r'env_op\.(click|long_press)\(\s*(\d+)\s*,\s*(\d+)', original_code)
      if coord_match:
        x = int(coord_match.group(2))
        y = int(coord_match.group(3))
      
      # Pattern: env_op.input_text(text, x, y, clear_text) - text is first param, then x, y
      input_text_match = re.search(r'env_op\.input_text\([^,]+,\s*(\d+)\s*,\s*(\d+)', original_code)
      if input_text_match:
        x = int(input_text_match.group(1))
        y = int(input_text_match.group(2))
      
      # Pattern: env_op.swipe(start_x, start_y, end_x, end_y)
      swipe_match = re.search(r'env_op\.swipe\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)', original_code)
      if swipe_match:
        start_x = int(swipe_match.group(1))
        start_y = int(swipe_match.group(2))
        end_x = int(swipe_match.group(3))
        end_y = int(swipe_match.group(4))
      
      # Pattern: env_op.drag_and_drop(start_x, start_y, end_x, end_y) or drag_and_drop(start_index, end_index)
      drag_match = re.search(r'env_op\.drag_and_drop\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)', original_code)
      if drag_match:
        # Coordinate-based drag_and_drop
        start_x = int(drag_match.group(1))
        start_y = int(drag_match.group(2))
        end_x = int(drag_match.group(3))
        end_y = int(drag_match.group(4))
      else:
        # Check for index-based drag_and_drop: drag_and_drop(start_index, end_index)
        drag_index_match = re.search(r'env_op\.drag_and_drop\(\s*(\d+)\s*,\s*(\d+)', original_code)
        if drag_index_match:
          # Index-based drag_and_drop - we'll handle this differently
          start_index = int(drag_index_match.group(1))
          end_index = int(drag_index_match.group(2))
          # Note: For index-based drag_and_drop, we can't extract coordinates here
          # The translation will need to convert indices to element descriptions
    
    # Find elements at coordinates
    info_parts = []
    
    if x is not None and y is not None:
      # For click/long_press/input_text
      element = agent_utils.find_element_at_coordinate(x, y, ui_content_full_dict)
      if element:
        elem_desc = f"Element at coordinate ({x}, {y}): index={element.get('index', 'N/A')}, "
        if element.get('text'):
          elem_desc += f"text='{element['text']}', "
        if element.get('content_description'):
          elem_desc += f"content_description='{element['content_description']}', "
        elem_desc += f"is_clickable={element.get('is_clickable', False)}"
        info_parts.append(f"Coordinate ({x}, {y}) → {elem_desc}")
      else:
        info_parts.append(f"Coordinate ({x}, {y}) → No UI element found at this coordinate (may be blank area)")
    
    if start_x is not None and start_y is not None:
      # For swipe or drag_and_drop
      start_elem = agent_utils.find_element_at_coordinate(start_x, start_y, ui_content_full_dict)
      end_elem = agent_utils.find_element_at_coordinate(end_x, end_y, ui_content_full_dict) if end_x and end_y else None
      
      # Determine action type from original_code
      action_type = "Swipe"
      if original_code and 'drag_and_drop' in original_code:
        action_type = "Drag and drop"
      
      action_info = f"{action_type} from ({start_x}, {start_y}) to ({end_x}, {end_y}):\n"
      if start_elem:
        action_info += f"  Start point → Element index={start_elem.get('index', 'N/A')}, text='{start_elem.get('text', '')}', is_clickable={start_elem.get('is_clickable', False)}\n"
      else:
        action_info += f"  Start point → No element (blank area)\n"
      
      if end_elem:
        action_info += f"  End point → Element index={end_elem.get('index', 'N/A')}, text='{end_elem.get('text', '')}', is_clickable={end_elem.get('is_clickable', False)}"
      else:
        action_info += f"  End point → No element (blank area)"
      
      info_parts.append(action_info)
    
    return "\n".join(info_parts) if info_parts else None
  
  def _format_ui_elements_for_translator(self, observation: Any) -> str:
    """
    Format UI elements from observation for ActionTranslator.
    
    Args:
      observation: RawObservation object (ScreenObs)
      
    Returns:
      Formatted string of UI elements
    """
    # Prefer ui_content_full_dict if available (most complete and consistent)
    if hasattr(observation, 'ui_content_full_dict') and observation.ui_content_full_dict:
      return agent_utils._generate_ui_elements_description_str(observation.ui_content_full_dict)
    
    # Fallback to ui_content_simple_str if available
    if hasattr(observation, 'ui_content_simple_str') and observation.ui_content_simple_str:
      return observation.ui_content_simple_str
    
    # Last resort: convert ui_elements if available
    if hasattr(observation, 'ui_elements') and observation.ui_elements and len(observation.ui_elements) > 0:
      # Check if it's dict format or UIElement objects
      if isinstance(observation.ui_elements[0], dict):
        # Already dict format
        return agent_utils._generate_ui_elements_description_str(observation.ui_elements)
      else:
        # UIElement objects - need screen_size for conversion
        # Try to infer screen_size from ui_elements' bbox, or use default
        screen_size = None
        if observation.ui_elements:
          max_x, max_y = 0, 0
          for elem in observation.ui_elements:
            bbox = getattr(elem, 'bbox_pixels', None)
            if bbox:
              x_max = getattr(bbox, 'x_max', 0)
              y_max = getattr(bbox, 'y_max', 0)
              max_x = max(max_x, x_max)
              max_y = max(max_y, y_max)
          if max_x > 0 and max_y > 0:
            screen_size = (max_x, max_y)
        
        # Fallback to default if couldn't infer
        if not screen_size:
          screen_size = (1080, 2400)  # Default fallback
        
        return agent_utils._generate_ui_elements_description_str(
          observation.ui_elements, screen_size
        )
    
    return "UI elements not available"