import os
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache
from typing import Any, Tuple

import cv2
import numpy as np
import regex
from absl import flags
from types import SimpleNamespace

from ..env_operation import EnvOperation
from ..prompts.action_translator_prompt import get_action_translator_prompt
from ..prompts.breakpoint_analyzer_prompt import get_breakpoint_analyzer_prompt
from ..prompts.concluder_prompt import get_concluder_prompt
from ..utils import agent_utils, models
from ..utils.llm_client import OpenAIWrapper
from ..utils.models import ScreenObs
from ..utils.agent_utils import print_with_color
from ..action_translators.registry import ActionTranslatorRegistry
from ..action_translators.react_star import ReactStarActionTranslator

# Shared absl flag registry; values such as FLAGS.log_folder_exp and
# FLAGS.use_action_translator are read lazily inside the methods below.
FLAGS = flags.FLAGS
# Import-time side effect: register a factory under the 'react_star' key that
# builds a ReactStarActionTranslator from an Agent_RPA instance.
ActionTranslatorRegistry.register('react_star', lambda agent_rpa: ReactStarActionTranslator(agent_rpa))


# =============================================================================
# ARCHITECTURE NOTE: Decoupled Design
# =============================================================================
# Agent_RPA is now focused on core RPA functionality:
#   1. RPA execution (rpa_testing, Execute_rpa_code)
#   2. ActionTranslator
#   3. Concluder (shared across workflows)
#
# ReAct-specific logic (Planner, Summarizer) has been moved to ReactStarAgent.
# Agent_RPA no longer inherits from android_world.agents.base_agent to maintain
# clean separation from AndroidWorld framework.
#
# GUI agent adapters (ReactStarAgent, DroidRunAdapter) wrap Agent_RPA and
# implement the BaseGUIAgent interface for pluggable agent architecture.
# =============================================================================


class Agent_RPA:
  def __init__(
    self,
    env_op: EnvOperation,
    default_llm: OpenAIWrapper,
    name: str = 'Agent_RPA',
    concluder_llm: OpenAIWrapper | None = None,
    actiontranslator_llm: OpenAIWrapper | None = None,
    breakpoint_analyzer_llm: OpenAIWrapper | None = None,
    params_extractor_llm: OpenAIWrapper | None = None,
    gui_agent_type: str | None = None,
  ) -> None:
    """Initializes Agent_RPA.

    Args:
      env_op: The environment operation wrapper used to execute code/actions.
      default_llm: The default LLM wrapper (main LLM). Used for every role
        whose dedicated wrapper is not supplied.
      name: The agent name.
      concluder_llm: The LLM wrapper for Concluder (defaults to default_llm).
      actiontranslator_llm: The LLM wrapper for ActionTranslator (defaults to
        default_llm).
      breakpoint_analyzer_llm: The LLM wrapper for Breakpoint Analyzer
        (defaults to default_llm).
      params_extractor_llm: The LLM wrapper for Params Extractor (defaults to
        default_llm).
      gui_agent_type: Key used to pick the ActionTranslator implementation,
        e.g. 'react_star', 'droidrun', 'askui'. May be set later through
        set_gui_agent_type().

    Note:
      Planner and Summarizer are now part of ReactStarAgent, not Agent_RPA core.
      This allows Agent_RPA to focus on core RPA functionality.
      Agent_RPA no longer inherits from android_world base classes for clean decoupling.

      Several attributes used by other methods (log_task_path, completed_tasks,
      previous_plan, screen_changes, flag_done, agent_traj,
      _action_translation_context) are NOT created here; they first appear in
      reset().
    """
    self.name = name
    self.env_op = env_op
    self.default_llm = default_llm
    # Each role-specific wrapper falls back to the default LLM when not provided.
    self.concluder_llm = concluder_llm if concluder_llm is not None else default_llm
    self.actiontranslator_llm = actiontranslator_llm if actiontranslator_llm is not None else default_llm
    self.breakpoint_analyzer_llm = breakpoint_analyzer_llm if breakpoint_analyzer_llm is not None else default_llm
    self.params_extractor_llm = params_extractor_llm if params_extractor_llm is not None else default_llm
    # Used to select the proper ActionTranslator implementation (tied to GUI agent action space).
    # Example: 'react_star', 'droidrun', 'askui'
    self.gui_agent_type = gui_agent_type
    
    # Must be assigned by the caller before rpa_testing(), which reads rpa_bank.rpa_dict.
    self.rpa_bank = None
    
    # Set within run_tasks_*.py
    self.cur_task_type = None
    self.cur_task = None  # the current task description
    self.rpa_mode = False  # False for ReAct, True for RPA Verification or RPA Testing
    # NOTE(review): initialised as a str, but set_task_guidelines() is annotated
    # to store a list[str] here — confirm which shape consumers expect.
    self.additional_guidelines = ''
    self.action_history = []
    self.reflection = None  # reset in run_tasks_react.py
    self.reflection_history = []
    
    # Mutable record that Execute_rpa_code fills in and hands to agent_utils.record_cost_tokens.
    self.record_token = models.RecordToken()
    
    # Image cache for performance optimization
    # NOTE(review): not referenced in the visible part of this file; the image
    # caching seen here goes through the lru_cache on _load_and_resize_image_cached.
    self._image_cache = {}

  def set_gui_agent_type(self, gui_agent_type: str | None) -> None:
    """Set GUI agent type to select the right ActionTranslator.

    Action translation is tightly coupled with the GUI agent's action space,
    so we must choose translator implementation accordingly.
    """
    self.gui_agent_type = gui_agent_type
  
  @staticmethod
  @lru_cache(maxsize=128)
  def _load_and_resize_image_cached(image_path: str, target_size: Tuple[int, int]) -> np.ndarray:
    """Load an image from disk and resize it, memoising the result.

    Args:
      image_path: Path handed unchanged to agent_utils.load_image_as_ndarray,
        which is responsible for resolving relative vs absolute paths.
      target_size: (width, height) tuple forwarded to cv2.resize.

    Returns:
      The resized image, or None when the loader returns None or when loading
      or resizing raises.

    Note:
      Results are memoised by lru_cache (up to 128 distinct
      (image_path, target_size) pairs). Because the cache stores whatever is
      returned, a None produced by a failed load is remembered as well, and
      every caller asking for the same key receives the same array object.
    """
    try:
      loaded = agent_utils.load_image_as_ndarray(image_path)
      return None if loaded is None else cv2.resize(loaded, target_size)
    except Exception as e:
      # Best effort: report the problem and let the caller carry on without an image.
      print_with_color(f"Failed to load/resize image {image_path}: {e}", 'yellow')
      import traceback
      traceback.print_exc()
      return None
  
  def set_task_guidelines(self, task_guidelines: list[str]) -> None:
    self.additional_guidelines = task_guidelines
  
  def reset(self, task, log_task_path: str, to_init_task: bool = True):
    self.log_task_path = log_task_path
    os.makedirs(self.log_task_path, exist_ok=True)
    
    self.cur_task_type = task.name
    # self.cur_task = task.goal # set in episode_runner.py, after env_op.reset()
    
    if to_init_task:
      # reset to initial state for each ReAct trial
      self.previous_plan = 'You are just starting the task, and no previous plan exists.'
      self.completed_tasks = ['You just started, no plan has been completed yet.']
      self.screen_changes = 'You just started, no actions has been performed yet.'
      self.action_history = []  # store execution_info (get from env_op and summarizer_agent)
      self._action_translation_context = []  # store context for batch action translation
    else:
      self.previous_plan = 'You ran the rpa code generated earlier.'
      # No need to reset completed_tasks. It will be set in run_tasks_rpa.py
      self.screen_changes = 'Because the code was run directly, no screen_changes were recorded. The screen prior to execution was the desktop.'
      # No need to reset action_history and env_op when Breakpoint_Analyzer_Agent output 'Y'
    
    self.flag_done = False
    self.agent_traj = []  # store ReActStepInfo
    
    return len(self.action_history)
  
  def rpa_testing(self) -> models.RPAExecTraj:
    """Run the stored RPA for the current task type and package the outcome.

    Looks up `self.cur_task_type` in `self.rpa_bank.rpa_dict`. When no RPA is
    stored for it, an empty `models.RPAExecTraj()` is returned. Otherwise the
    RPA is executed through `Execute_rpa_code`, and `self.action_history` and
    `self.flag_done` are updated from the result.

    Returns:
      The RPAExecTraj describing this execution (or an empty one).
    """
    print_with_color('============================================', 'magenta')
    print_with_color(f"Current Stage: {self.rpa_mode}\n", 'magenta')

    known_rpas = self.rpa_bank.rpa_dict
    task_type = self.cur_task_type
    if task_type not in known_rpas:
      print_with_color("rpa doesn't exist", 'magenta')
      return models.RPAExecTraj()

    # -----start: RPA Verification and Testing
    selected_rpa = known_rpas[task_type]
    function_call, exec_traj, exec_result = self.Execute_rpa_code(
      task=self.cur_task,
      execution_file_path=self.log_task_path,
      rpa=selected_rpa,
    )
    print(f'answer return: {exec_result.answer_return}\n')

    traj_fields = {
      'task': self.cur_task,
      'function_call': function_call,
      'rpa_code': selected_rpa['rpa_code'],
      'exec_result': exec_result,
      # The history comes from env_op, not from this agent's own list.
      'action_history': self.env_op.action_history,
      'traj': exec_traj,
    }
    rpa_exec_traj = models.RPAExecTraj(**traj_fields)

    # Read the history back from the trajectory object so the agent holds
    # exactly what the trajectory stores.
    self.action_history = rpa_exec_traj.action_history
    # flag_done reflects the agent actively finishing (answer/stop action),
    # not the environment merely halting (e.g. on max steps).
    self.flag_done = exec_result.agent_done
    return rpa_exec_traj
  
  # =========================================================================
  # Core RPA Execution Methods
  # =========================================================================
  
  def Execute_rpa_code(
    self,
    task: str,
    execution_file_path: str,
    rpa: Any,
  ):
    """Extract call arguments for a stored RPA with an LLM, then execute it.

    Steps:
      1. Ask `self.params_extractor_llm` to build a function call for `task`
         from the RPA's description, parameter spec and example usage. The
         prompt and the model output are written under `execution_file_path`.
      2. Record the token usage of that request via
         `agent_utils.record_cost_tokens`.
      3. Normalise the call: the function name is taken from the RPA's
         example usage and the argument list from the model's answer; raw
         newlines inside string literals are escaped.
      4. Run the RPA code through `self.env_op.execute_code`, passing the
         call string as the `function_call` variable.

    Args:
      task: Natural-language description of the new task.
      execution_file_path: Directory used for prompt/output dumps and passed
        to `execute_code` as `save_path`.
      rpa: Mapping with the keys 'rpa_description', 'rpa_params', 'rpa_code'
        and 'example_usage'.

    Returns:
      Tuple `(function_call, exec_traj, exec_result)`; the last two are
      whatever `env_op.execute_code` returns.
    """
    print_with_color('============================================', 'cyan')
    print_with_color("Current Module: RPA Exec\n", 'cyan')
    
    rpa_description = rpa['rpa_description']
    rpa_params = rpa['rpa_params']
    rpa_code = rpa['rpa_code']
    rpa_example = rpa['example_usage']
    
    # -----start: use llm to extract parameters for function_call
    # based on 'rpa_description', 'parameters', 'new task' to extract current parameters
    system_prompt = (
      "You are an expert in extracting task parameters for RPA functions. "
      "Your task is to accurately extract the required parameters for a new task, "
      "based on the provided rpa_description and rpa_parameters format.\n\n"
      "[Output Format]\n"
      "Extract the appropriate parameters from the **New Task** according to the rpa_parameters specification, "
      "and construct a function call following the **Example Usage**.\n"
      "**Only include parameters explicitly mentioned in the New Task. Omit missing parameters (functions have defaults).**\n"
      "Do not change the function name or any of the parameter names in **Example Usage** under any circumstances.\n\n"
      
      "Output Example:\n"
      "{\n"
      '  "function_call": "delete_file(file_name=\'record.txt\')"\n'
      "}\n"
      "Example with multi-line text:\n"
      "{\n"
      '  "function_call": "add_header(file_name=\'note.md\', header=\'Line 1\\nLine 2\')"\n'
      "}"
    )
    user_prompt = (
      f"rpa_description: {rpa_description}\n"
      "rpa_parameters:\n"
      f"{rpa_params}\n"
      f"Example Usage: {rpa_example}\n"
      f"New Task: {task}\n\n")
    # Persist the exact prompt and the model output next to the execution logs.
    agent_utils.write_to_file(file_path=execution_file_path, file_name='0_extract_params_prompt.txt', content=f"[system]\n{system_prompt}\n\n[user]\n{user_prompt}")
    output, raw_response = self.params_extractor_llm.predict_mm(user_prompt=user_prompt, images=[], system_prompt=system_prompt, output_format=models.ParamsExtractionOutput)
    agent_utils.write_to_file(file_path=execution_file_path, file_name='0_extract_params_output.txt', content=output)
    
    # Token accounting: the shared record_token object is filled in field by
    # field and then handed to record_cost_tokens.
    cost_tokens = raw_response.usage
    self.record_token.step = '0'
    self.record_token.file_path = FLAGS.log_folder_exp  # Set log path for token recording
    self.record_token.agent = 'extract params'
    self.record_token.step_tokens = cost_tokens
    self.record_token.llm = self.params_extractor_llm.model_name
    agent_utils.record_cost_tokens(self.record_token)
    
    function_call = output.function_call
    if function_call:
      try:
        # Trust the function name from the stored example rather than the one
        # the model wrote; only the model's argument list is kept.
        func_name = agent_utils.extract_function_names(rpa_example)
        pattern = r'\((?:[^()]+|(?R))*\)'  # (?R) is recursive
        # First balanced "(...)" group in the model's call; IndexError when
        # there is none, which lands in the except branch below.
        params = (regex.findall(pattern, function_call))[0]
      except Exception as e:
        # Best effort: on any failure the model's call is used verbatim.
        print(f'Error occurred when extracting params: {e}')
      else:
        function_call = f'{func_name}{params}'
      
      # Escape newlines in string literals within function_call
      # This handles cases where LLM outputs actual newlines instead of \n
      function_call = self._escape_newlines_in_function_call(function_call)
    
    print_with_color(f"Generated function call:\n{function_call}", 'cyan')
    # -----end: use llm to extract parameters for function_call
    
    # The RPA code receives the call string through the 'function_call' variable.
    exec_traj, exec_result = self.env_op.execute_code(code=rpa_code, vars={"function_call": function_call},
                                                 save_path=execution_file_path, flag_exec_rpa=True)
    
    return function_call, exec_traj, exec_result
  
  def _escape_newlines_in_function_call(self, function_call: str) -> str:
    """
    Escape actual newlines in string literals within function_call.
    
    This handles cases where LLM outputs actual newlines instead of \\n in string parameters.
    Similar to how input_text() uses .replace("\\n", "\\\\n") on the entire action_code string.
    For example: func(text='Line 1\nLine 2') -> func(text='Line 1\\nLine 2')
    
    Args:
      function_call: Function call string that may contain unescaped newlines in string literals
      
    Returns:
      Function call string with newlines properly escaped in string literals
    """
    result = []
    i = 0
    in_string = False
    quote_char = None
    
    while i < len(function_call):
      char = function_call[i]
      
      if not in_string:
        # Not in a string, look for string start
        if char in ("'", '"'):
          in_string = True
          quote_char = char
          result.append(char)
        else:
          result.append(char)
        i += 1
      else:
        # In a string, look for string end or escape sequences
        if char == '\\':
          # Escape sequence, copy both characters (e.g., \\n, \\t, etc.)
          result.append(char)
          if i + 1 < len(function_call):
            result.append(function_call[i + 1])
            i += 2
          else:
            i += 1
        elif char == quote_char:
          # End of string
          result.append(char)
          in_string = False
          quote_char = None
          i += 1
        elif char == '\n':
          # Actual newline in string (not escaped), escape it
          result.append('\\n')
          i += 1
        else:
          # Regular character in string
          result.append(char)
          i += 1
    
    return ''.join(result)
  
  # =========================================================================
  # Batch Action Translation (Execute Before RPA Builder)
  # =========================================================================
  
  def _build_action_translation_context_from_react_traj(
    self,
    react_traj: models.ReActTraj
  ) -> list[dict]:
    """
    Build action translation context from ReActTraj for batch translation.

    This is AutoRPA's responsibility, not GUI agent's.

    For every step the method gathers what the ActionTranslator needs: the
    resized "before" screenshots (plain and set-of-marks), the UI description,
    the recorded action, and two flags that decide later whether the step is
    translated at all.

    Args:
      react_traj: ReActTraj to build context from

    Returns:
      List of context dictionaries, one per step, with the keys
      'step_n', 'step_data', 'related_element', 'related_index',
      'is_screen_changed' and 'is_final_step'.
    """
    target_size = (461, 1024)  # (width, height) passed on to cv2.resize
    total_steps = len(react_traj.traj)
    # (exec_step_info attribute, label used in log messages)
    screenshot_sources = (
      ('before_screenshot_path', 'screenshot'),
      ('before_screenshot_w_som_path', 'SoM screenshot'),
    )
    translation_context = []

    for step_info in react_traj.traj:
      # Every exec_step_info field is optional: read all of them defensively.
      exec_step_info = getattr(step_info, 'exec_step_info', None)

      is_screen_changed = (
        getattr(exec_step_info, 'is_screen_changed', False) if exec_step_info else False
      )

      # Prefer the soft-coded action, fall back to the hard-coded one; a step
      # with neither is treated as an empty action instead of raising.
      action_code = step_info.soft_coded_action or step_info.hard_coded_action or ''
      # Any mention of "stop" (which covers 'env_op.stop') or being the last
      # step by number marks the step as final. step_n is compared against the
      # step count, i.e. it is expected to be 1-based.
      is_final_step = (
        'stop' in action_code.lower() or
        step_info.step_n == total_steps
      )

      # Load screenshots through the cached loader; blank paths are skipped.
      loaded_images = {}
      for path_attr, label in screenshot_sources:
        image = None
        path = getattr(exec_step_info, path_attr, None) if exec_step_info else None
        if path and path.strip():
          image = self._load_and_resize_image_cached(path, target_size)
          if image is not None:
            print_with_color(f"    ✓ Loaded {label} for step {step_info.step_n}", 'green')
          else:
            print_with_color(f"    ⚠️  Failed to load {label} for step {step_info.step_n}", 'yellow')
        loaded_images[path_attr] = image

      # Prefer the structured UI dump for the translator prompt; fall back to
      # related_elements (often empty in traj banks).
      ui_full_dict = (
        getattr(exec_step_info, 'before_ui_content_full_dict', None)
        if exec_step_info else None
      )
      ui_simple_str = ""
      if ui_full_dict:
        try:
          ui_simple_str = agent_utils._generate_ui_elements_description_str(ui_full_dict)
        except Exception as e:
          # Best effort: report it and use the related_elements fallback below.
          print_with_color(
            f"    ⚠️  Failed to describe UI elements for step {step_info.step_n}: {e}",
            'yellow'
          )
          ui_simple_str = ""
      if not ui_simple_str:
        ui_simple_str = (
          (getattr(exec_step_info, 'related_elements', None) or "")
          if exec_step_info else ""
        )

      # V2 ReActStepInfo no longer stores a ui_content string; related_elements
      # is not the full UI and is kept in simple_str for display/debug only.
      obs = models.ScreenObs(
        ui_content_simple_str=ui_simple_str,
        ui_content_full_dict=ui_full_dict,
        screenshot_resized=loaded_images['before_screenshot_path'],
        screenshot_with_som_resized=loaded_images['before_screenshot_w_som_path']
      )

      # Minimal struct handed to the ActionTranslator.
      step_data = SimpleNamespace(
        obs=obs,
        obs_description=step_info.obs_description or "",
        action_reason=step_info.action_reason or "",
        # Keep both names for compatibility across translators
        action=step_info.hard_coded_action,
        hard_coded_action=step_info.hard_coded_action,
        exec_step_info=exec_step_info,
      )

      translation_context.append({
        'step_n': step_info.step_n,
        'step_data': step_data,
        'related_element': getattr(exec_step_info, 'related_elements', "") if exec_step_info else "",
        'related_index': getattr(exec_step_info, 'related_target', None) if exec_step_info else None,
        'is_screen_changed': is_screen_changed,
        'is_final_step': is_final_step,
      })

    return translation_context
  
  def _translate_single_step(
    self,
    goal: str,
    step_n: int,
    step_data: Any,
    related_element: str,
    related_index: int,
    log_path: str
  ) -> tuple[str, str | None, int | None]:
    """Helper method to translate a single step (for parallel processing).
    
    Args:
      goal: Task goal
      step_n: Step number
      step_data: Step data containing observation and action
      related_element: Related UI element description
      related_index: Related element index
      log_path: Log path for saving translation details
      
    Returns:
      Tuple of (translated_action, matched_related_element, matched_related_index)
    """
    try:
      return self.ActionTranslator_Agent(
        goal=goal,
        step_n=step_n,
        step_data=step_data,
        related_element=related_element,
        related_index=related_index,
        log_path=log_path,
        suppress_header=True  # Suppress header for parallel processing
      )
    except Exception as e:
      # Provide more detailed error information
      import traceback
      error_details = traceback.format_exc()
      print_with_color(f"❌ Error translating step {step_n}: {e}", 'red')
      print_with_color(f"Screenshot available: {step_data.obs.screenshot_with_som_resized is not None}", 'yellow')
      raise Exception(f"Step {step_n} translation failed: {e}\n{error_details}")
  
  def _resolve_translation_log_path(self, log_path: str | None) -> str:
    """Pick the directory for translation logs, falling back to a temp dir."""
    # log_task_path only exists after reset(); tolerate it being absent.
    effective_log_path = log_path or getattr(self, 'log_task_path', None)
    if not effective_log_path or not effective_log_path.strip():
      print_with_color(
        "⚠️  Warning: No valid log path available for action translation. "
        "Translation logs will not be saved.",
        'yellow'
      )
      # Use a temporary fallback path
      effective_log_path = os.path.join(tempfile.gettempdir(), 'action_translation_logs')
      os.makedirs(effective_log_path, exist_ok=True)
    return effective_log_path

  @staticmethod
  def _apply_translation_result(step_info: Any, result: Any) -> Any:
    """Store one translator result on `step_info`; return the translated action."""
    if not isinstance(result, tuple):
      # Fallback for old return format (should not happen)
      step_info.soft_coded_action = result
      return result

    translated_action, matched_related_element, matched_related_index = result
    step_info.soft_coded_action = translated_action

    # Update exec_step_info with the matched element/index so the builder can
    # use them without modifying react_trajs_bank.json.
    exec_step_info = getattr(step_info, 'exec_step_info', None)
    if matched_related_element is not None and exec_step_info:
      exec_step_info.related_elements = matched_related_element
      if matched_related_index is not None:
        exec_step_info.related_target = matched_related_index
    return translated_action

  def batch_translate_actions(
    self,
    react_trajs: list[models.ReActTraj],
    log_path: str | None = None,
    use_parallel: bool = True,
    max_workers: int = 4
  ) -> list[models.ReActTraj]:
    """
    Translate all actions in ReAct trajectories before RPA Builder.

    This is called before RPA Builder to process actions in batch
    rather than during RPA Builder execution.

    Only the LAST trajectory in `react_trajs` is translated; earlier ones are
    returned untouched. Within it, a step is translated only when its screen
    changed or it is the final step. Step objects are updated in place
    (`soft_coded_action`, plus `related_elements` / `related_target` on
    `exec_step_info` when the translator matched an element) and are also
    returned inside a newly built ReActTraj.

    Args:
      react_trajs: List of ReAct trajectories to translate
      log_path: Optional log path for translation results; falls back to
        `self.log_task_path`, then to a directory under the system temp dir
      use_parallel: Whether to use parallel processing (default: True)
      max_workers: Maximum number of parallel workers (default: 4)

    Returns:
      List of ReAct trajectories with updated soft_coded_action fields. When
      FLAGS.use_action_translator is off, `react_trajs` itself is returned.

    Note:
      Parallel processing can significantly speed up translation for large
      trajectories, but LLM API rate limits may apply.
      In parallel mode a failing step is reported and keeps its original
      action; in sequential mode a translator exception propagates to the
      caller.
    """
    if not FLAGS.use_action_translator:
      # Action translation disabled, return as-is
      print_with_color("Action translation disabled, skipping batch translation.", 'yellow')
      return react_trajs

    print_with_color("\n" + "="*80, 'blue')
    print_with_color("🔧 Batch Action Translation (Before RPA Builder)", 'blue')
    print_with_color("="*80, 'blue')

    translated_trajs = []

    # Use only the passed react_trajs to build context; no extra reads
    action_translation_context = []
    if react_trajs:
      print_with_color("📦 Building action translation context from provided ReActTraj...", 'cyan')
      # Context is built from the last (most recent) trajectory only.
      action_translation_context = self._build_action_translation_context_from_react_traj(react_trajs[-1])
      print_with_color(
        f"📦 Built action translation context with {len(action_translation_context)} steps",
        'cyan'
      )

    last_traj_idx = len(react_trajs) - 1
    for traj_idx, react_traj in enumerate(react_trajs):
      if traj_idx != last_traj_idx or not action_translation_context:
        # Keep trajectory as-is (no translation context or not the last trajectory)
        print_with_color(f"\n  Keeping trajectory {traj_idx + 1}/{len(react_trajs)} as-is (no translation)", 'grey')
        translated_trajs.append(react_traj)
        continue

      print_with_color(f"\n📋 Processing trajectory {traj_idx + 1}/{len(react_trajs)}:", 'cyan')

      # Map step_n -> context for lookup while walking the trajectory.
      context_map = {ctx['step_n']: ctx for ctx in action_translation_context}

      print_with_color(
        f"   Steps to process: {len(react_traj.traj)} | "
        f"Context available: {len(context_map)} | "
        f"Workers: {max_workers if use_parallel else 1}",
        'grey'
      )

      effective_log_path = self._resolve_translation_log_path(log_path)

      if use_parallel:
        # Collect (trajectory index, context) for the steps that need translation.
        pending = []
        for idx, step_info in enumerate(react_traj.traj):
          ctx = context_map.get(step_info.step_n)
          if ctx and (ctx['is_screen_changed'] or ctx['is_final_step']):
            pending.append((idx, ctx))

        print_with_color(f"\n   ⚙️  Translating {len(pending)} step(s) in parallel...", 'cyan')
        translation_results = {}
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
          future_to_idx = {
            executor.submit(
              self._translate_single_step,
              react_traj.task,
              ctx['step_n'],
              ctx['step_data'],
              ctx['related_element'],
              ctx['related_index'],
              effective_log_path,
            ): idx
            for idx, ctx in pending
          }

          # Collect results as they finish; a failed step is recorded as None
          # so the remaining steps still get translated.
          for future in as_completed(future_to_idx):
            idx = future_to_idx[future]
            try:
              translation_results[idx] = future.result()
            except Exception as e:
              print_with_color(
                f"    ❌ Failed to translate step {react_traj.traj[idx].step_n}: {e}",
                'red'
              )
              translation_results[idx] = None

        # Display results in step order
        print_with_color("\n  📝 Translation Results:", 'cyan')
        print_with_color("  " + "="*78, 'grey')
        for idx in sorted(translation_results):
          step_info = react_traj.traj[idx]
          result = translation_results[idx]
          # V2: keep original hard-coded action; translated soft-coded action stored separately.
          original_action = step_info.hard_coded_action

          if result:
            translated_action = result[0] if isinstance(result, tuple) else result
            print_with_color(f"\n  ✅ Step {step_info.step_n}: {original_action}", 'cyan')
            print_with_color(f"  {'─' * 78}", 'grey')
            # A missing translated action must not break the report.
            for line in str(translated_action or '').split('\n'):
              if line.strip():
                print_with_color(f"     {line}", 'green')
            print_with_color(f"  {'─' * 78}", 'grey')
          else:
            print_with_color(f"\n  ❌ Step {step_info.step_n}: {original_action}", 'cyan')
            print_with_color(f"     Translation failed", 'red')
            print_with_color(f"  {'─' * 78}", 'grey')

        # Apply the successful results; failed steps keep their original action.
        for idx, result in translation_results.items():
          if result:
            self._apply_translation_result(react_traj.traj[idx], result)
      else:
        # Sequential translation (original behavior)
        for step_info in react_traj.traj:
          ctx = context_map.get(step_info.step_n)

          if not ctx:
            # No context available, use original action
            print_with_color(
              f"    ⚠ No context for step {step_info.step_n}, keeping original action",
              'yellow'
            )
            continue

          if not (ctx['is_screen_changed'] or ctx['is_final_step']):
            # Keep original action
            print_with_color(
              f"    - Skipped step {ctx['step_n']} (no screen change)",
              'grey'
            )
            continue

          result = self.ActionTranslator_Agent(
            goal=react_traj.task,
            step_n=ctx['step_n'],
            step_data=ctx['step_data'],
            related_element=ctx['related_element'],
            related_index=ctx['related_index'],
            log_path=effective_log_path
          )
          translated_action = self._apply_translation_result(step_info, result)

          # str() keeps the preview from crashing when the translator returned None.
          print_with_color(
            f"    ✓ Translated step {ctx['step_n']}: {str(translated_action)[:60]}...",
            'green'
          )

      # Create new trajectory carrying the (in-place updated) steps.
      translated_trajs.append(models.ReActTraj(
        task=react_traj.task,
        reflection=react_traj.reflection,
        traj=list(react_traj.traj),
        action_history=react_traj.action_history,
        env_success_score=react_traj.env_success_score,
        agent_done_bool=react_traj.agent_done_bool,
        final_success_score=react_traj.final_success_score,
        final_success_bool=react_traj.final_success_bool,
        conclusion=react_traj.conclusion
      ))

    print_with_color("\n" + "="*80, 'blue')
    print_with_color("✅ Batch Action Translation Complete", 'blue')
    print_with_color("="*80 + "\n", 'blue')

    # Clear translation context after processing
    if hasattr(self, '_action_translation_context'):
      self._action_translation_context = []

    return translated_trajs
  
  def Concluder_Agent(
    self,
    goal: str,
    log_task_path: str,
    episode_results: models.EpisodeResult,
    screenshot_resized = None,
  ) -> models.ConcluderOutput:
    """
    Concluder Agent for generating reflection after task completion.
    
    NOTE: This is also a ReAct-specific method, used for generating reflections
    to improve performance in subsequent rounds.
    """
    print('--------------------------------------------')
    print("Current Agent: Concluder\n")
    
    is_success = False
    if episode_results.final_success_bool:
      is_success = True
    
    if episode_results.env_success_score == 1.0:
      benchmark_feedback = "the benchmark judged the task as fully completed"
    elif episode_results.env_success_score == 0.0:
      benchmark_feedback = "the benchmark judged the task as not completed at all"
    else:
      benchmark_feedback = f"the benchmark judged the task as partially completed (approximately {episode_results.env_success_score * 100:.0f}%)"
    
    if episode_results.agent_done_bool:
      agent_feedback = "the agent believes the task was completed"
    else:
      agent_feedback = "the agent believes the task is still incomplete"
    
    if ((episode_results.env_success_score == 1.0 and episode_results.agent_done_bool) or
        (episode_results.env_success_score == 0.0 and not episode_results.agent_done_bool)):
      conjunction = "and"
    else:
      conjunction = "while"
    
    env_feedback = f"Regarding the task outcome: {benchmark_feedback}, {conjunction} {agent_feedback}.\n"
    
    # Save debug information before generating prompt
    debug_info = f"""Concluder Debug Information
============================

Goal: {goal}

Completed Tasks:
{self.completed_tasks[-1] if self.completed_tasks else 'No completed tasks'}

Action History ({len(self.action_history)} steps):
{chr(10).join(self.action_history) if self.action_history else 'No actions performed'}

Reflection History ({len(self.reflection_history)} reflections):
{chr(10).join(self.reflection_history) if self.reflection_history else 'No previous reflections'}

Environment Feedback:
{env_feedback}

Is Success: {is_success}
Env Success Score: {episode_results.env_success_score}
Agent Done Bool: {episode_results.agent_done_bool}
Final Success Bool: {episode_results.final_success_bool}
"""
    agent_utils.write_to_file(file_path=log_task_path, file_name='concluder_debug_info.txt', content=debug_info)
    
    # Extract UI content (structured UI dict list) from different possible trajectory shapes.
    # `EpisodeResult.agent_traj` is defined as Union[list[ReActStepInfo], RPAExecTraj] in models,
    # but some call sites may still pass a ReActTraj-like object with `.traj`.
    ui_full = []
    agent_traj = getattr(episode_results, "agent_traj", None)
    try:
      # Case 1: list[ReActStepInfo]
      if isinstance(agent_traj, list):
        if len(agent_traj) > 0:
          last_step = agent_traj[-1]
          exec_info = getattr(last_step, "exec_step_info", None)
          ui_full = (
            getattr(exec_info, "after_ui_content_full_dict", None)
            or getattr(exec_info, "before_ui_content_full_dict", None)
            or []
          )
      # Case 2: RPAExecTraj (traj is list[EnvExecStepInfo])
      elif isinstance(agent_traj, models.RPAExecTraj):
        if agent_traj.traj and len(agent_traj.traj) > 0:
          last_env_step = agent_traj.traj[-1]
          ui_full = (
            getattr(last_env_step, "after_ui_content_full_dict", None)
            or getattr(last_env_step, "before_ui_content_full_dict", None)
            or []
          )
      # Case 3: legacy shape with `.traj` attribute (e.g., ReActTraj)
      elif hasattr(agent_traj, "traj"):
        traj_list = getattr(agent_traj, "traj", None)
        if traj_list and len(traj_list) > 0:
          last_step = traj_list[-1]
          exec_info = getattr(last_step, "exec_step_info", None)
          ui_full = (
            getattr(exec_info, "after_ui_content_full_dict", None)
            or getattr(exec_info, "before_ui_content_full_dict", None)
            or []
          )
    except Exception:
      # Concluder should never crash due to missing/unknown traj shape.
      ui_full = []

    ui_info_str = agent_utils._generate_ui_elements_description_str(ui_full) if ui_full else ""
    
    concluder_prompt_dict = get_concluder_prompt(
      goal=goal,
      completed_tasks=self.completed_tasks[-1],
      action_history=self.action_history,
      ui_info_str=ui_info_str,
      reflection_history=self.reflection_history,
      env_feedback=env_feedback,
      is_success=is_success,
    )
    agent_utils.write_to_file(file_path=log_task_path, file_name='concluder_prompt.txt', 
                              content=f"[system]\n{concluder_prompt_dict['system']}\n\n[user]\n{concluder_prompt_dict['user']}")
    
    concluder_output, raw_response = self.concluder_llm.predict_mm(
      user_prompt=concluder_prompt_dict['user'],
      images=[screenshot_resized if screenshot_resized is not None else self.env_op.cur_obs.screenshot_resized],
      system_prompt=concluder_prompt_dict['system'],
      output_format=models.ConcluderOutput
    )
    agent_utils.write_to_file(file_path=log_task_path, file_name='concluder_raw_output.txt', content=raw_response)
    agent_utils.write_to_file(file_path=log_task_path, file_name='concluder_output.txt', content=concluder_output)
    
    cost_tokens = raw_response.usage
    self.record_token.file_path = FLAGS.log_folder_exp  # Set log path for token recording
    self.record_token.agent = 'Concluder'
    self.record_token.step_tokens = cost_tokens
    self.record_token.llm = self.concluder_llm.model_name
    agent_utils.record_cost_tokens(self.record_token)
    
    if not raw_response:
      print("Error: Didn't get concluder response.")
    
    if concluder_output.reflection is not None:
      self.reflection = concluder_output.reflection
      self.reflection_history.append(concluder_output.reflection)
    
    print(f"concluder_output: {concluder_output}")
    
    return concluder_output
  
  def Breakpoint_Analyzer_Agent(
    self,
    rpa_exec_traj: models.RPAExecTraj,
    log_path: str,
  ) -> models.BreakpointAnalyzerOutput:
    """
    Ask the breakpoint-analyzer LLM to assess an RPA execution trajectory.

    Builds the prompt from `rpa_exec_traj` and the current observation's
    simple UI string, sends it together with the current resized screenshot
    to `self.breakpoint_analyzer_llm`, logs prompt/output under `log_path`,
    records token usage and prints the main fields of the result.

    Args:
      rpa_exec_traj: Execution trajectory passed to the prompt builder.
      log_path: Directory for the prompt/output log files; created if missing.

    Returns:
      The structured output returned by `predict_mm`.
    """
    print('============================================')
    print("Current Agent: Breakpoint_Analyzer_Agent\n")

    # exist_ok avoids the check-then-create race and matches the directory
    # handling used by the action translator.
    os.makedirs(log_path, exist_ok=True)

    current_obs = self.env_op.cur_obs
    ui_content, screenshot_resized = current_obs.ui_content_simple_str, current_obs.screenshot_resized

    # get prompt
    prompt_dict = get_breakpoint_analyzer_prompt(rpa_exec_traj=rpa_exec_traj, ui_content=ui_content)
    agent_utils.write_to_file(file_path=log_path, file_name='Breakpoint_Analyzer_Agent_prompt.txt', 
                              content=f"[system]\n{prompt_dict['system']}\n\n[user]\n{prompt_dict['user']}")

    # call MLLM
    output, raw_response = self.breakpoint_analyzer_llm.predict_mm(
      user_prompt=prompt_dict['user'],
      images=[screenshot_resized],
      system_prompt=prompt_dict['system'],
      output_format=models.BreakpointAnalyzerOutput
    )
    agent_utils.write_to_file(file_path=log_path, file_name='Breakpoint_Analyzer_Agent_output.txt', content=output)

    # Token accounting; this call is not tied to a numbered step, hence '-'.
    self.record_token.step = '-'
    self.record_token.file_path = FLAGS.log_folder_exp  # Set log path for token recording
    self.record_token.agent = 'Breakpoint Analyzer'
    self.record_token.step_tokens = raw_response.usage
    self.record_token.llm = self.breakpoint_analyzer_llm.model_name
    agent_utils.record_cost_tokens(self.record_token)

    print(f'Observations:\n{output.observation}\n')
    print(f'Completed Tasks:\n{output.completed_tasks}\n')
    print(f'Plan Justification:\n{output.plan_reason}\n')
    print(f'Plan List:\n{output.plan_list}\n')
    print(f'Code Diagnosis:\n{output.code_diagnosis}\n')
    print(f'Whether To Continue:\n{output.to_continue}\n')

    return output
  
  # =========================================================================
  # ActionTranslator (Shared across both ReAct and RPA workflows)
  # =========================================================================
  
  def _is_direction_swipe(self, action: str) -> bool:
    """Return True if action is a direction-based swipe (no translation needed)."""
    import re
    swipe_direction_pattern = r'(?:env_op\.)?swipe\s*\(\s*["\']?(up|down|left|right)["\']?\s*\)'
    return re.search(swipe_direction_pattern, action, re.IGNORECASE) is not None

  def _is_coordinate_swipe(self, action: str) -> bool:
    """Return True if action is a coordinate-based swipe (needs translation)."""
    import re
    swipe_coord_pattern = r'(?:env_op\.)?swipe\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)'
    return re.search(swipe_coord_pattern, action) is not None

  def _convert_click_like_coord_to_index_action(
    self,
    action: str,
    ui_content_full_dict: list[dict],
  ) -> tuple[str, int, dict] | None:
    """
    For click/long_press/input_text with coordinates, try engineering bbox match and
    return an index-based env_op.xxx(index) action if matched; otherwise None.
    """
    import re

    coord_action_patterns = [
      (r'(?:env_op\.)?click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)', 'click'),
      (r'(?:env_op\.)?long_press\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)', 'long_press'),
      (r'(?:env_op\.)?input_text\s*\(\s*["\']([^"\']*)["\']\s*,\s*(\d+)\s*,\s*(\d+)\s*', 'input_text'),
    ]

    for pattern, action_type in coord_action_patterns:
      match = re.search(pattern, action)
      if not match:
        continue

      if action_type == 'input_text':
        text = match.group(1)
        x = int(match.group(2))
        y = int(match.group(3))
      else:
        x = int(match.group(1))
        y = int(match.group(2))

      if not ui_content_full_dict:
        return None

      element = agent_utils.find_element_at_coordinate(x, y, ui_content_full_dict)
      if not element:
        print_with_color(
          f"  🧭 Preprocess: coord hit-test MISS for {action_type} at ({x},{y})",
          'yellow'
        )
        return None

      element_index = element.get('index', -1)
      if element_index < 0:
        return None

      if action_type == 'input_text':
        clear_text_match = re.search(r'clear_text\s*=\s*(True|False)', action, re.IGNORECASE)
        clear_text = clear_text_match.group(1).lower() == 'true' if clear_text_match else 'True'
        new_action = f'env_op.input_text("{text}", {element_index}, {clear_text})'
      else:
        new_action = f'env_op.{action_type}({element_index})'

      print_with_color(
        f"  🔄 Pre-processed coordinate action: {action} → {new_action} (matched element index {element_index})",
        'cyan'
      )
      # Extra debug: show minimal matched element info (without dumping huge dict)
      try:
        dbg_text = element.get("text") or ""
        dbg_cd = element.get("content_description") or ""
        dbg_bbox = element.get("bbox_pixels") or {}
        print_with_color(
          f"     🎯 Matched element: index={element_index}, text={dbg_text!r}, content_description={dbg_cd!r}, bbox={dbg_bbox}",
          'cyan'
        )
      except Exception:
        pass
      return new_action, element_index, element

    return None

  def _preprocess_action_for_translation(
    self,
    llm_action: str,
    exec_action: str | None,
    ui_content_full_dict: list[dict],
  ) -> tuple[str, bool, int | None, dict | None]:
    """
    Decide, before calling the translator, how an action should be handled.

    Two coordinate systems are in play:
    - `llm_action` carries the coordinates the LLM produced from the resized
      screenshot and is what gets returned when no rewrite happens.
    - `exec_action` carries the executed/original coordinates and is what is
      matched against `ui_content_full_dict` when it is available.

    Returns:
      `(action, needs_translation, matched_index, matched_element)`.
      Passthrough actions and direction swipes come back unchanged with
      `needs_translation=False`. A successful coordinate->index match returns
      the index-based action plus the matched index and element. Everything
      else returns `llm_action` with `needs_translation=True`.
    """
    # Stage 0: whitelisted control-flow / element-independent calls skip the translator.
    if self._is_passthrough_action(llm_action):
      print_with_color(f"  ⏭️  Translator bypass (passthrough): {llm_action}", "green")
      return llm_action, False, None, None

    # Stage 1: textual-direction swipes are already "soft".
    if self._is_direction_swipe(llm_action):
      print_with_color(f"  ⏭️  Translator bypass (direction swipe): {llm_action}", "green")
      return llm_action, False, None, None

    # Stage 2: bbox hit-test. Executed coordinates are used when present;
    # the LLM coordinates are only a best-effort fallback when they are not.
    if exec_action:
      coord_source = exec_action
      hit_message = "  ✅ Preprocess decision: coordinate->index matched (using executed_action)"
    else:
      coord_source = llm_action
      hit_message = "  ✅ Preprocess decision: coordinate->index matched (fallback using llm_action)"

    hit = self._convert_click_like_coord_to_index_action(coord_source, ui_content_full_dict)
    if hit:
      index_action, element_index, element = hit
      print_with_color(hit_message, "green")
      return index_action, True, element_index, element

    # Stage 3/4: coordinate swipes and anything left over go to the translator
    # unchanged; only the log line differs.
    if self._is_coordinate_swipe(llm_action):
      verdict = "  🧭 Preprocess decision: coordinate swipe -> needs translator"
    else:
      verdict = f"  🧩 Preprocess decision: needs translator -> {llm_action}"
    print_with_color(verdict, "yellow")
    return llm_action, True, None, None

  def _is_passthrough_action(self, action: str | None) -> bool:
    """Return True if the action should bypass ActionTranslator entirely."""
    if not action or not isinstance(action, str):
      return False
    s = action.strip()
    if not s:
      return False
    # Normalize optional env_op prefix
    if s.startswith("env_op."):
      s2 = s[len("env_op."):]
    else:
      s2 = s
    # Extract function name
    m = regex.match(r"^([a-zA-Z_]\w*)\s*\(", s2)
    if not m:
      return False
    fn = m.group(1)
    # Hard whitelist: these don't need translation.
    if fn in {"wait", "keyboard_enter", "open_app", "go_back", "go_home", "stop", "shell"}:
      return True
    return False
  
  def ActionTranslator_Agent(
    self,
    log_path: str,
    goal: str,
    step_n: int,
    step_data: Any,
    related_element: str,
    related_index: int = None,
    suppress_header: bool = False,
  ) -> tuple[str, str | None, int | None]:
    """Route an action-translation request to the translator for this GUI agent.

    The translator is obtained from `ActionTranslatorRegistry` using
    `self.gui_agent_type`, falling back to 'react_star' when that is unset.
    All arguments are forwarded unchanged to the translator's `translate`
    method and its result is returned as-is.

    Per the original note on this method: translation is tied to the GUI
    agent's action space, React-Star is the only fully implemented
    translator, and other agent types use a placeholder that raises
    NotImplementedError.
    """
    agent_type = self.gui_agent_type or 'react_star'

    if not suppress_header:
      print_with_color('============================================', 'light_red')
      print_with_color(
        f"Current Agent: ActionTranslator_Agent (dispatch) | gui_agent_type={agent_type}",
        'light_red'
      )

    forwarded = dict(
      log_path=log_path,
      goal=goal,
      step_n=step_n,
      step_data=step_data,
      related_element=related_element,
      related_index=related_index,
      suppress_header=suppress_header,
    )
    return ActionTranslatorRegistry.create(self, agent_type).translate(**forwarded)

  def _action_translate_react_star(
    self,
    log_path: str,
    goal: str,
    step_n: int,
    step_data: Any,
    related_element: str,
    related_index: int = None,
    suppress_header: bool = False,
  ) -> tuple[str, str | None, int | None]:
    if not suppress_header:
      print_with_color('============================================', 'light_red')
      print_with_color("Current Agent: ActionTranslator_Agent (react_star)\n", 'light_red')
    
    # Create directories (exist_ok=True for parallel processing)
    os.makedirs(log_path, exist_ok=True)
    
    # Create action_translation subfolder for debugging
    translation_folder = os.path.join(log_path, 'action_translation')
    os.makedirs(translation_folder, exist_ok=True)
    
    # Create README if it doesn't exist
    readme_path = os.path.join(translation_folder, 'README.txt')
    if not os.path.exists(readme_path):
      readme_content = """Action Translation Debug Files
==============================

This folder contains debug information for the Action Translation process.

File Structure:
--------------
step-{N}_input_info.txt          - Input parameters: goal, observation, original action, related elements
step-{N}_prompt.txt              - The complete prompt sent to the ActionTranslator LLM
step-{N}_output.txt              - Raw output from ActionTranslator (thought + soft action)
step-{N}_final_action.txt        - Final optimized soft action after extraction
step-{N}_screenshot.png          - Screenshot before the action (if available)
step-{N}_screenshot_with_som.png - Screenshot with Set-of-Mark annotations (if available)

Debugging Tips:
--------------
1. Check input_info.txt to see what information was provided to the translator
2. Compare screenshot files to verify if images were successfully loaded
3. Check output.txt vs final_action.txt to see if kwargs extraction was applied
4. Use prompt.txt to reproduce the LLM call if needed
"""
      agent_utils.write_to_file(file_path=translation_folder, file_name='README.txt', content=readme_content)
    
    file_prefix = f'step-{step_n}'
    
    if step_data.obs.screenshot_with_som_resized is not None:
      som_screenshot_path = os.path.join(translation_folder, f'{file_prefix}_screenshot_with_som.png')
      agent_utils.save_image(step_data.obs.screenshot_with_som_resized, som_screenshot_path)
      print_with_color(f"  ✓ Saved SoM screenshot to {som_screenshot_path}", 'green')
    else:
      print_with_color(f"  ⚠️  No SoM screenshot available for step {step_n}", 'yellow')
    
    # Extract observation and action information based on step_data structure
    if hasattr(step_data, 'output'):
      obs_analysis = step_data.output.observation
      action_reason = step_data.output.code_reason
      action = step_data.output.code
    else:
      obs_analysis = step_data.obs_description
      action_reason = step_data.action_reason
      action = step_data.hard_coded_action
    
    # Preprocess actions with consistent coordinate systems:
    # - LLM sees resized screenshots => use `action` (hard-coded / compressed) in prompt.
    # - Engineering matching uses bbox_pixels (original coords) => use executed_action if available.
    ui_content_full_dict = None
    if hasattr(step_data.obs, 'ui_content_full_dict') and step_data.obs.ui_content_full_dict:
      ui_content_full_dict = step_data.obs.ui_content_full_dict
    
    exec_action = None
    if hasattr(step_data, "exec_step_info") and step_data.exec_step_info and hasattr(step_data.exec_step_info, "executed_action"):
      exec_action = step_data.exec_step_info.executed_action
    elif hasattr(step_data, "obs") and hasattr(step_data.obs, "executed_action"):
      exec_action = getattr(step_data.obs, "executed_action", None)
    
    # Preprocess action: direction swipe -> skip translation; coordinate click/long_press/input_text -> engine match
    # using exec_action, then rewrite to index-based input for translator.
    original_action = action
    processed_action, needs_translation, matched_index, matched_element = self._preprocess_action_for_translation(
      llm_action=action,
      exec_action=exec_action,
      ui_content_full_dict=ui_content_full_dict or [],
    )
    
    if not needs_translation:
      # Action doesn't need translation (e.g., direction-based swipe)
      if not suppress_header:
        print_with_color(f'  ✓ Action does not need translation: {processed_action}', 'green')
      return processed_action, None, None
    
    # Update action if it was preprocessed
    if not suppress_header:
      print_with_color("  🔎 Preprocess inputs:", 'grey')
      print_with_color(f"     - hard_coded_action (LLM/resized): {original_action}", 'grey')
      print_with_color(f"     - executed_action (engine/original): {exec_action}", 'grey')
      print_with_color(f"     - ui_content_full_dict items: {len(ui_content_full_dict or [])}", 'grey')
      print_with_color(f"     - direction_swipe(hard_coded): {self._is_direction_swipe(original_action)}", 'grey')
      print_with_color(f"     - coordinate_swipe(hard_coded): {self._is_coordinate_swipe(original_action)}", 'grey')

    if processed_action != original_action:
      action = processed_action
      if not suppress_header:
        print_with_color(f'  🔄 Action preprocessed: {original_action} → {action}', 'cyan')

    # If engineering match produced an element index, feed it to translator as related_element/index.
    matched_related_element = None
    matched_related_index = None
    if matched_index is not None and matched_index >= 0:
      related_index = matched_index
      if not related_element:
        related_element = agent_utils._generate_ui_elements_description_str(ui_elements=ui_content_full_dict, target_index=related_index)
      if not suppress_header:
        print_with_color(
          f"  🧷 Using engineering-matched related_index={related_index} for ActionTranslator",
          "green",
        )
      
      # Save matched information to return
      matched_related_element = related_element
      matched_related_index = related_index
    
    # Save input information for debugging
    input_info = f"""Goal: {goal}
Step Number: {step_n}
Observation Analysis: {obs_analysis}
Action Reason: {action_reason}
Hard Coded Action (LLM / resized coords): {action}
Executed Action (engine / original coords): {exec_action}
Preprocess Result:
  - processed_action: {processed_action}
  - needs_translation: {needs_translation}
  - matched_index: {matched_index}
Related Element: {related_element}
Related Index: {related_index}

UI Content:
{step_data.obs.ui_content_simple_str}
"""
    agent_utils.write_to_file(file_path=translation_folder, file_name=f'{file_prefix}_input_info.txt',
                              content=input_info)

    prompt_dict = get_action_translator_prompt(goal=goal, obs_analysis=obs_analysis,
                                          action_reason=action_reason,
                                          action=action,
                                          related_element=related_element,
                                          ui_info_str=step_data.obs.ui_content_simple_str)
    agent_utils.write_to_file(file_path=translation_folder, file_name=f'{file_prefix}_prompt.txt',
                              content=f"[system]\n{prompt_dict['system']}\n\n[user]\n{prompt_dict['user']}")
    
    # Ensure screenshot is available for LLM
    if step_data.obs.screenshot_with_som_resized is None:
      raise ValueError(f"Screenshot with SoM is required for ActionTranslator but is None for step {step_n}")
    
    # Retry logic for translation output parsing (max 1 retry, 2 attempts total)
    output = None
    raw_response = None
    for attempt in range(2):
      try:
        output, raw_response = self.actiontranslator_llm.predict_mm(
          user_prompt=prompt_dict['user'],
          images=[step_data.obs.screenshot_with_som_resized],
          system_prompt=prompt_dict['system'],
          output_format=models.ActionTranslatorOutput
        )
        break  # Success, exit retry loop
      except Exception as e:
        if attempt < 1:  # First attempt failed, retry once
          print_with_color(f"❌ Error translating step {step_n} (attempt {attempt + 1}/2): {e}", 'red')
          print_with_color(f"🔄 Retrying translation for step {step_n}...", 'yellow')
        else:  # Second attempt also failed, raise exception
          print_with_color(f"❌ Error translating step {step_n} (attempt {attempt + 1}/2): {e}", 'red')
          raise
    
    # Save output information
    output_info = f"""Thought:
{output.thought}

Soft Action:
{output.soft_action}
"""
    agent_utils.write_to_file(file_path=translation_folder, file_name=f'{file_prefix}_output.txt',
                              content=output_info)
    cost_tokens = raw_response.usage
    self.record_token.file_path = FLAGS.log_folder_exp  # Set log path for token recording
    self.record_token.step = str(step_n)
    self.record_token.agent = 'ActionTranslator'
    self.record_token.step_tokens = cost_tokens
    self.record_token.llm = self.actiontranslator_llm.model_name
    agent_utils.record_cost_tokens(self.record_token)
    
    # Only show detailed output in non-parallel mode
    if not suppress_header:
      print_with_color(f'\n💭 Thought:', 'cyan')
      print(f'   {output.thought}\n')
      print_with_color(f'📝 Soft Action:', 'cyan')
      print(f'   {output.soft_action}\n')
    
    if 'kwargs' in output.soft_action:  # Soft action with kwargs needs index; extract_ui_value ensures kwargs accuracy
      output.soft_action = agent_utils.extract_ui_value(
        output.soft_action, 
        related_element, 
        related_index,
        verbose=(not suppress_header)
      )
    
    if not suppress_header:
      print_with_color(f'✨ Optimized Soft Action:', 'green')
      print(f'   {output.soft_action}\n')
    
    # Save final optimized soft action
    final_action_info = f"""Final Optimized Soft Action:
{output.soft_action}

Extraction Applied: {'Yes (kwargs extracted)' if 'kwargs' in output.soft_action else 'No'}
"""
    agent_utils.write_to_file(file_path=translation_folder, file_name=f'{file_prefix}_final_action.txt',
                              content=final_action_info)
    
    # Return translated action along with matched related_element and related_index
    return output.soft_action, matched_related_element, matched_related_index
