import gzip
import logging
import os
import pickle
import random
import re
from typing import List

from agentlab.llm.response_api import MessageBuilder
from agentlab.llm.response_api import APIPayload

from jephhinter.configs import HintPromptConfig

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# System prompt text that demands the structured <think>/<hint> response format and states its length limits.
SYSTEM_PROMPT = "You are a hint generation expert. You MUST respond using the structured format with <think> and <hint> tags. Use the <think> section for thorough analysis (200-800 words) and the <hint> section for concise, actionable guidance (under 256 tokens)."


class SimpleDiscussion:
    """Ordered collection of the messages that make up one prompt exchange."""

    def __init__(self):
        # Messages are kept in the order in which they were appended.
        self.messages: List[MessageBuilder] = list()

    def append(self, message: MessageBuilder):
        """Add ``message`` at the end of the discussion."""
        self.messages.append(message)

    def flatten(self) -> List[MessageBuilder]:
        """Return the stored messages (the live list itself, not a copy)."""
        return self.messages


def load_all_step_pickles(root_dir):
    """Recursively load every ``step_*.pkl.gz`` file found under ``root_dir``.

    Args:
        root_dir: Directory that is walked recursively with ``os.walk``.

    Returns:
        A list of ``{"file": <path>, "data": <unpickled object>}`` dicts, one per
        file that could be loaded, in the order ``os.walk`` yields the files.
        Files that fail to load are skipped and reported with a warning.

    SECURITY: unpickling executes arbitrary code embedded in the file. Only
    point this function at directories whose contents are trusted.
    """
    step_data = []
    # TODO: Use yield here to avoid loading everything in memory at once.
    for dirpath, _, filenames in os.walk(root_dir):
        for fname in filenames:
            if not (fname.startswith("step_") and fname.endswith(".pkl.gz")):
                continue
            fpath = os.path.join(dirpath, fname)
            try:
                # Unpickle straight from the decompressing stream instead of
                # first materialising the whole decompressed payload.
                with gzip.open(fpath, "rb") as f:
                    data = pickle.load(f)  # type: ignore
            except Exception as e:
                # Best-effort loading: one corrupt step must not abort the scan,
                # but it is surfaced at WARNING so the data loss is visible.
                logger.warning("Failed to load %s: %s", fpath, e)
                continue
            step_data.append({"file": fpath, "data": data})
    return step_data


def extract_trace_info(step_data):
    """Extract the fields used for hint generation from loaded step records.

    Args:
        step_data: Iterable of ``{"file": ..., "data": ...}`` dicts, as produced
            by ``load_all_step_pickles``.

    Returns:
        A list with one dict per usable step, holding the keys ``axtree_txt``,
        ``html_txt``, ``think``, ``action``, ``error``, ``goal``, ``reward`` and
        ``step_file``. Records whose data lacks an ``obs`` or ``agent_info``
        attribute, or whose ``obs`` is ``None``, are skipped.
    """
    trace_info = []
    for step in step_data:
        step_info = step["data"]
        if not (hasattr(step_info, "obs") and hasattr(step_info, "agent_info")):
            continue
        obs = step_info.obs
        if obs is None:
            continue

        # Only dict observations can be queried by key. Any other type yields
        # the defaults for every observation field instead of raising
        # AttributeError on ``.get``.
        obs_fields = obs if isinstance(obs, dict) else {}

        trace_info.append(
            {
                "axtree_txt": obs_fields.get("axtree_txt", ""),
                # The pruned HTML is exposed under the shorter "html_txt" key.
                "html_txt": obs_fields.get("pruned_html", ""),
                "think": getattr(step_info.agent_info, "think", ""),
                "action": getattr(step_info, "action", ""),
                "error": obs_fields.get("last_action_error", ""),
                "goal": obs_fields.get("goal", []),
                "reward": getattr(step_info, "reward", 0),
                "step_file": step["file"],
            }
        )

    return trace_info


def get_common_hint_guidance(hint_topics: str) -> str:
    """Return the shared guidance text that the prompt builders in this module append.

    The text covers the hint requirements, the required <think>/<topic>/<hint>
    response format, length guidance, ServiceNow-specific advice and good/bad
    hint examples.

    Args:
        hint_topics: Text listing the already known applicability topics. It is
            logged and interpolated after "Known applicability topics:".

    Returns:
        The guidance block as a single multi-line string.
    """
    # Log the topics on every call so each generated prompt can be traced to the topics it was given.
    logger.info(f"Available hint topics:\n{hint_topics}")
    return f"""
=== HINT REQUIREMENTS ===
IMPORTANT: Keep your hint SHORT and write it as a SINGLE LINE without line breaks.
Focus on:
- Common pitfalls or errors to avoid
- Specific strategies that work well
- Important details to pay attention to
- Step-by-step guidance if applicable

=== SPECIFIC GUIDANCE ===
When analyzing the traces, look for:
1. UI elements that indicate the correct path (buttons, links, form fields)
2. Common error patterns and how to avoid them
3. Successful action sequences that could be generalized
4. Context clues that help determine the right approach
5. Timing or order dependencies in the workflow


=== ENHANCED HINT REQUIREMENTS ===

REQUIREMENT 1: GENERALIZABILITY
- Make hints general enough to apply to similar tasks, not just this specific task
- DO NOT include: specific usernames, task-specific text content, element IDs like [123], domain-specific details
- DO include: common UI element names (buttons, links, form fields), general patterns, and reusable strategies

REQUIREMENT 2: SPECIFICITY AND ACTIONABILITY
- Include exact text of relevant UI elements when they represent common patterns
- Specify element types and positions when relevant (e.g., "button 'Submit' at the bottom of the form")
- Provide step-by-step guidance when multiple actions are needed

REQUIREMENT 3: STRUCTURE AND LENGTH
- Keep hints under 256 tokens
- Write as a single line without line breaks
- Focus on what to do, not why it works
- Use clear, actionable language
- DO NOT use double quotes (") in hints as they cause issues when saving to CSV files - use single quotes (') instead

REQUIREMENT 4: CONTENT FOCUS
- Common pitfalls or errors to avoid
- Successful strategies and patterns
- Important UI elements to look for
- Context clues that indicate the right approach
- Timing or sequence dependencies

REQUIREMENT 5: AVOID EXPLANATIONS
- Don't explain why something works
- Focus on what to do and how to do it
- Provide executable guidance rather than analysis

REQUIREMENT 6: APPLICABILITY TOPIC
- Always provide 1 short sentence describing the general topic of the task in which it applies.
- IMPORTANT: If SUMMARIZATION: <summarization> is present in the prompt, use it inside the topic description <topic> tags.
- Something like "filtering the table", or "filling the combobox in the form" or "filling the multitab form" and so on.
- Use one of the previously used topics if possible, or create a new one if nothing fits.
- Example: If the SUMMARIZATION: filtering the table, the topic description should be "<topic> filtering the table </topic>".

Known applicability topics: {hint_topics}

=== RESPONSE FORMAT ===
You MUST use this exact format:
<think>
Your reasoning about the traces, patterns, and insights here...
</think>

<topic>
Your brief description of the task topic here
</topic>

<hint>
Your concise, actionable hint here (single line, under 256 tokens)
</hint>

=== LENGTH BALANCE ===
IMPORTANT: Balance your response appropriately:
- <think> section: 200-800 words (thorough analysis)
- <hint> section: 1-3 sentences (concise action)
- The thinking should be comprehensive, the hint should be focused
- Use thinking for analysis, use hint for execution

=== THINKING SECTION GUIDANCE ===
The <think> section should be thorough and analytical:
- Analyze the execution traces in detail
- Identify key patterns, mistakes, and successful strategies
- Explain why certain approaches work or fail
- Consider multiple perspectives and edge cases
- Aim for 200-800 words of thoughtful analysis
- This is where you can be comprehensive and detailed

=== HINT SECTION GUIDANCE ===
The <hint> section should be concise and actionable:
- Focus on the most critical action or strategy
- Avoid lengthy explanations or multiple examples
- Prioritize what the agent should DO, not why it works
- If you have multiple points, combine them into a single, clear instruction
- Keep under 256 tokens (about 1-3 sentences)
- This should be immediately executable guidance
- IMPORTANT: Never use double quotes (") in hints - use single quotes (') to avoid CSV formatting issues

=== HINT LENGTH GUIDANCE ===
IMPORTANT: Keep your hint concise and focused. Aim for 1-3 clear, actionable sentences.
- Focus on the most critical action or strategy
- Avoid lengthy explanations or multiple examples
- Prioritize what to do, not why it works
- If you have multiple points, combine them into a single, clear instruction

=== SERVICE NOW SPECIFIC GUIDANCE ===
For ServiceNow tasks involving navigation:
- Focus on the specific UI element to use (e.g., "Filter/Filter navigator" input)
- Specify the exact location (e.g., "left panel", "Application Navigator")
- Include the key action (e.g., "type the module name", "click the resulting link")
- Avoid explaining multiple approaches - pick the most effective one

=== HINT EXAMPLES ===

GOOD BALANCE EXAMPLES:

Example 1 - ServiceNow Navigation:
<think>
Looking at the execution traces, I can see the agent consistently fails by using the global search bar at the top of the page instead of the Application Navigator on the left. The successful traces show agents using the "Filter/Filter navigator" input within the left panel, which provides auto-completion and direct access to modules. The agent also makes the mistake of clicking "More menus/Admin" repeatedly, which is unnecessary for basic navigation. The key insight is that ServiceNow's Application Navigator is designed for hierarchical browsing, not keyword search.
</think>
<hint>
Use the Application Navigator (left panel) with the 'Filter/Filter navigator' input to find modules, not the global search bar at the top.
</hint>

Example 2 - Form Submission:
<think>
The traces reveal that agents often fail to submit forms because they're looking for a 'Submit' button when the actual button text is 'Save' or 'Create'. The successful attempts show agents reading the button labels carefully and clicking the appropriate action button. The agent also sometimes tries to press Enter, which doesn't work in this form context. The form requires explicit button clicks for submission.
</think>
<hint>
Look for and click the 'Save', 'Create', or 'Submit' button at the bottom of the form instead of pressing Enter.
</hint>

BAD HINT EXAMPLES:
- "Click the button with ID [123] to submit the form" (too specific, includes element ID)
- "Enter 'john.doe@email.com' in the email field" (too specific, includes actual content)
- "This task requires careful attention to detail" (too vague, not actionable)
- "The agent should understand the context before proceeding" (explanation, not guidance)
- 'Click the "Submit" button to continue' (uses double quotes, causes CSV issues - should use single quotes)
"""


def _format_trace_steps(
    trace_info: list,
    hint_prompt_config: HintPromptConfig,
    start_index: int = 0,
    include_goal: bool = False,
):
    """Format trace steps into a list of prompt fragments.

    Args:
        trace_info: Step dicts using the keys ``html_txt``, ``axtree_txt``,
            ``think``, ``action``, ``error``, ``reward`` and (for the goal line)
            ``goal``.
        hint_prompt_config: Its ``exclude_html``, ``exclude_axtree``,
            ``exclude_think``, ``exclude_actions`` and ``exclude_reward`` flags
            decide which sections are emitted.
        start_index: 0-based index of the first step in the full trace; the
            displayed step number is ``start_index + position + 1``.
        include_goal: When true, prepend a ``GOAL:`` line taken from the first
            step. Nothing is prepended for an empty trace.

    Returns:
        List of strings, meant to be joined by the caller.
    """
    prompt_parts = []
    # Guard on a non-empty trace so an empty one does not raise IndexError.
    if include_goal and trace_info:
        prompt_parts.append(f"GOAL: {trace_info[0]['goal']} \n")
    for i, step in enumerate(trace_info):
        step_idx = start_index + i
        prompt_parts.append(f"\nStep {step_idx + 1}:")
        if not hint_prompt_config.exclude_html and step.get("html_txt"):
            prompt_parts.append(f"\nHTML: {step['html_txt']} \n")
        if not hint_prompt_config.exclude_axtree and step.get("axtree_txt"):
            prompt_parts.append(f"\nAXTree: {step['axtree_txt']} \n")
        if not hint_prompt_config.exclude_think and step.get("think"):
            prompt_parts.append(f"\nAgent's reasoning: {step['think']} \n")
        if not hint_prompt_config.exclude_actions and step.get("action"):
            prompt_parts.append(f"\nAction taken: {step['action']} \n")
        # Errors are always shown; there is no exclude flag for them.
        if step.get("error"):
            prompt_parts.append(f"\nError encountered: {step['error']} \n")
        if not hint_prompt_config.exclude_reward:
            # A missing reward is shown as 0, matching how the rest of the
            # module reads it (``step.get("reward", 0)``), instead of raising KeyError.
            prompt_parts.append(f"\nCurrent reward: {step.get('reward', 0)} \n")
    return prompt_parts


def construct_hint_prompt_step_wise(
    trace_info: list[dict],
    start_step_idx: int,
    n_steps: int,
    task_name: str,
    hint_prompt_config: HintPromptConfig,
    hint_topics: str,
):
    """
    Construct a prompt for step-wise hinting based on multiple consecutive steps.
    This generates hints for transitions between consecutive step sequences.

    Args:
        trace_info: List of trace steps
        start_step_idx: Starting index of the step sequence (0-based)
        n_steps: Number of consecutive steps to include in the sequence
        task_name: Name of the task
        hint_prompt_config: Configuration for hint prompting
        hint_topics: Known applicability topics, forwarded to the common guidance text

    Returns:
        Formatted prompt for step-wise hint generation, or None when the trace is
        empty or the requested window does not lie inside the trace
    """
    # Reject an empty trace, a window starting before the first step, or one
    # running past the last step. A negative start would otherwise silently
    # index from the end of the trace.
    if not trace_info or start_step_idx < 0 or start_step_idx + n_steps > len(trace_info):
        return None

    prompt_parts = []
    prompt_parts.append(f"Task: {task_name}")
    prompt_parts.append(f"\n=== STEP SEQUENCE ANALYSIS ({n_steps} consecutive steps) ===")
    prompt_parts.append(f"Goal: {trace_info[0]['goal']}")

    # Include all consecutive steps in the sequence. _format_trace_steps emits
    # the "Step N:" header itself, so no extra header is added here (adding one
    # printed every header twice).
    window = trace_info[start_step_idx : start_step_idx + n_steps]
    prompt_parts.extend(
        _format_trace_steps(window, hint_prompt_config, start_index=start_step_idx)
    )

    prompt_parts.append("\n=== STEP-SEQUENCE HINT GENERATION ===")
    prompt_parts.append(
        f"Based on the sequence of {n_steps} consecutive steps above, provide a concise, actionable hint that explains:"
    )
    prompt_parts.append("1. What the agent accomplished across these steps")
    prompt_parts.append("2. What the agent should do next based on the context")
    prompt_parts.append("3. How to recognize when this sequence is needed")
    prompt_parts.append("4. Common mistakes to avoid during this sequence")

    prompt_parts.append("\n=== STEP-SEQUENCE GUIDANCE ===")
    prompt_parts.append(f"Focus on the specific sequence of {n_steps} steps:")
    prompt_parts.append("- What changed in the environment across these steps")
    prompt_parts.append("- What the agent learned or accomplished")
    prompt_parts.append("- What the next logical action should be")
    prompt_parts.append("- How to recognize the right moment for the next action")
    prompt_parts.append("- The pattern or workflow this sequence represents")

    prompt_parts.append(
        f"Focus on the specific sequence of {n_steps} steps and what it teaches about the workflow."
    )

    # Add consolidated guidance
    prompt_parts.append(get_common_hint_guidance(hint_topics))

    # Add chain of thought tokens to encourage deeper thinking
    prompt_parts = add_chain_of_thought_tokens(prompt_parts)

    return "\n".join(prompt_parts)


def construct_hint_prompt(
    trace_info_list: list[list[dict]],
    task_name: str,
    hint_prompt_config: HintPromptConfig,
    hint_topics: str,
):
    """Build the full hint-generation prompt from one or more execution traces.

    Three layouts are produced depending on how many traces are given:
    exactly one trace (single-trace analysis with an outcome line), exactly two
    traces (comparison of the pair), and any other count (multi-trace overview).

    Args:
        trace_info_list: One list of step dicts per trace.
        task_name: Name of the task, shown on the first prompt line.
        hint_prompt_config: Controls which step sections are included.
        hint_topics: Known applicability topics, forwarded to the common guidance.

    Returns:
        The assembled prompt string.
    """
    n_traces = len(trace_info_list)
    prompt_parts = [f"Task: {task_name}"]

    if n_traces == 1:
        # Single trace: truncated with the helper's default token budget.
        single_trace = truncate_trace_to_fit_prompt(trace_info_list[0], hint_prompt_config)

        prompt_parts.append("\n=== EXECUTION TRACE ===")
        prompt_parts.append(f"Goal: {single_trace[0]['goal']}")
        prompt_parts += _format_trace_steps(single_trace, hint_prompt_config)

        # Outcome line, derived from the summed reward of the truncated trace.
        if not hint_prompt_config.exclude_reward:
            total_reward = sum(step.get("reward", 0) for step in single_trace)
            prompt_parts.append(
                "\nThis was a successful trace."
                if total_reward > 0
                else "\nThis was a failed trace."
            )

        request = "Based on this trace, provide a concise, actionable hint that would help an agent avoid common mistakes and succeed at this task."
    else:
        if n_traces == 2:
            # Pair scenario: larger per-trace budget and a banner before each trace body.
            per_trace_budget = 80000
            show_trace_banner = True
            request = "Compare the failed and successful traces above. Provide a concise, actionable hint that explains the key differences and what the agent should do to succeed."
        else:
            # Any other count: section header first, tighter per-trace budget, no banner.
            prompt_parts.append(f"\n=== MULTIPLE EXECUTION TRACES ({n_traces} traces) ===")
            per_trace_budget = 50000
            show_trace_banner = False
            request = "Based on the multiple traces above, provide a concise, actionable hint that identifies common patterns, successful strategies, and key insights for completing this task."

        for position, full_trace in enumerate(trace_info_list, start=1):
            # The outcome label is computed on the untruncated trace.
            total_reward = sum(step.get("reward", 0) for step in full_trace)
            outcome = "SUCCESSFUL" if total_reward > 0 else "FAILED"
            prompt_parts.append(f"\n--- Trace {position} ({outcome}) ---")

            shortened = truncate_trace_to_fit_prompt(
                full_trace, hint_prompt_config, max_tokens=per_trace_budget
            )
            if show_trace_banner:
                prompt_parts.append("\n=== EXECUTION TRACE ===")
            prompt_parts.append(f"Goal: {shortened[0]['goal']}")
            prompt_parts += _format_trace_steps(shortened, hint_prompt_config)

    # Shared tail: analysis request followed by the consolidated guidance.
    prompt_parts.append("\n=== HINT GENERATION ===")
    prompt_parts.append(request)
    prompt_parts.append(get_common_hint_guidance(hint_topics))

    # Chain-of-thought instructions go last.
    prompt_parts = add_chain_of_thought_tokens(prompt_parts)
    final_prompt = "\n".join(prompt_parts)

    # Report the estimated size and warn when the prompt is very large.
    estimated_tokens = estimate_token_count(final_prompt)
    logger.info(f"Generated hint prompt with estimated {estimated_tokens} tokens")
    if estimated_tokens > 180000:
        logger.warning(
            f"Prompt is very long ({estimated_tokens} tokens). Consider reducing trace detail or using step zoom."
        )

    return final_prompt


def construct_hint_prompt_step_zoom(
    trace_info: list[dict],
    important_indices: list[int],
    task_name: str,
    hint_prompt_config: HintPromptConfig,
    hint_topics: str,
):
    """
    Construct a prompt focusing on the most important step(s).

    Every step is listed without its AXTree; the AXTree is appended only for the
    steps whose 0-based index appears in ``important_indices``.

    Args:
        trace_info: List of trace steps (must be non-empty; the goal is read from the first step)
        important_indices: 0-based indices of the steps to zoom in on
        task_name: Name of the task
        hint_prompt_config: Configuration for hint prompting (not modified)
        hint_topics: Known applicability topics, forwarded to the common guidance text

    Returns:
        Formatted prompt for step-zoom hint generation
    """
    import copy

    # Force AXTree exclusion on a private shallow copy. Setting the flag on the
    # caller's object would silently disable AXTrees for every later prompt
    # built from the same config.
    zoom_config = copy.copy(hint_prompt_config)
    zoom_config.exclude_axtree = True

    prompt_parts = []
    prompt_parts.append(f"Task: {task_name}")
    prompt_parts.append("\n=== ZOOMED-IN STEPS ===")
    prompt_parts.append(f"Goal: {trace_info[0]['goal']}")
    for idx, step in enumerate(trace_info):
        # Pass the original step index to maintain correct numbering
        prompt_parts.extend(_format_trace_steps([step], zoom_config, start_index=idx))

        if idx in important_indices:
            # Only important steps get their AXTree. A missing tree is shown as empty.
            prompt_parts.append(f"AXTree: {step.get('axtree_txt', '')}")

    prompt_parts.append("\n=== HINT GENERATION ===")
    prompt_parts.append(
        "Based on the most important step(s) above, provide a concise, actionable hint that would help an agent avoid common mistakes and succeed at this task."
    )

    # Add specific guidance for step-focused hints
    prompt_parts.append("\n=== STEP-FOCUSED GUIDANCE ===")
    prompt_parts.append(
        "Since we're focusing on the most critical step(s), pay special attention to:"
    )
    prompt_parts.append("1. What made this step particularly important for success/failure")
    prompt_parts.append("2. The specific UI elements or context that guided the correct action")
    prompt_parts.append("3. Common mistakes that could occur at this point")
    prompt_parts.append("4. How to recognize when this step is needed")
    prompt_parts.append("5. The right sequence or timing for this action")

    prompt_parts.append(
        "Focus on the specific insights from this critical step that would help in similar situations."
    )

    # Add consolidated guidance
    prompt_parts.append(get_common_hint_guidance(hint_topics))

    # Add chain of thought tokens to encourage deeper thinking
    prompt_parts = add_chain_of_thought_tokens(prompt_parts)

    return "\n".join(prompt_parts)


def construct_hint_prompt_step_zoom_dual_trace(
    trace_info: list[dict],
    important_indices: list[tuple[int, int]],
    task_name: str,
    hint_prompt_config: HintPromptConfig,
    hint_topics: str,
):
    """
    Construct a prompt focusing on the most important step(s) from dual traces.

    Steps are grouped by their ``trace_id``; each step is listed without its
    AXTree, and important steps additionally get their HTML (miniwob tasks) or
    AXTree (all other tasks).

    Args:
        trace_info: Combined trace information with trace_id and step_id
        important_indices: List of (trace_id, step_idx) tuples for important steps
        task_name: Name of the task
        hint_prompt_config: Configuration for hint prompting (not modified)
        hint_topics: Known applicability topics, forwarded to the common guidance text

    Returns:
        Formatted prompt for dual trace step zoom hint generation
    """
    import copy

    # Force AXTree exclusion on a private shallow copy so the caller's config
    # is left untouched.
    zoom_config = copy.copy(hint_prompt_config)
    zoom_config.exclude_axtree = True

    # The benchmark prefix decides whether important steps show HTML or the AXTree.
    is_miniwob = task_name.split(".")[0] == "miniwob"

    prompt_parts = []
    prompt_parts.append(f"Task: {task_name}")
    prompt_parts.append("\n=== DUAL TRACE STEP ZOOM ANALYSIS ===")

    # Group steps by trace for better organization
    traces = {}
    for step in trace_info:
        traces.setdefault(step.get("trace_id", 0), []).append(step)

    # Sort traces by trace_id for consistent ordering
    for trace_id in sorted(traces.keys()):
        trace_steps = traces[trace_id]
        prompt_parts.append(f"\n--- Trace {trace_id + 1} ---")
        # Add outcome summary
        if not zoom_config.exclude_reward:
            cum_reward = sum(step.get("reward", 0) for step in trace_steps)
            if cum_reward > 0:
                prompt_parts.append("\nThis was a successful trace.")
            else:
                prompt_parts.append("\nThis was a failed trace.")

        prompt_parts.append(f"Goal: {trace_steps[0]['goal']}")

        for step in trace_steps:
            step_idx = step.get("step_id", 0)

            # Pass the step's own index to maintain correct numbering
            prompt_parts.extend(_format_trace_steps([step], zoom_config, start_index=step_idx))

            # Check if this step is marked as important
            is_important = any(
                idx[0] == trace_id and idx[1] == step_idx for idx in important_indices
            )
            if not is_important:
                continue

            prompt_parts.append("*** IMPORTANT STEP ***")
            if is_miniwob:
                # Steps in this module carry their HTML under "html_txt". The
                # legacy "html" key is still honoured first when it is present
                # and non-empty.
                html = step.get("html") or step.get("html_txt", "")
                prompt_parts.append(f"HTML: {html}")
            else:
                prompt_parts.append(f"AXTree: {step.get('axtree_txt', '')}")

    prompt_parts.append("\n=== DUAL TRACE HINT GENERATION ===")
    prompt_parts.append(
        "Based on the most important step(s) from both traces above, provide a concise, actionable hint that would help an agent avoid common mistakes and succeed at this task."
    )

    # Add specific guidance for dual trace step-focused hints
    prompt_parts.append("\n=== DUAL TRACE STEP-FOCUSED GUIDANCE ===")
    prompt_parts.append(
        "Since we're analyzing multiple traces and focusing on critical steps, pay special attention to:"
    )
    prompt_parts.append("1. What patterns emerge across both traces in the important steps")
    prompt_parts.append("2. How the important steps differ between successful and failed attempts")
    prompt_parts.append(
        "3. The specific UI elements or context that guided correct vs. incorrect actions"
    )
    prompt_parts.append("4. Common mistakes that occur at similar decision points across traces")
    prompt_parts.append("5. How to recognize when these critical steps are needed")
    prompt_parts.append("6. The right sequence or timing for actions at these critical points")

    prompt_parts.append(
        "Focus on the insights from comparing critical steps across both traces to help in similar situations."
    )

    # Add consolidated guidance
    prompt_parts.append(get_common_hint_guidance(hint_topics))

    # Add chain of thought tokens to encourage deeper thinking
    prompt_parts = add_chain_of_thought_tokens(prompt_parts)

    return "\n".join(prompt_parts)


def summarize_trace_for_important_steps(
    trace_info, summarizer_llm, msg_builder, hint_prompt_config
):
    """
    Use an LLM to summarize the full trace and select the most important step(s).

    Args:
        trace_info: List of trace steps (must be non-empty; the goal is read from the first step)
        summarizer_llm: Callable that receives an APIPayload and returns the model response
        msg_builder: Object providing ``system()`` and ``user()`` message factories
        hint_prompt_config: Configuration for hint prompting (not modified)

    Returns:
        A sorted list of at most 2 unique step indices (0-based) to zoom in on.
        Falls back to the last step when the response names no valid step.
    """
    import copy

    # Force AXTree exclusion on a private shallow copy so the caller's config
    # is left untouched.
    summary_config = copy.copy(hint_prompt_config)
    summary_config.exclude_axtree = True

    # Build a summary prompt
    prompt_parts = []
    prompt_parts.append(
        "You are a trace summarizer. Given the following execution trace, identify the step or steps that are most important for understanding success or failure. Return the step numbers (starting from 1) and a brief reason why they are important."
    )
    prompt_parts.append("\n=== EXECUTION TRACE ===")
    prompt_parts.append(f"Goal: {trace_info[0]['goal']}")
    # All steps are included, one call per step, passing the original step index
    # to maintain correct numbering. The overall length is checked afterwards.
    for i, step in enumerate(trace_info):
        prompt_parts.extend(_format_trace_steps([step], summary_config, start_index=i))

    prompt_parts.append("\n=== STEP SELECTION CRITERIA ===")
    prompt_parts.append("Look for steps that are critical because they:")
    prompt_parts.append("1. Represent a key decision point or branching moment")
    prompt_parts.append("2. Show a common mistake that could be avoided")
    prompt_parts.append("3. Demonstrate a successful strategy or pattern")
    prompt_parts.append("4. Involve important UI elements or context clues")
    prompt_parts.append("5. Show timing or sequence dependencies")
    prompt_parts.append("6. Represent the moment where success/failure was determined")

    prompt_parts.append("\n=== STEP SELECTION ===")
    prompt_parts.append(
        "List the most important step numbers (comma separated) and a brief reason for each."
    )
    prompt_parts.append(
        "IMPORTANT: Do not repeat the same step number - each step should be listed only once."
    )
    prompt_parts.append(
        "Select 1-2 critical steps that would provide the most valuable insights for generating actionable hints."
    )
    prompt_parts.append(
        "Focus on steps that would provide the most valuable insights for generating actionable hints."
    )

    # Add chain of thought tokens to encourage deeper analysis
    chain_of_thought_instruction = [
        "\n=== THINKING PROCESS ===",
        "Before selecting the most important steps, take time to think through the following:",
        "1. Which steps represent critical decision points or branching moments?",
        "2. Which steps show common mistakes that could be avoided?",
        "3. Which steps demonstrate successful strategies or patterns?",
        "4. Which steps involve important UI elements or context clues?",
        "5. Which steps show timing or sequence dependencies?",
        "6. Which steps represent the moment where success/failure was determined?",
        "",
        "Think step by step and analyze each step carefully before making your selection.",
    ]
    prompt_parts.extend(chain_of_thought_instruction)

    summary_prompt = "\n".join(prompt_parts)

    # Check token count and truncate if necessary (even short traces can have long content)
    estimated_tokens = estimate_token_count(summary_prompt)
    if estimated_tokens > 180000:
        logger.warning(
            f"Summary prompt is too long ({estimated_tokens} tokens) despite short trace. Truncating content."
        )
        # Truncate the prompt text itself. Exceeding the token estimate (4
        # characters per token) already implies the text is longer than max_chars.
        max_chars = 180000 * 4  # Convert tokens back to characters
        summary_prompt = summary_prompt[:max_chars] + "\n... [truncated for length]"
        logger.info(f"Truncated summary prompt to {max_chars} characters")

    discussion = SimpleDiscussion()
    sys_msg = msg_builder.system().add_text(
        "You are a trace summarizer focused on identifying critical decision points and key insights."
    )
    discussion.append(sys_msg)
    user_msg = msg_builder.user().add_text(summary_prompt)
    discussion.append(user_msg)

    payload = APIPayload(messages=discussion.flatten())
    response = summarizer_llm(payload)

    # Parse step numbers from response.think. When the attribute is missing or
    # is not a string (e.g. None), use the string form of the response so the
    # regex search below never receives a non-string.
    think_text = getattr(response, "think", None)
    text = think_text if isinstance(think_text, str) else str(response)

    # Look for explicit "Step N" mentions first
    step_nums = re.findall(r"Step\s*(\d+)", text)
    if not step_nums:
        # fallback: look for any numbers
        step_nums = re.findall(r"\b(\d+)\b", text)

    # Convert to 0-based indices, dropping out-of-range numbers and duplicates
    seen_steps = set()
    for num in step_nums:
        step_number = int(num)
        if 0 < step_number <= len(trace_info):
            seen_steps.add(step_number - 1)

    # Sort indices to maintain chronological order
    indices = sorted(seen_steps)

    # If nothing found, fallback to last step
    if not indices:
        indices = [len(trace_info) - 1]

    # Limit to maximum 2 steps to keep prompts manageable
    if len(indices) > 2:
        logger.info(f"Limiting step selection from {len(indices)} to 2 most important steps")
        indices = indices[:2]

    logger.info(
        f"Selected step indices: {indices} (0-based, corresponding to steps {[i + 1 for i in indices]} in trace)"
    )

    return indices


def _select_traces_for_hint(traces, n_traces_to_hinter):
    """Select traces for hint generation based on the scenario."""
    if  len(traces) < n_traces_to_hinter:
        logger.info(f"Not enough traces available for task {traces[0]["task_name"]}.Reducing n_traces_to_hinter to {len(traces)}")
    if n_traces_to_hinter == 1:
        # Single trace scenario
        return [random.choice(traces)["trace_info"]]

    elif n_traces_to_hinter == 2:
        # Two traces scenario - ideally one failed and one successful
        successful_traces = []
        failed_traces = []

        for trace in traces:
            cum_reward = sum(step.get("reward", 0) for step in trace["trace_info"])
            if cum_reward > 0:
                successful_traces.append(trace)
            else:
                failed_traces.append(trace)

        selected_traces = []

        if successful_traces and failed_traces:
            selected_traces = [random.choice(successful_traces), random.choice(failed_traces)]
        else:
            pool = successful_traces or failed_traces or traces
            selected_traces = random.sample(pool, min(2, len(pool)))
        return [trace["trace_info"] for trace in selected_traces]

    else:
        # Random set of traces scenario
        n_available = len(traces)
        n_to_sample = min(n_traces_to_hinter, n_available)
        selected_traces = random.sample(traces, n_to_sample)
        return [trace["trace_info"] for trace in selected_traces]


def estimate_token_count(text: str) -> int:
    """Return a rough token estimate for ``text``.

    The estimate uses the rule of thumb of about four characters per token
    for English text and rounds down.
    """
    chars_per_token = 4
    n_chars = len(text)
    return n_chars // chars_per_token


def smart_truncate_text(
    text: str,
    max_length: int = 4000,
    context: str = "text",
    preserve_sentences: bool = True,
) -> str:
    """
    Truncate text to at most ``max_length`` characters, preferring natural break points.

    With ``preserve_sentences`` enabled the cut point is searched backwards
    from the limit in three passes: a sentence-like break (``.!?;:`` or a
    newline) within the last 500 characters, then a space within the last
    300, then the start of a word within the last 200. If none is found, or
    ``preserve_sentences`` is False, the text is hard-cut and ``...`` is
    appended.

    Args:
        text: The text to truncate
        max_length: Maximum length allowed (in characters)
        context: Context for logging (e.g., "hint", "trace", "prompt")
        preserve_sentences: Whether to try to preserve complete sentences

    Returns:
        ``text`` unchanged when it already fits, otherwise a truncated string
        that is never longer than ``max_length``.
    """
    if len(text) <= max_length:
        return text
    if max_length <= 0:
        # Nothing can fit; a negative slice would return almost the whole text.
        return ""

    if preserve_sentences:
        good_break_chars = ".!?;:\n"

        # Pass 1: sentence boundary. The break character itself is kept, so
        # its index must be < max_length for the result to respect the limit.
        for i in range(max_length - 1, max(0, max_length - 500), -1):
            if text[i] in good_break_chars:
                truncated = text[: i + 1].strip()
                logger.info(
                    f"📝 {context.capitalize()} truncated from {len(text)} to {len(truncated)} characters at sentence boundary"
                )
                return truncated

        # Pass 2: word boundary. The space is dropped, so i may equal max_length.
        for i in range(max_length, max(0, max_length - 300), -1):
            if text[i] == " ":
                truncated = text[:i].strip()
                logger.info(
                    f"📝 {context.capitalize()} truncated from {len(text)} to {len(truncated)} characters at word boundary"
                )
                return truncated

        # Pass 3: start of a word, i.e. an alphanumeric character preceded by
        # a non-alphanumeric one (covers "a/b/c" or "a,b,c" style text).
        # Requiring the transition avoids cutting in the middle of a word.
        for i in range(max_length, max(0, max_length - 200), -1):
            if text[i].isalnum() and not text[i - 1].isalnum():
                truncated = text[:i].strip()
                logger.info(
                    f"📝 {context.capitalize()} truncated from {len(text)} to {len(truncated)} characters at word boundary"
                )
                return truncated

    # Last resort: hard cut. The ellipsis only fits when there is room for it.
    if max_length < 3:
        truncated = text[:max_length]
    else:
        truncated = text[: max_length - 3] + "..."
    logger.info(
        f"📝 {context.capitalize()} truncated from {len(text)} to {len(truncated)} characters (hard cut)"
    )
    return truncated


def truncate_trace_to_fit_prompt(
    trace_info: list,
    hint_prompt_config,
    max_tokens: int = 180000,
    context: str = "trace",
) -> list:
    """
    Keep the longest leading run of trace steps whose formatted text fits the token budget.

    Each step is formatted on its own with ``_format_trace_steps`` and its
    size estimated with ``estimate_token_count``. Steps are accepted in order
    until one would push the running total over the budget; that step and all
    later ones are dropped.

    Args:
        trace_info: List of trace steps
        hint_prompt_config: Configuration forwarded to ``_format_trace_steps``
        max_tokens: Total token budget; the estimated size of the fixed prompt
            scaffold is subtracted before steps are counted
        context: Context for logging (not used by the current implementation)

    Returns:
        ``trace_info`` itself when it is empty, otherwise a new list holding
        the steps that fit.
    """
    if not trace_info:
        return trace_info

    # Fixed scaffold surrounding the trace; only its estimated size matters here.
    base_prompt = "Task: [TASK_NAME]\n=== EXECUTION TRACE ===\n=== HINT GENERATION ===\nBased on this trace, provide a concise, actionable hint that would help an agent avoid common mistakes and succeed at this task."
    base_tokens = estimate_token_count(base_prompt)
    budget = max_tokens - base_tokens

    kept_steps = []
    used_tokens = 0
    for step in trace_info:
        # Number the step by its position among the steps kept so far.
        rendered = "\n".join(
            _format_trace_steps([step], hint_prompt_config, start_index=len(kept_steps))
        )
        cost = estimate_token_count(rendered)
        if used_tokens + cost > budget:
            # First step that does not fit ends the scan.
            break
        kept_steps.append(step)
        used_tokens += cost

    dropped_any = len(kept_steps) < len(trace_info)
    if dropped_any:
        logger.warning(
            f"Prompt was too long ({len(trace_info)} steps). Truncated to {len(kept_steps)} steps to fit within token limits."
        )
        logger.info(f"Estimated tokens: {used_tokens + base_tokens} (limit: {max_tokens})")

    return kept_steps


def add_chain_of_thought_tokens(prompt_parts: List[str]) -> List[str]:
    """
    Insert a "thinking process" instruction block into a list of prompt parts.

    The block is placed directly after every part that contains the
    ``=== HINT GENERATION ===`` marker. If any part already contains
    ``=== THINKING PROCESS ===`` the input list is returned untouched, so
    repeated calls do not duplicate the block.

    Args:
        prompt_parts: List of prompt parts to modify

    Returns:
        The same list object when a thinking section is already present,
        otherwise a new list with the instruction lines inserted.
    """
    # Guard against adding the block twice.
    for existing in prompt_parts:
        if "=== THINKING PROCESS ===" in existing:
            return prompt_parts

    thinking_block = [
        "\n=== THINKING PROCESS ===",
        "Before providing your final hint, take time to think through the following:",
        "1. What patterns do you observe in the execution traces?",
        "2. What are the key differences between successful and failed attempts?",
        "3. What UI elements or context clues are most important?",
        "4. What common mistakes could be avoided?",
        "5. What strategies consistently lead to success?",
        "",
        "Think step by step and analyze the traces carefully before formulating your hint.",
    ]

    result: List[str] = []
    for segment in prompt_parts:
        result.append(segment)
        if "=== HINT GENERATION ===" in segment:
            # The marker part stays first; the thinking block follows it.
            result += thinking_block
    return result


def _extract_fallback_hint(response_text: str) -> str:
    """Best-effort hint extraction for responses without ``<hint>`` tags.

    Looks for a "hint" indicator word and returns the first sentence that
    follows it (text up to the first period), or ``""`` if none is usable.
    """
    for indicator in ("hint:", "hint", "Hint:", "Hint"):
        if indicator not in response_text:
            continue
        remainder = response_text.split(indicator, 1)[1].strip()
        candidate = remainder.split(".")[0].strip()
        # An indicator followed directly by a period yields nothing useful;
        # keep trying the remaining indicators instead of giving up.
        if candidate:
            logger.info(f"🔍 Extracted hint using fallback method: {candidate[:1000]}...")
            return candidate
    return ""


def extract_structured_response(response_text: str) -> tuple[str, str, str]:
    """
    Extract thinking, hint and topic from a structured LLM response.

    The response is expected to contain ``<think>``, ``<hint>`` and optionally
    ``<topic>`` tags. An unclosed ``<hint>`` tag is accepted (everything after
    it is taken as the hint). When no hint tag is present at all, a fallback
    looks for text following a "hint" indicator word.

    Args:
        response_text: Raw response text from LLM. ``None`` is treated as an
            empty response.

    Returns:
        Tuple of (think, hint, topic). ``think`` and ``hint`` have their
        whitespace collapsed to single spaces; ``topic`` is only stripped.
        Any part that could not be found is an empty string.
    """
    if response_text is None:
        logger.warning("⚠️ Received no response text; returning empty think/hint/topic")
        return "", "", ""

    # Debug logging
    logger.info(f"🔍 Extracting structured response from text of length {len(response_text)}")
    logger.info(f"🔍 Response text preview: {response_text[:200]}...")

    # Look for <think> tags
    think_match = re.search(r"<think>(.*?)</think>", response_text, re.DOTALL)
    think = think_match.group(1).strip() if think_match else ""

    topic_match = re.search(r"<topic>(.*?)</topic>", response_text, re.DOTALL)
    topic = topic_match.group(1).strip() if topic_match else ""

    # Look for <hint> tags - handle both complete and incomplete tags
    hint_match = re.search(r"<hint>(.*?)</hint>", response_text, re.DOTALL)
    incomplete_hint_match = None
    if hint_match:
        hint = hint_match.group(1).strip()
    else:
        # Try to find incomplete <hint> tag (opened but not closed)
        incomplete_hint_match = re.search(r"<hint>(.*)", response_text, re.DOTALL)
        if incomplete_hint_match:
            hint = incomplete_hint_match.group(1).strip()
            logger.info(f"🔍 Found incomplete <hint> tag, extracted content: {hint[:200]}...")
        else:
            hint = ""

    # Debug logging
    logger.info(f"🔍 Found <think> tags: {bool(think_match)}")
    logger.info(f"🔍 Found <hint> tags: {bool(hint_match)}")
    logger.info(f"🔍 Found <topic> tags: {bool(topic_match)}")
    logger.info(f"🔍 Found incomplete <hint> tag: {bool(incomplete_hint_match)}")
    if think_match:
        logger.info(f"🔍 Think content length: {len(think)}")
    if hint:
        logger.info(f"🔍 Hint content length: {len(hint)}")
    if topic:
        # Only log a topic that was actually extracted.
        logger.info(f"🔍 Topic: {topic}")

    # If tags are missing, report it and try the fallback for the hint
    if not think_match or not hint:
        logger.warning(
            f"⚠️ Missing tags in response. Think: {bool(think_match)}, Hint: {bool(hint)}"
        )
        logger.warning(f"⚠️ Full response: {response_text}")

        if not hint:
            hint = _extract_fallback_hint(response_text)

    # Clean up whitespace
    think = " ".join(think.split()) if think else ""
    hint = " ".join(hint.split()) if hint else ""

    logger.info(f"🔍 Final extracted - Think: {len(think)} chars, Hint: {len(hint)} chars")

    return think, hint, topic


def context_identification(trace_info_list, llm, msg_builder):
    """Ask the LLM for a one-sentence summary of the situation where two traces diverge.

    The first trace is cut right after the first step that differs from the
    second trace (steps are compared with ``!=``), formatted with
    ``_format_trace_steps`` and sent to ``llm`` together with instructions to
    answer in ``<think>``/``<context>`` tags.

    Args:
        trace_info_list: Sequence whose first two items are the traces to
            compare (lists of steps). At least two traces are required.
        llm: Callable invoked with an ``APIPayload``.
        msg_builder: Message builder providing ``system()`` and ``user()``.

    Returns:
        The stripped content of the ``<context>`` tag if present, else of the
        ``<think>`` tag, else the whole response text.
    """
    first_trace = trace_info_list[0]
    second_trace = trace_info_list[1]

    # Index of the first step that differs between the two traces. zip() stops
    # at the shorter trace, so traces of different length cannot raise
    # IndexError here.
    diff_index = next(
        (
            idx
            for idx, (step_a, step_b) in enumerate(zip(first_trace, second_trace))
            if step_a != step_b
        ),
        None,
    )
    if diff_index is None:
        # No difference in the common prefix: diverge at the first step the
        # second trace lacks, or use the whole first trace when it is the
        # shorter (or identical) one. An empty first trace gives an empty slice.
        diff_index = min(len(first_trace) - 1, len(second_trace))

    # Get the trace info until the first step that differs.
    # NOTE(review): the HintPromptConfig class itself (not an instance) is
    # passed as the config here - confirm this is intended.
    trace_info_until_diff = _format_trace_steps(
        first_trace[: diff_index + 1], HintPromptConfig, include_goal=True
    )

    # Use the provided msg_builder instead of creating a new one
    sys_msg = msg_builder.system().add_text("You are a helpful assistant that identifies the context of a task based on trace information.")

    # Build the new prompt using <context> tags
    context_identification_prompt = f"""{''.join(trace_info_until_diff)}

# Querying memory

Before choosing an action, let's search our available documentation and memory for relevant context.
Generate a brief, general summary of the current status to help identify useful hints. Return your answer as follows:
<think>chain of thought</think>
<context>one short sentence summary</context>

# Concrete Example

<think>
I have to sort by client and country. I could use the built-in sort on each column but I'm not sure if
I will be able to sort by both at the same time.
</think>

<context>
The user is preparing to apply multi-column sorting and needs guidance on adding the next criterion.
</context>
"""

    user_msg = msg_builder.user().add_text(context_identification_prompt)

    discussion = SimpleDiscussion()
    discussion.append(sys_msg)
    discussion.append(user_msg)

    payload = APIPayload(messages=discussion.flatten())
    response = llm(payload)

    # Extract the context from the response (prefer <context>, fallback to <think>)
    response_text = response.think if hasattr(response, "think") else str(response)
    if not isinstance(response_text, str):
        # re.search needs a string; a missing (None) think field becomes "".
        response_text = "" if response_text is None else str(response_text)

    context_match = re.search(r"<context>(.*?)</context>", response_text, re.DOTALL)
    if context_match:
        return context_match.group(1).strip()

    think_match = re.search(r"<think>(.*?)</think>", response_text, re.DOTALL)
    if think_match:
        return think_match.group(1).strip()

    # Fallback: use the entire response if tags are missing
    return response_text.strip()


def construct_two_trace_comparison_prompt(
    trace_info_list: list[list[dict]],
    task_name: str,
    context: str,
    hint_prompt_config: HintPromptConfig,
    hint_topics: str,
):
    """Build the prompt that asks the hinter to compare two trajectories.

    The prompt consists, in order, of: the comparison instructions (prefixed
    with ``task_name``), the formatted steps of the first trace, the formatted
    steps of the second trace, a ``SUMMARIZATION:`` line carrying ``context``,
    and the shared guidance returned by ``get_common_hint_guidance``.

    Args:
        trace_info_list: Exactly two traces; the first is treated as the
            desired trajectory and the second as the undesired one.
        task_name: Task description placed at the start of the instructions.
        context: Summary text appended after the traces.
        hint_prompt_config: Configuration forwarded to ``_format_trace_steps``.
        hint_topics: Forwarded to ``get_common_hint_guidance``.

    Returns:
        The prompt as a single newline-joined string.

    Raises:
        ValueError: If ``trace_info_list`` does not hold exactly two traces.
    """
    if len(trace_info_list) != 2:
        raise ValueError("This function requires exactly 2 traces for comparison")

    desired_trace = trace_info_list[0]
    undesired_trace = trace_info_list[1]

    instructions = f"{task_name}. You will be provided with a desired and undesired trajectory of the same task. What is the first action that differs between the two trajectories? Why do you think it makes one trajectory failed and the other successful? Based on your answer, generate an action guideline to make future task avoid the same mistake. The guideline should specify what to do in what situation in the format of \"When in what status, you should (or should not)...\". On a product's page with product information, strictly refer to the option buttons as 'buying options such as sizes, colors, scents, and flavors', and clearly say that buying options are not subpages like [Description] and [Attributes] when you mention buying options. Your guideline must be general enough for any task, therefore never include any task-specific information, instead, refer to all the requirements as the requierments in Instruction. Strictly follow what the desired trajectory does and never suggest actions that the desired trajectory didn't do. When referring to actions, use the allowed action format."

    # NOTE(review): no explicit "desired"/"undesired" label is added around the
    # two traces here - confirm _format_trace_steps makes them distinguishable.
    sections = [
        instructions,
        *_format_trace_steps(desired_trace, hint_prompt_config, include_goal=True),
        *_format_trace_steps(undesired_trace, hint_prompt_config, include_goal=True),
        f"SUMMARIZATION: {context}",
        get_common_hint_guidance(hint_topics),
    ]
    return "\n".join(sections)