"""
Evaluator for Mathematical Problems

This module implements an evaluation approach with single continuous conversation,
iterative code execution, natural context retention, and standardized answer submission.
"""

import json
import logging
import os
import traceback
from pathlib import Path
from typing import Dict, Any, List

from inspect_ai import Task, eval
from inspect_ai.agent import react
from inspect_ai.agent._types import AgentSubmit, AgentState
from inspect_ai.dataset import Sample, MemoryDataset
from inspect_ai.model._model import sample_model_usage
from inspect_ai.tool import tool, Tool, ToolResult
from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
import asyncio
import concurrent.futures

from django.utils import timezone

from .db_config import DatabaseFrameworkConfig, get_proofbench_sandbox
from .inspect_tools import get_mathematical_tools, python, bash, web_search
from .models import Question
from .answer_utils import check_answer_correctness
from questions.models import Subquestion
logger = logging.getLogger(__name__)


def extract_answer_from_malformed_submit(tool_call_args: str) -> str:
    """
    Extract the answer from a malformed submit tool call using heuristics.

    Strategies are tried from most to least specific; the first one that
    matches wins:

    1. A Python raw-string literal (r'''...''' or its triple-double-quote
       form) anywhere in the input.
    2. A well-delimited "answer": "..." value whose inner quotes are all
       backslash-escaped, followed by a closing brace or bracket.
    3. A greedy match from the opening quote after "answer": to the LAST
       quote that is followed by a closing brace or bracket (copes with
       unescaped inner quotes).
    4. An unterminated raw-string literal directly after "answer":.
    5. Last resort: everything after "answer":, with the leading quote,
       any leaked closing think tag and its tail, and trailing JSON wrapper
       characters removed.

    Args:
        tool_call_args: The raw tool call arguments string

    Returns:
        The extracted answer text, or empty string if extraction fails
    """
    import re

    try:
        # Strategy 1: raw-string literals. The double-quoted form is checked
        # before the single-quoted form.
        for raw_literal in (r'''r"""(.*?)"""''', r"""r'''(.*?)'''"""):
            raw_match = re.search(raw_literal, tool_call_args, re.DOTALL)
            if raw_match:
                return raw_match.group(1).strip()

        # Strategy 2: strict string match. This must run BEFORE the greedy
        # match below: everything it matches is also matched by the greedy
        # pattern, which would then swallow any quoted fields that follow the
        # answer (e.g. a trailing "id": "..." entry).
        strict_match = re.search(
            r'''"answer"\s*:\s*"((?:[^"\\]|\\.)*)"\s*[}\]]''',
            tool_call_args,
            re.DOTALL,
        )
        if strict_match:
            return strict_match.group(1).strip()

        # Strategy 3: greedy match up to the last quote + closing structure.
        # Needed when the value contains unescaped quotes.
        greedy_match = re.search(
            r'''"answer"\s*:\s*["'](.*)["']\s*[}\]]''',
            tool_call_args,
            re.DOTALL,
        )
        if greedy_match:
            return greedy_match.group(1).strip()

        # Strategy 4: raw-string literal after "answer": whose closing quotes
        # do not pair up with the opening ones (or are missing entirely).
        unterminated_raw = re.search(
            r'''"answer"\s*:\s*r['"]{3}(.*?)$''', tool_call_args, re.DOTALL
        )
        if unterminated_raw:
            remainder = unterminated_raw.group(1)
            closing = re.search(r'''(.*?)['"]{3}''', remainder, re.DOTALL)
            # Without closing quotes the content was cut off; keep what exists.
            return (closing.group(1) if closing else remainder).strip()

        # Strategy 5: last resort - take everything after "answer":
        key_pos = tool_call_args.find('"answer"')
        if key_pos >= 0:
            after_key = tool_call_args[key_pos + len('"answer"'):]
            _, colon, content = after_key.partition(':')
            if colon:
                content = content.strip()
                # Remove leading quote if present
                if content.startswith(('"', "'")):
                    content = content[1:]
                # Drop a leaked </think> tag and everything after it FIRST,
                # so the wrapper stripping below sees the real end of the
                # answer rather than the end of the leaked tail.
                content = re.sub(r'</think>.*$', '', content, flags=re.DOTALL)
                # Remove trailing JSON wrapper characters (}, ], and quotes)
                content = re.sub(r'[}\]"\']+\s*$', '', content)
                return content.strip()

        return ""

    except Exception as e:
        # Best-effort helper: never propagate, just report and return nothing.
        logger.error(f"Error extracting answer from malformed submit: {e}")
        return ""


def detect_raw_json_tool_call(message_text: str) -> dict | None:
    """
    Detect raw JSON tool call without XML tags.

    Some models output raw JSON in the content field without using the proper
    tool calling mechanism. This catches patterns like:
    - {"submit": {"answer": "..."}}  (wrong format)
    - {"name": "submit", "arguments": {"answer": "..."}}  (correct format, no XML)

    Args:
        message_text: The raw message text from the model

    Returns:
        dict with 'tool_name' and 'arguments' if raw JSON tool call detected,
        None otherwise
    """
    import re
    import json

    if not message_text or not isinstance(message_text, str):
        return None

    try:
        # Pattern 1: {"submit": {"answer": "..."}} - wrong format but common
        submit_pattern = re.search(r'^\s*\{\s*"submit"\s*:\s*(\{.*\})\s*\}\s*$', message_text, re.DOTALL)
        if submit_pattern:
            try:
                args_json = submit_pattern.group(1)
                arguments = json.loads(args_json)
                logger.info("Detected raw JSON with wrong format: {'submit': {...}}")
                return {
                    'tool_name': 'submit',
                    'arguments': arguments
                }
            except json.JSONDecodeError:
                # Try to extract answer using malformed submit handler
                extracted = extract_answer_from_malformed_submit(args_json)
                if extracted:
                    return {
                        'tool_name': 'submit',
                        'arguments': {'answer': extracted}
                    }

        # Pattern 2: {"name": "submit", "arguments": {...}} - correct format, no XML
        correct_pattern = re.search(r'^\s*\{\s*"name"\s*:\s*"submit"\s*,\s*"arguments"\s*:\s*(\{.*\})\s*\}\s*$', message_text, re.DOTALL)
        if correct_pattern:
            try:
                args_json = correct_pattern.group(1)
                arguments = json.loads(args_json)
                logger.info("Detected raw JSON with correct format but no XML tags")
                return {
                    'tool_name': 'submit',
                    'arguments': arguments
                }
            except json.JSONDecodeError:
                # Try to extract answer using malformed submit handler
                extracted = extract_answer_from_malformed_submit(args_json)
                if extracted:
                    return {
                        'tool_name': 'submit',
                        'arguments': {'answer': extracted}
                    }

        return None

    except Exception as e:
        logger.error(f"Error detecting raw JSON tool call: {e}")
        return None


def detect_xml_with_malformed_json(message_text: str) -> dict | None:
    """
    Detect complete XML tool call tags with malformed JSON inside.

    This catches cases where the model outputs <tool_call>...</tool_call> with
    complete XML tags, but the JSON inside is malformed (e.g., using Python
    raw string syntax r'''...''' which is not valid JSON). The XML parser
    succeeds but JSON parsing fails, so the tool call never appears in tool_calls[].

    Args:
        message_text: The raw message text from the model

    Returns:
        dict with 'tool_name' and 'arguments' if detected, None otherwise
    """
    import re
    import json

    if not message_text or not isinstance(message_text, str):
        return None

    try:
        # Look for complete <tool_call>...</tool_call> tags
        tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', message_text, re.DOTALL)
        if not tool_call_match:
            return None

        json_content = tool_call_match.group(1).strip()

        # Try to parse as JSON
        try:
            tool_data = json.loads(json_content)
            # JSON parsing succeeded, not a malformed case
            return None
        except json.JSONDecodeError:
            # JSON is malformed! Try to extract using our heuristics
            logger.warning("Found complete XML tags with malformed JSON inside")

            # Extract tool name
            tool_name = None
            name_match = re.search(r'"name"\s*:\s*"([^"]+)"', json_content)
            if name_match:
                tool_name = name_match.group(1)

            # Try to extract answer using malformed submit extraction
            extracted_answer = extract_answer_from_malformed_submit(json_content)

            if extracted_answer and tool_name:
                logger.info(f"Successfully extracted answer from XML with malformed JSON")
                return {
                    'tool_name': tool_name,
                    'arguments': {'answer': extracted_answer}
                }
            else:
                logger.warning("Could not extract answer from XML with malformed JSON")
                return None

    except Exception as e:
        logger.error(f"Error detecting XML with malformed JSON: {e}")
        return None


def detect_incomplete_tool_call(message_text: str) -> dict | None:
    """
    Detect incomplete XML tool call tags in message text.

    This catches cases where the model outputs <tool_call>...</tool_call> but the
    closing tag is missing, causing XML parsing to fail and the tool call to never
    be recognized by the Inspect framework.

    Args:
        message_text: The raw message text from the model

    Returns:
        dict with 'tool_name' and 'arguments' if incomplete tool call detected,
        None otherwise
    """
    import re
    import json

    if not message_text or not isinstance(message_text, str):
        return None

    try:
        # Look for <tool_call> followed by JSON, but missing </tool_call>
        # Pattern: <tool_call>{"name": "...", "arguments": {...}}
        # Without a closing </tool_call> tag

        # First check if there's an opening tag
        if '<tool_call>' not in message_text:
            return None

        # Check if closing tag is missing
        if '</tool_call>' in message_text:
            # Complete tag pair exists, no issue
            return None

        # Extract content between <tool_call> and end of string
        tool_call_match = re.search(r'<tool_call>\s*(\{.*)', message_text, re.DOTALL)
        if not tool_call_match:
            return None

        json_content = tool_call_match.group(1).strip()

        # Try to find where the JSON ends
        # Look for balanced braces
        brace_count = 0
        json_end = -1
        for i, char in enumerate(json_content):
            if char == '{':
                brace_count += 1
            elif char == '}':
                brace_count -= 1
                if brace_count == 0:
                    json_end = i + 1
                    break

        if json_end > 0:
            json_str = json_content[:json_end]
            try:
                # Try to parse the JSON
                tool_data = json.loads(json_str)
                if isinstance(tool_data, dict) and 'name' in tool_data:
                    tool_name = tool_data.get('name')
                    arguments = tool_data.get('arguments', {})

                    # Special handling for submit tool with duplicate answer keys:
                    # JSON parser only keeps the last value, but we want to concatenate all
                    if tool_name == 'submit' and isinstance(arguments, dict):
                        # Find all "answer" field values in the raw JSON string
                        answer_matches = list(re.finditer(r'"answer"\s*:\s*"([^"]*(?:\\"[^"]*)*)"', json_str))
                        if len(answer_matches) > 1:
                            # Multiple answer fields found - concatenate them
                            all_answers = [match.group(1) for match in answer_matches]
                            concatenated = ''.join(all_answers)
                            arguments = {'answer': concatenated}
                            logger.info(f"Concatenated {len(all_answers)} answer fields in incomplete tool call")

                    return {
                        'tool_name': tool_name,
                        'arguments': arguments
                    }
            except json.JSONDecodeError:
                # JSON is also malformed (e.g., unescaped LaTeX backslashes)
                # Try to extract the answer using the same heuristics as malformed submit
                logger.warning("Found incomplete tool call with malformed JSON, attempting extraction")

                # Extract tool name if possible
                tool_name = None
                name_match = re.search(r'"name"\s*:\s*"([^"]+)"', json_str)
                if name_match:
                    tool_name = name_match.group(1)

                # Try to extract answer using malformed submit extraction
                extracted_answer = extract_answer_from_malformed_submit(json_str)

                if extracted_answer and tool_name:
                    logger.info(f"Successfully extracted answer from incomplete tool call with malformed JSON")
                    return {
                        'tool_name': tool_name,
                        'arguments': {'answer': extracted_answer}
                    }
                else:
                    logger.warning("Could not extract answer from incomplete tool call with malformed JSON")
                    return None

        return None

    except Exception as e:
        logger.error(f"Error detecting incomplete tool call: {e}")
        return None


def clean_thinking_tags(text: str) -> str:
    """
    Strip model "thinking" sections out of a response.

    Four case-insensitive removals are applied in a fixed order:
    closed <think>...</think> blocks, closed <thinking>...</thinking> blocks,
    then an unclosed <think> or <thinking> tag that sits at the very start of
    the remaining text (which removes everything to the end). The result is
    whitespace-trimmed.

    Args:
        text: The text that may contain thinking tags

    Returns:
        The cleaned text with thinking tags removed; falsy input is returned
        unchanged
    """
    import re

    if not text:
        return text

    # Order matters: closed blocks go first so that an unclosed-tag rule only
    # fires on whatever is left over.
    removal_patterns = (
        r'<think>.*?</think>',
        r'<thinking>.*?</thinking>',
        r'^<think>.*',
        r'^<thinking>.*',
    )

    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.DOTALL | re.IGNORECASE)

    return cleaned.strip()


class Evaluator:
    """
    Evaluator using single continuous conversation.

    This evaluator implements the approach:
    1. Single continuous conversation (no stage breaks)
    2. Extended conversation limits for thorough exploration  
    3. Iterative code execution and experimentation
    4. Natural context retention through conversation flow
    5. Standardized answer submission format
    """
    
    def __init__(self):
        """Build the framework configuration and run its directory setup."""
        self.config = framework_config = DatabaseFrameworkConfig()
        framework_config.setup_directories()
    
    def _create_custom_submit_tool(self):
        """Create a minimal submit tool that just signals completion.

        The tool's name, description and ``answer`` parameter metadata are read
        from the prompt configuration entry for ``'submit'``. The tool itself
        does no work: its ``execute`` coroutine accepts the answer and returns
        a fixed acknowledgement string.

        Returns:
            The result of calling the ``@tool``-decorated factory defined
            below, i.e. the tool instance.
        """
        # Import tool description utilities (private inspect_ai modules)
        from inspect_ai.tool._tool_description import set_tool_description, ToolDescription
        from inspect_ai.tool._tool_params import ToolParams, ToolParam
        from .config.prompt_config import get_prompt_config
        
        # Get tool description from centralized config
        prompt_config = get_prompt_config()
        tool_desc = prompt_config.get_tool_description('submit')
        
        @tool
        def proofbench_submit() -> Tool:  
            # Set the outer function docstring from config.
            # Note this assignment runs when the factory is called, not when
            # it is decorated.
            proofbench_submit.__doc__ = tool_desc['description']
            
            async def execute(answer: str) -> ToolResult:
                # Just acknowledge - the real work happens in on_continue
                return "Answer recorded successfully."
            
            # Set up parameters for tool description: a single required
            # 'answer' parameter. Type and description come from the config,
            # with the defaults below used when the config omits them.
            params = ToolParams()
            param_config = tool_desc.get('parameters', {}).get('answer', {})
            params.properties['answer'] = ToolParam(
                type=param_config.get('type', 'string'),
                description=param_config.get('description', 'Your answer for the current question')
            )
            params.required = ['answer']
            
            # Apply custom description for model view (name and description
            # are taken from the config, not from the Python function names)
            set_tool_description(
                execute,
                ToolDescription(
                    name=tool_desc['name'],
                    description=tool_desc['description'].strip(),
                    parameters=params
                )
            )
            
            # Also set the execute function's docstring directly (belt and suspenders approach)
            execute.__doc__ = tool_desc['description']
            
            return execute
        
        # Instantiate the tool immediately; callers receive the tool, not the factory.
        return proofbench_submit()
    
    async def _save_answer_to_database(self, stage: str, answer: str, model_attempt_id: int, ran_out_of_tokens: bool = False):
        """Save the model's answer to the database immediately.

        For ``'main_question'`` a ``ModelAnswer`` row is updated if one already
        exists for the attempt/question/model, otherwise created. For a stage
        named ``'subquestion_<order>'`` the matching ``Subquestion`` is looked
        up by that order, the answer is graded with
        ``_check_answer_correctness``, and a ``ModelSubquestionAnswer`` row is
        updated or created with ``is_correct`` stored as 1 or 0. Any other
        stage is reported with a warning and nothing is written.

        This is best-effort: database errors are logged with a traceback and
        swallowed, so the method never raises for a failed save and returns
        nothing.

        Args:
            stage: Current evaluation stage (e.g., 'main_question', 'subquestion_a')
            answer: The model's submitted answer
            model_attempt_id: ID of the current ModelAttempt
            ran_out_of_tokens: Whether the model ran out of tokens before submitting
        """
        from asgiref.sync import sync_to_async
        from .models import ModelAnswer, ModelSubquestionAnswer, ModelAttempt
        from questions.models import Subquestion

        # str() keeps this preview from raising if a non-string answer slips
        # through; for a real string the output is unchanged.
        logger.info(f"Saving answer for stage {stage}: {str(answer)[:100]}...")

        try:
            if stage == 'main_question':
                # ORM calls are synchronous, so they run via sync_to_async.
                @sync_to_async
                def save_main_answer():
                    model_attempt = ModelAttempt.objects.get(id=model_attempt_id)
                    # Re-submissions overwrite the existing row rather than
                    # adding a second one.
                    existing = ModelAnswer.objects.filter(
                        attempt=model_attempt,
                        question_id=model_attempt.question_id,
                        model_id=model_attempt.model_id
                    ).first()

                    if existing:
                        existing.answer = answer
                        existing.ran_out_of_tokens = ran_out_of_tokens
                        existing.save()
                        return existing

                    return ModelAnswer.objects.create(
                        attempt=model_attempt,
                        question_id=model_attempt.question_id,
                        model_id=model_attempt.model_id,
                        answer=answer,
                        ran_out_of_tokens=ran_out_of_tokens
                    )

                saved_answer = await save_main_answer()
                logger.info(f"Saved main question answer (ID: {saved_answer.id})")

            elif stage.startswith('subquestion_'):
                # Only the leading prefix identifies the stage type; the rest
                # is the subquestion order used for the lookup.
                subquestion_order = stage.removeprefix('subquestion_')

                @sync_to_async
                def save_subquestion_answer():
                    model_attempt = ModelAttempt.objects.get(id=model_attempt_id)
                    subquestion = Subquestion.objects.get(
                        question_id=model_attempt.question_id,
                        subquestion_order=subquestion_order
                    )

                    # Grade against the stored reference answer using the
                    # shared comparison helper.
                    is_correct = self._check_answer_correctness(answer, subquestion.answer)

                    # Re-submissions overwrite the existing row.
                    existing = ModelSubquestionAnswer.objects.filter(
                        attempt=model_attempt,
                        subquestion=subquestion,
                        model_id=model_attempt.model_id
                    ).first()

                    if existing:
                        existing.answer = answer
                        existing.is_correct = 1 if is_correct else 0
                        existing.ran_out_of_tokens = ran_out_of_tokens
                        existing.save()
                        return existing

                    return ModelSubquestionAnswer.objects.create(
                        attempt=model_attempt,
                        subquestion=subquestion,
                        model_id=model_attempt.model_id,
                        answer=answer,
                        is_correct=1 if is_correct else 0,
                        ran_out_of_tokens=ran_out_of_tokens
                    )

                saved_answer = await save_subquestion_answer()
                logger.info(f"Saved subquestion {subquestion_order} answer (ID: {saved_answer.id}, correct: {saved_answer.is_correct})")

            else:
                # Previously an unknown stage was dropped without a trace,
                # which made a lost answer impossible to diagnose.
                logger.warning(f"Unrecognized stage '{stage}'; answer was not saved")

        except Exception as e:
            # Deliberately swallowed: a failed save must not abort the
            # evaluation that is in progress.
            logger.error(f"Error saving answer to database: {str(e)}")
            logger.error(f"Traceback: {traceback.format_exc()}")
    
    def _check_answer_correctness(self, model_answer: str, correct_answer: str) -> bool:
        """Report whether a submitted answer matches the reference answer.

        Thin wrapper: both strings are forwarded unchanged to the shared
        ``check_answer_correctness`` helper imported from ``answer_utils``,
        and its verdict is returned as-is.

        NOTE(review): the matching rules (presumably case-insensitive
        comparison, yes/no/true/false equivalences and whitespace/formatting
        normalization) live in that helper and are not visible here - confirm
        against ``answer_utils``.

        Args:
            model_answer: The model's submitted answer
            correct_answer: The correct answer from the database

        Returns:
            Whatever the shared helper decides for this pair of answers
        """
        is_match = check_answer_correctness(model_answer, correct_answer)
        return is_match
    
    async def evaluate(
        self,
        question_id: int,
        model_key: str,
        attempt_number: int = 1,
        model_attempt_id: int = None
    ) -> Dict[str, Any]:
        """
        Run one evaluation of a question for a given model.

        Steps:
        1. Load the question, the model configuration and the question's
           subquestions (ordered by ``subquestion_order``).
        2. Build an evaluation id from the question id, model key and
           attempt number.
        3. Delegate the whole run to ``_run_single_agent_evaluation``.

        Nothing is raised to the caller: any exception is logged with its
        traceback and turned into a failure dictionary. Errors whose message
        mentions "time limit" or "timeout" are logged as warnings and get a
        rewritten message.

        Args:
            question_id: Database ID of the question to evaluate
            model_key: Model identifier passed to the config lookup
            attempt_number: Attempt number; used in the evaluation id and
                echoed back in the failure dictionary
            model_attempt_id: ID of the ModelAttempt, forwarded to the agent run

        Returns:
            The dictionary produced by ``_run_single_agent_evaluation``, or on
            failure a dictionary with ``success`` False plus ``error``,
            ``error_type``, ``question_id``, ``model_key`` and
            ``attempt_number``.
        """
        try:
            question = await self._get_question(question_id)

            # ORM/config access is synchronous, so wrap it for async use.
            from asgiref.sync import sync_to_async

            model_config = await sync_to_async(self.config.get_model_config)(model_key)
            if not model_config:
                raise ValueError(f"Unknown model key: {model_key}")

            def _load_subquestions():
                ordered = Subquestion.objects.filter(question=question).order_by('subquestion_order')
                return list(ordered)

            subquestions = await sync_to_async(_load_subquestions)()

            logger.info(f"Starting sequential evaluation: Q{question_id} with {model_key} (main + {len(subquestions)} subquestions)")

            # Slashes in the model key are replaced so the id stays a single token.
            safe_model_key = model_key.replace('/', '_')

            # Single react agent run (submit tool handles stage transitions)
            return await self._run_single_agent_evaluation(
                question=question,
                subquestions=subquestions,
                model_config=model_config,
                evaluation_id=f"q{question_id}_m{safe_model_key}_a{attempt_number}",
                model_attempt_id=model_attempt_id
            )

        except Exception as exc:
            error_msg = str(exc)
            lowered = error_msg.lower()
            looks_like_timeout = 'time limit' in lowered or 'timeout' in lowered

            if looks_like_timeout:
                logger.warning(f"Evaluation TIMEOUT for Q{question_id} after 86400 seconds (24 hours): {error_msg}")
                error_msg = f"Evaluation exceeded time limit (86400 seconds / 24 hours). Original error: {error_msg}"
            else:
                logger.error(f"Sequential evaluation failed for Q{question_id}: {error_msg}")
            logger.error(traceback.format_exc())

            failure = {
                "success": False,
                "error": error_msg,
                "error_type": "sequential_evaluation_error",
                "question_id": question_id,
                "model_key": model_key,
                "attempt_number": attempt_number,
            }
            return failure
    
    async def _run_single_agent_evaluation(
        self,
        question,
        subquestions,
        model_config,
        evaluation_id: str,
        model_attempt_id: int = None
    ) -> Dict[str, Any]:
        """
        Run single react agent evaluation with stage transitions handled by submit tool.
        
        This method starts the agent with only the main question. The submit tool
        handles stage transitions by prompting the model with subquestions.
        """
        try:
            start_time = timezone.now()
            
            # Set up environment variables for the submit tool and token tracking
            import os
            from .config.prompt_config import get_prompt_config
            
            # Get token limits from configuration
            prompt_config = get_prompt_config()
            token_limits = prompt_config.get_token_limits()
            
            
            os.environ['PROOFBENCH_EVALUATION_ID'] = evaluation_id
            os.environ['PROOFBENCH_CURRENT_STAGE'] = 'main_question'
            os.environ['PROOFBENCH_SUBMISSION_COUNT'] = '0'
            os.environ['PROOFBENCH_QUESTION_ID'] = str(question.id)
            
            # Token tracking environment variables
            os.environ['PROOFBENCH_STAGE_TOKEN_COUNT'] = '0'
            os.environ['PROOFBENCH_STAGE_TOKEN_LIMIT'] = str(token_limits['main_question'])
            os.environ['PROOFBENCH_MAX_TOKENS_PER_CALL'] = str(token_limits['max_per_call'])
            os.environ['PROOFBENCH_LAST_TOTAL_TOKENS'] = '0'  # Track cumulative tokens for incremental calculation
            
            # Create problem text with ONLY the main question initially
            problem_text = f"# Main Problem\n\n{question.text}"
            
            # Load system prompt for mathematical problem solving with token limits
            system_prompt = self._load_system_prompt(token_limits)
            
            # Create the sample for the react agent
            sample = Sample(
                input=problem_text,
                metadata={
                    "question_id": question.id,
                    "evaluation_id": evaluation_id,
                    "total_subquestions": len(subquestions),
                    "approach": "sequential_with_submit_transitions"
                }
            )
            
            # Check if tool emulation is enabled for this model
            # Note: emulate_tools is converted to boolean True/False when loaded from database
            use_tool_emulation = (
                model_config.model_args and
                model_config.model_args.get('emulate_tools') is True
            )

            # Create custom continuation function for multi-stage evaluation
            async def custom_agent_continue(state: AgentState) -> bool | str:
                """
                Determines evaluation continuation after interactions.
                Handles answer extraction, database saving, stage transitions, and token tracking.
                
                CRITICAL: on_continue is called after EVERY interaction.
                After submit: Assistant (with tool_call) -> Tool (response) -> on_continue called here
                
                Returns:
                - False to terminate evaluation 
                - True to continue silently
                - String to inject as User message
                """
                import json
                import os
                from asgiref.sync import sync_to_async
                from questions.models import Subquestion
                from .config.prompt_config import get_prompt_config
                
                # Early exit if no messages
                if not state.messages or len(state.messages) < 2:
                    return True
                
                
                # Get configuration
                prompt_config = get_prompt_config()
                
                # Check the last two messages to understand what just happened
                last_msg = state.messages[-1] if state.messages else None
                second_last_msg = state.messages[-2] if len(state.messages) >= 2 else None
                
                # Token tracking: extract tokens from latest message and check limits
                current_stage_tokens = int(os.environ.get('PROOFBENCH_STAGE_TOKEN_COUNT', '0'))
                stage_token_limit = int(os.environ.get('PROOFBENCH_STAGE_TOKEN_LIMIT', '300000'))
                
                # Extract tokens from the latest interaction
                latest_tokens = self._extract_tokens_from_state(state)
                if latest_tokens > 0:
                    current_stage_tokens += latest_tokens
                    os.environ['PROOFBENCH_STAGE_TOKEN_COUNT'] = str(current_stage_tokens)
                    logger.info(f"Token usage updated: {current_stage_tokens} of {stage_token_limit} tokens")
                
                # Check if token limit exceeded and we're not already in final round
                is_final_round = os.environ.get('PROOFBENCH_FINAL_ROUND', 'false') == 'true'
                
                if current_stage_tokens > stage_token_limit and not is_final_round:
                    logger.warning(f"Token limit exceeded: {current_stage_tokens} > {stage_token_limit}")
                    current_stage = os.environ.get('PROOFBENCH_CURRENT_STAGE', 'main_question')
                    
                    # Check if an answer has already been submitted for this stage
                    answer_already_submitted = os.environ.get(f'PROOFBENCH_ANSWER_SUBMITTED_{current_stage}', 'false') == 'true'
                    
                    # Also check if submit was just called in the current interaction
                    just_submitted = False
                    if second_last_msg and hasattr(second_last_msg, 'tool_calls'):
                        for tool_call in second_last_msg.tool_calls:
                            if tool_call.function == 'submit':
                                just_submitted = True
                                logger.info("Submit tool was just called in this interaction, not triggering final round")
                                break
                    
                    
                    if not (answer_already_submitted or just_submitted):
                        logger.info(f"Triggering final round for stage '{current_stage}' due to token limit (no answer submitted yet)")
                        
                        # Set final round flag
                        os.environ['PROOFBENCH_FINAL_ROUND'] = 'true'
                        os.environ['PROOFBENCH_TOKEN_EXHAUSTED'] = 'true'
                        os.environ['PROOFBENCH_FINAL_ROUND_PROMPT_SENT'] = 'true'
                        
                        # Return special prompt for final round
                        final_prompt = prompt_config.get_final_round_prompt()
                        return final_prompt
                    else:
                        # Answer already submitted, token limit exceeded
                        # Don't trigger final round, but also don't end evaluation
                        # Let the code continue to check for submit and handle transition
                        logger.info(f"Token limit exceeded for stage '{current_stage}' but answer already submitted, continuing to transition")
                
                # Handle final round - any tool call or text is treated as submission
                is_final_round = os.environ.get('PROOFBENCH_FINAL_ROUND', 'false') == 'true'
                final_round_prompt_sent = os.environ.get('PROOFBENCH_FINAL_ROUND_PROMPT_SENT', 'false') == 'true'
                
                if is_final_round and final_round_prompt_sent:
                    
                    # Check if model made any tool call or text response
                    answer = None
                    
                    # We need to find the ASSISTANT's response, not the user prompt
                    # Check for tool calls in second-last message if it's from assistant
                    if second_last_msg and hasattr(second_last_msg, 'role') and second_last_msg.role == 'assistant':
                        if hasattr(second_last_msg, 'tool_calls') and second_last_msg.tool_calls:
                            # Extract content from first tool call
                            tool_call = second_last_msg.tool_calls[0]
                            
                            try:
                                args = json.loads(tool_call.arguments) if isinstance(tool_call.arguments, str) else tool_call.arguments
                                # Try to get answer from various possible argument names
                                answer = args.get('answer') or args.get('sage_code') or args.get('code') or args.get('cmd') or str(args)
                            except:
                                answer = str(tool_call.arguments)
                        
                        # Check for text response if no tool call
                        elif second_last_msg.content:
                            # Handle both string content and structured content
                            if isinstance(second_last_msg.content, str):
                                answer = clean_thinking_tags(second_last_msg.content)
                            elif hasattr(second_last_msg.content, '__iter__'):
                                # Content might be a list of ContentText/ContentReasoning objects
                                text_parts = []
                                for item in second_last_msg.content:
                                    if hasattr(item, 'text'):
                                        text_parts.append(item.text)
                                    elif hasattr(item, 'content'):
                                        text_parts.append(str(item.content))
                                combined_text = '\n'.join(text_parts) if text_parts else str(second_last_msg.content)
                                answer = clean_thinking_tags(combined_text)
                            else:
                                answer = clean_thinking_tags(str(second_last_msg.content))
                    
                    # Also check the last message if it's from assistant (in case we're called after assistant response)
                    elif last_msg and hasattr(last_msg, 'role') and last_msg.role == 'assistant':
                        if hasattr(last_msg, 'tool_calls') and last_msg.tool_calls:
                            # Extract content from first tool call
                            tool_call = last_msg.tool_calls[0]
                            
                            try:
                                args = json.loads(tool_call.arguments) if isinstance(tool_call.arguments, str) else tool_call.arguments
                                # Try to get answer from various possible argument names
                                answer = args.get('answer') or args.get('sage_code') or args.get('code') or args.get('cmd') or str(args)
                            except:
                                answer = str(tool_call.arguments)
                        
                        # Check for text response if no tool call
                        elif last_msg.content:
                            # Handle both string content and structured content
                            if isinstance(last_msg.content, str):
                                answer = clean_thinking_tags(last_msg.content)
                            elif hasattr(last_msg.content, '__iter__'):
                                # Content might be a list of ContentText/ContentReasoning objects
                                text_parts = []
                                for item in last_msg.content:
                                    if hasattr(item, 'text'):
                                        text_parts.append(item.text)
                                    elif hasattr(item, 'content'):
                                        text_parts.append(str(item.content))
                                combined_text = '\n'.join(text_parts) if text_parts else str(last_msg.content)
                                answer = clean_thinking_tags(combined_text)
                            else:
                                answer = clean_thinking_tags(str(last_msg.content))
                    
                    # If we found an answer, save it and transition
                    if answer:
                        current_stage = os.environ.get('PROOFBENCH_CURRENT_STAGE', 'main_question')
                        
                        # Save with ran_out_of_tokens=True
                        if model_attempt_id:
                            await self._save_answer_to_database(current_stage, answer, model_attempt_id, ran_out_of_tokens=True)
                            
                            # Mark that an answer has been submitted for this stage
                            os.environ[f'PROOFBENCH_ANSWER_SUBMITTED_{current_stage}'] = 'true'
                        
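                        # Clear final round flags for next stage.
                        # NOTE(review): PROOFBENCH_TOKEN_EXHAUSTED is set to 'true' when the final
                        # round is triggered but is not reset in this branch, whereas the
                        # malformed-submit confirmation branch resets it on each transition -
                        # confirm whether it should be cleared here as well.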
                        os.environ['PROOFBENCH_FINAL_ROUND'] = 'false'
                        os.environ['PROOFBENCH_FINAL_ROUND_PROMPT_SENT'] = 'false'
                        
                        # Proceed with stage transition (same as normal submit)
                        question_id = os.environ.get('PROOFBENCH_QUESTION_ID')
                        if not question_id:
                            logger.warning("No PROOFBENCH_QUESTION_ID, ending evaluation")
                            return False
                        
                        # subquestions is not re-fetched here; it is expected to be captured
                        # from the enclosing scope (closure).
                        
                        # Verify subquestions is available from closure
                        try:
                            # Referencing the name raises NameError if it was never bound
                            len(subquestions)
                        except NameError:
                            logger.error("subquestions not available in closure (final round)")
                            return False
                        
                        # Stage transition logic
                        if current_stage == "main_question":
                            if subquestions:
                                # Transition to first subquestion
                                first_subq = subquestions[0]
                                os.environ['PROOFBENCH_CURRENT_STAGE'] = f'subquestion_{first_subq.subquestion_order}'
                                
                                # Reset token counter for new stage
                                token_limits = prompt_config.get_token_limits()
                                os.environ['PROOFBENCH_STAGE_TOKEN_COUNT'] = '0'
                                os.environ['PROOFBENCH_STAGE_TOKEN_LIMIT'] = str(token_limits['subquestion'])
                                # Clear answer submission flag from previous stage
                                os.environ.pop(f'PROOFBENCH_ANSWER_SUBMITTED_{current_stage}', None)
                                
                                logger.info(f"Transitioning from main to subquestion {first_subq.subquestion_order} after final round")
                                
                                # Return subquestion prompt
                                transition_prompt = prompt_config.get_evaluation_transition_prompt(
                                    'main_to_subquestion',
                                    subquestion_order=first_subq.subquestion_order.upper(),
                                    subquestion_text=first_subq.text
                                )
                                return transition_prompt
                            else:
                                # No subquestions, evaluation complete
                                logger.info("No subquestions found, evaluation complete after final round")
                                return False
                        
                        elif current_stage.startswith("subquestion_"):
                            # Find next subquestion
                            current_order = current_stage.replace('subquestion_', '')
                            next_subq = None
                            for i, subq in enumerate(subquestions):
                                if subq.subquestion_order == current_order and i + 1 < len(subquestions):
                                    next_subq = subquestions[i + 1]
                                    break
                            
                            if next_subq:
                                # Transition to next subquestion
                                os.environ['PROOFBENCH_CURRENT_STAGE'] = f'subquestion_{next_subq.subquestion_order}'
                                os.environ['PROOFBENCH_STAGE_TOKEN_COUNT'] = '0'
                                # Clear answer submission flag from previous stage
                                os.environ.pop(f'PROOFBENCH_ANSWER_SUBMITTED_{current_stage}', None)
                                
                                logger.info(f"Transitioning to subquestion {next_subq.subquestion_order} after final round")
                                
                                return prompt_config.get_evaluation_transition_prompt(
                                    'subquestion_to_subquestion',
                                    current_subquestion=current_order.upper(),
                                    next_subquestion=next_subq.subquestion_order.upper(),
                                    subquestion_text=next_subq.text
                                )
                            else:
                                # All subquestions complete
                                logger.info("All subquestions complete after final round, evaluation ending")
                                return False
                        
                        else:
                            # Unknown stage
                            logger.error(f"Unknown stage after final round: {current_stage}")
                            return False
                
                
                # Check if we're waiting for confirmation of a malformed submit
                awaiting_confirmation = os.environ.get('PROOFBENCH_AWAITING_SUBMIT_CONFIRMATION', 'false') == 'true'

                if awaiting_confirmation:
                    # Check if the model wants to abort the submission
                    # The model's response could be in either last_msg or second_last_msg
                    # depending on whether there's a tool call or just text
                    confirmation_text = ""

                    # First try last_msg (most common case - assistant responds with acknowledgement)
                    if last_msg and hasattr(last_msg, 'content') and hasattr(last_msg, 'role'):
                        if last_msg.role == 'assistant':
                            if isinstance(last_msg.content, str):
                                confirmation_text = last_msg.content.strip()
                            elif hasattr(last_msg.content, '__iter__'):
                                text_parts = []
                                for item in last_msg.content:
                                    if hasattr(item, 'text'):
                                        text_parts.append(item.text)
                                confirmation_text = ' '.join(text_parts).strip()

                    # Fall back to second_last_msg (if on_continue is called after a tool response)
                    if not confirmation_text and second_last_msg and hasattr(second_last_msg, 'content') and hasattr(second_last_msg, 'role'):
                        if second_last_msg.role == 'assistant':
                            if isinstance(second_last_msg.content, str):
                                confirmation_text = second_last_msg.content.strip()
                            elif hasattr(second_last_msg.content, '__iter__'):
                                text_parts = []
                                for item in second_last_msg.content:
                                    if hasattr(item, 'text'):
                                        text_parts.append(item.text)
                                confirmation_text = ' '.join(text_parts).strip()

                    # Check if the model wants to abort
                    if 'ABORT_SUBMISSION' in confirmation_text.upper():
                        # Model wants to abort! Clear flags and let it continue working
                        logger.info("Model aborted malformed submit, continuing evaluation")
                        os.environ['PROOFBENCH_AWAITING_SUBMIT_CONFIRMATION'] = 'false'
                        os.environ.pop('PROOFBENCH_PENDING_SUBMIT_ANSWER', None)
                        # Return continue prompt to let model keep working
                        return prompt_config.get_continue_prompt(
                            current_tokens=int(os.environ.get('PROOFBENCH_STAGE_TOKEN_COUNT', '0')),
                            token_limit=int(os.environ.get('PROOFBENCH_STAGE_TOKEN_LIMIT', '0')),
                            include_emulation_reminder=use_tool_emulation
                        )
                    else:
                        # Model did not abort! Process the stored answer
                        stored_answer = os.environ.get('PROOFBENCH_PENDING_SUBMIT_ANSWER', '')
                        logger.info("Model did not abort malformed submit, processing stored answer")

                        # Clear confirmation flags
                        os.environ['PROOFBENCH_AWAITING_SUBMIT_CONFIRMATION'] = 'false'
                        os.environ.pop('PROOFBENCH_PENDING_SUBMIT_ANSWER', None)

                        # Process as if it was a normal confirmed submit.
                        # The save and stage-transition steps are written out inline below
                        # rather than shared with the final-round branch above.
                        answer = stored_answer
                        current_stage = os.environ.get('PROOFBENCH_CURRENT_STAGE', 'main_question')

                        # Save to database
                        if model_attempt_id:
                            ran_out_of_tokens = os.environ.get('PROOFBENCH_TOKEN_EXHAUSTED', 'false') == 'true'
                            await self._save_answer_to_database(current_stage, answer, model_attempt_id, ran_out_of_tokens=ran_out_of_tokens)
                            logger.info(f"Saved confirmed answer for {current_stage} to database")
                            os.environ[f'PROOFBENCH_ANSWER_SUBMITTED_{current_stage}'] = 'true'

                        # Get question ID for transitions
                        question_id = os.environ.get('PROOFBENCH_QUESTION_ID')
                        if not question_id:
                            logger.warning("No PROOFBENCH_QUESTION_ID, ending evaluation")
                            return False

                        # Verify subquestions is available from closure
                        try:
                            len(subquestions)
                        except NameError:
                            logger.error("subquestions not available in closure")
                            return False

                        # Stage transition logic (same as normal submit)
                        if current_stage == "main_question":
                            if subquestions:
                                # Transition to first subquestion
                                first_subq = subquestions[0]
                                os.environ['PROOFBENCH_CURRENT_STAGE'] = f'subquestion_{first_subq.subquestion_order}'

                                # Reset token counter for new stage and set subquestion limit
                                token_limits = prompt_config.get_token_limits()
                                os.environ['PROOFBENCH_STAGE_TOKEN_COUNT'] = '0'
                                os.environ['PROOFBENCH_STAGE_TOKEN_LIMIT'] = str(token_limits['subquestion'])
                                # Clear token exhaustion and answer submission flags for new stage
                                os.environ['PROOFBENCH_TOKEN_EXHAUSTED'] = 'false'
                                os.environ.pop(f'PROOFBENCH_ANSWER_SUBMITTED_{current_stage}', None)

                                logger.info(f"Transitioning from main to subquestion {first_subq.subquestion_order}")
                                logger.info(f"Reset token counter for subquestion, limit: {token_limits['subquestion']}")

                                # Return subquestion as User message using centralized prompt
                                return prompt_config.get_evaluation_transition_prompt(
                                    'main_to_subquestion',
                                    subquestion_order=first_subq.subquestion_order.upper(),
                                    subquestion_text=first_subq.text
                                )
                            else:
                                # No subquestions, evaluation complete
                                logger.info("No subquestions found, evaluation complete")
                                return False

                        elif current_stage.startswith("subquestion_"):
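                            # We're in a subquestion, check for next one.
                            # NOTE(review): the next index is derived from the stage suffix via
                            # ord(), which assumes single lowercase letters ('a', 'b', ...) that
                            # match list positions; the final-round branch instead looks up the
                            # current subquestion by its subquestion_order - confirm which is intended.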
                            current_letter = current_stage.replace("subquestion_", "")
                            current_index = ord(current_letter) - ord('a')

                            if current_index + 1 < len(subquestions):
                                # There's another subquestion
                                next_subq = subquestions[current_index + 1]

                                os.environ['PROOFBENCH_CURRENT_STAGE'] = f'subquestion_{next_subq.subquestion_order}'

                                # Reset token counter for new subquestion stage
                                os.environ['PROOFBENCH_STAGE_TOKEN_COUNT'] = '0'
                                # Clear token exhaustion and answer submission flags for new stage
                                os.environ['PROOFBENCH_TOKEN_EXHAUSTED'] = 'false'
                                os.environ.pop(f'PROOFBENCH_ANSWER_SUBMITTED_{current_stage}', None)

                                logger.info(f"Transitioning from subquestion {current_letter} to {next_subq.subquestion_order}")

                                # Return next subquestion as User message
                                return prompt_config.get_evaluation_transition_prompt(
                                    'subquestion_to_subquestion',
                                    current_subquestion=current_letter.upper(),
                                    next_subquestion=next_subq.subquestion_order.upper(),
                                    subquestion_text=next_subq.text
                                )
                            else:
                                # No more subquestions, evaluation complete
                                logger.info(f"Completed final subquestion {current_letter}, evaluation complete")
                                return False

                        else:
                            logger.error(f"Unknown stage: {current_stage}")
                            return False

                # Check assistant message text for malformed submit attempts:
                #   1. complete XML tags wrapping malformed JSON,
                #   2. raw JSON with no XML tags at all,
                #   3. incomplete XML (missing </tool_call>).
                # This must happen BEFORE the normal tool call check, because these attempts
                # are detected in the plain message text rather than in tool_calls[].
                # We check BOTH last_msg and second_last_msg because a malformed tool call
                # might not trigger a tool response, so it could be in either position.

                messages_to_check = []
                if last_msg and hasattr(last_msg, 'content') and hasattr(last_msg, 'role'):
                    if last_msg.role == 'assistant':
                        messages_to_check.append(('last', last_msg))
                if second_last_msg and hasattr(second_last_msg, 'content') and hasattr(second_last_msg, 'role'):
                    if second_last_msg.role == 'assistant':
                        messages_to_check.append(('second_last', second_last_msg))

                for msg_label, msg in messages_to_check:
                    # Extract text content from the message
                    message_text = ""
                    if isinstance(msg.content, str):
                        message_text = msg.content
                    elif hasattr(msg.content, '__iter__'):
                        text_parts = []
                        for item in msg.content:
                            if hasattr(item, 'text'):
                                text_parts.append(item.text)
                        message_text = ' '.join(text_parts)

                    if message_text:
                        # First check for complete XML with malformed JSON inside
                        # (This must be checked before incomplete XML, because it has complete tags)
                        xml_malformed = detect_xml_with_malformed_json(message_text)
                        if xml_malformed and xml_malformed['tool_name'] == 'submit':
                            logger.warning(f"Detected complete XML with malformed JSON in {msg_label} message")

                            # Extract answer
                            arguments = xml_malformed['arguments']
                            answer = ""

                            if isinstance(arguments, dict):
                                answer = arguments.get('answer', '')
                            elif isinstance(arguments, str):
                                try:
                                    args_dict = json.loads(arguments)
                                    answer = args_dict.get('answer', '')
                                except:
                                    answer = arguments

                            if answer:
                                # Store the extracted answer and ask for confirmation
                                os.environ['PROOFBENCH_PENDING_SUBMIT_ANSWER'] = answer
                                os.environ['PROOFBENCH_AWAITING_SUBMIT_CONFIRMATION'] = 'true'

                                logger.info(f"Extracted answer from XML with malformed JSON (length: {len(answer)} chars)")

                                # Return confirmation prompt
                                confirmation_prompt = prompt_config.get_malformed_submit_confirmation(
                                    'xml_malformed_json',
                                    answer
                                )
                                return confirmation_prompt
                            else:
                                logger.error("Could not extract answer from XML with malformed JSON")
                                return "Error: Your submit tool call had invalid JSON syntax. Please try submitting again with proper formatting."

                        # Then check for raw JSON without any XML tags
                        raw_json_call = detect_raw_json_tool_call(message_text)
                        if raw_json_call and raw_json_call['tool_name'] == 'submit':
                            logger.warning(f"Detected raw JSON tool call (no XML tags) in {msg_label} message")

                            # Extract answer
                            arguments = raw_json_call['arguments']
                            answer = ""

                            if isinstance(arguments, dict):
                                answer = arguments.get('answer', '')
                            elif isinstance(arguments, str):
                                try:
                                    args_dict = json.loads(arguments)
                                    answer = args_dict.get('answer', '')
                                except:
                                    answer = arguments

                            if answer:
                                # Store the extracted answer and ask for confirmation
                                os.environ['PROOFBENCH_PENDING_SUBMIT_ANSWER'] = answer
                                os.environ['PROOFBENCH_AWAITING_SUBMIT_CONFIRMATION'] = 'true'

                                logger.info(f"Extracted answer from raw JSON tool call (length: {len(answer)} chars)")

                                # Return confirmation prompt
                                confirmation_prompt = prompt_config.get_malformed_submit_confirmation(
                                    'raw_json_no_xml',
                                    answer
                                )
                                return confirmation_prompt
                            else:
                                logger.error("Could not extract answer from raw JSON tool call")
                                return "Error: Your submit tool call was malformed (missing XML tags) and the answer could not be extracted. Please try submitting again with proper formatting."

                        # Then check for incomplete XML (missing closing tag)
                        incomplete_call = detect_incomplete_tool_call(message_text)
                        if incomplete_call and incomplete_call['tool_name'] == 'submit':
                            logger.warning(f"Detected incomplete XML tool call (missing </tool_call> tag) in {msg_label} message")

                            # Try to extract answer from the arguments
                            arguments = incomplete_call['arguments']
                            answer = ""

                            if isinstance(arguments, dict):
                                answer = arguments.get('answer', '')
                            elif isinstance(arguments, str):
                                # Arguments might be a JSON string
                                try:
                                    args_dict = json.loads(arguments)
                                    answer = args_dict.get('answer', '')
                                except:
                                    answer = arguments

                            if answer:
                                # Store the extracted answer and ask for confirmation
                                os.environ['PROOFBENCH_PENDING_SUBMIT_ANSWER'] = answer
                                os.environ['PROOFBENCH_AWAITING_SUBMIT_CONFIRMATION'] = 'true'

                                logger.info(f"Extracted answer from incomplete tool call (length: {len(answer)} chars)")

                                # Return confirmation prompt
                                confirmation_prompt = prompt_config.get_malformed_submit_confirmation(
                                    'incomplete_xml',
                                    answer
                                )
                                return confirmation_prompt
                            else:
                                logger.error("Could not extract answer from incomplete tool call")
                                return "Error: Your submit tool call was incomplete (missing closing tag) and the answer could not be extracted. Please try submitting again with proper formatting."

                # CRITICAL: Check if we just processed a submit tool
                # Pattern: Assistant (with submit tool call) -> Tool (response "Answer recorded")
                #
                # IMPORTANT: With parallel tool calls, there are multiple tool response messages,
                # so we need to search backward to find the assistant message with tool_calls.
                # We can't just check second_last_msg because with N parallel calls, the assistant
                # message is at position -(N+1).
                #
                # CRITICAL FIX: We must NOT reprocess old tool calls from previous interactions.
                # If there's a user message between the found assistant and the current position,
                # it means we've already processed those tool calls (the user message was our
                # stage transition prompt). We should skip them.

                # Search backward through recent messages to find the most recent assistant with tool_calls
                # BUT: Stop if we hit a user message first (that means previous tool calls were already handled)
                assistant_with_tools = None
                for i in range(len(state.messages) - 1, max(0, len(state.messages) - 10), -1):
                    msg = state.messages[i]
                    if hasattr(msg, 'role'):
                        if msg.role == 'user':
                            # Hit a user message - any tool calls before this were already processed
                            # in a previous on_continue call. Don't look further back.
                            logger.debug(f"Found user message at position {i}, stopping backward search")
                            break
                        elif msg.role == 'assistant' and hasattr(msg, 'tool_calls') and msg.tool_calls:
                            assistant_with_tools = msg
                            logger.debug(f"Found assistant with tool_calls at position {i}")
                            break

                if assistant_with_tools:
                    # Check if submit was among the tool calls
                    for tool_call in assistant_with_tools.tool_calls:
                        # Debug logging
                        logger.debug(f"Checking tool_call: function={tool_call.function}, type={type(tool_call.function)}")

                        if tool_call.function == 'submit':
                            # YES! We just processed a submit. Handle the stage transition.

                            try:
                                # Try to extract answer from the tool call
                                args = json.loads(tool_call.arguments) if isinstance(tool_call.arguments, str) else tool_call.arguments
                                answer = args.get('answer', '')
                            except (json.JSONDecodeError, ValueError, AttributeError) as e:
                                # JSON parsing failed - this is a malformed submit
                                logger.warning(f"Malformed submit tool call detected: {e}")
                                logger.info(f"Raw tool call arguments: {tool_call.arguments[:500]}...")  # Log first 500 chars

                                # Try to extract answer using heuristics
                                extracted_answer = extract_answer_from_malformed_submit(str(tool_call.arguments))

                                if extracted_answer:
                                    # Store the extracted answer and ask for confirmation
                                    os.environ['PROOFBENCH_PENDING_SUBMIT_ANSWER'] = extracted_answer
                                    os.environ['PROOFBENCH_AWAITING_SUBMIT_CONFIRMATION'] = 'true'

                                    logger.info(f"Extracted answer from malformed submit (length: {len(extracted_answer)} chars)")

                                    # Return confirmation prompt
                                    confirmation_prompt = prompt_config.get_malformed_submit_confirmation(
                                        'tool_call_malformed_json',
                                        extracted_answer
                                    )
                                    return confirmation_prompt
                                else:
                                    # Could not extract answer, log error and let model continue
                                    logger.error("Could not extract answer from malformed submit, asking model to try again")
                                    return "Error: Your submit tool call was malformed and the answer could not be extracted. Please try submitting again with proper JSON formatting."

                            # Normal path - answer was successfully extracted
                            try:
                                
                                # Get current stage
                                current_stage = os.environ.get('PROOFBENCH_CURRENT_STAGE', 'main_question')
                                
                                # Save to database (check if we previously ran out of tokens)
                                if model_attempt_id:
                                    ran_out_of_tokens = os.environ.get('PROOFBENCH_TOKEN_EXHAUSTED', 'false') == 'true'
                                    await self._save_answer_to_database(current_stage, answer, model_attempt_id, ran_out_of_tokens=ran_out_of_tokens)
                                    logger.info(f"Saved answer for {current_stage} to database (ran_out_of_tokens={ran_out_of_tokens})")
                                    
                                    # Mark that an answer has been submitted for this stage
                                    os.environ[f'PROOFBENCH_ANSWER_SUBMITTED_{current_stage}'] = 'true'
                                
                                # Get question ID for transitions
                                question_id = os.environ.get('PROOFBENCH_QUESTION_ID')
                                if not question_id:
                                    logger.warning("No PROOFBENCH_QUESTION_ID, ending evaluation")
                                    return False
                                
                                # No need to re-fetch - subquestions is captured from outer scope
                                # subquestions variable is available from the closure
                                
                                # Verify subquestions is available from closure
                                try:
                                    # Verify subquestions is available from closure
                                    len(subquestions)
                                except NameError:
                                    logger.error("subquestions not available in closure")
                                    return False
                                
                                # Stage transition logic
                                if current_stage == "main_question":
                                    if subquestions:
                                        # Transition to first subquestion
                                        first_subq = subquestions[0]
                                        os.environ['PROOFBENCH_CURRENT_STAGE'] = f'subquestion_{first_subq.subquestion_order}'
                                        
                                        # Reset token counter for new stage and set subquestion limit
                                        token_limits = prompt_config.get_token_limits()
                                        os.environ['PROOFBENCH_STAGE_TOKEN_COUNT'] = '0'
                                        os.environ['PROOFBENCH_STAGE_TOKEN_LIMIT'] = str(token_limits['subquestion'])
                                        # Clear token exhaustion and answer submission flags for new stage
                                        os.environ['PROOFBENCH_TOKEN_EXHAUSTED'] = 'false'
                                        os.environ.pop(f'PROOFBENCH_ANSWER_SUBMITTED_{current_stage}', None)
                                        # Don't reset PROOFBENCH_LAST_TOTAL_TOKENS - we want cumulative tracking across stages
                                        
                                        logger.info(f"Transitioning from main to subquestion {first_subq.subquestion_order}")
                                        logger.info(f"Reset token counter for subquestion, limit: {token_limits['subquestion']}")
                                        
                                        # Return subquestion as User message using centralized prompt
                                        return prompt_config.get_evaluation_transition_prompt(
                                            'main_to_subquestion',
                                            subquestion_order=first_subq.subquestion_order.upper(),
                                            subquestion_text=first_subq.text
                                        )
                                    else:
                                        # No subquestions, evaluation complete
                                        logger.info("No subquestions found, evaluation complete")
                                        return False
                                
                                elif current_stage.startswith("subquestion_"):
                                    # We're in a subquestion, check for next one
                                    current_letter = current_stage.replace("subquestion_", "")
                                    current_index = ord(current_letter) - ord('a')

                                    if current_index + 1 < len(subquestions):
                                        # There's another subquestion
                                        next_subq = subquestions[current_index + 1]
                                        
                                        os.environ['PROOFBENCH_CURRENT_STAGE'] = f'subquestion_{next_subq.subquestion_order}'
                                        
                                        # Reset token counter for new subquestion stage
                                        os.environ['PROOFBENCH_STAGE_TOKEN_COUNT'] = '0'
                                        # Clear token exhaustion and answer submission flags for new stage
                                        os.environ['PROOFBENCH_TOKEN_EXHAUSTED'] = 'false'
                                        os.environ.pop(f'PROOFBENCH_ANSWER_SUBMITTED_{current_letter}', None)
                                        # Keep same subquestion token limit
                                        # Don't reset PROOFBENCH_LAST_TOTAL_TOKENS - we want cumulative tracking
                                        
                                        
                                        # Return next subquestion as User message using centralized prompt
                                        transition_prompt = prompt_config.get_evaluation_transition_prompt(
                                            'subquestion_to_subquestion',
                                            current_subquestion=current_letter.upper(),
                                            next_subquestion=next_subq.subquestion_order.upper(),
                                            subquestion_text=next_subq.text
                                        )
                                        return transition_prompt
                                    else:
                                        # All subquestions complete
                                        return False
                                
                                else:
                                    # Unknown stage - shouldn't happen
                                    logger.error(f"Unknown stage: {current_stage}")
                                    return False
                                    
                            except Exception as e:
                                logger.error(f"Error processing submit: {str(e)}")
                                import traceback
                                logger.error(traceback.format_exc())
                                # On error, let evaluation continue with prompt (with token info)
                                current_tokens = int(os.environ.get('PROOFBENCH_STAGE_TOKEN_COUNT', '0'))
                                token_limit = int(os.environ.get('PROOFBENCH_STAGE_TOKEN_LIMIT', '300000'))
                                return prompt_config.get_continue_prompt(current_tokens, token_limit, include_emulation_reminder=use_tool_emulation)
                            
                            # CRITICAL: After processing submit, we've either:
                            # 1. Returned a subquestion (User message)
                            # 2. Returned False (terminate)
                            # We should NEVER reach here after submit
                
                # If we get here, submit was NOT just called
                # Check if the model needs prompting to continue

                # CRITICAL FIX: Check if answer was already submitted in parallel tool calls
                # This handles the case where model makes parallel calls like (3 web_search + 1 submit)
                # The submit may have been processed above, so don't return continue prompt
                current_stage = os.environ.get('PROOFBENCH_CURRENT_STAGE', 'main_question')
                answer_already_submitted = os.environ.get(f'PROOFBENCH_ANSWER_SUBMITTED_{current_stage}', 'false') == 'true'

                if answer_already_submitted:
                    # Submit was already processed! Don't prompt to continue.
                    # Return True to continue silently (evaluation will end naturally)
                    logger.info(f"Answer already submitted for {current_stage}, continuing silently")
                    return True

                # Case 1: Assistant made tool calls (but not submit)
                if hasattr(last_msg, 'tool_calls') and last_msg.tool_calls:
                    # Model just made tool calls but didn't submit - show token info
                    current_tokens = int(os.environ.get('PROOFBENCH_STAGE_TOKEN_COUNT', '0'))
                    token_limit = int(os.environ.get('PROOFBENCH_STAGE_TOKEN_LIMIT', '300000'))
                    return prompt_config.get_continue_prompt(current_tokens, token_limit, include_emulation_reminder=use_tool_emulation)

                # Case 2: Check if second-last was assistant with non-submit tools
                if second_last_msg and hasattr(second_last_msg, 'tool_calls'):
                    # We already handled submit above, so these are other tools
                    current_tokens = int(os.environ.get('PROOFBENCH_STAGE_TOKEN_COUNT', '0'))
                    token_limit = int(os.environ.get('PROOFBENCH_STAGE_TOKEN_LIMIT', '300000'))
                    return prompt_config.get_continue_prompt(current_tokens, token_limit, include_emulation_reminder=use_tool_emulation)

                # Case 3: Assistant message without any tool calls
                if hasattr(last_msg, 'role') and getattr(last_msg, 'role', None) == 'assistant':
                    if not hasattr(last_msg, 'tool_calls') or not last_msg.tool_calls:
                        # Assistant didn't make tool calls - show token info
                        current_tokens = int(os.environ.get('PROOFBENCH_STAGE_TOKEN_COUNT', '0'))
                        token_limit = int(os.environ.get('PROOFBENCH_STAGE_TOKEN_LIMIT', '300000'))
                        continue_prompt = prompt_config.get_continue_prompt(current_tokens, token_limit, include_emulation_reminder=use_tool_emulation)
                        return continue_prompt
                
                # Default: continue silently
                return True
            submit_tool = self._create_custom_submit_tool()

            # Best practice web search configuration:
            # 1. Use internal providers for models that support them (faster, free)
            # 2. Always include external fallback (Tavily) for other models
            # 3. Skip Gemini internal search (conflicts with other tools)
            # 4. EXCEPTION: Gemini 3 Pro Preview uses Tavily only (no internal providers; the custom Google CSE tool is not wired in here)

            import os

            # Check if this is Gemini 3 Pro Preview - needs special configuration
            is_gemini_3_pro_preview = 'gemini-3-pro-preview' in model_config.model_name

            if is_gemini_3_pro_preview:
                # Gemini 3 Pro Preview: Use Tavily only (internal Gemini search conflicts with other tools)
                # With inspect-ai 0.3.148, thought signature handling should be fixed
                # Note: Custom Google CSE tool available in inspect_tools.py for future refinement
                web_search_providers = {
                    "tavily": {
                        "max_results": 5,
                        "search_depth": "basic",
                        "topic": "general"
                    }
                }
                logger.info("Gemini 3 Pro Preview detected: Using Tavily for web search (internal Gemini search conflicts with tools)")
            else:
                # All other models: Use standard multi-provider configuration
                web_search_providers = {
                    # Internal providers (work only with their respective models)
                    "openai": True,      # Works with: gpt-4o, gpt-4o-mini, gpt-5, o4-mini, o3
                    "anthropic": True,   # Works with: claude-opus-4*, claude-sonnet-4*
                    "grok": True,        # Works with: grok-3, grok-4 models
                    "perplexity": True,  # Works with: perplexity models (if any)
                    # Note: "gemini" excluded - it conflicts with other tools

                    # External fallback - works with ALL models including Gemini
                    "tavily": {
                        "max_results": 5,
                        "search_depth": "basic",  # Use "advanced" for deeper research
                        "topic": "general"        # Use "news" for current events
                    }
                }

                # Check if Tavily is configured
                if not os.getenv('TAVILY_API_KEY'):
                    logger.warning("TAVILY_API_KEY not set. Web search will only work for models with internal providers.")
                    # Remove Tavily if not configured
                    web_search_providers.pop("tavily", None)
            
            # Configure tools with best practice web search
            # Set timeout for individual tool calls (15 minutes)
            TOOL_TIMEOUT = 900  # 900 seconds = 15 minutes
            
            # Get mathematical tools with proper timeout
            mathematical_tools = get_mathematical_tools(timeout=TOOL_TIMEOUT)

            if web_search_providers:
                tool_list = [
                    python(timeout=TOOL_TIMEOUT),
                    bash(timeout=TOOL_TIMEOUT),
                    web_search(providers=web_search_providers),  # Web search has its own timeout handling
                    *mathematical_tools,
                    submit_tool
                ]
                logger.info(f"Web search configured with providers: {list(web_search_providers.keys())}")
                logger.info(f"Tool timeout set to {TOOL_TIMEOUT} seconds (15 minutes)")
            else:
                # No web search available
                tool_list = [
                    python(timeout=TOOL_TIMEOUT),
                    bash(timeout=TOOL_TIMEOUT),
                    *mathematical_tools,
                    submit_tool
                ]
                logger.warning("No web search providers available. Models will work without web search.")
                logger.info(f"Tool timeout set to {TOOL_TIMEOUT} seconds (15 minutes)")
            
            # Create task with react agent
            task = Task(
                dataset=MemoryDataset([sample]),
                solver=react(
                    prompt=system_prompt,
                    tools=tool_list,
                    submit=False,  # Disable automatic submit handling
                    on_continue=custom_agent_continue  # Use our custom continuation logic
                ),
                name=f"improofbench_sequential_{evaluation_id}",
                metadata={
                    "evaluation_id": evaluation_id,
                    "question_id": question.id,
                    "model_key": model_config.display_name,
                    "approach": "sequential_with_submit_transitions"
                }
            )
            
            # Run the evaluation
            logger.info(f"Starting single agent sequential evaluation for {evaluation_id}")

            # Monkey-patch Fireworks models to enable streaming for max_tokens > 5000
            # Fireworks requires stream=True for requests with max_tokens > 5000
            if model_config.company == 'fireworks':
                logger.info("Applying Fireworks streaming patch for max_tokens > 5000")
                from inspect_ai.model._providers.openai_compatible import OpenAICompatibleAPI
                from typing import Any, cast
                from openai.types.chat import ChatCompletion

                # Store original method
                original_generate_completion = OpenAICompatibleAPI._generate_completion

                # Create patched version that adds stream=True and handles streaming response
                async def patched_generate_completion(self, request: dict[str, Any], config) -> ChatCompletion:
                    """
                    Streaming stand-in for ``OpenAICompatibleAPI._generate_completion``.

                    Forces ``stream=True`` on the request, drains the chunk stream, and
                    rebuilds one non-streaming ``ChatCompletion`` from the accumulated
                    deltas (text, tool calls, finish reason, usage).

                    Args:
                        self: API instance; ``self.client`` issues the request.
                        request: Keyword arguments for ``chat.completions.create``.
                            Mutated in place: ``request['stream']`` is set to ``True``.
                        config: Not used here; kept so the signature matches the
                            method this function replaces.

                    Returns:
                        A ``ChatCompletion`` with a single choice (index 0) holding the
                        accumulated content and tool calls.

                    Raises:
                        RuntimeError: If the stream yields no chunks at all.
                    """
                    # Add stream=True to request
                    request['stream'] = True
                    logger.info(f"Fireworks streaming enabled for request with max_tokens={request.get('max_tokens', 'unset')}")
                    logger.debug(f"Full request keys: {list(request.keys())}")

                    # Make streaming request
                    stream = await self.client.chat.completions.create(**request)

                    # Accumulators for the pieces of the final response
                    content_parts = []           # text fragments, joined once at the end
                    accumulated_tool_calls = {}  # tool-call index -> partially built call
                    finish_reason = None
                    model = None
                    usage = None
                    first_chunk = None

                    async for chunk in stream:
                        if first_chunk is None:
                            # The first chunk supplies id/created/model for the rebuilt completion
                            first_chunk = chunk
                            model = chunk.model

                        # Only the first choice of each chunk is consumed
                        if chunk.choices:
                            choice = chunk.choices[0]
                            delta = getattr(choice, 'delta', None)
                            if delta:
                                text_piece = getattr(delta, 'content', None)
                                if text_piece:
                                    content_parts.append(text_piece)

                                # Tool calls arrive in pieces, keyed by their index
                                for delta_tc in getattr(delta, 'tool_calls', None) or ():
                                    idx = delta_tc.index
                                    entry = accumulated_tool_calls.get(idx)
                                    if entry is None:
                                        # A delta field can be present but None, so the
                                        # defaults are applied here and only overwritten
                                        # by truthy values below. Checking attribute
                                        # presence alone would let a None 'type' through
                                        # to the final tool-call object.
                                        entry = {
                                            'id': None,
                                            'type': 'function',
                                            'function': {
                                                'name': '',
                                                'arguments': ''
                                            }
                                        }
                                        accumulated_tool_calls[idx] = entry

                                    tc_id = getattr(delta_tc, 'id', None)
                                    if tc_id:
                                        entry['id'] = tc_id
                                    tc_type = getattr(delta_tc, 'type', None)
                                    if tc_type:
                                        entry['type'] = tc_type
                                    tc_function = getattr(delta_tc, 'function', None)
                                    if tc_function:
                                        name_piece = getattr(tc_function, 'name', None)
                                        if name_piece:
                                            entry['function']['name'] += name_piece
                                        args_piece = getattr(tc_function, 'arguments', None)
                                        if args_piece:
                                            entry['function']['arguments'] += args_piece

                            chunk_finish_reason = getattr(choice, 'finish_reason', None)
                            if chunk_finish_reason:
                                finish_reason = chunk_finish_reason
                                logger.debug(f"Fireworks stream finish_reason: {finish_reason}")

                        # Capture usage from whichever chunk carries it (the last one wins)
                        chunk_usage = getattr(chunk, 'usage', None)
                        if chunk_usage:
                            usage = chunk_usage
                            logger.debug(f"Fireworks stream usage: {usage}")

                    if first_chunk is None:
                        raise RuntimeError("No chunks received from streaming response")

                    accumulated_content = "".join(content_parts)

                    # Log accumulated content length for debugging
                    logger.info(f"Fireworks streaming complete: {len(accumulated_content)} chars, finish_reason={finish_reason}")

                    # Reconstruct a proper ChatCompletion object
                    from openai.types.chat.chat_completion import ChatCompletion, Choice
                    from openai.types.chat.chat_completion_message import ChatCompletionMessage
                    from openai.types.chat.chat_completion_message_tool_call import ChatCompletionMessageToolCall, Function

                    # Convert accumulated tool calls to proper format, in index order.
                    # A call that never received an id gets a synthetic "call_<index>" id.
                    tool_calls_list = None
                    if accumulated_tool_calls:
                        tool_calls_list = []
                        for idx in sorted(accumulated_tool_calls):
                            tc_data = accumulated_tool_calls[idx]
                            tool_calls_list.append(
                                ChatCompletionMessageToolCall(
                                    id=tc_data['id'] or f"call_{idx}",
                                    type=tc_data['type'] or 'function',
                                    function=Function(
                                        name=tc_data['function']['name'],
                                        arguments=tc_data['function']['arguments']
                                    )
                                )
                            )

                    # Create the message object (content is None when no text was streamed)
                    message = ChatCompletionMessage(
                        role="assistant",
                        content=accumulated_content if accumulated_content else None,
                        tool_calls=tool_calls_list
                    )

                    # CRITICAL: Add .text attribute for tool emulation parsing
                    # Llama31Handler.parse_assistant_response() expects choice.message.text
                    # but ChatCompletionMessage only has .content by default.
                    # object.__setattr__ is used to set the extra attribute directly.
                    object.__setattr__(message, 'text', accumulated_content or "")

                    # Create the choice object; default to "stop" if no finish reason was streamed
                    choice = Choice(
                        finish_reason=finish_reason or "stop",
                        index=0,
                        message=message,
                        logprobs=None
                    )

                    # Create the final ChatCompletion
                    completion = ChatCompletion(
                        id=first_chunk.id,
                        choices=[choice],
                        created=first_chunk.created,
                        model=model or request.get('model', 'unknown'),
                        object="chat.completion",
                        usage=usage
                    )

                    return completion

                # Apply the patch
                OpenAICompatibleAPI._generate_completion = patched_generate_completion
                logger.info("Fireworks streaming patch applied successfully")

            def run_eval():
                """Assemble the keyword arguments for Inspect's ``eval`` and run it.

                Reads ``task``, ``model_config`` and ``self.config`` from the
                enclosing scope and returns whatever ``eval`` returns.
                """
                # Base settings: one task, one connection, one sample at a time
                eval_kwargs = dict(
                    tasks=[task],
                    model=model_config.model_name,
                    log_dir=str(self.config.LOGS_DIR),
                    max_connections=1,
                    max_samples=1,
                    sandbox=get_proofbench_sandbox(),
                    fail_on_error=False,
                    time_limit=86400,  # 86400 seconds (24 hours) time limit
                )

                # Reasoning parameters (o-series models) go straight into eval(),
                # not into model_args
                reasoning_args = getattr(model_config, 'reasoning_args', None)
                if reasoning_args:
                    eval_kwargs.update(reasoning_args)

                # Any remaining model-specific arguments are passed as model_args
                extra_model_args = model_config.model_args
                if extra_model_args:
                    eval_kwargs["model_args"] = extra_model_args

                return eval(**eval_kwargs)
            
            # Execute in thread pool to avoid event loop conflict
            loop = asyncio.get_event_loop()
            with concurrent.futures.ThreadPoolExecutor() as executor:
                logs = await loop.run_in_executor(executor, run_eval)
            
            end_time = timezone.now()
            
            # Check if evaluation hit time limit
            timeout_occurred = False
            timeout_error_msg = None
            
            # Check the logs for timeout indication
            if logs and len(logs) > 0:
                first_log = logs[0]
                # Check if the evaluation status indicates timeout
                if hasattr(first_log, 'status') and first_log.status == 'error':
                    if hasattr(first_log, 'error') and first_log.error:
                        error_str = str(first_log.error)
                        if 'time limit' in error_str.lower() or 'timeout' in error_str.lower():
                            timeout_occurred = True
                            timeout_error_msg = error_str
                            logger.warning(f"Detected timeout in evaluation logs: {error_str}")
                
                # Also check samples for limit field indicating timeout
                if hasattr(first_log, 'samples') and first_log.samples:
                    for sample in first_log.samples:
                        if hasattr(sample, 'limit') and sample.limit:
                            if hasattr(sample.limit, 'type') and sample.limit.type == 'time':
                                timeout_occurred = True
                                timeout_error_msg = f"Time limit exceeded. limit: {sample.limit.limit} seconds"
                                logger.warning(f"Detected time limit in sample: {timeout_error_msg}")
                                break
            
            # Extract results
            results = {
                "success": not timeout_occurred,  # Mark as failed if timeout occurred
                "approach": "sequential_single_agent",
                "evaluation_id": evaluation_id,
                "question_id": question.id,
                "total_subquestions": len(subquestions),
                "start_time": start_time.isoformat(),
                "end_time": end_time.isoformat(),
                "duration_seconds": (end_time - start_time).total_seconds(),
                "logs": logs,
                "conversation_log": logs[0] if logs else None
            }
            
            # Add error information if timeout occurred
            if timeout_occurred:
                results["error"] = timeout_error_msg or "Evaluation exceeded time limit (86400 seconds / 24 hours)"
                results["error_type"] = "timeout"
            
            logger.info(f"Single agent sequential evaluation completed for {evaluation_id}")
            return results
            
        except Exception as e:
            # Check if this is a timeout error
            error_msg = str(e)
            if 'time limit' in error_msg.lower() or 'timeout' in error_msg.lower():
                logger.warning(f"Evaluation TIMEOUT in single agent evaluation: {error_msg}")
            else:
                logger.error(f"Error in single agent evaluation: {error_msg}")
            raise
    
    def _load_system_prompt(self, token_limits: Dict[str, int]) -> str:
        """
        Build the evaluation system prompt from the centralized prompt config.

        Args:
            token_limits: Mapping read at the keys 'main_question' and
                'subquestion'; both values are passed to the prompt config.

        Returns:
            The configured system prompt, or a short generic fallback prompt
            if building it raises any exception.
        """
        from .config.prompt_config import get_prompt_config

        fallback_prompt = "You are an expert mathematical problem solver."
        prompt_config = get_prompt_config()

        try:
            main_limit = token_limits['main_question']
            sub_limit = token_limits['subquestion']
            prompt_content = prompt_config.get_system_prompt(
                main_token_limit=main_limit,
                subquestion_token_limit=sub_limit,
            )
            logger.info(f"Loaded system prompt from centralized config ({len(prompt_content)} characters)")
            logger.info(f"Token limits: main={main_limit}, sub={sub_limit}")
        except Exception as e:
            # Best effort: fall back to a basic prompt without token limits
            logger.error(f"Error loading system prompt from config: {e}")
            return fallback_prompt

        return prompt_content
    
    def _extract_tokens_from_state(self, state) -> int:
        """
        Return the tokens used since the previous call.

        Sums output and reasoning tokens over every entry reported by
        sample_model_usage(), subtracts the total remembered in the
        PROOFBENCH_LAST_TOTAL_TOKENS environment variable, and then stores
        the new total there for the next call.

        Args:
            state: AgentState (not used, kept for compatibility)

        Returns:
            The difference between the current and the previously recorded
            output + reasoning token total; 0 when no usage data is available
            or when counting fails.
        """
        try:
            usage_by_model = sample_model_usage()
            if not usage_by_model:
                logger.debug("No model usage data available yet")
                return 0

            # Missing or None counters contribute nothing to the totals
            total_output = 0
            total_reasoning = 0
            for usage in usage_by_model.values():
                total_output += getattr(usage, 'output_tokens', None) or 0
                total_reasoning += getattr(usage, 'reasoning_tokens', None) or 0
            current_total = total_output + total_reasoning

            # Swap the remembered running total for the new one
            previous_total = int(os.environ.get('PROOFBENCH_LAST_TOTAL_TOKENS', '0'))
            os.environ['PROOFBENCH_LAST_TOTAL_TOKENS'] = str(current_total)

            incremental_tokens = current_total - previous_total
            if incremental_tokens > 0:
                logger.info(f"Incremental token usage: {incremental_tokens} (cumulative: output={total_output}, reasoning={total_reasoning}, total={current_total})")
            return incremental_tokens

        except Exception as e:
            # Report loudly but return 0 so the evaluation itself keeps running
            logger.error("WARNING: Token counting failed - unable to track token usage accurately!")
            logger.error(f"Token extraction error: {e}")
            logger.error("Token limits cannot be enforced without accurate token counting.")
            return 0
    
    async def _get_question(self, question_id: int):
        """
        Fetch the Question with the given id from the database.

        The synchronous ORM lookup (with the related 'author' selected) is
        wrapped in asgiref's sync_to_async so it can be awaited.
        """
        from asgiref.sync import sync_to_async

        def _fetch():
            return Question.objects.select_related('author').get(id=question_id)

        return await sync_to_async(_fetch)()


# Convenience function for external use
async def evaluate_question(
    question_id: int,
    model_key: str,
    attempt_number: int = 1,
    model_attempt_id: int = None
) -> Dict[str, Any]:
    """
    Evaluate a question with a freshly constructed Evaluator.

    Creates an Evaluator and awaits its evaluate() coroutine, forwarding all
    arguments unchanged.

    Args:
        question_id: Question ID to evaluate
        model_key: Model key from config
        attempt_number: Attempt number, forwarded positionally
        model_attempt_id: Optional ModelAttempt ID, forwarded by keyword

    Returns:
        Whatever Evaluator.evaluate() returns for this question
    """
    result = await Evaluator().evaluate(
        question_id,
        model_key,
        attempt_number,
        model_attempt_id=model_attempt_id,
    )
    return result
