"""
Evaluation Executor Service for Mathematical Model Evaluation

This module implements the core execution service that processes the evaluation queue,
runs model evaluations using the Inspect framework, and manages Docker containers
with time-based limits and resource constraints.
"""

import asyncio
import subprocess
import logging
import signal
import os
import json
import traceback
import time
from datetime import datetime
from typing import Dict, Any, Optional, List
from pathlib import Path
from contextlib import asynccontextmanager

# No in-memory locks needed - using subprocess-based evaluation
# Each evaluation runs in its own isolated Python process
# Company isolation handled via database locks (CompanyExecutionLock)

from django.conf import settings
from django.utils import timezone
from django.db import transaction

from .models import EvaluationQueue, ModelAnswer, ModelAttempt, ModelSubquestionAnswer, ExecutionTracker
from .queue_manager import queue_manager, get_next_work, complete_work
from .evaluator import Evaluator
from .terminal_log_manager import terminal_log_manager
from .db_config import DatabaseFrameworkConfig

logger = logging.getLogger(__name__)


class EvaluationExecutor:
    """
    Main evaluation executor that processes the queue and runs evaluations.
    
    Key Features:
    - Async evaluation processing with Inspect framework
    - 2-hour time limits for mathematical reasoning
    - Docker container lifecycle management
    - Terminal log capture and storage
    - Atomic status transitions and error handling
    - Company lock management with automatic cleanup
    """
    
    def __init__(self):
        """Initialize the evaluation executor."""
        self.config = DatabaseFrameworkConfig()
        self.evaluator = Evaluator()
        self.is_running = False
        self.is_shutting_down = False  # Track if shutdown is in progress
        self.shutdown_event = None  # Will be created when service starts
        self.current_evaluations = {}  # Track running evaluations

        # Configuration
        self.poll_interval = getattr(settings, 'QUEUE_POLL_INTERVAL', 30)  # 30 seconds

        logger.info("Evaluation executor initialized")
    
    async def start_service(self):
        """Start the evaluation service."""
        if self.is_running:
            logger.warning("Evaluation service already running")
            return

        self.is_running = True
        self.shutdown_event = asyncio.Event()  # Create event for this run
        logger.info("Starting evaluation service")

        # Setup signal handlers for graceful shutdown
        self._setup_signal_handlers()
        
        try:
            # Main service loop
            await self._service_loop()
        except KeyboardInterrupt:
            logger.info("Received interrupt signal")
        except Exception as e:
            logger.error(f"Service error: {str(e)}")
        finally:
            await self.shutdown_service()
    
    async def shutdown_service(self):
        """
        Gracefully shutdown the evaluation service.

        This method handles clean shutdown when the service receives SIGTERM/SIGINT:
        1. Stops accepting new evaluations
        2. Gives running evaluations a short grace period (5 seconds)
        3. Terminates all running subprocesses
        4. Resets terminated evaluations to 'pending' so they restart after reboot
        5. Clears all execution trackers

        This allows `systemctl --user stop evaluation_queue.service` to cleanly
        shut down everything, with evaluations automatically resuming on restart.
        """
        # Prevent multiple shutdown calls
        if self.is_shutting_down:
            return
        self.is_shutting_down = True
        self.is_running = False

        print("\n" + "=" * 60)
        print("EVALUATION SERVICE SHUTDOWN INITIATED")
        print("=" * 60)

        logger.info("Shutting down evaluation service")

        # Track which evaluations we're terminating (for database reset)
        terminated_queue_ids = []

        # Short grace period for running evaluations (5 seconds instead of 60)
        # Since we're doing a controlled shutdown, we don't need to wait long
        if self.current_evaluations:
            num_evals = len(self.current_evaluations)
            print(f"\n[1/4] Found {num_evals} running evaluation(s)")
            logger.info(f"Waiting briefly for {num_evals} evaluations...")

            # Collect queue IDs before we start terminating
            for eval_id, eval_info in self.current_evaluations.items():
                terminated_queue_ids.append(eval_id)

            # Brief grace period (5 seconds)
            try:
                grace_period = 5
                start_time = time.time()
                while self.current_evaluations and (time.time() - start_time) < grace_period:
                    await self._cleanup_completed_subprocesses()
                    if self.current_evaluations:
                        await asyncio.sleep(1)
            except Exception as e:
                logger.warning(f"Error during grace period: {str(e)}")

            # Terminate remaining subprocesses
            if self.current_evaluations:
                print(f"\n[2/4] Terminating {len(self.current_evaluations)} subprocess(es)...")
                await self._cancel_running_subprocesses()
            else:
                print("\n[2/4] All evaluations completed during grace period")
        else:
            print("\n[1/4] No running evaluations to terminate")
            print("[2/4] Skipping subprocess termination")

        # Reset terminated evaluations to 'pending' in the database
        print("\n[3/4] Resetting terminated evaluations to 'pending'...")
        await self._reset_evaluations_to_pending(terminated_queue_ids)

        # Clear all execution trackers
        print("\n[4/4] Clearing execution trackers...")
        await self._clear_all_execution_trackers()

        print("\n" + "=" * 60)
        print("SHUTDOWN COMPLETE")
        print("=" * 60)
        print("\nEvaluations will automatically resume when the service restarts.")
        print("To restart: systemctl --user start evaluation_queue.service\n")

        logger.info("Evaluation service shutdown complete")
    
    def _setup_signal_handlers(self):
        """Setup signal handlers for graceful shutdown using asyncio's signal handling."""
        loop = asyncio.get_running_loop()

        def signal_handler():
            logger.info("Received shutdown signal")
            self.is_running = False
            # Set the shutdown event to wake up any sleeping coroutines immediately
            if self.shutdown_event:
                self.shutdown_event.set()

        # Use asyncio's signal handling for proper event loop integration
        for sig in (signal.SIGINT, signal.SIGTERM):
            loop.add_signal_handler(sig, signal_handler)
    
    async def _service_loop(self):
        """Main service loop that processes the evaluation queue."""
        logger.info("Service loop started")

        while self.is_running:
            try:
                # Check for shutdown signal at start of each iteration
                if self.shutdown_event and self.shutdown_event.is_set():
                    logger.info("Shutdown event detected, exiting service loop")
                    break

                # Check current evaluation count
                active_count = len(self.current_evaluations)
                logger.debug(f"Service loop iteration - current evaluations: {active_count}/4")

                # Clean up completed subprocesses first
                await self._cleanup_completed_subprocesses()

                # Check for shutdown after cleanup
                if not self.is_running:
                    break

                # Try to get next evaluation (queue manager will handle all limits)
                logger.info("Getting next work from queue...")
                queue_item = await get_next_work()

                # Check for shutdown after getting work
                if not self.is_running:
                    break

                if queue_item:
                    company_name = queue_item.attempt.model.company.company_name
                    model_name = queue_item.attempt.model.model_name
                    question_id = queue_item.attempt.question.id
                    attempt_num = queue_item.attempt.attempt_number
                    tier_num = queue_item.attempt.model.tier.tier_number
                    
                    print(f"🚀 STARTING EVALUATION: Queue {queue_item.id} | {model_name} ({company_name}) | "
                          f"Question {question_id} | Tier {tier_num} Attempt {attempt_num}")
                    logger.info(f"Starting subprocess for queue ID {queue_item.id}: Q{question_id} with {model_name} "
                              f"({company_name}) - Tier {tier_num} Attempt {attempt_num}")
                    
                    # Start evaluation subprocess
                    try:
                        process = await self._process_evaluation(queue_item)
                        
                        # Start reading output in background
                        output_task = asyncio.create_task(self._read_subprocess_output(process, queue_item.id))
                        
                        self.current_evaluations[queue_item.id] = {
                            'process': process,
                            'queue_item': queue_item,
                            'started_at': timezone.now(),
                            'output_task': output_task,
                        }
                        
                        # Update PID in execution tracker
                        await queue_manager.update_subprocess_pid(queue_item, process.pid)
                        
                        print(f"   └─ Subprocess PID: {process.pid} | Running: {len(self.current_evaluations)}/4")
                        logger.info(f"Started subprocess {process.pid} for queue ID {queue_item.id}")
                    except Exception as e:
                        print(f"❌ FAILED TO START: Queue {queue_item.id} | Error: {str(e)}")
                        logger.error(f"Failed to start subprocess for queue ID {queue_item.id}: {str(e)}")
                        # Release the tracker since we couldn't start the subprocess
                        await queue_manager.release_execution_tracker(queue_item)
                else:
                    # No work available or at capacity, wait before polling again
                    # Use shutdown_event.wait() with timeout so we can wake up immediately on shutdown
                    logger.debug("No available work or at capacity, sleeping...")
                    try:
                        await asyncio.wait_for(self.shutdown_event.wait(), timeout=self.poll_interval)
                        # If we get here, shutdown was requested
                        break
                    except asyncio.TimeoutError:
                        # Normal timeout, continue polling
                        pass
                
            except Exception as e:
                logger.error(f"Error in service loop: {str(e)}")
                # Sleep before retrying, but wake up immediately on shutdown
                try:
                    await asyncio.wait_for(self.shutdown_event.wait(), timeout=self.poll_interval)
                    break  # Shutdown requested
                except asyncio.TimeoutError:
                    pass
    
    async def _process_evaluation(self, queue_item: EvaluationQueue) -> subprocess.Popen:
        """
        Start a subprocess to process a single evaluation from the queue.
        
        Args:
            queue_item: Queue item to process
            
        Returns:
            subprocess.Popen: The spawned subprocess
        """
        evaluation_id = queue_item.id
        company_name = queue_item.attempt.model.company.company_name
        
        logger.info(f"Starting subprocess for evaluation {evaluation_id}: "
                   f"Q{queue_item.attempt.question.id} with {queue_item.attempt.model.model_name} "
                   f"(Company: {company_name})")
        
        # Build command to run subprocess
        import sys
        cmd = [
            sys.executable,  # Use same Python interpreter
            'manage.py',
            'run_single_evaluation',
            '--queue-id', str(evaluation_id),
        ]
        
        # Prepare environment variables for subprocess
        subprocess_env = os.environ.copy()
        
        # Get API keys from database and ensure they're available to subprocess
        try:
            from asgiref.sync import sync_to_async
            get_api_keys_async = sync_to_async(self._get_api_keys_from_database)
            api_keys = await get_api_keys_async()
            for key_name in api_keys:
                if key_name in os.environ:
                    subprocess_env[key_name] = os.environ[key_name]
        except Exception as e:
            logger.warning(f"Could not load API keys from database: {e}")
        
        # Start subprocess in same directory
        try:
            # Use asyncio subprocess for better output handling
            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
                env=subprocess_env,
                cwd=os.path.dirname(os.path.dirname(__file__)),  # web/ directory
                preexec_fn=os.setsid  # Create new process group for proper cleanup
            )
            
            logger.info(f"Started subprocess {process.pid} for evaluation {evaluation_id}")
            return process
            
        except Exception as e:
            logger.error(f"Failed to start subprocess for evaluation {evaluation_id}: {str(e)}")
            # Mark evaluation as failed immediately using sync method
            try:
                # Use sync version since we're not in async context
                import django
                from asgiref.sync import sync_to_async
                complete_work_sync = sync_to_async(queue_manager.complete_evaluation)
                asyncio.create_task(complete_work_sync(queue_item, False, f"Subprocess start error: {str(e)}"))
            except Exception as db_error:
                logger.error(f"Error marking evaluation {evaluation_id} as failed: {str(db_error)}")
            raise
    
    def _get_api_keys_from_database(self) -> List[str]:
        """Get list of API key environment variable names from database."""
        from .models import Company
        
        api_keys = []
        for company in Company.objects.all():
            if company.api_key:
                api_keys.append(company.api_key)
        
        return api_keys
    
    def _get_model_key(self, model) -> str:
        """
        Get the model key for the evaluator based on the model ID.

        Args:
            model: Model object

        Returns:
            Model key for the evaluator (uses unique model ID)
        """
        # Use model ID as the unique key (model_name is no longer unique with CLI variants)
        return str(model.id)
    
    def _save_react_agent_result(self, result: Dict[str, Any], queue_item: EvaluationQueue):
        """
        Process and save react agent evaluation results to database.
        
        Args:
            result: Inspect AI evaluation result dictionary
            queue_item: Queue item being processed
            
        Returns:
            ModelAnswer: The main model answer record created
        """
        try:
            logger.info("Processing evaluation result...")
            
            # Extract basic evaluation info
            question_id = queue_item.question.id
            evaluation_success = result.get('success', True)  # Default to True for completed evaluations
            
            # Get the ModelAttempt linked to this queue item
            model_attempt = queue_item.attempt
            logger.info(f"Using ModelAttempt {model_attempt.id} for queue item {queue_item.id}")
            
            # Check if answer already exists (may have been saved by custom_agent_continue)
            existing_answer = ModelAnswer.objects.filter(
                attempt=model_attempt,
                question_id=question_id,
                model=queue_item.model
            ).first()
            
            if existing_answer:
                # Answer was already saved by custom_agent_continue during evaluation
                logger.info(f"ModelAnswer already saved by evaluator for attempt {model_attempt.id} (ID: {existing_answer.id})")
                model_answer = existing_answer
                main_answer = existing_answer.answer
            else:
                # Check if this was a timeout - if so, use a specific message
                error_msg = result.get('error', '')
                logger.info(f"DEBUG: Checking for timeout. error_msg = '{error_msg}'")
                logger.info(f"DEBUG: Full result keys: {result.keys()}")
                logger.info(f"DEBUG: Result success: {result.get('success')}")
                
                if 'time limit' in error_msg.lower() or 'timeout' in error_msg.lower():
                    main_answer = f"Evaluation timed out after {86400} seconds (24 hours). Model did not submit an answer."
                    logger.info(f"Evaluation timed out - using timeout message as answer")
                else:
                    # Fall back to extracting from logs (for backwards compatibility or if direct saving failed)
                    evaluation_id = f"q{question_id}_m{queue_item.model.model_name.replace('/', '_')}_a{queue_item.attempt_number}"
                    main_answer = self._extract_answer_from_inspect_result(result, evaluation_id, model_attempt, queue_item)
                    
                    logger.info(f"Extracted main answer from logs (length: {len(main_answer) if main_answer else 0})")
                    if main_answer:
                        logger.info(f"Answer preview: {main_answer[:200]}...")
                
                # Create ModelAnswer since it doesn't exist
                model_answer = ModelAnswer.objects.create(
                    attempt=model_attempt,
                    question_id=question_id,
                    model=queue_item.model,
                    answer=main_answer
                )
                logger.info(f"Created ModelAnswer {model_answer.id} for attempt {model_attempt.id}")
            
            # Subquestion answers are already extracted and saved in _extract_answer_from_inspect_result
            
            logger.info("Completed saving main answer and subquestion answers")
            logger.info(f"React agent result processing completed for ModelAttempt {model_attempt.id}")
            return model_answer
            
        except Exception as e:
            logger.error(f"Error saving react agent result: {str(e)}")
            raise
    
    def _extract_answer_from_inspect_result(self, result: Dict[str, Any], evaluation_id: str, model_attempt=None, queue_item=None) -> str:
        """
        Extract the main question answer from the evaluation result's conversation log.
        Also extracts and saves subquestion answers if model_attempt and queue_item are provided.
        
        Args:
            result: Inspect AI evaluation result dictionary containing conversation_log
            evaluation_id: Evaluation ID (kept for compatibility)
            model_attempt: ModelAttempt instance (optional, for subquestion saving)
            queue_item: EvaluationQueue item (optional, for subquestion saving)
            
        Returns:
            Main question answer string extracted from the log
        """
        try:
            logger.info("Extracting main question answer from conversation log...")
            import re
            
            # Extract answer from conversation log in result
            conversation_log = result.get('conversation_log')
            
            if not conversation_log:
                logger.warning("No conversation log found in result")
                return "No conversation log available"
            
            # Convert conversation log to string for pattern matching
            log_text = ""
            if isinstance(conversation_log, str):
                log_text = conversation_log
            elif hasattr(conversation_log, 'samples') and conversation_log.samples:
                # Extract from Inspect evaluation log structure
                for sample in conversation_log.samples:
                    if hasattr(sample, 'messages'):
                        for message in sample.messages:
                            if hasattr(message, 'content'):
                                # Don't use str() which escapes newlines - get content directly
                                content = message.content if isinstance(message.content, str) else str(message.content)
                                log_text += content + "\n"
            else:
                # Try to convert to string - but this might escape newlines
                log_text = conversation_log if isinstance(conversation_log, str) else str(conversation_log)
            
            # Try to extract answer for each possible stage
            # We'll look for answers with unique delimiters for each stage
            from .config.prompt_config import get_prompt_config
            prompt_config = get_prompt_config()
            
            # First try the new delimiter format for all possible stages
            stages_to_check = ['main_question', 'subquestion_a', 'subquestion_b', 'subquestion_c', 'subquestion_d']
            
            for stage in stages_to_check:
                answer_start, answer_end = prompt_config.get_answer_delimiters(stage)
                
                # Build pattern to extract answer between delimiters
                pattern = re.escape(answer_start) + r'(.*?)' + re.escape(answer_end)
                match = re.search(pattern, log_text, re.DOTALL)
                
                if match:
                    # Strip whitespace including newlines from both ends
                    main_answer = match.group(1).strip()
                    logger.info(f"Successfully extracted answer for stage '{stage}' using new delimiters")
                    logger.info(f"Answer length: {len(main_answer)} characters")
                    logger.info(f"Answer preview: {main_answer[:200]}...")
                    
                    # For main question, we found what we need
                    if stage == 'main_question':
                        # Store the main answer to return at the end
                        main_question_answer = main_answer
                    else:
                        # This is a subquestion answer - save it if we have the context
                        if model_attempt and queue_item:
                            self._save_subquestion_answer(stage, main_answer, model_attempt, queue_item)
                        
            # Return the main question answer if we found it
            if 'main_question_answer' in locals():
                return main_question_answer
            
            # Fallback to old pattern for backward compatibility
            # Pattern: "Answer recorded: **{answer}**"
            # Updated to handle answers with markdown formatting
            pattern = r'Answer recorded:\s*\*\*(.*?)\*\*(?=\s*(?:\n|---|\*\*|🎉))'
            match = re.search(pattern, log_text, re.DOTALL)
            
            if match:
                main_answer = match.group(1).strip()
                logger.info(f"Successfully extracted answer from conversation log (legacy pattern)")
                logger.info(f"Answer length: {len(main_answer)} characters")
                logger.info(f"Answer preview: {main_answer[:200]}...")
                return main_answer
            else:
                # Try alternative patterns
                alt_patterns = [
                    r"Final answer:\s*(.+?)(?:\n|$)",
                    r"submit\(['\"](.+?)['\"]\)",
                    r"Result:\s*(.+?)(?:\n|$)"
                ]
                
                for alt_pattern in alt_patterns:
                    match = re.search(alt_pattern, log_text, re.IGNORECASE | re.DOTALL)
                    if match:
                        main_answer = match.group(1).strip()
                        logger.info(f"Extracted answer using alternative pattern: {alt_pattern}")
                        logger.info(f"Answer preview: {main_answer[:200]}...")
                        return main_answer
                
                logger.warning("No answer pattern found in conversation log")
                logger.debug(f"Log text sample: {log_text[:500]}...")
                return "No answer found in conversation log"
            
        except Exception as e:
            logger.error(f"Error extracting main question answer: {str(e)}")
            return f"Error extracting main question answer: {str(e)}"
    
    
    def _grade_subquestion_answer(self, subquestion, model_answer):
        """Grade a subquestion answer based on evaluation type."""
        try:
            expected_answer = subquestion.answer
            evaluation_type = subquestion.evaluation_type.evaluation_type
            
            logger.info(f"Grading: expected='{expected_answer}', model='{model_answer}', type='{evaluation_type}'")
            
            if not expected_answer or not model_answer:
                logger.warning(f"Missing expected ({expected_answer}) or model ({model_answer}) answer")
                return False
                
            # Clean both answers for comparison (strip whitespace, normalize case)
            expected_clean = str(expected_answer).strip()
            model_clean = str(model_answer).strip()
            
            if evaluation_type == 'exact_match':
                # Case-insensitive exact match with boolean equivalence
                expected_lower = expected_clean.lower()
                model_lower = model_clean.lower()
                
                # Handle boolean equivalences
                def normalize_boolean(answer):
                    """Normalize boolean-like answers to canonical forms."""
                    # Positive boolean values
                    if answer in ['true', 'yes', '1', 'correct', 'right']:
                        return 'true'
                    # Negative boolean values  
                    elif answer in ['false', 'no', '0', 'incorrect', 'wrong']:
                        return 'false'
                    # Return original if not a boolean
                    return answer
                
                expected_normalized = normalize_boolean(expected_lower)
                model_normalized = normalize_boolean(model_lower)
                
                is_correct = expected_normalized == model_normalized
                logger.info(f"Exact match comparison: '{expected_clean}' vs '{model_clean}' -> normalized: '{expected_normalized}' vs '{model_normalized}' -> {is_correct}")
                return is_correct
            
            # Add support for other evaluation types in the future
            elif evaluation_type == 'numeric_tolerance':
                try:
                    expected_num = float(expected_clean)
                    model_num = float(model_clean)
                    tolerance = 1e-6  # Default tolerance
                    is_correct = abs(expected_num - model_num) < tolerance
                    logger.info(f"Numeric tolerance comparison: {expected_num} vs {model_num} (tol={tolerance}) -> {is_correct}")
                    return is_correct
                except ValueError:
                    logger.warning(f"Could not convert to numbers for numeric_tolerance: '{expected_clean}' or '{model_clean}'")
                    return False
            
            else:
                logger.warning(f"Unsupported evaluation type: {evaluation_type}, defaulting to exact match")
                is_correct = expected_clean.lower() == model_clean.lower()
                return is_correct
                
        except Exception as e:
            logger.error(f"Error grading subquestion answer: {str(e)}")
            logger.error(f"Traceback: {traceback.format_exc()}")
            return False  # Default to incorrect if grading fails
    
    def _save_subquestion_answer(self, stage: str, answer_text: str, model_attempt, queue_item):
        """Save a subquestion answer to the database with automatic grading."""
        try:
            from questions.models import Subquestion
            from .models import ModelSubquestionAnswer
            
            # Extract subquestion order from stage (e.g., 'subquestion_a' -> 'a')
            if not stage.startswith('subquestion_'):
                logger.warning(f"Invalid stage format: {stage}")
                return
                
            subquestion_order = stage.replace('subquestion_', '')
            logger.info(f"Saving answer for subquestion {subquestion_order}")
            
            # Find the corresponding subquestion
            try:
                subquestion = Subquestion.objects.get(
                    question_id=model_attempt.question_id,
                    subquestion_order=subquestion_order
                )
            except Subquestion.DoesNotExist:
                logger.warning(f"Subquestion {subquestion_order} not found for question {model_attempt.question_id}")
                return
            
            # Grade the answer automatically
            is_correct = self._grade_subquestion_answer(subquestion, answer_text)
            
            # Save to database
            subq_answer, created = ModelSubquestionAnswer.objects.get_or_create(
                attempt=model_attempt,
                subquestion=subquestion,
                defaults={
                    'model': queue_item.model,
                    'answer': answer_text,
                    'is_correct': is_correct
                }
            )
            
            if created:
                logger.info(f"Created ModelSubquestionAnswer {subq_answer.id} for subquestion {subquestion_order} (correct: {is_correct})")
            else:
                # Update grading if this was re-extracted
                if subq_answer.is_correct != is_correct:
                    subq_answer.is_correct = is_correct
                    subq_answer.save()
                    logger.info(f"Updated grading for existing ModelSubquestionAnswer {subq_answer.id} (correct: {is_correct})")
                else:
                    logger.info(f"ModelSubquestionAnswer already exists for subquestion {subquestion_order} with same grading")
                    
        except Exception as e:
            logger.error(f"Error saving subquestion answer for {stage}: {str(e)}")
            logger.error(f"Traceback: {traceback.format_exc()}")
    
    def _extract_and_save_subquestion_answers(self, model_attempt):
        """Extract subquestion answers from answer.pkl and save them to database."""
        try:
            import pickle
            import os
            from questions.models import Subquestion
            from .models import ModelSubquestionAnswer
            
            if not os.path.exists('answer.pkl'):
                logger.info("No answer.pkl file found, skipping subquestion extraction")
                return
            
            # Load the submitted answer data
            with open('answer.pkl', 'rb') as f:
                submitted_data = pickle.load(f)
            
            if not isinstance(submitted_data, dict):
                logger.info("Answer data is not a dictionary, skipping subquestion extraction")
                return
            
            # Get subquestions for this question
            subquestions = Subquestion.objects.filter(
                question_id=model_attempt.question_id
            ).order_by('subquestion_order')
            
            logger.info(f"Found {subquestions.count()} subquestions for question {model_attempt.question_id}")
            
            # Extract and save each subquestion answer
            for subquestion in subquestions:
                subq_key = subquestion.subquestion_order.upper()  # 'A', 'B', 'C', 'D'
                
                if subq_key in submitted_data:
                    subq_answer_data = submitted_data[subq_key]
                    
                    # Handle tuple format: (boolean, explanation)
                    if isinstance(subq_answer_data, tuple) and len(subq_answer_data) >= 2:
                        answer_text = f"{subq_answer_data[0]}: {subq_answer_data[1]}"
                        is_correct = subq_answer_data[0] if isinstance(subq_answer_data[0], bool) else None
                    else:
                        answer_text = str(subq_answer_data)
                        is_correct = None
                    
                    # Check if subquestion answer already exists for this attempt
                    existing_subq_answer = ModelSubquestionAnswer.objects.filter(
                        attempt=model_attempt,
                        subquestion=subquestion
                    ).first()
                    
                    if existing_subq_answer:
                        logger.info(f"Subquestion answer already exists for {subq_key}, skipping")
                        continue
                    
                    # Create the subquestion answer
                    ModelSubquestionAnswer.objects.create(
                        attempt=model_attempt,
                        subquestion=subquestion,
                        model=model_attempt.model,
                        answer=answer_text,
                        is_correct=is_correct
                    )
                    
                    logger.info(f"Saved subquestion {subq_key} answer: {answer_text[:50]}...")
                else:
                    logger.info(f"No answer found for subquestion {subq_key}")
            
        except Exception as e:
            logger.error(f"Error extracting subquestion answers: {str(e)}")
    
    def _extract_answer_from_message_content(self, content: str) -> str:
        """Extract answer from message content that contains submit tool calls."""
        try:
            # Look for patterns like submit("answer here") or similar
            import re
            
            # Pattern 1: submit("answer")
            match = re.search(r'submit\(["\']([^"\']+)["\']', content)
            if match:
                return match.group(1)
            
            # Pattern 2: Look for the actual answer text in the message
            lines = content.split('\n')
            for line in lines:
                line = line.strip()
                if line and not line.startswith('submit') and len(line) > 10:
                    return line
            
            # Fallback: return the content as-is
            return content.strip()
            
        except Exception as e:
            logger.error(f"Error extracting answer from message content: {str(e)}")
            return content

    def _save_evaluation_result(self, queue_item: EvaluationQueue, result: Dict[str, Any]):
        """
        Save evaluation result to database with terminal log storage.
        
        Args:
            queue_item: Queue item being processed
            result: Evaluation result dictionary
        """
        try:
            logger.info("Processing evaluation result for terminal log storage...")
            
            # Store terminal log if available - extract from conversation_log
            terminal_log_hash = None
            conversation_log = result.get('conversation_log')
            if conversation_log:
                try:
                    # Extract terminal output from conversation log
                    terminal_output = self._extract_terminal_output_from_conversation(conversation_log)
                    logger.info(f"Extracted terminal output from conversation, length: {len(terminal_output)}")
                    
                    if terminal_output:
                        # Store log with metadata
                        evaluation_time = None
                        start_time = result.get('start_time')
                        if start_time and hasattr(start_time, 'isoformat'):
                            evaluation_time = start_time.isoformat()
                        elif start_time:
                            evaluation_time = str(start_time)
                        
                        # Now we can access Django ORM directly since method is synchronous
                        log_metadata = {
                            'evaluation_id': queue_item.id,
                            'model_name': queue_item.model.model_name,
                            'question_id': queue_item.question.id,
                            'attempt_number': queue_item.attempt_number,
                            'company': queue_item.model.company.company_name,
                            'tier': queue_item.model.tier.tier_number,
                            'evaluation_time': evaluation_time,
                        }
                        
                        logger.info("Storing terminal log...")
                        terminal_log_hash = terminal_log_manager.store_terminal_log(
                            terminal_output, log_metadata
                        )
                        logger.info(f"Terminal log stored with hash: {terminal_log_hash}")
                except Exception as terminal_error:
                    logger.error(f"Error processing terminal log: {str(terminal_error)}")
                    # Don't fail the entire save operation for terminal log issues
                    terminal_log_hash = None
            
            # Save to database using the react agent result processor
            logger.info("Processing react agent evaluation result...")
            model_answer = self._save_react_agent_result(
                result, queue_item
            )
            logger.info(f"React agent result processing completed: {model_answer}")
            
            # Update terminal log hash if we have one
            if model_answer and terminal_log_hash:
                model_answer.terminal_log_hash = terminal_log_hash
                model_answer.save()
            
        except Exception as e:
            logger.error(f"Error saving evaluation result: {str(e)}")
    
    def _extract_terminal_output(self, messages: List[Dict[str, Any]]) -> str:
        """
        Extract terminal output from evaluation messages.
        
        Args:
            messages: List of message dictionaries
            
        Returns:
            Combined terminal output string
        """
        terminal_lines = []
        
        try:
            for i, message in enumerate(messages):
                logger.debug(f"Processing message {i}: type={type(message)}")
                
                # Handle case where message might not be a dict
                if not isinstance(message, dict):
                    logger.warning(f"Message {i} is not a dict: {type(message)}")
                    terminal_lines.append(f"[MESSAGE {i}]")
                    terminal_lines.append(str(message))
                    terminal_lines.append("-" * 50)
                    continue
                
                role = message.get('role', '')
                content = message.get('content', '')
                
                logger.debug(f"Message {i}: role={role}, content type={type(content)}")
                
                # Handle case where content might be a list or other complex type
                if isinstance(content, list):
                    # Handle list of strings or complex objects
                    content_parts = []
                    for item in content:
                        if isinstance(item, dict):
                            # Handle structured content like tool calls
                            content_parts.append(str(item))
                        else:
                            content_parts.append(str(item))
                    content = '\n'.join(content_parts)
                elif isinstance(content, dict):
                    # Handle structured content
                    content = str(content)
                elif not isinstance(content, str):
                    content = str(content)
                
                # Add role headers for clarity
                if role and content:
                    terminal_lines.append(f"[{role.upper()}]")
                    terminal_lines.append(content)
                    terminal_lines.append("-" * 50)
                    
        except Exception as e:
            logger.error(f"Error in _extract_terminal_output: {str(e)}")
            terminal_lines.append(f"[ERROR EXTRACTING TERMINAL OUTPUT: {str(e)}]")
        
        result = "\n".join(terminal_lines)
        logger.debug(f"Terminal output extraction complete, length: {len(result)}")
        return result
    
    def _extract_terminal_output_from_conversation(self, conversation_log) -> str:
        """
        Extract terminal output from react agent conversation log.
        
        Args:
            conversation_log: Conversation log from react agent evaluation
            
        Returns:
            Combined terminal output string
        """
        terminal_lines = []
        
        try:
            # Handle case where conversation_log might be a string or object
            if isinstance(conversation_log, str):
                # If it's already a string, use it directly
                terminal_lines.append("[CONVERSATION LOG]")
                terminal_lines.append(conversation_log)
                terminal_lines.append("-" * 50)
            elif hasattr(conversation_log, 'samples') and conversation_log.samples:
                # Extract from Inspect evaluation log structure
                for i, sample in enumerate(conversation_log.samples):
                    terminal_lines.append(f"[SAMPLE {i}]")
                    
                    if hasattr(sample, 'messages') and sample.messages:
                        for j, message in enumerate(sample.messages):
                            role = getattr(message, 'role', 'unknown')
                            content = getattr(message, 'content', '')
                            
                            terminal_lines.append(f"[{role.upper()}]")
                            terminal_lines.append(str(content))
                            terminal_lines.append("-" * 30)
                    
                    # Add any output or final state
                    if hasattr(sample, 'output'):
                        terminal_lines.append("[OUTPUT]")
                        terminal_lines.append(str(sample.output))
                        terminal_lines.append("-" * 30)
                    
                    terminal_lines.append("-" * 50)
            else:
                # Fallback: convert whatever we have to string
                terminal_lines.append("[CONVERSATION LOG - FALLBACK]")
                terminal_lines.append(str(conversation_log))
                terminal_lines.append("-" * 50)
                
        except Exception as e:
            logger.error(f"Error in _extract_terminal_output_from_conversation: {str(e)}")
            terminal_lines.append(f"[ERROR EXTRACTING CONVERSATION LOG: {str(e)}]")
        
        result = "\n".join(terminal_lines)
        logger.debug(f"Conversation terminal output extraction complete, length: {len(result)}")
        return result
    
    
    async def _read_subprocess_output(self, process, eval_id):
        """Read and display subprocess output asynchronously."""
        import asyncio
        import sys

        async def read_stream(stream, prefix):
            """Read from a stream line by line."""
            while True:
                line = await stream.readline()
                if not line:
                    break
                # Output is captured but not printed to reduce noise

        # Create async subprocess to read output
        await asyncio.gather(
            read_stream(process.stdout, "STDOUT"),
            read_stream(process.stderr, "STDERR"),
            return_exceptions=True
        )
    
    async def _cleanup_completed_subprocesses(self):
        """Remove completed subprocess evaluations from tracking."""
        completed_ids = []
        
        for eval_id, eval_info in self.current_evaluations.items():
            process = eval_info['process']
            
            # Check if subprocess has completed
            return_code = process.returncode
            if return_code is not None:
                completed_ids.append(eval_id)
                
                # Cancel output task if still running
                if 'output_task' in eval_info:
                    eval_info['output_task'].cancel()
                
                # Log subprocess completion and handle failures
                queue_item = eval_info['queue_item']
                
                company_name = queue_item.attempt.model.company.company_name
                model_name = queue_item.attempt.model.model_name
                question_id = queue_item.attempt.question.id
                duration = (timezone.now() - eval_info['started_at']).total_seconds()
                
                if return_code == 0:
                    print(f"✅ COMPLETED: Queue {eval_id} | {model_name} ({company_name}) | Question {question_id} | Duration: {duration:.1f}s")
                    logger.info(f"Subprocess completed successfully for evaluation {eval_id} in {duration:.1f}s")
                else:
                    # Error details should have been printed by output reader
                    error_details = f"Subprocess failed with return code {return_code}"
                    
                    print(f"❌ FAILED: Queue {eval_id} | {model_name} ({company_name}) | Question {question_id} | Duration: {duration:.1f}s")
                    print(f"   └─ Error: {error_details}")
                    logger.error(f"Subprocess failed for evaluation {eval_id} with return code {return_code} after {duration:.1f}s")
                    
                    # Mark evaluation as failed in database
                    # The subprocess should have done this, but ensure it's marked as failed
                    try:
                        asyncio.create_task(
                            complete_work(queue_item, False, f"Subprocess failed: {error_details}")
                        )
                    except Exception as db_error:
                        logger.error(f"Error marking evaluation {eval_id} as failed: {str(db_error)}")
        
        # Remove completed subprocesses from tracking
        for eval_id in completed_ids:
            if eval_id in self.current_evaluations:
                del self.current_evaluations[eval_id]
    
    async def _wait_for_subprocesses(self):
        """Wait for all current subprocess evaluations to complete."""
        if not self.current_evaluations:
            return
        
        logger.info(f"Waiting for {len(self.current_evaluations)} subprocesses to complete...")
        
        # Wait for all subprocesses with timeout
        max_wait_time = 60  # 1 minute grace period
        start_time = time.time()
        
        while self.current_evaluations and (time.time() - start_time) < max_wait_time:
            await self._cleanup_completed_subprocesses()
            if self.current_evaluations:
                await asyncio.sleep(1)  # Check every second
        
        # If any subprocesses are still running, they'll be terminated by _cancel_running_subprocesses
    
    async def _cancel_running_subprocesses(self):
        """Terminate all running subprocess evaluations."""
        if not self.current_evaluations:
            return
            
        logger.info(f"Terminating {len(self.current_evaluations)} running subprocesses...")
        
        for eval_id, eval_info in self.current_evaluations.items():
            process = eval_info['process']
            
            try:
                if process.returncode is None:  # Process is still running
                    logger.info(f"Terminating subprocess {process.pid} for evaluation {eval_id}")
                    # Kill the entire process group to clean up child processes (like sage)
                    os.killpg(os.getpgid(process.pid), signal.SIGTERM)
            except Exception as e:
                logger.error(f"Error terminating subprocess for evaluation {eval_id}: {str(e)}")
        
        # Wait a bit for terminations to take effect
        await asyncio.sleep(2)
        
        # Force kill any remaining processes
        for eval_id, eval_info in self.current_evaluations.items():
            process = eval_info['process']
            
            try:
                if process.returncode is None:  # Process is still running
                    logger.warning(f"Force killing subprocess {process.pid} for evaluation {eval_id}")
                    # Kill the entire process group to clean up child processes (like sage)
                    os.killpg(os.getpgid(process.pid), signal.SIGKILL)
            except Exception as e:
                logger.error(f"Error force killing subprocess for evaluation {eval_id}: {str(e)}")

    async def _reset_evaluations_to_pending(self, queue_ids: List[int]):
        """
        Reset terminated evaluations to 'pending' status so they restart after reboot.

        Also resets any other evaluations that were marked as 'running' in the database
        (in case of previous crashes or issues).

        Args:
            queue_ids: List of queue IDs that were actively terminated
        """
        from asgiref.sync import sync_to_async

        @sync_to_async
        def reset_in_database():
            with transaction.atomic():
                # Reset evaluations that we actively terminated
                if queue_ids:
                    terminated_count = EvaluationQueue.objects.filter(
                        id__in=queue_ids
                    ).update(
                        status='pending',
                        started_at=None,
                        error_message='Reset by service shutdown - will restart automatically'
                    )
                    print(f"  Reset {terminated_count} actively running evaluation(s) to pending")
                    logger.info(f"Reset {terminated_count} actively terminated evaluations to pending")

                # Also reset any other evaluations stuck in 'running' state
                # (handles edge cases like previous crashes)
                orphaned_count = EvaluationQueue.objects.filter(
                    status='running'
                ).exclude(
                    id__in=queue_ids if queue_ids else []
                ).update(
                    status='pending',
                    started_at=None,
                    error_message='Reset by service shutdown - orphaned running evaluation'
                )

                if orphaned_count > 0:
                    print(f"  Reset {orphaned_count} orphaned running evaluation(s) to pending")
                    logger.info(f"Reset {orphaned_count} orphaned running evaluations to pending")

                # Count total pending for status message
                total_pending = EvaluationQueue.objects.filter(status='pending').count()
                return total_pending

        try:
            total_pending = await reset_in_database()
            print(f"  Total pending evaluations: {total_pending}")
        except Exception as e:
            logger.error(f"Error resetting evaluations to pending: {str(e)}")
            print(f"  Error resetting evaluations: {str(e)}")

    async def _clear_all_execution_trackers(self):
        """
        Clear all execution trackers from the database.

        This ensures a clean state for when the service restarts.
        Also cleans up legacy company execution locks if they exist.
        """
        from asgiref.sync import sync_to_async

        @sync_to_async
        def clear_trackers():
            from .models import ExecutionTracker, CompanyExecutionLock

            with transaction.atomic():
                # Clear execution trackers
                tracker_count = ExecutionTracker.objects.all().delete()[0]
                print(f"  Cleared {tracker_count} execution tracker(s)")
                logger.info(f"Cleared {tracker_count} execution trackers")

                # Clear legacy company execution locks
                try:
                    lock_count = CompanyExecutionLock.objects.all().delete()[0]
                    if lock_count > 0:
                        print(f"  Cleared {lock_count} company execution lock(s)")
                        logger.info(f"Cleared {lock_count} company execution locks")
                except Exception:
                    pass  # Table might not exist

        try:
            await clear_trackers()
        except Exception as e:
            logger.error(f"Error clearing execution trackers: {str(e)}")
            print(f"  Error clearing trackers: {str(e)}")

    def get_service_status(self) -> Dict[str, Any]:
        """
        Get current service status information.
        
        Returns:
            Service status dictionary
        """
        return {
            'is_running': self.is_running,
            'current_evaluations': len(self.current_evaluations),
            'poll_interval': self.poll_interval,
            'running_evaluations': [
                {
                    'queue_id': eval_id,
                    'model': info['queue_item'].attempt.model.model_name,
                    'question_id': info['queue_item'].attempt.question.id,
                    'company': info['queue_item'].attempt.model.company.company_name,
                    'subprocess_pid': info['process'].pid,
                    'started_at': info['started_at'].isoformat(),
                    'duration_seconds': (
                        timezone.now() - info['started_at']
                    ).total_seconds(),
                }
                for eval_id, info in self.current_evaluations.items()
            ]
        }


class EvaluationRunner:
    """
    Standalone evaluation runner for single evaluations or testing.
    """
    
    def __init__(self):
        self.executor = EvaluationExecutor()
    
    async def run_single_evaluation(
        self,
        model_name: str,
        question_id: int,
        attempt_number: int = 1
    ) -> Dict[str, Any]:
        """
        Run a single evaluation without using the queue.
        
        Args:
            model_name: Name of the model to evaluate
            question_id: ID of the question to evaluate
            attempt_number: Attempt number (1 or 2)
            
        Returns:
            Evaluation result dictionary
        """
        logger.info(f"Running single evaluation: {model_name} on Q{question_id}")
        
        try:
            # Map model name to key
            model_key = self.executor._get_model_key(type('Model', (), {'model_name': model_name})())
            
            # Run evaluation
            result = await self.executor.evaluator.evaluate(
                question_id=question_id,
                model_key=model_key,
                attempt_number=attempt_number
            )
            
            logger.info(f"Single evaluation completed: success={result.get('success', False)}")
            return result
            
        except Exception as e:
            logger.error(f"Single evaluation failed: {str(e)}")
            return {
                'success': False,
                'error': str(e),
                'question_id': question_id,
                'model_key': model_name,
                'attempt_number': attempt_number,
            }


# Global executor instance
evaluation_executor = EvaluationExecutor()


# Convenience functions
async def start_evaluation_service():
    """Start the evaluation service."""
    await evaluation_executor.start_service()


async def shutdown_evaluation_service():
    """Shutdown the evaluation service."""
    await evaluation_executor.shutdown_service()


def get_executor_status() -> Dict[str, Any]:
    """Get evaluation service status."""
    return evaluation_executor.get_service_status()