"""
Non-Agentic Model Evaluator for IMProofBench

Conducts ablation testing by evaluating models without agentic scaffolding.
Uses direct API calls with barebones prompting to test baseline model capabilities.

Evaluation Flow:
1. Main question: Send question text, get response
2. For each subquestion: Send follow-up with \boxed{} instruction
3. Extract answers using \boxed{} pattern matching
4. Save results to database

Multi-turn conversation maintains reasoning context across subquestions.
"""

import os
import re
import logging
import asyncio
from typing import Dict, Any, Optional, List, Tuple
from datetime import datetime

from django.utils import timezone
from django.db import transaction
from asgiref.sync import sync_to_async

from .api_clients import create_api_client, APIResponse
from .models import (
    Model, ModelAttempt, ModelAnswer, ModelSubquestionAnswer,
    EvaluationQueue
)
from .answer_utils import check_answer_correctness
from questions.models import Question, Subquestion

logger = logging.getLogger(__name__)


class AnswerExtractor:
    r"""
    Extracts mathematical answers from model responses.

    Looks for the last brace-balanced \boxed{...} LaTeX environment and
    falls back to the full response when none is present.
    """

    # Same logger object as the module-level ``logger``: both are
    # logging.getLogger(__name__) for this module.
    _log = logging.getLogger(__name__)

    # Literal opener searched for in the response text.
    _BOXED_OPEN = '\\boxed{'

    @staticmethod
    def _find_boxed_contents(text: str) -> List[str]:
        r"""
        Collect the contents of every brace-balanced \boxed{...} in text.

        Braces are matched by depth counting, so arbitrarily nested groups
        such as \boxed{\frac{1}{\sqrt{2}}} are supported. Occurrences are
        returned left to right and do not overlap. An opener whose braces
        never balance is skipped, but later openers are still considered.

        Args:
            text: Model response text

        Returns:
            List of raw (unstripped) contents, in order of appearance
        """
        opener = AnswerExtractor._BOXED_OPEN
        contents: List[str] = []
        search_from = 0

        while True:
            start = text.find(opener, search_from)
            if start == -1:
                return contents

            body_start = start + len(opener)
            depth = 1  # we are just past the opening brace of \boxed{
            end = None
            for pos in range(body_start, len(text)):
                char = text[pos]
                if char == '{':
                    depth += 1
                elif char == '}':
                    depth -= 1
                    if depth == 0:
                        end = pos
                        break

            if end is None:
                # Unbalanced opener: ignore it but keep scanning what follows.
                search_from = body_start
            else:
                contents.append(text[body_start:end])
                search_from = end + 1

    @staticmethod
    def extract_boxed_answer(text: str) -> Tuple[Optional[str], bool]:
        r"""
        Extract answer from \boxed{} environment.

        Args:
            text: Model response text

        Returns:
            Tuple of (extracted_answer, parsing_failed)
            - If \boxed{} found: (stripped content of the last one, False)
            - If not found: (stripped full text, True)
        """
        matches = AnswerExtractor._find_boxed_contents(text)

        if matches:
            # Use last \boxed{} if multiple found: models usually restate
            # their final answer at the end of the response.
            answer = matches[-1].strip()
            AnswerExtractor._log.info(f"Extracted answer from \\boxed{{}}: {answer}")
            return answer, False

        # No \boxed{} found - use full response
        AnswerExtractor._log.warning(f"No \\boxed{{}} found, using full response")
        return text.strip(), True

    @staticmethod
    def compare_answers(model_answer: str, correct_answer: str) -> bool:
        """
        Compare model answer with correct answer.

        Delegates to check_answer_correctness from answer_utils, the shared
        normalization logic, which is documented to handle:
        - Case-insensitive comparison
        - Boolean equivalences (yes/no/true/false)
        - Whitespace and formatting normalization

        Args:
            model_answer: Model's answer
            correct_answer: Expected correct answer

        Returns:
            True if answers match after normalization
        """
        return check_answer_correctness(model_answer, correct_answer)


class NonAgenticEvaluator:
    """
    Evaluator for non-agentic model testing.

    Uses direct API calls without tools or agentic scaffolding.
    Maintains conversation context for multi-turn subquestion answering:
    the context returned by each successful API call is passed to the next.
    """

    # Prompts for evaluation
    MAIN_QUESTION_PROMPT = "{question_text}"

    SUBQUESTION_INTRO = """Thanks! Below I'll ask some follow-up questions, where answers will be parsed automatically. They might repeat the main question or ask for results in special cases. Please provide the final answer in a \\boxed{{}} environment for easier extraction. This desired answer will typically be short, either \\boxed{{Yes}} or \\boxed{{No}} or a numerical result like \\boxed{{172}} or \\boxed{{-13/45}}. Do not use additional (LaTeX) formatting within the \\boxed environment.

Subquestion 1:
{subquestion_text}"""

    SUBQUESTION_FOLLOWUP = """Thanks!
Subquestion {number}:
{subquestion_text}"""

    def __init__(self, model_attempt_id: int):
        """
        Initialize evaluator for a specific model attempt.

        Performs synchronous ORM access, so it must not be called directly
        from a running event loop.

        Args:
            model_attempt_id: ID of ModelAttempt to evaluate

        Raises:
            ValueError: If the environment variable named by the company's
                api_key field is unset or empty
        """
        self.attempt = ModelAttempt.objects.select_related(
            'model__company', 'model__tier', 'question'
        ).get(id=model_attempt_id)

        self.model = self.attempt.model
        self.question = self.attempt.question

        # Get API configuration: company.api_key holds the NAME of the
        # environment variable, not the secret itself.
        self.api_key = os.getenv(self.model.company.api_key)
        if not self.api_key:
            raise ValueError(f"API key not found: {self.model.company.api_key}")

        # Create API client
        self.client = create_api_client(
            company_name=self.model.company.company_name,
            model_name=self.model.model_name,
            api_key=self.api_key,
            model_args=self.model.get_model_args(),
            reasoning_args=self.model.get_reasoning_args()
        )

        self.extractor = AnswerExtractor()

    async def run_evaluation(self) -> Dict[str, Any]:
        """
        Run complete evaluation: main question + all subquestions.

        Never raises for ordinary errors; failures are reported in the
        returned dict instead.

        Returns:
            On success: dict with 'success' True, 'main_answer_length',
            'subquestions_evaluated' and 'subquestions_correct'.
            On failure: dict with 'success' False, 'error' and 'stage'
            ('main_question', 'subquestions' or 'save_results').
        """
        logger.info(f"Starting non-agentic evaluation: {self.model.display_name} on Q{self.question.id}")

        # Tracks the step in progress so an unexpected exception can be
        # attributed to it in the returned error dict.
        stage = 'main_question'

        try:
            # Step 1: Evaluate main question
            main_response = await self._evaluate_main_question()

            if main_response.error:
                return {
                    'success': False,
                    'error': main_response.error,
                    'stage': 'main_question'
                }

            # Step 2: Evaluate subquestions
            stage = 'subquestions'
            subquestion_results = await self._evaluate_subquestions(main_response.context)

            # Step 3: Save results to database
            stage = 'save_results'
            await self._save_results(main_response, subquestion_results)

            logger.info(f"Evaluation completed successfully for attempt {self.attempt.id}")

            return {
                'success': True,
                'main_answer_length': len(main_response.content),
                'subquestions_evaluated': len(subquestion_results),
                'subquestions_correct': sum(1 for r in subquestion_results if r['is_correct']),
            }

        except Exception as e:
            # logger.exception logs at ERROR level with the traceback attached.
            logger.exception(f"Evaluation failed: {str(e)}")
            return {
                'success': False,
                'error': str(e),
                'stage': stage
            }

    async def _evaluate_main_question(self) -> "APIResponse":
        """
        Evaluate main question.

        Sends the question text as the first message of the conversation.
        An API-level error is logged here and left on the response for the
        caller to inspect.

        Returns:
            APIResponse with main question answer and context
        """
        prompt = self.MAIN_QUESTION_PROMPT.format(
            question_text=self.question.text
        )

        logger.info(f"Sending main question (length: {len(prompt)} chars)")
        response = await self.client.send_message(prompt)

        if response.error:
            logger.error(f"Main question failed: {response.error}")
        else:
            logger.info(f"Main question response received (length: {len(response.content)} chars)")

        return response

    def _build_subquestion_prompt(self, number: int, subquestion_text: str, instructions_sent: bool) -> str:
        """
        Build the prompt for one subquestion turn.

        Args:
            number: 1-based position of the subquestion
            subquestion_text: Text of the subquestion
            instructions_sent: True once an earlier subquestion turn carrying
                the answer-format instructions has succeeded

        Returns:
            The short follow-up prompt when the instructions are already part
            of the conversation, otherwise the full intro prompt labelled
            with this subquestion's number
        """
        if instructions_sent:
            return self.SUBQUESTION_FOLLOWUP.format(
                number=number,
                subquestion_text=subquestion_text
            )

        # SUBQUESTION_INTRO hard-codes the label "Subquestion 1:". When an
        # earlier turn failed, the intro is re-sent for a later subquestion,
        # so relabel it. This is a no-op for number == 1.
        template = self.SUBQUESTION_INTRO.replace(
            "Subquestion 1:", f"Subquestion {number}:", 1
        )
        return template.format(subquestion_text=subquestion_text)

    async def _evaluate_subquestions(self, initial_context: Optional[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Evaluate all subquestions in sequence.

        A failed API call is recorded with its error and the loop moves on;
        the conversation context is only advanced by successful turns.

        Args:
            initial_context: Context from main question response

        Returns:
            List of subquestion results, one dict per subquestion with keys
            'subquestion', 'full_response', 'extracted_answer',
            'parsing_failure', 'is_correct' and 'error'
        """
        # Fetch subquestions asynchronously
        subquestions = await sync_to_async(list)(
            Subquestion.objects.filter(
                question=self.question
            ).order_by('subquestion_order')
        )

        results = []
        context = initial_context
        # The \boxed{} instructions only enter the conversation when a turn
        # that carries them succeeds. A failed turn does not advance
        # `context`, so the instructions must be re-sent afterwards.
        instructions_sent = False

        for idx, subquestion in enumerate(subquestions, start=1):
            logger.info(f"Evaluating subquestion {idx}/{len(subquestions)}: {subquestion.subquestion_order}")

            # Format prompt
            prompt = self._build_subquestion_prompt(idx, subquestion.text, instructions_sent)

            # Send message
            response = await self.client.send_message(prompt, context)

            if response.error:
                logger.error(f"Subquestion {idx} failed: {response.error}")
                results.append({
                    'subquestion': subquestion,
                    'full_response': '',
                    'extracted_answer': '',
                    'parsing_failure': True,
                    'is_correct': False,
                    'error': response.error
                })
                continue

            # Every prompt up to and including the first successful one
            # carried the instructions, so they are in the context now.
            instructions_sent = True

            # Extract answer
            extracted_answer, parsing_failed = self.extractor.extract_boxed_answer(response.content)

            # Check correctness
            is_correct = self.extractor.compare_answers(extracted_answer, subquestion.answer)

            logger.info(f"Subquestion {idx}: {'✓ Correct' if is_correct else '✗ Incorrect'} "
                       f"(parsing_failed={parsing_failed})")

            results.append({
                'subquestion': subquestion,
                'full_response': response.content,
                'extracted_answer': extracted_answer,
                'parsing_failure': parsing_failed,
                'is_correct': is_correct,
                'error': None
            })

            # Update context for next turn
            context = response.context

        return results

    def _save_results_sync(self, main_response: "APIResponse", subquestion_results: List[Dict[str, Any]]):
        """
        Save evaluation results to database (synchronous helper).

        All writes happen inside one transaction. Subquestion results that
        carry an error are not written.

        Args:
            main_response: Response from main question
            subquestion_results: List of subquestion results
        """
        with transaction.atomic():
            # Save main question answer
            ModelAnswer.objects.update_or_create(
                attempt=self.attempt,
                defaults={
                    'question': self.question,
                    'model': self.model,
                    'answer': main_response.content,
                    'terminal_log_hash': None,  # Not applicable for non-agentic
                    'ran_out_of_tokens': False,
                }
            )

            # Save subquestion answers
            for result in subquestion_results:
                if result.get('error'):
                    # Skip failed subquestions
                    continue

                ModelSubquestionAnswer.objects.update_or_create(
                    attempt=self.attempt,
                    subquestion=result['subquestion'],
                    defaults={
                        'model': self.model,
                        'answer': result['extracted_answer'],
                        'full_response': result['full_response'],
                        'parsing_failure': result['parsing_failure'],
                        'is_correct': 1 if result['is_correct'] else 0,
                        'ran_out_of_tokens': False,
                    }
                )

    async def _save_results(self, main_response: "APIResponse", subquestion_results: List[Dict[str, Any]]):
        """
        Save evaluation results to database.

        Runs the synchronous ORM helper through sync_to_async so it can be
        awaited from the evaluation coroutine.

        Args:
            main_response: Response from main question
            subquestion_results: List of subquestion results
        """
        logger.info(f"Saving results for attempt {self.attempt.id}")
        await sync_to_async(self._save_results_sync)(main_response, subquestion_results)
        logger.info(f"Results saved successfully for attempt {self.attempt.id}")


@sync_to_async
def _update_queue_status_running(queue_id: int):
    """
    Flag an EvaluationQueue item as running and stamp its start time.

    Wrapped in sync_to_async so coroutines can await the ORM access.

    Args:
        queue_id: ID of the EvaluationQueue item to update
    """
    entry = EvaluationQueue.objects.get(id=queue_id)
    entry.status, entry.started_at = 'running', timezone.now()
    entry.save()


@sync_to_async
def _update_queue_status_completed(queue_id: int, success: bool, error: Optional[str] = None):
    """
    Record the final outcome of an evaluation on its EvaluationQueue item.

    Wrapped in sync_to_async so coroutines can await the ORM access.

    Args:
        queue_id: ID of the EvaluationQueue item to update
        success: Whether the evaluation succeeded
        error: Error text stored when success is False; a missing or empty
            value is stored as 'Unknown error'
    """
    entry = EvaluationQueue.objects.get(id=queue_id)

    entry.status = 'completed' if success else 'failed'
    if not success:
        # The error message is only written on failure.
        entry.error_message = error or 'Unknown error'

    entry.completed_at = timezone.now()
    entry.save()


@sync_to_async
def _create_evaluator(model_attempt_id: int):
    """
    Build a NonAgenticEvaluator for the given attempt.

    The evaluator's constructor performs synchronous ORM queries, so it is
    wrapped in sync_to_async to be awaitable from a coroutine.

    Args:
        model_attempt_id: ID of ModelAttempt to evaluate

    Returns:
        The constructed NonAgenticEvaluator
    """
    evaluator = NonAgenticEvaluator(model_attempt_id)
    return evaluator


async def run_non_agentic_evaluation(model_attempt_id: int, queue_id: Optional[int] = None) -> Dict[str, Any]:
    """
    Entry point for non-agentic evaluation.

    Marks the queue item (if any) as running, builds the evaluator, runs it,
    and records the outcome on the queue item. Ordinary exceptions are
    converted into a failure dict rather than propagated.

    Args:
        model_attempt_id: ID of ModelAttempt to evaluate
        queue_id: Optional ID of EvaluationQueue item (for status updates)

    Returns:
        Dict with evaluation results. On failure it contains 'success' False
        and 'error'.
    """
    try:
        # Update queue status to running
        if queue_id:
            await _update_queue_status_running(queue_id)

        # Run evaluation
        evaluator = await _create_evaluator(model_attempt_id)
        result = await evaluator.run_evaluation()

        # Update queue status
        if queue_id:
            await _update_queue_status_completed(
                queue_id,
                success=result['success'],
                error=result.get('error')
            )

        return result

    except Exception as e:
        logger.error(f"Non-agentic evaluation failed: {str(e)}", exc_info=True)

        # Update queue status to failed
        if queue_id:
            try:
                await _update_queue_status_completed(
                    queue_id,
                    success=False,
                    error=str(e)
                )
            except Exception:
                # Best effort: the original failure is still returned below.
                # Catch only Exception so task cancellation and interpreter
                # shutdown are not swallowed, and log why the queue item
                # could not be updated instead of hiding it.
                logger.exception(f"Could not mark queue item {queue_id} as failed")

        return {
            'success': False,
            'error': str(e)
        }
