"""
Views for model evaluation queue management and grading workflow.

Provides admin interfaces for:
- Evaluation queue status and management
- Individual evaluation detail views
- Bulk operations (cancel, rerun, retry, restart)
- Grading release workflow
- Subquestion answer verification
- MathArena export functionality
"""

from django.shortcuts import render, get_object_or_404, redirect
from django.db.models import Count, Q
from django.contrib import messages
from django.utils import timezone
from django.http import JsonResponse
from django.views.decorators.http import require_POST
from django.conf import settings
import re
import os
import json
import logging
from accounts.permissions import admin_required
from questions.models import Question, QuestionState, QuestionVersion
from .models import (
    Model, ModelTier, EvaluationQueue, ModelAnswer,
    ModelAttempt, ModelSubquestionAnswer, ExecutionTracker
)

logger = logging.getLogger(__name__)


@admin_required
def queue_status(request):
    """Display the evaluation queue organized by active questions.

    For every active question that has at least one queue entry, this view
    builds summary counts, the most recent activity timestamp and the queue
    entries grouped by active model tier. Only the queue entries belonging to
    the newest ``ModelAttempt`` of each (model, question, attempt_number)
    group are shown.

    Query parameters:
        sort: ``'id'`` or ``'completion'``; any other value (or none) sorts by
            last activity, most recent first.

    Renders ``model_evaluation/queue_status.html``.
    """
    from django.db.models import Max, Case, When

    # Get only active questions
    active_questions = Question.objects.filter(
        status__status='active'
    ).select_related('status').order_by('id')

    # Find the latest ModelAttempt.id for each (model, question, attempt_number) group
    latest_attempts = ModelAttempt.objects.filter(
        question__in=active_questions
    ).values(
        'model', 'question', 'attempt_number'
    ).annotate(
        latest_attempt_id=Max('id')
    ).values_list('latest_attempt_id', flat=True)

    # Get EvaluationQueue items that correspond to these latest attempts
    all_evaluations = EvaluationQueue.objects.filter(
        attempt_id__in=latest_attempts
    ).select_related(
        'attempt__model__company',
        'attempt__model__tier',
        'attempt__question'
    )

    # The set of active tiers does not depend on the question, so fetch it
    # once instead of re-querying it on every loop iteration.
    active_tiers = list(
        ModelTier.objects.filter(is_active=True).order_by('tier_number')
    )

    # Build question-centric data structure
    questions_data = []
    for question in active_questions:
        # Get all evaluations for this question
        question_evals = all_evaluations.filter(attempt__question=question)

        if not question_evals.exists():
            continue

        # Calculate summary statistics
        total_evals = question_evals.count()
        completed_evals = question_evals.filter(status='completed').count()

        # Last activity: completion time when present, otherwise submission time
        last_activity = question_evals.aggregate(
            last_activity=Max(Case(
                When(completed_at__isnull=False, then='completed_at'),
                default='submitted_at'
            ))
        )['last_activity']

        # Group evaluations by tier and add release status
        evaluations_by_tier = {}
        for tier in active_tiers:
            # Materialize once; the list doubles as the emptiness check.
            tier_evals = list(
                question_evals.filter(attempt__model__tier=tier).order_by(
                    'attempt__model__company__company_name',
                    'attempt__model__model_name',
                    'attempt__attempt_number'
                )
            )
            if not tier_evals:
                continue

            for eval_item in tier_evals:
                # Only completed evaluations can have a ModelAnswer; use the
                # newest one (largest id) for the attempt.
                model_answer = None
                if eval_item.status == 'completed':
                    model_answer = ModelAnswer.objects.filter(
                        attempt=eval_item.attempt
                    ).order_by('-id').first()

                # Attach release status to the evaluation object for the template
                eval_item.is_released = model_answer.released_for_grading if model_answer else False
                eval_item.model_answer_id = model_answer.id if model_answer else None

            evaluations_by_tier[tier] = tier_evals

        # Check if question was modified after its latest evaluation
        modified_after_eval = QuestionVersion.was_modified_after_evaluation(question)

        questions_data.append({
            'question': question,
            'total_evaluations': total_evals,
            'completed_evaluations': completed_evals,
            'last_activity': last_activity,
            'evaluations_by_tier': evaluations_by_tier,
            'collapse_id': f'collapse_{question.id}',
            'modified_after_evaluation': modified_after_eval,
        })

    # Sort questions by last activity (most recent first), by ID, or by completion ratio
    sort_by = request.GET.get('sort', 'activity')
    if sort_by == 'id':
        questions_data.sort(key=lambda x: x['question'].id)
    elif sort_by == 'completion':
        questions_data.sort(key=lambda x: x['completed_evaluations'] / x['total_evaluations'] if x['total_evaluations'] > 0 else 0, reverse=True)
    else:  # Default to activity
        # Entries without any timestamp are treated as "now" so they sort first.
        questions_data.sort(key=lambda x: x['last_activity'] or timezone.now(), reverse=True)

    # Queue statistics for the header cards (all queue entries of active questions)
    queue_stats = EvaluationQueue.objects.filter(
        attempt__question__status__status='active'
    ).aggregate(
        pending=Count('id', filter=Q(status='pending')),
        running=Count('id', filter=Q(status='running')),
        completed=Count('id', filter=Q(status='completed')),
        failed=Count('id', filter=Q(status='failed'))
    )

    context = {
        'questions_data': questions_data,
        'queue_stats': queue_stats,
        'sort_by': sort_by,
    }

    return render(request, 'model_evaluation/queue_status.html', context)


@admin_required
def evaluation_detail(request, queue_id):
    """View detailed information about a specific evaluation.

    Looks up the ``EvaluationQueue`` entry ``queue_id`` (404 if missing) and,
    for completed evaluations, the newest ``ModelAnswer`` of its attempt plus
    the attempt's subquestion answers. Also determines which re-evaluation
    action the status allows (``'rerun'`` for completed, ``'retry'`` for
    failed, ``'restart'`` for cancelled) and loads the terminal log file when
    the answer references one.

    Renders ``model_evaluation/evaluation_detail.html``.
    """
    queue_item = get_object_or_404(
        EvaluationQueue.objects.select_related(
            'attempt__model__company', 'attempt__model__tier', 'attempt__question'
        ),
        id=queue_id
    )

    # Get the model answer if it exists
    model_answer = None
    subquestion_answers = []
    if queue_item.status == 'completed':
        # ModelAnswer with the largest ID for this attempt; .first() returns
        # None when there is no row, so no exception handling is needed.
        model_answer = ModelAnswer.objects.filter(
            attempt=queue_item.attempt
        ).order_by('-id').first()

        # Get subquestion answers with all related data
        # NOTE(review): relies on EvaluationQueue exposing `model`; other views
        # in this module go through `queue_item.attempt.model` - confirm.
        subquestion_answers = ModelSubquestionAnswer.objects.filter(
            attempt=queue_item.attempt,
            model=queue_item.model
        ).select_related('subquestion').order_by('subquestion__subquestion_order')

    # Map the queue status to the re-evaluation action offered to the admin;
    # statuses not listed here (e.g. pending, running) cannot be re-evaluated.
    reevaluation_types = {
        'completed': 'rerun',    # generates a new ModelAnswer for the same attempt number
        'failed': 'retry',
        'cancelled': 'restart',
    }
    reevaluation_type = reevaluation_types.get(queue_item.status)
    can_reevaluate = reevaluation_type is not None

    # Placeholder: will be replaced with a check against the new grading system
    has_grading = False

    # Load conversation log if available
    conversation_log = None
    if model_answer and model_answer.terminal_log_hash:
        # Use path relative to project root (go up from web/model_evaluation/views.py to root)
        project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        log_path = os.path.join(project_root, 'terminal_logs', f'{model_answer.terminal_log_hash}.log')
        try:
            with open(log_path, 'r', encoding='utf-8') as f:
                conversation_log = f.read()
        except (OSError, UnicodeDecodeError) as exc:
            # Best effort: the page still renders without the log, but leave a
            # trace so missing or corrupted log files can be diagnosed.
            logger.warning("Could not read terminal log %s: %s", log_path, exc)

    # Check if the evaluation has been released for grading
    is_released = model_answer.released_for_grading if model_answer else False

    context = {
        'queue_item': queue_item,
        'model_answer': model_answer,
        'subquestion_answers': subquestion_answers,
        'can_reevaluate': can_reevaluate,
        'reevaluation_type': reevaluation_type,
        'has_grading': has_grading,
        'is_released': is_released,
        'conversation_log': conversation_log,
    }

    return render(request, 'model_evaluation/evaluation_detail.html', context)


@admin_required
def create_reevaluation(request, queue_id):
    """Create a NEW attempt for re-evaluation (never reprocess existing attempts).

    POST only (405 JSON response otherwise). Only completed, failed or
    cancelled evaluations may be re-evaluated; any other status produces an
    error message and a redirect back to the detail page.

    If the evaluation is completed and its newest ``ModelAnswer`` has been
    released for grading, the POST parameter ``confirm_release_override`` must
    be present; the release flag is then cleared before the re-run is queued.

    A new ``ModelAttempt`` (same ``attempt_number``) and a pending
    ``EvaluationQueue`` entry are created. The redirect target is the queue
    page when ``redirect_to=queue`` is posted or the referer contains
    ``queue/``; otherwise the detail page of the new queue entry.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'POST request required'}, status=405)

    queue_item = get_object_or_404(EvaluationQueue, id=queue_id)

    # Reject statuses that cannot be re-evaluated up front, before any state
    # is modified (pending/running entries would otherwise be duplicated).
    if queue_item.status not in ('completed', 'failed', 'cancelled'):
        messages.error(request, f'Cannot re-evaluate evaluation with status "{queue_item.status}". Only completed, failed, or cancelled evaluations can be re-evaluated.')
        return redirect('model_evaluation:evaluation_detail', queue_id=queue_id)

    # Check if this evaluation has been released for grading
    if queue_item.status == 'completed':
        model_answer = ModelAnswer.objects.filter(
            attempt=queue_item.attempt
        ).order_by('-id').first()

        if model_answer and model_answer.released_for_grading:
            # Check if user confirmed the warning
            if not request.POST.get('confirm_release_override'):
                messages.warning(
                    request,
                    'This evaluation has been released for grading. Re-running it will remove it from grading. '
                    'Please confirm if you want to proceed.'
                )
                return redirect('model_evaluation:evaluation_detail', queue_id=queue_id)

            # Mark as no longer released
            model_answer.released_for_grading = False
            model_answer.save()
            messages.info(request, 'Evaluation has been removed from grading release.')

    # Always create a NEW attempt for re-evaluation: attempt records stay
    # immutable and no duplicate ModelAnswer is attached to an old attempt.
    # The attempt_number is kept; only the ModelAttempt id is new.
    original_attempt = queue_item.attempt
    new_attempt = ModelAttempt.objects.create(
        model=original_attempt.model,
        question=original_attempt.question,
        attempt_number=original_attempt.attempt_number,  # Same attempt_number!
        time=timezone.now()
    )

    # Create new EvaluationQueue entry for the new attempt
    new_queue_item = EvaluationQueue.objects.create(
        attempt=new_attempt,
        status='pending',
        submitted_at=timezone.now()
    )

    messages.success(
        request,
        f'Re-run created as new attempt! '
        f'New Queue ID: {new_queue_item.id} '
        f'(Attempt #{new_attempt.attempt_number} for {new_attempt.model.model_name} on Q{new_attempt.question.id})'
    )

    # Redirect based on where the request came from:
    # an explicit redirect parameter wins, then the referer is consulted.
    redirect_to = request.POST.get('redirect_to', '')
    referer = request.META.get('HTTP_REFERER', '')
    if redirect_to == 'queue' or 'queue/' in referer:
        return redirect('model_evaluation:queue_status')

    return redirect('model_evaluation:evaluation_detail', queue_id=new_queue_item.id)


@admin_required
def bulk_cancel(request):
    """Cancel multiple pending or running evaluations.

    POST only (405 JSON response otherwise). Reads queue IDs from the
    ``selected_items`` POST list (400 JSON response when empty). Entries that
    do not exist, have a malformed ID, or are not pending/running are skipped.
    Redirects to the queue status page with a success message.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'POST request required'}, status=405)

    selected_ids = request.POST.getlist('selected_items')
    if not selected_ids:
        return JsonResponse({'error': 'No items selected'}, status=400)

    cancelled_count = 0
    for queue_id in selected_ids:
        try:
            queue_item = EvaluationQueue.objects.get(id=queue_id)
        except (EvaluationQueue.DoesNotExist, ValueError):
            # Unknown or malformed ID: skip it rather than aborting the batch
            # halfway through with some entries already cancelled.
            continue

        if queue_item.status not in ('pending', 'running'):
            continue

        queue_item.status = 'cancelled'
        queue_item.completed_at = timezone.now()
        queue_item.error_message = 'Evaluation cancelled by bulk operation'
        queue_item.save()
        cancelled_count += 1

    messages.success(request, f'Successfully cancelled {cancelled_count} evaluation(s).')
    return redirect('model_evaluation:queue_status')


@admin_required
def bulk_rerun(request):
    """Re-run multiple evaluations (completed, failed, or cancelled) by creating new attempts.

    POST only (405 JSON response otherwise). Reads queue IDs from the
    ``selected_items`` POST list (400 JSON response when empty). For every
    eligible entry a new ``ModelAttempt`` with the same ``attempt_number`` and
    a pending ``EvaluationQueue`` entry are created. Entries that do not
    exist, have a malformed ID, or have another status are skipped.
    Redirects to the queue status page with a success message.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'POST request required'}, status=405)

    selected_ids = request.POST.getlist('selected_items')
    if not selected_ids:
        return JsonResponse({'error': 'No items selected'}, status=400)

    rerun_count = 0
    for queue_id in selected_ids:
        try:
            queue_item = EvaluationQueue.objects.get(id=queue_id)
        except (EvaluationQueue.DoesNotExist, ValueError):
            # Unknown or malformed ID: skip it rather than aborting the batch.
            continue

        # Handle completed, failed, and cancelled statuses
        if queue_item.status not in ('completed', 'failed', 'cancelled'):
            continue

        # Create new ModelAttempt with same attempt_number
        source_attempt = queue_item.attempt
        new_attempt = ModelAttempt.objects.create(
            model=source_attempt.model,
            question=source_attempt.question,
            attempt_number=source_attempt.attempt_number,
            time=timezone.now()
        )

        # Create new EvaluationQueue entry
        EvaluationQueue.objects.create(
            attempt=new_attempt,
            status='pending',
            submitted_at=timezone.now()
        )
        rerun_count += 1

    messages.success(request, f'Successfully queued {rerun_count} evaluation(s) for re-run.')
    return redirect('model_evaluation:queue_status')


@admin_required
def bulk_retry(request):
    """Retry multiple failed evaluations by creating new attempts.

    POST only (405 JSON response otherwise). Reads queue IDs from the
    ``selected_items`` POST list (400 JSON response when empty). For every
    failed entry a new ``ModelAttempt`` with the same ``attempt_number`` and a
    pending ``EvaluationQueue`` entry are created. Entries that do not exist,
    have a malformed ID, or are not failed are skipped.
    Redirects to ``model_evaluation:index`` with a success message.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'POST request required'}, status=405)

    selected_ids = request.POST.getlist('selected_items')
    if not selected_ids:
        return JsonResponse({'error': 'No items selected'}, status=400)

    rerun_count = 0
    for queue_id in selected_ids:
        try:
            queue_item = EvaluationQueue.objects.get(id=queue_id)
        except (EvaluationQueue.DoesNotExist, ValueError):
            # Unknown or malformed ID: skip it rather than aborting the batch.
            continue

        if queue_item.status != 'failed':
            continue

        # Create new ModelAttempt with same attempt_number
        source_attempt = queue_item.attempt
        new_attempt = ModelAttempt.objects.create(
            model=source_attempt.model,
            question=source_attempt.question,
            attempt_number=source_attempt.attempt_number,
            time=timezone.now()
        )

        # Create new EvaluationQueue entry
        EvaluationQueue.objects.create(
            attempt=new_attempt,
            status='pending',
            submitted_at=timezone.now()
        )
        rerun_count += 1

    messages.success(request, f'Successfully queued {rerun_count} evaluation(s) for re-run.')
    return redirect('model_evaluation:index')


@admin_required
def bulk_restart(request):
    """Restart multiple cancelled evaluations by creating new attempts.

    POST only (405 JSON response otherwise). Reads queue IDs from the
    ``selected_items`` POST list (400 JSON response when empty). For every
    cancelled entry a new ``ModelAttempt`` with the same ``attempt_number``
    and a pending ``EvaluationQueue`` entry are created. Entries that do not
    exist, have a malformed ID, or are not cancelled are skipped.
    Redirects to ``model_evaluation:index`` with a success message.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'POST request required'}, status=405)

    selected_ids = request.POST.getlist('selected_items')
    if not selected_ids:
        return JsonResponse({'error': 'No items selected'}, status=400)

    restart_count = 0
    for queue_id in selected_ids:
        try:
            queue_item = EvaluationQueue.objects.get(id=queue_id)
        except (EvaluationQueue.DoesNotExist, ValueError):
            # Unknown or malformed ID: skip it rather than aborting the batch.
            continue

        if queue_item.status != 'cancelled':
            continue

        # Create new ModelAttempt with same attempt_number
        source_attempt = queue_item.attempt
        new_attempt = ModelAttempt.objects.create(
            model=source_attempt.model,
            question=source_attempt.question,
            attempt_number=source_attempt.attempt_number,
            time=timezone.now()
        )

        # Create new EvaluationQueue entry
        EvaluationQueue.objects.create(
            attempt=new_attempt,
            status='pending',
            submitted_at=timezone.now()
        )
        restart_count += 1

    messages.success(request, f'Successfully queued {restart_count} evaluation(s) for restart.')
    return redirect('model_evaluation:index')




@admin_required
def release_for_grading(request):
    """
    Release selected completed evaluations for grading.

    POST only (405 JSON response otherwise). Reads queue IDs from the
    ``selected_items`` POST list (400 JSON response when empty).

    Only completed evaluations whose newest ModelAnswer isn't already released
    will be processed. Non-completed evaluations (failed, cancelled, running,
    pending), unknown IDs and malformed IDs are silently skipped.
    Redirects to the queue status page with a summary message.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'POST request required'}, status=405)

    selected_ids = request.POST.getlist('selected_items')
    if not selected_ids:
        return JsonResponse({'error': 'No items selected'}, status=400)

    released_count = 0
    for queue_id in selected_ids:
        try:
            queue_item = EvaluationQueue.objects.get(id=queue_id)
        except (EvaluationQueue.DoesNotExist, ValueError):
            # Unknown or malformed ID: skip it rather than aborting the batch.
            continue

        if queue_item.status != 'completed':
            continue

        # Newest ModelAnswer for the attempt; None when there is none.
        model_answer = ModelAnswer.objects.filter(
            attempt=queue_item.attempt
        ).order_by('-id').first()

        if model_answer and not model_answer.released_for_grading:
            model_answer.released_for_grading = True
            model_answer.save()
            released_count += 1

    if released_count > 0:
        messages.success(request, f'Successfully released {released_count} evaluation(s) for grading.')
    else:
        messages.info(request, 'No evaluations were released. Only completed evaluations can be released.')

    return redirect('model_evaluation:queue_status')


@admin_required
def cancel_evaluation(request, queue_id):
    """Cancel a pending or running evaluation.

    POST only (405 JSON response otherwise); 404 if the queue entry does not
    exist.

    * pending: the entry is marked cancelled directly and any
      ``ExecutionTracker`` rows for it are deleted.
    * running: cancellation is delegated to the async
      ``queue_manager.cancel_evaluation`` coroutine, run to completion on a
      temporary event loop.
    * any other status: an error message is shown and nothing changes.

    Redirects to the queue page when ``redirect_to=queue`` is posted or the
    referer contains ``queue/``; otherwise to the evaluation detail page.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'POST request required'}, status=405)

    queue_item = get_object_or_404(EvaluationQueue, id=queue_id)

    if queue_item.status == 'pending':
        # Cancel pending evaluation
        queue_item.status = 'cancelled'
        queue_item.completed_at = timezone.now()
        queue_item.error_message = 'Evaluation cancelled by user'
        queue_item.save()

        # Clean up any execution tracker (shouldn't exist for pending, but just in case)
        ExecutionTracker.objects.filter(queue_id=queue_id).delete()

        messages.success(
            request,
            f'Pending evaluation cancelled successfully! Queue ID: {queue_item.id}'
        )

    elif queue_item.status == 'running':
        # Use the async cancellation handler for running evaluations
        import asyncio
        from .queue_manager import cancel_evaluation as async_cancel

        try:
            loop = asyncio.new_event_loop()
            try:
                asyncio.set_event_loop(loop)
                success = loop.run_until_complete(async_cancel(queue_id))
            finally:
                # Always release the loop, even if the coroutine raised, and
                # do not leave a closed loop installed on this worker thread.
                loop.close()
                asyncio.set_event_loop(None)

            if success:
                messages.success(
                    request,
                    f'Running evaluation cancelled successfully! Queue ID: {queue_item.id}. '
                    f'The subprocess has been terminated.'
                )
            else:
                messages.warning(
                    request,
                    f'Evaluation marked for cancellation but subprocess termination may have failed. '
                    f'Queue ID: {queue_item.id}'
                )
        except Exception as e:
            # Boundary handler: report the failure to the admin instead of a
            # 500 page, and keep the traceback in the log.
            logger.exception("Error cancelling evaluation %s", queue_id)
            messages.error(
                request,
                f'Error cancelling evaluation: {str(e)}'
            )

    else:
        messages.error(
            request,
            f'Cannot cancel evaluation with status "{queue_item.status}". '
            f'Only pending and running evaluations can be cancelled.'
        )

    # Redirect based on where the request came from:
    # an explicit redirect parameter wins, then the referer is consulted.
    redirect_to = request.POST.get('redirect_to', '')
    referer = request.META.get('HTTP_REFERER', '')
    if redirect_to == 'queue' or 'queue/' in referer:
        return redirect('model_evaluation:queue_status')

    return redirect('model_evaluation:evaluation_detail', queue_id=queue_id)


@admin_required
@require_POST
def update_subquestion_override(request, subquestion_answer_id):
    """
    Update the admin override value for a subquestion answer.

    This endpoint allows admins to manually override the automatic evaluation
    result for a subquestion when parsing issues occur.

    Request body: JSON object with ``override_value`` set to the string
    ``"0"``, ``"1"`` or ``"null"`` (``"null"`` clears the override).

    Responses (all JSON):
        200: ``success``, ``new_status`` and ``effective_correctness``.
        400: body is not valid JSON / not an object, or the value is invalid.
        404: no subquestion answer with the given ID.
        500: unexpected error while saving.
    """
    from django.http import Http404

    # Look the object up outside the generic handler so that a missing row is
    # reported as 404 rather than being converted into a 500.
    try:
        subquestion_answer = get_object_or_404(ModelSubquestionAnswer, id=subquestion_answer_id)
    except Http404 as e:
        return JsonResponse({
            'success': False,
            'error': str(e)
        }, status=404)

    # Parse the JSON request body
    try:
        data = json.loads(request.body)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return JsonResponse({
            'success': False,
            'error': 'Invalid JSON in request body'
        }, status=400)

    # A JSON array/scalar has no `override_value`; treat it as an invalid value.
    override_value = data.get('override_value') if isinstance(data, dict) else None

    # Validate the override value
    if override_value not in ('0', '1', 'null'):
        return JsonResponse({
            'success': False,
            'error': 'Invalid override value. Must be "0", "1", or "null".'
        }, status=400)

    try:
        # Update the admin_override field ("null" clears the override)
        subquestion_answer.admin_override = None if override_value == 'null' else int(override_value)
        subquestion_answer.save()

        # Return the updated status
        return JsonResponse({
            'success': True,
            'new_status': subquestion_answer.get_correctness_display(),
            'effective_correctness': subquestion_answer.effective_correctness
        })
    except Exception as e:
        logger.exception(
            "Error updating override for subquestion answer %s", subquestion_answer_id
        )
        return JsonResponse({
            'success': False,
            'error': str(e)
        }, status=500)


@admin_required
def subquestion_parsing_check(request):
    """
    Subquestion Parsing Check page for reviewing and verifying model answers to subquestions.

    Displays Active questions grouped by Question -> Subquestion, showing all model answers
    side-by-side for efficient review and verification of automatic parsing.

    Each model answer is annotated with ``fresh_is_correct`` (correctness
    recomputed with ``check_answer_correctness``) and ``has_mismatch``
    (recomputed value differs from the stored ``effective_correctness``).

    Query parameters:
        model: restrict answers to this model ID.
        hide_overridden: ``'true'`` hides answers that have an admin override.
        show_mismatches: ``'true'`` shows only answers with a mismatch.

    Renders ``model_evaluation/subquestion_parsing_check.html``.
    """
    from questions.models import Subquestion
    from django.db.models import Prefetch
    from .answer_utils import check_answer_correctness

    # Get filter parameters
    model_filter = request.GET.get('model')
    hide_overridden = request.GET.get('hide_overridden') == 'true'
    show_mismatches = request.GET.get('show_mismatches') == 'true'

    # Get all active questions with their subquestions. The ordering is applied
    # inside the Prefetch: calling .order_by() on the related manager later
    # would bypass the prefetch cache and issue one query per question.
    questions = Question.objects.filter(
        status__status='active'
    ).prefetch_related(
        Prefetch(
            'subquestion_set',
            queryset=Subquestion.objects.order_by('subquestion_order')
        )
    ).order_by('id')

    # Get all active models for the filter dropdown
    all_models = Model.objects.filter(is_active=True).select_related('company', 'tier').order_by(
        'tier__tier_number', 'company__company_name', 'model_name'
    )

    # Build the data structure: questions -> subquestions -> model answers
    questions_data = []

    for question in questions:
        # Served from the prefetch cache, already ordered by subquestion_order
        subquestions = list(question.subquestion_set.all())

        # Skip questions without subquestions
        if not subquestions:
            continue

        subquestions_data = []
        question_has_visible_data = False  # Track if question has any visible answers

        for subquestion in subquestions:
            # Get all model answers for this subquestion
            model_answers_query = ModelSubquestionAnswer.objects.filter(
                subquestion=subquestion
            ).select_related(
                'model__company', 'model__tier', 'attempt'
            ).order_by(
                'model__tier__tier_number', 'attempt__attempt_number'
            )

            # Apply model filter if specified
            if model_filter:
                model_answers_query = model_answers_query.filter(model_id=model_filter)

            # Apply hide_overridden filter if checked
            if hide_overridden:
                model_answers_query = model_answers_query.filter(admin_override__isnull=True)

            # For each model answer, compute the "fresh" automated correctness
            # and compare with stored effective_correctness
            visible_answers = []
            for ma in model_answers_query:
                # An empty/missing answer always counts as incorrect
                if ma.answer:
                    fresh_is_correct = 1 if check_answer_correctness(ma.answer, subquestion.answer) else 0
                else:
                    fresh_is_correct = 0

                # Store the fresh evaluation on the object for display
                ma.fresh_is_correct = fresh_is_correct

                # Mismatch against the stored effective correctness
                # (admin_override if set, otherwise is_correct)
                ma.has_mismatch = (fresh_is_correct != ma.effective_correctness)

                # If show_mismatches filter is active, only include mismatches
                if not show_mismatches or ma.has_mismatch:
                    visible_answers.append(ma)

            # Track if this subquestion has visible answers
            if visible_answers:
                question_has_visible_data = True

            subquestions_data.append({
                'subquestion': subquestion,
                'model_answers': visible_answers
            })

        # Only include questions that have visible data after filtering
        if question_has_visible_data:
            questions_data.append({
                'question': question,
                'subquestions': subquestions_data
            })

    context = {
        'questions_data': questions_data,
        'all_models': all_models,
        'selected_model': model_filter,
        'hide_overridden': hide_overridden,
        'show_mismatches': show_mismatches,
    }

    return render(request, 'model_evaluation/subquestion_parsing_check.html', context)


@admin_required
@require_POST
def mark_subquestions_verified(request, question_id):
    """
    Mark all subquestion answers for a question as verified.

    For all answers with admin_override=None (using automatic evaluation),
    sets admin_override to the current is_correct value. This allows tracking
    which answers have been human-verified vs still using automatic evaluation.

    Existing admin overrides are preserved. Answers whose is_correct is None
    are left untouched.

    Responses (all JSON):
        200: ``success``, ``count`` (answers updated) and ``message``.
        404: no question with the given ID.
        500: unexpected error while updating.
    """
    from django.http import Http404
    from questions.models import Subquestion

    # Look the question up outside the generic handler so that a missing row
    # is reported as 404 rather than being converted into a 500.
    try:
        question = get_object_or_404(Question, id=question_id)
    except Http404 as e:
        return JsonResponse({
            'success': False,
            'error': str(e)
        }, status=404)

    try:
        # Get all subquestions for this question
        subquestions = Subquestion.objects.filter(question=question)

        # Get all model answers for these subquestions that don't have admin override
        answers_to_verify = ModelSubquestionAnswer.objects.filter(
            subquestion__in=subquestions,
            admin_override__isnull=True
        )

        # Count how many will be updated
        count = 0
        for answer in answers_to_verify:
            # Nothing to confirm when no automatic verdict exists
            if answer.is_correct is None:
                continue

            # Set admin_override to current is_correct value
            answer.admin_override = answer.is_correct
            answer.save()
            count += 1

        return JsonResponse({
            'success': True,
            'count': count,
            'message': f'Verified {count} subquestion answers for Question {question_id}'
        })

    except Exception as e:
        logger.exception(
            "Error marking subquestions as verified for question %s", question_id
        )
        return JsonResponse({
            'success': False,
            'error': str(e)
        }, status=500)


@admin_required
@require_POST
def reset_to_automated_evaluation(request, question_id):
    """
    Reset all subquestion answers for a question to use automated evaluation.

    Recalculates is_correct using the robust answer comparison algorithm
    (case-insensitive, boolean equivalences, etc.) and clears any admin overrides.

    This is useful when the original automated evaluation used a less robust
    algorithm and needs to be corrected.

    Responses (all JSON):
        200: ``success``, ``count`` (answers processed), ``changed_count``
            (answers whose is_correct changed or whose override was cleared)
            and ``message``.
        404: no question with the given ID.
        500: unexpected error while updating.
    """
    from django.http import Http404
    from questions.models import Subquestion
    from .answer_utils import check_answer_correctness

    # Look the question up outside the generic handler so that a missing row
    # is reported as 404 rather than being converted into a 500.
    try:
        question = get_object_or_404(Question, id=question_id)
    except Http404 as e:
        return JsonResponse({
            'success': False,
            'error': str(e)
        }, status=404)

    try:
        # Get all subquestions for this question
        subquestions = Subquestion.objects.filter(question=question)

        # Get all model answers for these subquestions
        all_answers = ModelSubquestionAnswer.objects.filter(
            subquestion__in=subquestions
        ).select_related('subquestion')

        count = 0
        changed_count = 0

        for answer in all_answers:
            count += 1

            # Recalculate is_correct using robust algorithm; an empty/missing
            # answer always counts as incorrect.
            if answer.answer:
                fresh_correct = check_answer_correctness(
                    answer.answer,
                    answer.subquestion.answer
                )
                new_is_correct = 1 if fresh_correct else 0
            else:
                new_is_correct = 0

            # A change is either a different verdict or an override being cleared
            if answer.is_correct != new_is_correct or answer.admin_override is not None:
                changed_count += 1

            # Update the answer: set is_correct to fresh value and clear admin_override
            answer.is_correct = new_is_correct
            answer.admin_override = None
            answer.save()

        return JsonResponse({
            'success': True,
            'count': count,
            'changed_count': changed_count,
            'message': f'Reset {count} subquestion answers for Question {question_id} ({changed_count} changed)'
        })

    except Exception as e:
        logger.exception(
            "Error resetting subquestions to automated evaluation for question %s",
            question_id
        )
        return JsonResponse({
            'success': False,
            'error': str(e)
        }, status=500)


@admin_required
def export_matharena_json(request):
    """
    Export model evaluation data in JSON format for MathArena.

    Exports data for all active benchmark questions (same scope as leaderboard):
    questions in the 'active' state with ``benchmark_inclusion=True`` and no
    ``published`` value. One entry is produced per (question, model) pair that
    has at least one ModelAttempt. Only the latest attempt (highest id) for
    each attempt_number is considered. Each entry includes:
    - problem_id: Question ID
    - model_name: Model identifier
    - human_score: Averaged progress grade from finalized grading sessions (null if none)
    - subquestion_score: Percentage of subquestion points earned, rounded to 1 decimal
    - input_tokens: 0 (not tracked yet)
    - output_tokens: 0 (not tracked yet)
    - cost: 0 (not tracked yet)

    Returns:
        An HttpResponse carrying the JSON list as a file attachment, or a
        JsonResponse with status 500 when no 'active' QuestionState exists.
    """
    from collections import defaultdict
    from django.http import HttpResponse
    from .models import ModelGradingSession

    # Get active benchmark questions (same as leaderboard)
    try:
        active_state = QuestionState.objects.get(status='active')
    except QuestionState.DoesNotExist:
        return JsonResponse({'error': 'Active status not found'}, status=500)

    included_questions = Question.objects.filter(
        status=active_state,
        benchmark_inclusion=True,
        published__isnull=True,  # Exclude published questions from benchmark
    )

    export_data = []

    for question in included_questions:
        # Finalized sessions depend only on the question, so load them once
        # here instead of once per model.
        finalized_sessions = list(
            ModelGradingSession.objects.filter(
                question=question,
                session_status='finalized',
            )
        )

        # Group this question's attempts by model
        attempts_by_model = defaultdict(list)
        model_attempts = ModelAttempt.objects.filter(
            question=question
        ).select_related('model')
        for attempt in model_attempts:
            attempts_by_model[attempt.model.id].append(attempt)

        for attempts in attempts_by_model.values():
            model = attempts[0].model

            latest_attempts = _matharena_latest_attempts(attempts)
            # Resolve each attempt's ModelAnswer once; both scores reuse it.
            answered_attempts = _matharena_answered_attempts(latest_attempts)

            subquestion_score = _matharena_subquestion_score(answered_attempts)
            human_score = _matharena_human_score(
                finalized_sessions, answered_attempts
            )

            export_data.append({
                'problem_id': question.id,
                'model_name': model.model_name,
                'human_score': human_score,
                'subquestion_score': round(subquestion_score, 1),
                'input_tokens': 0,
                'output_tokens': 0,
                'cost': 0
            })

    # Return as JSON file download
    response = HttpResponse(
        json.dumps(export_data, indent=2),
        content_type='application/json'
    )
    response['Content-Disposition'] = 'attachment; filename="improofbench_export.json"'
    return response


def _matharena_latest_attempts(attempts):
    """Return the highest-id attempt for each attempt_number.

    The result is ordered by the first appearance of each attempt_number in
    ``attempts``.
    """
    latest_by_number = {}
    for attempt in attempts:
        current = latest_by_number.get(attempt.attempt_number)
        if current is None or attempt.id > current.id:
            latest_by_number[attempt.attempt_number] = attempt
    return list(latest_by_number.values())


def _matharena_answered_attempts(latest_attempts):
    """Pair each attempt with its ModelAnswer, dropping attempts that have none.

    Returns a list of ``(attempt, model_answer)`` tuples in input order.
    """
    answered = []
    for attempt in latest_attempts:
        try:
            model_answer = ModelAnswer.objects.get(attempt=attempt)
        except ModelAnswer.DoesNotExist:
            continue
        answered.append((attempt, model_answer))
    return answered


def _matharena_subquestion_score(answered_attempts):
    """Return the percentage of subquestion points earned across the attempts.

    A subquestion with a falsy ``points`` value counts as 1 point. An answer
    counts as correct when ``admin_override`` is 1, or, when no override is
    set, when ``is_correct`` is 1. Returns 0 when there are no subquestion
    answers at all.
    """
    total_earned = 0
    total_points = 0

    for attempt, _model_answer in answered_attempts:
        # select_related pulls each subquestion in the same query, avoiding a
        # separate lookup per answer when reading its points.
        subquestion_answers = ModelSubquestionAnswer.objects.filter(
            attempt=attempt
        ).select_related('subquestion')

        for sq_answer in subquestion_answers:
            points = sq_answer.subquestion.points or 1
            total_points += points

            # Use admin_override if set, otherwise use is_correct
            if sq_answer.admin_override is not None:
                is_correct = sq_answer.admin_override == 1
            else:
                is_correct = sq_answer.is_correct == 1

            if is_correct:
                total_earned += points

    if total_points > 0:
        return (total_earned / total_points) * 100
    return 0


def _matharena_human_score(finalized_sessions, answered_attempts):
    """Average the completed progress grades across finalized sessions.

    For every (session, model_answer) pair, the first ModelGrading with
    ``grading_status='completed'`` and a non-null ``progress_grade`` is used.
    Returns None when no such grading exists.
    """
    from .models import ModelGrading

    progress_grades = []
    for session in finalized_sessions:
        for _attempt, model_answer in answered_attempts:
            grading = ModelGrading.objects.filter(
                session=session,
                model_answer=model_answer,
                grading_status='completed',
                progress_grade__isnull=False
            ).first()
            if grading is not None:
                progress_grades.append(grading.progress_grade)

    if not progress_grades:
        return None
    return sum(progress_grades) / len(progress_grades)