"""
Views for Model Scores - displaying evaluation results and performance metrics.
"""

from django.shortcuts import render, redirect
from django.http import JsonResponse
from django.views.decorators.http import require_POST
from django.db.models import Count, Q, Max, Sum, Avg, F
from django.contrib import messages
from accounts.permissions import admin_required
from questions.models import Question, QuestionState, Subquestion
from .models import (
    Model, ModelTier, ModelAttempt, ModelAnswer, 
    ModelSubquestionAnswer, ModelGradingSession, EvaluationQueue,
    ModelGrading, GradingState
)
import json
from collections import defaultdict


def _is_attempt_complete(attempt, expected_subquestions):
    """
    Decide whether a single model attempt counts as a completed evaluation.

    An attempt is complete when all of the following hold:
    - its most recent EvaluationQueue entry (highest id), if any, is not
      'failed' or 'cancelled';
    - at least one ModelAnswer exists for the attempt;
    - if the question has subquestions, the number of ModelSubquestionAnswer
      rows for the attempt equals ``expected_subquestions``.

    Args:
        attempt: The ModelAttempt to inspect.
        expected_subquestions: Number of Subquestion rows of the attempt's question.

    Returns:
        bool: True if the attempt should be counted as completed.
    """
    # Latest queue entry wins, so a successful re-run overrides an earlier failure.
    latest_queue = EvaluationQueue.objects.filter(
        attempt=attempt
    ).order_by('-id').first()
    if latest_queue and latest_queue.status in ['failed', 'cancelled']:
        return False

    # Only existence matters here, so avoid loading the answer row itself.
    if not ModelAnswer.objects.filter(attempt=attempt).exists():
        return False

    if expected_subquestions > 0:
        answered = ModelSubquestionAnswer.objects.filter(attempt=attempt).count()
        # Partial subquestion coverage does not count as complete.
        return answered == expected_subquestions

    # No subquestions expected: the main answer alone is sufficient.
    return True


def _count_completed_evaluations(question, models_with_attempts, expected_subquestions):
    """
    Count completed evaluations of ``question`` across all given models.

    For every (model, attempt_number) slot only the latest ModelAttempt
    (highest id, i.e. the most recent re-run) is considered.

    Args:
        question: The Question being summarised.
        models_with_attempts: Iterable of ``(model, max_attempts)`` pairs.
        expected_subquestions: Number of Subquestion rows of ``question``.

    Returns:
        int: Number of attempt slots whose latest attempt is complete.
    """
    completed = 0
    for model, max_attempts in models_with_attempts:
        for attempt_num in range(1, max_attempts + 1):
            latest_attempt = ModelAttempt.objects.filter(
                model=model,
                question=question,
                attempt_number=attempt_num
            ).order_by('-id').first()

            if latest_attempt and _is_attempt_complete(latest_attempt, expected_subquestions):
                completed += 1
    return completed


@admin_required
def model_scores(request):
    """
    Main view for the Model Scores page showing evaluation results.

    Builds, for every question in the active question state, an evaluation
    progress summary (completed vs. expected evaluations, release and grading
    status), then gathers the model-level statistics (subquestion scores,
    grading statistics, progress grades, pairwise comparisons, word cloud,
    correlations, standard deviations) and passes them to the template both
    as Python objects and as JSON strings for the JavaScript charts.

    If no active question state exists, a warning message is queued and the
    template is rendered with an empty question list.

    Args:
        request: The incoming HttpRequest.

    Returns:
        HttpResponse: The rendered 'model_evaluation/model_scores.html' page.
    """
    # Get active questions
    active_state = QuestionState.objects.filter(status='active').first()
    if not active_state:
        messages.warning(request, "No active question state found.")
        return render(request, 'model_evaluation/model_scores.html', {'questions': []})

    # Get all active questions with evaluation data.
    # Kept as a QuerySet because the score helper below calls .filter() on it.
    questions = Question.objects.filter(status=active_state).order_by('id')

    # Pre-calculate total expected evaluations (same for all questions):
    # Tier 1 models are evaluated twice, tiers 2-4 once.
    tier1_models = Model.objects.filter(tier__tier_number=1, is_active=True).count()
    tier2_models = Model.objects.filter(tier__tier_number=2, is_active=True).count()
    tier3_models = Model.objects.filter(tier__tier_number=3, is_active=True).count()
    tier4_models = Model.objects.filter(tier__tier_number=4, is_active=True).count()
    total_expected = (tier1_models * 2) + tier2_models + tier3_models + tier4_models

    # Resolve each active model's attempt quota once instead of once per question.
    active_models = Model.objects.filter(is_active=True).select_related('tier')
    models_with_attempts = [
        (model, 2 if model.tier.tier_number == 1 else 1)
        for model in active_models
    ]

    # Calculate evaluation progress for each question
    question_data = []
    for question in questions:
        expected_subquestions = Subquestion.objects.filter(question=question).count()

        completed_evaluations = _count_completed_evaluations(
            question, models_with_attempts, expected_subquestions
        )

        # Check if any answers are released for grading
        has_released_answers = ModelAnswer.objects.filter(
            question=question,
            released_for_grading=True
        ).exists()

        # Count submitted gradings
        grading_sessions = ModelGradingSession.objects.filter(
            question=question,
            session_status='finalized'
        ).count()

        question_data.append({
            'question': question,
            'completed_evaluations': completed_evaluations,
            'total_expected': total_expected,
            'has_released_answers': has_released_answers,
            'grading_sessions': grading_sessions,
            # Derived from the count above; no second Subquestion query needed.
            'has_subquestions': expected_subquestions > 0,
        })

    # Get model performance data for subquestions
    models_with_scores = calculate_model_subquestion_scores(questions)

    # Sort by score descending (N/A values at the bottom)
    models_with_scores.sort(key=lambda x: x['score_percentage'] if x['score_percentage'] is not None else -1, reverse=True)

    # Get grading statistics for included questions (exclude published questions)
    included_questions = Question.objects.filter(
        status=active_state,
        benchmark_inclusion=True,
        published__isnull=True
    )
    grading_stats = calculate_grading_statistics(included_questions)
    progress_stats = calculate_progress_grades(included_questions)

    # Get pairwise comparison data
    pairwise_data = calculate_pairwise_comparisons(included_questions)

    # Get word cloud data for active questions
    word_cloud_data = calculate_word_cloud_data(questions)

    # Get correlation data
    correlation_data = calculate_correlations(included_questions)

    # Get standard deviation data for models with multiple attempts
    variance_data = calculate_standard_deviations(included_questions)

    # Convert grading stats to JSON for JavaScript (one JSON string per category)
    grading_stats_json = {key: json.dumps(value) for key, value in grading_stats.items()}

    # Convert progress stats to JSON for JavaScript
    progress_stats_json = json.dumps(progress_stats) if progress_stats else None

    # Convert models data to JSON for JavaScript chart (for export functionality).
    # Model instances are not JSON serialisable, so only plain fields are copied.
    models_json = [
        {
            'model': {
                'display_name': model_data['model'].display_name,
                'id': model_data['model'].id,
            },
            'tier': model_data['tier'],
            'score_percentage': model_data['score_percentage'],
            'questions_attempted': model_data['questions_attempted'],
            'questions_total': model_data['questions_total'],
        }
        for model_data in models_with_scores
    ]
    models_with_scores_json = json.dumps(models_json)

    # Convert pairwise data to JSON for JavaScript
    pairwise_data_json = json.dumps(pairwise_data) if pairwise_data else None

    # Convert word cloud data to JSON for JavaScript
    word_cloud_json = json.dumps(word_cloud_data) if word_cloud_data else None

    # Convert correlation data to JSON for JavaScript (if needed)
    correlation_data_json = json.dumps(correlation_data) if correlation_data else None

    # Convert variance data to JSON for JavaScript chart
    variance_rows = [
        {
            'model': data['model'].display_name,
            'tier': data['tier'],
            'average_score': data['average_score'],
            'sigma': data['sigma'],
            'num_questions': data['num_questions'],
            'num_evaluations': data['num_total_evaluations'],
        }
        for data in variance_data
    ]
    variance_data_json = json.dumps(variance_rows) if variance_rows else None

    context = {
        'questions': question_data,
        'models': models_with_scores,
        'models_json': models_with_scores_json,
        'grading_stats': grading_stats,
        'grading_stats_json': grading_stats_json,
        'progress_stats': progress_stats,
        'progress_stats_json': progress_stats_json,
        'pairwise_data': pairwise_data,
        'pairwise_data_json': pairwise_data_json,
        'word_cloud_data': word_cloud_data,
        'word_cloud_json': word_cloud_json,
        'correlation_data': correlation_data,
        'correlation_data_json': correlation_data_json,
        'variance_data': variance_data,
        'variance_data_json': variance_data_json,
    }

    return render(request, 'model_evaluation/model_scores.html', context)


def calculate_model_subquestion_scores(questions):
    """
    Calculate subquestion scores for each active model.

    Only questions marked for benchmark inclusion that have at least one
    subquestion are considered. For every (model, question) pair the latest
    attempt (highest id, i.e. the most recent re-run) of each attempt number
    is scored as ``points earned / points possible * 100`` over the
    subquestion answers stored for that attempt. Tier 1 models use attempt
    numbers 1 and 2 and the per-attempt percentages are averaged; all other
    tiers use attempt number 1 only. The model's overall score is the mean
    of its per-question percentages.

    Args:
        questions: QuerySet of Question objects to draw from (``.filter()``
            is called on it).

    Returns:
        list[dict]: One entry per active model, ordered by tier number and
        display name, with keys ``model``, ``tier``, ``score_percentage``
        (None when the model has no scorable question), ``questions_attempted``
        and ``questions_total``.
    """
    # Materialise once: the same question list is walked for every model and
    # its length is reported in every result row.
    included_questions = list(
        questions.filter(
            benchmark_inclusion=True,
            subquestion__isnull=False
        ).distinct()
    )
    questions_total = len(included_questions)

    # Get all active models
    models = Model.objects.filter(is_active=True).select_related('tier').order_by(
        'tier__tier_number', 'display_name'
    )

    model_scores = []
    for model in models:
        tier_number = model.tier.tier_number
        # Tier 1 models get two attempts per question, every other tier one.
        attempt_numbers = (1, 2) if tier_number == 1 else (1,)

        question_percentages = []  # One averaged percentage per scored question

        for question in included_questions:
            attempt_percentages = []

            for attempt_num in attempt_numbers:
                # Latest re-run for this attempt number
                latest_attempt = ModelAttempt.objects.filter(
                    model=model,
                    question=question,
                    attempt_number=attempt_num
                ).order_by('-id').first()
                if latest_attempt is None:
                    continue

                # Fetch the answers once; a separate exists() check followed by
                # iteration would hit the database twice.
                subquestion_answers = list(
                    ModelSubquestionAnswer.objects.filter(
                        attempt=latest_attempt
                    ).select_related('subquestion')
                )
                if not subquestion_answers:
                    continue

                points_possible = 0
                points_earned = 0
                for sq_answer in subquestion_answers:
                    points = sq_answer.subquestion.points
                    points_possible += points
                    # Use effective_correctness which considers admin overrides
                    if sq_answer.effective_correctness == 1:
                        points_earned += points

                if points_possible > 0:
                    attempt_percentages.append((points_earned / points_possible) * 100)

            # Average the percentages if multiple attempts (for Tier 1)
            if attempt_percentages:
                question_percentages.append(
                    sum(attempt_percentages) / len(attempt_percentages)
                )

        # Calculate overall average percentage across all scored questions
        if question_percentages:
            overall_percentage = sum(question_percentages) / len(question_percentages)
        else:
            overall_percentage = None

        model_scores.append({
            'model': model,
            'tier': tier_number,
            'score_percentage': overall_percentage,
            # Every scored question contributed exactly one percentage.
            'questions_attempted': len(question_percentages),
            'questions_total': questions_total,
        })

    return model_scores


@admin_required
@require_POST
def toggle_benchmark_inclusion(request, question_id):
    """
    AJAX endpoint that flips the ``benchmark_inclusion`` flag of a question.

    Responds with JSON: ``{'success': True, 'new_value': <bool>}`` on success,
    a 404 payload when the question does not exist, and a 500 payload carrying
    the exception text for any other failure.
    """
    try:
        target = Question.objects.get(id=question_id)

        # Invert the flag and persist it
        flipped = not target.benchmark_inclusion
        target.benchmark_inclusion = flipped
        target.save()

        success_payload = {
            'success': True,
            'new_value': target.benchmark_inclusion
        }
        return JsonResponse(success_payload)

    except Question.DoesNotExist:
        missing_payload = {
            'success': False,
            'error': 'Question not found'
        }
        return JsonResponse(missing_payload, status=404)
    except Exception as exc:
        failure_payload = {
            'success': False,
            'error': str(exc)
        }
        return JsonResponse(failure_payload, status=500)


def calculate_grading_statistics(questions=None):
    """
    Calculate grading statistics for each model and grading category.

    Only completed gradings from finalized grading sessions are used, and only
    models in tiers 1-3 are reported (ordered by descending subquestion score,
    models without a score last). When several graders graded the same
    question, each grader contributes ``1 / number_of_graders`` to that
    question, so every graded question carries a total weight of 1.

    Args:
        questions: Questions to include. When None, the benchmark-included
            questions of the active question state are used.

    Returns:
        dict: Maps each category field name to a dict with ``display_name``,
        ``field_name`` and ``models``; each model entry holds ``model_name``,
        ``tier``, ``total_questions``, ``percentages`` and ``averaged_counts``
        (both keyed by 'yes', 'not_sure', 'no', 'na'). Returns an empty dict
        when ``questions`` is None and no active question state exists.
    """
    # Get active questions if not provided
    if questions is None:
        active_state = QuestionState.objects.filter(status='active').first()
        if not active_state:
            return {}
        questions = Question.objects.filter(status=active_state, benchmark_inclusion=True)

    # Get all finalized grading sessions for these questions
    finalized_sessions = ModelGradingSession.objects.filter(
        question__in=questions,
        session_status='finalized'
    )

    # Get all gradings from finalized sessions
    gradings = ModelGrading.objects.filter(
        session__in=finalized_sessions,
        grading_status='completed'
    ).select_related(
        'model_answer__model',
        'model_answer__question',
        'error_incorrect_logic',
        'error_hallucinated',
        'error_calculation',
        'error_conceptual',
        'achievement_understanding',
        'achievement_correct_result',
        'achievement_insight',
        'achievement_usefulness'
    )

    # Define grading categories with their database field names and display names
    GRADING_CATEGORIES = [
        ('error_incorrect_logic', 'Incorrect Logic'),
        ('error_hallucinated', 'Hallucinated Content'),
        ('error_calculation', 'Calculation Error'),
        ('error_conceptual', 'Conceptual Error'),
        ('achievement_understanding', 'Shows Understanding'),
        ('achievement_correct_result', 'Correct Result'),
        ('achievement_insight', 'Shows Insight'),
        ('achievement_usefulness', 'Useful Progress'),
    ]

    # Get all active models sorted by subquestion score - FILTER TO TIERS 1-3 ONLY
    models_with_scores = calculate_model_subquestion_scores(questions)
    models_with_scores = [m for m in models_with_scores if m['tier'] in [1, 2, 3]]
    models_with_scores.sort(key=lambda x: x['score_percentage'] if x['score_percentage'] is not None else -1, reverse=True)

    # Fetch the gradings a single time and bucket them per model. Filtering the
    # queryset inside the category/model loops would re-run the same query for
    # every one of the eight categories.
    gradings_by_model = defaultdict(list)
    for grading in gradings:
        gradings_by_model[grading.model_answer.model_id].append(grading)

    # Build statistics for each category
    category_data = {}

    for field_name, display_name in GRADING_CATEGORIES:
        category_stats = {
            'display_name': display_name,
            'field_name': field_name,
            'models': []
        }

        # For each model (in score order)
        for model_data in models_with_scores:
            model = model_data['model']

            # Group grades by question to handle multiple graders
            question_grades = defaultdict(list)
            for grading in gradings_by_model.get(model.id, ()):
                grade_value = getattr(grading, field_name)
                if grade_value:
                    # Include all grades, including not_applicable
                    question_grades[grading.model_answer.question_id].append(
                        grade_value.state_code
                    )

            # Calculate averaged counts: each question distributes a weight of 1
            # across the grade codes its graders chose.
            averaged_counts = defaultdict(float)
            for grade_list in question_grades.values():
                grade_counts = defaultdict(int)
                for grade in grade_list:
                    grade_counts[grade] += 1

                num_graders = len(grade_list)
                for grade, count in grade_counts.items():
                    averaged_counts[grade] += count / num_graders

            # Lists are only created when a grade is appended, so every entry
            # represents one graded question.
            total_questions = len(question_grades)

            # Calculate percentages based on averaged counts (including N/A)
            total_weight = sum(averaged_counts.values())
            if total_weight > 0:
                percentages = {
                    'yes': (averaged_counts.get('true', 0) / total_weight) * 100,
                    'not_sure': (averaged_counts.get('not_sure', 0) / total_weight) * 100,
                    'no': (averaged_counts.get('false', 0) / total_weight) * 100,
                    'na': (averaged_counts.get('not_applicable', 0) / total_weight) * 100,
                }
            else:
                percentages = {'yes': 0, 'not_sure': 0, 'no': 0, 'na': 0}

            category_stats['models'].append({
                'model_name': model.display_name,
                'tier': model.tier.tier_number,
                'total_questions': total_questions,
                'percentages': percentages,
                'averaged_counts': {
                    'yes': round(averaged_counts.get('true', 0), 2),
                    'not_sure': round(averaged_counts.get('not_sure', 0), 2),
                    'no': round(averaged_counts.get('false', 0), 2),
                    'na': round(averaged_counts.get('not_applicable', 0), 2),
                }
            })

        category_data[field_name] = category_stats

    return category_data


def calculate_progress_grades(questions=None):
    """
    Calculate progress grade statistics for each model.

    Uses completed gradings from finalized sessions whose answer was released
    for grading and that carry a progress grade. Only models in tiers 1-3 are
    reported. When several graders graded the same question, their grades are
    averaged; the average is rounded to the nearest integer (0.5 rounds up)
    and the question is counted in that 0-3 bucket.

    Args:
        questions: Questions to include. When None, the benchmark-included
            questions of the active question state are used.

    Returns:
        dict: ``{'models': [...]}`` where each entry holds ``model_name``,
        ``tier``, ``total_questions``, ``average_progress`` (rounded to two
        decimals, or None without data), ``percentages`` and ``counts`` (both
        keyed by grade 0-3). Entries are sorted by descending share of grade 3.
        Returns an empty dict when ``questions`` is None and no active
        question state exists.
    """
    # Get active questions if not provided
    if questions is None:
        active_state = QuestionState.objects.filter(status='active').first()
        if not active_state:
            return {}
        questions = Question.objects.filter(status=active_state, benchmark_inclusion=True)

    # Get all finalized grading sessions
    finalized_sessions = ModelGradingSession.objects.filter(
        question__in=questions,
        session_status='finalized'
    )

    # Get all gradings with progress grades (only for released answers)
    gradings = ModelGrading.objects.filter(
        session__in=finalized_sessions,
        grading_status='completed',
        model_answer__released_for_grading=True,
        progress_grade__isnull=False
    ).select_related('model_answer__model', 'model_answer__question')

    # Get models sorted by subquestion score - FILTER TO TIERS 1-3 ONLY
    models_with_scores = calculate_model_subquestion_scores(questions)
    models_with_scores = [m for m in models_with_scores if m['tier'] in [1, 2, 3]]
    models_with_scores.sort(key=lambda x: x['score_percentage'] if x['score_percentage'] is not None else -1, reverse=True)

    # Fetch the gradings once and bucket them as model id -> question id -> grades,
    # instead of issuing one filtered query per model.
    grades_by_model = defaultdict(lambda: defaultdict(list))
    for grading in gradings:
        if grading.progress_grade is not None:
            answer = grading.model_answer
            grades_by_model[answer.model_id][answer.question_id].append(
                grading.progress_grade
            )

    progress_data = {
        'models': []
    }

    for model_data in models_with_scores:
        model = model_data['model']
        question_grades = grades_by_model.get(model.id, {})

        # Count questions in each progress category (matching leaderboard logic)
        # Average grades per question, round to nearest integer, count in that category
        progress_counts = {0: 0, 1: 0, 2: 0, 3: 0}
        total_questions = 0
        sum_weighted_grades = 0.0

        for grade_list in question_grades.values():
            if not grade_list:
                continue

            # Calculate average grade for this question
            avg_grade = sum(grade_list) / len(grade_list)
            # Round to nearest integer (0.5 rounds up)
            rounded_grade = int(avg_grade + 0.5)
            # Keep the bucket inside 0-3 so an out-of-range stored grade cannot
            # raise KeyError and take the whole page down.
            rounded_grade = min(3, max(0, rounded_grade))
            progress_counts[rounded_grade] += 1

            sum_weighted_grades += avg_grade
            total_questions += 1

        # Calculate overall average progress grade and percentages
        if total_questions > 0:
            avg_progress = sum_weighted_grades / total_questions
            percentages = {grade: (count / total_questions) * 100 for grade, count in progress_counts.items()}
        else:
            avg_progress = None
            percentages = {0: 0, 1: 0, 2: 0, 3: 0}

        progress_data['models'].append({
            'model_name': model.display_name,
            'tier': model.tier.tier_number,
            'total_questions': total_questions,
            'average_progress': round(avg_progress, 2) if avg_progress is not None else None,
            'percentages': percentages,
            'counts': progress_counts
        })

    # Sort by complete solution percentage (grade 3) to match leaderboard ordering
    progress_data['models'].sort(key=lambda x: x['percentages'][3], reverse=True)

    return progress_data


def calculate_word_cloud_data(questions=None):
    """
    Build word cloud entries from the comma-separated tags of questions.

    Tags are stripped, empty fragments dropped, and the remainder normalised
    to title case so that differently-cased spellings are counted together.
    Each tag's weight is scaled linearly into the 10-100 range relative to the
    most frequent tag.

    Args:
        questions: Iterable of objects exposing a ``tags`` attribute. When
            None, the questions of the active question state are used.

    Returns:
        list: Dictionaries with 'text', 'weight' and 'count' for each tag,
        ordered by descending weight; empty when there are no tags (or no
        active question state).
    """
    from collections import Counter

    # Fall back to the active questions when the caller supplied none
    if questions is None:
        active_state = QuestionState.objects.filter(status='active').first()
        if not active_state:
            return []
        questions = Question.objects.filter(status=active_state)

    # Tally every normalised, non-empty tag across all questions in one pass
    tag_counts = Counter(
        fragment.strip().lower().title()
        for question in questions
        if question.tags
        for fragment in question.tags.split(',')
        if fragment.strip()
    )

    # Nothing to draw without tags
    if not tag_counts:
        return []

    # The most frequent tag anchors the top of the weight scale
    peak = max(tag_counts.values())

    entries = [
        {
            'text': label,  # Already in title case
            'weight': int(10 + (occurrences / peak) * 90),
            'count': occurrences  # Keep original count for tooltip
        }
        for label, occurrences in tag_counts.items()
    ]

    # Heaviest first; the sort is stable so ties keep first-seen order
    return sorted(entries, key=lambda entry: entry['weight'], reverse=True)


def calculate_pairwise_comparisons(questions=None):
    """
    Pairwise wins by problem using weighted subquestion points.

    For each model M and problem P with subquestions, compute A(M,P) as
    sum(points_i for correct subquestions i) / sum(points_i over all subquestions in P).
    The pairwise matrix entry (M, N) increments by 1 if A(M,P) > A(N,P).

    Rules:
    - Consider only attempt_number=1 (latest re-run) for all models (incl. Tier 1).
    - Restrict to active + benchmark_inclusion + has subquestions when questions is None.
    - Missing/None correctness is treated as not contributing to the numerator (implicit 0).
    - Only attempts whose latest queue status is 'completed' are counted; others treated as 0.
      An attempt with no queue entry at all is dropped the same way.
    - Subquestion points that are not a positive int are weighted as 1.

    Args:
        questions: Questions to compare on. The code calls ``.exists()``,
            ``.count()`` and ``.values_list()`` on it, so a QuerySet is
            expected. When None, the default set described above is used.

    Returns:
        dict: ``models`` (list of ``{'id', 'name', 'tier'}`` in display order),
        ``matrix`` (square list of lists; ``None`` on the diagonal, win counts
        elsewhere), ``total_questions`` and ``total_subquestions``.
    """
    from django.db.models import Max

    # Determine included questions if not provided
    if questions is None:
        active_state = QuestionState.objects.filter(status='active').first()
        if not active_state:
            return {'models': [], 'matrix': [], 'total_questions': 0, 'total_subquestions': 0}
        questions = Question.objects.filter(
            status=active_state,
            benchmark_inclusion=True,
            subquestion__isnull=False,
        ).distinct().order_by('id')

    # Models list (active)
    model_qs = Model.objects.filter(is_active=True).select_related('tier').order_by('tier__tier_number', 'display_name')
    model_list = list(model_qs)
    # Nothing to compare without models or questions: return an empty matrix.
    if not model_list or not questions.exists():
        return {'models': [], 'matrix': [], 'total_questions': questions.count() if hasattr(questions, 'count') else 0, 'total_subquestions': 0}

    # Subquestions and points for included questions
    subq_rows = list(Subquestion.objects.filter(question__in=questions).values('id', 'question_id', 'points'))
    if not subq_rows:
        # No subquestions at all: every model ties on every question, so the
        # matrix is all zeros (None on the diagonal).
        return {
            'models': [
                {'id': m.id, 'name': m.display_name, 'tier': m.tier.tier_number if m.tier else None}
                for m in model_list
            ],
            'matrix': [[None if i == j else 0 for j in range(len(model_list))] for i in range(len(model_list))],
            'total_questions': questions.count(),
            'total_subquestions': 0,
        }

    # Build per-question subquestion lists and total points
    q_subqs = {}  # question_id -> list of (subquestion_id, points)
    q_total_points = {}  # question_id -> sum of points over its subquestions
    all_subq_ids = set()
    for row in subq_rows:
        sid = row['id']
        qid = row['question_id']
        # Fall back to a weight of 1 when points is missing, non-int or not positive.
        pts = row['points'] if isinstance(row['points'], int) and row['points'] > 0 else 1
        all_subq_ids.add(sid)
        q_subqs.setdefault(qid, []).append((sid, pts))
        q_total_points[qid] = q_total_points.get(qid, 0) + pts

    # Latest attempt (attempt_number=1) per (model, question)
    model_ids = [m.id for m in model_list]
    question_ids = list(questions.values_list('id', flat=True))
    # Grouping by (model, question) with Max('id') picks the most recent re-run.
    latest_attempt_rows = list(
        ModelAttempt.objects.filter(
            model_id__in=model_ids,
            question_id__in=question_ids,
            attempt_number=1,
        ).values('model_id', 'question_id').annotate(latest_id=Max('id'))
    )
    attempt_map = {(r['model_id'], r['question_id']): r['latest_id'] for r in latest_attempt_rows}
    attempt_ids = set(attempt_map.values())

    # Keep only attempts whose latest queue entry is 'completed'
    if attempt_ids:
        # attempt_id -> id of its most recent queue entry
        latest_q_rows = list(
            EvaluationQueue.objects.filter(attempt_id__in=attempt_ids)
            .values('attempt_id').annotate(latest_qid=Max('id'))
        )
        qid_map = {r['attempt_id']: r['latest_qid'] for r in latest_q_rows}
        q_objs = {q.id: q for q in EvaluationQueue.objects.filter(id__in=qid_map.values())}
        # Attempts without any queue entry never appear in qid_map and are
        # therefore excluded along with failed/pending ones.
        completed_attempt_ids = {
            a_id for a_id, qid in qid_map.items()
            if qid in q_objs and q_objs[qid].status == 'completed'
        }
        # Filter attempt map
        attempt_map = {k: v for k, v in attempt_map.items() if v in completed_attempt_ids}
        attempt_ids = set(attempt_map.values())

    # Fetch all subquestion answers for these attempts
    ans_by_attempt = {}  # attempt_id -> {subquestion_id: 1 or 0}
    if attempt_ids:
        for ans in ModelSubquestionAnswer.objects.filter(
            attempt_id__in=attempt_ids,
            subquestion_id__in=all_subq_ids
        ).values('attempt_id', 'subquestion_id', 'is_correct', 'admin_override'):
            # Compute effective correctness inline (avoid per-object property overhead)
            # NOTE(review): assumed to mirror the model's effective_correctness
            # property (admin override wins when set) - confirm against the model.
            eff = ans['admin_override'] if ans['admin_override'] is not None else ans['is_correct']
            # Treat None as 0 for now (missing handling not critical per request)
            val = 1 if eff == 1 else 0
            ans_by_attempt.setdefault(ans['attempt_id'], {})[ans['subquestion_id']] = val

    # Precompute A(M,P) for all model/question
    # A = weighted_correct_points / total_points for that question
    A = {m.id: {} for m in model_list}
    for qid in question_ids:
        total_pts = q_total_points.get(qid, 0)
        # Questions without subquestions get no A entry; the matrix loop below
        # then reads 0.0 for every model, i.e. a tie.
        if total_pts <= 0:
            continue
        subqs = q_subqs.get(qid, [])
        for m in model_list:
            attempt_id = attempt_map.get((m.id, qid))
            if not attempt_id:
                # No usable (completed) attempt: the model scores 0 on this question.
                A[m.id][qid] = 0.0
                continue
            correct_pts = 0
            am = ans_by_attempt.get(attempt_id, {})
            for sid, pts in subqs:
                # Unanswered subquestions default to 0 and earn no points.
                if am.get(sid, 0) == 1:
                    correct_pts += pts
            A[m.id][qid] = correct_pts / total_pts

    # Determine display order by descending average subquestion percentage (same as other tables)
    # Ordering is best-effort: if the score calculation fails, every model gets
    # the same key and the stable sort keeps the tier/display-name order.
    try:
        model_scores = calculate_model_subquestion_scores(questions)
        score_by_id = {
            row['model'].id: (row['score_percentage'] if row['score_percentage'] is not None else -1)
            for row in model_scores
        }
    except Exception:
        score_by_id = {}

    sorted_models = sorted(
        model_list,
        key=lambda m: score_by_id.get(m.id, -1),
        reverse=True,
    )

    # Build pairwise matrix in the sorted order: count problems where A(M,P) > A(N,P)
    n = len(sorted_models)
    matrix = [[None if i == j else 0 for j in range(n)] for i in range(n)]
    for qi in question_ids:
        for i, mi in enumerate(sorted_models):
            ai = A.get(mi.id, {}).get(qi, 0.0)
            for j, mj in enumerate(sorted_models):
                if i == j:
                    continue
                aj = A.get(mj.id, {}).get(qi, 0.0)
                # Strict comparison: equal scores are a tie and award no win.
                if ai > aj:
                    matrix[i][j] += 1

    models_info = [
        {'id': m.id, 'name': m.display_name, 'tier': m.tier.tier_number if m.tier else None}
        for m in sorted_models
    ]

    return {
        'models': models_info,
        'matrix': matrix,
        'total_questions': len(question_ids),
        'total_subquestions': len(all_subq_ids),
    }


def _stddev_attempts_to_consider(model, question, tier_number):
    """Return the latest re-run of every attempt slot that counts for *model* on *question*."""
    # Tier 1 models are evaluated twice per question (attempt_number 1 and 2);
    # every other model, including one with no tier assigned, has a single slot.
    attempt_numbers = (1, 2) if tier_number == 1 else (1,)

    attempts = []
    for attempt_num in attempt_numbers:
        # Highest id == most recent re-run for this attempt_number.
        latest_attempt = ModelAttempt.objects.filter(
            model=model,
            question=question,
            attempt_number=attempt_num,
        ).order_by('-id').first()
        if latest_attempt:
            attempts.append(latest_attempt)
    return attempts


def _stddev_attempt_percentage(attempt):
    """Return the weighted subquestion score (0-100) of *attempt*, or None if it cannot be scored."""
    # Only attempts whose most recent queue entry completed are scored.
    latest_queue = EvaluationQueue.objects.filter(
        attempt=attempt
    ).order_by('-id').first()
    if not latest_queue or latest_queue.status != 'completed':
        return None

    # Materialise once: avoids a separate EXISTS query before iterating.
    subquestion_answers = list(
        ModelSubquestionAnswer.objects.filter(
            attempt=attempt
        ).select_related('subquestion')
    )

    points_earned = 0
    points_possible = 0
    for sq_answer in subquestion_answers:
        points = sq_answer.subquestion.points
        points_possible += points
        if sq_answer.effective_correctness == 1:
            points_earned += points

    # No answers (or only zero-point subquestions) -> nothing to score.
    if points_possible <= 0:
        return None
    return (points_earned / points_possible) * 100


def calculate_standard_deviations(questions=None):
    """
    Calculate standard deviations for models with multiple attempts.

    For each model M and question Q with subquestions:
    - Calculate the weighted subquestion score for each evaluation
    - Calculate sample variance V(M,Q) across evaluations
    - Estimate overall standard deviation: σ(M) = (1/N) * √(Σ V(M,Qi))

    Only includes:
    - Questions with subquestions
    - Models with at least 2 scored evaluations for at least one question

    Args:
        questions: Queryset of questions to analyse. When None, the questions
            of the first 'active' QuestionState that are marked for benchmark
            inclusion are used.

    Returns:
        A list of per-model dicts sorted by 'average_score' descending, each
        with the keys 'model', 'tier', 'num_questions',
        'num_total_evaluations', 'average_score', 'sigma',
        'question_variances' and 'question_scores'. An empty dict is returned
        when *questions* is None and no active question state exists.
    """
    import numpy as np

    # Get active questions if not provided
    if questions is None:
        active_state = QuestionState.objects.filter(status='active').first()
        if not active_state:
            return {}
        questions = Question.objects.filter(status=active_state, benchmark_inclusion=True)

    # Filter to questions with subquestions
    questions_with_subquestions = questions.filter(
        subquestion__isnull=False
    ).distinct()

    # Get all active models
    models = Model.objects.filter(is_active=True).select_related('tier').order_by(
        'tier__tier_number', 'display_name'
    )

    model_variance_data = []

    for model in models:
        # A model may have no tier assigned; treat it like a single-attempt
        # model instead of crashing on model.tier.tier_number.
        tier_number = model.tier.tier_number if model.tier else None

        question_scores = {}  # question_id -> list of percentage scores
        question_variances = {}  # question_id -> sample variance

        for question in questions_with_subquestions:
            scores_for_question = []
            for attempt in _stddev_attempts_to_consider(model, question, tier_number):
                percentage = _stddev_attempt_percentage(attempt)
                if percentage is not None:
                    scores_for_question.append(percentage)

            # A sample variance needs at least 2 evaluations of this question.
            if len(scores_for_question) >= 2:
                question_scores[question.id] = scores_for_question
                # ddof=1 gives the unbiased sample variance.
                question_variances[question.id] = float(np.var(scores_for_question, ddof=1))

        # Only include models that have at least 2 evaluations for at least one question
        if not question_variances:
            continue

        # σ(M) = (1/N) * √(Σ V(M,Qi))
        N = len(question_variances)
        sigma = (1 / N) * np.sqrt(sum(question_variances.values()))

        # Mean of the per-question means (matching the Overview tab methodology).
        avg_score_per_question = float(
            np.mean([np.mean(scores) for scores in question_scores.values()])
        )

        model_variance_data.append({
            'model': model,
            'tier': tier_number,
            'num_questions': N,
            'num_total_evaluations': sum(len(scores) for scores in question_scores.values()),
            'average_score': avg_score_per_question,  # Use per-question average to match Overview
            'sigma': float(sigma),
            'question_variances': question_variances,
            'question_scores': question_scores,
        })

    # Sort by average score descending
    model_variance_data.sort(
        key=lambda x: x['average_score'] if x['average_score'] is not None else -1,
        reverse=True,
    )

    return model_variance_data


def _round_stat_or_none(value, ndigits=4):
    """Round a statistic to *ndigits* places, mapping None and NaN to None."""
    import math

    if value is None:
        return None
    value = float(value)
    # pearsonr yields NaN when an input is constant; report "undefined" as None.
    if math.isnan(value):
        return None
    return round(value, ndigits)


def calculate_correlations(questions=None):
    """
    Calculate correlations between grading metrics.
    Currently calculates:
    1. Total Progress grade vs. Subquestion percentage correlation.
    2. All subquestions correct vs. Progress grade = 3 correlation (both binary).

    For each graded model answer:
    - Average Total Progress grade across all gradings for that answer
    - Subquestion percentage for that specific model attempt
    - Binary: all subquestions correct (0 or 1)
    - Binary: progress grade = 3 from all graders (0 or 1)

    Args:
        questions: Queryset of questions to analyse. When None, the questions
            of the first 'active' QuestionState that are marked for benchmark
            inclusion are used.

    Returns:
        A dict with the keys 'progress_vs_subquestion' and
        'all_correct_vs_progress_3', each holding 'correlation', 'p_value'
        (both None when undefined or when fewer than 2 samples exist),
        'n_samples' and the raw sample lists. An empty dict is returned when
        *questions* is None and no active question state exists.
    """
    from scipy import stats

    # Get active questions if not provided
    if questions is None:
        active_state = QuestionState.objects.filter(status='active').first()
        if not active_state:
            return {}
        questions = Question.objects.filter(status=active_state, benchmark_inclusion=True)

    # Get all finalized grading sessions for these questions
    finalized_sessions = ModelGradingSession.objects.filter(
        question__in=questions,
        session_status='finalized'
    )

    # Get all completed gradings with progress grades
    gradings = ModelGrading.objects.filter(
        session__in=finalized_sessions,
        grading_status='completed',
        progress_grade__isnull=False
    ).select_related(
        'model_answer__model',
        'model_answer__question',
        'model_answer__attempt'
    )

    # Group gradings by model answer. The answers (with their question and
    # attempt) are already loaded by select_related above, so keep them instead
    # of re-querying each one.
    answer_progress_grades = defaultdict(list)
    answers_by_id = {}
    for grading in gradings:
        answer_progress_grades[grading.model_answer_id].append(grading.progress_grade)
        answers_by_id.setdefault(grading.model_answer_id, grading.model_answer)

    # Paired samples for the continuous correlation
    progress_grades = []
    subquestion_percentages = []

    # Paired samples for the binary correlation
    all_subquestions_correct = []  # 1 if all correct, 0 otherwise
    progress_grade_3 = []  # 1 if all graders gave 3, 0 otherwise

    for model_answer_id, grade_list in answer_progress_grades.items():
        model_answer = answers_by_id.get(model_answer_id)
        if model_answer is None:
            continue

        # Calculate average progress grade for this answer
        avg_progress = sum(grade_list) / len(grade_list)

        # Get subquestion answers for this attempt
        subquestion_answers = ModelSubquestionAnswer.objects.filter(
            attempt=model_answer.attempt
        ).select_related('subquestion')

        # Calculate subquestion percentage for this attempt
        points_earned = 0
        points_possible = 0
        all_correct = True  # Track if all subquestions are correct

        for sq_answer in subquestion_answers:
            points = sq_answer.subquestion.points
            points_possible += points

            # Use effective_correctness which considers admin overrides
            if sq_answer.effective_correctness == 1:
                points_earned += points
            else:
                all_correct = False

        # Only include if there were subquestions to grade
        if points_possible <= 0:
            continue

        progress_grades.append(avg_progress)
        subquestion_percentages.append((points_earned / points_possible) * 100)

        # Binary values only for questions that actually define subquestions
        if Subquestion.objects.filter(question=model_answer.question).exists():
            all_subquestions_correct.append(1 if all_correct else 0)
            # Check every grade directly rather than inferring it from the mean.
            progress_grade_3.append(1 if all(grade == 3 for grade in grade_list) else 0)

    correlation_data = {
        'progress_vs_subquestion': {
            'correlation': None,
            'p_value': None,
            'n_samples': len(progress_grades),
            'progress_grades': progress_grades,
            'subquestion_percentages': subquestion_percentages
        },
        'all_correct_vs_progress_3': {
            'correlation': None,
            'p_value': None,
            'n_samples': len(all_subquestions_correct),
            'all_subquestions_correct': all_subquestions_correct,
            'progress_grade_3': progress_grade_3
        }
    }

    pearson_jobs = (
        (
            'progress_vs_subquestion',
            progress_grades,
            subquestion_percentages,
            "Error calculating correlation",
        ),
        (
            'all_correct_vs_progress_3',
            all_subquestions_correct,
            progress_grade_3,
            "Error calculating binary correlation",
        ),
    )
    for key, xs, ys, error_prefix in pearson_jobs:
        # Need at least 2 points for a correlation
        if len(xs) < 2:
            continue
        try:
            correlation, p_value = stats.pearsonr(xs, ys)
        except Exception as e:
            # In case of numerical issues; leave the entry as None.
            print(f"{error_prefix}: {e}")
            continue
        # A p-value of exactly 0.0 is a valid result and must be kept;
        # only undefined (NaN) statistics are reported as None.
        correlation_data[key]['correlation'] = _round_stat_or_none(correlation)
        correlation_data[key]['p_value'] = _round_stat_or_none(p_value)

    return correlation_data
