{% extends 'base.html' %}
{% load static %}

{% block title %}Benchmark Results - IMProofBench{% endblock %}

{% block extra_css %}
<!-- Chart.js 4.4.0 from the jsDelivr CDN; the inline page script instantiates `Chart` -->
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.min.js"></script>
<style>
    /* ===== Pairwise comparison table ===== */
    .pairwise-table {
        font-size: 0.9rem;
    }

    /* Header cells: centered, semi-bold, light grey background */
    .pairwise-table th {
        padding: 8px;
        font-weight: 600;
        text-align: center;
        vertical-align: middle;
        background-color: #f8f9fa;
    }

    /* Data cells; the transform transition animates the hover zoom below */
    .pairwise-table td {
        padding: 6px;
        border: 1px solid #dee2e6;
        text-align: center;
        vertical-align: middle;
        transition: transform 0.2s ease;
    }

    /* Left-aligned, non-wrapping label cells */
    .pairwise-table .model-header {
        font-weight: 600;
        text-align: left;
        white-space: nowrap;
        background-color: #f8f9fa;
    }

    /* Muted grey cells */
    .pairwise-table .diagonal {
        color: #6c757d;
        background-color: #e9ecef;
    }

    /* Zoom and lift a hovered cell; diagonal and label cells are excluded */
    .pairwise-table td:hover:not(.diagonal):not(.model-header) {
        position: relative;
        z-index: 10;
        transform: scale(1.1);
        box-shadow: 0 2px 5px rgba(0,0,0,0.2);
    }

    /* ===== Shared helpers ===== */
    .tier-badge {
        margin-left: 4px;
        padding: 2px 6px;
        font-size: 0.75rem;
    }

    .tab-content {
        padding-top: 20px;
    }

    /* ===== Non-agentic styling (blue #0066cc throughout) ===== */
    .non-agentic-label {
        font-weight: 600;
        color: #0066cc;
    }

    /* !important so the blue wins over any other color rule on the element */
    .non-agentic-model {
        font-weight: 500;
        color: #0066cc !important;
    }

    /* Dotted underline plus help cursor mark the text as a tooltip trigger */
    .non-agentic-tooltip {
        border-bottom: 1px dotted #0066cc;
        cursor: help;
    }
</style>
{% endblock %}

{% block content %}
<div class="container mt-4">
    <!-- Hero Section for General Audience -->
    <div class="text-center mb-4">
        <h1 class="display-5 fw-bold text-primary mb-3">IMProofBench Leaderboard</h1>
        <p class="lead">Evaluating AI Systems on Uncontaminated Research-Level Mathematics</p>
    </div>
    
    <!-- What is IMProofBench? -->
    <div class="alert alert-light border-primary mb-4" style="background-color: #f0f7ff;">
        <div class="row align-items-center">
            <div class="col-md-8">
                <h5 class="text-primary mb-2">
                    <i class="bi bi-info-circle-fill me-2"></i>What is IMProofBench?
                </h5>
                <p class="mb-2">
                    IMProofBench is a benchmark designed to measure whether AI systems can produce rigorous mathematical proofs 
                    at the level of professional mathematicians. We maintain a <strong>private, uncontaminated problem set</strong> 
                    sourced from active mathematical research, ensuring models are tested on genuinely novel problems they haven't seen during training.
                </p>
                <p class="mb-0 text-muted">
                    <strong>Key Features:</strong> 🔒 Private problems prevent data contamination • 👨‍🏫 Human expert grading by mathematicians • 
                    ✓ Focus on proof correctness, not just answers • 🔄 Regular updates with new problems
                </p>
            </div>
            <div class="col-md-4 text-center">
                <div class="bg-white rounded p-3 shadow-sm">
                    <h3 class="text-primary mb-1">{{ total_questions }}</h3>
                    <p class="text-muted mb-2">Active Problems</p>
                    <h3 class="text-success mb-1">{{ total_models }}</h3>
                    <p class="text-muted mb-0">Models Evaluated</p>
                </div>
            </div>
        </div>
    </div>
    
    <!-- Navigation Tabs -->
    <ul class="nav nav-tabs" id="leaderboardTab" role="tablist">
        <li class="nav-item" role="presentation">
            <button class="nav-link active" id="progress-tab" data-bs-toggle="tab" data-bs-target="#progress" type="button" role="tab" aria-controls="progress">
                Progress Grades
            </button>
        </li>
        <li class="nav-item" role="presentation">
            <button class="nav-link" id="scores-tab" data-bs-toggle="tab" data-bs-target="#scores" type="button" role="tab" aria-controls="scores">
                Subquestion Scores
            </button>
        </li>
        <li class="nav-item" role="presentation">
            <button class="nav-link" id="pairwise-tab" data-bs-toggle="tab" data-bs-target="#pairwise" type="button" role="tab" aria-controls="pairwise">
                Pairwise Comparisons
            </button>
        </li>
    </ul>
    
    <!-- Tab Content -->
    <div class="tab-content" id="leaderboardTabContent">
        
        <!-- Progress Grades Tab (Default) -->
        <div class="tab-pane fade show active" id="progress" role="tabpanel" aria-labelledby="progress-tab">
            <div class="card mt-3">
                <div class="card-body">
                    <h4>Complete Solution Rate</h4>
                    <p class="text-muted">
                        <strong>What this measures:</strong> The percentage of problems where each model produced a complete and correct mathematical proof.
                        Problems are graded by expert mathematicians on a 0-3 scale, where a score of 3 indicates a complete solution.
                    </p>
                    <ul class="text-muted small">
                        <li><strong>3 (Complete Solution):</strong> The model provided a fully correct mathematical proof</li>
                        <li><strong>2 (Major Progress):</strong> Significant progress with key insights, but incomplete</li>
                        <li><strong>1 (Minor Progress):</strong> Some correct steps or partial understanding</li>
                        <li><strong>0 (No Progress):</strong> No meaningful progress toward the solution</li>
                    </ul>
                    <p class="text-muted">
                        The chart below shows what percentage of problems each model achieved at each progress level.
                    </p>
                    {% if progress_stats and progress_stats.models %}
                    <!-- Custom legend for tablets/mobile - hidden on desktop -->
                    <div id="customLegend" class="d-lg-none mb-3">
                        <div class="d-flex flex-column gap-2">
                            <div class="d-flex align-items-center">
                                <span style="display: inline-block; width: 20px; height: 12px; background-color: #28a745; border: 1px solid #28a745; margin-right: 8px;"></span>
                                <span style="font-size: 12px;">Complete Solution (3)</span>
                            </div>
                            <div class="d-flex align-items-center">
                                <span style="display: inline-block; width: 20px; height: 12px; background-color: #17a2b8; border: 1px solid #17a2b8; margin-right: 8px;"></span>
                                <span style="font-size: 12px;">Major Progress (2)</span>
                            </div>
                            <div class="d-flex align-items-center">
                                <span style="display: inline-block; width: 20px; height: 12px; background-color: #ffc107; border: 1px solid #ffc107; margin-right: 8px;"></span>
                                <span style="font-size: 12px;">Minor Progress (1)</span>
                            </div>
                            <div class="d-flex align-items-center">
                                <span style="display: inline-block; width: 20px; height: 12px; background-color: #dc3545; border: 1px solid #dc3545; margin-right: 8px;"></span>
                                <span style="font-size: 12px;">No Progress (0)</span>
                            </div>
                        </div>
                    </div>
                    <div class="chart-container" style="position: relative; height:400px; width:100%;">
                        <canvas id="progressGradeChart"></canvas>
                    </div>
                    {% else %}
                    <div class="alert alert-warning">
                        No progress grade data available yet.
                    </div>
                    {% endif %}
                </div>
            </div>
        </div>
        
        <!-- Subquestion Scores Tab -->
        <div class="tab-pane fade" id="scores" role="tabpanel" aria-labelledby="scores-tab">
            <div class="card mt-3">
                <div class="card-body">
                    <h4>Verifiable Subproblem Performance</h4>
                    <p class="text-muted">
                        <strong>What this measures:</strong> How well models solve specific, automatically-verifiable components of larger problems.
                        These subproblems test precise mathematical calculations and logical reasoning that can be checked without human review.
                    </p>
                    <p class="text-muted small">
                        <strong>How to read:</strong> The solid bar shows the percentage of points earned out of <em>all</em> benchmark questions.
                        For models with partial evaluations, a lighter "whisker" extends to show the maximum achievable score if the model
                        were to answer all remaining questions perfectly. This helps distinguish actual performance from incomplete evaluation.
                    </p>
                    <p class="text-muted small">
                        Examples: Computing specific values, verifying formulas, checking special cases, or determining truth values of modified statements.
                    </p>

                    <!-- Non-Agentic Model Toggle -->
                    <div class="form-check mb-3">
                        <input class="form-check-input" type="checkbox" id="showNonAgenticCheckbox" checked>
                        <label class="form-check-label non-agentic-label" for="showNonAgenticCheckbox">
                            Show <span class="non-agentic-tooltip" data-bs-toggle="tooltip" data-bs-placement="top"
                                  data-bs-html="true"
                                  title="<strong>Non-Agentic Evaluations:</strong> Direct model prompting without agentic scaffolding (no multi-turn reasoning, no Docker container). Provides baseline performance comparison. <br><br>See <a href='https://arxiv.org/abs/2509.26076' target='_blank' rel='noopener noreferrer' style='color: white; text-decoration: underline;'>Section 3.3 of our paper</a> for details.">
                                Non-Agentic evaluations
                            </span>
                        </label>
                    </div>
                    <div class="chart-container" style="position: relative; height: 600px; width: 100%;">
                        <canvas id="subquestionResultsChart"></canvas>
                    </div>
                    <div class="mt-3 text-center">
                        <small class="text-muted">
                            <strong>Color Scale:</strong>
                            <span class="badge" style="background-color: #28a745; color: white;">≥80%</span>
                            <span class="badge" style="background-color: #5cb85c; color: white;">≥60%</span>
                            <span class="badge" style="background-color: #ffc107; color: #212529;">≥40%</span>
                            <span class="badge" style="background-color: #fd7e14; color: white;">≥20%</span>
                            <span class="badge" style="background-color: #dc3545; color: white;">&lt;20%</span>
                            <span class="badge" style="background-color: #6c757d; color: white;">N/A</span>
                        </small>
                    </div>
                </div>
            </div>
        </div>
        
        <!-- Pairwise Comparisons Tab -->
        <div class="tab-pane fade" id="pairwise" role="tabpanel" aria-labelledby="pairwise-tab">
            <div class="card mt-3">
                <div class="card-body">
                    <h4>Head-to-Head Win Rates</h4>
                    <p class="text-muted">
                        <strong>What this measures:</strong> Direct performance comparison between models. Each cell shows how often the row model
                        outperformed the column model on problems they both attempted. Higher numbers indicate better relative performance.
                    </p>
                    <p class="text-muted small">
                        <strong>How to read:</strong> Find a model in the rows (↓) and another in the columns (→).
                        The number shows on how many problems the row model achieved a higher weighted subquestion score than the column model.
                        Green shading indicates the row model generally outperforms the column model, while red indicates the opposite.
                    </p>

                    <!-- Non-Agentic Model Toggle -->
                    <div class="form-check mb-3">
                        <input class="form-check-input" type="checkbox" id="showNonAgenticPairwiseCheckbox" checked>
                        <label class="form-check-label non-agentic-label" for="showNonAgenticPairwiseCheckbox">
                            Show <span class="non-agentic-tooltip" data-bs-toggle="tooltip" data-bs-placement="top"
                                  data-bs-html="true"
                                  title="<strong>Non-Agentic Evaluations:</strong> Direct model prompting without agentic scaffolding (no multi-turn reasoning, no Docker container). Provides baseline performance comparison. <br><br>See <a href='https://arxiv.org/abs/2509.26076' target='_blank' rel='noopener noreferrer' style='color: white; text-decoration: underline;'>Section 3.3 of our paper</a> for details.">
                                Non-Agentic evaluations
                            </span>
                        </label>
                    </div>

                    {% if pairwise_data and pairwise_data.models %}
                    <div class="alert alert-info">
                        <strong>Total Questions:</strong> {{ pairwise_data.total_questions }} questions<br>
                        <strong>Total Subquestions:</strong> {{ pairwise_data.total_subquestions }} subquestions
                    </div>
                    
                    <div class="table-responsive" id="pairwise-table-container">
                        <!-- Table will be rendered by JavaScript -->
                    </div>
                    {% else %}
                    <div class="alert alert-warning">
                        No pairwise comparison data available yet.
                    </div>
                    {% endif %}
                </div>
            </div>
        </div>
    </div>
    
    <!-- FAQ Section -->
    <div class="mt-5 mb-5">
        <h2 class="text-primary mb-4">Frequently Asked Questions</h2>
        
        <div class="accordion" id="faqAccordion">
            <!-- Question 1 -->
            <div class="accordion-item">
                <h2 class="accordion-header" id="headingOne">
                    <button class="accordion-button" type="button" data-bs-toggle="collapse" data-bs-target="#collapseOne" aria-expanded="true" aria-controls="collapseOne">
                        <i class="bi bi-question-circle-fill text-primary me-2"></i>
                        How are the problems selected for IMProofBench?
                    </button>
                </h2>
                <div id="collapseOne" class="accordion-collapse collapse show" aria-labelledby="headingOne">
                    <div class="accordion-body">
                        Problems are contributed by professional mathematicians from active research areas. Each problem undergoes rigorous peer review 
                        to ensure it requires genuine mathematical reasoning and proof construction skills. We prioritize problems that test deep 
                        mathematical understanding rather than computational ability.
                    </div>
                </div>
            </div>
            
            <!-- Question 2 -->
            <div class="accordion-item">
                <h2 class="accordion-header" id="headingTwo">
                    <button class="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#collapseTwo" aria-expanded="false" aria-controls="collapseTwo">
                        <i class="bi bi-question-circle-fill text-primary me-2"></i>
                        How does the evaluation process work?
                    </button>
                </h2>
                <div id="collapseTwo" class="accordion-collapse collapse" aria-labelledby="headingTwo">
                    <div class="accordion-body">
                        IMProofBench evaluations are conducted through an automated internal system to maintain consistency and prevent problem leakage. 
                        All models are given 24 hours per problem with up to 300,000 output tokens for the main question and 100,000 tokens per subquestion. 
                        Models have access to Python, SageMath, and web search capabilities to ensure fair comparison.
                    </div>
                </div>
            </div>
            
            <!-- Question 3 -->
            <div class="accordion-item">
                <h2 class="accordion-header" id="headingThree">
                    <button class="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#collapseThree" aria-expanded="false" aria-controls="collapseThree">
                        <i class="bi bi-question-circle-fill text-primary me-2"></i>
                        How is the grading performed?
                    </button>
                </h2>
                <div id="collapseThree" class="accordion-collapse collapse" aria-labelledby="headingThree">
                    <div class="accordion-body">
                        Each model's solution is graded by expert mathematicians who evaluate the correctness and completeness of the mathematical proof. 
                        Graders assess whether the logical steps are valid, the proof strategy is sound, and the conclusion correctly answers the question.
                    </div>
                </div>
            </div>
            
            <!-- Question 4 -->
            <div class="accordion-item">
                <h2 class="accordion-header" id="headingFour">
                    <button class="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#collapseFour" aria-expanded="false" aria-controls="collapseFour">
                        <i class="bi bi-question-circle-fill text-primary me-2"></i>
                        Why is the problem set kept private?
                    </button>
                </h2>
                <div id="collapseFour" class="accordion-collapse collapse" aria-labelledby="headingFour">
                    <div class="accordion-body">
                        Keeping problems private is essential to prevent data contamination. Once problems become public, they can be included in training 
                        data for future models, invalidating the benchmark. Our private problem set ensures that models are tested on genuinely novel 
                        problems, providing a true measure of their mathematical reasoning capabilities rather than memorization.
                    </div>
                </div>
            </div>
            
            <!-- Question 5 -->
            <div class="accordion-item">
                <h2 class="accordion-header" id="headingFive">
                    <button class="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#collapseFive" aria-expanded="false" aria-controls="collapseFive">
                        <i class="bi bi-question-circle-fill text-primary me-2"></i>
                        How often is the benchmark updated?
                    </button>
                </h2>
                <div id="collapseFive" class="accordion-collapse collapse" aria-labelledby="headingFive">
                    <div class="accordion-body">
                        We continuously add new problems as they pass our review process. Models are re-evaluated periodically on the growing problem set. 
                        This ensures the benchmark remains challenging and relevant as AI capabilities advance. Check back regularly for updated results 
                        and newly evaluated models.
                    </div>
                </div>
            </div>
            
            <!-- Question 6 -->
            <div class="accordion-item">
                <h2 class="accordion-header" id="headingSix">
                    <button class="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#collapseSix" aria-expanded="false" aria-controls="collapseSix">
                        <i class="bi bi-question-circle-fill text-primary me-2"></i>
                        What makes IMProofBench different from other math benchmarks?
                    </button>
                </h2>
                <div id="collapseSix" class="accordion-collapse collapse" aria-labelledby="headingSix">
                    <div class="accordion-body">
                        IMProofBench focuses specifically on <strong>proof generation</strong> at research level, not just problem solving. 
                        Our problems require constructing rigorous mathematical arguments, not just finding answers. Human expert grading ensures we evaluate 
                        mathematical correctness, not just pattern matching.
                    </div>
                </div>
            </div>
            
            <!-- Question 7 -->
            <div class="accordion-item">
                <h2 class="accordion-header" id="headingSeven">
                    <button class="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#collapseSeven" aria-expanded="false" aria-controls="collapseSeven">
                        <i class="bi bi-question-circle-fill text-primary me-2"></i>
                        How can I contribute or get involved?
                    </button>
                </h2>
                <div id="collapseSeven" class="accordion-collapse collapse" aria-labelledby="headingSeven">
                    <div class="accordion-body">
                        Mathematicians can contribute problems through our submission system (requires account creation and verification). 
                        We're particularly interested in problems from active research areas that test deep mathematical reasoning. 
                        You can also join our <a href="https://improofbench.zulipchat.com/join/oclqw6wstanvip4t7agk66kj/" target="_blank" rel="noopener noreferrer">Zulip community</a>
                        to discuss the benchmark and stay updated on developments.
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>

<script>
// Initialize charts and table when page loads
document.addEventListener('DOMContentLoaded', function() {
    // Track which charts have been initialized
    let chartsInitialized = {
        scores: false,
        pairwise: false,
        progress: false
    };
    
    // Restore last selected tab from localStorage
    const savedTab = localStorage.getItem('leaderboardActiveTab') || 'progress';
    
    // Activate the saved tab
    const savedTabButton = document.querySelector(`#${savedTab}-tab`);
    const savedTabPane = document.querySelector(`#${savedTab}`);
    
    if (savedTabButton && savedTabPane) {
        // Remove active class from all tabs and panes
        document.querySelectorAll('.nav-link').forEach(tab => tab.classList.remove('active'));
        document.querySelectorAll('.tab-pane').forEach(pane => pane.classList.remove('show', 'active'));
        
        // Add active class to saved tab
        savedTabButton.classList.add('active');
        savedTabPane.classList.add('show', 'active');
        
        // Initialize the appropriate chart for the saved tab
        if (savedTab === 'progress' && !chartsInitialized.progress) {
            {% if progress_stats_json %}
            initializeProgressChart();
            {% endif %}
            chartsInitialized.progress = true;
        } else if (savedTab === 'scores' && !chartsInitialized.scores) {
            {% if models_json %}
            createSubquestionResultsChart('subquestionResultsChart', {{ models_json|safe }});
            {% endif %}
            chartsInitialized.scores = true;
        } else if (savedTab === 'pairwise' && !chartsInitialized.pairwise) {
            setTimeout(function() {
                initializePairwiseTable();
                chartsInitialized.pairwise = true;
            }, 100);
        }
    } else {
        // Fallback to progress tab if saved tab not found
        {% if progress_stats_json %}
        initializeProgressChart();
        {% endif %}
        chartsInitialized.progress = true;
    }
    
    // Initialize tabs and save selection to localStorage
    const progressTabButton = document.querySelector('#progress-tab');
    const scoresTabButton = document.querySelector('#scores-tab');
    const pairwiseTabButton = document.querySelector('#pairwise-tab');
    
    if (progressTabButton) {
        progressTabButton.addEventListener('click', function () {
            localStorage.setItem('leaderboardActiveTab', 'progress');
            setTimeout(function() {
                if (!chartsInitialized.progress) {
                    {% if progress_stats_json %}
                    initializeProgressChart();
                    {% endif %}
                    chartsInitialized.progress = true;
                }
            }, 200);
        });
    }
    
    if (scoresTabButton) {
        scoresTabButton.addEventListener('click', function () {
            localStorage.setItem('leaderboardActiveTab', 'scores');
            setTimeout(function() {
                if (!chartsInitialized.scores) {
                    {% if models_json %}
                    createSubquestionResultsChart('subquestionResultsChart', {{ models_json|safe }});
                    {% endif %}
                    chartsInitialized.scores = true;
                }
            }, 200);
        });
    }
    
    if (pairwiseTabButton) {
        pairwiseTabButton.addEventListener('click', function () {
            localStorage.setItem('leaderboardActiveTab', 'pairwise');
            setTimeout(function() {
                if (!chartsInitialized.pairwise) {
                    initializePairwiseTable();
                    chartsInitialized.pairwise = true;
                }
            }, 200);
        });
    }
    
    // Function to create subquestion results chart (from Model Scores)
    function createSubquestionResultsChart(canvasId, modelsData) {
        const canvas = document.getElementById(canvasId);
        if (!canvas) return;

        // Filter models based on checkbox state
        const showNonAgentic = document.getElementById('showNonAgenticCheckbox')?.checked ?? true;
        const filteredModels = showNonAgentic ? modelsData : modelsData.filter(m => m.framework_type !== 'non-agentic');

        // Prepare data for horizontal bar chart with whiskers
        const modelNames = [];
        const baseScores = [];  // Actual scores
        const whiskerScores = [];  // Extension to max possible
        const backgroundColors = [];
        const whiskerColors = [];
        const modelFrameworkTypes = [];  // Track framework types for styling

        // Process each model (already sorted by score)
        filteredModels.forEach(model => {
            modelNames.push(model.display_name || model.model.display_name);
            modelFrameworkTypes.push(model.framework_type);

            // Use score percentage or 0 for N/A
            const baseScore = model.score_percentage !== null ? model.score_percentage : 0;
            const maxScore = model.max_percentage !== null ? model.max_percentage : baseScore;

            baseScores.push(parseFloat(baseScore.toFixed(1)));

            // Whisker extends from base to max
            if (model.has_unevaluated && maxScore > baseScore) {
                whiskerScores.push(parseFloat((maxScore - baseScore).toFixed(1)));
            } else {
                whiskerScores.push(0);
            }

            // Color gradient based on performance (like MathArena)
            let color;
            if (model.score_percentage === null) {
                color = '#6c757d';  // Gray for N/A
            } else if (baseScore >= 80) {
                color = '#28a745';  // Green for excellent
            } else if (baseScore >= 60) {
                color = '#5cb85c';  // Light green for good
            } else if (baseScore >= 40) {
                color = '#ffc107';  // Yellow for moderate
            } else if (baseScore >= 20) {
                color = '#fd7e14';  // Orange for low
            } else {
                color = '#dc3545';  // Red for very low
            }
            backgroundColors.push(color);

            // Whisker is semi-transparent version of the same color
            const whiskerColor = color + '40';  // Add 40 for 25% opacity
            whiskerColors.push(whiskerColor);
        });
        
        // Create the chart with stacked bars for whiskers
        const ctx = canvas.getContext('2d');
        new Chart(ctx, {
            type: 'bar',
            data: {
                labels: modelNames,
                datasets: [
                    {
                        label: 'Actual Score (%)',
                        data: baseScores,
                        backgroundColor: backgroundColors,
                        borderColor: backgroundColors,
                        borderWidth: 1,
                        stack: 'stack0'
                    },
                    {
                        label: 'Potential if Perfect on Remaining',
                        data: whiskerScores,
                        backgroundColor: whiskerColors,
                        borderColor: whiskerColors,
                        borderWidth: 1,
                        stack: 'stack0'
                    }
                ]
            },
            options: {
                indexAxis: 'y',  // Horizontal bars
                responsive: true,
                maintainAspectRatio: false,
                plugins: {
                    title: {
                        display: false
                    },
                    legend: {
                        display: true,
                        position: 'bottom'
                    },
                    tooltip: {
                        callbacks: {
                            label: function(context) {
                                const modelData = filteredModels[context.dataIndex];
                                const questionsText = `${modelData.questions_attempted}/${modelData.questions_total} questions`;

                                if (modelData.score_percentage === null) {
                                    return `No subquestions evaluated`;
                                }

                                if (context.dataset.label === 'Actual Score (%)') {
                                    return `Actual: ${modelData.score_percentage.toFixed(1)}% (${questionsText})`;
                                } else {
                                    if (modelData.has_unevaluated) {
                                        return `Max possible: ${modelData.max_percentage.toFixed(1)}% if perfect on remaining`;
                                    } else {
                                        return null;  // Don't show whisker tooltip if fully evaluated
                                    }
                                }
                            },
                            afterLabel: function(context) {
                                // Add explanation on first hover
                                if (context.datasetIndex === 0) {
                                    const modelData = filteredModels[context.dataIndex];
                                    if (modelData.has_unevaluated) {
                                        return 'Note: Light bar shows potential if perfect on unevaluated questions';
                                    }
                                }
                                return null;
                            }
                        }
                    }
                },
                scales: {
                    x: {
                        stacked: true,  // Enable stacking for whiskers
                        title: {
                            display: true,
                            text: 'Score on All Benchmark Questions (%)'
                        },
                        min: 0,
                        max: 100,
                        ticks: {
                            callback: function(value) {
                                return value + '%';
                            }
                        }
                    },
                    y: {
                        stacked: true,  // Enable stacking for whiskers
                        title: {
                            display: false
                        },
                        ticks: {
                            color: function(context) {
                                // Color non-agentic model names in blue
                                const index = context.index;
                                return modelFrameworkTypes[index] === 'non-agentic' ? '#0066cc' : '#666';
                            },
                            font: function(context) {
                                // Make non-agentic model names slightly bolder
                                const index = context.index;
                                return {
                                    weight: modelFrameworkTypes[index] === 'non-agentic' ? '500' : 'normal'
                                };
                            }
                        }
                    }
                }
            }
        });
    }
    
    // Function to initialize progress chart
    function initializeProgressChart() {
        // Draws the progress-grade chart on the 'progressGradeChart' canvas.
        // The template only emits the statements below when the view supplied
        // progress_stats_json; otherwise this function has an empty body.
        {% if progress_stats_json %}
        const progressStats = {{ progress_stats_json|safe }};
        createProgressGradeChart('progressGradeChart', progressStats);
        {% endif %}
    }
    
    function createProgressGradeChart(canvasId, progressData) {
        // Draws a horizontal, stacked bar chart showing, for every model, the
        // percentage of responses that received each progress grade (0-3).
        //
        // canvasId     - id of the <canvas> element to draw on; if no such
        //                element exists the function returns without doing anything.
        // progressData - object with a `models` array; each entry provides
        //                `model_name` and `percentages` indexed by grade 0..3.
        const canvas = document.getElementById(canvasId);
        if (!canvas) return;

        // Nothing to plot without a model list; leave any existing chart alone.
        if (!progressData || !Array.isArray(progressData.models)) return;

        // Chart.js throws if a canvas that still hosts a chart is reused, so
        // release the previous instance before drawing again.
        Chart.getChart(canvas)?.destroy();

        // Use model name without tier label
        const modelNames = progressData.models.map(model => model.model_name);

        // One series per grade, rounded to 1 decimal and kept numeric so the
        // linear axis receives numbers rather than strings. A missing entry
        // counts as 0 instead of raising a TypeError.
        const gradeSeries = (grade) => progressData.models.map(
            model => Number(Number(model.percentages?.[grade] ?? 0).toFixed(1))
        );
        const noProgressData = gradeSeries(0);      // Grade 0
        const minorProgressData = gradeSeries(1);   // Grade 1
        const majorProgressData = gradeSeries(2);   // Grade 2
        const completeData = gradeSeries(3);        // Grade 3

        // Create the chart with reversed order (Complete Solution first)
        const ctx = canvas.getContext('2d');
        new Chart(ctx, {
            type: 'bar',
            data: {
                labels: modelNames,
                datasets: [
                    {
                        label: 'Complete Solution (3)',
                        data: completeData,
                        backgroundColor: '#28a745',  // Green
                        borderColor: '#28a745',
                        borderWidth: 1
                    },
                    {
                        label: 'Major Progress (2)',
                        data: majorProgressData,
                        backgroundColor: '#17a2b8',  // Cyan/Info Blue
                        borderColor: '#17a2b8',
                        borderWidth: 1
                    },
                    {
                        label: 'Minor Progress (1)',
                        data: minorProgressData,
                        backgroundColor: '#ffc107',  // Yellow/Amber
                        borderColor: '#ffc107',
                        borderWidth: 1
                    },
                    {
                        label: 'No Progress (0)',
                        data: noProgressData,
                        backgroundColor: '#dc3545',  // Red
                        borderColor: '#dc3545',
                        borderWidth: 1
                    }
                ]
            },
            options: {
                indexAxis: 'y',  // Horizontal bars
                responsive: true,
                maintainAspectRatio: false,
                plugins: {
                    title: {
                        display: false
                    },
                    legend: {
                        // Evaluated once at creation time: the Chart.js legend is
                        // only shown on large screens (lg breakpoint, >= 992px)
                        display: window.innerWidth >= 992,
                        position: 'top',
                        align: 'center',  // Center on desktop
                        labels: {
                            boxWidth: 40,
                            padding: 15,
                            font: {
                                size: 12
                            }
                        }
                    },
                    tooltip: {
                        callbacks: {
                            // Tooltip line in the form "<dataset label>: <value>%"
                            label: function(context) {
                                const label = context.dataset.label || '';
                                const value = context.parsed.x || 0;
                                return `${label}: ${value}%`;
                            }
                        }
                    }
                },
                scales: {
                    x: {
                        stacked: true,
                        title: {
                            display: true,
                            text: 'Percentage of Responses'
                        },
                        min: 0,
                        max: 100,
                        ticks: {
                            callback: function(value) {
                                return value + '%';
                            }
                        }
                    },
                    y: {
                        stacked: true,
                        title: {
                            display: false
                        }
                    }
                }
            }
        });
    }
    
    // Function to initialize pairwise table
    function initializePairwiseTable() {
        // Builds the pairwise comparison table from the server-provided data.
        // The call is only emitted into the page when the view supplied
        // pairwise_data_json; otherwise this function has an empty body.
        {% if pairwise_data_json %}
        createPairwiseTable({{ pairwise_data_json|safe }});
        {% endif %}
    }
    
    function createPairwiseTable(pairwiseData) {
        // Renders the pairwise comparison matrix as a heatmap table inside
        // #pairwise-table-container.
        //
        // pairwiseData - object with `models` (entries providing `name` and
        //                `framework_type`) and `matrix`, where matrix[i][j] is
        //                the value shown in row i / column j.
        //
        // Each off-diagonal cell is coloured by matrix[i][j] - matrix[j][i]:
        // red for negative, white for 0, green for positive, scaled by the
        // largest absolute difference among the visible models.
        // Does nothing when the container or the data is missing.
        const tableContainer = document.querySelector('#pairwise-table-container');
        if (!tableContainer || !pairwiseData || !pairwiseData.models || !pairwiseData.matrix) return;

        // The table is assembled as an HTML string, so every piece of data that
        // ends up in it is escaped to keep it from being parsed as markup.
        const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
        const escapeHtml = (text) => String(text ?? '').replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

        // Filter models based on checkbox state (all models shown if the checkbox is absent)
        const showNonAgentic = document.getElementById('showNonAgenticPairwiseCheckbox')?.checked ?? true;
        const filteredIndices = [];
        pairwiseData.models.forEach((model, idx) => {
            if (showNonAgentic || model.framework_type !== 'non-agentic') {
                filteredIndices.push(idx);
            }
        });
        const filteredModels = filteredIndices.map(idx => pairwiseData.models[idx]);

        // Build filtered matrix. Missing rows and null/undefined cells become 0
        // here so that the colour maths and the displayed text always agree.
        const filteredMatrix = filteredIndices.map(origI =>
            filteredIndices.map(origJ => pairwiseData.matrix[origI]?.[origJ] ?? 0)
        );

        // Compute maximum absolute pairwise advantage for scaling. The
        // differences are antisymmetric, so the upper triangle is sufficient.
        let maxAbsDiff = 0;
        for (let i = 0; i < filteredMatrix.length; i++) {
            for (let j = i + 1; j < filteredMatrix.length; j++) {
                const ad = Math.abs(filteredMatrix[i][j] - filteredMatrix[j][i]);
                if (ad > maxAbsDiff) maxAbsDiff = ad;
            }
        }

        // Color interpolation function for a given diff value centered at 0
        function getHeatmapColorForDiff(diff) {
            // Normalize to [0,1] with 0.5 at diff=0
            const normalized = maxAbsDiff > 0 ? (diff + maxAbsDiff) / (2 * maxAbsDiff) : 0.5;

            // Create color gradient from red (negative) through white (0) to green (positive)
            let r, g, b;
            if (normalized < 0.5) {
                // Red to white
                const intensity = normalized * 2;  // 0 to 1
                r = 255;
                g = Math.round(150 + (255 - 150) * intensity);  // 150 to 255
                b = Math.round(150 + (255 - 150) * intensity);  // 150 to 255
            } else {
                // White to green
                const intensity = (normalized - 0.5) * 2;  // 0 to 1
                r = Math.round(255 - (255 - 150) * intensity);  // 255 to 150
                g = 255;
                b = Math.round(255 - (255 - 150) * intensity);  // 255 to 150
            }

            return `rgba(${r}, ${g}, ${b}, 0.6)`;
        }

        let tableHtml = '<table class="table pairwise-table"><thead><tr>';
        tableHtml += '<th style="width: 200px;">Model ↓ vs Model →</th>';

        // Add column headers (non-agentic models get the blue highlight class)
        filteredModels.forEach(model => {
            const colorClass = model.framework_type === 'non-agentic' ? 'non-agentic-model' : '';
            tableHtml += `<th scope="col" style="writing-mode: vertical-rl; text-orientation: mixed; height: 150px;" class="${colorClass}">
                ${escapeHtml(model.name)}
            </th>`;
        });
        tableHtml += '</tr></thead><tbody>';

        // Add rows
        filteredModels.forEach((model, rowIdx) => {
            const colorClass = model.framework_type === 'non-agentic' ? 'non-agentic-model' : '';
            tableHtml += '<tr>';
            tableHtml += `<td class="model-header ${colorClass}">
                ${escapeHtml(model.name)}
            </td>`;

            filteredMatrix[rowIdx].forEach((value, colIdx) => {
                if (rowIdx === colIdx) {
                    tableHtml += '<td class="diagonal">—</td>';
                    return;
                }
                const diff = value - filteredMatrix[colIdx][rowIdx];
                const bgColor = getHeatmapColorForDiff(diff);
                // Bold the cells whose difference exceeds 70% of the largest one
                const textStyle = (maxAbsDiff > 0 && Math.abs(diff) / maxAbsDiff > 0.7) ? 'font-weight: bold;' : '';
                tableHtml += `<td style="background-color: ${bgColor}; ${textStyle}">${escapeHtml(value)}</td>`;
            });
            tableHtml += '</tr>';
        });
        tableHtml += '</tbody></table>';

        tableContainer.innerHTML = tableHtml;
    }

    // Attach a Bootstrap tooltip to every element that opts in via data-bs-toggle="tooltip"
    const tooltipTriggerList = Array.from(document.querySelectorAll('[data-bs-toggle="tooltip"]'));
    tooltipTriggerList.forEach((tooltipTriggerEl) => {
        new bootstrap.Tooltip(tooltipTriggerEl);
    });

    // Subquestion Scores: rebuild the chart whenever the non-agentic toggle changes
    const checkbox = document.getElementById('showNonAgenticCheckbox');
    checkbox?.addEventListener('change', () => {
        const chartCanvas = document.getElementById('subquestionResultsChart');
        if (!chartCanvas) return;

        // Release the chart currently bound to the canvas, if there is one,
        // before a new chart is created on the same canvas
        Chart.getChart(chartCanvas)?.destroy();

        // Draw again; the call is only present when the view supplied models_json
        {% if models_json %}
        createSubquestionResultsChart('subquestionResultsChart', {{ models_json|safe }});
        {% endif %}
    });

    // Handle checkbox toggle for non-agentic models (Pairwise Comparisons)
    const pairwiseCheckbox = document.getElementById('showNonAgenticPairwiseCheckbox');
    if (pairwiseCheckbox) {
        pairwiseCheckbox.addEventListener('change', function() {
            // Re-render through initializePairwiseTable() so the serialized
            // pairwise data is embedded in the page once instead of being
            // duplicated in this handler. createPairwiseTable() reads the
            // checkbox state itself, so no argument is needed.
            initializePairwiseTable();
        });
    }
});
</script>
{% endblock %}