"""
Presentation Generator - Visualizations optimized for presentations and talks.

This module generates clean, readable visualizations suitable for presentations,
with emphasis on clarity, larger fonts, and combined views.
"""

import os
from typing import Dict, Any
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scripts.visualization.core.data_loader import split_by_question_type
from scripts.visualization.core.utils import (
    save_plot,
    get_color_palette,
    get_model_family_order,
    apply_display_names_to_list,
    get_size_pattern_order,
)
from scripts.evaluate_responses import get_ground_truth_answer, compare_answers


def save_presentation_chart(
    fig: plt.Figure, base_filepath: str, no_titles: bool = False
) -> list:
    """
    Save a presentation chart in both high-quality PNG and PDF formats.

    The figure is closed afterwards even when one of the saves raises, so a
    failed export does not leave the figure registered with pyplot.

    Parameters:
    - fig: matplotlib Figure to save
    - base_filepath: Base filepath without extension
    - no_titles: Whether titles were suppressed (not used by this function)

    Returns:
    - List with the PNG path followed by the PDF path
    """
    png_filepath = base_filepath + ".png"
    pdf_filepath = base_filepath + ".pdf"

    try:
        # High-resolution raster copy
        fig.savefig(
            png_filepath,
            format="png",
            dpi=300,
            bbox_inches="tight",
            facecolor="white",
        )
        # Vector copy
        fig.savefig(
            pdf_filepath, format="pdf", bbox_inches="tight", facecolor="white"
        )
    finally:
        # Release the figure whether or not the saves succeeded
        plt.close(fig)

    return [png_filepath, pdf_filepath]


def create_combined_model_performance_chart(
    full_output_data: pd.DataFrame,
    question_data: pd.DataFrame,
    title: str = "Model Performance Comparison",
    figsize: tuple = (16, 10),
    no_titles: bool = False,
) -> plt.Figure:
    """
    Create a combined model performance chart with side-by-side bars for
    full output and question-based tasks, with additional short horizontal
    lines showing input and output accuracy for question-based tasks.

    Either input frame may be empty; models missing from one of them get a
    zero-height bar for that task type. Input/output accuracy lines are only
    drawn when the corresponding accuracy is greater than zero.

    Note: this function updates the global ``plt.rcParams`` font sizes, which
    stay in effect after it returns.

    Parameters:
    - full_output_data: DataFrame with full output task results
      (reads the "model" and "correct" columns)
    - question_data: DataFrame with question-based task results
      (reads "model", "correct" and, when present, "target")
    - title: Chart title
    - figsize: Figure size (width, height)
    - no_titles: Whether to suppress titles

    Returns:
    - matplotlib Figure
    """

    def _mean_correct(frame: pd.DataFrame, keys) -> dict:
        """Return {group key: mean of "correct"}; an empty frame yields {}."""
        if frame.empty:
            return {}
        return frame.groupby(keys)["correct"].mean().to_dict()

    # Accuracy per model for each task type. Plain dicts are used so that a
    # missing/empty data set simply produces no entries instead of a
    # column-less DataFrame that cannot be indexed by "model".
    full_output_by_model = _mean_correct(full_output_data, "model")
    question_by_model = _mean_correct(question_data, "model")

    # Accuracy per (target, model) for the input/output overlay lines
    has_targets = not question_data.empty and "target" in question_data.columns
    question_by_target = (
        _mean_correct(question_data, ["target", "model"]) if has_targets else {}
    )

    all_models = set(full_output_by_model) | set(question_by_model)

    if not all_models:
        # Return placeholder figure if no data
        fig, ax = plt.subplots(figsize=figsize)
        ax.text(
            0.5,
            0.5,
            "No data available",
            ha="center",
            va="center",
            transform=ax.transAxes,
            fontsize=16,
        )
        return fig

    models = get_model_family_order(list(all_models))
    display_names = apply_display_names_to_list(models)

    # Increase base font sizes for presentation. Done before the Axes is
    # created so that this figure already picks up the settings.
    plt.rcParams.update(
        {
            "font.size": 14,
            "axes.titlesize": 18,
            "axes.labelsize": 16,
            "xtick.labelsize": 14,
            "ytick.labelsize": 14,
            "legend.fontsize": 14,
        }
    )

    fig, ax = plt.subplots(figsize=figsize)

    # Set up bar positions
    x = np.arange(len(models))
    width = 0.35  # Width of bars

    # Both bar series share the per-model color; they differ by hatch/alpha
    model_colors = get_color_palette(models, "models")
    bar_colors = [model_colors[model] for model in models]

    # Align every series with the model order (0 when a model has no data)
    full_output_accuracies = [full_output_by_model.get(m, 0) for m in models]
    question_accuracies = [question_by_model.get(m, 0) for m in models]
    question_input_accuracies = [
        question_by_target.get(("input", m), 0) for m in models
    ]
    question_output_accuracies = [
        question_by_target.get(("output", m), 0) for m in models
    ]

    # Full output bars (solid)
    ax.bar(
        x - width / 2,
        full_output_accuracies,
        width,
        label="Full Output Tasks",
        color=bar_colors,
        alpha=0.8,
        edgecolor="black",
        linewidth=1.0,
    )

    # Question-based bars (diagonal hatching)
    ax.bar(
        x + width / 2,
        question_accuracies,
        width,
        label="Question-Based Tasks",
        color=bar_colors,
        alpha=0.6,
        hatch="///",
        edgecolor="black",
        linewidth=1.0,
    )

    # Overlay input/output accuracy as short horizontal lines on the
    # question-based bars. Only the first line of each kind carries a label
    # so the legend gets exactly one entry per kind.
    input_line_added = False
    output_line_added = False

    for i, (input_acc, output_acc) in enumerate(
        zip(question_input_accuracies, question_output_accuracies)
    ):
        x_pos = x[i] + width / 2  # Center of the question-based bar

        # Input accuracy line (dark blue, dashed)
        if input_acc > 0:
            ax.hlines(
                input_acc,
                x_pos - width * 0.4,
                x_pos + width * 0.4,
                colors="darkblue",
                linestyles="dashed",
                linewidth=5,
                label="Input Accuracy" if not input_line_added else "",
            )
            input_line_added = True

        # Output accuracy line (dark red, dotted)
        if output_acc > 0:
            ax.hlines(
                output_acc,
                x_pos - width * 0.4,
                x_pos + width * 0.4,
                colors="darkred",
                linestyles="dotted",
                linewidth=5,
                label="Output Accuracy" if not output_line_added else "",
            )
            output_line_added = True

    # Customize chart
    ax.set_ylabel("Accuracy", fontsize=16, fontweight="bold")
    ax.set_xlabel("Models", fontsize=16, fontweight="bold")

    if not no_titles:
        ax.set_title(title, fontsize=18, fontweight="bold", pad=20)

    ax.set_xticks(x)
    ax.set_xticklabels(display_names, rotation=0, ha="center", fontsize=14)
    ax.set_ylim(0, 1.0)

    # Format y-axis as percentages
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0%}"))

    # Add grid for better readability
    ax.grid(True, axis="y", alpha=0.3, linestyle="--", linewidth=0.5)
    ax.set_axisbelow(True)

    # Legend order: bars first, then lines (entries that were never drawn
    # are skipped)
    handles, labels = ax.get_legend_handles_labels()
    desired_order = (
        "Full Output Tasks",
        "Question-Based Tasks",
        "Input Accuracy",
        "Output Accuracy",
    )
    order = [labels.index(name) for name in desired_order if name in labels]

    legend = ax.legend(
        [handles[idx] for idx in order],
        [labels[idx] for idx in order],
        loc="upper left",
        frameon=True,
        fancybox=True,
        shadow=True,
        fontsize=14,
        framealpha=0.95,
    )
    legend.get_frame().set_facecolor("white")

    # Clean up spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_linewidth(1.5)
    ax.spines["bottom"].set_linewidth(1.5)

    # Adjust layout to prevent label cutoff
    plt.tight_layout()

    return fig


def create_scaling_performance_chart(
    full_output_data: pd.DataFrame,
    title: str = "Model Scaling Performance by Pattern",
    figsize: tuple = (16, 10),
    no_titles: bool = False,
) -> plt.Figure:
    """
    Create a scaling performance chart showing model performance across size patterns
    for full output tasks only.

    One group of bars is drawn per size pattern, with one bar per model inside
    each group. A (pattern, model) combination without data gets a
    zero-height bar.

    Note: this function updates the global ``plt.rcParams`` font sizes, which
    stay in effect after it returns.

    Parameters:
    - full_output_data: DataFrame with full output task results
      (reads the "size_pattern", "model" and "correct" columns)
    - title: Chart title
    - figsize: Figure size (width, height)
    - no_titles: Whether to suppress titles

    Returns:
    - matplotlib Figure
    """
    if full_output_data.empty or "size_pattern" not in full_output_data.columns:
        # Return placeholder figure if there is nothing to group by
        fig, ax = plt.subplots(figsize=figsize)
        ax.text(
            0.5,
            0.5,
            "No full output data available",
            ha="center",
            va="center",
            transform=ax.transAxes,
            fontsize=16,
        )
        return fig

    # Calculate performance by size pattern and model
    pattern_performance = (
        full_output_data.groupby(["size_pattern", "model"])["correct"]
        .agg(["mean", "count"])
        .reset_index()
    )
    pattern_performance.columns = ["size_pattern", "model", "accuracy", "sample_count"]

    # Get unique patterns and models with proper ordering
    patterns = get_size_pattern_order(pattern_performance["size_pattern"].unique())
    models = get_model_family_order(pattern_performance["model"].unique())
    display_names = apply_display_names_to_list(models)

    # (pattern, model) -> accuracy, built once instead of filtering the
    # frame for every bar
    accuracy_lookup = {
        (pattern, model): accuracy
        for pattern, model, accuracy in zip(
            pattern_performance["size_pattern"],
            pattern_performance["model"],
            pattern_performance["accuracy"],
        )
    }

    # Increase base font sizes for presentation. Done before the Axes is
    # created so that this figure already picks up the settings.
    plt.rcParams.update(
        {
            "font.size": 14,
            "axes.titlesize": 18,
            "axes.labelsize": 16,
            "xtick.labelsize": 14,
            "ytick.labelsize": 14,
            "legend.fontsize": 14,
        }
    )

    fig, ax = plt.subplots(figsize=figsize)

    # Set up bar positions: each pattern group spans 0.8 of an x unit
    x = np.arange(len(patterns))
    width = 0.8 / len(models)

    # Get colors for models
    model_colors = get_color_palette(models, "models")

    # Create bars for each model
    for i, (model, display_name) in enumerate(zip(models, display_names)):
        # Align data with pattern order (0 where the model has no data)
        accuracies = [
            accuracy_lookup.get((pattern, model), 0) for pattern in patterns
        ]

        # Offset this model's bars so the group is centered on the tick
        x_pos = x + (i - (len(models) - 1) / 2) * width

        ax.bar(
            x_pos,
            accuracies,
            width,
            label=display_name,
            color=model_colors[model],
            alpha=0.8,
            edgecolor="black",
            linewidth=1.0,
        )

    # Customize chart
    ax.set_ylabel("Accuracy", fontsize=16, fontweight="bold")
    ax.set_xlabel("Size Pattern", fontsize=16, fontweight="bold")

    if not no_titles:
        ax.set_title(title, fontsize=18, fontweight="bold", pad=20)

    ax.set_xticks(x)
    ax.set_xticklabels(patterns, rotation=0, ha="center", fontsize=14)
    ax.set_ylim(0, 1.0)

    # Format y-axis as percentages
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0%}"))

    # Add grid for better readability
    ax.grid(True, axis="y", alpha=0.3, linestyle="--", linewidth=0.5)
    ax.set_axisbelow(True)

    # Add horizontal legend below the plot
    legend = ax.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, -0.1),
        ncol=min(len(models), 4),  # At most 4 columns
        frameon=True,
        fancybox=True,
        shadow=True,
        fontsize=14,
        framealpha=0.95,
    )
    legend.get_frame().set_facecolor("white")

    # Clean up spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_linewidth(1.5)
    ax.spines["bottom"].set_linewidth(1.5)

    # Adjust layout to make room for legend below
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)

    return fig


def create_input_output_comparison_chart(
    question_data: pd.DataFrame,
    title: str = "Model Performance: Input vs Output Targets",
    figsize: tuple = (16, 10),
    no_titles: bool = False,
) -> plt.Figure:
    """
    Create an input vs output comparison chart for question-based tasks,
    styled for presentations with model colors and input/output patterns.

    Each model gets two adjacent bars: accuracy on rows whose "target" is
    "input" and accuracy on rows whose "target" is "output". A model without
    rows for a target gets a zero-height bar for it.

    Note: this function updates the global ``plt.rcParams`` font sizes, which
    stay in effect after it returns.

    Parameters:
    - question_data: DataFrame with question-based task results
      (reads the "target", "model" and "correct" columns)
    - title: Chart title
    - figsize: Figure size (width, height)
    - no_titles: Whether to suppress titles

    Returns:
    - matplotlib Figure
    """
    if question_data.empty or "target" not in question_data.columns:
        # Return placeholder figure if no data
        fig, ax = plt.subplots(figsize=figsize)
        ax.text(
            0.5,
            0.5,
            "No question-based data with targets available",
            ha="center",
            va="center",
            transform=ax.transAxes,
            fontsize=16,
        )
        return fig

    # Calculate performance by target and model
    target_performance = (
        question_data.groupby(["target", "model"])["correct"]
        .agg(["mean", "count"])
        .reset_index()
    )
    target_performance.columns = ["target", "model", "accuracy", "sample_count"]

    models = get_model_family_order(target_performance["model"].unique())
    display_names = apply_display_names_to_list(models)

    # (target, model) -> accuracy, built once instead of filtering per model
    accuracy_lookup = {
        (target, model): accuracy
        for target, model, accuracy in zip(
            target_performance["target"],
            target_performance["model"],
            target_performance["accuracy"],
        )
    }
    input_accuracies = [accuracy_lookup.get(("input", m), 0) for m in models]
    output_accuracies = [accuracy_lookup.get(("output", m), 0) for m in models]

    # Increase base font sizes for presentation. Done before the Axes is
    # created so that this figure already picks up the settings.
    plt.rcParams.update(
        {
            "font.size": 14,
            "axes.titlesize": 18,
            "axes.labelsize": 16,
            "xtick.labelsize": 14,
            "ytick.labelsize": 14,
            "legend.fontsize": 14,
        }
    )

    fig, ax = plt.subplots(figsize=figsize)

    # Set up bar positions
    x = np.arange(len(models))
    width = 0.35  # Width of bars

    # Both bar series share the per-model color; they differ by hatch/alpha
    model_colors = get_color_palette(models, "models")
    bar_colors = [model_colors[model] for model in models]

    # Input bars (vertical-stripe hatch)
    ax.bar(
        x - width / 2,
        input_accuracies,
        width,
        label="Input Targets",
        color=bar_colors,
        alpha=0.8,
        hatch="|||",
        edgecolor="black",
        linewidth=1.0,
    )

    # Output bars (star hatch)
    ax.bar(
        x + width / 2,
        output_accuracies,
        width,
        label="Output Targets",
        color=bar_colors,
        alpha=0.6,
        hatch="***",
        edgecolor="black",
        linewidth=1.0,
    )

    # Customize chart
    ax.set_ylabel("Accuracy", fontsize=16, fontweight="bold")
    ax.set_xlabel("Models", fontsize=16, fontweight="bold")

    if not no_titles:
        ax.set_title(title, fontsize=18, fontweight="bold", pad=20)

    ax.set_xticks(x)
    ax.set_xticklabels(display_names, rotation=0, ha="center", fontsize=14)
    ax.set_ylim(0, 1.0)

    # Format y-axis as percentages
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0%}"))

    # Add grid for better readability
    ax.grid(True, axis="y", alpha=0.3, linestyle="--", linewidth=0.5)
    ax.set_axisbelow(True)

    # Add horizontal legend below the plot
    legend = ax.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, -0.1),
        ncol=2,  # Two columns for input/output
        frameon=True,
        fancybox=True,
        shadow=True,
        fontsize=14,
        framealpha=0.95,
    )
    legend.get_frame().set_facecolor("white")

    # Clean up spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_linewidth(1.5)
    ax.spines["bottom"].set_linewidth(1.5)

    # Adjust layout to make room for legend below
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.15)

    return fig


def calculate_global_transfer_analysis(data: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate input-output answer transfer analysis across all tasks.

    A "transfer case" is an incorrect answer to an input-targeted question
    that matches the ground-truth answer for the corresponding output. Only
    the question types "blue_node_count", "colored_node_count" and
    "component_count" are analysed.

    Rows are processed on a best-effort basis: a row is silently skipped when
    no model answer can be extracted from ``details["response_metadata"]["answer"]``,
    when it has no ground truth path, when the derived output ground truth
    file does not exist, or when evaluating it raises any exception.

    Parameters:
    - data: Full evaluation dataset (reads the "question_type", "target",
      "correct", "model", "details" and "ground_truth_path" columns)

    Returns:
    - DataFrame with transfer rates by model and question type. With results
      the columns are model, question_type, transfer_count, total_incorrect,
      transfer_rate; when nothing could be analysed an empty DataFrame with
      the columns model, question_type, transfer_rate, total_incorrect,
      transfer_count is returned.
    """
    # Target question types for the analysis
    target_questions = [
        "blue_node_count",
        "colored_node_count",
        "component_count",
    ]
    empty_result_columns = [
        "model",
        "question_type",
        "transfer_rate",
        "total_incorrect",
        "transfer_count",
    ]

    def _extract_model_answer(row) -> str:
        """Return the stripped model answer from a row, or "" if unavailable."""
        details = row.get("details", {})
        if not isinstance(details, dict):
            return ""
        response_metadata = details.get("response_metadata", {})
        if not isinstance(response_metadata, dict):
            return ""
        raw_answer = response_metadata.get("answer", "")
        if raw_answer is None:
            return ""
        return str(raw_answer).strip()

    # Incorrect, input-targeted responses for the target question types.
    # "full_output" is not one of the target questions, so the isin() test
    # already excludes full output rows.
    is_candidate = (
        (data["target"] == "input")
        & data["correct"].eq(False)
        & data["question_type"].isin(target_questions)
    )
    transfer_candidates = data[is_candidate]

    if transfer_candidates.empty:
        return pd.DataFrame(columns=empty_result_columns)

    # Analyze transfer cases
    transfer_results = []

    for _, row in transfer_candidates.iterrows():
        # Deliberately best-effort: any row that cannot be analysed (bad
        # metadata, unreadable ground truth, comparison failure) is skipped.
        try:
            model_answer = _extract_model_answer(row)
            if not model_answer:
                continue

            ground_truth_input_path = row.get("ground_truth_path", "")
            if not ground_truth_input_path:
                continue

            # The output ground truth is looked up under the sibling
            # "/output/" directory of the input ground truth path
            ground_truth_output_path = ground_truth_input_path.replace(
                "/input/", "/output/"
            )
            if not os.path.exists(ground_truth_output_path):
                continue

            correct_output_answer = get_ground_truth_answer(
                ground_truth_output_path, row["question_type"], "output"
            )

            # Would the model's wrong answer be correct for the output?
            is_transfer_case = compare_answers(
                correct_output_answer, model_answer, row["question_type"]
            )

            transfer_results.append(
                {
                    "model": row["model"],
                    "question_type": row["question_type"],
                    "is_transfer_case": is_transfer_case,
                }
            )
        except Exception:
            continue

    if not transfer_results:
        return pd.DataFrame(columns=empty_result_columns)

    # Aggregate per (model, question_type): number of transfer cases,
    # number of analysed incorrect answers, and their ratio
    transfer_df = pd.DataFrame(transfer_results)
    transfer_stats = (
        transfer_df.groupby(["model", "question_type"])["is_transfer_case"]
        .agg(["sum", "count", "mean"])
        .reset_index()
    )
    transfer_stats.columns = [
        "model",
        "question_type",
        "transfer_count",
        "total_incorrect",
        "transfer_rate",
    ]

    return transfer_stats


def create_transfer_analysis_chart(
    data: pd.DataFrame,
    title: str = "Input-Output Answer Transfer Analysis",
    figsize: tuple = (16, 10),
    no_titles: bool = False,
) -> plt.Figure:
    """
    Create input-output answer transfer analysis chart showing transfer rates
    by model and question type across all tasks.

    Transfer statistics come from ``calculate_global_transfer_analysis``. When
    that returns no rows, a text-only "perfect distinction" figure is returned
    instead of a bar chart. Otherwise one group of bars is drawn per question
    type, with one bar per model; a (model, question) pair without statistics
    gets a zero-height bar.

    Note: this function updates the global ``plt.rcParams`` font sizes, which
    stay in effect after it returns.

    Parameters:
    - data: Full evaluation dataset
    - title: Chart title
    - figsize: Figure size (width, height)
    - no_titles: Whether to suppress titles

    Returns:
    - matplotlib Figure
    """
    # Calculate transfer analysis
    transfer_stats = calculate_global_transfer_analysis(data)

    if transfer_stats.empty:
        # Return chart showing perfect performance
        fig, ax = plt.subplots(figsize=figsize)
        ax.text(
            0.5,
            0.5,
            "🎯 Perfect Input-Output Distinction\n\nAll models correctly distinguished between input and output graphs.\nNo input-output confusion detected across any question types.",
            ha="center",
            va="center",
            transform=ax.transAxes,
            fontsize=16,
            bbox=dict(boxstyle="round,pad=1", facecolor="lightgreen", alpha=0.8),
        )
        if not no_titles:
            ax.set_title(title, fontsize=18, fontweight="bold", pad=20)
        ax.axis("off")
        return fig

    # Target question types in display order
    target_questions = [
        "blue_node_count",
        "colored_node_count",
        "component_count",
    ]

    # Keep only the target questions that have data (computed once)
    questions_with_data = set(transfer_stats["question_type"].unique())
    available_questions = [q for q in target_questions if q in questions_with_data]

    models = get_model_family_order(transfer_stats["model"].unique())
    display_names = apply_display_names_to_list(models)

    # (model, question_type) -> transfer rate
    rate_lookup = {
        (model, question): rate
        for model, question, rate in zip(
            transfer_stats["model"],
            transfer_stats["question_type"],
            transfer_stats["transfer_rate"],
        )
    }

    # Increase base font sizes for presentation. Done before the Axes is
    # created so that this figure already picks up the settings.
    plt.rcParams.update(
        {
            "font.size": 14,
            "axes.titlesize": 18,
            "axes.labelsize": 16,
            "xtick.labelsize": 14,
            "ytick.labelsize": 14,
            "legend.fontsize": 14,
        }
    )

    fig, ax = plt.subplots(figsize=figsize)

    # Set up bar positions: each question group spans 0.8 of an x unit
    x = np.arange(len(available_questions))
    width = 0.8 / len(models)

    # Get colors for models
    model_colors = get_color_palette(models, "models")

    # Create bars for each model
    for i, (model, display_name) in enumerate(zip(models, display_names)):
        # Align data with question order (0 where the model has no stats)
        transfer_rates = [
            rate_lookup.get((model, question), 0)
            for question in available_questions
        ]

        # Offset this model's bars so the group is centered on the tick
        x_pos = x + (i - (len(models) - 1) / 2) * width

        ax.bar(
            x_pos,
            transfer_rates,
            width,
            label=display_name,
            color=model_colors[model],
            alpha=0.8,
            edgecolor="black",
            linewidth=1.0,
        )

    # Customize chart
    ax.set_ylabel("Input-Output Answer Transfer Rate", fontsize=16, fontweight="bold")
    ax.set_xlabel("Question Type", fontsize=16, fontweight="bold")

    if not no_titles:
        ax.set_title(title, fontsize=18, fontweight="bold", pad=20)

    ax.set_xticks(x)
    ax.set_xticklabels(available_questions, rotation=0, ha="center", fontsize=14)
    # At least the full 0-100% range, with 10% headroom above the tallest bar
    ax.set_ylim(0, max(1.0, transfer_stats["transfer_rate"].max() * 1.1))

    # Format y-axis as percentages
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0%}"))

    # Add grid for better readability
    ax.grid(True, axis="y", alpha=0.3, linestyle="--", linewidth=0.5)
    ax.set_axisbelow(True)

    # Add horizontal legend below the plot
    legend = ax.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, -0.15),
        ncol=min(len(models), 5),
        frameon=True,
        fancybox=True,
        shadow=True,
        fontsize=14,
        framealpha=0.95,
    )
    legend.get_frame().set_facecolor("white")

    # Clean up spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_linewidth(1.5)
    ax.spines["bottom"].set_linewidth(1.5)

    # Adjust layout to make room for the horizontal legend below the plot
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.25)

    return fig


def create_task_performance_chart(
    full_output_data: pd.DataFrame,
    title: str = "Model Performance by Task (Full Output Tasks)",
    figsize: tuple = (16, 10),
    no_titles: bool = False,
) -> plt.Figure:
    """
    Create task performance chart showing model performance across specific tasks
    for full output tasks only.

    Only a fixed list of benchmark names is charted, in a fixed order; tasks
    from that list without any rows are left out. One group of bars is drawn
    per task, with one bar per model; a (task, model) pair without data gets
    a zero-height bar.

    Note: this function updates the global ``plt.rcParams`` font sizes, which
    stay in effect after it returns.

    Parameters:
    - full_output_data: DataFrame with full output task results
      (reads the "benchmark", "model" and "correct" columns)
    - title: Chart title
    - figsize: Figure size (width, height)
    - no_titles: Whether to suppress titles

    Returns:
    - matplotlib Figure
    """

    def _placeholder_figure(message: str) -> plt.Figure:
        """Return a figure that only shows a centered message."""
        fig, ax = plt.subplots(figsize=figsize)
        ax.text(
            0.5,
            0.5,
            message,
            ha="center",
            va="center",
            transform=ax.transAxes,
            fontsize=16,
        )
        return fig

    if full_output_data.empty or "benchmark" not in full_output_data.columns:
        return _placeholder_figure("No full output task data available")

    # Specific tasks in the requested order
    target_tasks = [
        "colorDegree3",
        "colorEquidistant",
        "colorDistanceAtLeast2",
        "bipartitionCompletion",
        "mergeAtBlue",
        "colorInternal",
        "BlueSubgraph",
        "addHub",
    ]

    # Keep only the target tasks that have data (benchmarks computed once)
    benchmarks_present = set(full_output_data["benchmark"].unique())
    available_tasks = [task for task in target_tasks if task in benchmarks_present]

    if not available_tasks:
        return _placeholder_figure("No data available for the specified tasks")

    # Calculate performance by task and model
    task_performance = (
        full_output_data[full_output_data["benchmark"].isin(available_tasks)]
        .groupby(["benchmark", "model"])["correct"]
        .agg(["mean", "count"])
        .reset_index()
    )
    task_performance.columns = ["benchmark", "model", "accuracy", "sample_count"]

    models = get_model_family_order(task_performance["model"].unique())
    display_names = apply_display_names_to_list(models)

    # (benchmark, model) -> accuracy, built once instead of filtering per bar
    accuracy_lookup = {
        (benchmark, model): accuracy
        for benchmark, model, accuracy in zip(
            task_performance["benchmark"],
            task_performance["model"],
            task_performance["accuracy"],
        )
    }

    # Increase base font sizes for presentation. Done before the Axes is
    # created so that this figure already picks up the settings.
    plt.rcParams.update(
        {
            "font.size": 14,
            "axes.titlesize": 18,
            "axes.labelsize": 16,
            "xtick.labelsize": 14,
            "ytick.labelsize": 14,
            "legend.fontsize": 14,
        }
    )

    fig, ax = plt.subplots(figsize=figsize)

    # Set up bar positions: each task group spans 0.8 of an x unit
    x = np.arange(len(available_tasks))
    width = 0.8 / len(models)

    # Get colors for models
    model_colors = get_color_palette(models, "models")

    # Create bars for each model (no annotations)
    for i, (model, display_name) in enumerate(zip(models, display_names)):
        # Align data with task order (0 where the model has no data)
        accuracies = [
            accuracy_lookup.get((task, model), 0) for task in available_tasks
        ]

        # Offset this model's bars so the group is centered on the tick
        x_pos = x + (i - (len(models) - 1) / 2) * width

        ax.bar(
            x_pos,
            accuracies,
            width,
            label=display_name,
            color=model_colors[model],
            alpha=0.8,
            edgecolor="black",
            linewidth=1.0,
        )

    # Customize chart
    ax.set_ylabel("Accuracy", fontsize=16, fontweight="bold")
    ax.set_xlabel("Task", fontsize=16, fontweight="bold")

    if not no_titles:
        ax.set_title(title, fontsize=18, fontweight="bold", pad=20)

    ax.set_xticks(x)
    ax.set_xticklabels(available_tasks, rotation=45, ha="right", fontsize=14)
    ax.set_ylim(0, 1.0)

    # Format y-axis as percentages
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0%}"))

    # Add grid for better readability
    ax.grid(True, axis="y", alpha=0.3, linestyle="--", linewidth=0.5)
    ax.set_axisbelow(True)

    # Legend inside the axes, top-left corner, three columns
    legend = ax.legend(
        loc="upper left",
        ncol=3,
        frameon=True,
        fancybox=True,
        shadow=True,
        fontsize=14,
        framealpha=0.95,
    )
    legend.get_frame().set_facecolor("white")

    # Clean up spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_linewidth(1.5)
    ax.spines["bottom"].set_linewidth(1.5)

    # Adjust layout; the extra bottom margin leaves room for the rotated
    # task labels
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.25)

    return fig


def create_model_performance_by_pattern_chart(
    full_output_data: pd.DataFrame,
    title: str = "Model Performance by Size Pattern",
    figsize: tuple = (16, 10),
    no_titles: bool = False,
) -> plt.Figure:
    """
    Draw a grouped bar chart of per-model accuracy for every size pattern.

    Accuracy is the mean of the "correct" column for each
    (size_pattern, model) pair. A pair with no rows is drawn as a zero-height
    bar. Bars carry no value annotations.

    Parameters:
    - full_output_data: DataFrame with full output task results; must provide
      "size_pattern", "model" and "correct" columns to produce a real chart
    - title: Chart title
    - figsize: Figure size (width, height)
    - no_titles: Whether to suppress titles

    Returns:
    - matplotlib Figure; when the data is empty or has no "size_pattern"
      column the figure only contains an explanatory message
    """
    has_pattern_data = (
        not full_output_data.empty and "size_pattern" in full_output_data.columns
    )
    if not has_pattern_data:
        # Nothing to plot: hand back a placeholder figure with a message.
        placeholder_fig, placeholder_ax = plt.subplots(figsize=figsize)
        placeholder_ax.text(
            0.5,
            0.5,
            "No full output data with size patterns available",
            ha="center",
            va="center",
            transform=placeholder_ax.transAxes,
            fontsize=16,
        )
        return placeholder_fig

    # One row per (size_pattern, model) with mean / count / std of "correct".
    grouped = full_output_data.groupby(["size_pattern", "model"])["correct"]
    stats_table = grouped.agg(["mean", "count", "std"]).reset_index()

    # Axis categories and series, in the project's canonical ordering.
    patterns = get_size_pattern_order(stats_table["size_pattern"].unique())
    models = get_model_family_order(stats_table["model"].unique())
    display_names = apply_display_names_to_list(models)

    fig, ax = plt.subplots(figsize=figsize)

    # NOTE: plt.rcParams is process-wide, so these presentation font sizes
    # also apply to figures created after this call.
    presentation_fonts = {
        "font.size": 14,
        "axes.titlesize": 18,
        "axes.labelsize": 16,
        "xtick.labelsize": 14,
        "ytick.labelsize": 14,
        "legend.fontsize": 14,
    }
    plt.rcParams.update(presentation_fonts)

    # Each pattern gets a group 0.8 wide, shared equally between the models.
    x_positions = np.arange(len(patterns))
    bar_width = 0.8 / len(models)
    group_center = (len(models) - 1) / 2

    model_colors = get_color_palette(models, "models")

    def mean_accuracy(rows, pattern):
        # First matching mean, or 0 when this model has no data for the pattern.
        matches = rows.loc[rows["size_pattern"] == pattern, "mean"]
        return 0 if matches.empty else matches.iloc[0]

    for slot, (model, display_name) in enumerate(zip(models, display_names)):
        model_rows = stats_table[stats_table["model"] == model]
        heights = [mean_accuracy(model_rows, pattern) for pattern in patterns]

        # Shift this model's bars so the whole group is centred on the tick.
        ax.bar(
            x_positions + (slot - group_center) * bar_width,
            heights,
            bar_width,
            label=display_name,
            color=model_colors[model],
            alpha=0.8,
            edgecolor="black",
            linewidth=1.0,
        )

    # Axis labelling
    ax.set_ylabel("Accuracy", fontsize=16, fontweight="bold")
    ax.set_xlabel("Size Pattern", fontsize=16, fontweight="bold")

    if not no_titles:
        ax.set_title(title, fontsize=18, fontweight="bold", pad=20)

    ax.set_xticks(x_positions)
    ax.set_xticklabels(patterns, rotation=0, ha="center", fontsize=14)
    ax.set_ylim(0, 1.0)

    def as_percent(value, _position):
        return f"{value:.0%}"

    # Accuracies are fractions in [0, 1]; show them as whole percentages.
    ax.yaxis.set_major_formatter(plt.FuncFormatter(as_percent))

    # Light horizontal grid drawn behind the bars
    ax.grid(True, axis="y", alpha=0.3, linestyle="--", linewidth=0.5)
    ax.set_axisbelow(True)

    # Horizontal legend anchored underneath the axes, at most 5 columns wide
    legend = ax.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, -0.1),
        ncol=min(len(models), 5),
        frameon=True,
        fancybox=True,
        shadow=True,
        fontsize=14,
        framealpha=0.95,
    )
    legend.get_frame().set_facecolor("white")

    # Drop the top/right frame lines and thicken the remaining axes
    for hidden_side in ("top", "right"):
        ax.spines[hidden_side].set_visible(False)
    for kept_side in ("left", "bottom"):
        ax.spines[kept_side].set_linewidth(1.5)

    # Reserve space at the bottom for the legend placed below the plot
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)

    return fig


def create_task_question_impact_matrix():
    """
    Build the lookup table describing how each task affects each question.

    Every known task maps each question type to a tag saying whether the
    correct answer differs between the input graph and the output graph.

    Returns dictionary: task -> question -> impact_type
    - "change": Answer definitely changes between input and output
    - "no_change": Answer definitely stays the same
    - "maybe_change": Answer might change depending on graph structure
    """
    changes, stays, depends = "change", "no_change", "maybe_change"

    # Question types, in the order they appear in every task entry.
    questions = (
        "node_count",
        "edge_count",
        "blue_node_count",
        "colored_node_count",
        "is_connected",
        "is_tree",
        "has_cycles",
        "max_degree",
        "min_degree",
        "component_count",
    )

    def recolor_only():
        # Profile of a task that only recolors nodes: the structure is intact,
        # only the colour counts move. A fresh dict is built on every call so
        # no two tasks share one mutable entry.
        return {
            "node_count": stays,
            "edge_count": stays,
            "blue_node_count": changes,
            "colored_node_count": changes,
            "is_connected": stays,
            "is_tree": stays,
            "has_cycles": stays,
            "max_degree": stays,
            "min_degree": stays,
            "component_count": stays,
        }

    # Color-only tasks (don't change graph structure, only node colors)
    recolor_tasks = (
        "colorDegree1",
        "colorDegree2",
        "colorDegree3",
        "colorMaxDegree",
        "colorMinDegree",
        "colorInternal",
        "colorLeaves",
        "colorNeighbors",
        "colorPath",
        "colorComponents",
        "colorDistanceAtLeast2",
        "colorEquidistant",
    )
    matrix = {task: recolor_only() for task in recolor_tasks}

    # Node-adding task: one new hub node attached to the existing nodes
    matrix["addHub"] = {
        "node_count": changes,  # +1 node
        "edge_count": changes,  # +N edges
        "blue_node_count": changes,  # Hub typically colored blue
        "colored_node_count": changes,  # Hub adds to colored count
        "is_connected": depends,  # Disconnected graph becomes connected
        "is_tree": depends,  # Tree becomes non-tree
        "has_cycles": depends,  # Might create cycles
        "max_degree": changes,  # Hub has degree N
        "min_degree": depends,  # Min degree might change
        "component_count": depends,  # Multiple components become 1
    }

    # Node-removing tasks: every answer depends on whether matching nodes
    # exist and which ones they are, so all questions are "maybe_change".
    for task in ("removeDegree1", "removeDegree2", "removeDegree3"):
        matrix[task] = dict.fromkeys(questions, depends)

    # Subgraph task: only blue nodes and the edges between them remain
    matrix["blueSubgraph"] = {
        "node_count": changes,  # Only blue nodes remain
        "edge_count": changes,  # Only edges between blue nodes
        "blue_node_count": depends,  # All remaining nodes might be blue
        "colored_node_count": changes,  # Subset of original colored nodes
        "is_connected": depends,  # Subgraph might be disconnected
        "is_tree": depends,  # Tree property might change
        "has_cycles": depends,  # Cycle property might change
        "max_degree": changes,  # Degree distribution changes
        "min_degree": changes,  # Degree distribution changes
        "component_count": depends,  # Component count might change
    }

    # Edge-modifying task: same nodes, complemented edge set
    matrix["complementGraph"] = {
        "node_count": stays,  # Same nodes
        "edge_count": changes,  # Completely different edge set
        "blue_node_count": stays,  # Node colors unchanged
        "colored_node_count": stays,  # Node colors unchanged
        "is_connected": depends,  # Connectivity inverted
        "is_tree": depends,  # Tree property likely changes
        "has_cycles": depends,  # Cycle property likely changes
        "max_degree": changes,  # Degree distribution inverted
        "min_degree": changes,  # Degree distribution inverted
        "component_count": depends,  # Component structure inverted
    }

    # Completion task: more nodes get coloured, structure is untouched, which
    # is exactly the recolor-only profile.
    matrix["bipartitionCompletion"] = recolor_only()

    return matrix


def filter_data_by_impact(
    data: pd.DataFrame,
    impact_filter=frozenset({"change", "maybe_change"}),
    impact_matrix=None,
) -> pd.DataFrame:
    """
    Filter evaluation data to only include task-question combinations where
    the answer can actually change between input and output.

    A row is kept when its "benchmark" / "question_type" pair is present in
    the impact matrix and the impact recorded for that pair is one of
    `impact_filter`. Rows whose question type is "full_output", and rows whose
    task-question pair is unknown to the matrix, are always dropped.

    Parameters:
    - data: DataFrame of evaluation rows; the "benchmark" and "question_type"
      columns are read, and a missing column is treated as all-empty strings
      (so no row matches)
    - impact_filter: Container of impact types to keep; defaults to
      {"change", "maybe_change"}
    - impact_matrix: Optional task -> question -> impact_type mapping; when
      omitted the matrix from create_task_question_impact_matrix() is used

    Returns:
    - DataFrame with the kept rows, in their original order and with all of
      the original columns (also when `data` has no rows)
    """
    if impact_matrix is None:
        impact_matrix = create_task_question_impact_matrix()

    row_count = len(data)

    def column_values(name):
        # A missing column behaves like a column of empty strings.
        if name in data.columns:
            return data[name].tolist()
        return [""] * row_count

    def keep(task, question):
        # Full-output rows are never part of an input/output comparison.
        if question == "full_output":
            return False
        task_impacts = impact_matrix.get(task)
        # Unknown task-question combos are excluded conservatively.
        if task_impacts is None or question not in task_impacts:
            return False
        return task_impacts[question] in impact_filter

    # The mask is an explicit boolean array rather than a plain list: an empty
    # list would be read by pandas as an (empty) column selection and strip
    # every column from an empty frame.
    mask = np.array(
        [
            keep(task, question)
            for task, question in zip(
                column_values("benchmark"), column_values("question_type")
            )
        ],
        dtype=bool,
    )

    return data[mask]


def create_filtered_input_output_comparison_chart(
    question_data: pd.DataFrame,
    title: str = "Input vs Output Performance (Meaningful Comparisons Only)",
    figsize: tuple = (16, 10),
    no_titles: bool = False,
) -> plt.Figure:
    """
    Create an input vs output comparison chart for question-based tasks,
    but only including task-question combinations where the answer actually
    can change between input and output targets.

    Rows are first reduced with filter_data_by_impact to the "change" and
    "maybe_change" combinations. For every model, two annotated bars are then
    drawn: mean of "correct" over rows with target "input" and over rows with
    target "output". A model with no rows for a target gets a zero-height,
    unlabelled bar.

    Parameters:
    - question_data: DataFrame with question-based task results; must provide
      "target", "model" and "correct" columns to produce a real chart
    - title: Chart title; the title actually drawn also reports how many rows
      survived the filtering
    - figsize: Figure size (width, height)
    - no_titles: Whether to suppress titles

    Returns:
    - matplotlib Figure; when there is no usable data, or nothing survives the
      filtering, the figure only contains an explanatory message
    """
    if question_data.empty or "target" not in question_data.columns:
        # Return empty figure if no data
        fig, ax = plt.subplots(figsize=figsize)
        ax.text(
            0.5,
            0.5,
            "No question-based data with targets available",
            ha="center",
            va="center",
            transform=ax.transAxes,
            fontsize=16,
        )
        return fig

    original_count = len(question_data)

    # Keep both definite ("change") and possible ("maybe_change") cases.
    filtered_data = filter_data_by_impact(question_data, {"change", "maybe_change"})

    if filtered_data.empty:
        fig, ax = plt.subplots(figsize=figsize)
        ax.text(
            0.5,
            0.5,
            "No meaningful input-output comparisons found\n\n"
            f"Original dataset had {original_count:,} question-based responses,\n"
            "but none involve task-question combinations where\n"
            "the answer would differ between input and output graphs.",
            ha="center",
            va="center",
            transform=ax.transAxes,
            fontsize=14,
            bbox=dict(boxstyle="round,pad=1", facecolor="lightyellow", alpha=0.8),
        )
        if not no_titles:
            ax.set_title(title, fontsize=18, fontweight="bold", pad=20)
        return fig

    fig, ax = plt.subplots(figsize=figsize)

    # Accuracy and sample size for every (target, model) pair
    target_performance = (
        filtered_data.groupby(["target", "model"])["correct"]
        .agg(["mean", "count"])
        .reset_index()
    )
    target_performance.columns = ["target", "model", "accuracy", "sample_count"]

    models = get_model_family_order(target_performance["model"].unique())
    display_names = apply_display_names_to_list(models)

    # One tick per model, with the input/output bars on either side of it
    x = np.arange(len(models))
    width = 0.35

    model_colors = get_color_palette(models, "models")
    bar_colors = [model_colors[model] for model in models]

    def stats_for(target):
        # Per-model (accuracies, counts) for one target; 0 where data is missing.
        target_rows = target_performance[target_performance["target"] == target]
        accuracies, counts = [], []
        for model in models:
            match = target_rows[target_rows["model"] == model]
            if match.empty:
                accuracies.append(0)
                counts.append(0)
            else:
                accuracies.append(match["accuracy"].iloc[0])
                counts.append(match["sample_count"].iloc[0])
        return accuracies, counts

    # (target, legend label, x offset from the tick, alpha, hatch pattern)
    series_specs = (
        ("input", "Input Targets", -width / 2, 0.8, "|||"),
        ("output", "Output Targets", width / 2, 0.6, "***"),
    )

    series = []
    for target, label, offset, alpha, hatch in series_specs:
        accuracies, counts = stats_for(target)
        ax.bar(
            x + offset,
            accuracies,
            width,
            label=label,
            color=bar_colors,
            alpha=alpha,
            hatch=hatch,
            edgecolor="black",
            linewidth=1.0,
        )
        series.append((offset, accuracies, counts))

    # Add value labels above every non-zero bar
    for i in range(len(models)):
        for offset, accuracies, counts in series:
            accuracy = accuracies[i]
            if accuracy > 0:
                ax.text(
                    x[i] + offset,
                    accuracy + 0.02,
                    f"{accuracy:.3f}\n(n={counts[i]})",
                    ha="center",
                    va="bottom",
                    fontsize=10,
                    fontweight="bold",
                )

    # Customize chart
    ax.set_ylabel("Accuracy", fontsize=16, fontweight="bold")
    ax.set_xlabel("Models", fontsize=16, fontweight="bold")

    if not no_titles:
        # The definite-change count is only needed for the title, so it is
        # computed here, and on the already-filtered rows: every "change" row
        # is contained in them.
        strong_count = len(filter_data_by_impact(filtered_data, {"change"}))
        all_count = len(filtered_data)
        title_with_info = (
            f"{title}\n"
            f"Filtered from {original_count:,} to {all_count:,} meaningful comparisons "
            f"({strong_count:,} definite changes, {all_count-strong_count:,} possible changes)"
        )
        ax.set_title(title_with_info, fontsize=16, fontweight="bold", pad=25)

    ax.set_xticks(x)
    ax.set_xticklabels(display_names, rotation=0, ha="center", fontsize=14)
    ax.set_ylim(0, 1.0)

    # Format y-axis as percentages
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0%}"))

    # Add grid and legend
    ax.grid(True, axis="y", alpha=0.3, linestyle="--", linewidth=0.5)
    ax.set_axisbelow(True)

    legend = ax.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, -0.15),
        ncol=2,
        frameon=True,
        fancybox=True,
        shadow=True,
        fontsize=14,
        framealpha=0.95,
    )
    legend.get_frame().set_facecolor("white")

    # Add explanatory text box in the lower-left corner of the axes
    explanation = (
        "Only includes task-question combinations where input and output graphs\n"
        "would have different correct answers (e.g., node_count for addHub task,\n"
        "blue_node_count for coloring tasks, is_connected for structural tasks)."
    )

    ax.text(
        0.02,
        0.02,
        explanation,
        transform=ax.transAxes,
        fontsize=11,
        verticalalignment="bottom",
        bbox=dict(boxstyle="round,pad=0.4", facecolor="lightblue", alpha=0.7),
    )

    # Clean up spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_linewidth(1.5)
    ax.spines["bottom"].set_linewidth(1.5)

    # Leave room for the two-line title above and the legend below
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2, top=0.85)

    return fig


def generate_presentation_model_overview(
    data: pd.DataFrame, output_dir: str, verbose: bool = False, no_titles: bool = False
) -> Dict[str, Any]:
    """
    Generate presentation-optimized model overview visualizations.

    The data is split into full-output and question-based rows, up to seven
    numbered charts are created (each saved through save_presentation_chart),
    and per-model accuracy statistics are collected. A chart is skipped when
    the rows or columns it needs are absent.

    Parameters:
    - data: Full evaluation dataset
    - output_dir: Directory to save visualizations
    - verbose: Whether to print progress info
    - no_titles: Whether to suppress titles

    Returns:
    - Dict with generation summary: "generated_files" lists the paths returned
      by save_presentation_chart, "stats" holds the mean/count aggregates of
      "correct" that could be computed
    """
    results = {"generated_files": [], "stats": {}}

    def log(message):
        # Progress output is opt-in.
        if verbose:
            print(message)

    def save(fig, stem, done_message):
        # Save one chart under output_dir and record the files it produced.
        file_paths = save_presentation_chart(fig, f"{output_dir}/{stem}", no_titles)
        results["generated_files"].extend(file_paths)
        log(done_message)

    # Split data by question type
    full_output_data, question_data = split_by_question_type(data)

    # Options shared by every chart builder
    chart_options = {"figsize": (16, 10), "no_titles": no_titles}

    # Preconditions shared by several charts
    has_full_output = not full_output_data.empty
    has_questions = not question_data.empty
    has_patterns = has_full_output and "size_pattern" in full_output_data.columns
    has_targets = has_questions and "target" in question_data.columns
    # Pattern charts need more than one size pattern to compare.
    multiple_patterns = (
        has_patterns and len(full_output_data["size_pattern"].unique()) > 1
    )
    # Input/output charts need more than one target type to compare.
    multiple_targets = has_targets and len(question_data["target"].unique()) > 1

    # 1. Combined model performance chart
    log("   📊 Creating combined model performance chart for presentation...")
    if has_full_output or has_questions:
        fig = create_combined_model_performance_chart(
            full_output_data,
            question_data,
            title="Model Performance: Full Output vs Question-Based Tasks",
            **chart_options,
        )
        save(
            fig,
            "01_combined_model_performance",
            "   ✅ Generated combined model performance chart (PNG + PDF)",
        )

    # 2. Scaling performance chart (full output tasks only)
    log("   📈 Creating scaling performance chart for presentation...")
    if multiple_patterns:
        fig = create_scaling_performance_chart(
            full_output_data,
            title="Model Scaling Performance by Pattern Size (Full Output Tasks)",
            **chart_options,
        )
        save(
            fig,
            "02_scaling_performance_by_pattern",
            "   ✅ Generated scaling performance chart (PNG + PDF)",
        )
    elif has_patterns:
        log("   ⚠️ Only one size pattern found, skipping scaling chart")

    # 3. Input vs Output comparison chart (question-based tasks only)
    log("   🎯 Creating input vs output comparison chart for presentation...")
    if multiple_targets:
        fig = create_input_output_comparison_chart(
            question_data,
            title="Model Performance: Input vs Output Targets",
            **chart_options,
        )
        save(
            fig,
            "03_input_output_comparison",
            "   ✅ Generated input vs output comparison chart (PNG + PDF)",
        )
    elif has_targets:
        log("   ⚠️ Only one target type found, skipping input vs output chart")

    # 4. Input-Output Answer Transfer Analysis (question-based tasks)
    log("   🔄 Creating input-output answer transfer analysis for presentation...")
    if has_questions:
        fig = create_transfer_analysis_chart(
            data,  # Use full data (function will filter internally)
            title="Input-Output Answer Transfer Analysis Across All Tasks",
            **chart_options,
        )
        save(
            fig,
            "04_transfer_analysis",
            "   ✅ Generated transfer analysis chart (PNG + PDF)",
        )

    # 5. Task Performance Chart (full output tasks only)
    log("   📊 Creating task performance chart for presentation...")
    if has_full_output and "benchmark" in full_output_data.columns:
        fig = create_task_performance_chart(
            full_output_data,
            title="Model Performance by Task (Full Output Tasks)",
            **chart_options,
        )
        save(
            fig,
            "05_task_performance",
            "   ✅ Generated task performance chart (PNG + PDF)",
        )

    # 6. Model Performance by Pattern Chart (full output tasks only)
    log("   📊 Creating model performance by pattern chart for presentation...")
    if multiple_patterns:
        fig = create_model_performance_by_pattern_chart(
            full_output_data,
            title="Model Performance by Size Pattern (Full Output Tasks)",
            **chart_options,
        )
        save(
            fig,
            "06_model_performance_by_pattern",
            "   ✅ Generated model performance by pattern chart (PNG + PDF)",
        )
    elif has_patterns:
        log("   ⚠️ Only one size pattern found, skipping pattern performance chart")

    # 7. Filtered Input vs Output Comparison Chart (meaningful comparisons only)
    log("   🎯 Creating filtered input vs output comparison chart for presentation...")
    if multiple_targets:
        fig = create_filtered_input_output_comparison_chart(
            question_data,
            title="Input vs Output Performance (Meaningful Comparisons Only)",
            **chart_options,
        )
        save(
            fig,
            "07_filtered_input_output_comparison",
            "   ✅ Generated filtered input vs output comparison chart (PNG + PDF)",
        )
    elif has_targets:
        log(
            "   ⚠️ Only one target type found, skipping filtered input vs output chart"
        )

    # Store statistics for full output tasks
    stats = results["stats"]
    if has_full_output:
        stats["full_output_by_model"] = (
            full_output_data.groupby("model")["correct"]
            .agg(["mean", "count"])
            .to_dict()
        )

        # Add pattern-specific stats if available
        if has_patterns:
            stats["full_output_by_pattern_model"] = (
                full_output_data.groupby(["size_pattern", "model"])["correct"]
                .agg(["mean", "count"])
                .to_dict()
            )

    # Store statistics for question-based tasks
    if has_questions:
        stats["question_based_by_model"] = (
            question_data.groupby("model")["correct"].agg(["mean", "count"]).to_dict()
        )

        # Add target-specific stats if available
        if has_targets:
            stats["question_based_by_target_model"] = (
                question_data.groupby(["target", "model"])["correct"]
                .agg(["mean", "count"])
                .to_dict()
            )

    return results


def generate_presentation_visualizations(
    data: pd.DataFrame, output_dir: str, verbose: bool = False, no_titles: bool = False
) -> Dict[str, Any]:
    """
    Produce every presentation visualization together with its README.

    All output goes into a "presentation" subdirectory of `output_dir`, which
    is created when it does not exist yet.

    Parameters:
    - data: Full evaluation dataset
    - output_dir: Directory to save visualizations
    - verbose: Whether to print progress info
    - no_titles: Whether to suppress titles

    Returns:
    - Dict with generation summary: "presentation_dir" (the subdirectory
      used), "model_overview" (the result of
      generate_presentation_model_overview) and "total_files" (number of
      generated files it reported)
    """

    def announce(message):
        if verbose:
            print(message)

    announce("🎯 Generating Presentation Visualizations...")

    # Everything is written below <output_dir>/presentation
    target_dir = os.path.join(output_dir, "presentation")
    os.makedirs(target_dir, exist_ok=True)

    # Combined model overview charts
    announce("  📊 Model Performance Overview...")
    overview = generate_presentation_model_overview(
        data, target_dir, verbose, no_titles
    )
    file_count = len(overview["generated_files"])

    summary = {
        "presentation_dir": target_dir,
        "model_overview": overview,
        "total_files": file_count,
    }

    # README describing the generated charts
    create_presentation_readme(summary, target_dir, verbose)

    announce(
        f"  ✅ Generated {file_count} presentation visualizations in {target_dir}"
    )

    return summary


def create_presentation_readme(results: Dict, output_dir: str, verbose: bool = False):
    """
    Write a static README.md describing the presentation visualizations.

    Parameters:
    - results: Generation summary; accepted for interface compatibility but
      not currently used (the README text is static)
    - output_dir: Directory in which README.md is written; created if missing
    - verbose: Whether to print progress info

    Raises:
    - OSError: If the directory cannot be created or the file cannot be written
    """
    if verbose:
        print("   📋 Creating presentation README...")

    # Plain (non-f) string: the text has no placeholders, so any future
    # literal braces in the README cannot be misread as format fields.
    readme_content = """# Presentation Visualizations

This directory contains visualizations optimized for presentations and talks.

## Design Principles

- **Larger fonts**: All text elements sized for projection/presentation (14-18pt)
- **Combined views**: Related charts merged to reduce slide count
- **Clear patterns**: Different bar patterns (solid vs. hatched) for distinction
- **Minimal clutter**: No floating values or excessive annotations
- **High contrast**: Bold colors and clear visual hierarchy
- **Dual format**: Both high-quality PNG (300 DPI) and PDF for maximum compatibility

## Generated Files

### Model Performance
- `01_combined_model_performance.png/pdf` - Combined view of full output vs. question-based performance
- `02_scaling_performance_by_pattern.png/pdf` - Model scaling performance across different pattern sizes
- `03_input_output_comparison.png/pdf` - Model performance on input-targeted vs output-targeted questions
- `04_transfer_analysis.png/pdf` - Input-output answer transfer analysis across all tasks
- `05_task_performance.png/pdf` - Model performance by specific challenging tasks  
- `06_model_performance_by_pattern.png/pdf` - Model performance across different size patterns
- `07_filtered_input_output_comparison.png/pdf` - **NEW**: Input vs output comparison using only meaningful task-question combinations

## Chart Features

### Combined Model Performance Chart
- **Solid bars**: Full Output Task performance
- **Hatched bars**: Question-Based Task performance  
- **Legend**: Top-left corner for clean layout
- **Large fonts**: 14-18pt for readability
- **No annotations**: Clean bars without floating values
- **Horizontal labels**: Model names displayed horizontally

### Scaling Performance Chart
- **One bar per model**: Shows full output task performance only
- **X-axis**: Size patterns (e.g., scale_up_3, mixed_3, etc.)
- **Y-axis**: Accuracy percentage
- **Legend**: Horizontal layout below the chart for space efficiency
- **Color coded**: Each model has a consistent color across all charts
- **Pattern analysis**: Shows how models perform as task complexity scales

### Input vs Output Comparison Chart
- **Vertical striped bars**: Input-targeted question performance
- **Dotted bars**: Output-targeted question performance
- **Model colors**: Each model maintains consistent coloring
- **X-axis**: Models
- **Y-axis**: Accuracy percentage
- **Legend**: Horizontal layout below chart (Input/Output patterns)
- **Clean design**: No floating annotations or sample counts

### Input-Output Answer Transfer Analysis Chart
- **X-axis**: Question types (blue_node_count, colored_node_count, component_count, min_degree, max_degree, edge_count, node_count)
- **Y-axis**: Transfer rate (percentage of incorrect input answers that would be correct for output)
- **One bar per model**: Each question type shows all models
- **Model colors**: Consistent coloring across all charts
- **Sample annotations**: Shows sample count (n=X) on each bar
- **Legend**: Horizontal layout below chart for space efficiency
- **Analysis scope**: Aggregated across all tasks

### Filtered Input vs Output Comparison Chart (NEW)
- **Intelligent filtering**: Only includes task-question combinations where the answer actually differs between input and output
- **Meaningful comparisons**: Excludes cases like "node_count on colorLeaves task" where input and output always have the same answer
- **Impact categories**: 
  - "change": Answer definitely changes (e.g., blue_node_count for coloring tasks)
  - "maybe_change": Answer might change depending on graph structure (e.g., is_connected for removal tasks)
  - "no_change": Answer never changes (excluded from this chart)
- **Statistical info**: Shows how many comparisons were filtered and why
- **Clean design**: Same bar patterns as regular input/output chart but with more meaningful data

## File Formats

Each visualization is saved in two formats:
- **PNG**: High-quality (300 DPI) for presentations, slides, and print
- **PDF**: Vector format for scalable, crisp display at any size

## Usage Tips

- Use PNG files for PowerPoint presentations and image embedding
- Use PDF files for LaTeX documents and scalable displays
- Font sizes optimized for standard presentation screens
- Colors chosen for good contrast on both light and dark backgrounds
- **Consistent model colors**: Each model has the same color across all charts
- **Clear pattern language**: Solid bars vs. hatched bars for different comparisons
- Use these charts for talks, presentations, and high-level summaries

## Future Expansions

This presentation generator can be extended to include:
- Task difficulty rankings
- Key insight highlights  
- Executive summary charts
- Comparison matrices
- Performance trend analysis
- Error pattern analysis
- Model reasoning capability assessments

---
*Generated by Graph-Based ARC Presentation Visualization System*
"""

    # Make the function usable on its own: callers should not have to create
    # the directory first. An empty output_dir means "current directory", and
    # os.makedirs("") would raise, so only create when a path is given.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # os.path.join keeps an empty output_dir relative to the working directory
    # instead of resolving to the filesystem root.
    readme_path = os.path.join(output_dir, "README.md")
    with open(readme_path, "w", encoding="utf-8") as f:
        f.write(readme_content)

    if verbose:
        print(f"   📋 Presentation README saved to {readme_path}")
