import csv
import json
from pathlib import Path
import os
import re
from collections import defaultdict
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import squarify
import numpy as np
from matplotlib.ticker import LogLocator, ScalarFormatter, NullFormatter
from matplotlib.patches import Patch
from matplotlib.ticker import FuncFormatter

# --- Configuration ---
# Directory that holds both the input tables and the generated figures.
OUTPUT_DIR = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/output')

# Input tables, one constant per CSV read by main().
TABLE_1A_CSV = OUTPUT_DIR.joinpath('table_1a_interaction_breakdown.csv')
TABLE_2_CSV = OUTPUT_DIR.joinpath('table_2_dynamics_and_patterns.csv')
TABLE_2A_CSV = OUTPUT_DIR.joinpath('table_2a_function_distribution.csv')
TABLE_3_CSV = OUTPUT_DIR.joinpath('table_3_evolution_data.csv')
TABLE_4_CSV = OUTPUT_DIR.joinpath('table_4_static_token_counts.csv')
TABLE_5_CSV = OUTPUT_DIR.joinpath('table_5_growth_analysis.csv')

# --- Manually Defined Hardening Events ---
# Maps a case-study number to the annotation drawn on the evolution figure.
HARDENING_EVENTS = {
    22: "M22: Framework Finalization",
    41: "M41: Closed-Loop Guardian",
    53: "M53: AI-Native Refactoring",
    65: "M65: State-Mgmt Protocol",
    73: "M73: Protocol Hardening from M72",
}

# --- Plotting Functions ---
def plot_evolution_and_efficacy(df):
    """Plot per-session tool success rate against the hardening events.

    Draws the ``success_rate_ma`` line and the raw ``success_rate`` scatter
    against ``df.index``, marks each entry of ``HARDENING_EVENTS`` with a
    labelled vertical line, and saves the figure as
    ``figure_2_evolution_and_efficacy.png`` inside ``OUTPUT_DIR``.

    Args:
        df: DataFrame with ``case_num``, ``success_rate`` and
            ``success_rate_ma`` columns. X tick positions are row positions,
            so they line up with the plotted data when ``df`` carries the
            default 0..n-1 index (as produced by ``pd.read_csv`` in ``main``).
    """
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(16, 8))
    ax.plot(df.index, df['success_rate_ma'], label='Tool Success Rate (5-point Moving Avg)', color='royalblue', linewidth=2)
    ax.scatter(df.index, df['success_rate'], color='lightsteelblue', alpha=0.5, label='Individual Session Success Rate')

    for case_num, label in HARDENING_EVENTS.items():
        # An event is anchored at the first row whose case number has reached
        # the event's case study; events beyond the data are skipped.
        event_indices = df.index[df['case_num'] >= case_num]
        if event_indices.empty:
            continue
        event_idx = event_indices[0]
        ax.axvline(x=event_idx, color='crimson', linestyle='--', alpha=0.7, linewidth=1)
        ax.text(event_idx + 0.5, 20, label, rotation=90, verticalalignment='bottom', color='crimson', fontsize=9)

    ax.set_title('CHAC Framework Evolution vs. AI Operational Reliability', fontsize=18, fontweight='bold')
    ax.set_xlabel('Timeline (Individual Sessions, Ordered by Case Study Number)', fontsize=12)
    ax.set_ylabel('Tool Success Rate (%)', fontsize=12)
    ax.set_ylim(0, 105)
    ax.legend(loc='lower right')

    # Roughly ten evenly spaced ticks, each labelled with its case number.
    tick_indices = list(range(0, len(df), max(1, len(df) // 10)))
    # Read the column first and then the position: selecting a whole row with
    # df.iloc[i] upcasts an integer case number to float when every column is
    # numeric, which rendered the labels as "M22.0" instead of "M22".
    case_nums = df['case_num']
    ax.set_xticks(tick_indices)
    ax.set_xticklabels([f"M{case_nums.iloc[i]}" for i in tick_indices], rotation=45, ha='right')

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'figure_2_evolution_and_efficacy.png', dpi=300)
    plt.close()
    print("[✔] Figure 2 (Evolution & Efficacy) generated.")

def plot_interaction_breakdown(df):
    """Render a horizontal bar chart of the first 20 rows of the interaction table.

    Bars show ``User_Intent_Sequences`` per ``Case_Study_ID``; the y axis is
    inverted so the first row appears at the top. The chart is written to
    ``figure_2_interaction_breakdown.png`` inside ``OUTPUT_DIR``.

    Args:
        df: DataFrame with ``Case_Study_ID`` and ``User_Intent_Sequences``
            columns. Rows are taken in their existing order (no sorting is
            done here).
    """
    leading_rows = df.head(20)
    case_ids = leading_rows['Case_Study_ID']
    sequence_counts = leading_rows['User_Intent_Sequences']

    plt.style.use('seaborn-v0_8-whitegrid')
    figure, axes = plt.subplots(figsize=(12, 8))
    axes.barh(case_ids, sequence_counts, color='skyblue')
    axes.invert_yaxis()

    axes.set_title('Top 20 Case Studies by Interaction Volume', fontsize=16, fontweight='bold')
    axes.set_xlabel('Number of User Intent Sequences', fontsize=12)
    axes.set_ylabel('Case Study ID', fontsize=12)

    figure.tight_layout()
    target = OUTPUT_DIR / 'figure_2_interaction_breakdown.png'
    figure.savefig(target, dpi=300)
    plt.close(figure)
    print("[✔] Figure 2 (Interaction Breakdown) generated.")

def plot_dynamics_scatter(df):
    """Scatter tokens-per-sequence against tool success rate.

    Saves the chart as ``figure_3_dynamics_scatter.png`` inside
    ``OUTPUT_DIR``.

    Args:
        df: DataFrame with ``Tokens_per_Sequence`` and ``Tool_Success_Rate``
            columns. ``Tool_Success_Rate`` may hold percent strings such as
            ``"87.5%"`` or already-numeric values. The frame is not modified.

    Raises:
        ValueError: If a success-rate string is not a number once the ``%``
            sign is removed.
    """
    # Convert into a local Series instead of writing back to df: the previous
    # in-place overwrite changed the caller's data and made a second call fail
    # because the column was no longer string-typed.
    success_rate = df['Tool_Success_Rate']
    if not pd.api.types.is_numeric_dtype(success_rate):
        success_rate = success_rate.str.replace('%', '', regex=False).astype(float)

    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.scatter(df['Tokens_per_Sequence'], success_rate, alpha=0.6, edgecolors='w', s=100)
    ax.set_title('Cognitive Cost vs. Operational Efficacy', fontsize=16, fontweight='bold')
    ax.set_xlabel('Cognitive Cost (Tokens per Sequence)', fontsize=12)
    ax.set_ylabel('Operational Efficacy (Tool Success Rate %)', fontsize=12)
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'figure_3_dynamics_scatter.png', dpi=300)
    plt.close()
    print("[✔] Figure 3 (Dynamics Scatter) generated.")

def plot_function_distribution(df):
    """Render a horizontal bar chart of the first 10 rows of the function table.

    Bars show ``Count`` per ``Active_Function``; the y axis is inverted so the
    first row appears at the top. The chart is written to
    ``figure_4_function_distribution.png`` inside ``OUTPUT_DIR``.

    Args:
        df: DataFrame with ``Active_Function`` and ``Count`` columns. Rows are
            taken in their existing order (no sorting is done here).
    """
    leading_rows = df.head(10)
    function_names = leading_rows['Active_Function']
    activation_counts = leading_rows['Count']

    plt.style.use('seaborn-v0_8-whitegrid')
    figure, axes = plt.subplots(figsize=(12, 8))
    axes.barh(function_names, activation_counts, color='mediumseagreen')
    axes.invert_yaxis()

    axes.set_title('Top 10 AI Active Functions Distribution', fontsize=16, fontweight='bold')
    axes.set_xlabel('Activation Count', fontsize=12)
    axes.set_ylabel('Active Function', fontsize=12)

    figure.tight_layout()
    target = OUTPUT_DIR / 'figure_4_function_distribution.png'
    figure.savefig(target, dpi=300)
    plt.close(figure)
    print("[✔] Figure 4 (Function Distribution) generated.")

def _token_file_category(filepath):
    """Return the colour category for one treemap entry.

    Non-string paths (for example a missing value) and the exact ``'Other'``
    sentinel used for the aggregated remainder row map to ``'Other'``. Real
    files are classified by file-name prefix (or a ``scripts`` path segment)
    and fall back to ``'Knowledge & Context'``.
    """
    # Exact match on the sentinel: a substring test would also swallow real
    # files whose name merely contains "Other".
    if not isinstance(filepath, str) or filepath == 'Other':
        return 'Other'
    name = Path(filepath).name
    if name.startswith(('00_', '01_', '04_')):
        return 'Core Identity'
    if name.startswith(('02_', '05_', 'CHAC_SOP', 'CHAC_Intellectual')):
        return 'Operational Protocols'
    if name.startswith(('03_', 'CHAC_System_Prompt_Builder')) or 'scripts' in filepath:
        return 'Boot & Configuration'
    # Glossary/Profile files and everything unclassified share this bucket.
    return 'Knowledge & Context'


def _token_label(filepath, token_count):
    """Build the two-line rectangle label: file name, then the token count."""
    # A non-string path (e.g. NaN from an empty CSV cell) cannot be passed to
    # Path(); label it explicitly instead of raising TypeError.
    display_name = Path(filepath).name if isinstance(filepath, str) else 'Unknown'
    return f"{display_name}\n({token_count} tokens)"


def plot_token_treemap(df):
    """Draw a treemap of static token counts coloured by file category.

    The ``'--- TOTAL ---'`` row and rows with a non-positive token count are
    dropped. The 20 largest files are drawn individually and any remaining
    files are summed into a single ``'Other'`` rectangle. The figure is saved
    as ``figure_5_token_treemap.png`` inside ``OUTPUT_DIR``.

    Args:
        df: DataFrame with ``File_Path`` and ``Token_Count`` columns. The
            frame is not modified; a filtered copy is used internally.
    """
    df = df[df['File_Path'] != '--- TOTAL ---'].copy()
    df['Token_Count'] = pd.to_numeric(df['Token_Count'])
    # drop=True keeps the old row labels from leaking in as an 'index' column.
    df = df[df['Token_Count'] > 0].sort_values(by='Token_Count', ascending=False).reset_index(drop=True)

    top_n = 20
    if len(df) > top_n:
        # Collapse the long tail into one aggregated row so the plot stays readable.
        other_sum = df.iloc[top_n:]['Token_Count'].sum()
        other_row = pd.DataFrame([{'File_Path': 'Other', 'Token_Count': other_sum}])
        df_plot = pd.concat([df.head(top_n), other_row], ignore_index=True)
    else:
        df_plot = df

    df_plot['category'] = df_plot['File_Path'].apply(_token_file_category)

    color_map = {
        'Core Identity': 'tab:blue', 'Operational Protocols': 'tab:green',
        'Boot & Configuration': 'tab:orange', 'Knowledge & Context': 'tab:purple',
        'Other': 'tab:gray'
    }
    colors = df_plot['category'].map(color_map).tolist()
    # Backgrounds that get white text; all other backgrounds get black text.
    white_text_backgrounds = {'tab:blue', 'tab:purple', 'tab:gray'}

    labels = [
        _token_label(filepath, token_count)
        for filepath, token_count in zip(df_plot['File_Path'], df_plot['Token_Count'])
    ]

    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(20, 12))
    squarify.plot(sizes=df_plot['Token_Count'], label=labels, alpha=0.8, color=colors, text_kwargs={'fontsize': 10}, ax=ax)

    # Pair each text artist on the axes with its row's category, in row order.
    for text, cat in zip(ax.texts, df_plot['category']):
        text.set_color('white' if color_map[cat] in white_text_backgrounds else 'black')

    # Only list categories that actually occur, in color_map order.
    present_categories = set(df_plot['category'].unique())
    legend_elements = [
        Patch(facecolor=color_map[cat], edgecolor='w', label=cat)
        for cat in color_map if cat in present_categories
    ]
    ax.legend(handles=legend_elements, title='File Category', bbox_to_anchor=(1.01, 1), loc='upper left', fontsize=12, title_fontsize=14)

    ax.set_title('Static Token Distribution of Top CHAC Toolkit Components by Purpose', fontsize=18, fontweight='bold')
    ax.axis('off')
    fig.tight_layout(pad=2)
    fig.savefig(OUTPUT_DIR / 'figure_5_token_treemap.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("[✔] Figure 5 (Token Treemap) generated with final semantic coloring and legend.")

def plot_growth_analysis(df):
    """Plot total and toolkit-only token counts per research milestone.

    Rows whose ``timestamp`` cannot be parsed or whose ``max_case_num`` is
    missing are dropped, the remainder is ordered by timestamp, and both token
    series are drawn against ``max_case_num`` on a log-scaled y axis. The
    figure is saved as ``figure_1_growth_analysis.png`` inside ``OUTPUT_DIR``.

    Args:
        df: DataFrame with ``timestamp``, ``max_case_num``,
            ``total_project_tokens`` and ``total_toolkit_tokens`` columns.
            The frame is not modified.
    """
    # assign() returns a new frame, so the parsed timestamps are not written
    # back into the caller's DataFrame.
    df = df.assign(timestamp=pd.to_datetime(df['timestamp'], errors='coerce'))
    df = df.dropna(subset=['timestamp', 'max_case_num'])
    df = df.sort_values('timestamp').reset_index(drop=True)

    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(16, 8))

    ax.plot(df['max_case_num'], df['total_project_tokens'], label='Total Project Tokens', color='navy', marker='.', linestyle='-')
    ax.plot(df['max_case_num'], df['total_toolkit_tokens'], label='Toolkit-Only Tokens', color='darkorange', marker='.', linestyle='-')

    ax.set_yscale('log')
    # Major ticks use plain numbers rather than the log axis default of
    # power-of-ten notation.
    ax.yaxis.set_major_locator(LogLocator(base=10.0, numticks=15))
    ax.yaxis.set_major_formatter(ScalarFormatter())
    ax.yaxis.set_minor_locator(LogLocator(base=10.0, subs=np.arange(2, 10) * 0.1, numticks=15))

    # Custom formatter to show only key minor ticks to avoid clutter
    def selective_formatter(x, pos):
        """Label a minor tick only if its rounded value starts with 1, 2 or 5."""
        s = f'{x:.0f}'
        if s.startswith(('1', '2', '5')):
            return s
        return ''
    ax.yaxis.set_minor_formatter(FuncFormatter(selective_formatter))

    ax.set_title('Cognitive Scale Growth of the CHAC Workbench by Research Milestone', fontsize=18, fontweight='bold')
    ax.set_xlabel('Research Milestone (Associated M-Series Case Study Number)', fontsize=12)
    ax.set_ylabel('Total Token Count (Log Scale)', fontsize=12)
    ax.legend()
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'figure_1_growth_analysis.png', dpi=300)
    plt.close()
    print("[✔] Figure 1 (Growth Analysis) generated with final label density improvements.")

# --- Main Execution ---
def main():
    """Load the six input tables and render the currently enabled figures.

    All six CSVs are read up front; if any of them is missing, an error is
    printed and nothing is plotted. Only the evolution and growth figures are
    enabled at present, and each is skipped when its table is empty.
    """
    print("--- Step 8: Generating All Visualizations ---")

    # Insertion order fixes the order in which the files are read.
    table_paths = {
        'evo': TABLE_3_CSV,
        'interact': TABLE_1A_CSV,
        'dyn': TABLE_2_CSV,
        'func': TABLE_2A_CSV,
        'token': TABLE_4_CSV,
        'growth': TABLE_5_CSV,
    }
    try:
        tables = {key: pd.read_csv(csv_path) for key, csv_path in table_paths.items()}
    except FileNotFoundError as e:
        print(f"[ERROR] Input CSV not found: {e}. Make sure previous steps ran successfully.")
        return

    df_evo = tables['evo']
    if not df_evo.empty:
        plot_evolution_and_efficacy(df_evo)

    # Currently disabled figures (tables are still loaded above):
    #   'interact' -> plot_interaction_breakdown
    #   'dyn'      -> plot_dynamics_scatter
    #   'func'     -> plot_function_distribution
    #   'token'    -> plot_token_treemap

    df_growth = tables['growth']
    if not df_growth.empty:
        plot_growth_analysis(df_growth)

    print("\n--- All visualizations generated successfully. ---")

# Run the visualization step only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
