import csv
import json
from pathlib import Path
import os
import re
from collections import defaultdict

# --- Configuration ---
# CSV mapping log files to case-study IDs. main() skips its header row and
# reads the first column as the log file path and the second as the case ID.
FINAL_MAP_CSV = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/output/final_log_analysis_data.csv')
# Directory that main() scans for per-log '*_metrics.json' files.
INTERMEDIATE_DATA_DIR = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/intermediate_data')
# Directory whose immediate sub-directories starting with 'A' or 'M' are
# counted as case studies.
CASE_STUDY_ROOT = Path('case-study')
# Directory the generated report tables (CSV files) are written to.
OUTPUT_DIR = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/output')
# Case IDs starting with 'M' are reported only when their number is >= this
# value; IDs that do not start with 'M' are always kept.
ANALYSIS_START_CASE = 22

# --- Helper Functions ---
def get_case_number(case_name):
    """Extract the numeric part of a case-study identifier.

    Searches *case_name* for the first occurrence of an 'A' or 'M'
    immediately followed by one or more digits and returns those digits
    as an int. When no such occurrence exists, ``float('inf')`` is
    returned instead.
    """
    found = re.search(r'[AM](\d+)', case_name)
    if found is None:
        return float('inf')
    return int(found.group(1))

# --- Main Analysis Logic ---
def _load_log_metrics(data_dir):
    """Load every ``*_metrics.json`` file found directly in *data_dir*.

    Returns a dict that maps each file's ``log_file`` value (with ``.json``
    replaced by ``.txt``) to the parsed metrics dict.
    """
    log_metrics = {}
    # Sorted so that tie-breaking in the reports does not depend on the
    # order in which the filesystem happens to enumerate the directory.
    for json_file in sorted(data_dir.glob('*_metrics.json')):
        with open(json_file, 'r', encoding='utf-8') as f:
            metrics = json.load(f)
        log_metrics[metrics['log_file'].replace('.json', '.txt')] = metrics
    return log_metrics


def _load_log_to_case_map(csv_path):
    """Read the log-to-case CSV into ``{log file name: [case_id, ...]}``.

    The first row is treated as a header. Only the first two columns
    (log file path, case ID) are used; blank or short rows are skipped
    instead of aborting the run.
    """
    log_to_case_map = defaultdict(list)
    if not csv_path.is_file():
        print(f"[!] Mapping file not found, no cases will be reported: {csv_path}")
        return log_to_case_map

    with open(csv_path, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip the header; tolerate a completely empty file.
        for row in reader:
            if len(row) < 2:
                continue
            log_file, case_id = row[0], row[1]
            log_to_case_map[Path(log_file).name].append(case_id)
    return log_to_case_map


def _aggregate(log_metrics, log_to_case_map):
    """Fold per-log metrics into per-case stats and global distributions.

    Logs without any associated case are ignored. Each numeric metric of a
    log is split evenly between all cases the log is associated with.

    Returns ``(case_stats, function_counts, event_counts)``.
    """
    case_stats = defaultdict(lambda: defaultdict(float))
    function_counts = defaultdict(int)
    event_counts = defaultdict(int)

    for log_name, metrics in log_metrics.items():
        associated_cases = log_to_case_map.get(log_name, [])
        if not associated_cases:
            continue

        for func in metrics.get('active_functions', []):
            function_counts[func] += 1
        for event in metrics.get('events', []):
            event_counts[event] += 1

        share = len(associated_cases)
        for case_id in associated_cases:
            for key, value in metrics.items():
                if isinstance(value, (int, float)):
                    case_stats[case_id][key] += value / share

    return case_stats, function_counts, event_counts


def _count_case_dirs(root):
    """Count sub-directories of *root* whose name starts with 'A' or 'M'."""
    if not root.is_dir():
        return 0
    with os.scandir(root) as entries:
        return sum(
            1 for entry in entries
            if entry.is_dir() and entry.name.startswith(('A', 'M'))
        )


def _count_log_words(log_dir):
    """Sum the whitespace-separated word counts of all ``*.txt`` in *log_dir*.

    Files that cannot be read or decoded as UTF-8 are skipped with a warning.
    """
    total = 0
    if not log_dir.is_dir():
        return total
    for txt_file in log_dir.glob('*.txt'):
        try:
            total += len(txt_file.read_text(encoding='utf-8').split())
        except (OSError, UnicodeDecodeError) as exc:
            print(f"[!] Skipping unreadable log {txt_file}: {exc}")
    return total


def _write_csv(path, header, rows):
    """Write *header* followed by *rows* to *path* and return *path*."""
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)
    return path


def main():
    """Generate the CSV report tables from the intermediate metrics.

    Reads the per-log metrics JSON files and the log-to-case mapping CSV,
    aggregates the metrics per case study, and writes the scope/scale,
    interaction breakdown, dynamics, function distribution and event
    distribution tables into ``OUTPUT_DIR``. The paths of the generated
    files are printed at the end.
    """
    print("--- Step 2: Generating Reports from Intermediate Data ---")

    log_metrics = _load_log_metrics(INTERMEDIATE_DATA_DIR)
    log_to_case_map = _load_log_to_case_map(FINAL_MAP_CSV)
    case_stats, total_function_counts, total_event_counts = _aggregate(
        log_metrics, log_to_case_map
    )

    # --- Data-Driven Case Selection ---
    # case_stats only contains cases that have associated log files in the
    # map. 'M' cases numbered below ANALYSIS_START_CASE are dropped; every
    # case whose ID does not start with 'M' is kept.
    final_cases = {
        case_id: stats for case_id, stats in case_stats.items()
        if get_case_number(case_id) >= ANALYSIS_START_CASE or not case_id.startswith('M')
    }

    # --- Tier 1 Metrics Calculation ---
    # Totals cover every loaded metrics file, including logs that are not
    # mapped to a case. A missing key counts as zero.
    total_sequences = sum(log.get('user_intent_sequences', 0) for log in log_metrics.values())
    total_turns = sum(log.get('turns', 0) for log in log_metrics.values())

    tier1_metrics = {
        "Time Span (Days)": 37,  # NOTE: hard-coded, not derived from the data.
        "Total Case Studies (All)": _count_case_dirs(CASE_STUDY_ROOT),
        "Total Case Studies (Analyzed)": len(final_cases),
        "Total User Intent Sequences": round(total_sequences),
        "Total User-AI Turns": round(total_turns),
        "Total Verbatim Log Word Count": _count_log_words(Path('.ai-cli-log')),
    }

    # --- Generate Reports ---
    # Make sure the destination exists so the first write cannot fail on a
    # fresh checkout.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    generated_files = []

    generated_files.append(_write_csv(
        OUTPUT_DIR / 'table_1_scope_and_scale.csv',
        ['Metric', 'Value'],
        tier1_metrics.items(),
    ))

    sorted_cases = sorted(
        final_cases.items(),
        key=lambda item: item[1]['user_intent_sequences'],
        reverse=True,
    )

    generated_files.append(_write_csv(
        OUTPUT_DIR / 'table_1a_interaction_breakdown.csv',
        ['Case_Study_ID', 'User_Intent_Sequences', 'Total_Turns'],
        (
            [case, round(stats['user_intent_sequences']), round(stats['turns'])]
            for case, stats in sorted_cases
            if stats['turns'] > 0
        ),
    ))

    dynamics_rows = []
    for case, stats in sorted_cases:
        calls_total = stats['tool_calls_total']
        tool_success = (stats['tool_calls_successful'] / calls_total) * 100 if calls_total > 0 else 0
        total_tokens = stats['total_input_tokens'] + stats['total_output_tokens']
        sequences = stats['user_intent_sequences']
        tokens_per_seq = total_tokens / sequences if sequences > 0 else 0
        dynamics_rows.append([case, f"{tool_success:.1f}%", round(tokens_per_seq)])
    generated_files.append(_write_csv(
        OUTPUT_DIR / 'table_2_dynamics_and_patterns.csv',
        ['Case_Study_ID', 'Tool_Success_Rate', 'Tokens_per_Sequence'],
        dynamics_rows,
    ))

    generated_files.append(_write_csv(
        OUTPUT_DIR / 'table_2a_function_distribution.csv',
        ['Active_Function', 'Count'],
        sorted(total_function_counts.items(), key=lambda item: item[1], reverse=True),
    ))

    generated_files.append(_write_csv(
        OUTPUT_DIR / 'table_2b_event_distribution.csv',
        ['Event_Type', 'Count'],
        sorted(total_event_counts.items(), key=lambda item: item[1], reverse=True),
    ))

    print("\n--- All Reports Generated Successfully ---")
    for f_path in generated_files:
        print(f"[✔] {f_path}")

# Run the report generation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()