import csv
import json
from pathlib import Path
import os
import re
from collections import defaultdict
from pytimeparse2 import parse

# --- Configuration ---
# CSV that maps log files to case ids. Its first row is treated as a header;
# the first two columns of each data row are read as (log file, case id).
FINAL_MAP_CSV = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/output/final_log_analysis_data.csv')
# Directory scanned for preprocessed ``*.json`` log files.
PREPROCESSED_LOGS_DIR = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/preprocessed_logs')
# Root directory of the case-study tree.
CASE_STUDY_ROOT = Path('case-study')
# Directory that receives the generated report CSV files.
OUTPUT_DIR = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/output')
# Lowest case number (as returned by get_case_number) for an 'M' case to
# count as in scope for the analysis.
ANALYSIS_START_CASE = 22

# --- Regex for Parsing ---
# A metadata block: a line of 3+ dashes, then a "METADATA LOG" title wrapped in
# one or two asterisks on each side, then the block body (captured lazily as
# group 1, may span lines because of DOTALL), closed by another run of 3+ dashes.
METADATA_BLOCK_RE = re.compile(r"-{3,}\n\s*\*{1,2}\s*METADATA LOG\s*\*{1,2}\n(.*?)\n\s*-{3,}", re.DOTALL)
# "* event: <value>" -- group 1 is the first whitespace-free token after the colon.
EVENT_RE = re.compile(r"\*\s*event:\s*(\S+)")
# "* active_function: <value>" -- group 1 is the first whitespace-free token after the colon.
ACTIVE_FUNCTION_RE = re.compile(r"\*\s*active_function:\s*(\S+)")

# --- Helper Functions ---
def get_case_number(case_name):
    """Return the numeric part of a case identifier such as ``M76_...``.

    The number is taken from the first ``A`` or ``M`` in ``case_name`` that is
    immediately followed by digits. When no such marker exists the function
    returns ``float('inf')``.
    """
    found = re.search(r'[AM](\d+)', case_name)
    if found is None:
        return float('inf')
    return int(found.group(1))

def parse_time_string(time_str):
    """Convert a duration string into a number via pytimeparse2's ``parse``.

    Parenthesised annotations (together with any whitespace in front of them)
    are removed and the remainder is stripped before parsing. Non-string
    input, and any string for which ``parse`` returns a falsy value, give
    ``0.0``.
    """
    if not isinstance(time_str, str):
        return 0.0
    without_notes = re.sub(r'\s*\([^)]*\)', '', time_str)
    seconds = parse(without_notes.strip())
    return seconds if seconds else 0.0

# --- Data Extraction ---
def extract_metrics_from_log(json_path):
    """Extract all defined metrics from a single preprocessed JSON log file.

    Args:
        json_path: Path of the JSON log. The top-level value is expected to be
            an object with optional ``conversation`` (list of turn dicts) and
            ``epilogue`` (summary dict) entries; missing or ``null`` entries
            are treated as empty.

    Returns:
        A dict with the numeric metrics ``turns``, ``user_intent_sequences``,
        ``tool_calls_total``, ``tool_calls_successful``, ``agent_active_time``,
        ``api_time``, ``tool_time``, ``total_requests``, ``total_input_tokens``
        and ``total_output_tokens``, plus the lists ``events`` and
        ``active_functions`` collected from the metadata blocks of AI turns.
        Returns ``None`` when the file cannot be read, is not valid UTF-8
        JSON, or does not contain a JSON object at the top level.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError as well as the
        # UnicodeDecodeError raised for files that are not valid UTF-8.
        return None
    if not isinstance(data, dict):
        return None

    # ``or`` (rather than a .get default) also replaces explicit JSON nulls.
    conversation = data.get('conversation') or []
    epilogue = data.get('epilogue') or {}
    model_usage = epilogue.get('model_usage') or []

    def usage_total(field):
        # Sum one per-model counter, counting absent/null values as zero.
        return sum(usage.get(field) or 0 for usage in model_usage)

    metrics = {
        'turns': len(conversation),
        'user_intent_sequences': sum(1 for t in conversation if t.get('speaker') == 'user'),
        'tool_calls_total': int(epilogue.get('tool_calls_total') or '0'),
        'tool_calls_successful': int(epilogue.get('tool_calls_successful') or '0'),
        'agent_active_time': parse_time_string(epilogue.get('agent_active_time') or '0s'),
        'api_time': parse_time_string(epilogue.get('api_time') or '0s'),
        'tool_time': parse_time_string(epilogue.get('tool_time') or '0s'),
        'total_requests': usage_total('requests'),
        'total_input_tokens': usage_total('input_tokens'),
        'total_output_tokens': usage_total('output_tokens'),
        'events': [],
        'active_functions': [],
    }

    for turn in conversation:
        if turn.get('speaker') != 'ai':
            continue
        content = turn.get('content') or ''
        for block_match in METADATA_BLOCK_RE.finditer(content):
            block_content = block_match.group(1)
            # Only the first event / active_function line of a block is used.
            event_match = EVENT_RE.search(block_content)
            if event_match:
                metrics['events'].append(event_match.group(1))

            function_match = ACTIVE_FUNCTION_RE.search(block_content)
            if function_match:
                metrics['active_functions'].append(function_match.group(1))

    return metrics

# --- Main Analysis Logic ---
def _load_log_to_case_map(csv_path):
    """Read the log-to-case CSV into ``{log file name: [case ids]}``.

    The first row is skipped as a header. Only the first two columns of each
    row are used; a missing file yields an empty mapping.
    """
    mapping = defaultdict(list)
    if not csv_path.is_file():
        return mapping
    with open(csv_path, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            # csv.reader yields [] for blank lines; skip those and any row
            # too short to carry both a log file and a case id.
            if len(row) < 2:
                continue
            log_file, case_id = row[0], row[1]
            mapping[Path(log_file).name].append(case_id)
    return mapping


def _write_count_table(csv_path, label_header, counts):
    """Write ``counts`` as a two-column CSV, highest count first."""
    with open(csv_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([label_header, 'Count'])
        # sorted() is stable, so equal counts keep first-seen order.
        writer.writerows(sorted(counts.items(), key=lambda item: item[1], reverse=True))


def main():
    """Build the Tier 2 distribution tables from the preprocessed logs.

    Every ``*.json`` file in ``PREPROCESSED_LOGS_DIR`` is parsed with
    ``extract_metrics_from_log``. Logs that the mapping in ``FINAL_MAP_CSV``
    attributes to at least one case contribute their ``active_functions`` and
    ``events`` to two frequency tables, which are written to ``OUTPUT_DIR`` as
    ``table_2a_function_distribution.csv`` and
    ``table_2b_event_distribution.csv``.
    """
    print("--- Starting Tier 2 Analysis (Dynamics, Patterns & Cost) ---")

    # Keyed by the ``.txt`` file name, which is how the mapping CSV is looked up.
    log_metrics = {}
    for json_path in PREPROCESSED_LOGS_DIR.glob('*.json'):
        metrics = extract_metrics_from_log(json_path)
        if metrics:
            log_metrics[json_path.with_suffix('.txt').name] = metrics

    log_to_case_map = _load_log_to_case_map(FINAL_MAP_CSV)

    total_function_counts = defaultdict(int)
    total_event_counts = defaultdict(int)
    for log_name, metrics in log_metrics.items():
        # Logs not attributed to any case are left out of the totals.
        if not log_to_case_map.get(log_name):
            continue
        for func in metrics['active_functions']:
            total_function_counts[func] += 1
        for event in metrics['events']:
            total_event_counts[event] += 1

    # --- Generate Reports ---
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    _write_count_table(
        OUTPUT_DIR / 'table_2a_function_distribution.csv',
        'Active_Function',
        total_function_counts,
    )
    _write_count_table(
        OUTPUT_DIR / 'table_2b_event_distribution.csv',
        'Event_Type',
        total_event_counts,
    )

    print("\n--- Tier 2 Reports Generated Successfully ---")
    print(f"See the '{OUTPUT_DIR.name}' directory for new CSV files.")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
