import json
from pathlib import Path
import re
from collections import defaultdict
from pytimeparse2 import parse

# --- Configuration ---
# Directory scanned for the preprocessed `*.json` logs that this script reads.
PREPROCESSED_LOGS_DIR = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/preprocessed_logs')
# Directory holding the raw text logs; a raw log is looked up by the JSON
# file's name with `.json` replaced by `.txt`.
RAW_LOGS_DIR = Path('.ai-cli-log')
# Directory that receives one `<stem>_metrics.json` file per processed log.
OUTPUT_DIR = Path('case-study/M76_Quantitative_Study_of_the_CHAC_Workbench/intermediate_data')

# --- Regex for Parsing (v4, restored after later revisions) ---
# This is the v4 regex that produced the best results (1 mismatch, 14 warnings).
# METADATA_BLOCK_RE matches one metadata block in a raw log:
#   * an opening line of three or more dashes, optionally preceded by "✦";
#   * a "METADATA LOG" heading line, optionally wrapped in up to two asterisks
#     on each side (matched case-insensitively);
#   * the block body, captured lazily as group 1 (may span several lines);
#   * a closing line that starts with three or more dashes.
METADATA_BLOCK_RE = re.compile(r"^(?:[ \t]*✦)?[ \t]*---+\s*\n[ \t]*\*{0,2} ?METADATA LOG ?\*{0,2}[ \t]*\n(.*?)\n^[ \t]*---+", re.DOTALL | re.MULTILINE | re.IGNORECASE)
# Capture the rest of the line following a "* event:" bullet.
EVENT_RE = re.compile(r"\*\s*event:\s*(.*)")
# Capture the rest of the line following a "* active_function:" bullet.
ACTIVE_FUNCTION_RE = re.compile(r"\*\s*active_function:\s*(.*)")

# --- Helper Functions ---
def parse_time_string(time_str):
    """Convert a duration string into the value returned by pytimeparse2's ``parse``.

    Every parenthesised annotation, together with the whitespace in front of
    it, is removed and the remainder is stripped before being handed to
    ``parse``.

    Returns ``0.0`` when ``time_str`` is not a string or when ``parse`` gives
    a falsy result (for example ``None`` or zero).
    """
    if isinstance(time_str, str):
        without_notes = re.sub(r'\s*\([^)]*\)', '', time_str)
        seconds = parse(without_notes.strip())
        return seconds if seconds else 0.0
    return 0.0

# --- Data Extraction ---
def extract_all_metrics(json_path):
    """Extract all defined metrics from a single preprocessed JSON log file.

    Args:
        json_path: ``pathlib.Path`` of the preprocessed JSON log. The matching
            raw log is looked up in RAW_LOGS_DIR under the same file name with
            a ``.txt`` extension.

    Returns:
        A dict of metrics, or ``None`` when the file cannot be opened, is not
        valid UTF-8 encoded JSON, or its top-level value is not a JSON object.
        When the raw log is missing or unreadable the metadata fields keep
        their empty defaults (count 0, empty lists).
    """
    import sys  # local import: only used for the warning on unreadable raw logs

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError, UnicodeDecodeError):
        return None
    if not isinstance(data, dict):
        # A JSON array/scalar at the top level is not a log this script understands.
        return None

    # `or` also covers keys that are present but explicitly null.
    conversation = data.get('conversation') or []
    epilogue = data.get('epilogue') or {}
    model_usage = epilogue.get('model_usage') or []

    def usage_total(field):
        # Sum one counter across all model-usage entries, treating null as 0.
        return sum((entry.get(field) or 0) for entry in model_usage)

    metrics = {
        'log_file': json_path.name,
        'turns': len(conversation),
        'user_intent_sequences': sum(1 for t in conversation if t.get('speaker') == 'user'),
        'tool_calls_total': int(epilogue.get('tool_calls_total') or '0'),
        'tool_calls_successful': int(epilogue.get('tool_calls_successful') or '0'),
        'agent_active_time': parse_time_string(epilogue.get('agent_active_time') or '0s'),
        'api_time': parse_time_string(epilogue.get('api_time') or '0s'),
        'tool_time': parse_time_string(epilogue.get('tool_time') or '0s'),
        'total_requests': usage_total('requests'),
        'total_input_tokens': usage_total('input_tokens'),
        'total_output_tokens': usage_total('output_tokens'),
        'metadata_log_count': 0,
        'events': [],
        'active_functions': []
    }

    # Swap only the final extension; str.replace would rewrite every ".json"
    # occurring anywhere in the file name.
    raw_log_path = RAW_LOGS_DIR / json_path.with_suffix('.txt').name
    if not raw_log_path.is_file():
        return metrics

    try:
        raw_content = raw_log_path.read_text(encoding='utf-8')
    except (OSError, UnicodeDecodeError) as exc:
        # Best effort: keep the JSON-derived metrics, but say why the metadata
        # counts are empty instead of silently reporting zeros.
        print(f"Warning: could not read raw log {raw_log_path}: {exc}", file=sys.stderr)
        return metrics

    for match in METADATA_BLOCK_RE.finditer(raw_content):
        metrics['metadata_log_count'] += 1
        block_content = match.group(1)

        event_match = EVENT_RE.search(block_content)
        if event_match:
            metrics['events'].append(event_match.group(1).strip())

        function_match = ACTIVE_FUNCTION_RE.search(block_content)
        if function_match:
            # Keep only the first line of the captured value.
            function_name = function_match.group(1).strip().split('\n')[0].strip()
            metrics['active_functions'].append(function_name)

    return metrics

# --- Main Execution ---
def main():
    """
    Extracts metrics from all logs and saves them as intermediate files.

    Every ``*.json`` file in PREPROCESSED_LOGS_DIR is passed to
    extract_all_metrics; each non-None result is written to
    ``<stem>_metrics.json`` in OUTPUT_DIR. Files for which extraction returns
    None are skipped and not counted.
    """
    print("--- Step 1: Extracting Data (v13 - Rolled back to best Regex) ---")
    # parents=True so a missing parent directory does not abort the run.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    processed_count = 0
    # glob() yields files in arbitrary order; sort for reproducible runs.
    for json_file in sorted(PREPROCESSED_LOGS_DIR.glob('*.json')):
        metrics = extract_all_metrics(json_file)
        if metrics is None:
            continue
        output_path = OUTPUT_DIR / f"{json_file.stem}_metrics.json"
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(metrics, f, indent=2)
        processed_count += 1

    print(f"Successfully processed {processed_count} log files.")
    print(f"Intermediate metrics saved to: {OUTPUT_DIR}")

# Run the extraction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()