#!/usr/bin/env python3
"""
Experimental runner for LinearizeLLM with multiple seeds for robust evaluation.
"""

import os
import json
import random
import time
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Tuple
from src.core.agent_pipeline import LinearizeLLMWorkflow
from src.scripts.ground_truth_labels import evaluate_detection_accuracy, get_ground_truth


def run_experimental_setup(tex_files: List[Path], instances_dir: Path, llm_model: str, args: Any):
    """
    Run experimental setup with multiple seeds for robust evaluation.

    Every problem in ``tex_files`` is run once per configured seed. Per-run
    outcomes are collected in a ``{problem_name: {seed: result}}`` mapping that
    is written to ``results/experiment_<timestamp>/experiment_results.json``;
    summary statistics are generated afterwards.

    Args:
        tex_files: List of .tex file paths to process; the name of each file's
            parent directory is used as the problem name
        instances_dir: Path to instances directory (not referenced by this function)
        llm_model: Model name string, or a config object exposing ``provider``
        args: Command line arguments
    """
    print("="*80)
    print("EXPERIMENTAL SETUP - LINEARIZELLM EVALUATION")
    print("="*80)

    # Get experiment configuration (seeds + API key)
    experiment_config = get_experiment_config(args, llm_model)
    seeds = experiment_config['seeds']

    # One timestamped results directory per invocation
    experiment_id = f"experiment_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    results_dir = Path("results") / experiment_id
    results_dir.mkdir(parents=True, exist_ok=True)

    print(f"📁 Experiment results will be saved to: {results_dir}")
    print(f"🔢 Processing {len(tex_files)} problems with {len(seeds)} seeds each")
    print(f"📊 Total runs: {len(tex_files) * len(seeds)}")
    print(f"🔑 API Key source: {experiment_config['api_key']['source']}")

    # Determine the provider from the model. Any string that is not a
    # claude-/gemini- model (gpt-*, o3, unknown names) is treated as OpenAI.
    if isinstance(llm_model, str):
        if llm_model.startswith("claude-"):
            provider = "anthropic"
        elif llm_model.startswith("gemini-"):
            provider = "google"
        else:
            provider = "openai"
    else:
        # If it's an LLMConfig object
        provider = llm_model.provider

    # Export the key for the provider, never overriding a key that is already
    # present in the environment. os.environ only accepts strings, so a
    # missing key is reported instead of crashing with a TypeError.
    env_var_name = f"{provider.upper()}_API_KEY"
    api_key = experiment_config['api_key']['key']
    if os.getenv(env_var_name):
        print(f"✅ {provider.capitalize()} API key already set in environment (using existing key)")
    elif api_key:
        os.environ[env_var_name] = api_key
        print(f"✅ {provider.capitalize()} API key configured for experiment")
    else:
        print(f"⚠️ No {provider.capitalize()} API key available; {env_var_name} left unset")

    experiment_results = {}
    results_file = results_dir / "experiment_results.json"

    try:
        # Process each problem
        for problem_idx, tex_file in enumerate(tex_files, 1):
            problem_name = tex_file.parent.name
            print("\n" + "="*100)
            print(f"PROBLEM {problem_idx}/{len(tex_files)}: {problem_name}")
            print("="*100)

            experiment_results[problem_name] = {}

            # Process each seed for this problem
            for seed_idx, seed in enumerate(seeds, 1):
                print(f"\n🌱 SEED {seed_idx}/{len(seeds)}: {seed}")
                print("-" * 50)

                # Set random seed for reproducibility
                random.seed(seed)

                try:
                    seed_result = run_single_experiment(
                        tex_file, problem_name, seed, llm_model, args, results_dir
                    )
                    experiment_results[problem_name][str(seed)] = seed_result
                    print(f"✅ Seed {seed} completed successfully")
                except Exception as e:
                    print(f"❌ Seed {seed} failed: {e}")
                    # Record the crash as a reformulation error so the run
                    # still shows up in the summary.
                    experiment_results[problem_name][str(seed)] = {
                        'linearizellm_x_star': None,
                        'linearizellm_obj_star': None,
                        'RE': [1, f"Exception during execution: {e}"],
                        'CE': [0, None],
                        'DE': [0, None]
                    }

                # Small delay between seeds
                time.sleep(1)
    finally:
        # Always persist what has been collected so far: an interrupt
        # (e.g. Ctrl-C) in the middle of a long run must not discard the
        # runs that already finished.
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(experiment_results, f, indent=2)

    print("\n" + "="*100)
    print("EXPERIMENT COMPLETED")
    print("="*100)
    print(f"📁 Results saved to: {results_file}")

    # Generate summary statistics
    generate_experiment_summary(experiment_results, results_dir)


def _parse_seeds(seeds_input: Any) -> List[int]:
    """Convert a comma-separated string, a single int, or a sequence into int seeds."""
    if isinstance(seeds_input, str):
        tokens = seeds_input.split(',')
    elif isinstance(seeds_input, int):
        tokens = [seeds_input]
    else:
        tokens = list(seeds_input)

    # Blank tokens (empty input, trailing or doubled commas) are skipped so
    # that "42,123," still yields [42, 123].
    cleaned = [str(token).strip() for token in tokens]
    seeds = [int(token) for token in cleaned if token]
    if not seeds:
        raise ValueError("No seeds provided")
    return seeds


def get_experiment_config(args: Any = None, llm_model: Any = None) -> Dict[str, Any]:
    """
    Get experimental configuration from user input or command line arguments.

    Seeds come from ``args.seeds`` when present and truthy, otherwise from an
    interactive prompt. If the seeds cannot be parsed, or stdin is closed so
    that no prompt can be answered, the default seeds
    ``[42, 123, 456, 789, 101112]`` are used.

    Args:
        args: Command line arguments (optional)
        llm_model: Model specification to determine provider (optional)

    Returns:
        Dictionary with keys ``seeds`` (list of int), ``api_key`` (the result
        of ``get_api_key_config``) and ``timestamp`` (ISO-formatted creation time)
    """
    print("\n🔧 EXPERIMENTAL CONFIGURATION")
    print("-" * 40)

    # Get seeds from command line or user input
    if args and hasattr(args, 'seeds') and args.seeds:
        seeds_input = args.seeds
        print(f"Using seeds from command line: {seeds_input}")
    else:
        print("Enter seeds for robust evaluation (comma-separated, e.g., 42,123,456):")
        try:
            seeds_input = input("Seeds: ").strip()
        except EOFError:
            # stdin is closed (non-interactive run): fall through to defaults
            seeds_input = ""

    try:
        seeds = _parse_seeds(seeds_input)
    except (ValueError, TypeError) as e:
        print(f"❌ Invalid seeds format: {e}")
        print("Using default seeds: [42, 123, 456, 789, 101112]")
        seeds = [42, 123, 456, 789, 101112]

    # Get API key configuration
    api_key_config = get_api_key_config(args, llm_model)

    return {
        'seeds': seeds,
        'api_key': api_key_config,
        'timestamp': datetime.now().isoformat()
    }


def get_api_key_config(args: Any = None, llm_model: Any = None) -> Dict[str, Any]:
    """
    Get API key configuration from command line arguments, environment or saved config.

    Sources are tried in this order, and the first one that yields a key wins:

    1. ``args.api_key`` (source ``'command_line'``)
    2. ``args.use_default_api_key`` -> ``APIKeyManager.get_api_key`` (source ``'default_no_prompt'``)
    3. the ``<PROVIDER>_API_KEY`` environment variable (source ``'environment'``)
    4. the key saved by ``APIKeyManager`` (source ``'saved_config'``)
    5. ``APIKeyManager.get_api_key`` as a last resort (source ``'default_no_prompt'``)

    Args:
        args: Command line arguments (optional)
        llm_model: Model specification to determine provider (optional); either
            a model name string or an object exposing ``provider``

    Returns:
        Dictionary with keys ``source`` and ``key``
    """
    print("\n🔑 API KEY CONFIGURATION")
    print("-" * 30)

    # Determine the provider from the model; OpenAI is the default for
    # gpt-*/o3, unknown model names and a missing model.
    provider = "openai"
    if llm_model:
        if isinstance(llm_model, str):
            if llm_model.startswith("claude-"):
                provider = "anthropic"
            elif llm_model.startswith("gemini-"):
                provider = "google"
        else:
            # If it's an LLMConfig object
            provider = llm_model.provider

    provider_label = provider.capitalize()
    print(f"Provider: {provider_label}")

    def _option(name: str) -> Any:
        """Return args.<name> when args is given and has it, else None."""
        return getattr(args, name, None) if args else None

    # 1. Key passed explicitly on the command line
    cli_key = _option('api_key')
    if cli_key:
        print("Using API key from command line")
        return {
            'source': 'command_line',
            'key': cli_key
        }

    # 2. Caller explicitly asked for the manager's default key
    if _option('use_default_api_key'):
        # Imported lazily so the command-line path does not need the manager
        from src.utils.api_key_manager import APIKeyManager
        manager = APIKeyManager(provider_label)
        default_key = manager.get_api_key(provider_label)
        print(f"✅ Using default {provider_label} API key (no prompt)")
        return {
            'source': 'default_no_prompt',
            'key': default_key
        }

    # 3. Key already present in the environment
    env_var_name = f"{provider.upper()}_API_KEY"
    env_key = os.getenv(env_var_name)
    if env_key:
        print(f"✅ Using API key from environment variable {env_var_name}")
        return {
            'source': 'environment',
            'key': env_key
        }

    # 4. Key stored in the manager's saved config file
    from src.utils.api_key_manager import APIKeyManager
    manager = APIKeyManager(provider_label)

    try:
        saved_key = manager._load_saved_keys().get(provider.lower())
    except Exception as exc:
        # Best effort: an unreadable config must not abort the run, but it
        # should not be hidden either (and Ctrl-C must still propagate).
        print(f"⚠️ Could not read saved API key config: {exc}")
        saved_key = None

    if saved_key:
        print("✅ Using API key from saved config file")
        return {
            'source': 'saved_config',
            'key': saved_key
        }

    # 5. Fall back to the manager's default key (for overnight runs)
    print(f"✅ Using default {provider_label} API key (no prompting for overnight runs)")
    default_key = manager.get_api_key(provider_label)
    return {
        'source': 'default_no_prompt',
        'key': default_key
    }


def run_single_experiment(tex_file: Path, problem_name: str, seed: int, llm_model: str, args: Any, results_dir: Path = None) -> Dict[str, Any]:
    """
    Run a single experiment for one problem and one seed.

    The workflow is executed, its intermediate artefacts are optionally saved,
    and the outcome is classified by ``extract_experiment_results``. Any
    exception raised by the workflow or the classification is caught and
    recorded as a compilation error (``CE``) rather than propagated.

    Args:
        tex_file: Path to the .tex file
        problem_name: Name of the problem
        seed: Random seed for this run (used here only to name the output directory)
        llm_model: LLM model configuration
        args: Command line arguments (not referenced by this function)
        results_dir: Directory to save detailed results; nothing is saved when falsy

    Returns:
        Dictionary with experiment results; ``RE``/``CE``/``DE`` are
        ``[has_error, error_message]`` pairs
    """
    result = {
        'linearizellm_x_star': None,
        'linearizellm_obj_star': None,
        'RE': [0, None],  # Reformulation Error [has_error, error_message]
        'CE': [0, None],  # Compilation Error [has_error, error_message]
        'DE': [0, None]   # Detection Error [has_error, error_message]
    }

    try:
        workflow = LinearizeLLMWorkflow(
            tex_path=str(tex_file),
            problem_id=problem_name,
            save_results=False,  # Don't save individual results in experimental mode
            results_base_dir="temp",  # Temporary directory
            llm_model=llm_model
        )
        workflow_results = workflow.run(verbose=False)

        # Saving artefacts is best effort: a disk or serialization problem
        # must not discard the workflow output or be reported as a
        # compilation error of the generated code.
        if results_dir:
            try:
                save_intermediate_files(workflow_results, problem_name, seed, results_dir)
            except Exception as save_error:
                print(f"⚠️ Could not save intermediate files: {save_error}")

        # Extract results and detect errors
        result = extract_experiment_results(workflow_results, problem_name)

        # Print error details to console if there are compilation errors
        if result['CE'][0] == 1:
            print(f"❌ COMPILATION ERROR: {result['CE'][1]}")
            # Try to get more detailed error information from optimization results
            opt_results = workflow_results.get('optimization_results') if 'optimization_results' in workflow_results else None
            if opt_results is not None and not opt_results.get('success', False):
                error_type = opt_results.get('error_type', 'UNKNOWN')
                error_details = opt_results.get('details', '')
                full_traceback = opt_results.get('full_traceback', '')
                print(f"   Error Type: {error_type}")
                if error_details:
                    print(f"   Details: {error_details}")
                if full_traceback:
                    # Print just the first few lines of traceback to avoid spam
                    traceback_lines = full_traceback.split('\n')[:5]
                    print(f"   Traceback: {' '.join(traceback_lines)}...")

    except Exception as e:
        # Catch any unexpected errors
        result['CE'] = [1, f"Unexpected error: {e}"]

    return result


def save_intermediate_files(workflow_results: Dict[str, Any], problem_name: str, seed: int, results_dir: Path):
    """
    Write the artefacts of one seed run to ``<results_dir>/<problem_name>_seed_<seed>/``.

    Only the entries actually present in ``workflow_results`` are written;
    missing entries are skipped without error.

    Args:
        workflow_results: Results from LinearizeLLMWorkflow
        problem_name: Name of the problem
        seed: Random seed for this run
        results_dir: Base directory for experiment results
    """
    seed_dir = results_dir / f"{problem_name}_seed_{seed}"
    seed_dir.mkdir(parents=True, exist_ok=True)

    # (result key, output file name) for every artefact stored as plain text
    text_artifacts = (
        ('gurobi_code', "gurobi_code.py"),
        ('latex_model', "latex_model.tex"),            # extracted LaTeX model
        ('linearized_model', "linearized_model.tex"),  # reformulated model
        ('extracted_patterns', "extracted_patterns.txt"),
        ('code_validation', "code_validation.txt"),
    )
    for result_key, file_name in text_artifacts:
        if result_key not in workflow_results:
            continue
        with open(seed_dir / file_name, 'w', encoding='utf-8') as handle:
            handle.write(workflow_results[result_key])

    # The optimization results are structured data, so they go out as JSON
    if 'optimization_results' in workflow_results:
        with open(seed_dir / "optimization_results.json", 'w', encoding='utf-8') as handle:
            json.dump(workflow_results['optimization_results'], handle, indent=2)

    print(f"💾 Saved intermediate files to: {seed_dir}")


def extract_experiment_results(workflow_results: Dict[str, Any], problem_name: str) -> Dict[str, Any]:
    """
    Extract and classify results from workflow execution.

    The classification runs in a fixed order and later stages may overwrite
    the verdict of earlier ones:

    1. Detection (``DE``): detected pattern types are compared with the ground
       truth for ``problem_name``. If no extraction output exists at all, the
       function returns immediately with only ``DE`` set.
    2. Reformulation (``RE``): keyword heuristics on the linearized model text.
    3. Compilation (``CE``): keyword heuristics on the code validation text,
       relaxed when the optimization itself succeeded.
    4. Optimization outcome: fills in the solution and objective value and may
       override ``RE``/``CE`` based on model type, solver status or error type.

    Args:
        workflow_results: Results from LinearizeLLMWorkflow
        problem_name: Name of the problem

    Returns:
        Dictionary with classified results. ``RE``/``CE``/``DE`` are
        ``[has_error, error_message]`` pairs; ``detection_evaluation`` holds
        the detailed detection metrics (``None`` on the early return).
    """
    result = {
        'linearizellm_x_star': None,
        'linearizellm_obj_star': None,
        'RE': [0, None],  # Reformulation Error
        'CE': [0, None],  # Compilation Error
        'DE': [0, None],  # Detection Error
        'detection_evaluation': None  # Detailed detection evaluation
    }

    # ------------------------------------------------------------------
    # Detection errors (Step 2: Non-linear pattern extraction)
    # ------------------------------------------------------------------
    if 'extracted_patterns' not in workflow_results:
        result['DE'] = [1, "No pattern extraction results found"]
        return result

    # The workflow doesn't store parsed patterns, so parse the raw extraction
    # output again to get structured data.
    from src.core.nonlinear_detector import NonLinearPatternExtractor
    pattern_extractor = NonLinearPatternExtractor()
    parsed_patterns = pattern_extractor.parse_patterns(workflow_results['extracted_patterns'])

    # (key in parsed_patterns, label used in the ground truth)
    pattern_labels = (
        ('bilinear_patterns', 'bilinear'),
        ('min_patterns', 'min'),
        ('max_patterns', 'max'),
        ('absolute_patterns', 'absolute_value'),
        ('quotient_patterns', 'quotient'),
        ('monotone_transformation_patterns', 'monotone_transformation'),
    )
    detected_types = [label for key, label in pattern_labels if parsed_patterns[key]]

    # Get ground truth for this problem
    from src.scripts.ground_truth_labels import get_ground_truth
    ground_truth = get_ground_truth(problem_name)

    # Compare detected vs ground truth
    correct_detections = [pt for pt in detected_types if pt in ground_truth]
    false_positives = [pt for pt in detected_types if pt not in ground_truth]
    false_negatives = [pt for pt in ground_truth if pt not in detected_types]

    # Calculate metrics
    # NOTE(review): when nothing is detected and nothing is expected the run
    # counts as correct, yet precision and F1 are 0 (recall is 1.0). Confirm
    # whether that convention is intended before relying on the averages.
    precision = len(correct_detections) / len(detected_types) if detected_types else 0
    recall = len(correct_detections) / len(ground_truth) if ground_truth else 1.0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    is_correct = len(false_positives) == 0 and len(false_negatives) == 0

    result['detection_evaluation'] = {
        'ground_truth': ground_truth,
        'detected_types': detected_types,
        'correct_detections': correct_detections,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'is_correct': is_correct
    }

    if not is_correct:
        error_msg = f"Detection error: Expected {ground_truth}, detected {detected_types}"
        if false_positives:
            error_msg += f", false positives: {false_positives}"
        if false_negatives:
            error_msg += f", false negatives: {false_negatives}"
        result['DE'] = [1, error_msg]
    else:
        result['DE'] = [0, None]

    # ------------------------------------------------------------------
    # Reformulation errors (Step 3: Pattern-based MIP Linearization)
    # ------------------------------------------------------------------
    if 'linearized_model' not in workflow_results:
        result['RE'] = [1, "No linearized model found"]
    else:
        linearized_model = workflow_results['linearized_model']
        model_text = linearized_model.lower()
        reports_success = "successful reformulations" in model_text

        # An explicit success report wins over every other keyword
        if reports_success and "failed reformulations: 0" in model_text:
            result['RE'] = [0, None]
        elif "infeasible" in model_text:
            result['RE'] = [1, "Problem detected as infeasible during reformulation"]
        elif "non-convex" in model_text or "miqcp" in model_text:
            result['RE'] = [1, "Non-convex problem detected - reformulation failed to create exact LP/MILP"]
        elif "error" in model_text or "failed" in model_text:
            # Only flag as failed if it's not a successful reformulation
            if not reports_success:
                result['RE'] = [1, f"Reformulation failed: {linearized_model[:200]}..."]
            else:
                result['RE'] = [0, None]  # Reformulation successful despite "failed" in text
        else:
            result['RE'] = [0, None]

    # ------------------------------------------------------------------
    # Compilation errors (Steps 4-5: Code generation and validation)
    # ------------------------------------------------------------------
    if 'gurobi_code' not in workflow_results:
        result['CE'] = [1, "No Gurobi code generated"]
    elif 'code_validation' not in workflow_results:
        result['CE'] = [1, "No code validation results found"]
    else:
        code_validation = workflow_results['code_validation']
        validation_text = code_validation.lower()
        mentions_failure = "error" in validation_text or "failed" in validation_text

        # Check if optimization actually succeeded despite validation issues
        optimization_succeeded = False
        if 'optimization_results' in workflow_results:
            optimization_succeeded = workflow_results['optimization_results'].get('success', False)

        if optimization_succeeded:
            # With a successful solve, validation complaints are treated as
            # post-processing or parameter issues unless they name a critical
            # compilation problem. A NameError about exp/math/import is a
            # post-processing error and never counts.
            is_post_processing_name_error = (
                "nameerror" in validation_text
                and any(hint in validation_text for hint in ("exp", "math", "import"))
            )
            critical_markers = (
                "syntax error", "indentation error", "import error", "module not found",
                "gurobipy", "model creation", "variable definition"
            )
            has_critical_marker = any(marker in validation_text for marker in critical_markers)

            if not is_post_processing_name_error and mentions_failure and has_critical_marker:
                result['CE'] = [1, f"Critical compilation error: {code_validation[:200]}..."]
            else:
                result['CE'] = [0, None]
        else:
            # Optimization failed, so validation errors are more critical
            if mentions_failure:
                result['CE'] = [1, f"Code validation failed: {code_validation[:200]}..."]
            else:
                result['CE'] = [0, None]

    # ------------------------------------------------------------------
    # Optimization results (Step 6: Optimization execution)
    # ------------------------------------------------------------------
    if 'optimization_results' in workflow_results:
        opt_results = workflow_results['optimization_results']

        if opt_results.get('success', False):
            # `or {}` / `or ''` below also cover keys that are present but None
            optimization_data = opt_results.get('optimization_results') or {}

            # Extract optimal solution
            variables = optimization_data.get('variables', {})
            if variables:
                result['linearizellm_x_star'] = variables

            # Extract optimal objective value
            objective_value = optimization_data.get('objective_value')
            if objective_value is not None:
                result['linearizellm_obj_star'] = objective_value

            # Model type is optional. Only a *reported* type other than
            # LP/MILP is a reformulation error; a missing type must not flag
            # an otherwise successful run.
            model_type = str(optimization_data.get('model_type') or '').upper()
            if model_type and model_type not in ('LP', 'MILP'):
                result['RE'] = [1, f"Reformulation error: Model solved is {model_type}, not LP/MILP"]

            # Check if optimization revealed reformulation issues
            status = str(optimization_data.get('status') or '').lower()
            is_linear = optimization_data.get('is_linear', True)  # Default to True for backward compatibility
            if 'infeasible' in status:
                result['RE'] = [1, f"Optimization revealed infeasibility: {status}"]
            elif not is_linear:
                # Model is not linear (QP/MIQP/QCP/MIQCP) - reformulation failed
                result['RE'] = [1, f"Reformulation failed - model is {model_type or 'UNKNOWN'} instead of LP/MILP"]
            elif 'unbounded' in status:
                result['RE'] = [1, f"Optimization revealed unbounded problem: {status}"]
            elif 'numeric' in status:
                result['CE'] = [1, f"Numerical issues during optimization: {status}"]
            elif 'time_limit' in status or 'iteration_limit' in status or 'node_limit' in status:
                # These are not necessarily errors, but we should note them
                print(f"⚠️ Optimization reached limits: {status}")
        else:
            # Optimization failed: every failure is recorded under CE, the
            # message prefix tells the kinds apart.
            error_msg = opt_results.get('error', 'Unknown optimization error')
            error_type = opt_results.get('error_type', 'UNKNOWN')

            if error_type in ['SYNTAX_ERROR', 'COMPILATION_ERROR', 'NAME_ERROR', 'IMPORT_ERROR']:
                # The code couldn't even run
                result['CE'] = [1, f"Code compilation error: {error_msg}"]
            elif error_type in ['MODEL_CREATION_ERROR', 'RUNTIME_ERROR']:
                # The code compiled but failed during execution
                result['CE'] = [1, f"Runtime error: {error_msg}"]
            else:
                # Other optimization failures (like infeasible, unbounded, etc.)
                result['CE'] = [1, f"Optimization failed: {error_msg}"]

    return result


def generate_experiment_summary(experiment_results: Dict[str, Any], results_dir: Path):
    """
    Generate summary statistics for the experiment.

    Aggregates error counts, successful runs, detection accuracy metrics and
    objective values per problem and overall, writes them to
    ``<results_dir>/experiment_summary.json`` and prints an overview. A run is
    successful when its ``DE``, ``RE`` and ``CE`` flags are all 0.

    Args:
        experiment_results: Complete experiment results as
            ``{problem_name: {seed: seed_result}}``
        results_dir: Directory to save summary
    """
    print("\n📊 GENERATING EXPERIMENT SUMMARY")
    print("-" * 40)

    # (flag key in a seed result, counter name in the summary)
    error_counters = (
        ('DE', 'detection_errors'),
        ('RE', 'reformulation_errors'),
        ('CE', 'compilation_errors'),
    )
    # (metric key in a detection evaluation, accumulator name in the summary)
    metric_fields = (
        ('precision', 'average_precision'),
        ('recall', 'average_recall'),
        ('f1_score', 'average_f1_score'),
    )

    summary = {
        'experiment_info': {
            'total_problems': len(experiment_results),
            'total_seeds': 0,
            'total_runs': 0
        },
        'error_statistics': {
            'detection_errors': 0,
            'reformulation_errors': 0,
            'compilation_errors': 0,
            'successful_runs': 0
        },
        'detection_accuracy': {
            'total_evaluations': 0,
            'correct_detections': 0,
            'average_precision': 0.0,
            'average_recall': 0.0,
            'average_f1_score': 0.0
        },
        'problem_summary': {}
    }
    info = summary['experiment_info']
    overall_errors = summary['error_statistics']
    overall_accuracy = summary['detection_accuracy']

    for problem_name, problem_results in experiment_results.items():
        problem_summary = {
            'total_seeds': len(problem_results),
            'successful_runs': 0,
            'detection_errors': 0,
            'reformulation_errors': 0,
            'compilation_errors': 0,
            'detection_accuracy': {
                'correct_detections': 0,
                'total_evaluations': 0,
                'average_precision': 0.0,
                'average_recall': 0.0,
                'average_f1_score': 0.0
            },
            'objective_values': []
        }
        problem_accuracy = problem_summary['detection_accuracy']

        for seed_result in problem_results.values():
            info['total_runs'] += 1

            # Count errors
            for flag_key, counter_name in error_counters:
                if seed_result[flag_key][0] == 1:
                    problem_summary[counter_name] += 1
                    overall_errors[counter_name] += 1

            # Count successful runs (no errors)
            if all(seed_result[flag_key][0] == 0 for flag_key, _ in error_counters):
                problem_summary['successful_runs'] += 1
                overall_errors['successful_runs'] += 1

            # Collect detection accuracy statistics; runs that crashed before
            # the evaluation have no 'detection_evaluation' entry.
            eval_data = seed_result.get('detection_evaluation')
            if eval_data:
                for accuracy in (problem_accuracy, overall_accuracy):
                    accuracy['total_evaluations'] += 1
                    if eval_data['is_correct']:
                        accuracy['correct_detections'] += 1
                    # Sums for now; turned into averages further down
                    for metric_key, average_name in metric_fields:
                        accuracy[average_name] += eval_data[metric_key]

            # Collect objective values
            if seed_result['linearizellm_obj_star'] is not None:
                problem_summary['objective_values'].append(seed_result['linearizellm_obj_star'])

        summary['problem_summary'][problem_name] = problem_summary

    # Number of distinct seeds used across all problems
    info['total_seeds'] = len({
        seed for problem_results in experiment_results.values()
        for seed in problem_results
    })

    # Turn the accumulated metric sums into averages (overall and per problem)
    accuracy_blocks = [overall_accuracy] + [
        problem_summary['detection_accuracy']
        for problem_summary in summary['problem_summary'].values()
    ]
    for accuracy in accuracy_blocks:
        evaluations = accuracy['total_evaluations']
        if evaluations > 0:
            for _, average_name in metric_fields:
                accuracy[average_name] /= evaluations

    # Save summary
    summary_file = results_dir / "experiment_summary.json"
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    # Print summary
    print("\n📈 EXPERIMENT SUMMARY")
    print("=" * 50)
    print(f"Total Problems: {info['total_problems']}")
    print(f"Total Seeds: {info['total_seeds']}")
    print(f"Total Runs: {info['total_runs']}")
    print("\nError Statistics:")
    print(f"  Detection Errors: {overall_errors['detection_errors']}")
    print(f"  Reformulation Errors: {overall_errors['reformulation_errors']}")
    print(f"  Compilation Errors: {overall_errors['compilation_errors']}")
    print(f"  Successful Runs: {overall_errors['successful_runs']}")

    # An experiment without any run (no problems or no seeds) has no
    # meaningful rate; report 0 instead of dividing by zero.
    total_runs = info['total_runs']
    success_rate = (overall_errors['successful_runs'] / total_runs) * 100 if total_runs else 0.0
    print(f"\nOverall Success Rate: {success_rate:.1f}%")

    # Print detection accuracy statistics
    if overall_accuracy['total_evaluations'] > 0:
        detection_accuracy = (overall_accuracy['correct_detections'] /
                              overall_accuracy['total_evaluations']) * 100
        print("\nDetection Accuracy Statistics:")
        print(f"  Total Evaluations: {overall_accuracy['total_evaluations']}")
        print(f"  Correct Detections: {overall_accuracy['correct_detections']}")
        print(f"  Detection Accuracy: {detection_accuracy:.1f}%")
        print(f"  Average Precision: {overall_accuracy['average_precision']:.3f}")
        print(f"  Average Recall: {overall_accuracy['average_recall']:.3f}")
        print(f"  Average F1-Score: {overall_accuracy['average_f1_score']:.3f}")

    print(f"\n📁 Summary saved to: {summary_file}")