#!/usr/bin/env python3
"""
Test all Poster generation modes for BANNER (Banner Generation) module.

================================================================================
HOW TO RUN EXPERIMENTS
================================================================================

1. **Configure experiment parameters** (edit `poster_experiment_config.json`):
   {
     "modes": "all",                      # "all" or list of mode names
     "num_samples": 1,                    # Number of samples per mode
     "start_index": 1,                    # Start index (1-based)
     "max_qa_cycles": 2,                  # Number of QA cycles
     "max_questions_per_batch": 5,        # Max questions per cycle
     "parallel": false,                   # Run in parallel?
     "max_workers": 4,                    # Parallel workers
     "python_exe": null,                  # Python executable path
     "use_logos": true,                   # Use logo-based setup?
     "model_version": "gemini25"          # Model version
   }

2. **Run experiments**:
   python test_all_modes.py --config-file poster_experiment_config.json

3. **Run specific modes**:
   python test_all_modes.py --config-file poster_experiment_config.json --modes Naive_Agent_Flexible MPQC_Adaptive

4. **Run with command-line overrides**:
   python test_all_modes.py --config-file poster_experiment_config.json --max-qa-cycles 3 --max-questions-per-batch 5

================================================================================
HOW TO ANALYZE RESULTS
================================================================================

1. **Analyze results**:
   python tools/analyze_poster_results.py results/qa_mode_adaptive_20240101_120000

2. **Analyze with JSON output**:
   python tools/analyze_poster_results.py results/qa_mode_adaptive_20240101_120000 --json-output analysis.json

3. **Visualize cost-performance**:
   python tools/visualize_poster_cost_performance.py --results-dir results

================================================================================
CONFIGURATION REFERENCE
================================================================================

Total configurations: 8

Agent Formats (3):
- Naive_Agent: Direct questioning, no planning
- MPQC: MPC over multi-round dimension (ours, A-MPQC)
- RAG: Retrieves from fixed format experiment database

Available Modes:
- no_user: No user interaction baseline
- Naive_Agent_Free_Ask: Direct QA, no format constraints
- Naive_Agent_Fixed_Binary: Direct QA, Yes/No questions only
- Naive_Agent_Fixed_MultiChoice: Direct QA, Multiple choice (A/B/C/D)
- Naive_Agent_Fixed_OpenText: Direct QA, Open-ended text questions
- Naive_Agent_Flexible: Direct QA, can choose format freely
- MPQC_Adaptive: A-MPQC with adaptive format selection (QUESTION | NEXT)
- RAG_Flexible: RAG-augmented with flexible format

================================================================================
RESULTS STRUCTURE
================================================================================

Results are saved in: results/{mode}_{timestamp}/

Each experiment produces:
- summary.json: Experiment summary
- {sample_id}/: Sample-specific results
  - plan.txt: Design plan
  - generated.png: Generated poster
  - target.png: Target poster
  - qa_log.json: Q&A log

================================================================================
"""

import subprocess
import sys
import os
import argparse
import json
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

# Import config generation
import sys
sys.path.insert(0, str(Path(__file__).parent / "src"))
from poster_config import generate_all_configs, AgentFormat

# Generate all configs dynamically (8 configurations)
def get_all_modes():
    """Build the list of runnable modes from the configuration system.

    Returns:
        A list with one dict per config returned by ``generate_all_configs()``.
        Each dict uses the config's ``id`` as both ``"mode"`` and ``"name"``
        and carries the config object itself under ``"config"``.
    """
    return [
        {"mode": cfg.id, "name": cfg.id, "config": cfg}
        for cfg in generate_all_configs()
    ]

# Legacy MODES for backward compatibility (if needed)
# The first four legacy modes carry no extra options; the two adaptive modes
# differ only in whether MPC is enabled.
MODES_LEGACY = [
    {"mode": legacy_id, "name": legacy_id}
    for legacy_id in (
        "no_user",
        "qa_mode_binary",
        "qa_mode_multi_choice",
        "qa_mode_open_text",
    )
] + [
    {"mode": "qa_mode_adaptive", "name": "qa_mode_adaptive", "mpc_enabled": False},
    {"mode": "qa_mode_adaptive_mpc", "name": "qa_mode_adaptive_mpc", "mpc_enabled": True},
]


def run_test(mode_config, num_samples=1, start_index=1, max_qa_cycles=None, max_questions_per_batch=None, python_exe=None, use_logos=None,
             model_version=None, question_agent_model_version=None, answer_agent_model_version=None):
    """Launch ``src/poster_experiments.py`` in a subprocess for one mode.

    Args:
        mode_config: Dict with a required ``"mode"`` key and optional
            ``"name"`` / ``"mpc_enabled"`` keys.
        num_samples: Value forwarded as ``--num-samples``.
        start_index: Value forwarded as ``--start-index``.
        max_qa_cycles: Forwarded as ``--max-qa-cycles`` when not ``None``.
        max_questions_per_batch: Forwarded as ``--max-questions-per-batch``
            when not ``None``.
        python_exe: Interpreter to launch; ``sys.executable`` when ``None``.
        use_logos: ``True``/``False`` to force the setup, ``None`` to
            auto-detect from the ``poster_logos`` folder next to this
            script's parent directory.
        model_version: Forwarded as ``--model-version`` when not ``None``.
        question_agent_model_version: Forwarded as
            ``--question-agent-model-version`` when not ``None``.
        answer_agent_model_version: Forwarded as
            ``--answer-agent-model-version`` when not ``None``.

    Returns:
        ``(True, None)`` when the subprocess exits with code 0,
        ``(False, returncode)`` when it exits non-zero, and
        ``(False, str(exc))`` when launching it raised an exception.
    """
    mode = mode_config["mode"]
    mode_name = mode_config.get("name", mode)
    mpc_enabled = mode_config.get("mpc_enabled", False)

    rule = "=" * 60
    print("\n" + rule)
    print(f"Testing mode: {mode_name}")
    if mode in ("qa_mode_adaptive", "qa_mode_adaptive_mpc"):
        print(f"  MPC: {'Enabled' if mpc_enabled else 'Disabled'}")
    print(rule)

    here = Path(__file__).parent.absolute()
    project_root = here.parent
    config_dir = project_root / "config"
    logos_folder = project_root / "poster_logos"

    # An explicit use_logos wins; otherwise look for banner-style prompt files.
    if use_logos is not None:
        use_logos_setup = use_logos
    else:
        use_logos_setup = logos_folder.exists() and any(logos_folder.glob("*_prompt.txt"))

    if use_logos_setup:
        # Logo-based setup keeps prompts and logos in the same directory.
        logos_dir = prompt_dir = str(logos_folder)
        print(f"  Setup: Logo-based (using {logos_folder.name}/)")
    else:
        logos_dir = str(project_root / "poster_data_renamed")
        enhanced_prompts = project_root / "poster_data_prompt_enhanced"
        if enhanced_prompts.exists():
            prompt_dir = str(enhanced_prompts)
            prompt_count = len(list(enhanced_prompts.glob("*.txt")))
            print(f"  Setup: No-logo (using {enhanced_prompts.name}/ with {prompt_count} enhanced prompts)")
        else:
            # Fall back to the original prompts when the enhanced set is absent.
            original_prompts = project_root / "poster_data_prompt"
            prompt_dir = str(original_prompts)
            print(f"  ⚠️ Enhanced prompts not found, using original prompts from {original_prompts.name}/")

    interpreter = python_exe if python_exe is not None else sys.executable
    cmd = [
        interpreter,
        str(here / "src" / "poster_experiments.py"),
        "--logos-dir", logos_dir,
        "--prompt-dir", prompt_dir,
        "--answer-base-dir", str(project_root / "poster_data_renamed"),
        "--config", str(config_dir / "config.ini"),
        "--llm-config", str(config_dir / "config_llm.ini"),
        "--mode", mode,
        "--num-samples", str(num_samples),
        "--start-index", str(start_index),
        "--output-dir", str(here / "results" / f"test_{mode_name}"),
    ]

    # Optional model selectors are passed through unchanged.
    model_flags = (
        ("--model-version", model_version),
        ("--question-agent-model-version", question_agent_model_version),
        ("--answer-agent-model-version", answer_agent_model_version),
    )
    for flag, value in model_flags:
        if value is not None:
            cmd += [flag, value]

    if use_logos_setup:
        cmd.append("--use-logos")

    # Optional numeric limits are stringified for the command line.
    limit_flags = (
        ("--max-qa-cycles", max_qa_cycles),
        ("--max-questions-per-batch", max_questions_per_batch),
    )
    for flag, value in limit_flags:
        if value is not None:
            cmd += [flag, str(value)]

    # mpc_enabled is determined by the config itself, so no flag is passed.

    try:
        completed = subprocess.run(
            cmd,
            cwd=str(here),
            capture_output=False,
            text=True,
        )
        exit_code = completed.returncode
        if exit_code != 0:
            print(f"\n❌ Mode {mode_name} test failed (exit code: {exit_code})")
            return False, exit_code

        print(f"\n✅ Mode {mode_name} test successful")
        return True, None
    except Exception as e:
        print(f"\n❌ Mode {mode_name} test error: {e}")
        return False, str(e)


def get_total_samples(logos_dir, prompt_dir, use_logos=False):
    """Count the prompt files available for the selected setup.

    Args:
        logos_dir: Directory searched for ``*_prompt.txt`` when *use_logos*
            is truthy.
        prompt_dir: Directory searched for ``*.txt`` otherwise.
        use_logos: Selects which directory and filename pattern are used.

    Returns:
        The number of matching files, or ``None`` when the directory does
        not exist or counting raised an exception.
    """
    try:
        if use_logos:
            # Banner-style layout: prompts sit alongside the logos.
            search_root, pattern = Path(logos_dir), "*_prompt.txt"
        else:
            # No-logo layout: prompts live in their own directory.
            search_root, pattern = Path(prompt_dir), "*.txt"

        if not search_root.exists():
            return None

        return sum(1 for _ in search_root.glob(pattern))
    except Exception as e:
        print(f"⚠️ Error getting total samples: {e}")
        return None


def load_config_file(config_path: Path) -> dict:
    """Read and parse the experiment configuration JSON at *config_path*.

    Args:
        config_path: Location of the JSON file (read as UTF-8).

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: If *config_path* does not exist.
    """
    if config_path.exists():
        with config_path.open('r', encoding='utf-8') as handle:
            return json.load(handle)

    raise FileNotFoundError(f"Configuration file not found: {config_path}")


def _find_rag_sample_indices(rag_data_dir, use_logos):
    """Return the numeric sample indices (as strings) found under *rag_data_dir*."""
    if use_logos:
        sample_dir, pattern = rag_data_dir / "poster_logos", "*_prompt.txt"
    else:
        sample_dir, pattern = rag_data_dir / "poster_data_prompt", "*.txt"

    if not sample_dir.exists():
        return []

    indices = []
    for prompt_file in sample_dir.glob(pattern):
        # Logo layout: "002_wildcare_prompt.txt" -> "002".
        # No-logo layout: "002.txt" -> "002".
        index = prompt_file.stem.split('_')[0] if use_logos else prompt_file.stem
        if index.isdigit():
            indices.append(index)
    return indices


def _ensure_rag_data_exists(api_key: str, config_path: str, llm_config_path: str, use_logos: bool, max_qa_cycles: int = 2, max_questions_per_batch: int = 5) -> bool:
    """
    Ensure RAG data exists, generating it with fixed-format experiments if needed.

    Workflow:
    1. Locate the ``rag_data`` directory (next to this script's parent
       directory, or one level further up).
    2. If ``PosterRAGAgent(...).has_data()`` is already true, return.
    3. Otherwise collect numeric sample indices from
       ``rag_data/poster_logos`` (logo setup) or
       ``rag_data/poster_data_prompt`` (no-logo setup).
    4. For every sample and every fixed-format config, run
       ``src/poster_experiments.py`` in a subprocess, writing to
       ``rag_data/sample_{index:03d}/{config_id}/``.
    5. Re-check ``has_data()`` and report the outcome.

    Args:
        api_key: Exported to the subprocess as ``GEMINI_API_KEY``.
        config_path: Path passed as ``--config``; when empty, the default
            ``config/config.ini`` is used.
        llm_config_path: Path passed as ``--llm-config``; when empty, the
            default ``config/config_llm.ini`` is used.
        use_logos: ``True``/``False`` selects the setup; ``None``
            auto-detects from the main ``poster_logos`` folder.
        max_qa_cycles: Forwarded as ``--max-qa-cycles`` when not ``None``.
        max_questions_per_batch: Forwarded as ``--max-questions-per-batch``
            when not ``None``.

    Returns:
        ``True`` when RAG data is available (already present or freshly
        generated), ``False`` otherwise.
    """
    from poster_rag_agent import PosterRAGAgent
    from poster_config import QuestionFormat

    script_dir = Path(__file__).parent.absolute()
    project_root = script_dir.parent

    # Find rag_data directory
    rag_data_dir = project_root / "rag_data"
    if not rag_data_dir.exists():
        rag_data_dir = project_root.parent / "rag_data"

    if not rag_data_dir.exists():
        print(f"[WARN] RAG data directory not found: {rag_data_dir}")
        return False

    # Check if RAG data already exists (questions + embeddings)
    rag_agent = PosterRAGAgent(rag_data_dir, QuestionFormat.FLEXIBLE)
    if rag_agent.has_data():
        print(f"[OK] RAG data exists ({len(rag_agent._qa_database)} questions)")
        return True

    print("\n[WARN] RAG data not found. Generating Fixed format experiments on rag_data samples...")

    rag_data_samples = sorted(_find_rag_sample_indices(rag_data_dir, use_logos))
    if not rag_data_samples:
        print("[ERROR] No samples found in rag_data directory!")
        print(f"  Expected: {rag_data_dir}/poster_logos/ or {rag_data_dir}/poster_data_prompt/")
        return False

    print(f"Found {len(rag_data_samples)} samples in rag_data: {', '.join(rag_data_samples)}")

    # Fixed format configs to generate
    fixed_configs = [
        "Naive_Agent_Fixed_Binary",
        "Naive_Agent_Fixed_MultiChoice",
        "Naive_Agent_Fixed_OpenText"
    ]

    from poster_config import get_config_by_id

    # Everything below up to the loop is identical for every sample and
    # config, so it is resolved once rather than per iteration.
    logos_folder = project_root / "poster_logos"
    if use_logos is None:
        use_logos_setup = logos_folder.exists() and any(logos_folder.glob("*_prompt.txt"))
    else:
        use_logos_setup = use_logos

    # NOTE(review): sample indices are discovered under rag_data/, but the
    # experiments read prompts/logos from the main dataset directories below.
    # Confirm the indices are meant to address the main dataset.
    if use_logos_setup:
        logos_dir = str(logos_folder)
        prompt_dir = str(logos_folder)
    else:
        logos_dir = str(project_root / "poster_data_renamed")
        prompt_dir = str(project_root / "poster_data_prompt_enhanced")
        if not Path(prompt_dir).exists():
            prompt_dir = str(project_root / "poster_data_prompt")

    # Honor the caller-supplied config paths; fall back to the defaults
    # only when they are empty.
    config_dir = project_root / "config"
    config_ini = str(config_path) if config_path else str(config_dir / "config.ini")
    llm_config_ini = str(llm_config_path) if llm_config_path else str(config_dir / "config_llm.ini")

    env = os.environ.copy()
    env["GEMINI_API_KEY"] = api_key
    env["PYTHONIOENCODING"] = "utf-8"

    for rag_sample_index in rag_data_samples:
        print(f"\n  Processing rag_data sample: {rag_sample_index}")

        for config_id in fixed_configs:
            print(f"    Running {config_id} on {rag_sample_index}...")

            # Skip configs the configuration system does not know about.
            if not get_config_by_id(config_id):
                print(f"      [FAIL] Config {config_id} not found")
                continue

            try:
                # Output directory: rag_data/sample_{index}/{config_id}/
                # Format: sample_002, sample_003, etc. (3-digit zero-padded)
                sample_dir_name = f"sample_{int(rag_sample_index):03d}"
                rag_output_dir = rag_data_dir / sample_dir_name / config_id
                rag_output_dir.mkdir(parents=True, exist_ok=True)

                cmd = [
                    sys.executable,
                    str(script_dir / "src" / "poster_experiments.py"),
                    "--logos-dir", logos_dir,
                    "--prompt-dir", prompt_dir,
                    "--answer-base-dir", str(project_root / "poster_data_renamed"),
                    "--config", config_ini,
                    "--llm-config", llm_config_ini,
                    "--mode", config_id,
                    "--num-samples", "1",
                    "--start-index", str(rag_sample_index),
                    "--model-version", "gemini25",
                    "--output-dir", str(rag_output_dir)
                ]

                if use_logos_setup:
                    cmd.append("--use-logos")

                if max_qa_cycles is not None:
                    cmd.extend(["--max-qa-cycles", str(max_qa_cycles)])

                if max_questions_per_batch is not None:
                    cmd.extend(["--max-questions-per-batch", str(max_questions_per_batch)])

                result = subprocess.run(
                    cmd,
                    cwd=str(script_dir),
                    capture_output=False,
                    text=True,
                    env=env,
                )

                if result.returncode == 0:
                    print(f"      [OK] {config_id} completed")
                else:
                    print(f"      [FAIL] {config_id} failed (return code: {result.returncode})")
            except Exception as e:
                # Best effort: one failing run must not stop the others.
                print(f"      [FAIL] {config_id} error: {e}")
                import traceback
                traceback.print_exc()

    # Verify RAG data now exists
    rag_agent = PosterRAGAgent(rag_data_dir, QuestionFormat.FLEXIBLE)
    if rag_agent.has_data():
        print(f"\n[OK] RAG data generated successfully ({len(rag_agent._qa_database)} questions)")
        return True

    print("\n[WARN] RAG data generation completed but no questions found")
    return False


def _uses_logo_setup(script_dir, use_logos):
    """Return *use_logos* when explicit, else auto-detect from the poster_logos folder."""
    if use_logos is not None:
        return use_logos
    logos_folder = script_dir.parent / "poster_logos"
    return logos_folder.exists() and any(logos_folder.glob("*_prompt.txt"))


def main():
    """Parse arguments, resolve settings, and run every selected mode.

    Settings come from the JSON config file (resolved relative to this
    script's directory) and may be overridden on the command line. Modes
    are run through ``run_test`` either sequentially or in a thread pool.
    When a selected mode is RAG-based, RAG data availability is checked
    first. A summary is printed at the end.

    Exits with status 1 when any mode fails or when no mode matches the
    requested selection.
    """
    parser = argparse.ArgumentParser(description="Test all poster generation modes")
    parser.add_argument("--config-file", type=str, default="poster_experiment_config.json",
                        help="Path to experiment configuration JSON file (default: poster_experiment_config.json)")
    parser.add_argument("--modes", nargs="+", default=None,
                        help="Mode names to run, or 'all' (overrides config file)")
    parser.add_argument("--num-samples", type=int, default=None,
                        help="Number of samples per mode (overrides config file)")
    parser.add_argument("--start-index", type=int, default=None,
                        help="Start index (1-based, overrides config file)")
    parser.add_argument("--all-samples", action="store_true",
                        help="Test all available samples (overrides --num-samples and --start-index)")
    parser.add_argument("--max-qa-cycles", type=int, default=None,
                        help="Maximum QA cycles (overrides config file)")
    parser.add_argument("--max-questions-per-batch", type=int, default=None,
                        help="Maximum questions per batch (overrides config file)")
    parser.add_argument("--parallel", action="store_true", default=None,
                        help="Run modes in parallel (overrides config file)")
    parser.add_argument("--max-workers", type=int, default=None,
                        help="Maximum number of parallel workers (overrides config file)")
    parser.add_argument("--python-exe", type=str, default=None,
                        help="Python executable path (overrides config file)")
    parser.add_argument("--use-logos", action="store_true", default=None,
                        help="Force logo-based setup (use poster_logos folder, overrides config file)")
    parser.add_argument("--no-logos", action="store_true", default=False,
                        help="Force no-logo setup (use enhanced/original prompts, overrides config file)")

    args = parser.parse_args()

    script_dir = Path(__file__).parent.absolute()

    # Load configuration file
    config_file_path = Path(__file__).parent / args.config_file
    config_data = {}
    if config_file_path.exists():
        try:
            config_data = load_config_file(config_file_path)
            print(f"✓ Loaded configuration from {config_file_path}")
        except Exception as e:
            print(f"⚠️ Error loading config file: {e}")
            print("  Using command-line arguments and defaults only")
    else:
        print(f"⚠️ Configuration file not found: {config_file_path}")
        print("  Using command-line arguments and defaults only")

    # Every lookup below uses dict access, so reject any other JSON shape.
    if not isinstance(config_data, dict):
        print("⚠️ Error loading config file: top-level JSON value is not an object")
        print("  Using command-line arguments and defaults only")
        config_data = {}

    # Merge config file and command-line arguments (CLI overrides config file)
    modes_spec = config_data.get("modes", "all")
    if args.modes:
        modes_spec = "all" if args.modes == ["all"] else args.modes
    use_logos_config = config_data.get("use_logos")
    model_version = config_data.get("model_version", "gemini25")
    question_agent_model_version = config_data.get("question_agent_model_version")
    answer_agent_model_version = config_data.get("answer_agent_model_version")
    python_exe_config = config_data.get("python_exe")
    num_samples_config = config_data.get("num_samples", 1)
    start_index_config = config_data.get("start_index", 1)
    max_qa_cycles_config = config_data.get("max_qa_cycles")
    max_questions_per_batch_config = config_data.get("max_questions_per_batch")

    # Determine modes to run - use new config system
    all_configs = get_all_modes()
    known_ids = [m["mode"] for m in all_configs]
    if modes_spec == "all":
        requested = []
        modes_to_run = all_configs
    elif isinstance(modes_spec, list):
        # Entries are either plain mode names or dicts with per-mode overrides.
        requested = [m.get("mode") if isinstance(m, dict) else m for m in modes_spec]
        requested_modes = set(requested)
        modes_to_run = [m for m in all_configs if m["mode"] in requested_modes]
        # Apply per-mode parameters if provided
        for mode_config in modes_to_run:
            for item in modes_spec:
                if isinstance(item, dict) and item.get("mode") == mode_config["mode"]:
                    mode_config.update(item)
    else:
        # Single mode string
        requested = [modes_spec]
        modes_to_run = [m for m in all_configs if m["mode"] == modes_spec]

    # Requested names that match nothing would otherwise be dropped silently.
    unknown = [str(m) for m in requested if m not in known_ids]
    if unknown:
        print(f"⚠️ Unknown mode(s) ignored: {', '.join(unknown)}")
    if not modes_to_run:
        print("❌ No matching modes to run")
        print(f"  Available modes: {', '.join(str(m) for m in known_ids)}")
        sys.exit(1)

    # Determine logo usage: CLI overrides config. --use-logos takes
    # precedence when both flags are given; --no-logos forces False.
    if args.use_logos is not None:
        use_logos = args.use_logos
    elif args.no_logos:
        use_logos = False
    else:
        use_logos = use_logos_config
    python_exe = args.python_exe if args.python_exe else python_exe_config
    num_samples = args.num_samples if args.num_samples is not None else num_samples_config
    start_index = args.start_index if args.start_index is not None else start_index_config
    max_qa_cycles = args.max_qa_cycles if args.max_qa_cycles is not None else max_qa_cycles_config
    max_questions_per_batch = args.max_questions_per_batch if args.max_questions_per_batch is not None else max_questions_per_batch_config
    parallel = args.parallel if args.parallel is not None else config_data.get("parallel", False)
    max_workers = args.max_workers if args.max_workers is not None else config_data.get("max_workers", len(modes_to_run))

    # If --all-samples specified, get total and set parameters
    if args.all_samples:
        use_logos_setup = _uses_logo_setup(script_dir, use_logos)

        if use_logos_setup:
            logos_dir = str(script_dir.parent / "poster_logos")
            prompt_dir = logos_dir
        else:
            logos_dir = str(script_dir.parent / "poster_data_renamed")
            prompt_dir = str(script_dir.parent / "poster_data_prompt_enhanced")
            if not Path(prompt_dir).exists():
                prompt_dir = str(script_dir.parent / "poster_data_prompt")

        total = get_total_samples(logos_dir, prompt_dir, use_logos=use_logos_setup)
        if total:
            num_samples = total
            start_index = 1
            print(f"📊 Detected {total} available samples, will test all samples")
        else:
            print("⚠️ Cannot detect total samples, using default values")

    print("=" * 60)
    print("Test All Poster Generation Modes")
    print(f"Total modes to run: {len(modes_to_run)}")
    print(f"Each mode tests {num_samples} samples, starting index: {start_index}")
    if use_logos is not None:
        print(f"Logo setup: {'Logo-based' if use_logos else 'No-logo'} (explicitly set)")
    else:
        print("Logo setup: Auto-detect")
    if max_qa_cycles is not None:
        print(f"Max QA cycles: {max_qa_cycles}")
    if max_questions_per_batch is not None:
        print(f"Max questions per batch: {max_questions_per_batch}")
    if parallel:
        print(f"Parallel execution: Yes (max workers: {max_workers})")
    else:
        print("Parallel execution: No (sequential)")
    print(f"Global model version: {model_version}")
    if question_agent_model_version is not None:
        print(f"Question agent model version: {question_agent_model_version}")
    if answer_agent_model_version is not None:
        print(f"Answer agent model version: {answer_agent_model_version}")
    print("=" * 60)
    print("Modes to run:")
    for i, mode_config in enumerate(modes_to_run, 1):
        mode_name = mode_config.get("name", mode_config["mode"])
        if mode_config["mode"] in ("qa_mode_adaptive", "qa_mode_adaptive_mpc"):
            mpc_status = " (MPC Enabled)" if mode_config.get("mpc_enabled", False) else " (MPC Disabled)"
            print(f"  {i}. {mode_name}{mpc_status}")
        else:
            print(f"  {i}. {mode_name}")
    print()

    # Check for RAG configs and ensure data exists
    rag_configs = [m for m in modes_to_run if m.get("config") and m["config"].is_rag]
    if rag_configs:
        print("\n" + "="*80)
        print("Checking RAG Data Availability")
        print("="*80)

        # Get config paths (same as in run_test)
        config_dir = script_dir.parent / "config"
        config_path = str(config_dir / "config.ini")
        llm_config_path = str(config_dir / "config_llm.ini")

        # Read API key from llm_config_path
        import configparser
        try:
            llm_config = configparser.ConfigParser()
            llm_config.read(llm_config_path)
            api_key = llm_config.get("KEYS", "GEMINI_API_KEY", fallback=None)
            if not api_key:
                print(f"⚠️ GEMINI_API_KEY not found in {llm_config_path}")
                print("  Skipping RAG data generation check")
            else:
                # Ensure RAG data exists
                _ensure_rag_data_exists(
                    api_key=api_key,
                    config_path=config_path,
                    llm_config_path=llm_config_path,
                    use_logos=_uses_logo_setup(script_dir, use_logos),
                    max_qa_cycles=max_qa_cycles if max_qa_cycles is not None else 2,
                    max_questions_per_batch=max_questions_per_batch if max_questions_per_batch is not None else 5
                )
        except Exception as e:
            # The RAG check is best effort; the run proceeds regardless.
            print(f"⚠️ Error checking RAG data: {e}")
            import traceback
            traceback.print_exc()

        print("="*80 + "\n")

    # Use current Python if not specified
    if python_exe is None:
        python_exe = sys.executable

    def launch(mode_config):
        """Run one mode, letting per-mode settings override the global ones."""
        return run_test(
            mode_config,
            mode_config.get("num_samples", num_samples),
            mode_config.get("start_index", start_index),
            mode_config.get("max_qa_cycles", max_qa_cycles),
            mode_config.get("max_questions_per_batch", max_questions_per_batch),
            python_exe,
            use_logos,
            model_version,
            question_agent_model_version,
            answer_agent_model_version
        )

    results = []

    if parallel:
        # Run in parallel; results are collected in completion order.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(launch, mode_config): mode_config
                for mode_config in modes_to_run
            }

            for future in as_completed(futures):
                mode_config = futures[future]
                success, error = future.result()
                results.append((mode_config["name"], success, error))
    else:
        # Run sequentially
        for mode_config in modes_to_run:
            success, error = launch(mode_config)
            results.append((mode_config["name"], success, error))

    # Print summary
    print("\n" + "=" * 60)
    print("Test Summary")
    print("=" * 60)
    for mode_name, success, error in results:
        status = "✅ PASS" if success else f"❌ FAIL ({error})"
        print(f"{mode_name}: {status}")

    # Exit with error if any test failed
    if any(not success for _, success, _ in results):
        sys.exit(1)

if __name__ == "__main__":
    main()
