#!/usr/bin/env python3
"""
RoboPhD Parallel Agent Researcher - Complete Migration from APE
This file contains the full researcher.py implementation for RoboPhD.
Due to size constraints, this will replace the partial researcher.py file.
"""

import argparse
import json
import logging
import os
import random
import re
import shutil
import subprocess
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

# RoboPhD imports - handle both module and script execution
try:
    from .ranking_table import generate_ranking_table, calculate_mean_ranks
    from .config import (
        API_KEY_ENV_VAR,
        CLAUDE_CLI_MODEL_MAP,
        DEFAULT_MODEL,
        SUPPORTED_MODELS
    )
    from .core import SQLGenerator, Evaluator, TestOutputGenerator, DatabaseManager, resolve_api_key
    from .agent_orchestrator import AgentOrchestrator
    from .evolution import EvolutionStrategySelector
except ImportError:
    # When run as a script, use absolute imports
    import sys
    from pathlib import Path
    sys.path.insert(0, str(Path(__file__).parent.parent))
    from RoboPhD.ranking_table import generate_ranking_table, calculate_mean_ranks
    from RoboPhD.config import (
        API_KEY_ENV_VAR,
        CLAUDE_CLI_MODEL_MAP,
        DEFAULT_MODEL,
        SUPPORTED_MODELS
    )
    from RoboPhD.core import SQLGenerator, Evaluator, TestOutputGenerator, DatabaseManager, resolve_api_key
    from RoboPhD.agent_orchestrator import AgentOrchestrator
    from RoboPhD.evolution import EvolutionStrategySelector

# Utilities
import psutil


class MemoryMonitor:
    """Compare system memory utilisation (read via psutil) with a ceiling."""

    def __init__(self, threshold_percent: float = 80.0):
        # Utilisation percentage above which check_memory() reports trouble.
        self.threshold_percent = threshold_percent

    def check_memory(self) -> bool:
        """Return True while memory usage does not exceed the threshold.

        When usage is above the threshold, a two-line warning (current usage
        percentage and the available amount) is printed and False is returned.
        """
        snapshot = psutil.virtual_memory()
        over_limit = snapshot.percent > self.threshold_percent
        if not over_limit:
            return True

        available_gb = snapshot.available / (1024**3)
        print(f"⚠️ Memory usage high: {snapshot.percent:.1f}%")
        print(f"   Available: {available_gb:.1f} GB")
        return False


class ParallelAgentEvolver:
    """Manages agent evolution using Claude."""
    
    def __init__(self,
                 experiment_dir: Path,
                 evolution_model: str = 'opus-4.1',
                 evolution_timeout: int = 1800,
                 evolution_default: str = 'cross_pollination_judgment',
                 evolution_schedule: Optional[Dict[int, str]] = None,
                 evolution_context: bool = True,
                 agents_directory: Optional[str] = None,
                 context_reset_interval: int = 4,
                 evolution_random_pool: Optional[List[str]] = None,
                 evolution_weighted_random: Optional[Dict[str, int]] = None):
        """Initialize the evolver."""
        self.experiment_dir = Path(experiment_dir)
        self.evolution_model = evolution_model
        self.evolution_timeout = evolution_timeout
        self.evolution_default = evolution_default
        self.evolution_schedule = evolution_schedule or {}
        self.evolution_context = evolution_context
        self.agents_directory = agents_directory
        self.context_reset_interval = context_reset_interval
        self.evolution_random_pool = evolution_random_pool or []
        self.evolution_weighted_random = evolution_weighted_random or {}

        # Initialize shuffled random pool if using random strategies
        self.shuffled_random_pool = []
        if self.evolution_random_pool:
            self.shuffled_random_pool = self.evolution_random_pool.copy()
            random.shuffle(self.shuffled_random_pool)
            print(f"🎲 Initialized random evolution pool with {len(self.shuffled_random_pool)} strategies")

        # Validate weighted random pool if provided
        if self.evolution_weighted_random:
            total_percentage = sum(self.evolution_weighted_random.values())
            if total_percentage != 100:
                raise ValueError(f"Weighted random percentages must sum to 100%, got {total_percentage}%")
            print(f"⚖️ Initialized weighted random pool with {len(self.evolution_weighted_random)} strategies")
        
        # Initialize shuffled papers pool for research_driven strategy
        self.available_papers = [
            "bird_methods/askdata_gpt4o/Shkapenyuk_2025_AskData.pdf",
            "bird_methods/chase_sql_gemini/Pourreza_2024_CHASE_SQL.pdf",
            "bird_methods/xiyan_sql/Liu_2024_XiYan_SQL.pdf",
            "bird_methods/csc_sql/Sheng_2025_CSC_SQL.pdf",
            "bird_methods/reasoning_sql_14b/Pourreza_2025_Reasoning_SQL.pdf",
            "bird_methods/opensearch_sql_gpt4o/Xie_2025_OpenSearch_SQL.pdf",
            "bird_methods/omnisql_32b/Li_2025_OmniSQL.pdf",
            "bird_methods/distillery_gpt4o/Maamari_2024_Distillery.pdf",
            "bird_methods/genasql/Donder_2025_GenaSQL.pdf",
            "bird_methods/chess_stanford/Talaei_2024_CHESS.pdf",
        ]
        self.shuffled_papers_pool = self.available_papers.copy()
        random.shuffle(self.shuffled_papers_pool)
        print(f"📚 Initialized papers pool with {len(self.shuffled_papers_pool)} research papers")
        
        # Evolution tracking
        self.evolution_count = 0
        self.context_resets = []
        self.evolution_retries = []
        self.five_hour_limit_incidents = []
        self.restart_from_iteration = None  # Changed from boolean to track specific iteration
        self.force_fresh_next_evolution = False
        self.evolution_history = []
        self.header_repairs = []
        self.is_first_evolution_call = True
        self.evolution_validation_failures = []  # Track validation failures
        
        # Setup paths
        self.evolution_prompts_dir = Path(__file__).parent / "evolution_strategies"
        self.available_strategies = {}
        self._load_evolution_strategies()
        
        # Claude CLI path
        self.claude_path = os.path.expanduser("~/.claude/local/claude")
        if not Path(self.claude_path).exists():
            self.claude_path = "claude"  # Try system PATH
    
    def _load_evolution_strategies(self):
        """Load available evolution strategies from directory."""
        if not self.evolution_prompts_dir.exists():
            print(f"⚠️ Evolution strategies directory not found: {self.evolution_prompts_dir}")
            return
            
        for strategy_file in self.evolution_prompts_dir.glob("*.md"):
            strategy_name = strategy_file.stem
            self.available_strategies[strategy_name] = strategy_file
        
        # Validate configured strategies
        self._validate_strategies()
    
    def _validate_strategies(self):
        """Validate that all configured strategies exist."""
        # Check default strategy
        if self.evolution_default not in ['none', 'random', 'weighted_random']:
            if self.evolution_default not in self.available_strategies:
                raise ValueError(f"Default evolution strategy '{self.evolution_default}' not found in {self.evolution_prompts_dir}")

        # Check scheduled strategies
        if self.evolution_schedule:
            for iteration, strategy in self.evolution_schedule.items():
                if strategy not in ['none', 'random', 'weighted_random']:
                    if strategy not in self.available_strategies:
                        raise ValueError(f"Scheduled strategy '{strategy}' for iteration {iteration} not found")

        # Check random pool strategies
        if self.evolution_random_pool:
            for strategy in self.evolution_random_pool:
                if strategy not in self.available_strategies:
                    raise ValueError(f"Random pool strategy '{strategy}' not found")

        # Check weighted random strategies
        if self.evolution_weighted_random:
            for strategy in self.evolution_weighted_random.keys():
                if strategy not in ['none'] and strategy not in self.available_strategies:
                    raise ValueError(f"Weighted random strategy '{strategy}' not found")
    
    def get_strategy_for_iteration(self, iteration: int) -> Tuple[Optional[str], bool]:
        """
        Get the evolution strategy for a specific iteration.
        
        Returns:
            Tuple of (strategy_name, was_random)
            - strategy_name: The strategy to use (or None to skip)
            - was_random: True if the strategy was selected via random pool
        """
        # Check if iteration has a scheduled strategy
        if iteration in self.evolution_schedule:
            strategy = self.evolution_schedule[iteration]
        else:
            strategy = self.evolution_default
        
        # Handle special strategies
        if strategy == 'none':
            return None, False
        elif strategy == 'random':
            if not self.shuffled_random_pool:
                print(f"⚠️ Random strategy requested but no pool defined")
                return None, False
            # Count how many times random selection has been used
            random_count = sum(1 for entry in self.evolution_history
                             if entry.get('was_random', False))
            # Pick next strategy from shuffled pool based on random_count
            strategy = self.shuffled_random_pool[random_count % len(self.shuffled_random_pool)]
            print(f"🎲 Random strategy selected: {strategy} (random selection #{random_count + 1})")
            return strategy, True  # Mark as randomly selected
        elif strategy == 'weighted_random':
            if not self.evolution_weighted_random:
                print(f"⚠️ Weighted random strategy requested but no weights defined")
                return None, False
            # Use random.choices with weights for true random selection
            strategies = list(self.evolution_weighted_random.keys())
            weights = list(self.evolution_weighted_random.values())
            selected_strategy = random.choices(strategies, weights=weights, k=1)[0]
            weight_percent = self.evolution_weighted_random[selected_strategy]

            # Handle special case where "none" is selected
            if selected_strategy == 'none':
                print(f"⚖️ Weighted random selected: none ({weight_percent}% weight) - skipping evolution")
                return None, 'weighted'  # Return None to skip evolution, but mark as weighted selection

            print(f"⚖️ Weighted random selected: {selected_strategy} ({weight_percent}% weight)")
            return selected_strategy, 'weighted'  # Return 'weighted' to distinguish from deterministic random

        return strategy, False  # Not randomly selected
    
    def list_strategies(self) -> List[str]:
        """List all available evolution strategies."""
        return sorted(list(self.available_strategies.keys()))
    
    def list_all_strategies(self) -> List[str]:
        """List all strategies including special ones."""
        strategies = list(self.available_strategies.keys())
        strategies.append("none")
        strategies.append("random")
        return sorted(strategies)
    
    def create_new_agent(self,
                        agent_pool: Dict,
                        performance_records: Dict,
                        recent_results: Dict,
                        iteration: int,
                        test_history: List,
                        strategy_name: Optional[str] = None,
                        was_random: Union[bool, str] = False) -> Optional[Tuple]:
        """Create a new evolved agent."""
        # If strategy not provided, determine it (for backward compatibility)
        if strategy_name is None:
            strategy_name, was_random = self.get_strategy_for_iteration(iteration)
        
        if strategy_name is None:
            print(f"⏭️ Skipping evolution for iteration {iteration}")
            # Track if this was a weighted random selection of "none"
            if was_random == 'weighted':
                self.evolution_history.append({
                    'iteration': iteration,
                    'strategy': 'none (via weighted_random)',
                    'timestamp': datetime.now().isoformat(),
                    'was_weighted_random': True,
                    'skipped': True
                })
            return None
        
        print(f"\n🧬 EVOLUTION (Iteration {iteration})")
        print(f"Strategy: {strategy_name}")
        
        # Load evolution strategy
        if strategy_name not in self.available_strategies:
            print(f"❌ Strategy '{strategy_name}' not found")
            return None
        
        strategy_content = self.available_strategies[strategy_name].read_text()
        
        # Build evolution prompt
        prompt = self._build_evolution_prompt(
            strategy_content,
            agent_pool,
            performance_records,
            recent_results,
            iteration,
            test_history
        )
        
        # Create evolution workspace
        evolution_dir = self.experiment_dir / "evolution_output" / f"iteration_{iteration:03d}"
        evolution_dir.mkdir(parents=True, exist_ok=True)
        
        # Save the evolution prompt for debugging and transparency
        evolution_prompt_file = evolution_dir / "evolution_prompt.md"
        with open(evolution_prompt_file, 'w') as f:
            f.write(f"# Evolution Prompt for Iteration {iteration}\n\n")
            f.write(f"Strategy: {strategy_name}\n")
            f.write(f"Timestamp: {datetime.now().isoformat()}\n\n")
            f.write("## Prompt sent to Claude:\n\n")
            f.write(prompt)
        
        # Call Claude to generate new agent (from experiment dir for continuity)
        response, used_continue = self._call_claude_cli(prompt, self.experiment_dir, iteration, strategy_name=strategy_name)
        
        if not response:
            print(f"❌ Evolution failed - no response from Claude")
            return None
        
        # Save Claude's response for debugging
        response_file = evolution_dir / "claude_response.txt"
        with open(response_file, 'w') as f:
            f.write(f"# Claude Response for Iteration {iteration}\n\n")
            f.write(f"Strategy: {strategy_name}\n")
            f.write(f"Used Continue: {used_continue}\n")
            f.write(f"Timestamp: {datetime.now().isoformat()}\n\n")
            f.write("## Response:\n\n")
            f.write(response)
        
        # Track evolution
        self.evolution_count += 1
        evolution_entry = {
            'iteration': iteration,
            'strategy': strategy_name,
            'timestamp': datetime.now().isoformat()
        }
        # Handle different randomness types
        if was_random == 'weighted':
            evolution_entry['was_weighted_random'] = True
        elif was_random:
            evolution_entry['was_random'] = True
        self.evolution_history.append(evolution_entry)
        
        # Check if Claude created the required files
        agent_file = evolution_dir / "agent.md"
        eval_instructions_file = evolution_dir / "eval_instructions.md"
        tools_dir = evolution_dir / "tools"

        # Small delay to ensure all files are written (avoid race condition)
        time.sleep(2)

        # Validate all required artifacts
        validation_passed, validation_errors = self._validate_evolution_artifacts(evolution_dir, iteration)

        # Track if we've already retried to prevent infinite loops
        already_retried = False

        if not validation_passed:
            # Track validation failure
            self.evolution_validation_failures.append({
                'iteration': iteration,
                'errors': validation_errors.copy(),
                'used_continue': used_continue
            })

            # Always retry once with fresh context when validation fails
            print(f"⚠️ Evolution validation failed:")
            for error in validation_errors:
                print(f"    - {error}")
            print(f"  Retrying with fresh context...")

            # Retry with fresh context
            retry_response, _ = self._call_claude_cli(prompt, self.experiment_dir, iteration, force_fresh=True, strategy_name=strategy_name)
            already_retried = True

            if retry_response:
                # Save retry response
                retry_response_file = evolution_dir / "claude_response_retry.txt"
                with open(retry_response_file, 'w') as f:
                    f.write(f"# Claude Response for Iteration {iteration} (RETRY)\n\n")
                    f.write(f"Strategy: {strategy_name}\n")
                    f.write(f"Used Continue: False (forced fresh)\n")
                    f.write(f"Retry Reason: {', '.join(validation_errors)}\n")
                    f.write(f"Timestamp: {datetime.now().isoformat()}\n\n")
                    f.write("## Response:\n\n")
                    f.write(retry_response)

                # Re-validate after retry
                retry_validation_passed, retry_errors = self._validate_evolution_artifacts(evolution_dir, iteration)

                if retry_validation_passed:
                    # Track successful retry
                    self.evolution_retries.append({
                        'iteration': iteration,
                        'reason': 'validation_failure',
                        'original_errors': validation_errors,
                        'success': True
                    })
                    # Also track the context reset
                    self.context_resets.append({
                        'iteration': iteration,
                        'evolution_count': self.evolution_count,
                        'reason': 'retry_validation_failure'
                    })
                    print(f"  ✅ Retry successful - all artifacts valid")
                    validation_passed = True  # Update to continue with normal flow
                else:
                    # Track failed retry
                    self.evolution_retries.append({
                        'iteration': iteration,
                        'reason': 'validation_failure',
                        'original_errors': validation_errors,
                        'retry_errors': retry_errors,
                        'success': False
                    })
                    print(f"  ❌ Retry failed - still missing: {', '.join(retry_errors)}")
                    return None
            else:
                # No response from retry
                self.evolution_retries.append({
                    'iteration': iteration,
                    'reason': 'validation_failure',
                    'original_errors': validation_errors,
                    'success': False
                })
                print(f"  ❌ Retry failed - no response from Claude")
                return None

        # If we get here, validation passed (either originally or after retry)
        # All required files exist with proper format

        # Read the agent content
        agent_content = agent_file.read_text()
        
        # Generate agent ID from the content
        agent_id = self._generate_agent_id(agent_content, iteration)
        
        package_info = {
            'type': 'three_artifact',
            'agent_file': agent_file,
            'eval_instructions_file': eval_instructions_file,
            'tools_dir': tools_dir if tools_dir.exists() else None,
            'evolution_dir': evolution_dir
        }
        
        return (agent_content, agent_id, response, package_info)
    
    def _build_evolution_prompt(self,
                               strategy_content: str,
                               agent_pool: Dict,
                               performance_records: Dict,
                               recent_results: Dict,
                               iteration: int,
                               test_history: List) -> str:
        """Build the complete evolution prompt."""
        lines = []
        
        # Add performance data FIRST for context
        lines.append("## Performance Rankings Across All Iterations\n")
        ranking_table = self._generate_ranking_table(test_history, performance_records, for_evolution=True)
        lines.append(ranking_table)
        
        # Add previous iteration summary with per-database breakdown
        lines.append("\n")
        summary = self._get_previous_iteration_summary(iteration - 1, test_history)
        lines.append(summary)
        
        # Add experiment structure
        lines.append("\n## Experiment Directory Structure\n")
        structure = self._get_experiment_structure(iteration - 1)
        lines.append(structure)
        
        # Add agent pool summary
        lines.append("\n## Agent Pool\n")
        pool_summary = self._format_agent_pool_summary(agent_pool, performance_records)
        lines.append(pool_summary)
        
        # NOW add strategy content after context is established
        lines.append("\n## Evolution Strategy: " + strategy_content.split('\n')[0].strip('#').strip())
        lines.append("\n" + strategy_content)
        
        # Add output requirements for file creation
        lines.append("\n## OUTPUT REQUIREMENTS\n")
        lines.append(f"Create the following files in evolution_output/iteration_{iteration:03d}/:\n")
        lines.append("1. **reasoning.md** - Your analysis and improvement strategy")
        lines.append("2. **eval_instructions.md** - SQL generation instructions for the eval model")
        lines.append("3. **agent.md** - Database analysis agent with YAML frontmatter")
        lines.append("4. **tools/*.py** (optional, but recommended) - Python analysis tools\n")
        lines.append("Required agent.md frontmatter:")
        lines.append("```yaml")
        lines.append("---")
        lines.append("name: your-unique-agent-name")
        lines.append("description: Brief description")
        lines.append("---")
        lines.append("```\n")
        lines.append("The agent must write its output to: ./output/agent_output.txt")
        lines.append("Final system prompt will be: [agent_output] + [eval_instructions]")
        
        return "\n".join(lines)
    
    def _generate_ranking_table(self, test_history: List, performance_records: Dict, for_evolution: bool = False) -> str:
        """Delegate to the shared ``generate_ranking_table`` helper.

        All three arguments are forwarded positionally and the helper's
        result is returned unchanged.
        """
        table = generate_ranking_table(test_history, performance_records, for_evolution)
        return table
    
    def _calculate_mean_ranks(self, records: Dict) -> Dict[str, float]:
        """Delegate to the shared ``calculate_mean_ranks`` helper.

        ``records`` is passed through and the helper's result is returned
        unchanged.
        """
        mean_ranks = calculate_mean_ranks(records)
        return mean_ranks
    
    def _get_experiment_structure(self, iteration: int) -> str:
        """Get a structured overview of experiment files for analysis."""
        lines = []
        iter_dir = self.experiment_dir / f"iteration_{iteration:03d}"
        
        if not iter_dir.exists() or iteration < 1:
            return "No previous iteration data available yet."
        
        lines.append("Experiment directory structure with key files:")
        lines.append("")
        lines.append("```")
        lines.append(f"iteration_{iteration:03d}/")
        
        # Group by agent
        agent_dirs = sorted(iter_dir.glob("agent_*"))
        for agent_dir in agent_dirs[:3]:  # Limit to 3 agents for brevity
            agent_name = agent_dir.name
            lines.append(f"  {agent_name}/")
            
            # Show first few databases for each agent
            db_dirs = sorted([d for d in agent_dir.iterdir() if d.is_dir()])
            for db_dir in db_dirs[:2]:  # Show first 2 databases
                db_name = db_dir.name
                lines.append(f"    {db_name}/")
                
                # Check for key files
                system_prompt = db_dir / "output" / "system_prompt.txt"
                evaluation = db_dir / "results" / "evaluation.json"
                
                if system_prompt.exists():
                    lines.append(f"      output/system_prompt.txt  ← Agent's database analysis")
                if evaluation.exists():
                    lines.append(f"      results/evaluation.json    ← Performance metrics")
            
            if len(db_dirs) > 2:
                lines.append(f"    ... and {len(db_dirs) - 2} more databases")
        
        if len(agent_dirs) > 3:
            lines.append(f"  ... and {len(agent_dirs) - 3} more agents")
        
        lines.append("```")
        
        lines.append("")
        lines.append("Key files to review:")
        lines.append("- system_prompt.txt: The analysis and instructions each agent generated")
        lines.append("- evaluation.json: Performance metrics with detailed results")
        lines.append("  * Top level: accuracy (0.0-1.0 decimal), correct count, total_questions")  
        lines.append("  * results: Dictionary keyed by question_id (as string) containing:")
        lines.append("    - question: Natural language question text")
        lines.append("    - evidence: Supplemental hints/context (may be empty string)")
        lines.append("    - ground_truth_sql: Expected SQL query")
        lines.append("    - predicted_sql: Model-generated SQL query")
        lines.append("    - ground_truth_results: Expected query results (array of arrays)")
        lines.append("    - predicted_results: Actual query results (array of arrays)")
        lines.append("    - matches: Boolean indicating if results match (uses set equality, ignores row order)")
        lines.append("    - status: 'match', 'mismatch', 'pred_error', 'pred_timeout', 'gt_error', etc.")
        lines.append("    - predicted_error/ground_truth_error: Error messages if queries failed (null if successful)")
        lines.append("    - verification_info: (optional) Universal SQL verification details:")
        lines.append("      * verification_attempts: Number of verification attempts made (0-k)")
        lines.append("      * verification_outcome: 'passed_immediately', 'improved', 'failed', or 'no_verification'")
        lines.append("      * final_retry_used: Boolean, whether fallback error retry was used after all verification attempts")
        lines.append("      * verification_details: Array of verification attempts, each containing:")
        lines.append("        - sql: The SQL query being verified")
        lines.append("        - summary: Actual query results or error message")
        lines.append("        - feedback: Model's verification decision - either 'CORRECT' or suggested SQL improvement")
        lines.append("        - is_correct: Whether model responded with 'CORRECT' for this attempt")
        lines.append("")
        lines.append("  Understanding verification_info:")
        lines.append("  - The system executes each SQL query and shows results to the model for verification")
        lines.append("  - The model must decide: respond 'CORRECT' if it believes the results look right, or provide improved SQL")
        lines.append("  - 'passed_immediately': Model said 'CORRECT' on first verification")
        lines.append("  - 'improved': Model provided better SQL that eventually passed verification")
        lines.append("  - 'failed': Model kept trying to improve but never got it right")
        lines.append("  - This reveals which questions the model struggles to verify even with result feedback")

        # Get a sample agent name from current iteration for examples
        sample_agent = "AGENT_NAME"
        if agent_dirs:
            # Use the first agent name, removing the 'agent_' prefix for the tool
            sample_agent = agent_dirs[0].name.replace('agent_', '')
        
        # Format the NEXT iteration number (the one being created)
        next_iter_num = f"{iteration + 1:03d}"
        
        lines.append("")
        lines.append("Available tools for analysis:")
        lines.append("- extract_evaluation_errors.py: Efficiently extract errors from evaluation files")
        lines.append("  * Quickly scan all databases for a specific agent (instead of reading files manually)")
        lines.append("  * Returns random sample of up to 30 errors with full context")
        lines.append("  * Usage examples (write to your output directory):")
        lines.append("    # Default: Get all errors (mismatch, pred_error, pred_timeout) from most recent iteration")
        lines.append(f'    python ../../RoboPhD/tools/extract_evaluation_errors.py --agent {sample_agent} \\')
        lines.append(f'      --output-json ./evolution_output/iteration_{next_iter_num}/error_analysis.json')
        lines.append("    ")
        lines.append("    # Show only successful matches")
        lines.append(f'    python ../../RoboPhD/tools/extract_evaluation_errors.py --agent {sample_agent} --status match \\')
        lines.append(f'      --output-json ./evolution_output/iteration_{next_iter_num}/success_analysis.json')
        lines.append("    ")
        lines.append("    # Limit to specific databases")
        lines.append(f'    python ../../RoboPhD/tools/extract_evaluation_errors.py --agent {sample_agent} \\')
        lines.append(f'      --database hockey,retail_complains,works_cycles \\')
        lines.append(f'      --output-json ./evolution_output/iteration_{next_iter_num}/specific_errors.json')
        lines.append("    ")
        lines.append("    # Get only prediction errors with SQL error messages")
        lines.append(f'    python ../../RoboPhD/tools/extract_evaluation_errors.py --agent {sample_agent} --status pred_error \\')
        lines.append(f'      --output-json ./evolution_output/iteration_{next_iter_num}/pred_errors.json')
        
        return "\n".join(lines)
    
    def _get_previous_iteration_summary(self, iteration: int, test_history: List) -> str:
        """Get performance breakdown for the previous iteration by database."""
        if iteration < 1 or not test_history or iteration > len(test_history):
            return "## Previous Iteration Results\n\nNo previous iteration data available yet."
        
        # Get the previous iteration's results
        prev_results = test_history[iteration - 1]  # test_history is 0-indexed
        
        lines = [f"## Previous Iteration Results (Iteration {iteration})"]
        lines.append("")
        
        # Get all databases tested in this iteration
        databases = set()
        for agent_id, agent_data in prev_results.items():
            databases.update(agent_data.get('databases_tested', []))
        
        if not databases:
            lines.append("No database results available.")
            return "\n".join(lines)
        
        databases = sorted(databases)
        agents = sorted(prev_results.keys())
        
        # Create performance table
        lines.append("### Agent Performance by Database")
        lines.append("")
        
        # Build header
        header = "| Agent |"
        for db in databases:
            # Truncate long database names
            db_display = db[:15] + "..." if len(db) > 15 else db
            header += f" {db_display} |"
        header += " Overall |"
        lines.append(header)
        
        # Build separator
        separator = "|-------|"
        for _ in databases:
            separator += "-------|"
        separator += "--------|"
        lines.append(separator)
        
        # Build rows for each agent
        for agent_id in agents:
            agent_data = prev_results[agent_id]
            # Truncate long agent names
            agent_display = agent_id[:25] + "..." if len(agent_id) > 25 else agent_id
            row = f"| {agent_display} |"
            
            # Get per-database performance from the iteration directory
            iter_dir = self.experiment_dir / f"iteration_{iteration:03d}" / f"agent_{agent_id}"
            
            for db in databases:
                db_eval_file = iter_dir / db / "results" / "evaluation.json"
                if db_eval_file.exists():
                    try:
                        import json
                        with open(db_eval_file, 'r') as f:
                            eval_data = json.load(f)
                        accuracy = eval_data.get('accuracy', 0.0)  # Already in percentage
                        row += f" {accuracy:.1f}% |"
                    except:
                        row += " - |"
                else:
                    row += " - |"
            
            # Overall accuracy
            overall = agent_data.get('accuracy', 0.0)
            row += f" {overall:.1f}% |"
            lines.append(row)
        
        lines.append("")
        
        # Add insights section
        lines.append("### Key Insights for Analysis")
        lines.append("- Compare agent performance across databases to identify strengths/weaknesses")
        lines.append("- Focus on databases where agents show significant performance differences")
        lines.append("- Review evaluation.json files for detailed error patterns in challenging databases")
        
        return "\n".join(lines)
    
    def _format_agent_pool_summary(self, agent_pool: Dict, performance_records: Dict) -> str:
        """Format agent pool summary."""
        lines = []
        for agent_id in sorted(agent_pool.keys()):
            perf = performance_records.get(agent_id, {})
            elo_score = perf.get('elo', 1500)
            lines.append(f"- {agent_id}: {perf.get('mean_accuracy', 0):.1f}% (ELO: {elo_score:.0f})")
        return "\n".join(lines)
    
    def _call_claude_cli(self, prompt: str, working_dir: Path, iteration: int, force_fresh: bool = False, strategy_name: Optional[str] = None) -> Tuple[str, bool]:
        """Run the Claude CLI once to generate an agent, with limited automatic recovery.

        Args:
            prompt: Prompt text passed to the CLI via ``--print``.
            working_dir: Directory the CLI process is started in.
            iteration: Current iteration number; exported to the subprocess as
                ``EVOLUTION_ITERATION`` and recorded in reset/retry logs.
            force_fresh: When True, ``--continue`` is never passed and this
                call will not start another "Prompt is too long" retry.
            strategy_name: Optional strategy name; any name containing
                ``"research_driven"`` forces a fresh context.

        Returns:
            ``(stdout, used_continue)`` when the CLI exits with status 0.
            When a fresh-context retry succeeds, ``(response, False)``.
            When a 5-hour limit with a parseable reset time is detected, the
            result of ``_handle_five_hour_limit``. On any other failure,
            timeout, or exception the first element is ``""``.
        """
        cli_model = CLAUDE_CLI_MODEL_MAP.get(self.evolution_model, self.evolution_model)

        cmd = [
            self.claude_path,
            "--model", cli_model
        ]

        # Force context reset for research_driven strategies (they read papers which use lots of context)
        force_reset_for_strategy = (
            strategy_name and "research_driven" in strategy_name
        )

        # Determine if we should use context continuation
        should_reset_context = (
            force_fresh or
            self.is_first_evolution_call or
            self.force_fresh_next_evolution or
            force_reset_for_strategy or
            (self.context_reset_interval > 0 and
             self.evolution_count > 0 and
             self.evolution_count % self.context_reset_interval == 1)
        )

        used_continue = False
        if self.evolution_context and not should_reset_context:
            cmd.extend(["--print", prompt, "--continue"])
            used_continue = True
        else:
            cmd.extend(["--print", prompt])
            # Track context resets (but not the very first call)
            if not self.is_first_evolution_call and self.evolution_context:
                if force_reset_for_strategy:
                    reason = 'research_driven_strategy'
                elif force_fresh:
                    reason = 'force_fresh'
                elif self.force_fresh_next_evolution:
                    reason = '5hr_restart'
                else:
                    reason = 'interval'
                self.context_resets.append({
                    'iteration': iteration,
                    'evolution_count': self.evolution_count,
                    'reason': reason
                })
            # Reset the flag after using it
            if self.force_fresh_next_evolution:
                self.force_fresh_next_evolution = False

        cmd.extend(["--permission-mode", "bypassPermissions"])

        # Set up environment
        env = os.environ.copy()
        if self.agents_directory:
            env['AGENTS_DIRECTORY'] = self.agents_directory
        env['EVOLUTION_ITERATION'] = str(iteration)

        try:
            result = subprocess.run(
                cmd,
                cwd=str(working_dir),
                capture_output=True,
                text=True,
                timeout=self.evolution_timeout,
                env=env
            )

            if result.returncode == 0:
                if self.is_first_evolution_call:
                    self.is_first_evolution_call = False
                return result.stdout, used_continue
            else:
                # Check for 5-hour limit error in both stderr and stdout
                error_output = result.stderr + result.stdout
                prompt_too_long = "Prompt is too long" in result.stdout
                if "5-hour limit reached" in error_output or "resets" in error_output:
                    # Extract reset time from error message - handle various formats
                    # Pattern: "5-hour limit reached ∙ resets 12pm" or similar
                    # Also handle "resets 12pm" without the bullet point
                    match = re.search(r'resets\s*[∙•·]?\s*(\d{1,2}(?:am|pm))', error_output, re.IGNORECASE)
                    if match:
                        reset_time = match.group(1)
                        print(f"🕐 Detected 5-hour limit - reset time: {reset_time}")
                        return self._handle_five_hour_limit(reset_time, iteration, prompt, working_dir)
                    else:
                        # Log if we see "resets" but can't parse the time
                        print(f"⚠️ Possible 5-hour limit detected but couldn't parse reset time")
                        print(f"   Output: {error_output[:200]}")

                elif used_continue or (prompt_too_long and not force_fresh):
                    # Retry without --continue if we used it OR if we got "Prompt is too long".
                    # A call that is already force_fresh never retries again: the retried
                    # command would be identical, so an inherently oversized prompt would
                    # otherwise recurse (and re-invoke the CLI) until RecursionError.
                    if prompt_too_long:
                        print(f"  ⚠️ Prompt is too long, retrying without --continue...")
                    else:
                        print(f"  ⚠️ Evolution failed with context continuation, retrying without --continue...")
                    response, _ = self._call_claude_cli(prompt, working_dir, iteration, force_fresh=True, strategy_name=strategy_name)

                    if response and len(response.strip()) >= 100:
                        # Retry succeeded
                        self.evolution_retries.append({
                            'iteration': iteration,
                            'reason': 'context_overflow_likely',
                            'success': True
                        })
                        print(f"  ✅ Retry successful - evolution completed with fresh context")
                        return response, False
                    else:
                        # Retry also failed
                        self.evolution_retries.append({
                            'iteration': iteration,
                            'reason': 'context_overflow_likely',
                            'success': False
                        })
                        print(f"  ❌ Retry also failed - evolution cannot proceed")
                        return "", False

                print(f"⚠️ Claude returned error:")
                if result.stderr:
                    print(f"   stderr: {result.stderr[:500]}")
                if result.stdout:
                    print(f"   stdout: {result.stdout[:500]}")

                return "", used_continue

        except subprocess.TimeoutExpired:
            print(f"⏱️ Evolution timeout after {self.evolution_timeout}s")
            return "", used_continue
        except Exception as e:
            print(f"❌ Evolution error: {e}")
            return "", used_continue
    
    def _extract_agent_content(self, response: str) -> str:
        """Extract agent content from Claude's response."""
        # Try to find markdown agent definition
        if "---\nname:" in response:
            start = response.find("---\nname:")
            if start != -1:
                return response[start:]
        return response
    
    def _validate_evolution_artifacts(self, evolution_dir: Path, iteration: int) -> Tuple[bool, List[str]]:
        """
        Validate that all required evolution artifacts exist and are properly formatted.

        Args:
            evolution_dir: Directory where evolution artifacts should be
            iteration: Current iteration number

        Returns:
            Tuple of (validation_passed, list_of_errors)
        """
        errors = []

        # Check agent.md exists
        agent_file = evolution_dir / "agent.md"
        if not agent_file.exists():
            errors.append("agent.md not created")
        else:
            # Check agent.md has content and YAML header
            try:
                content = agent_file.read_text()
                if len(content.strip()) < 100:
                    errors.append("agent.md too short (<100 chars)")
                elif not content.strip().startswith('---'):
                    errors.append("agent.md missing YAML header")
                else:
                    # Check for closing YAML delimiter
                    yaml_end = content.find('---', 3)
                    if yaml_end < 0:
                        errors.append("agent.md has incomplete YAML header")
            except Exception as e:
                errors.append(f"agent.md unreadable: {e}")

        # Check eval_instructions.md exists
        eval_file = evolution_dir / "eval_instructions.md"
        if not eval_file.exists():
            errors.append("eval_instructions.md not created")
        else:
            # Check eval_instructions.md has content
            try:
                content = eval_file.read_text()
                if len(content.strip()) < 50:
                    errors.append("eval_instructions.md too short (<50 chars)")
            except Exception as e:
                errors.append(f"eval_instructions.md unreadable: {e}")

        return (len(errors) == 0, errors)

    def _generate_agent_id(self, agent_content: str, iteration: int) -> str:
        """Generate ID for agent based on content."""
        # Try to extract name from frontmatter
        if "name:" in agent_content:
            lines = agent_content.split('\n')
            for line in lines:
                if line.startswith("name:"):
                    name = line.replace("name:", "").strip()
                    # Clean name for filesystem
                    name = name.replace("-", "_").replace(" ", "_")
                    return f"iter{iteration}_{name}"
        
        # Fallback to generic name
        return f"iter{iteration}_evolved_{int(time.time() % 10000)}"

    def _handle_five_hour_limit(self, reset_time_str: str, iteration: int, prompt: str, working_dir: Path) -> Tuple[str, bool]:
        """
        Handle Claude's 5-hour limit by waiting and retrying.

        Args:
            reset_time_str: Reset time from Claude (e.g., "7am", "3pm")
            iteration: Current iteration number
            prompt: The prompt to retry
            working_dir: Working directory for the command

        Returns:
            Tuple of (response, used_continue) or ("", False) if retry fails
        """
        print(f"\n⚠️ Claude 5-hour limit reached at iteration {iteration}")
        print(f"🕐 Reset time: {reset_time_str}")

        # Parse reset time and calculate wait time
        reset_datetime = self._parse_reset_time(reset_time_str)
        wait_until = reset_datetime + timedelta(minutes=1)

        # Find earliest iteration with Phase 1 failures
        restart_iteration = self._find_earliest_iteration_with_phase1_failures(iteration)
        phase1_failure_iterations = []

        # Count failures in each affected iteration for logging
        for iter_num in range(restart_iteration, iteration):
            failures = self._check_phase1_failures_in_iteration(iter_num)
            if failures > 0:
                phase1_failure_iterations.append((iter_num, failures))

        # Log the incident
        incident = {
            'iteration': iteration,
            'timestamp': datetime.now().isoformat(),
            'reset_time': reset_time_str,
            'phase1_failures_detected': len(phase1_failure_iterations),
            'restart_from_iteration': restart_iteration if restart_iteration < iteration else None,
            'retry_successful': False
        }

        if restart_iteration < iteration:
            print(f"⚠️ Detected Phase 1 failures in {len(phase1_failure_iterations)} prior iteration(s)")
            for iter_num, count in phase1_failure_iterations:
                print(f"   Iteration {iter_num}: {count} Phase 1 failure(s)")
            print(f"   These were likely caused by the 5-hour limit")
            print(f"   Will restart from iteration {restart_iteration} after waiting")

        # Wait until 1 minute after reset
        self._wait_until(wait_until)

        # If Phase 1 failures detected, need to handle differently
        if restart_iteration < iteration:
            # Set the specific iteration to restart from
            self.restart_from_iteration = restart_iteration
            # Also set flag to force fresh context on next evolution
            self.force_fresh_next_evolution = True
            incident['retry_successful'] = True  # Will be handled by main loop
            self.five_hour_limit_incidents.append(incident)
            return "", False  # Return empty to trigger restart logic

        # Otherwise, retry the evolution directly
        print(f"🔄 Retrying evolution for iteration {iteration}...")
        result = self._retry_evolution(prompt, working_dir, iteration)

        # Update incident based on retry result
        incident['retry_successful'] = (result[0] != "")
        self.five_hour_limit_incidents.append(incident)

        if incident['retry_successful']:
            print(f"✅ Evolution retry successful for iteration {iteration}")
        else:
            print(f"❌ Evolution retry failed for iteration {iteration}")

        return result

    def _parse_reset_time(self, time_str: str) -> datetime:
        """
        Parse reset time string (e.g., "7am", "3pm") to datetime.

        Args:
            time_str: Time string like "7am" or "3pm"

        Returns:
            datetime object for the reset time (today or tomorrow)
        """
        now = datetime.now()

        # Parse hour and am/pm
        match = re.match(r'(\d+)(am|pm)', time_str.lower())
        if not match:
            # Default to 7:00 AM if parsing fails
            hour = 7
            is_pm = False
        else:
            hour = int(match.group(1))
            is_pm = (match.group(2) == 'pm')

            # Convert to 24-hour format
            if is_pm and hour != 12:
                hour += 12
            elif not is_pm and hour == 12:
                hour = 0

        # Create target time for today
        target_time = now.replace(hour=hour, minute=0, second=0, microsecond=0)

        # If target time has already passed today, use tomorrow
        if now >= target_time:
            target_time += timedelta(days=1)

        return target_time

    def _wait_until(self, target_time: datetime):
        """
        Wait until the target time with periodic status updates.

        Args:
            target_time: datetime to wait until
        """
        now = datetime.now()
        wait_seconds = (target_time - now).total_seconds()

        if wait_seconds <= 0:
            return

        wait_hours = wait_seconds / 3600

        print(f"⏰ Current time: {now.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"⏰ Will retry at: {target_time.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"⏰ Waiting {wait_hours:.1f} hours until Claude limit resets...")

        # Wait with periodic status updates
        while datetime.now() < target_time:
            remaining = (target_time - datetime.now()).total_seconds()

            if remaining <= 0:
                break
            elif remaining > 3600:
                remaining_hours = remaining / 3600
                print(f"   {remaining_hours:.1f} hours remaining...")
                time.sleep(min(3600, remaining))  # Wait up to 1 hour
            elif remaining > 60:
                remaining_mins = remaining / 60
                print(f"   {remaining_mins:.1f} minutes remaining...")
                time.sleep(min(60, remaining))  # Wait up to 1 minute
            else:
                print(f"   {remaining:.0f} seconds remaining...")
                time.sleep(min(10, remaining))  # Wait up to 10 seconds

        print("✅ Claude limit should now be reset!")

    def _check_phase1_failures_in_iteration(self, iteration: int) -> int:
        """
        Count Phase 1 failures for a specific iteration.

        Args:
            iteration: Iteration number to check

        Returns:
            Number of Phase 1 failures in that iteration
        """
        # Need access to researcher's phase1_failures list
        # This will be set from ParallelAgentResearcher
        if hasattr(self, 'researcher_phase1_failures'):
            return sum(1 for _, _, iter_num in self.researcher_phase1_failures if iter_num == iteration)
        return 0

    def _find_earliest_iteration_with_phase1_failures(self, current_iteration: int) -> int:
        """
        Find the earliest iteration in a continuous chain of Phase 1 failures.

        Args:
            current_iteration: The iteration where 5-hour limit was hit

        Returns:
            The earliest iteration with Phase 1 failures to restart from,
            or current_iteration if no failures found
        """
        # Start checking from previous iteration
        earliest_with_failures = current_iteration
        check_iteration = current_iteration - 1

        # Look backwards while we find Phase 1 failures
        while check_iteration > 0:
            failures_count = self._check_phase1_failures_in_iteration(check_iteration)
            if failures_count > 0:
                # Found failures, this might be our restart point
                earliest_with_failures = check_iteration
                check_iteration -= 1  # Keep looking further back
            else:
                # No failures in this iteration, stop looking
                break

        return earliest_with_failures

    def _retry_evolution(self, prompt: str, working_dir: Path, iteration: int) -> Tuple[str, bool]:
        """
        Retry the evolution command after waiting for limit reset.

        Args:
            prompt: The prompt to retry
            working_dir: Working directory for the command
            iteration: Current iteration number

        Returns:
            Tuple of (response, used_continue)
        """
        # Force a fresh call without context continuation
        return self._call_claude_cli(prompt, working_dir, iteration, force_fresh=True)


# Research driver: owns the experiment state and wires together the orchestrator, evolver, SQL generator and evaluator.

class ParallelAgentResearcher:
    """Research system for evolving database analysis agents."""
    
    def __init__(self,
                 dataset: str = 'train',
                 num_iterations: int = 10,
                 agents_per_iteration: int = 3,
                 databases_per_iteration: int = 8,
                 questions_per_database: int = 40,
                 eval_model: str = 'sonnet-4',
                 analysis_model: str = 'sonnet-4',
                 evolution_model: str = 'opus-4.1',
                 max_concurrent_dbs: int = 8,
                 random_seed: Optional[int] = None,
                 phase1_timeout: int = 1800,
                 sql_timeout: int = 3600,
                 evolution_timeout: int = 1800,
                 evolution_default: str = 'cross_pollination_judgment',
                 evolution_schedule: Optional[Dict[int, str]] = None,
                 evolution_context: bool = True,
                 resume_mode: bool = False,
                 resume_from_iteration: Optional[int] = None,
                 resume_checkpoint: Optional[Dict] = None,
                 resume_experiment_dir: Optional[Path] = None,
                 dev_eval_mode: bool = False,
                 test_eval_mode: bool = False,
                 custom_experiment_name: Optional[str] = None,
                 agents_directory: Optional[str] = None,
                 number_of_used_papers: int = 0,
                 context_reset_interval: int = 4,
                 evolution_random_pool: Optional[List[str]] = None,
                 evolution_weighted_random: Optional[Dict[str, int]] = None,
                 api_key: Optional[str] = None,
                 verification_retries: int = 2,
                 temperature_strategy: str = "progressive"):
        """Initialize the parallel agent researcher.

        Stores the run configuration, then either restores experiment state
        from ``resume_checkpoint`` (when ``resume_mode`` is True) or creates a
        fresh experiment directory and empty state. Afterwards it seeds the
        ``random`` module, builds the orchestrator, evolver, SQL generator and
        evaluator, and loads the question/database data via ``_load_data``.

        Notable arguments:
            random_seed: Seed for a fresh run. ``None`` picks a random seed;
                any integer, including 0, is used as given. Ignored on resume,
                where the seed is derived from the checkpoint.
            resume_mode / resume_checkpoint / resume_experiment_dir /
            resume_from_iteration: Resume controls. On resume the checkpoint
                must contain ``agent_pool``, ``performance_records``,
                ``test_history`` and ``random_seed``.
            dev_eval_mode / test_eval_mode / custom_experiment_name: When an
                evaluation mode is set together with a custom name, the
                experiment lives under ``robophd_evaluation/<name>``;
                otherwise under ``research/robophd_<timestamp>``.
            api_key: Required key used for SQL generation.

        Raises:
            ValueError: If ``api_key`` is missing or empty.
        """
        # Core parameters
        self.dataset = dataset
        self.num_iterations = num_iterations
        self.agents_per_iteration = agents_per_iteration
        self.databases_per_iteration = databases_per_iteration
        self.questions_per_database = questions_per_database
        self.agents_directory = agents_directory

        # Models
        self.eval_model = eval_model
        self.analysis_model = analysis_model
        self.evolution_model = evolution_model

        # Concurrency and timeouts
        self.max_concurrent_dbs = max_concurrent_dbs
        self.phase1_timeout = phase1_timeout
        self.sql_timeout = sql_timeout
        self.evolution_timeout = evolution_timeout

        # Verification settings
        self.verification_retries = verification_retries
        self.temperature_strategy = temperature_strategy

        # Handle resume mode
        self.resume_mode = resume_mode
        self.resume_from_iteration = resume_from_iteration

        if not api_key:
            # ValueError is still caught by callers handling ``Exception``
            raise ValueError("must pass api key to use for sql generation")

        self.api_key = api_key

        if resume_mode:
            # Restore state from checkpoint
            self.experiment_dir = resume_experiment_dir
            self.agent_pool = self._restore_agent_pool(resume_checkpoint['agent_pool'])
            self.performance_records = resume_checkpoint['performance_records']
            self.test_history = resume_checkpoint['test_history']
            self.total_cost = resume_checkpoint.get('total_cost', 0.0)
            self.iteration_costs = resume_checkpoint.get('iteration_costs', [])
            self.iteration_times = resume_checkpoint.get('iteration_times', [])
            self.phase1_failures = resume_checkpoint.get('phase1_failures', [])
            self.zero_accuracy_cases = resume_checkpoint.get('zero_accuracy_cases', [])
            self.five_hour_limit_incidents = resume_checkpoint.get('five_hour_limit_incidents', [])
            self.number_of_used_papers = resume_checkpoint.get('number_of_used_papers', 0)

            if resume_from_iteration:
                self.archive_iterations(resume_from_iteration)

            # Derive a per-iteration seed from the original one so a resumed
            # run does not replay the random sequence of the first iteration.
            self.original_seed = resume_checkpoint['random_seed']
            last_completed = resume_checkpoint.get('last_completed_iteration', len(resume_checkpoint.get('test_history', [])))
            current_iteration = resume_from_iteration if resume_from_iteration else last_completed + 1
            self.random_seed = (self.original_seed + current_iteration * 10000) % (2**32)
            random.seed(self.random_seed)
            print(f"🎲 Resume seed: {self.random_seed}")
        else:
            # Normal initialization
            # Compare against None: 0 is a valid, explicitly requested seed.
            if random_seed is not None:
                self.random_seed = random_seed
            else:
                self.random_seed = random.randint(0, 10000)
            random.seed(self.random_seed)

            # Setup experiment directory
            if (dev_eval_mode or test_eval_mode) and custom_experiment_name:
                self.experiment_dir = Path("robophd_evaluation") / custom_experiment_name
            else:
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                self.experiment_dir = Path("research") / f"robophd_{timestamp}"
            self.experiment_dir.mkdir(parents=True, exist_ok=True)

            # Store evaluation modes
            self.dev_eval_mode = dev_eval_mode
            self.test_eval_mode = test_eval_mode

            # Create symlink to papers directory
            papers_source = Path(__file__).parent.parent / "papers"
            papers_link = self.experiment_dir / "papers"
            if papers_source.exists() and not papers_link.exists():
                try:
                    os.symlink(papers_source.absolute(), papers_link.absolute())
                except (OSError, NotImplementedError):
                    # Best effort: the link is a convenience, so platforms or
                    # permissions that forbid symlinks must not abort the run.
                    pass

            # Initialize state
            self.agent_pool = {}
            self.performance_records = {}
            self.test_history = []
            self.total_cost = 0.0
            self.iteration_costs = []
            self.iteration_times = []
            self.phase1_failures = []
            self.zero_accuracy_cases = []
            self.five_hour_limit_incidents = []
            self.number_of_used_papers = number_of_used_papers

        self.debug = False

        # Ensure test_eval_mode and dev_eval_mode are always set (for both resume and fresh starts)
        if not hasattr(self, 'test_eval_mode'):
            self.test_eval_mode = test_eval_mode
        if not hasattr(self, 'dev_eval_mode'):
            self.dev_eval_mode = dev_eval_mode

        # Initialize components
        self.orchestrator = AgentOrchestrator(
            base_experiment_dir=self.experiment_dir,
            analysis_model=analysis_model,
            timeout_phase1=phase1_timeout
        )

        # Setup evolver with proper settings
        if resume_mode and resume_checkpoint and 'experiment_config' in resume_checkpoint:
            config = resume_checkpoint['experiment_config']
            actual_evolution_default = config.get('evolution_default', evolution_default)

            if evolution_schedule is not None and isinstance(evolution_schedule, dict) and evolution_schedule:
                # An explicitly passed, non-empty schedule overrides the saved one
                actual_evolution_schedule = evolution_schedule
            else:
                saved_schedule = config.get('evolution_schedule', {})
                if saved_schedule and isinstance(saved_schedule, dict):
                    # JSON turns integer keys into strings; convert digit keys back
                    actual_evolution_schedule = {
                        int(k) if isinstance(k, str) and k.isdigit() else k: v
                        for k, v in saved_schedule.items()
                    }
                else:
                    actual_evolution_schedule = saved_schedule

            actual_evolution_context = config.get('evolution_context', evolution_context)
            actual_evolution_random_pool = config.get('evolution_random_pool', [])
            actual_evolution_weighted_random = config.get('evolution_weighted_random', {})
        else:
            actual_evolution_default = evolution_default
            actual_evolution_schedule = evolution_schedule
            actual_evolution_context = evolution_context
            actual_evolution_random_pool = evolution_random_pool
            actual_evolution_weighted_random = evolution_weighted_random

        # NOTE(review): actual_evolution_weighted_random is computed above but the
        # evolver receives the evolution_weighted_random argument instead, so the
        # checkpointed value is ignored on resume. Left unchanged because the
        # inline comment suggests this is intentional - confirm with the caller.
        self.evolver = ParallelAgentEvolver(
            experiment_dir=self.experiment_dir,
            evolution_model=evolution_model,
            evolution_timeout=evolution_timeout,
            evolution_default=actual_evolution_default,
            evolution_schedule=actual_evolution_schedule,
            evolution_context=actual_evolution_context,
            agents_directory=agents_directory,
            context_reset_interval=context_reset_interval,
            evolution_random_pool=actual_evolution_random_pool if resume_mode else evolution_random_pool,
            evolution_weighted_random=evolution_weighted_random  # Use the potentially modified version
        )

        # Pass phase1_failures reference to evolver for 5-hour limit handling
        self.evolver.researcher_phase1_failures = self.phase1_failures

        # Restore evolver state if resuming
        if resume_mode and resume_checkpoint and 'experiment_config' in resume_checkpoint:
            config = resume_checkpoint['experiment_config']
            self.evolver.evolution_count = config.get('evolution_count', 0)
            self.evolver.context_resets = config.get('context_resets', [])
            self.evolver.evolution_retries = config.get('evolution_retries', [])
            self.evolver.evolution_history = config.get('evolution_history', [])
            self.evolver.header_repairs = config.get('header_repairs', [])
            self.evolver.evolution_validation_failures = config.get('evolution_validation_failures', [])
            self.evolver.five_hour_limit_incidents = self.five_hour_limit_incidents
            if 'shuffled_random_pool' in config:
                self.evolver.shuffled_random_pool = config['shuffled_random_pool']
            if 'shuffled_papers_pool' in config:
                self.evolver.shuffled_papers_pool = config['shuffled_papers_pool']

        # Apply pending evolution reset if needed (from archive_iterations)
        if hasattr(self, '_pending_evolution_reset'):
            self._reset_evolution_tracking_for_iteration(self._pending_evolution_reset)
            delattr(self, '_pending_evolution_reset')

        self.memory_monitor = MemoryMonitor()

        # Load data
        self._load_data()

        # Initialize SQL generator and evaluator
        self.sql_generator = SQLGenerator(
            eval_model=eval_model,
            questions_file=self.questions_file,
            timeout=sql_timeout,
            use_evidence=True,
            api_key=self.api_key,
            verification_retries=self.verification_retries,
            temperature_strategy=self.temperature_strategy
        )

        self.evaluator = Evaluator(
            questions_file=self.questions_file,
            db_root=self.db_root
        )

        # Initialize test output generator for test-eval mode
        if self.test_eval_mode:
            self.test_output_generator = TestOutputGenerator()

        print(f"\n🔬 RoboPhD Parallel Agent Researcher initialized")
        print(f"📂 Experiment directory: {self.experiment_dir}")
        print(f"🎲 Random seed: {self.random_seed}")
        # Report the setting the evolver actually received (may come from the checkpoint)
        print(f"🔗 Evolution context: {'enabled' if actual_evolution_context else 'disabled'}")
    
    def _load_data(self):
        """Load the question set and discover the usable databases.

        Sets the following attributes:
            questions_file / db_root: Dataset paths chosen from ``self.dataset``
                (``'train'``, ``'test'``; any other value selects the dev set).
            all_questions: Parsed contents of the questions JSON file. Entries
                without a ``question_id`` get their list index assigned.
            questions_by_db: Questions grouped by their ``db_id``.
            databases: Sorted names of the directories under ``db_root`` that
                are not blacklisted and contain ``<name>/<name>.sqlite``.
        """
        # Determine paths based on dataset
        if self.dataset == 'train':
            self.questions_file = Path("benchmark_resources/datasets/train/train/train.json")
            self.db_root = Path("benchmark_resources/datasets/train/train/train_databases")
        elif self.dataset == 'test':
            self.questions_file = Path("benchmark_resources/datasets/test/test/test.json")
            self.db_root = Path("benchmark_resources/datasets/test/test/test_databases")
        else:  # dev
            self.questions_file = Path("benchmark_resources/datasets/dev/dev_20240627/dev.json")
            self.db_root = Path("benchmark_resources/datasets/dev/dev_20240627/dev_databases")

        # Load questions. JSON text is read as UTF-8 explicitly so the result
        # does not depend on the platform's default encoding.
        with open(self.questions_file, 'r', encoding='utf-8') as f:
            self.all_questions = json.load(f)

        # Group questions by database
        self.questions_by_db = {}
        for idx, q in enumerate(self.all_questions):
            # Add question_id if missing (train dataset doesn't have it), using array index
            if 'question_id' not in q:
                q['question_id'] = idx

            self.questions_by_db.setdefault(q['db_id'], []).append(q)

        # Get available databases (excluding problematic ones)
        excluded_dbs = DatabaseManager.BLACKLISTED_DATABASES
        self.databases = []

        if self.db_root.exists():
            # iterdir() yields entries in arbitrary order; sorting keeps the
            # list (and anything sampled from it with a fixed seed) stable
            # across machines and filesystems.
            self.databases = sorted(
                db_dir.name
                for db_dir in self.db_root.iterdir()
                if db_dir.is_dir()
                and db_dir.name not in excluded_dbs
                and (db_dir / f"{db_dir.name}.sqlite").exists()
            )

        print(f"📊 Loaded {len(self.all_questions)} questions from {len(self.databases)} databases")
    
    @classmethod
    def load_checkpoint(cls, experiment_dir: Path) -> Dict:
        """Read and parse ``checkpoint.json`` from an experiment directory.

        Args:
            experiment_dir: Directory expected to contain ``checkpoint.json``.

        Returns:
            The decoded JSON content of the checkpoint file.

        Raises:
            FileNotFoundError: If ``checkpoint.json`` is not present.
        """
        checkpoint_file = experiment_dir / 'checkpoint.json'
        if checkpoint_file.exists():
            with open(checkpoint_file, 'r') as handle:
                return json.load(handle)

        raise FileNotFoundError(f"No checkpoint found at {checkpoint_file}")
    
    def _restore_agent_pool(self, pool_data: Dict) -> Dict:
        """Restore agent pool from checkpoint data."""
        restored_pool = {}
        for agent_id, agent_info in pool_data.items():
            # Convert path string back to Path object
            path = Path(agent_info['path']) if isinstance(agent_info['path'], str) else agent_info['path']
            
            # Read current content from file
            content = path.read_text() if path.exists() else agent_info.get('content', '')
            
            # Determine package directory (handle both old and new checkpoint formats)
            if 'package_dir' in agent_info:
                # New format - has package_dir saved
                package_dir = Path(agent_info['package_dir']) if isinstance(agent_info['package_dir'], str) else agent_info['package_dir']
            else:
                # Old format - reconstruct from agent path
                # Path is like: research/robophd_20250830_223700/agents/iter3_defensive_schema_analyzer/agent.md
                package_dir = path.parent if path.name == 'agent.md' else path.parent
            
            # Check if this is a three-artifact package
            eval_instructions_file = package_dir / 'eval_instructions.md'
            tools_dir = package_dir / 'tools'
            
            # Build the restored agent info with three-artifact structure
            restored_agent = {
                'path': path,
                'content': content,
                'source': agent_info.get('source', 'restored'),
                'created_iteration': agent_info.get('created_iteration', 0),
                'evolution_strategy': agent_info.get('evolution_strategy', None),  # Restore evolution strategy
                'package_dir': package_dir,
                'package_type': 'three_artifact'  # We only support three-artifact now
            }
            
            # Add three-artifact specific fields if they exist
            if eval_instructions_file.exists():
                restored_agent['eval_instructions_file'] = eval_instructions_file
            
            if tools_dir.exists() and tools_dir.is_dir():
                restored_agent['tools_dir'] = tools_dir
            
            restored_pool[agent_id] = restored_agent
        
        return restored_pool
    
    def archive_iterations(self, from_iteration: int):
        """Archive all artifacts and bookkeeping from ``from_iteration`` onwards.

        On disk, the following are moved into a timestamped
        ``archived_<YYYYmmdd_HHMMSS>/`` directory under ``self.experiment_dir``:

        * ``iteration_<n>/`` directories with ``n >= from_iteration``
        * ``evolution_output/iteration_<n>/`` directories with ``n >= from_iteration``
        * ``agents/<agent_id>/`` of pool agents whose ``created_iteration`` is
          ``>= from_iteration`` (these agents are also removed from
          ``self.agent_pool``; their performance records are kept)
        * orphaned ``agents/iter<n>_*`` directories with ``n >= from_iteration``
          that are not in the pool

        The archive directory is created lazily, so agents can be archived even
        when no iteration directories exist.

        When ``from_iteration > 1`` the in-memory state is trimmed to match:
        ``test_history``, per-agent ``iteration_results`` (with summary
        statistics recomputed), ELO scores, iteration costs/times and
        ``total_cost``, Phase 1 failure and zero-accuracy records, and the
        evolution tracking state.

        Args:
            from_iteration: First iteration to archive; iteration ``k`` is
                stored at index ``k - 1`` of the per-iteration history lists.
        """
        import shutil
        from datetime import datetime

        def _iteration_dirs(root):
            # 'iteration_<n>' sub-directories of root with n >= from_iteration.
            found = []
            for item in root.iterdir():
                if item.is_dir() and item.name.startswith('iteration_'):
                    try:
                        iter_num = int(item.name.split('_')[1])
                    except (IndexError, ValueError):
                        continue
                    if iter_num >= from_iteration:
                        found.append(item)
            return found

        iterations_to_archive = _iteration_dirs(self.experiment_dir)

        evolution_output_dir = self.experiment_dir / "evolution_output"
        evolution_dirs_to_archive = (
            _iteration_dirs(evolution_output_dir) if evolution_output_dir.exists() else []
        )

        # Compute the archive location up front so every branch below can use
        # it; the directory itself is only created when something is archived.
        archive_dir = self.experiment_dir / f"archived_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # Archive iterations if any
        if iterations_to_archive:
            archive_dir.mkdir(exist_ok=True)
            print(f"📦 Archiving {len(iterations_to_archive)} iterations to {archive_dir.name}/")
            for iteration_dir in iterations_to_archive:
                dest = archive_dir / iteration_dir.name
                print(f"  Moving {iteration_dir.name} to archive...")
                # Use copytree + rmtree for consistency and reliability
                shutil.copytree(str(iteration_dir), str(dest), dirs_exist_ok=True, symlinks=True)
                shutil.rmtree(str(iteration_dir))

        # Archive evolution_output directories if any
        if evolution_dirs_to_archive:
            archive_dir.mkdir(exist_ok=True)
            print(f"📦 Archiving {len(evolution_dirs_to_archive)} evolution_output directories to {archive_dir.name}/")
            archive_evolution = archive_dir / "evolution_output"
            archive_evolution.mkdir(exist_ok=True)
            for evo_dir in evolution_dirs_to_archive:
                dest = archive_evolution / evo_dir.name
                print(f"  Moving evolution_output/{evo_dir.name} to archive...")
                # Use copytree + rmtree instead of move to ensure all contents are archived
                # This is more reliable for complex directory structures with subdirectories
                shutil.copytree(str(evo_dir), str(dest), dirs_exist_ok=True, symlinks=True)
                shutil.rmtree(str(evo_dir))

        # Archive and remove agents created in archived iterations
        agents_to_archive = []
        agents_archive_dir = None

        if hasattr(self, 'agent_pool'):
            agents_to_archive = [
                agent_id for agent_id, agent_info in self.agent_pool.items()
                if agent_info.get('created_iteration', 0) >= from_iteration
            ]

            if agents_to_archive:
                print(f"📦 Archiving {len(agents_to_archive)} agents created in iterations {from_iteration}+")

                # parents=True: the archive directory may not exist yet when
                # there were no iteration directories to archive.
                agents_archive_dir = archive_dir / 'agents'
                agents_archive_dir.mkdir(parents=True, exist_ok=True)

                # Move agent directories to archive (removes from original location)
                agents_dir = self.experiment_dir / "agents"
                for agent_id in agents_to_archive:
                    src = agents_dir / agent_id
                    if src.exists():
                        dest = agents_archive_dir / agent_id
                        print(f"  Moving agent {agent_id} to archive...")
                        shutil.move(str(src), str(dest))

                    # Remove from agent pool
                    del self.agent_pool[agent_id]

                    # Don't delete from performance_records, just clean iteration_results
                    # This preserves the agent's history up to the archive point

                print(f"  🧹 Removed {len(agents_to_archive)} agents from active pool and agents/ directory")

        # Also check for orphaned agent directories (created but not in pool)
        # These can occur when evolution fails after creating directories
        agents_dir = self.experiment_dir / "agents"
        if agents_dir.exists():
            orphaned_agents = []
            for agent_dir in agents_dir.iterdir():
                if agent_dir.is_dir() and agent_dir.name.startswith('iter'):
                    # Extract iteration number from agent name (e.g., iter25_resilient_fusion -> 25)
                    try:
                        iter_num = int(agent_dir.name.split('_')[0].replace('iter', ''))
                    except (ValueError, IndexError):
                        continue
                    if iter_num >= from_iteration and agent_dir.name not in agents_to_archive:
                        orphaned_agents.append(agent_dir.name)

            if orphaned_agents:
                print(f"📦 Archiving {len(orphaned_agents)} orphaned agents from iterations {from_iteration}+")
                # Ensure agents archive directory exists
                if agents_archive_dir is None:
                    agents_archive_dir = archive_dir / 'agents'
                    agents_archive_dir.mkdir(parents=True, exist_ok=True)

                for agent_id in orphaned_agents:
                    src = agents_dir / agent_id
                    dest = agents_archive_dir / agent_id
                    print(f"  Moving orphaned agent {agent_id} to archive...")
                    shutil.move(str(src), str(dest))
                print(f"  🧹 Removed {len(orphaned_agents)} orphaned agents from agents/ directory")

        # Trim data arrays to remove archived iterations
        if from_iteration > 1:
            self.test_history = self.test_history[:from_iteration - 1]

            # Clean up iteration_results in performance_records for archived iterations
            # This prevents duplicate entries when resuming
            for record in self.performance_records.values():
                if 'iteration_results' not in record:
                    continue

                # Remove any results from archived iterations
                cleaned_results = [
                    result for result in record['iteration_results']
                    if result.get('iteration', 0) < from_iteration
                ]
                record['iteration_results'] = cleaned_results

                # Recalculate summary statistics based on cleaned results
                if cleaned_results:
                    total_correct = sum(r.get('correct', 0) for r in cleaned_results if 'correct' in r)
                    total_questions = sum(r.get('total', 0) for r in cleaned_results if 'total' in r)
                    # If we don't have correct/total, calculate from accuracy
                    if total_questions == 0:
                        for r in cleaned_results:
                            if 'accuracy' in r and 'databases' in r:
                                # Estimate based on questions per database
                                questions = r['databases'] * self.questions_per_database
                                total_questions += questions
                                total_correct += int(questions * r['accuracy'] / 100)

                    record['test_count'] = len(cleaned_results)
                    record['total_correct'] = total_correct
                    record['total_questions'] = total_questions
                    if total_questions > 0:
                        record['mean_accuracy'] = (total_correct / total_questions) * 100
                else:
                    # No results left, reset statistics
                    record['test_count'] = 0
                    record['total_correct'] = 0
                    record['total_questions'] = 0
                    record['mean_accuracy'] = 0

            # Recalculate all ELO scores from the cleaned test_history
            # This ensures consistency after archiving
            print("  🎲 Recalculating ELO scores from cleaned test history...")
            self._recalculate_all_elo_scores()

            # Subtract cost/time of archived iterations from totals
            if len(self.iteration_costs) >= from_iteration:
                archived_cost = sum(self.iteration_costs[from_iteration - 1:])
                archived_time = sum(self.iteration_times[from_iteration - 1:])

                self.total_cost -= archived_cost
                self.iteration_costs = self.iteration_costs[:from_iteration - 1]
                self.iteration_times = self.iteration_times[:from_iteration - 1]

                print(f"  💰 Subtracted archived cost: ${archived_cost:.2f}")
                print(f"  ⏱️  Subtracted archived time: {archived_time/60:.1f} minutes")

            # Clear failure records for archived iterations
            original_failures = len(self.phase1_failures) if hasattr(self, 'phase1_failures') else 0
            if hasattr(self, 'phase1_failures'):
                self.phase1_failures = [
                    (agent_id, db_name, iter_num)
                    for agent_id, db_name, iter_num in self.phase1_failures
                    if iter_num < from_iteration
                ]

            original_zero_cases = len(self.zero_accuracy_cases) if hasattr(self, 'zero_accuracy_cases') else 0
            if hasattr(self, 'zero_accuracy_cases'):
                self.zero_accuracy_cases = [
                    (agent_id, db_name, iter_num, total_q)
                    for agent_id, db_name, iter_num, total_q in self.zero_accuracy_cases
                    if iter_num < from_iteration
                ]

            if original_failures > 0 and original_failures != len(self.phase1_failures):
                print(f"  🧹 Cleared {original_failures - len(self.phase1_failures)} Phase 1 failure records")
            if original_zero_cases > 0 and original_zero_cases != len(self.zero_accuracy_cases):
                print(f"  🧹 Cleared {original_zero_cases - len(self.zero_accuracy_cases)} zero accuracy records")

            # Note: Paper tracking is handled automatically via evolution_history cleanup
            # No need to adjust number_of_used_papers as it's the base count

            # Reset evolution tracking to match the new starting point
            self._reset_evolution_tracking_for_iteration(from_iteration)
    
    def _reset_evolution_tracking_for_iteration(self, from_iteration: int):
        """
        Reset evolution tracking when restarting from a specific iteration.
        
        This ensures that:
        1. Evolution count is properly adjusted
        2. Evolution history is trimmed to match archived iterations
        3. The evolver's first_evolution_call flag is reset appropriately
        
        Args:
            from_iteration: The iteration we're restarting from
        """
        # Only reset if evolver exists (it won't exist yet during __init__)
        if not hasattr(self, 'evolver'):
            # Store the reset request for later when evolver is initialized
            self._pending_evolution_reset = from_iteration
            return
            
        # Calculate how many evolutions occurred before from_iteration
        evolutions_before = 0
        for hist_entry in self.evolver.evolution_history:
            if hist_entry['iteration'] < from_iteration:
                if hist_entry['strategy'].lower() not in ['none', 'skip']:
                    evolutions_before += 1
        
        # Reset evolution count to match what it should be at from_iteration - 1
        self.evolver.evolution_count = evolutions_before
        
        # Trim evolution history to remove archived iterations
        # This ensures we don't have duplicate entries when re-running iterations
        self.evolver.evolution_history = [
            entry for entry in self.evolver.evolution_history
            if entry['iteration'] < from_iteration
        ]

        # Count how many random selections we're keeping
        # This preserves the random selection sequence
        remaining_random_count = sum(1 for entry in self.evolver.evolution_history
                                    if entry.get('was_random', False))
        print(f"     Keeping {remaining_random_count} random selections in history")
        
        # Trim context resets and retries similarly
        self.evolver.context_resets = [
            reset for reset in self.evolver.context_resets
            if reset.get('iteration', 999) < from_iteration
        ]
        
        self.evolver.evolution_retries = [
            retry for retry in self.evolver.evolution_retries
            if retry.get('iteration', 999) < from_iteration
        ]

        # Trim validation failures and header repairs
        if hasattr(self.evolver, 'evolution_validation_failures'):
            self.evolver.evolution_validation_failures = [
                failure for failure in self.evolver.evolution_validation_failures
                if failure.get('iteration', 999) < from_iteration
            ]

        if hasattr(self.evolver, 'header_repairs'):
            self.evolver.header_repairs = [
                repair for repair in self.evolver.header_repairs
                if repair.get('iteration', 999) < from_iteration
            ]
        
        # Reset the first evolution call flag based on whether we've done any evolutions
        # If we're at iteration 9 and have done evolutions, this should be False
        self.evolver.is_first_evolution_call = (evolutions_before == 0)
        
        print(f"  🔄 Reset evolution tracking:")
        print(f"     Evolution count: {self.evolver.evolution_count}")
        print(f"     Evolution history entries: {len(self.evolver.evolution_history)}")
        print(f"     First evolution call: {self.evolver.is_first_evolution_call}")
    
    def load_initial_agents(self, agent_list: Optional[List[str]] = None):
        """
        Copy the starting agents into the experiment and register them.

        Agent packages are read from ``self.agents_directory`` when it is set,
        otherwise from the ``agents`` directory next to this file. A package
        is a directory containing ``agent.md``. Each one is copied to
        ``<experiment_dir>/agents/<agent_id>/`` (replacing any existing copy),
        added to ``self.agent_pool`` and given a fresh entry in
        ``self.performance_records``.

        Args:
            agent_list: Optional list of specific agent names to load. When
                empty or None, every package found in the source directory is
                loaded. Names that do not resolve to a valid package are
                reported and skipped.
        """
        if self.agents_directory:
            source_root = Path(self.agents_directory)
        else:
            source_root = Path(__file__).parent / 'agents'

        if agent_list:
            # Validate each requested name, warning about the ones we skip.
            packages = []
            for name in agent_list:
                candidate = source_root / name
                if not (candidate.exists() and candidate.is_dir()):
                    print(f"  ⚠️ Agent not found: {name}")
                elif not (candidate / 'agent.md').exists():
                    print(f"  ⚠️ Agent directory missing agent.md: {name}")
                else:
                    packages.append(candidate)
        else:
            # Auto-discover every directory that holds an agent.md.
            packages = [
                entry for entry in source_root.iterdir()
                if entry.is_dir() and (entry / 'agent.md').exists()
            ]

        for package in packages:
            agent_id = package.name

            # Work on a private copy inside the experiment directory.
            experiment_agents = self.experiment_dir / "agents"
            experiment_agents.mkdir(exist_ok=True)
            package_copy = experiment_agents / agent_id
            if package_copy.exists():
                shutil.rmtree(package_copy)
            shutil.copytree(package, package_copy, symlinks=True)

            agent_md = package_copy / 'agent.md'
            record = {
                'path': agent_md,
                'content': agent_md.read_text(),
                'source': 'initial',
                'created_iteration': 0,
                'evolved_tools': None,
                'package_dir': package_copy,
                'package_type': 'three_artifact'
            }

            # The other two artifacts are optional; record them when present.
            eval_instructions = package_copy / 'eval_instructions.md'
            if eval_instructions.exists():
                record['eval_instructions_file'] = eval_instructions
            tools = package_copy / 'tools'
            if tools.is_dir():
                record['tools_dir'] = tools

            self.agent_pool[agent_id] = record

            # Every agent starts with an empty record and the default ELO.
            self.performance_records[agent_id] = {
                'test_count': 0,
                'total_correct': 0,
                'total_questions': 0,
                'mean_accuracy': 0.0,
                'elo': 1500,
                'iteration_results': []
            }

            print(f"  🤖 Loaded three-artifact agent: {agent_id}")

        print(f"\n✅ Loaded {len(self.agent_pool)} initial agents")
    
    def process_database(self,
                        iteration: int,
                        db_name: str,
                        agent_id: str) -> Dict:
        """
        Process a single database with a specific agent.

        Steps:
        1. Look up the agent and the database file; return a failure dict if
           either is missing.
        2. Set up a workspace and run Phase 1 via ``self.orchestrator``.
        3. Randomly sample up to ``self.questions_per_database`` questions for
           the database and generate SQL with a ``SQLGenerator`` restricted to
           that sample; the generation cost is added to ``self.total_cost``.
        4. Evaluate the predictions (or build test output in test-eval mode)
           and write ``results/evaluation.json`` in the workspace.

        Failures after workspace setup (Phase 1 failure, no questions, or an
        exception) also write an ``evaluation.json`` with zero accuracy and an
        ``error`` field, on a best-effort basis in the exception case.

        Args:
            iteration: Current iteration number
            db_name: Database name
            agent_id: Agent ID to use

        Returns:
            On success: dict with ``success`` True, ``database``, ``agent_id``,
            ``accuracy``, ``correct``, ``total`` and the full ``evaluation``.
            On failure: dict with ``success`` False, ``error`` and ``database``.
        """
        from datetime import datetime
        import tempfile
        import traceback

        timestamp = datetime.now().strftime("%H:%M:%S")
        print(f"    [{timestamp}] {agent_id} | {db_name}: Starting...")

        # Get agent info
        agent_info = self.agent_pool.get(agent_id)
        if not agent_info:
            return {'success': False, 'error': f'Agent not found: {agent_id}', 'database': db_name}

        # Get database path
        db_path = self.db_root / db_name / f"{db_name}.sqlite"
        if not db_path.exists():
            return {'success': False, 'error': 'Database not found', 'database': db_name}

        # Setup workspace with the agent
        # RoboPhD only needs package_dir and agent_id
        workspace = self.orchestrator.setup_workspace(
            iteration=iteration,
            database_name=db_name,
            database_path=db_path,
            package_dir=agent_info.get('package_dir'),
            agent_id=agent_id
        )

        try:
            # Phase 1: Agent analyzes database (no instructions needed!)
            success, prompt_content = self.orchestrator.run_phase1(workspace)

            if not success:
                # Create evaluation.json for failed Phase 1
                questions = self.questions_by_db.get(db_name, [])
                sampled = random.sample(questions, min(self.questions_per_database, len(questions))) if questions else []

                evaluation = {
                    'database': db_name,
                    'total_questions': len(sampled),
                    'correct': 0,
                    'accuracy': 0.0,
                    'error': 'Phase 1 failed',
                    'results': {}
                }

                results_dir = workspace / "results"
                results_dir.mkdir(exist_ok=True)
                with open(results_dir / "evaluation.json", 'w') as f:
                    json.dump(evaluation, f, indent=2)

                return {'success': False, 'error': 'Phase 1 failed', 'database': db_name}

            # Save results
            results_dir = workspace / "results"
            results_dir.mkdir(exist_ok=True)

            # Generate SQL for sampled questions
            questions = self.questions_by_db.get(db_name, [])
            if not questions:
                # Create evaluation.json for no questions case
                evaluation = {
                    'database': db_name,
                    'total_questions': 0,
                    'correct': 0,
                    'accuracy': 0.0,
                    'error': 'No questions found',
                    'results': {}
                }

                with open(results_dir / "evaluation.json", 'w') as f:
                    json.dump(evaluation, f, indent=2)

                return {'success': False, 'error': 'No questions found', 'database': db_name}

            # Sample questions
            sampled = random.sample(questions, min(self.questions_per_database, len(questions)))

            timestamp = datetime.now().strftime("%H:%M:%S")
            print(f"    [{timestamp}] {agent_id} | {db_name}: Generating SQL for {len(sampled)} questions...")

            predictions = []
            predictions_dict = {}
            result = None

            # The generator reads its prompt and questions from files. Both
            # temp files are created inside the try so the finally clause
            # removes them even when writing one of them fails.
            prompt_file = None
            temp_questions_file = None
            try:
                with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
                    prompt_file = Path(f.name)
                    f.write(prompt_content)

                # Temporary questions file with just the sampled questions
                with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
                    temp_questions_file = Path(f.name)
                    json.dump(sampled, f)

                # Create a temporary SQL generator with the sampled questions
                temp_sql_generator = SQLGenerator(
                    eval_model=self.eval_model,
                    questions_file=temp_questions_file,
                    timeout=self.sql_timeout,
                    use_evidence=True,
                    api_key=self.api_key,
                    verification_retries=self.verification_retries,
                    temperature_strategy=self.temperature_strategy
                )

                # Generate SQL for the database (returns tuple of (predictions_dict, cost))
                result, cost = temp_sql_generator.generate(
                    prompt_file=prompt_file,
                    db_name=db_name,
                    db_path=db_path
                )

                # Update total cost
                self.total_cost += cost

                if result and result.get('predictions'):
                    # predictions is a dictionary with question_id as keys and the bird sql format
                    # which is: {predicted_sql}\t----- bird -----\t{db_name}
                    predictions_dict = result['predictions']

                    # log out any sql validation+retries
                    validation_stats = result.get('metadata', {}).get('validation_stats', {})
                    timestamp = datetime.now().strftime("%H:%M:%S")
                    print(f"    [{timestamp}] {agent_id} | {db_name}: SQL Validation Stats = {json.dumps(validation_stats)}")

                    # Convert to list format expected by evaluator
                    for q in sampled:
                        qid = str(q['question_id'])
                        if qid in predictions_dict:
                            predictions.append({
                                'question_id': q['question_id'],
                                'SQL': predictions_dict[qid],  # Keep full format with bird marker
                                'db_id': db_name  # Include db_id for the evaluator
                            })

                    if not predictions:
                        print(f"    ⚠️ No matching predictions for sampled questions")
                else:
                    print(f"    ⚠️ No predictions generated for {db_name}")
            finally:
                # Clean up whichever temp files were actually created
                for temp_path in (prompt_file, temp_questions_file):
                    if temp_path is not None:
                        temp_path.unlink(missing_ok=True)

            # Evaluate/generate output if we have predictions
            if predictions:
                if self.test_eval_mode:
                    # Create a pseudo-evaluation structure for compatibility
                    test_output = self.test_output_generator.generate_output(
                        predictions_dict,
                        sampled
                    )
                    evaluation = {
                        'database': db_name,
                        'total_questions': len(test_output),
                        'test_output': test_output
                    }

                    with open(results_dir / "bird_output.json", 'w') as f:
                        json.dump(predictions_dict, f, indent=2)

                else:
                    # Normal evaluation for dev/train modes
                    # Pass the full result which includes both predictions and detailed_results
                    evaluation = self.evaluator.evaluate(
                        result,  # Pass full result including detailed_results with verification info
                        db_name
                    )
            else:
                evaluation = {
                    'database': db_name,
                    'total_questions': len(sampled),
                    'correct': 0,
                    'accuracy': 0.0,
                    'results': []
                }

            # Get accuracy from evaluation
            correct = evaluation.get('correct', 0)
            total = evaluation.get('total_questions', len(sampled))
            accuracy = evaluation.get('accuracy', 0.0)  # Already in percentage format

            timestamp = datetime.now().strftime("%H:%M:%S")
            print(f"    [{timestamp}] {agent_id} | {db_name}: Accuracy = {accuracy:.1f}%")

            with open(results_dir / "evaluation.json", 'w') as f:
                json.dump(evaluation, f, indent=2)

            return {
                'success': True,
                'database': db_name,
                'agent_id': agent_id,
                'accuracy': accuracy,  # Already in percentage format (0-100)
                'correct': correct,
                'total': total,
                'evaluation': evaluation
            }

        except Exception as e:
            print(f"    ❌ {agent_id} | {db_name}: Error - {e}")
            traceback.print_exc()

            # Create evaluation.json for exception case
            questions = self.questions_by_db.get(db_name, [])
            sampled_count = min(self.questions_per_database, len(questions)) if questions else 0

            evaluation = {
                'database': db_name,
                'total_questions': sampled_count,
                'correct': 0,
                'accuracy': 0.0,
                'error': str(e),
                'results': {}
            }

            # Best effort: saving the failure record must not mask the original
            # error, but interrupts (KeyboardInterrupt/SystemExit) still propagate.
            try:
                results_dir = workspace / "results"
                results_dir.mkdir(exist_ok=True)
                with open(results_dir / "evaluation.json", 'w') as f:
                    json.dump(evaluation, f, indent=2)
            except Exception as save_error:
                print(f"    ⚠️ {agent_id} | {db_name}: Could not save evaluation.json - {save_error}")

            return {'success': False, 'error': str(e), 'database': db_name}
    
    
    def run_iteration(self, iteration: int, selected_agents: List[str], databases: List[str]) -> Dict:
        """
        Run one iteration testing selected agents on databases.

        Every (agent, database) pair is handed to ``self.process_database`` on a
        thread pool capped at ``self.max_concurrent_dbs`` workers. The per-pair
        results are aggregated per agent, ``self.performance_records`` is
        updated, the aggregated results are appended to ``self.test_history``
        and the ELO scores are refreshed.

        Args:
            iteration: Iteration number
            selected_agents: List of agent IDs to test (must not be empty)
            databases: List of databases to test on

        Returns:
            Dictionary mapping each agent ID to its 'accuracy' (percentage),
            'correct', 'total', 'databases_tested' and 'failures' values

        Raises:
            ValueError: If ``selected_agents`` is empty
        """
        # Without agents there is nothing to rank; fail early with a clear
        # message instead of an opaque "max() arg is an empty sequence" later.
        if not selected_agents:
            raise ValueError(
                f"Iteration {iteration}: at least one agent must be selected"
            )

        print(f"\n{'='*60}")
        print(f"ITERATION {iteration}")
        print(f"{'='*60}")
        print(f"Agents: {', '.join(selected_agents)}")
        print(f"Databases: {', '.join(databases)}")

        # Process every (agent, database) pair in parallel
        results_by_agent = {agent_id: [] for agent_id in selected_agents}

        with ThreadPoolExecutor(max_workers=self.max_concurrent_dbs) as executor:
            futures = {
                executor.submit(self.process_database, iteration, db_name, agent_id): (agent_id, db_name)
                for agent_id in selected_agents
                for db_name in databases
            }

            # Collect results as they finish (completion order is arbitrary)
            for future in as_completed(futures):
                agent_id, db_name = futures[future]
                try:
                    result = future.result()
                    results_by_agent[agent_id].append(result)

                    # Track Phase 1 failures
                    if not result.get('success') and result.get('error') == 'Phase 1 failed':
                        self.phase1_failures.append((agent_id, db_name, iteration))

                    # Track zero accuracy (Phase 1 succeeded but 0% accuracy)
                    elif result.get('success') and result.get('accuracy', -1) == 0:
                        total_q = result.get('total', 0)
                        if total_q > 0:  # Only track if questions were actually tested
                            self.zero_accuracy_cases.append((agent_id, db_name, iteration, total_q))

                except Exception as e:
                    # A crashed worker is recorded as a failed database so the
                    # rest of the iteration can still be scored.
                    print(f"  ❌ Error processing {agent_id} on {db_name}: {e}")
                    results_by_agent[agent_id].append({
                        'success': False,
                        'error': str(e),
                        'database': db_name
                    })

        # Calculate metrics for each agent
        iteration_results = {}
        for agent_id, results in results_by_agent.items():
            successful = [r for r in results if r.get('success')]
            failed = [r for r in results if not r.get('success')]

            # Phase 1 failures count as questions asked with 0 correct answers
            phase1_failures = [r for r in failed if r.get('error') == 'Phase 1 failed']

            total_correct = sum(r.get('correct', 0) for r in successful)
            total_questions = sum(r.get('total', 0) for r in successful)

            # The real question count is unknown for a Phase 1 failure, so
            # questions_per_database is used as the estimate; total_correct is
            # left untouched (0 added for failures).
            total_questions += len(phase1_failures) * self.questions_per_database

            accuracy = (total_correct / total_questions * 100) if total_questions > 0 else 0

            iteration_results[agent_id] = {
                'accuracy': accuracy,
                'correct': total_correct,
                'total': total_questions,
                'databases_tested': [r['database'] for r in successful],
                'failures': len(failed)
            }

            # Update performance records
            perf = self.performance_records[agent_id]
            perf['test_count'] += 1
            perf['total_correct'] += total_correct
            perf['total_questions'] += total_questions
            perf['mean_accuracy'] = (perf['total_correct'] / perf['total_questions'] * 100) if perf['total_questions'] > 0 else 0
            perf['iteration_results'].append({
                'iteration': iteration,
                'accuracy': accuracy,
                'databases': len(successful)
            })

            # Accuracy is already a percentage
            print(f"\n{agent_id}: {accuracy:.1f}% ({total_correct}/{total_questions})")

        # Determine winner(s): all agents sharing the highest accuracy
        max_accuracy = max(stats['accuracy'] for stats in iteration_results.values())
        winners = [k for k, stats in iteration_results.items() if stats['accuracy'] == max_accuracy]

        if len(winners) == 1:
            print(f"\n🏆 Iteration {iteration} winner: {winners[0]} ({max_accuracy:.1f}%)")
        else:
            print(f"\n🏆 Iteration {iteration} tied winners: {', '.join(winners)} ({max_accuracy:.1f}%)")

        # Store results in test_history BEFORE updating ELO scores
        # This ensures _recalculate_all_elo_scores() has complete data
        self.test_history.append(iteration_results)

        # Update ELO scores
        self._update_elo_scores(iteration_results)

        return iteration_results
    
    @staticmethod
    def _calculate_elo_updates(current_elos: Dict[str, float], iteration_results: Dict, k: int = 32) -> Dict[str, float]:
        """
        Calculate updated ELO scores based on head-to-head results, properly handling ties.
        
        Args:
            current_elos: Dictionary of agent_id -> current ELO score
            iteration_results: Dictionary of agent_id -> {'accuracy': float, ...}
            k: K-factor for ELO calculations (default 32)
            
        Returns:
            Dictionary of agent_id -> updated ELO score
        """
        # Create a copy to avoid modifying the input
        updated_elos = current_elos.copy()
        agents = list(iteration_results.keys())
        
        # Group agents by accuracy to identify ties
        accuracy_groups = {}
        for agent in agents:
            acc = iteration_results[agent]['accuracy']
            if acc not in accuracy_groups:
                accuracy_groups[acc] = []
            accuracy_groups[acc].append(agent)
        
        # Process ties within groups (each agent draws against others in same group)
        for acc, group in accuracy_groups.items():
            if len(group) > 1:
                # Process all pairs within the tied group
                for i, agent1 in enumerate(group):
                    for agent2 in group[i+1:]:
                        # Handle as a draw (0.5 points each)
                        elo1 = updated_elos[agent1]
                        elo2 = updated_elos[agent2]
                        
                        expected1 = 1 / (1 + 10**((elo2 - elo1) / 400))
                        expected2 = 1 / (1 + 10**((elo1 - elo2) / 400))
                        
                        updated_elos[agent1] += k * (0.5 - expected1)
                        updated_elos[agent2] += k * (0.5 - expected2)
        
        # Process wins/losses between different accuracy groups
        sorted_groups = sorted(accuracy_groups.keys(), reverse=True)
        for i, higher_acc in enumerate(sorted_groups[:-1]):
            for lower_acc in sorted_groups[i+1:]:
                for winner in accuracy_groups[higher_acc]:
                    for loser in accuracy_groups[lower_acc]:
                        # Winner beats loser
                        winner_elo = updated_elos[winner]
                        loser_elo = updated_elos[loser]
                        
                        # ELO calculation
                        expected_winner = 1 / (1 + 10**((loser_elo - winner_elo) / 400))
                        expected_loser = 1 / (1 + 10**((winner_elo - loser_elo) / 400))
                        
                        updated_elos[winner] += k * (1 - expected_winner)
                        updated_elos[loser] += k * (0 - expected_loser)
        
        return updated_elos
    
    def _recalculate_all_elo_scores(self):
        """
        Recalculate all ELO scores from scratch based on test_history.
        This ensures consistency and prevents accumulated errors.
        """
        # Reset all ELO scores to base
        cumulative_elo_scores = {}
        
        # Process all iterations in test_history
        for iteration_data in self.test_history:
            # Initialize new agents with base ELO
            for agent in iteration_data:
                if agent not in cumulative_elo_scores:
                    cumulative_elo_scores[agent] = 1500.0
            
            # Get accuracies for this iteration
            # Convert from percentage to decimal (accuracy is stored as percentage in test_history)
            iteration_results = {
                agent: {'accuracy': data['accuracy'] / 100.0} 
                for agent, data in iteration_data.items()
            }
            
            # Calculate updated ELO scores using the shared logic
            current_elos_for_iteration = {
                agent: cumulative_elo_scores[agent] 
                for agent in iteration_results
            }
            updated_elos = self._calculate_elo_updates(current_elos_for_iteration, iteration_results)
            
            # Update the cumulative scores
            for agent, new_elo in updated_elos.items():
                cumulative_elo_scores[agent] = new_elo
        
        # Update all performance_records with recalculated ELO scores
        for agent_id in self.performance_records:
            if agent_id in cumulative_elo_scores:
                self.performance_records[agent_id]['elo'] = cumulative_elo_scores[agent_id]
            else:
                # Agent hasn't been tested yet, keep base ELO
                self.performance_records[agent_id]['elo'] = 1500.0
    
    def _update_elo_scores(self, iteration_results: Dict):
        """
        Refresh ELO scores after an iteration by replaying the full test history.

        Args:
            iteration_results: Results of the latest iteration. Not read here;
                the recalculation works from ``self.test_history`` instead.
        """
        # Instead of incremental updates, recalculate everything from test_history
        # This prevents drift and ensures consistency
        self._recalculate_all_elo_scores()
    
    def _calculate_elo_progression(self) -> List[Dict]:
        """
        Calculate ELO progression to track the leader after each iteration.
        
        Returns:
            List of dictionaries containing iteration number, leader name, ELO score, and accuracy
        """
        # We need to maintain a cumulative ELO score dictionary
        cumulative_elo_scores = {}
        leaders = []
        
        for iter_num, iteration_data in enumerate(self.test_history, 1):
            # Initialize new agents with base ELO
            for agent in iteration_data:
                if agent not in cumulative_elo_scores:
                    cumulative_elo_scores[agent] = 1500.0
            
            # Get accuracies for this iteration
            # Convert from percentage to decimal (accuracy is stored as percentage in test_history)
            iteration_results = {
                agent: {'accuracy': data['accuracy'] / 100.0} 
                for agent, data in iteration_data.items()
            }
            
            # Calculate updated ELO scores using the shared logic
            # Important: We update the cumulative scores, not reset them
            current_elos_for_iteration = {
                agent: cumulative_elo_scores[agent] 
                for agent in iteration_results
            }
            updated_elos = self._calculate_elo_updates(current_elos_for_iteration, iteration_results)
            
            # Update the cumulative scores with the new values
            for agent, new_elo in updated_elos.items():
                cumulative_elo_scores[agent] = new_elo
            
            # Find the leader after this iteration (from ALL agents, not just tested ones)
            if cumulative_elo_scores:
                leader_agent = max(cumulative_elo_scores.items(), key=lambda x: x[1])
                leaders.append({
                    'iteration': iter_num,
                    'leader': leader_agent[0],
                    'elo': leader_agent[1],
                    'accuracy': iteration_data.get(leader_agent[0], {}).get('accuracy', None)
                })
        
        return leaders
    
    def _get_agent_evolution_strategy(self, agent_id: str) -> str:
        """
        Get the evolution strategy that created an agent.

        Args:
            agent_id: The agent identifier

        Returns:
            Strategy name or "Initial" for non-evolved agents
        """
        if agent_id not in self.agent_pool:
            return "Unknown"

        agent_info = self.agent_pool[agent_id]

        # Check if it's an evolved agent
        if agent_info.get('source') == 'evolution':
            # First check if we stored the strategy directly (new approach)
            if 'evolution_strategy' in agent_info:
                return agent_info['evolution_strategy']

            # Fallback: look it up from evolution history by iteration
            created_iter = agent_info.get('created_iteration')
            if created_iter:
                for entry in self.evolver.evolution_history:
                    if entry['iteration'] == created_iter:
                        return entry.get('strategy', 'Unknown')
            return "Evolution (unknown)"
        else:
            # Initial agent
            return "Initial"

    def _generate_elo_leadership_section(self) -> str:
        """
        Generate the ELO Leadership Progression section for the final report.
        
        Returns:
            Formatted markdown string with ELO leadership table and analysis
        """
        lines = []
        lines.append("## ELO Leadership Progression\n")
        lines.append("This table shows which agent was leading by ELO score after each iteration:\n")
        
        # Calculate progression
        leaders = self._calculate_elo_progression()
        
        if not leaders:
            lines.append("*No iteration data available*\n")
            return "\n".join(lines)
        
        # Generate table with evolution strategy
        lines.append("| Iteration | Leading Agent | ELO Score | Iteration Accuracy | Creation Strategy |")
        lines.append("|-----------|---------------|-----------|--------------------|-------------------|")

        for item in leaders:
            leader_name = item['leader']
            # Truncate long names for better table formatting
            if len(leader_name) > 30:
                leader_name = leader_name[:27] + "..."

            # Get the evolution strategy for this agent
            strategy = self._get_agent_evolution_strategy(item['leader'])
            # Truncate strategy name if too long
            if len(strategy) > 20:
                strategy = strategy[:17] + "..."

            if item['accuracy'] is not None:
                lines.append(f"| {item['iteration']:9d} | {leader_name:30s} | {item['elo']:9.0f} | {item['accuracy']:17.1f}% | {strategy:20s} |")
            else:
                lines.append(f"| {item['iteration']:9d} | {leader_name:30s} | {item['elo']:9.0f} | {'N/A':>18s} | {strategy:20s} |")
        
        # Add leadership changes analysis
        lines.append("\n### Leadership Changes\n")
        
        leader_changes = []
        prev_leader = None
        for item in leaders:
            if item['leader'] != prev_leader:
                leader_changes.append((item['iteration'], item['leader']))
                prev_leader = item['leader']
        
        lines.append(f"Total leadership changes: {len(leader_changes) - 1}\n")
        
        if len(leader_changes) > 1:
            lines.append("Leadership timeline:")
            for iter_num, leader in leader_changes:
                lines.append(f"- Iteration {iter_num}: **{leader}** takes the lead")

        # Add strategy effectiveness analysis
        lines.append("\n### Strategy Effectiveness for ELO Leaders\n")

        # Count unique leaders by strategy
        unique_leaders_by_strategy = {}
        seen_leaders = set()

        for item in leaders:
            agent_id = item['leader']
            if agent_id not in seen_leaders:
                seen_leaders.add(agent_id)
                strategy = self._get_agent_evolution_strategy(agent_id)
                unique_leaders_by_strategy[strategy] = unique_leaders_by_strategy.get(strategy, 0) + 1

        # Also count total iterations led by each strategy
        iterations_by_strategy = {}
        for item in leaders:
            strategy = self._get_agent_evolution_strategy(item['leader'])
            iterations_by_strategy[strategy] = iterations_by_strategy.get(strategy, 0) + 1

        lines.append("Unique agents that became ELO leaders by creation strategy:")
        for strategy in sorted(unique_leaders_by_strategy.keys()):
            count = unique_leaders_by_strategy[strategy]
            iterations = iterations_by_strategy.get(strategy, 0)
            lines.append(f"- **{strategy}**: {count} agent(s) led for {iterations} total iteration(s)")

        return "\n".join(lines)
    
    def _generate_agent_evaluation_report(self, agent_id: str, iteration: int, databases: List[str], 
                                         agent_time: float, agent_cost: float) -> None:
        """
        Generate an evaluation report for a specific agent in a specific iteration.
        
        Args:
            agent_id: ID of the agent
            iteration: Iteration number
            databases: List of databases tested
            agent_time: Estimated time spent on this agent (in seconds)
            agent_cost: Estimated cost for this agent
        """
        
        # Agent directory path
        agent_dir = self.experiment_dir / f"iteration_{iteration:03d}" / f"agent_{agent_id}"
        if not agent_dir.exists():
            print(f"  ⚠️  Agent directory not found: {agent_dir}")
            return
            
        report_path = agent_dir / "agent_evaluation_report.md"
        
        # Collect database results
        database_results = []
        total_questions = 0
        total_correct = 0
        total_sql_errors = 0
        warnings = []
        
        for db_name in databases:
            db_results_file = agent_dir / db_name / "results" / "evaluation.json"
            if db_results_file.exists():
                try:
                    with open(db_results_file, 'r') as f:
                        db_data = json.load(f)
                    
                    # Extract metrics
                    questions = db_data.get('total_questions', 0)
                    correct = db_data.get('correct', 0)
                    accuracy = (correct / questions * 100) if questions > 0 else 0
                    sql_errors = db_data.get('prediction_errors', 0)
                    
                    database_results.append({
                        'database': db_name,
                        'accuracy': accuracy,
                        'correct': correct,
                        'total': questions,
                        'sql_errors': sql_errors
                    })
                    
                    total_questions += questions
                    total_correct += correct
                    total_sql_errors += sql_errors
                    
                    # Check for unusual errors and generate warnings
                    pred_timeouts = db_data.get('prediction_timeouts', 0)
                    gt_errors = db_data.get('ground_truth_errors', 0)
                    gt_timeouts = db_data.get('ground_truth_timeouts', 0)
                    
                    if pred_timeouts > 0:
                        warnings.append(f"Prediction timeouts in {db_name}: {pred_timeouts} questions")
                    if gt_errors > 0:
                        warnings.append(f"Ground truth errors in {db_name}: {gt_errors} questions")
                    if gt_timeouts > 0:
                        warnings.append(f"Ground truth timeouts in {db_name}: {gt_timeouts} questions")
                        
                except Exception as e:
                    print(f"  ⚠️  Error reading {db_results_file}: {e}")
                    continue
            else:
                print(f"  ⚠️  Results file not found: {db_results_file}")
        
        # Calculate overall accuracy
        overall_accuracy = (total_correct / total_questions * 100) if total_questions > 0 else 0
        
        # Determine agent source
        agent_info = self.agent_pool.get(agent_id, {})
        agent_source = agent_info.get('source', 'unknown')
        created_iteration = agent_info.get('created_iteration', 'N/A')
        
        if agent_source == 'initial':
            source_description = "initial"
            agent_file = f"RoboPhD/agents/{agent_id}.md"
        elif agent_source == 'evolution':
            source_description = f"evolved in iteration {created_iteration}"
            agent_file = f"agents/{agent_id}.md"
        else:
            source_description = agent_source
            agent_file = "unknown"
        
        # Generate report content
        report_lines = ["# Agent Evaluation Report\n"]
        
        # Configuration section
        report_lines.append("## Configuration")
        report_lines.append(f"- **Agent**: {agent_id}")
        report_lines.append(f"- **Iteration**: {iteration}")
        report_lines.append(f"- **Dataset**: {self.dataset}")
        report_lines.append(f"- **Evaluation model**: {self.eval_model}")
        report_lines.append(f"- **Analysis model**: {self.analysis_model}")
        report_lines.append(f"- **Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report_lines.append(f"- **Databases tested**: {len(databases)}")
        report_lines.append(f"- **Questions per database**: {self.questions_per_database}")
        report_lines.append(f"- **Total time**: {agent_time/60:.1f} minutes")
        report_lines.append(f"- **Total cost**: ${agent_cost:.2f}")
        
        # Overall results
        report_lines.append("\n## Overall Results")
        report_lines.append(f"- **Total questions**: {total_questions}")
        report_lines.append(f"- **Correct answers**: {total_correct}")
        report_lines.append(f"- **Overall accuracy**: {overall_accuracy:.2f}%")
        
        # Warnings section (if any)
        if warnings:
            report_lines.append("\n⚠️  **WARNINGS**")
            for warning in warnings:
                report_lines.append(f"- {warning}")
        
        # Per-database results table
        report_lines.append("\n## Per-Database Results")
        report_lines.append("| Database | Accuracy | Correct/Total | SQL Errors |")
        report_lines.append("|----------|----------|---------------|------------|")
        
        for result in database_results:
            report_lines.append(
                f"| {result['database']} | {result['accuracy']:.2f}% | "
                f"{result['correct']}/{result['total']} | {result['sql_errors']} |"
            )
        
        # Agent details
        report_lines.append("\n## Agent Details")
        report_lines.append(f"- **Source**: {source_description}")
        report_lines.append(f"- **Agent file**: {agent_file}")
        
        # Write report
        try:
            with open(report_path, 'w') as f:
                f.write('\n'.join(report_lines))
            print(f"  📋 Generated evaluation report: {report_path.relative_to(self.experiment_dir)}")
        except Exception as e:
            print(f"  ❌ Error writing evaluation report: {e}")
    
    def select_agents_for_iteration(self, iteration: int,
                                    evolved_agent_id: Optional[str] = None,
                                    skip_evolution: bool = False) -> List[str]:
        """
        Select agents to test in this iteration.

        Priority order:
        1. Most recent iteration winner (included if still in the agent pool;
           ties for first place are broken randomly)
        2. Newly evolved agent (if provided and in the agent pool)
        3. Untested agents (test_count == 0), sampled randomly when there are
           more of them than free slots
        4. ELO-based selection: random sample from the top 2*j tested agents,
           where j is the number of slots still free

        Args:
            iteration: Current iteration
            evolved_agent_id: ID of newly evolved agent to include (if any)
            skip_evolution: Accepted for interface compatibility; the selection
                logic does not read it (ELO-based selection is always random)

        Returns:
            List of at most ``self.agents_per_iteration`` agent IDs to test
        """
        selected = []
        available = list(self.agent_pool.keys())

        # Print selection header
        print(f"\n📋 AGENT SELECTION FOR ITERATION {iteration}")
        print("═" * 60)

        # Priority 1: Previous winner
        print("\nPriority 1 - Previous Winner:")
        # An empty history entry has no winner; treating it like "no history"
        # avoids calling max() on an empty sequence.
        last_iteration = self.test_history[-1] if self.test_history else None
        if iteration > 1 and last_iteration:
            # Find all agents with the highest accuracy from last iteration
            max_accuracy = max(stats.get('accuracy', 0) for stats in last_iteration.values())
            winners = [k for k, stats in last_iteration.items() if stats.get('accuracy', 0) == max_accuracy]

            if len(winners) == 1:
                recent_winner = winners[0]
            else:
                # Multiple agents tied for first place - randomly select one
                recent_winner = random.choice(winners)

            if recent_winner in available:
                selected.append(recent_winner)
                available.remove(recent_winner)
                print(f"  ✓ Selected: {recent_winner} (won iteration {iteration-1} with {max_accuracy:.1f}%)")
            else:
                print(f"  ✗ Previous winner {recent_winner} not available")
        else:
            print("  ✗ No previous winner (first iteration or no history)")

        # Priority 2: Newly evolved agent
        print("\nPriority 2 - Newly Evolved Agent:")
        if evolved_agent_id and evolved_agent_id in available:
            selected.append(evolved_agent_id)
            available.remove(evolved_agent_id)
            print(f"  ✓ Selected: {evolved_agent_id} (just evolved)")
        elif evolved_agent_id and evolved_agent_id not in available:
            print(f"  ✗ Evolved agent {evolved_agent_id} not available")
        else:
            print("  ✗ No evolution occurred this iteration")

        # If we already have enough agents, return what we have
        if len(selected) >= self.agents_per_iteration:
            print(f"\n🎯 Final Selection: {selected[:self.agents_per_iteration]}")
            print("=" * 60)
            return selected[:self.agents_per_iteration]

        # Priority 3: Untested agents
        untested = [a for a in available if self.performance_records[a]['test_count'] == 0]
        tested = [a for a in available if self.performance_records[a]['test_count'] > 0]

        slots_remaining = self.agents_per_iteration - len(selected)

        print("\nPriority 3 - Untested Agents:")
        if untested and slots_remaining > 0:
            if len(untested) > slots_remaining:
                # Randomly select from untested agents
                print(f"  ✓ Selecting {slots_remaining} untested agent(s) from pool of {len(untested)}:")
                print(f"    Pool: {untested}")
                untested_selected = random.sample(untested, slots_remaining)
                selected.extend(untested_selected)
                print(f"    Selected: {untested_selected} (random selection)")
            else:
                # Take all untested agents if we have fewer than needed
                selected.extend(untested)
                print(f"  ✓ Selected all {len(untested)} untested agent(s): {untested}")
            slots_remaining = self.agents_per_iteration - len(selected)
        else:
            if not untested:
                print("  ✗ No untested agents available")
            else:
                print("  ✗ No slots remaining for untested agents")

        # Priority 4: ELO-based selection
        if slots_remaining > 0 and tested:
            print("\nPriority 4 - ELO-Based Selection:")
            # Sort tested agents by ELO
            sorted_tested = sorted(tested,
                                   key=lambda a: self.performance_records[a]['elo'],
                                   reverse=True)

            # Always use random selection from the top 2*slots_remaining agents
            pool_size = min(slots_remaining * 2, len(sorted_tested))
            candidate_pool = sorted_tested[:pool_size]
            num_to_select = min(slots_remaining, len(candidate_pool))

            print(f"  Mode: Random selection from top {pool_size} agents")
            print(f"  Need to fill: {slots_remaining} slot(s)")
            print(f"  Candidate pool (top {pool_size} by ELO):")
            for i, agent in enumerate(candidate_pool, 1):
                elo = self.performance_records[agent]['elo']
                test_count = self.performance_records[agent]['test_count']
                print(f"    {i}. {agent} (ELO: {elo:.0f}, tested: {test_count} times)")

            elo_selected = random.sample(candidate_pool, num_to_select)
            selected.extend(elo_selected)
            print(f"  Selected: {elo_selected} (random from pool)")
        elif slots_remaining > 0:
            print("\nPriority 4 - ELO-Based Selection:")
            print("  ✗ No tested agents available for ELO-based selection")

        print(f"\n🎯 Final Selection: {selected[:self.agents_per_iteration]}")
        print("=" * 60)

        return selected[:self.agents_per_iteration]
    
    def run(self, initial_agents: Optional[List[str]] = None):
        """
        Run the complete parallel agent research experiment.

        Drives the main loop from the starting iteration up to
        ``self.num_iterations``. Each pass samples databases, optionally asks
        the evolver for a new agent and registers it in the pool, selects the
        agents to test, runs the iteration, records timing/cost, writes the
        per-agent and interim reports and saves a checkpoint. After the loop a
        final report is generated and, in test-eval mode, the consolidated
        test predictions file is written.

        The loop may jump back to an earlier iteration when the evolver
        requests a restart, and ends early when evolution fails without such a
        request.

        Args:
            initial_agents: Optional list of specific agents to start with
        """
        start_time = time.time()
        
        print("\n" + "="*60)
        print("PARALLEL AGENT RESEARCH EXPERIMENT" + (" (RESUMED)" if self.resume_mode else ""))
        print("="*60)
        
        # Load initial agents only if not resuming
        if not self.resume_mode:
            self.load_initial_agents(initial_agents)
        else:
            print(f"📂 Resumed from: {self.experiment_dir}")
            print(f"📊 Agents in pool: {len(self.agent_pool)}")
            if self.resume_from_iteration:
                print(f"🔄 Restarting from iteration: {self.resume_from_iteration}")
            else:
                # NOTE(review): last_completed is assigned but never read anywhere
                # in this method; the message below uses len(self.test_history).
                last_completed = self.test_history[-1] if self.test_history else 0
                print(f"🔄 Continuing from iteration: {len(self.test_history) + 1}")
        
        # Determine starting iteration: an explicit resume point wins, otherwise
        # continue right after the last entry recorded in test_history.
        if self.resume_mode:
            start_iteration = self.resume_from_iteration if self.resume_from_iteration else len(self.test_history) + 1
        else:
            start_iteration = 1
        
        # Main research loop (using while to allow restart)
        iteration = start_iteration
        while iteration <= self.num_iterations:
            # Check memory (the return value is not used here)
            self.memory_monitor.check_memory()
            
            # Select databases for this iteration and sort alphabetically
            databases = sorted(random.sample(self.databases, 
                                           min(self.databases_per_iteration, len(self.databases))))
            
            # Select agents to test
            if iteration == 1:
                # Randomly select initial agents; no evolution happens in iteration 1.
                # If the pool is not larger than agents_per_iteration, use all of it.
                available_agents = list(self.agent_pool.keys())
                if len(available_agents) > self.agents_per_iteration:
                    selected_agents = random.sample(available_agents, self.agents_per_iteration)
                else:
                    selected_agents = available_agents
            else:
                # Check if evolution should be skipped for this iteration
                # (a strategy of None means "skip evolution").
                evolution_strategy, was_random = self.evolver.get_strategy_for_iteration(iteration)
                skip_evolution = (evolution_strategy is None)

                # Stays None unless a new agent is successfully created below.
                evolved_agent_id = None

                if not skip_evolution and self.test_history:
                    # Update evolver's Phase 1 failures list with current state
                    # This ensures 5-hour limit restart logic has up-to-date failures
                    self.evolver.researcher_phase1_failures = self.phase1_failures

                    # Create new agent based on previous results
                    recent_results = self.test_history[-1]

                    result = self.evolver.create_new_agent(
                        self.agent_pool,
                        self.performance_records,
                        recent_results,
                        iteration,
                        self.test_history,
                        strategy_name=evolution_strategy,
                        was_random=was_random
                    )
                    
                    # Check if evolution failed (no result, or no agent content)
                    if result is None or result[0] is None:
                        # Check if we need to restart from an earlier iteration due to Phase 1 failures
                        if hasattr(self.evolver, 'restart_from_iteration') and self.evolver.restart_from_iteration is not None:
                            restart_iter = self.evolver.restart_from_iteration
                            iterations_to_redo = iteration - restart_iter + 1
                            print(f"\n🔄 Restarting from iteration {restart_iter} due to Phase 1 failures caused by 5-hour limit")
                            print(f"   Will redo {iterations_to_redo} iteration(s): {restart_iter} through {iteration}")

                            # Archive the failed iterations
                            self.archive_iterations(restart_iter)

                            # Reset the restart flag but preserve force_fresh flag
                            self.evolver.restart_from_iteration = None
                            # The force_fresh_next_evolution flag will persist to ensure clean context

                            # Update five_hour_limit_incidents to be persistent
                            self.five_hour_limit_incidents = self.evolver.five_hour_limit_incidents.copy()

                            # Save checkpoint before restart, recording the iteration
                            # just before the restart point as the last completed one.
                            self._save_checkpoint(restart_iter - 1)

                            # Restart loop from the specified iteration
                            iteration = restart_iter
                            continue  # Skip to next iteration of while loop
                        else:
                            # Normal failure - end experiment
                            print(f"\n🏁 Ending experiment early after {iteration-1} successful iterations")
                            print(f"   Evolution failed for iteration {iteration} - cannot continue")
                            break
                    
                    # Unpack the result
                    new_agent_content, new_agent_id, reasoning, package_info = result
                    evolved_agent_id = new_agent_id
                    
                    # Install package based on type
                    if package_info['type'] == 'three_artifact':
                        # Three-artifact structure - install complete package
                        package_dir = self._install_three_artifact_package(
                            new_agent_id, package_info, iteration
                        )
                        new_agent_path = package_dir / "agent.md"
                    elif package_info['type'] in ['parsed', 'single_artifact']:
                        # Parsed from response or single artifact - create simple agent file
                        if package_info['type'] == 'parsed':
                            print(f"  📝 Creating agent from parsed response")
                        else:
                            print(f"  📝 Creating agent from single artifact")
                        package_dir = self.experiment_dir / "agents" / new_agent_id
                        package_dir.mkdir(parents=True, exist_ok=True)
                        new_agent_path = package_dir / "agent.md"
                        with open(new_agent_path, 'w') as f:
                            f.write(new_agent_content)
                        # Create placeholder eval_instructions.md
                        eval_instructions_path = package_dir / "eval_instructions.md"
                        with open(eval_instructions_path, 'w') as f:
                            f.write("# SQL Generation Instructions\n\nGenerate accurate SQL queries based on the database analysis provided.\n")
                        # Copy tools if they exist (for single_artifact that might have tools)
                        if package_info.get('tools_dir') and package_info['tools_dir'].exists():
                            tools_dst = package_dir / "tools"
                            # Remove any previous copy so the destination mirrors the source
                            if tools_dst.exists():
                                shutil.rmtree(tools_dst)
                            shutil.copytree(package_info['tools_dir'], tools_dst, symlinks=True)
                    else:
                        # Fallback for an unrecognized type: only agent.md is written.
                        # NOTE(review): no eval_instructions.md is created in this branch,
                        # yet the pool entry below still points at that path.
                        print(f"  ⚠️ Unexpected package type: {package_info['type']}")
                        package_dir = self.experiment_dir / "agents" / new_agent_id
                        package_dir.mkdir(parents=True, exist_ok=True)
                        new_agent_path = package_dir / "agent.md"
                        with open(new_agent_path, 'w') as f:
                            f.write(new_agent_content)
                    
                    # Add to pool with package info
                    # NOTE(review): 'package_type' is recorded as 'three_artifact' for
                    # every branch above, regardless of package_info['type'], and
                    # 'tools_dir' is set whenever package_info names one, even if
                    # nothing was copied - confirm both are intended.
                    self.agent_pool[new_agent_id] = {
                        'path': new_agent_path,
                        'content': new_agent_content,
                        'source': 'evolution',
                        'created_iteration': iteration,
                        'evolution_strategy': evolution_strategy,  # Track which strategy created this agent
                        'package_dir': package_dir,
                        'package_type': 'three_artifact',
                        'eval_instructions_file': package_dir / "eval_instructions.md",
                        'tools_dir': package_dir / "tools" if package_info.get('tools_dir') else None
                    }
                    
                    # Initialize performance record (new agents start at ELO 1500)
                    self.performance_records[new_agent_id] = {
                        'test_count': 0,
                        'total_correct': 0,
                        'total_questions': 0,
                        'mean_accuracy': 0.0,
                        'elo': 1500,
                        'iteration_results': []
                    }
                    
                    print(f"\n✨ Created new agent: {new_agent_id}")
                elif skip_evolution:
                    print(f"\n⏭️ Skipping evolution for iteration {iteration} (configured in evolution schedule)")
                    # Track that evolution was skipped
                    self.evolver.evolution_history.append({
                        'iteration': iteration,
                        'strategy': 'none (skipped)',
                        'was_random': False
                    })
                
                # Select agents for this iteration
                selected_agents = self.select_agents_for_iteration(
                    iteration, 
                    evolved_agent_id=evolved_agent_id,
                    skip_evolution=skip_evolution
                )
            
            # Track iteration timing and cost (cost is measured as the change in
            # self.total_cost across the run_iteration call)
            iteration_start_time = time.time()
            iteration_start_cost = self.total_cost
            
            # Run iteration
            iteration_results = self.run_iteration(iteration, selected_agents, databases)
            
            # Calculate iteration metrics
            iteration_time = time.time() - iteration_start_time
            iteration_cost = self.total_cost - iteration_start_cost
            
            # Store per-iteration metrics
            self.iteration_times.append(iteration_time)
            self.iteration_costs.append(iteration_cost)
            
            # Store results (already done in run_iteration before ELO calculation)
            
            # Generate agent evaluation reports for this iteration.
            # Per-agent time/cost are estimates: the iteration totals split evenly.
            for agent_id in selected_agents:
                self._generate_agent_evaluation_report(
                    agent_id, 
                    iteration, 
                    databases, 
                    iteration_time / len(selected_agents),  # Estimate per-agent time
                    iteration_cost / len(selected_agents)   # Estimate per-agent cost
                )
            
            # Generate interim report after this iteration
            self._generate_interim_report(start_time, iteration)

            # Save checkpoint
            self._save_checkpoint(iteration)

            # Increment iteration for next loop
            iteration += 1

        # Generate final report (also reached via the early-exit break above)
        self._generate_report(start_time, report_type='final')

        # Generate test_predictions.json for test-eval mode
        if self.test_eval_mode:
            test_predictions_path = self.experiment_dir / 'test_predictions.json'
            self._generate_test_predictions_file(test_predictions_path)

        total_time = time.time() - start_time
        print(f"\n✅ Research complete!")
        print(f"Total time: {total_time/60:.1f} minutes")
        print(f"Results saved to: {self.experiment_dir}")

        # test_predictions_path is only bound when test_eval_mode is set; this
        # use is guarded by the same flag.
        if self.test_eval_mode:
            print(f"📁 Test predictions saved to: {test_predictions_path}")

    def _generate_test_predictions_file(self, output_file: Path):
        """Write one consolidated BIRD prediction file for test-eval mode.

        Merges every ``<agent>/<database>/results/bird_output.json`` found in
        the highest-numbered ``iteration_<N>`` directory of the experiment into
        a single JSON object (question_id -> predicted SQL) and writes it to
        ``output_file``. Does nothing unless ``self.test_eval_mode`` is set.

        Directories whose name is not exactly ``iteration_<digits>`` are
        ignored. Prediction files that cannot be read, are not valid JSON or do
        not contain a JSON object are skipped with a warning. If no iteration
        directory exists, an empty JSON object is written.

        Args:
            output_file: Destination path of the consolidated JSON file.
        """
        if not self.test_eval_mode:
            return

        print("\n📝 Generating test_predictions.json file...")

        # BIRD organizers want a single JSON dict of
        # question_id -> predicted "bird style" SQL.
        all_predictions = {}

        # Map iteration number -> actual directory. Keeping the real Path avoids
        # rebuilding the name and ignores look-alikes with extra suffixes.
        iteration_dirs = {}
        for entry in self.experiment_dir.iterdir():
            if not entry.is_dir():
                continue
            match = re.fullmatch(r'iteration_(\d+)', entry.name)
            if match:
                iteration_dirs[int(match.group(1))] = entry

        if iteration_dirs:
            latest_dir = iteration_dirs[max(iteration_dirs)]
            # Sorted so the merge order (and thus which duplicate question_id
            # wins) does not depend on filesystem listing order.
            agent_dirs = sorted(latest_dir.iterdir())
        else:
            print(f"  ⚠️ No iteration directories found in {self.experiment_dir}")
            agent_dirs = []

        # Collect predictions from all agents in the iteration
        for agent_dir in agent_dirs:
            if not agent_dir.is_dir() or not agent_dir.name.startswith('agent_'):
                continue

            # Each database subdirectory may hold a results/bird_output.json
            for db_dir in sorted(agent_dir.iterdir()):
                if not db_dir.is_dir():
                    continue

                predictions_file = db_dir / "results" / "bird_output.json"
                if not predictions_file.exists():
                    continue

                try:
                    with open(predictions_file, 'r') as f:
                        data = json.load(f)
                except (OSError, ValueError) as e:
                    # ValueError covers json.JSONDecodeError and decoding errors
                    print(f"  ⚠️ Skipping unreadable predictions file {predictions_file}: {e}")
                    continue

                if not isinstance(data, dict):
                    print(f"  ⚠️ Skipping predictions file without a JSON object: {predictions_file}")
                    continue

                all_predictions.update(data)

        # Save consolidated test predictions
        with open(output_file, 'w') as f:
            json.dump(all_predictions, f, indent=2)

        print(f"✅ Generated {output_file} with {len(all_predictions)} predictions")

    def _install_three_artifact_package(self, agent_id: str, package_info: Dict, iteration: int) -> Path:
        """
        Install a three-artifact package to the experiment agents directory.

        Copies ``agent.md``, ``eval_instructions.md`` and (when given) the tools
        directory into ``<experiment_dir>/agents/<agent_id>/`` and creates an
        empty ``tool_output`` directory next to them. All sources are validated
        before anything is written, so a missing source neither leaves a
        half-installed package nor removes previously installed tools.

        Args:
            agent_id: ID for the agent
            package_info: Package information from evolution. Reads the keys
                ``agent_file`` and ``eval_instructions_file`` (required) and
                ``tools_dir`` (optional); values may be ``Path`` or ``str``.
            iteration: Current iteration number (not used by this method)

        Returns:
            Path to the installed package directory

        Raises:
            KeyError: If a required key is missing from ``package_info``.
            FileNotFoundError: If a referenced source file or directory does
                not exist.
        """
        # Resolve and validate every source first; nothing is written on failure.
        agent_src = Path(package_info['agent_file'])
        if not agent_src.exists():
            raise FileNotFoundError(f"Source agent.md not found: {agent_src}")

        eval_src = Path(package_info['eval_instructions_file'])
        if not eval_src.exists():
            raise FileNotFoundError(f"Source eval_instructions.md not found: {eval_src}")

        tools_src = Path(package_info['tools_dir']) if package_info.get('tools_dir') else None
        if tools_src is not None and not tools_src.exists():
            raise FileNotFoundError(f"Source tools directory not found: {tools_src}")

        # Create package directory in agents folder
        package_dir = self.experiment_dir / "agents" / agent_id
        package_dir.mkdir(parents=True, exist_ok=True)

        # Copy the two markdown artifacts
        shutil.copy(agent_src, package_dir / "agent.md")
        shutil.copy(eval_src, package_dir / "eval_instructions.md")

        # Copy tools if present, replacing any earlier copy so the destination
        # mirrors the source exactly
        if tools_src is not None:
            tools_dst = package_dir / "tools"
            if tools_dst.exists():
                shutil.rmtree(tools_dst)
            shutil.copytree(tools_src, tools_dst, symlinks=True)

        # Create tool_output directory for runtime
        (package_dir / "tool_output").mkdir(exist_ok=True)

        print(f"  📦 Installed three-artifact package to {package_dir.name}")

        return package_dir
    
    def _save_checkpoint(self, iteration: int):
        """Save the experiment state to ``checkpoint.json`` in the experiment dir.

        The checkpoint holds the agent pool (paths converted to strings, agent
        content omitted), performance records, test history, cost/time
        tracking, failure bookkeeping and the experiment/evolver configuration.

        The JSON is first written to a temporary sibling file and then moved
        over ``checkpoint.json`` with ``os.replace``. If serialization or
        writing fails, the previous checkpoint stays intact instead of being
        left truncated.

        Args:
            iteration: Number stored as ``last_completed_iteration``.
        """
        # Convert agent_pool to serializable format
        serializable_pool = {}
        for agent_id, agent_info in self.agent_pool.items():
            entry = {
                'path': str(agent_info['path']),
                'source': agent_info.get('source', 'unknown'),
                'created_iteration': agent_info.get('created_iteration', 0),
                'package_type': agent_info.get('package_type', 'three_artifact'),
                'evolution_strategy': agent_info.get('evolution_strategy', None)  # Save evolution strategy
            }

            # Three-artifact specific fields: stored whenever the key is present
            for key in ('package_dir', 'eval_instructions_file'):
                if key in agent_info:
                    entry[key] = str(agent_info[key])

            # tools_dir is stored only when it is set to a non-empty value
            if agent_info.get('tools_dir'):
                entry['tools_dir'] = str(agent_info['tools_dir'])

            serializable_pool[agent_id] = entry

        evolver = self.evolver
        checkpoint = {
            'last_completed_iteration': iteration,
            'agent_pool': serializable_pool,
            'performance_records': self.performance_records,
            'test_history': self.test_history,
            'random_seed': self.random_seed,
            'total_cost': self.total_cost,
            'iteration_costs': self.iteration_costs,
            'iteration_times': self.iteration_times,
            'phase1_failures': self.phase1_failures,
            'zero_accuracy_cases': self.zero_accuracy_cases,
            'five_hour_limit_incidents': self.five_hour_limit_incidents,
            'number_of_used_papers': self.number_of_used_papers,
            'experiment_config': {
                'dataset': self.dataset,
                'num_iterations': self.num_iterations,
                'agents_per_iteration': self.agents_per_iteration,
                'databases_per_iteration': self.databases_per_iteration,
                'questions_per_database': self.questions_per_database,
                'eval_model': self.eval_model,
                'analysis_model': self.analysis_model,
                'evolution_model': self.evolution_model,
                'max_concurrent_dbs': self.max_concurrent_dbs,
                'phase1_timeout': self.phase1_timeout,
                'sql_timeout': self.sql_timeout,
                'evolution_timeout': self.evolution_timeout,
                'evolution_default': evolver.evolution_default,
                'evolution_schedule': evolver.evolution_schedule,
                'evolution_context': evolver.evolution_context,
                'evolution_count': evolver.evolution_count,
                'context_resets': evolver.context_resets,
                'evolution_retries': evolver.evolution_retries,
                'evolution_history': evolver.evolution_history,
                # These two may be absent on the evolver; default to empty lists
                'header_repairs': getattr(evolver, 'header_repairs', []),
                'evolution_validation_failures': getattr(evolver, 'evolution_validation_failures', []),
                'agents_directory': self.agents_directory,
                'context_reset_interval': evolver.context_reset_interval,
                'evolution_random_pool': evolver.evolution_random_pool,
                'evolution_weighted_random': evolver.evolution_weighted_random,
                'shuffled_random_pool': evolver.shuffled_random_pool,
                'shuffled_papers_pool': evolver.shuffled_papers_pool,
                'verification_retries': self.verification_retries,
                'temperature_strategy': self.temperature_strategy
            }
        }

        checkpoint_file = self.experiment_dir / 'checkpoint.json'
        tmp_file = checkpoint_file.with_name(checkpoint_file.name + '.tmp')
        try:
            # default=str stringifies any value json cannot encode natively
            with open(tmp_file, 'w') as f:
                json.dump(checkpoint, f, indent=2, default=str)
            # Swap in the finished file only after a complete, successful write
            os.replace(tmp_file, checkpoint_file)
        finally:
            # Only present here if the write or the replace failed
            if tmp_file.exists():
                tmp_file.unlink()
    
    
    def _generate_interim_report(self, start_time: float, iteration: int):
        """Produce the interim report for one iteration and confirm it exists.

        Delegates to ``_generate_report`` in interim mode, then checks that
        ``iteration_<NNN>/interim_report.md`` under the experiment directory is
        present and non-empty, printing a success or warning line accordingly.
        Any exception is reported (message plus traceback) and suppressed, so
        the caller is never interrupted.

        Args:
            start_time: Experiment start time, forwarded to ``_generate_report``.
            iteration: Iteration the interim report belongs to.
        """
        try:
            self._generate_report(start_time, report_type='interim', iteration=iteration)

            # Confirm the report really landed on disk with some content
            report_file = (
                self.experiment_dir / f"iteration_{iteration:03d}" / 'interim_report.md'
            )
            has_content = report_file.exists() and report_file.stat().st_size > 0

            if not has_content:
                print(f"  ⚠️ Warning: Interim report file not created or is empty: {report_file}")
            else:
                print(f"  📊 Generated interim report: iteration_{iteration:03d}/interim_report.md")
        except Exception as e:
            import traceback

            print(f"  ❌ Error generating interim report for iteration {iteration}: {e!s}")
            # Dump the full traceback to help debugging
            traceback.print_exc()

    def _generate_report(self, start_time: float, report_type: str = 'final', iteration: int = None):
        """Generate comprehensive report (final or interim).

        Args:
            start_time: Experiment start time
            report_type: 'final' or 'interim'
            iteration: Current iteration (for interim reports)
        """
        if report_type == 'final':
            report_lines = ["# Parallel Agent Research - Final Report\n"]
        else:
            report_lines = [f"# Parallel Agent Research - Interim Report (Iteration {iteration})\n"]
        
        # Configuration
        report_lines.append("## Experiment Configuration")
        report_lines.append(f"- **Run ID**: {self.experiment_dir.name}")
        report_lines.append(f"- **Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report_lines.append(f"- **Dataset**: {self.dataset}")
        if report_type == 'final':
            report_lines.append(f"- **Iterations**: {self.num_iterations}")
        else:
            report_lines.append(f"- **Total iterations planned**: {self.num_iterations}")
            report_lines.append(f"- **Current iteration**: {iteration}/{self.num_iterations}")
        report_lines.append(f"- **Agents per iteration**: {self.agents_per_iteration}")
        report_lines.append(f"- **Databases per iteration**: {self.databases_per_iteration}")
        report_lines.append(f"- **Questions per database**: {self.questions_per_database}")
        
        # Models
        report_lines.append("")
        report_lines.append("### Models")
        report_lines.append(f"- **Evaluation model**: {self.eval_model}")
        report_lines.append(f"- **Analysis model**: {self.analysis_model}")
        report_lines.append(f"- **Evolution model**: {self.evolution_model}")
        
        # Evolution Strategy
        report_lines.append("")
        report_lines.append("### Evolution Strategy")
        report_lines.append(f"- **Default strategy**: {self.evolver.evolution_default}")
        if self.evolver.evolution_schedule:
            report_lines.append(f"- **Scheduled strategies**: {self.evolver.evolution_schedule}")
        
        # Evolution History by Iteration
        if self.evolver.evolution_history:
            report_lines.append("")
            report_lines.append("### Evolution Strategies by Iteration")
            # Deduplicate by iteration number (keep the first occurrence)
            seen_iterations = set()
            for entry in self.evolver.evolution_history:
                if entry['iteration'] not in seen_iterations:
                    report_lines.append(f"- **Iteration {entry['iteration']}**: {entry['strategy']}")
                    seen_iterations.add(entry['iteration'])
        
        # Context Management
        report_lines.append("")
        report_lines.append("### Context Management")

        # Count different types of resets
        # Note: context resets use 'reason' field with values 'interval', 'force_fresh', '5hr_restart', or 'research_driven_strategy'
        scheduled_resets = [r for r in self.evolver.context_resets if r.get('reason') == 'interval']
        force_fresh_resets = [r for r in self.evolver.context_resets if r.get('reason') == 'force_fresh']
        restart_resets = [r for r in self.evolver.context_resets if r.get('reason') == '5hr_restart']
        research_driven_resets = [r for r in self.evolver.context_resets if r.get('reason') == 'research_driven_strategy']

        if scheduled_resets:
            reset_info = []
            for r in scheduled_resets:
                if 'evolution_count' in r:
                    reset_info.append(f"Iteration {r['iteration']} (evolution #{r['evolution_count']})")
                else:
                    reset_info.append(f"Iteration {r['iteration']}")
            interval_msg = f" (every {self.evolver.context_reset_interval} evolutions)" if self.evolver.context_reset_interval > 0 else " (scheduled resets disabled)"
            report_lines.append(f"- **Scheduled resets**: {', '.join(reset_info)}{interval_msg}")
        else:
            report_lines.append(f"- **Scheduled resets**: None")

        if force_fresh_resets:
            force_fresh_iterations = [r['iteration'] for r in force_fresh_resets]
            report_lines.append(f"- **Force fresh resets**: Iterations {', '.join(map(str, force_fresh_iterations))}")
        else:
            report_lines.append(f"- **Force fresh resets**: None")

        # Add 5hr restart resets (indicates run was restarted from this iteration)
        if restart_resets:
            restart_info = []
            for r in restart_resets:
                info = f"Iteration {r['iteration']}"
                if 'evolution_count' in r:
                    info += f" (evolution #{r['evolution_count']})"
                restart_info.append(info)
            report_lines.append(f"- **🔄 Restarts after 5-hour limit**: {', '.join(restart_info)}")
            report_lines.append(f"  *(Run was restarted from these iterations due to Claude conversation limits)*")

        # Add research_driven resets
        if research_driven_resets:
            research_reset_info = []
            for r in research_driven_resets:
                info = f"Iteration {r['iteration']}"
                if 'evolution_count' in r:
                    info += f" (evolution #{r['evolution_count']})"
                research_reset_info.append(info)
            report_lines.append(f"- **📚 Research-driven strategy resets**: {', '.join(research_reset_info)}")
            report_lines.append(f"  *(Context reset required due to large paper content in research_driven strategies)*")

        # Add retry resets (when files weren't created and retry was needed)
        retry_file_resets = [r for r in self.evolver.context_resets if r.get('reason') == 'retry_missing_files']
        if retry_file_resets:
            retry_info = []
            for r in retry_file_resets:
                info = f"Iteration {r['iteration']}"
                if 'evolution_count' in r:
                    info += f" (evolution #{r['evolution_count']})"
                retry_info.append(info)
            report_lines.append(f"- **🔁 Retries due to missing files**: {', '.join(retry_info)}")
            report_lines.append(f"  *(Context was reset to retry evolution when agent.md wasn't created)*")

        report_lines.append(f"- **Total context resets**: {len(self.evolver.context_resets)}")
        
        # Evolution Attempts
        if self.evolver.evolution_retries:
            successful_retries = [r for r in self.evolver.evolution_retries if r['success']]
            failed_retries = [r for r in self.evolver.evolution_retries if not r['success']]
            
            report_lines.append("")
            report_lines.append("### Evolution Attempts")
            report_lines.append(f"- **Total retries needed**: {len(self.evolver.evolution_retries)}")
            report_lines.append(f"- **Successful after retry**: {len(successful_retries)}")
            report_lines.append(f"- **Failed after retry**: {len(failed_retries)}")
            
            if successful_retries:
                success_iters = [r['iteration'] for r in successful_retries]
                report_lines.append(f"  - Succeeded on retry: Iterations {', '.join(map(str, success_iters))}")
            if failed_retries:
                fail_iters = [r['iteration'] for r in failed_retries]
                report_lines.append(f"  - Failed even after retry: Iterations {', '.join(map(str, fail_iters))}")
        
        # Experiment Performance
        report_lines.append("")
        report_lines.append("### Experiment Performance")
        
        # Calculate total time (only from active iterations, not archived ones)
        total_time = sum(self.iteration_times) if self.iteration_times else (time.time() - start_time)
        total_hours = total_time / 3600
        
        report_lines.append(f"- **Total time**: {total_time/60:.1f} minutes ({total_hours:.1f} hours)")
        report_lines.append(f"- **Total cost**: ${self.total_cost:.2f}")
        
        # Calculate averages if we have completed iterations
        completed_iterations = len(self.test_history)
        if completed_iterations > 0:
            avg_time_per_iteration = total_time / completed_iterations
            avg_cost_per_iteration = self.total_cost / completed_iterations
            if report_type == 'final':
                report_lines.append(f"- **Iterations completed**: {completed_iterations} (excludes any archived iterations)")
            else:
                report_lines.append(f"- **Iterations completed**: {iteration}/{self.num_iterations}")
            report_lines.append(f"- **Average time per iteration**: {avg_time_per_iteration/60:.1f} minutes")
            report_lines.append(f"- **Average cost per iteration**: ${avg_cost_per_iteration:.2f}")
        else:
            report_lines.append(f"- **Iterations completed**: 0")
        
        # Performance summary
        report_lines.append("\n## Performance Summary\n")

        # Winning Agents by Evolution Strategy
        try:
            if self.test_history and len(self.test_history) > 0:
                report_lines.append("### Winning Agents by Evolution Strategy\n")
                report_lines.append("This section shows which evolution strategies produced winning agents.\n")

                # Collect all winners and their strategies
                from collections import defaultdict
                strategy_winners = defaultdict(list)  # strategy -> list of agent_ids
                agent_wins = defaultdict(int)  # agent_id -> count of wins

                for iter_idx, iteration_results in enumerate(self.test_history, 1):
                    # Find max accuracy for this iteration
                    accuracies = [results.get('accuracy', 0) for results in iteration_results.values()]
                    if not accuracies:
                        continue
                    max_accuracy = max(accuracies)

                    # Find all agents that achieved max accuracy
                    winners = [agent_id for agent_id, results in iteration_results.items()
                              if results.get('accuracy', 0) == max_accuracy]

                    for winner in winners:
                        agent_wins[winner] += 1

                # Group agents by strategy
                for agent_id, win_count in agent_wins.items():
                    if win_count > 0:
                        agent_info = self.agent_pool.get(agent_id, {})
                        strategy = agent_info.get('evolution_strategy')
                        if strategy is None:
                            strategy = 'initial' if agent_info.get('source') == 'initial' else 'unknown'

                        if agent_id not in [a for a, _ in strategy_winners[strategy]]:
                            strategy_winners[strategy].append((agent_id, win_count))

                # Calculate statistics
                total_wins = sum(agent_wins.values())

                # Sort strategies by total wins
                strategy_totals = {}
                for strategy, agents in strategy_winners.items():
                    total = sum(wins for _, wins in agents)
                    strategy_totals[strategy] = total

                sorted_strategies = sorted(strategy_totals.items(), key=lambda x: x[1], reverse=True)

                # Display results
                for strategy, win_count in sorted_strategies:
                    if win_count > 0:
                        percentage = (win_count / total_wins * 100) if total_wins > 0 else 0
                        report_lines.append(f"#### **{strategy.replace('_', ' ').title()}** ({win_count} wins - {percentage:.1f}% of all wins)")

                        # Sort agents by number of wins
                        agents = sorted(strategy_winners[strategy], key=lambda x: x[1], reverse=True)

                        for agent, wins in agents:
                            agent_info = self.agent_pool.get(agent, {})
                            created_iter = agent_info.get('created_iteration', 'unknown')
                            report_lines.append(f"- **{agent}** ({wins} {'win' if wins == 1 else 'wins'}, created iteration {created_iter})")
                        report_lines.append("")

                # Summary statistics
                report_lines.append("#### Summary Statistics")
                report_lines.append(f"- **Total iterations with winners**: {len(self.test_history)}")
                report_lines.append(f"- **Total wins counted**: {total_wins} (includes ties)")
                report_lines.append(f"- **Unique winning agents**: {len(agent_wins)}")
                report_lines.append(f"- **Evolution strategies that produced winners**: {len([s for s, t in strategy_totals.items() if t > 0])}")
                report_lines.append("")
        except Exception as e:
            report_lines.append("### Winning Agents by Evolution Strategy\n")
            report_lines.append(f"⚠️ Error generating strategy analysis: {str(e)}\n")
        
        # Add comprehensive ranking table
        if self.test_history and len(self.test_history) > 0:
            ranking_table = self.evolver._generate_ranking_table(
                self.test_history, 
                self.performance_records, 
                for_evolution=False
            )
            report_lines.append(ranking_table)
            report_lines.append("\n")
        
        # Sort agents by ELO
        sorted_agents = sorted(self.performance_records.keys(),
                             key=lambda a: self.performance_records[a]['elo'],
                             reverse=True)
        
        # Simple summary table
        report_lines.append("### Quick Summary\n")
        report_lines.append("| Agent | ELO | Mean Accuracy | Tests |")
        report_lines.append("|-------|-----|---------------|-------|")
        
        for agent_id in sorted_agents:
            perf = self.performance_records[agent_id]
            report_lines.append(f"| {agent_id} | {perf['elo']:.0f} | "
                              f"{perf['mean_accuracy']:.1f}% | {perf['test_count']} |")
        
        # Best agent
        if sorted_agents:
            best_agent = sorted_agents[0]
            report_lines.append(f"\n## Best Agent: {best_agent}")
            report_lines.append(f"- ELO Score: {self.performance_records[best_agent]['elo']:.0f}")
            report_lines.append(f"- Mean Accuracy: {self.performance_records[best_agent]['mean_accuracy']:.1f}%")
        
        # ELO Leadership Progression
        report_lines.append("\n")
        elo_leadership_section = self._generate_elo_leadership_section()
        report_lines.append(elo_leadership_section)
        
        # Phase 1 Failures section
        report_lines.append("\n## Phase 1 Failures")
        if self.phase1_failures:
            report_lines.append(f"Total Phase 1 failures: {len(self.phase1_failures)}")
            report_lines.append("\n| Agent | Database | Iteration |")
            report_lines.append("|-------|----------|-----------|")
            for agent_id, db_name, iteration in sorted(self.phase1_failures):
                report_lines.append(f"| {agent_id} | {db_name} | {iteration} |")
        else:
            report_lines.append("No Phase 1 failures encountered ✅")
        
        # Zero Accuracy Cases section
        report_lines.append("\n## Zero Accuracy Cases")
        if self.zero_accuracy_cases:
            report_lines.append(f"Total zero accuracy cases: {len(self.zero_accuracy_cases)}")
            report_lines.append("\n| Agent | Database | Iteration | Questions Tested |")
            report_lines.append("|-------|----------|-----------|------------------|")
            for agent_id, db_name, iteration, total_q in sorted(self.zero_accuracy_cases):
                report_lines.append(f"| {agent_id} | {db_name} | {iteration} | {total_q} |")
        else:
            report_lines.append("No zero accuracy cases encountered ✅")

        # Evolution Health section
        report_lines.append("\n## Evolution Health")

        # Validation failures
        if hasattr(self.evolver, 'evolution_validation_failures') and self.evolver.evolution_validation_failures:
            validation_failures = self.evolver.evolution_validation_failures
            report_lines.append(f"\n### Artifact Validation Failures")
            report_lines.append(f"Total validation failures: {len(validation_failures)}")
            report_lines.append("\n| Iteration | Errors | Used Continue |")
            report_lines.append("|-----------|--------|---------------|")
            for failure in validation_failures:
                errors_str = ', '.join(failure['errors'][:2])  # Show first 2 errors
                if len(failure['errors']) > 2:
                    errors_str += f" (+{len(failure['errors'])-2} more)"
                report_lines.append(f"| {failure['iteration']} | {errors_str} | {failure['used_continue']} |")
        else:
            report_lines.append("\n### Artifact Validation")
            report_lines.append("All evolution artifacts validated successfully ✅")

        # YAML header repairs
        if hasattr(self.evolver, 'header_repairs') and self.evolver.header_repairs:
            header_repairs = self.evolver.header_repairs
            report_lines.append(f"\n### YAML Header Repairs")
            report_lines.append(f"Total repairs: {len(header_repairs)}")
            report_lines.append("\n| Iteration | Agent ID |")
            report_lines.append("|-----------|----------|")
            for repair in header_repairs:
                report_lines.append(f"| {repair['iteration']} | {repair['agent_id']} |")

        # Evolution retries
        if hasattr(self.evolver, 'evolution_retries') and self.evolver.evolution_retries:
            retries = self.evolver.evolution_retries
            successful = sum(1 for r in retries if r.get('success', False))
            failed = len(retries) - successful
            report_lines.append(f"\n### Evolution Retries")
            report_lines.append(f"Total retries: {len(retries)} (✅ {successful} successful, ❌ {failed} failed)")
            report_lines.append("\n| Iteration | Reason | Success |")
            report_lines.append("|-----------|--------|---------|")
            for retry in retries:
                success_str = "✅ Yes" if retry.get('success', False) else "❌ No"
                reason = retry.get('reason', 'unknown')
                report_lines.append(f"| {retry['iteration']} | {reason} | {success_str} |")

        # Claude 5-Hour Limit Incidents section
        report_lines.append("\n## Claude 5-Hour Limit Incidents")
        if self.five_hour_limit_incidents:
            report_lines.append(f"Total incidents: {len(self.five_hour_limit_incidents)}")
            report_lines.append("All incidents were automatically handled by waiting and retrying.\n")
            report_lines.append("| Iteration | Timestamp | Reset Time | Phase 1 Failures | Retry Successful |")
            report_lines.append("|-----------|-----------|------------|------------------|------------------|")
            for incident in self.five_hour_limit_incidents:
                retry_status = "✅ Yes" if incident['retry_successful'] else "❌ No"
                phase1_info = f"{incident['phase1_failures_detected']} (iter {incident['iteration']-1})" if incident['phase1_failures_detected'] > 0 else "0"
                report_lines.append(f"| {incident['iteration']} | {incident['timestamp'][:19]} | {incident['reset_time']} | {phase1_info} | {retry_status} |")
        else:
            report_lines.append("No 5-hour limit incidents encountered ✅")

        # Save report
        if report_type == 'final':
            report_file = self.experiment_dir / 'final_report.md'
        else:
            iteration_dir = self.experiment_dir / f"iteration_{iteration:03d}"
            iteration_dir.mkdir(exist_ok=True)
            report_file = iteration_dir / 'interim_report.md'

        with open(report_file, 'w') as f:
            f.write('\n'.join(report_lines))

        if report_type == 'final':
            print(f"\n📊 Final report saved to: {report_file}")

def _parse_json_object(raw: str, int_keys: bool = False) -> dict:
    """Parse a command-line JSON string that must encode a JSON object.

    Args:
        raw: The raw JSON text taken from the command line.
        int_keys: When True, every key is converted with ``int()`` (used for
            iteration-number -> strategy schedules).

    Returns:
        The decoded dict, with integer keys if ``int_keys`` is set.

    Raises:
        ValueError: If the text is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError`` subclass), if the decoded value is not a JSON
            object, or if a key cannot be converted to ``int``.
    """
    parsed = json.loads(raw)
    if not isinstance(parsed, dict):
        # Without this check a JSON list/number would fail later with an
        # AttributeError on .items()/.values() and bypass the friendly message.
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    if int_keys:
        return {int(k): v for k, v in parsed.items()}
    return parsed


def _total_weight(weights: dict):
    """Return the sum of the percentages in a weighted-random strategy pool.

    Raises:
        ValueError: If any weight is not a number (otherwise ``sum`` would
            raise an uncaught ``TypeError``).
    """
    non_numeric = [name for name, weight in weights.items()
                   if not isinstance(weight, (int, float))]
    if non_numeric:
        raise ValueError(f"weights must be numbers, got non-numeric values for: {non_numeric}")
    return sum(weights.values())


def main():
    """Main entry point for the parallel agent researcher.

    Parses the command line, resolves the API key, and then runs one of two
    flows:

    * ``--resume DIR``: load the checkpoint from ``DIR`` and continue the
      experiment. Configuration comes from the checkpoint; only
      ``--from-iteration``, ``--extend``, ``--modify-evolution-schedule`` and
      ``--modify-evolution-weighted-random`` change behaviour.
    * otherwise: start a fresh run (normal research mode, ``--dev-eval`` or
      ``--test-eval``) configured from the command line.

    Returns:
        None. On any validation error a message is printed and the function
        returns early without starting a run.
    """
    parser = argparse.ArgumentParser(description='RoboPhD Parallel Agent Research System')

    # Core parameters
    parser.add_argument('--dataset', choices=['train', 'dev'], default='train',
                       help='Dataset to use (default: train)')
    parser.add_argument('--num-iterations', type=int, default=5,
                       help='Number of research iterations (default: 5)')
    parser.add_argument('--agents-per-iteration', type=int, default=None,
                       help='Number of agents to test per iteration (default: 3 for research, 1 for dev-eval)')
    parser.add_argument('--databases-per-iteration', type=int, default=8,
                       help='Number of databases per iteration (default: 8)')
    parser.add_argument('--questions-per-database', type=int, default=30,
                       help='Questions to test per database (default: 30)')

    # Models
    parser.add_argument('--eval-model', default='haiku-3.5',
                       help='Model for SQL generation (default: haiku-3.5)')
    parser.add_argument('--analysis-model', default='sonnet-4',
                       help='Model for Phase 1 analysis (default: sonnet-4)')
    parser.add_argument('--evolution-model', default='opus-4.1',
                       help='Model for agent evolution (default: opus-4.1)')

    # Evolution strategy parameters
    parser.add_argument('--evolution-default', default='cross_pollination_judgment',
                       help='Default evolution strategy to use (default: cross_pollination_judgment)')
    parser.add_argument('--evolution-schedule', type=str,
                       help='Evolution schedule as JSON dict, e.g. \'{"2": "none", "5": "agent_cross_pollination"}\' to skip evolution at iteration 2 and use cross_pollination at iteration 5 (Note: iteration 1 never evolves)')
    parser.add_argument('--evolution-random', type=str,
                       help='Pool of evolution strategies for "random" keyword as JSON list, e.g. \'["simplified_refinement", "cross_pollination", "research_driven"]\'')
    parser.add_argument('--evolution-weighted-random', type=str,
                       help='Weighted random strategy pool as JSON dict with percentages that sum to 100, e.g. \'{"research_driven": 33, "refinement": 33, "use_judgment_focus_on_errors": 17, "use_judgment_focus_on_prompt": 17}\'')

    # Evolution context is always enabled (removed --no-evolution-context flag)
    parser.add_argument('--context-reset-interval', type=int, default=4,
                       help='Reset evolution context every N evolutions to avoid token limits (default: 4, use 0 to disable)')

    # Other parameters
    parser.add_argument('--agents-directory', type=str, default=None,
                       help='Custom directory for agents (default: RoboPhD/agents/)')
    parser.add_argument('--max-concurrent-dbs', type=int, default=7,
                       help='Maximum concurrent database processing (default: 7)')
    parser.add_argument('--random-seed', type=int,
                       help='Random seed for reproducibility (default: random)')
    parser.add_argument('--initial-agents', nargs='+',
                       help='Specific agents to start with (default: auto-discover all)')
    parser.add_argument('--number-of-used-papers', type=int, default=0,
                       help='Number of papers already used from the deterministic shuffle (for fresh starts)')

    # Resume/extend parameters
    parser.add_argument('--resume', type=str,
                       help='Resume from experiment directory')
    parser.add_argument('--from-iteration', type=int,
                       help='Restart from specific iteration N (archives iterations N and later, then regenerates from N)')
    parser.add_argument('--extend', type=int,
                       help='Extend a completed run with additional iterations')
    parser.add_argument('--modify-evolution-schedule', type=str,
                       help='Modify evolution schedule as JSON dict. With --from-iteration: for iterations >= from_iteration. With --extend: for iterations > last_completed. Example: \'{"26": "simplified_refinement", "27": "none"}\')')
    # NOTE: argparse %-formats help text, hence the doubled percent sign.
    parser.add_argument('--modify-evolution-weighted-random', type=str,
                       help='Modify weighted random pool when resuming. Percentages must sum to 100%%. Example: \'{"research_driven": 40, "refinement": 40, "use_your_judgment": 20}\'')

    # SQL verification parameters
    parser.add_argument('--verification-retries', type=int, default=2,
                       help='Number of verification attempts (default: 2, 0 = current behavior)')
    parser.add_argument('--temperature-strategy', choices=['progressive', 'fixed', 'adaptive'],
                       default='progressive',
                       help='Temperature strategy for verification retries (default: progressive)')

    # Dev evaluation mode
    parser.add_argument('--dev-eval', action='store_true',
                       help='Dev set evaluation mode: one iteration and agent, all questions and databases, saves to robophd_evaluation/')
    parser.add_argument('--test-eval', action='store_true',
                       help='Test set evaluation mode: one iteration and agent, all questions and databases, saves to robophd_evaluation/ with test_predictions.json')

    args = parser.parse_args()

    # Resolve the API key up front; neither flow can run without it.
    resolved_api_key = resolve_api_key()
    if not resolved_api_key:
        print("Error: API key required. Either:")
        print("  1. Create .anthropic_key file in project root with your key")
        print(f"  2. Set {API_KEY_ENV_VAR} environment variable")
        return

    # Check if resuming from checkpoint
    if args.resume:
        # When resuming, ALL configuration comes from checkpoint
        # Command-line options are IGNORED except for --from-iteration, --extend, --modify-evolution-schedule, and --modify-evolution-weighted-random
        print("📂 Resuming from checkpoint - using saved configuration")
        print("   Note: Command-line options ignored except --from-iteration, --extend, --modify-evolution-schedule, and --modify-evolution-weighted-random")

        # Load checkpoint
        experiment_dir = Path(args.resume)
        if not experiment_dir.exists():
            print(f"Error: Experiment directory not found: {experiment_dir}")
            return

        try:
            checkpoint = ParallelAgentResearcher.load_checkpoint(experiment_dir)
        except FileNotFoundError:
            print(f"Error: No checkpoint found in {experiment_dir}")
            print("This experiment may not have checkpoint support.")
            return

        # Determine resume point; older checkpoints without an explicit
        # counter fall back to the number of recorded test iterations.
        last_completed = checkpoint.get('last_completed_iteration', len(checkpoint.get('test_history', [])))

        if args.from_iteration:
            resume_from = args.from_iteration
            print(f"Resuming from iteration {resume_from} (user specified)")
        elif args.extend:
            resume_from = last_completed + 1
            print(f"Extending from iteration {resume_from} (adding {args.extend} iterations)")
        else:
            # Auto-detect: resume from next iteration after last completed
            resume_from = last_completed + 1
            print(f"Auto-resuming from iteration {resume_from} (last completed: {last_completed})")

        # Create researcher from checkpoint
        if 'experiment_config' in checkpoint:
            config = checkpoint['experiment_config']
        else:
            # Handle old checkpoint format - use defaults
            print("⚠️  Old checkpoint format detected, using default configuration")
            config = {
                'dataset': args.dataset if args.dataset else 'train',
                'num_iterations': args.num_iterations if args.num_iterations else 10,
                'agents_per_iteration': args.agents_per_iteration if args.agents_per_iteration else 3,
                'databases_per_iteration': args.databases_per_iteration if args.databases_per_iteration else 4,
                'questions_per_database': args.questions_per_database if args.questions_per_database else 20,
                'eval_model': args.eval_model if args.eval_model else 'sonnet-4',
                'analysis_model': args.analysis_model if args.analysis_model else 'sonnet-4',
                'evolution_model': args.evolution_model if args.evolution_model else 'opus-4.1',
                'max_concurrent_dbs': args.max_concurrent_dbs if args.max_concurrent_dbs else 4,
                'phase1_timeout': 1800,
                'sql_timeout': 3600,
                'evolution_timeout': 1800
            }

        # Update iterations if extending
        if args.extend:
            config['num_iterations'] = last_completed + args.extend

        # ALWAYS use settings from checkpoint when resuming - no overrides allowed
        # Only --from-iteration, --extend, --modify-evolution-schedule and
        # --modify-evolution-weighted-random modify behavior
        evolution_default = config.get('evolution_default', 'agent_incremental_refinement')
        evolution_schedule = config.get('evolution_schedule', {})
        evolution_context = config.get('evolution_context', True)

        # Convert evolution schedule keys from strings to integers (JSON serialization issue)
        if isinstance(evolution_schedule, dict) and evolution_schedule:
            evolution_schedule = {
                int(k) if isinstance(k, str) and k.isdigit() else k: v
                for k, v in evolution_schedule.items()
            }

        # Handle modified evolution schedule if provided
        if (args.extend or args.from_iteration) and args.modify_evolution_schedule:
            try:
                # Raises ValueError for bad JSON, a non-object, or non-integer keys
                modify_schedule = _parse_json_object(args.modify_evolution_schedule, int_keys=True)

                # Validate iterations based on mode
                if args.extend:
                    # For extend mode: only allow iterations > last_completed
                    invalid_iterations = [k for k in modify_schedule if k <= last_completed]
                    if invalid_iterations:
                        print(f"❌ Error: --modify-evolution-schedule contains iterations that have already been completed: {invalid_iterations}")
                        print(f"   Last completed iteration was {last_completed}, with --extend schedule should only contain iterations > {last_completed}")
                        return
                elif args.from_iteration:
                    # For from-iteration mode: only allow iterations >= from_iteration
                    invalid_iterations = [k for k in modify_schedule if k < args.from_iteration]
                    if invalid_iterations:
                        print(f"❌ Error: --modify-evolution-schedule contains iterations before from_iteration: {invalid_iterations}")
                        print(f"   Restarting from iteration {args.from_iteration}, schedule should only contain iterations >= {args.from_iteration}")
                        return

                # Ensure all iterations are within the range (only for extend mode)
                if args.extend:
                    max_iteration = last_completed + args.extend
                    out_of_range = [k for k in modify_schedule if k > max_iteration]
                    if out_of_range:
                        print(f"⚠️ Warning: --modify-evolution-schedule contains iterations beyond the extended range: {out_of_range}")
                        print(f"   Extending by {args.extend} iterations means the last iteration will be {max_iteration}")

                # Merge with existing evolution schedule
                if isinstance(evolution_schedule, dict):
                    # Merge the schedules (keys already converted to integers above)
                    evolution_schedule.update(modify_schedule)
                else:
                    evolution_schedule = modify_schedule

                mode_text = "Extended" if args.extend else "Modified"
                print(f"📅 {mode_text} evolution schedule applied:")
                for iter_num in sorted(modify_schedule):
                    print(f"   Iteration {iter_num}: {modify_schedule[iter_num]}")

            except (json.JSONDecodeError, ValueError) as e:
                print(f"❌ Invalid --modify-evolution-schedule format: {e}")
                print("   Expected JSON dict like: '{\"26\": \"simplified_refinement\", \"27\": \"none\"}'")
                return
        elif args.modify_evolution_schedule and not (args.extend or args.from_iteration):
            print("⚠️ Warning: --modify-evolution-schedule provided without --extend or --from-iteration, ignoring")

        # Handle modified evolution weighted random if provided
        evolution_weighted_random = config.get('evolution_weighted_random', {})
        if (args.extend or args.from_iteration) and args.modify_evolution_weighted_random:
            try:
                modify_weighted = _parse_json_object(args.modify_evolution_weighted_random)

                # Validate percentages sum to 100
                total = _total_weight(modify_weighted)
                if total != 100:
                    print(f"❌ Error: --modify-evolution-weighted-random percentages must sum to 100%, got {total}%")
                    return

                # Replace existing weighted random config
                evolution_weighted_random = modify_weighted

                mode_text = "Extended" if args.extend else "Modified"
                print(f"⚖️ {mode_text} weighted random pool applied:")
                for strategy, weight in modify_weighted.items():
                    print(f"   {strategy}: {weight}%")

            except (json.JSONDecodeError, ValueError) as e:
                print(f"❌ Invalid --modify-evolution-weighted-random format: {e}")
                print("   Expected JSON dict like: '{\"research_driven\": 40, \"refinement\": 40, \"use_your_judgment\": 20}'")
                return
        elif args.modify_evolution_weighted_random and not (args.extend or args.from_iteration):
            print("⚠️ Warning: --modify-evolution-weighted-random provided without --extend or --from-iteration, ignoring")

        # Validate evolution strategies before creating researcher
        # This ensures we fail fast if any strategies are invalid
        try:
            # Constructed only for the validation performed in __init__;
            # the instance itself is discarded.
            ParallelAgentEvolver(
                experiment_dir=Path("."),  # Temp path just for validation
                evolution_default=evolution_default,
                evolution_schedule=evolution_schedule,
                evolution_timeout=900,  # Dummy value for validation
                evolution_random_pool=config.get('evolution_random_pool'),
                evolution_weighted_random=evolution_weighted_random  # Use potentially modified version
            )
        except ValueError as e:
            print(f"❌ Error validating evolution strategies from checkpoint: {e}")
            return

        researcher = ParallelAgentResearcher(
            dataset=config['dataset'],
            num_iterations=config['num_iterations'],
            agents_per_iteration=config['agents_per_iteration'],
            databases_per_iteration=config['databases_per_iteration'],
            questions_per_database=config['questions_per_database'],
            eval_model=config['eval_model'],
            analysis_model=config['analysis_model'],
            evolution_model=config['evolution_model'],
            max_concurrent_dbs=config['max_concurrent_dbs'],
            random_seed=checkpoint['random_seed'],
            phase1_timeout=config.get('phase1_timeout', 1800),
            sql_timeout=config.get('sql_timeout', 3600),
            evolution_timeout=config.get('evolution_timeout', 1800),
            evolution_default=evolution_default,
            evolution_schedule=evolution_schedule,
            evolution_context=evolution_context,
            evolution_random_pool=config.get('evolution_random_pool'),
            evolution_weighted_random=evolution_weighted_random,  # Use potentially modified version
            resume_mode=True,
            resume_from_iteration=resume_from,
            resume_checkpoint=checkpoint,
            resume_experiment_dir=experiment_dir,
            agents_directory=config.get('agents_directory'),
            context_reset_interval=config.get('context_reset_interval', 4),  # Load from checkpoint or use default
            # argparse always defines these attributes, so the checkpoint must
            # be consulted first; the CLI value is only the fallback for
            # checkpoints that did not record the setting.
            verification_retries=config.get('verification_retries', args.verification_retries),
            temperature_strategy=config.get('temperature_strategy', args.temperature_strategy),
            api_key=resolved_api_key
        )

        researcher.run()
    else:
        # --dev-eval takes precedence over --test-eval when both are given
        if args.dev_eval:
            eval_mode = 'dev'
        elif args.test_eval:
            eval_mode = 'test'
        else:
            eval_mode = None

        if eval_mode is not None:
            # Evaluation mode: override the sizing parameters
            if args.dataset != eval_mode:
                print(f"⚠️  {eval_mode.title()} evaluation mode automatically sets --dataset={eval_mode}")

            dataset = eval_mode
            num_iterations = 1  # Single iteration for evaluation
            # Honour an explicit --agents-per-iteration, otherwise default to 1
            if args.agents_per_iteration is not None:
                agents_per_iteration = args.agents_per_iteration
            else:
                agents_per_iteration = 1
            databases_per_iteration = 999  # All databases
            questions_per_database = 99999  # All questions

            # Use custom experiment directory pattern
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            experiment_name = f"{eval_mode}_{timestamp}"

            print(f"🔍 {eval_mode.title()} Evaluation Mode Enabled")
            print(f"   Dataset: {dataset}")
            print(f"   Testing: {len(args.initial_agents) if args.initial_agents else 'all'} agent(s)")
            print(f"   Directory: robophd_evaluation/{experiment_name}")
        else:
            # Normal research mode
            dataset = args.dataset
            num_iterations = args.num_iterations
            agents_per_iteration = args.agents_per_iteration if args.agents_per_iteration is not None else 3
            databases_per_iteration = args.databases_per_iteration
            questions_per_database = args.questions_per_database
            experiment_name = None  # Use default timestamp pattern

        # Parse evolution schedule if provided
        evolution_schedule = None
        if args.evolution_schedule:
            try:
                # Keys arrive as JSON strings and are converted to integers
                evolution_schedule = _parse_json_object(args.evolution_schedule, int_keys=True)
                print(f"📅 Using evolution schedule: {evolution_schedule}")
            except (json.JSONDecodeError, ValueError) as e:
                print(f"⚠️ Invalid evolution schedule format: {e}")
                print("   Expected JSON dict like: '{\"1\": \"none\", \"5\": \"agent_cross_pollination\"}'")
                return

        # Parse evolution random pool if provided
        evolution_random_pool = None
        if args.evolution_random:
            try:
                evolution_random_pool = json.loads(args.evolution_random)
                print(f"🎲 Using random evolution pool: {evolution_random_pool}")
            except (json.JSONDecodeError, ValueError) as e:
                print(f"⚠️ Invalid evolution random pool format: {e}")
                print("   Expected JSON list like: '[\"simplified_refinement\", \"cross_pollination\", \"research_driven\"]'")
                return

        # Parse evolution weighted random if provided
        evolution_weighted_random = None
        if args.evolution_weighted_random:
            try:
                evolution_weighted_random = _parse_json_object(args.evolution_weighted_random)
                # Validate percentages sum to 100
                total = _total_weight(evolution_weighted_random)
                if total != 100:
                    print(f"❌ Weighted random percentages must sum to 100%, got {total}%")
                    return
                print("⚖️ Using weighted random evolution pool:")
                for strategy, weight in evolution_weighted_random.items():
                    print(f"   - {strategy}: {weight}%")
            except (json.JSONDecodeError, ValueError) as e:
                print(f"⚠️ Invalid evolution weighted random format: {e}")
                print("   Expected JSON dict like: '{\"research_driven\": 33, \"refinement\": 33, \"use_judgment_focus_on_errors\": 17, \"use_judgment_focus_on_prompt\": 17}'")
                return

        # Normal run (not resuming)
        researcher = ParallelAgentResearcher(
            dataset=dataset,
            num_iterations=num_iterations,
            agents_per_iteration=agents_per_iteration,
            databases_per_iteration=databases_per_iteration,
            questions_per_database=questions_per_database,
            eval_model=args.eval_model,
            analysis_model=args.analysis_model,
            evolution_model=args.evolution_model,
            max_concurrent_dbs=args.max_concurrent_dbs,
            random_seed=args.random_seed,
            evolution_default=args.evolution_default,
            evolution_schedule=evolution_schedule,
            evolution_context=True,  # Always enabled
            dev_eval_mode=args.dev_eval,
            test_eval_mode=args.test_eval,
            custom_experiment_name=experiment_name,
            agents_directory=args.agents_directory,
            number_of_used_papers=args.number_of_used_papers,
            context_reset_interval=args.context_reset_interval,
            evolution_random_pool=evolution_random_pool,
            evolution_weighted_random=evolution_weighted_random,
            evolution_timeout=1800,  # 30 minutes
            phase1_timeout=1800,  # 30 minutes for DB analysis
            sql_timeout=3600,  # 60 minutes for SQL execution (with verification)
            verification_retries=args.verification_retries,
            temperature_strategy=args.temperature_strategy,
            api_key=resolved_api_key
        )

        # Run experiment
        researcher.run(initial_agents=args.initial_agents)

# Script entry point: when the file is executed directly (not imported),
# configure the root logger at INFO level and hand control to the CLI in main().
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()
