import random
import importlib.util
from utils.llm_service import client
import os
import shutil
from typing import List, Dict, Any, Callable, Tuple
import json
import traceback
import numpy as np
from Agent_block import AGENT_BLOCKS
from FlowGen import FlowGen
from evo_optimizer import FlowGenome, EvoOptimizer
import re
from tqdm import tqdm


class DifficultyClassifier:
    """LLM-as-a-judge difficulty classifier for datasets without predefined difficulty levels"""
    
    def __init__(self, llm="gpt-4o-mini"):
        """Initialize difficulty classifier
        
        Args:
            llm: Language model to use for difficulty assessment
        """
        self.llm = llm
        self.difficulty_prompt_templates = {
            "math": """
You are an expert mathematics educator. Please analyze the following mathematical problem and classify its difficulty level.

Problem: {problem}

Please consider these factors when assessing difficulty:
1. Mathematical concepts required (basic arithmetic, algebra, geometry, calculus, etc.)
2. Number of steps needed to solve
3. Abstract thinking requirements
4. Prerequisites knowledge level
5. Problem complexity and reasoning depth

Classify the difficulty as one of:
- Level 1: Basic arithmetic, simple word problems (elementary level)
- Level 2: Basic algebra, simple geometry (middle school level)  
- Level 3: Intermediate algebra, geometry, basic trigonometry (high school level)
- Level 4: Advanced algebra, advanced geometry, pre-calculus (advanced high school)
- Level 5: Calculus, advanced mathematics (college level)

Respond with ONLY the level number (1, 2, 3, 4, or 5) and a brief explanation.
Format: "Level X: [brief explanation]"
""",
            "gsm8k": """
You are an expert in mathematical reasoning. Please analyze the following word problem and classify its difficulty level.

Problem: {problem}

Consider these aspects:
1. Number of mathematical operations required
2. Complexity of reasoning steps
3. Multiple concepts integration
4. Abstract vs concrete thinking needed
5. Problem length and information processing

Classify as:
- Easy: Simple arithmetic, 1-2 steps, concrete scenarios
- Medium: Multiple steps, basic reasoning, some abstraction
- Hard: Complex multi-step reasoning, multiple concepts, high abstraction

Respond with ONLY the difficulty level (Easy, Medium, or Hard) and a brief explanation.
Format: "[Difficulty]: [brief explanation]"
""",
            "general": """
You are an expert educator. Please analyze the following problem and classify its difficulty level.

Problem: {problem}

Consider:
1. Cognitive load required
2. Prior knowledge needed
3. Problem-solving steps complexity
4. Abstract reasoning requirements
5. Time typically needed to solve

Classify as Easy, Medium, or Hard.
Respond with ONLY the difficulty level and a brief explanation.
Format: "[Difficulty]: [brief explanation]"
"""
        }
    
    def classify_single_problem(self, problem_text: str, domain: str = "general") -> Dict[str, Any]:
        """Classify difficulty of a single problem using LLM
        
        Args:
            problem_text: The problem text to classify
            domain: Domain type ("math", "gsm8k", or "general")
            
        Returns:
            Dictionary with difficulty level and explanation
        """
        try:
            # Get appropriate prompt template
            template = self.difficulty_prompt_templates.get(domain, self.difficulty_prompt_templates["general"])
            prompt = template.format(problem=problem_text)
            
            # Query LLM
            response = client().chat.completions.create(
                model=self.llm,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,  # Low temperature for consistent classification
                max_tokens=150
            )
            
            result_text = response.choices[0].message.content.strip()
            
            # Parse response
            if domain == "math":
                # Extract level number for math problems
                level_match = re.search(r'Level (\d+)', result_text)
                if level_match:
                    level = int(level_match.group(1))
                    difficulty = f"Level {level}"
                else:
                    difficulty = "Level 3"  # Default to medium
            else:
                # Extract difficulty word for other domains
                if "Easy" in result_text or "easy" in result_text:
                    difficulty = "Easy"
                elif "Hard" in result_text or "hard" in result_text:
                    difficulty = "Hard"
                else:
                    difficulty = "Medium"  # Default to medium
            
            return {
                "difficulty": difficulty,
                "explanation": result_text,
                "confidence": "high" if any(word in result_text.lower() for word in ["clearly", "obviously", "definitely"]) else "medium"
            }
            
        except Exception as e:
            print(f"Error classifying problem: {e}")
            # Return default classification on error
            default_difficulty = "Level 3" if domain == "math" else "Medium"
            return {
                "difficulty": default_difficulty,
                "explanation": f"Default classification due to error: {str(e)}",
                "confidence": "low"
            }
    
    def classify_dataset(self, dataset_path: str, domain: str = "general", 
                        sample_size: int = None, output_path: str = None) -> Dict[str, Any]:
        """Classify difficulty for an entire dataset
        
        Args:
            dataset_path: Path to the dataset file
            domain: Domain type for classification
            sample_size: Number of problems to classify (None for all)
            output_path: Path to save classification results
            
        Returns:
            Dictionary with classification results and statistics
        """
        print(f"📚 Starting difficulty classification for {dataset_path}")
        
        # Load dataset
        problems = self._load_dataset(dataset_path)
        
        if sample_size and sample_size < len(problems):
            import random
            random.seed(42)
            problems = random.sample(problems, sample_size)
            print(f"🎯 Sampling {sample_size} problems from {len(problems)} total")
        
        # Classify each problem
        classified_problems = []
        difficulty_counts = {}
        
        print("🤖 Classifying problems using LLM...")
        for i, problem in enumerate(tqdm(problems, desc="Classifying")):
            # Extract problem text based on dataset format
            problem_text = self._extract_problem_text(problem, domain)
            
            # Classify difficulty
            classification = self.classify_single_problem(problem_text, domain)
            
            # Add classification to problem
            problem_with_difficulty = problem.copy()
            problem_with_difficulty.update({
                "difficulty_classification": classification,
                "original_index": i
            })
            
            classified_problems.append(problem_with_difficulty)
            
            # Count difficulties
            difficulty = classification["difficulty"]
            difficulty_counts[difficulty] = difficulty_counts.get(difficulty, 0) + 1
        
        # Create classification summary
        classification_results = {
            "dataset_path": dataset_path,
            "domain": domain,
            "total_problems": len(problems),
            "classified_problems": classified_problems,
            "difficulty_distribution": difficulty_counts,
            "classification_stats": {
                "high_confidence": len([p for p in classified_problems 
                                      if p["difficulty_classification"]["confidence"] == "high"]),
                "medium_confidence": len([p for p in classified_problems 
                                        if p["difficulty_classification"]["confidence"] == "medium"]),
                "low_confidence": len([p for p in classified_problems 
                                     if p["difficulty_classification"]["confidence"] == "low"])
            }
        }
        
        # Save results if output path provided
        if output_path:
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(classification_results, f, ensure_ascii=False, indent=2)
            print(f"💾 Classification results saved to: {output_path}")
        
        # Print summary
        self._print_classification_summary(classification_results)
        
        return classification_results
    
    def _load_dataset(self, dataset_path: str) -> List[Dict[str, Any]]:
        """Load dataset from file"""
        problems = []
        
        try:
            if dataset_path.endswith('.jsonl'):
                # Load JSONL file
                with open(dataset_path, 'r', encoding='utf-8') as f:
                    for line in f:
                        if line.strip():
                            problems.append(json.loads(line.strip()))
            elif dataset_path.endswith('.json'):
                # Load JSON file
                with open(dataset_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    if isinstance(data, list):
                        problems = data
                    elif isinstance(data, dict) and 'problems' in data:
                        problems = data['problems']
                    else:
                        problems = [data]
            else:
                raise ValueError(f"Unsupported file format: {dataset_path}")
                
        except Exception as e:
            raise Exception(f"Failed to load dataset from {dataset_path}: {e}")
        
        return problems
    
    def _extract_problem_text(self, problem: Dict[str, Any], domain: str) -> str:
        """Extract problem text based on dataset format and domain"""
        # Common field names for problem text
        text_fields = ['question', 'problem', 'prompt', 'text', 'input', 'query']
        
        for field in text_fields:
            if field in problem:
                return problem[field]
        
        # If no standard field found, try to extract from the problem dict
        problem_str = str(problem)
        if len(problem_str) > 50:  # Reasonable length for a problem
            return problem_str
        
        # Last resort: return string representation
        return str(problem)
    
    def _print_classification_summary(self, results: Dict[str, Any]):
        """Print classification summary"""
        print("\n📊 Difficulty Classification Summary:")
        print("=" * 50)
        print(f"Total problems classified: {results['total_problems']}")
        print(f"Domain: {results['domain']}")
        
        print("\n🎯 Difficulty Distribution:")
        for difficulty, count in results['difficulty_distribution'].items():
            percentage = (count / results['total_problems']) * 100
            print(f"  {difficulty}: {count} ({percentage:.1f}%)")
        
        print("\n🎯 Classification Confidence:")
        stats = results['classification_stats']
        total = results['total_problems']
        print(f"  High confidence: {stats['high_confidence']} ({stats['high_confidence']/total*100:.1f}%)")
        print(f"  Medium confidence: {stats['medium_confidence']} ({stats['medium_confidence']/total*100:.1f}%)")
        print(f"  Low confidence: {stats['low_confidence']} ({stats['low_confidence']/total*100:.1f}%)")
    
    def create_curriculum_config_from_classification(self, classification_results: Dict[str, Any], 
                                                   dataset_name: str) -> Dict[str, Any]:
        """Create curriculum configuration based on classification results
        
        Args:
            classification_results: Results from classify_dataset
            dataset_name: Name for the dataset in curriculum config
            
        Returns:
            Curriculum configuration dictionary
        """
        difficulties = list(classification_results['difficulty_distribution'].keys())
        difficulties.sort()  # Sort difficulties
        
        # Create thresholds and parameters based on number of difficulty levels
        num_levels = len(difficulties)
        
        if num_levels == 3:  # Easy, Medium, Hard
            thresholds = [0.7, 0.8, 0.9]
            sample_sizes = [15, 25, 40]
            min_generations = [2, 3, 4]
        elif num_levels == 5:  # Level 1-5
            thresholds = [0.6, 0.65, 0.7, 0.75, 0.8]
            sample_sizes = [10, 15, 20, 25, 30]
            min_generations = [2, 2, 3, 3, 4]
        else:
            # Generic configuration for any number of levels
            base_threshold = 0.6
            threshold_increment = 0.1
            thresholds = [base_threshold + i * threshold_increment for i in range(num_levels)]
            
            base_sample_size = 10
            sample_sizes = [base_sample_size + i * 5 for i in range(num_levels)]
            
            min_generations = [2 + i // 2 for i in range(num_levels)]
        
        curriculum_config = {
            dataset_name: {
                "levels": difficulties,
                "thresholds": thresholds[:num_levels],
                "sample_sizes": sample_sizes[:num_levels],
                "min_generations": min_generations[:num_levels],
                "classification_source": "llm_judge",
                "classification_confidence": classification_results['classification_stats']
            }
        }
        
        print(f"\n📚 Generated curriculum configuration for {dataset_name}:")
        print(json.dumps(curriculum_config, indent=2))
        
        return curriculum_config


class CurriculumManager:
    """Curriculum manager responsible for managing training difficulty levels and curriculum progress"""
    
    def __init__(self, curriculum_config=None):
        """Initialize curriculum manager
        
        Args:
            curriculum_config: Curriculum configuration containing difficulty levels and thresholds
        """
        if curriculum_config is None:
            self.curriculum_config = {
                "gsm8k": {
                    "levels": ["easy", "medium", "hard"],
                    "thresholds": [0.7, 0.8, 0.9],  # Accuracy thresholds to advance to next level
                    "sample_sizes": [20, 50, 100],   # Sample sizes for each level
                    "min_generations": [3, 5, 8]     # Minimum training generations per level
                },
                "math": {
                    "levels": ["Level 1", "Level 2", "Level 3", "Level 4", "Level 5"],
                    "thresholds": [0.6, 0.65, 0.7, 0.75, 0.8],
                    "sample_sizes": [10, 20, 30, 40, 50],
                    "min_generations": [2, 3, 4, 5, 6],
                    "subjects": ["prealgebra", "algebra", "geometry", "counting_and_probability"]
                }
            }
        else:
            self.curriculum_config = curriculum_config
        
        # Current curriculum state
        self.current_level = {}
        self.performance_history = {}
        self.generation_count = {}
        
        # Initialize starting levels for each dataset
        for dataset in self.curriculum_config:
            self.current_level[dataset] = 0
            self.performance_history[dataset] = []
            self.generation_count[dataset] = 0
    
    def get_current_difficulty(self, dataset: str) -> Dict[str, Any]:
        """Get current difficulty configuration
        
        Args:
            dataset: Dataset name ("gsm8k" or "math")
            
        Returns:
            Configuration information for current difficulty level
        """
        if dataset not in self.curriculum_config:
            raise ValueError(f"Unsupported dataset: {dataset}")
        
        config = self.curriculum_config[dataset]
        level_idx = self.current_level[dataset]
        
        return {
            "level": config["levels"][level_idx],
            "level_index": level_idx,
            "sample_size": config["sample_sizes"][level_idx],
            "threshold": config["thresholds"][level_idx],
            "min_generations": config["min_generations"][level_idx]
        }
    
    def update_performance(self, dataset: str, accuracy: float) -> bool:
        """Update performance and determine if difficulty should be upgraded
        
        Args:
            dataset: Dataset name
            accuracy: Current accuracy
            
        Returns:
            Whether to upgrade to next difficulty level
        """
        self.performance_history[dataset].append(accuracy)
        self.generation_count[dataset] += 1
        
        config = self.curriculum_config[dataset]
        level_idx = self.current_level[dataset]
        
        # Check if upgrade conditions are met
        min_gens = config["min_generations"][level_idx]
        threshold = config["thresholds"][level_idx]
        
        # Need to meet minimum training generations and performance threshold
        if (self.generation_count[dataset] >= min_gens and 
            accuracy >= threshold and 
            level_idx < len(config["levels"]) - 1):
            
            # Upgrade to next level
            self.current_level[dataset] += 1
            self.generation_count[dataset] = 0
            print(f"📈 {dataset} upgraded to: {config['levels'][self.current_level[dataset]]}")
            return True
        
        return False
    
    def should_downgrade(self, dataset: str, accuracy: float, window_size: int = 3) -> bool:
        """Determine if difficulty should be downgraded
        
        Args:
            dataset: Dataset name
            accuracy: Current accuracy
            window_size: Performance window size
            
        Returns:
            Whether difficulty should be downgraded
        """
        if len(self.performance_history[dataset]) < window_size:
            return False
        
        # Get average performance of recent generations
        recent_performance = self.performance_history[dataset][-window_size:]
        avg_performance = np.mean(recent_performance)
        
        config = self.curriculum_config[dataset]
        level_idx = self.current_level[dataset]
        
        # Cannot downgrade if already at lowest level
        if level_idx == 0:
            return False
        
        # Consider downgrading if recent performance significantly drops
        threshold = config["thresholds"][level_idx - 1]  # Previous level threshold
        
        if avg_performance < threshold * 0.8:  # Downgrade threshold set to 80% of previous level threshold
            self.current_level[dataset] = max(0, level_idx - 1)
            self.generation_count[dataset] = 0
            print(f"📉 {dataset} downgraded to: {config['levels'][self.current_level[dataset]]}")
            return True
        
        return False
    
    def get_adaptive_sample_size(self, dataset: str, base_performance: float) -> int:
        """Adaptively adjust sample size based on performance
        
        Args:
            dataset: Dataset name
            base_performance: Baseline performance
            
        Returns:
            Adjusted sample size
        """
        config = self.curriculum_config[dataset]
        level_idx = self.current_level[dataset]
        base_size = config["sample_sizes"][level_idx]
        
        # Adjust sample size based on performance
        if base_performance < 0.3:
            # Poor performance, reduce samples to speed up iteration
            return max(5, int(base_size * 0.5))
        elif base_performance > 0.8:
            # Good performance, increase samples to ensure stability
            return min(100, int(base_size * 1.5))
        else:
            return base_size
    
    def add_llm_classified_dataset(self, dataset_path: str, dataset_name: str, 
                                  domain: str = "general", sample_size: int = None,
                                  classifier: DifficultyClassifier = None) -> Dict[str, Any]:
        """Add a new dataset with LLM-based difficulty classification
        
        Args:
            dataset_path: Path to the dataset file
            dataset_name: Name for the dataset in curriculum config
            domain: Domain type for classification ("math", "gsm8k", "general")
            sample_size: Number of problems to classify (None for all)
            classifier: DifficultyClassifier instance (creates new one if None)
            
        Returns:
            Classification results and updated curriculum config
        """
        if classifier is None:
            classifier = DifficultyClassifier()
        
        # Classify the dataset
        classification_results = classifier.classify_dataset(
            dataset_path=dataset_path,
            domain=domain,
            sample_size=sample_size,
            output_path=f"results/difficulty_classification_{dataset_name}.json"
        )
        
        # Create curriculum configuration
        new_config = classifier.create_curriculum_config_from_classification(
            classification_results, dataset_name
        )
        
        # Add to existing curriculum config
        self.curriculum_config.update(new_config)
        
        # Initialize state for new dataset
        self.current_level[dataset_name] = 0
        self.performance_history[dataset_name] = []
        self.generation_count[dataset_name] = 0
        
        print(f"✅ Added {dataset_name} to curriculum with LLM-based difficulty classification")
        
        return {
            "classification_results": classification_results,
            "curriculum_config": new_config,
            "dataset_name": dataset_name
        }
    
    def load_classified_dataset(self, classification_file: str, dataset_name: str):
        """Load a previously classified dataset
        
        Args:
            classification_file: Path to saved classification results
            dataset_name: Name for the dataset in curriculum config
        """
        try:
            with open(classification_file, 'r', encoding='utf-8') as f:
                classification_results = json.load(f)
            
            # Create curriculum configuration
            classifier = DifficultyClassifier()
            new_config = classifier.create_curriculum_config_from_classification(
                classification_results, dataset_name
            )
            
            # Add to existing curriculum config
            self.curriculum_config.update(new_config)
            
            # Initialize state for new dataset
            self.current_level[dataset_name] = 0
            self.performance_history[dataset_name] = []
            self.generation_count[dataset_name] = 0
            
            print(f"✅ Loaded {dataset_name} from classification file: {classification_file}")
            
        except Exception as e:
            raise Exception(f"Failed to load classified dataset: {e}")


class CurriculumGuidedEvoOptimizer(EvoOptimizer):
    """Curriculum-guided evolutionary optimizer"""
    
    def __init__(self, task_info, tools=None, population_size=5, initial_population=None, 
                 llm="gpt-4o-mini", curriculum_config=None, enable_curriculum=True):
        """Set up the base optimizer plus the curriculum bookkeeping.

        Args:
            task_info: Task description information.
            tools: Tool collection.
            population_size: Population size.
            initial_population: Initial population.
            llm: Large language model to use.
            curriculum_config: Curriculum configuration handed to the
                CurriculumManager.
            enable_curriculum: Whether to enable curriculum learning; when
                False no CurriculumManager is created.
        """
        super().__init__(task_info, tools, population_size, initial_population, llm)

        self.enable_curriculum = enable_curriculum
        if enable_curriculum:
            self.curriculum_manager = CurriculumManager(curriculum_config)
        else:
            self.curriculum_manager = None

        # Per-generation records and the log of level changes
        self.curriculum_history = []
        self.difficulty_transitions = []
        
    def evaluate_curriculum(self, individual, benchmark_type="gsm-8k"):
        """Evaluate an individual at the current curriculum difficulty.

        The individual's code is written to "flow_genome_tmp.py", imported as
        a module, and its MultiAgentSystem is scored with the benchmark's
        Eval function using the current level's sample size (and level, for
        "math"). Unknown benchmark types score 0.0.

        Args:
            individual: Individual to be evaluated; its "code" entry is run.
            benchmark_type: Benchmark test type ("gsm-8k" or "math").

        Returns:
            With curriculum enabled: a tuple (accuracy, difficulty_config),
            or (0.0, None) if anything fails. With curriculum disabled: the
            result of self.evaluate(individual, benchmark_type).
        """
        if not self.enable_curriculum:
            return self.evaluate(individual, benchmark_type)

        try:
            # Curriculum keys carry no hyphen: "gsm-8k" -> "gsm8k"
            curriculum_key = benchmark_type.replace("-", "")
            difficulty_config = self.curriculum_manager.get_current_difficulty(curriculum_key)

            # Materialise the candidate program so it can be imported.
            # NOTE(review): this executes the code held in individual["code"];
            # confirm it only ever comes from a trusted generator.
            module_name = "flow_genome_tmp"
            module_file = "flow_genome_tmp.py"
            with open(module_file, "w", encoding="utf-8") as handle:
                handle.write(individual["code"])

            module_spec = importlib.util.spec_from_file_location(module_name, module_file)
            flow_module = importlib.util.module_from_spec(module_spec)
            module_spec.loader.exec_module(flow_module)

            agent_system = flow_module.MultiAgentSystem("EvoTest", self.tools)

            # Score with the evaluator that matches the benchmark
            if benchmark_type == "gsm-8k":
                from Evaluation.Eval_GSM8k import Eval as run_benchmark
                accuracy = run_benchmark(
                    agentflow=agent_system,
                    mode="train",
                    sample_size=difficulty_config["sample_size"],
                )
            elif benchmark_type == "math":
                from Evaluation.Eval_MATH import Eval as run_benchmark
                accuracy = run_benchmark(
                    agentflow=agent_system,
                    level=difficulty_config["level"],
                    sample_size=difficulty_config["sample_size"],
                    mode="train",
                )
            else:
                accuracy = 0.0

            return accuracy, difficulty_config

        except Exception as e:
            print(f"Curriculum evaluation failed: {e}")
            print(traceback.format_exc())
            return 0.0, None
    
    def evolve_with_curriculum(self, generations=10, benchmark_type="gsm-8k"):
        """Execute the curriculum-guided evolution process.

        Each generation evaluates unevaluated individuals at the current
        difficulty, feeds the best accuracy to the curriculum manager, and
        re-evaluates the whole population when the level changes. A record of
        every generation is written to
        "results/curriculum_history/gen_<n>_curriculum.json" and the complete
        history to "results/curriculum_complete_history.json".

        Args:
            generations: Number of generations.
            benchmark_type: Benchmark test type.

        Returns:
            Tuple of (best individual, complete history dict) when curriculum
            learning is enabled; otherwise the result of
            self.evolve(generations, benchmark_type).
        """
        if not self.enable_curriculum:
            return self.evolve(generations, benchmark_type)

        best_individual = None
        best_fitness = -1
        dataset_name = benchmark_type.replace("-", "")

        # Create results directory
        os.makedirs("results/curriculum_history", exist_ok=True)

        for generation in range(generations):
            print(f"\n===== Generation {generation+1}/{generations} =====")

            # Get current curriculum difficulty
            difficulty_config = self.curriculum_manager.get_current_difficulty(dataset_name)
            print(f"🎯 Current difficulty: {difficulty_config['level']} (samples: {difficulty_config['sample_size']})")

            # Evaluate current population (fitness -1 marks "not yet evaluated")
            fitnesses = []
            for i, individual in enumerate(self.population):
                if individual["fitness"] == -1:
                    # Use curriculum evaluation
                    accuracy, _ = self.evaluate_curriculum(individual, benchmark_type)
                    self.population[i]["fitness"] = accuracy
                    fitness = accuracy
                else:
                    fitness = individual["fitness"]

                fitnesses.append(fitness)

                # Update best individual
                if fitness > best_fitness:
                    best_fitness = fitness
                    best_individual = individual.copy()

            # Plain floats keep the history JSON-serialisable whatever numeric
            # type the evaluators return.
            avg_fitness = float(np.mean(fitnesses))
            max_fitness = float(np.max(fitnesses))

            print(f"📊 Average accuracy: {avg_fitness:.4f}, Best accuracy: {max_fitness:.4f}")

            # Update curriculum progress. The downgrade check is skipped in a
            # generation that just upgraded: it averages older history entries
            # and could otherwise undo the upgrade immediately while the
            # transition is still logged as an "upgrade".
            upgraded = self.curriculum_manager.update_performance(dataset_name, max_fitness)
            downgraded = False
            if not upgraded:
                downgraded = self.curriculum_manager.should_downgrade(dataset_name, avg_fitness)

            if upgraded or downgraded:
                # Record difficulty transition
                self.difficulty_transitions.append({
                    "generation": generation + 1,
                    "action": "upgrade" if upgraded else "downgrade",
                    "new_level": self.curriculum_manager.get_current_difficulty(dataset_name)["level"],
                    "performance": max_fitness
                })

                # Re-evaluate population (because difficulty changed)
                for i, individual in enumerate(self.population):
                    accuracy, _ = self.evaluate_curriculum(individual, benchmark_type)
                    self.population[i]["fitness"] = accuracy
                    fitnesses[i] = accuracy

            # Record curriculum history
            curriculum_record = {
                "generation": generation + 1,
                "difficulty_config": difficulty_config,
                "avg_performance": avg_fitness,
                "max_performance": max_fitness,
                "population_fitness": fitnesses.copy()
            }
            self.curriculum_history.append(curriculum_record)

            # Persist the record before the last-generation exit below so the
            # final generation also gets its per-generation file.
            with open(f"results/curriculum_history/gen_{generation+1}_curriculum.json", "w", encoding="utf-8") as f:
                json.dump(curriculum_record, f, ensure_ascii=False, indent=2)

            # If this is the last generation, no need to create new population
            if generation == generations - 1:
                break

            # Curriculum-adaptive evolution strategy selection
            self.population = self._curriculum_guided_evolution(fitnesses, difficulty_config, generation)

        # Save complete curriculum history
        final_history = {
            "curriculum_history": self.curriculum_history,
            "difficulty_transitions": self.difficulty_transitions,
            "final_best": best_individual,
            "final_performance": best_fitness
        }

        with open("results/curriculum_complete_history.json", "w", encoding="utf-8") as f:
            json.dump(final_history, f, ensure_ascii=False, indent=2)

        return best_individual, final_history
    
    def _curriculum_guided_evolution(self, fitnesses, difficulty_config, generation,
                                     benchmark_type="gsm-8k"):
        """Build the next population using performance-adaptive strategy sampling.

        The fittest individual is carried over unchanged (elitism). Each entry of
        ``self.strategies`` is then applied with a probability chosen from the
        population's average fitness: low averages favour the first strategies
        in the list, high averages favour the later ones. Every offspring is
        evaluated immediately and a JSON record of the step is written to
        ``results/curriculum_evolution``. Finally the population is padded with
        copies of selected parents, or trimmed by dropping the lowest-fitness
        members, until it holds exactly ``self.population_size`` individuals.

        Args:
            fitnesses: List of fitness values, aligned with ``self.population``.
            difficulty_config: Current difficulty configuration; only recorded
                in the per-strategy history files.
            generation: Zero-based generation index (stored as ``generation + 1``).
            benchmark_type: Benchmark name forwarded to
                ``self.evaluate_curriculum`` when scoring offspring. Defaults to
                ``"gsm-8k"``, the value that used to be hard-coded, so existing
                callers behave exactly as before.

        Returns:
            New population list of length ``self.population_size``.
        """
        # Weight used for strategies beyond the five tuned weights below, so a
        # longer ``self.strategies`` list no longer raises IndexError.
        fallback_weight = 0.5
        multi_parent_strategies = ("1", "3")
        history_dir = "results/curriculum_evolution"

        avg_fitness = np.mean(fitnesses)

        # Elite preservation
        best_idx = fitnesses.index(max(fitnesses))
        new_population = [self.population[best_idx].copy()]

        # Adjust evolution strategy based on current performance
        if avg_fitness < 0.3:
            # Poor performance, use more exploration strategies
            strategy_weights = [0.8, 0.7, 0.4, 0.3, 0.2]
        elif avg_fitness > 0.7:
            # Good performance, use more exploitation strategies
            strategy_weights = [0.3, 0.4, 0.8, 0.9, 0.8]
        else:
            # Medium performance, balanced strategy
            strategy_weights = [0.5, 0.5, 0.6, 0.6, 0.5]

        # Apply evolution for each strategy
        for i, strategy in enumerate(self.strategies):
            if i < len(strategy_weights):
                strategy_weight = strategy_weights[i]
            else:
                strategy_weight = fallback_weight

            if random.random() >= strategy_weight:
                continue

            try:
                if strategy in multi_parent_strategies:
                    # These strategies combine two selected parents.
                    parents = self.select_parents(fitnesses, 2)
                    result = self.flow_evo.evolve(
                        strategy, [self.population[j] for j in parents]
                    )
                else:
                    # All other strategies start from the current best individual.
                    parents = [best_idx]
                    result = self.flow_evo.evolve(strategy, self.population[best_idx])

                if "code" not in result or "plan" not in result:
                    continue

                # Immediately evaluate the new individual on the requested benchmark
                accuracy, _ = self.evaluate_curriculum(result, benchmark_type)

                new_population.append({
                    "code": result["code"],
                    "plan": result["plan"],
                    "num_agent": result["num_agent"],
                    "fitness": accuracy,
                })

                # Save evolution history
                history_data = {
                    "generation": generation + 1,
                    "strategy": strategy,
                    "difficulty": difficulty_config,
                    "offspring_fitness": accuracy,
                    # Cast to plain ints: indices coming back from select_parents
                    # may be NumPy integers, which json cannot serialise.
                    "parents": [int(j) for j in parents],
                }

                os.makedirs(history_dir, exist_ok=True)
                history_path = f"{history_dir}/gen_{generation+1}_strategy_{strategy}.json"
                with open(history_path, "w", encoding="utf-8") as f:
                    json.dump(history_data, f, ensure_ascii=False, indent=2)

            except Exception as e:
                # Best effort: a failing strategy must not abort the generation.
                print(f"Strategy {strategy} execution failed: {e}")
                continue

        # Ensure population size: pad with copies of selected parents ...
        while len(new_population) < self.population_size:
            idx = self.select_parents(fitnesses)[0]
            new_population.append(self.population[idx].copy())

        # ... or trim by repeatedly removing the individual with lowest fitness.
        while len(new_population) > self.population_size:
            min_fitness_idx = min(
                range(len(new_population)),
                key=lambda k: new_population[k].get("fitness", 0),
            )
            new_population.pop(min_fitness_idx)

        return new_population
    
    def analyze_curriculum_progress(self):
        """Print a per-dataset summary of curriculum progress.

        For every dataset tracked by ``self.curriculum_manager`` this reports
        the current level, how far through the level list it is, and the mean
        of the last three recorded performances (when any exist). It then
        lists every recorded difficulty transition. If curriculum learning is
        disabled or no history has been recorded, a notice is printed and the
        method returns without further output.
        """
        if not (self.enable_curriculum and self.curriculum_history):
            print("No curriculum history data available for analysis")
            return

        manager = self.curriculum_manager

        print("\n📈 Curriculum learning progress analysis:")
        print("=" * 50)

        for dataset in manager.current_level:
            level_idx = manager.current_level[dataset]
            level_count = len(manager.curriculum_config[dataset]["levels"])
            # Levels are zero-indexed, so the one-based position is idx + 1.
            progress = (level_idx + 1) / level_count * 100

            print(f"{dataset.upper()}:")
            level_name = manager.curriculum_config[dataset]["levels"][level_idx]
            print(f"  Current level: {level_name}")
            print(f"  Progress: {progress:.1f}% ({level_idx + 1}/{level_count})")

            if manager.performance_history[dataset]:
                # Average over (at most) the three latest measurements.
                latest = manager.performance_history[dataset][-3:]
                print(f"  Recent performance: {np.mean(latest):.3f}")

        transitions = self.difficulty_transitions
        print(f"\nDifficulty transition count: {len(transitions)}")
        for entry in transitions:
            print(f"  Generation {entry['generation']}: {entry['action']} -> {entry['new_level']}")