#!/usr/bin/env python3
"""
Single Dataset Algorithm Selector Trainer

This script trains algorithm selectors on individual dataset directories for
constraint optimization problems. It serves as the core training engine for
research comparing LLM-generated features with mzn2feat features, and it
evaluates each selector with both accuracy and ranking-based metrics.

KEY FEATURES:
- Supports 3 selector types: Random Forest, AutoSklearn, AutoSklearn Conservative
- Handles different problem types: VRP (objective minimization), car_sequencing/FLECC (time minimization)
- Provides comprehensive metrics: accuracy, cross-validation, ranking analysis
- Includes single best solver baseline comparison for research evaluation
- Optimized hyperparameters based on constraint programming domain knowledge

ALGORITHM SELECTION EVALUATION METRICS:
- Training/Test Accuracy: Classification accuracy for predicting best solver
- Cross-Validation: 5-fold CV for robust performance estimation
- Average Ranking: Mean rank of predicted solvers (1=best, lower is better)
- Top-K Performance: Percentage of instances where prediction is in top-1 or top-3
- Single Best Solver Baseline: Performance of always choosing most frequent winner

EXPECTED DATASET STRUCTURE:
dataset_dir/
├── features_train.csv      # Training features: [filename, feature1, feature2, ...]
├── features_test.csv       # Test features: [filename, feature1, feature2, ...]  
├── performance_train.csv   # Training performance: [instance, solver1_perf, solver2_perf, ...]
└── performance_test.csv    # Test performance: [instance, solver1_perf, solver2_perf, ...]

USAGE:
    # Train Random Forest on mzn2feat features for VRP
    python single_dataset_trainer.py --dataset-dir ./src/datasets/vrp/mzn2feat --problem vrp
    
    # Train LLM-based selector for car_sequencing with AutoSklearn
    python single_dataset_trainer.py --dataset-dir ./src/datasets/car_sequencing/lmtuner20250908123608 --problem car_sequencing --selector-type autosklearn
    
    # Train conservative AutoSklearn for large FLECC dataset
    python single_dataset_trainer.py --dataset-dir ./src/datasets/FLECC/mzn2feat --problem FLECC --selector-type autosklearn_conservative

RESEARCH APPLICATIONS:
This script enables systematic comparison of feature extraction approaches:
- mzn2feat: Traditional constraint programming features (95 features)
- LLM features: OpenAI O4-mini generated features (50 features)
- Performance comparison across multiple algorithm selector types
- Ranking-based evaluation to assess solution quality beyond binary accuracy

OUTPUT:
Results are saved to single_results/ directory with comprehensive metrics:
- JSON results file with all performance metrics
- Pickle model file for future predictions
- Detailed logging with ranking metrics and baseline comparisons
"""

import argparse
import json
import pickle
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
import logging
import pandas as pd
import numpy as np
from datetime import datetime

# ML imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, make_scorer
from sklearn.preprocessing import StandardScaler

# Try to import AutoSklearn (may not be available)
try:
    import autosklearn.classification
    AUTOSKLEARN_AVAILABLE = True
except ImportError:
    AUTOSKLEARN_AVAILABLE = False


# Per-problem configuration, keyed by the problem name handed to the trainer.
# "performance_type" records what the columns of the performance CSVs measure
# ("objective" values or solving "time"). Lookups in this file go through
# PROBLEM_CONFIGS.get(problem, {}).get("performance_type", "time"), so a problem
# name that is missing from this table is handled as a "time" problem.
PROBLEM_CONFIGS = {
    "vrp": {"performance_type": "objective"},        # VRP uses objective values (minimization)
    "car_sequencing": {"performance_type": "time"},  # Car sequencing uses solving time
    "FLECC": {"performance_type": "time"}           # FLECC uses solving time
}


def create_ranking_scorer(performance_data: pd.DataFrame, problem_type: str):
    """
    Create a custom scorer for ranking-based evaluation during grid search.

    The per-instance solver ranks are computed once, when the scorer is
    created, instead of on every scoring call. Tied solvers share the best
    rank of their group (``method='min'``) and solvers without a usable
    performance value (missing or non-numeric) are ranked last.

    Row alignment:
        * If ``y_true`` is a pandas Series whose index labels all occur in
          ``performance_data.index`` (which must be unique), every sample is
          matched to its performance row by label. This keeps the score
          correct when the scorer only receives a subset of the rows, e.g.
          one cross-validation fold.
        * Otherwise (plain arrays/lists) no row labels are available and
          sample ``i`` is matched to row ``i`` of ``performance_data``. That
          is only meaningful when ``y_true`` covers the rows of
          ``performance_data`` in order, starting at the first row.

    Args:
        performance_data: Performance matrix with solvers as columns. The
            identifier column is the one named 'instance'; if no such column
            exists the first column is treated as the identifier.
        problem_type: "time" or "objective" (lower value is better). Any
            other value ranks higher values as better.

    Returns:
        Sklearn scorer function that computes negative average ranking

    Raises (from the returned scorer):
        KeyError: a predicted label is not a solver column.
        IndexError: positional matching runs past the last performance row.
        ZeroDivisionError: ``y_true`` is empty.
    """
    # Separate the solver columns from the instance identifier column.
    if 'instance' in performance_data.columns:
        solver_perf = performance_data.drop(columns='instance')
    else:
        solver_perf = performance_data.iloc[:, 1:]

    # Non-numeric cells become NaN so that they can be ranked last below
    # instead of breaking the row-wise comparison.
    solver_perf = solver_perf.apply(pd.to_numeric, errors='coerce')

    lower_is_better = problem_type in ("time", "objective")
    rank_table = solver_perf.rank(
        axis=1,
        method='min',
        ascending=lower_is_better,
        na_option='bottom',  # a solver without a result must never look good
    )

    # Plain containers keep the per-sample lookup inside the scorer cheap.
    rank_values = rank_table.to_numpy()
    row_index = rank_table.index
    column_position = {name: pos for pos, name in enumerate(rank_table.columns)}

    def ranking_score(y_true, y_pred):
        """
        Calculate negative average ranking (higher is better for sklearn).

        Returns the negative of average rank so that sklearn's maximization
        behavior works correctly (lower rank = better = higher score).
        """
        if (
            isinstance(y_true, pd.Series)
            and row_index.is_unique
            and y_true.index.isin(row_index).all()
        ):
            # Label-based alignment: robust to subsets and reordering.
            row_positions = row_index.get_indexer(y_true.index)
        else:
            # No usable labels: fall back to positional matching.
            row_positions = range(len(y_true))

        total_rank = 0.0
        for row_pos, pred_solver in zip(row_positions, y_pred):
            total_rank += rank_values[row_pos, column_position[pred_solver]]

        # Return negative average rank (so higher score = better ranking)
        return -float(total_rank / len(y_true))

    return make_scorer(ranking_score, greater_is_better=True)


class SingleDatasetTrainer:
    """Algorithm selector trainer for a single dataset."""
    
    def __init__(self, dataset_dir: str, problem: str, results_dir: str = "single_results"):
        """
        Initialize the trainer.
        
        Args:
            dataset_dir: Directory containing the 4-file dataset structure
            problem: Problem name (vrp, car_sequencing, FLECC)
            results_dir: Directory to save results
        """
        self.dataset_dir = Path(dataset_dir)
        self.problem = problem
        # Auto-extract extractor name from directory path
        self.extractor = self.dataset_dir.name
        self.results_dir = Path(results_dir)
        self.results_dir.mkdir(parents=True, exist_ok=True)
        
        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)
        
        # Validate dataset directory
        self._validate_dataset()
    
    def _validate_dataset(self):
        """Validate that all required files exist."""
        required_files = [
            'features_train.csv',
            'features_test.csv',
            'performance_train.csv', 
            'performance_test.csv'
        ]
        
        for file in required_files:
            file_path = self.dataset_dir / file
            if not file_path.exists():
                raise FileNotFoundError(f"Required file not found: {file_path}")
        
        self.logger.info(f"✅ Dataset validation passed: {self.dataset_dir}")
    
    def load_dataset(self) -> Tuple[Optional[Tuple], str]:
        """
        Load the four dataset CSVs and build feature matrices and labels.

        The numeric columns of ``features_train.csv`` (everything after its
        first, identifier column) define the feature set. The very same
        columns are then selected from ``features_test.csv`` by name, so the
        training and test matrices always have identical columns in identical
        order, even when pandas infers a different dtype for a column in one
        of the two files. Test values that cannot be parsed as numbers are
        treated like missing values.

        Cleaning applied to both matrices: +/-inf and NaN become 0, values are
        clipped to [-1e6, 1e6], and the result is stored as float32.

        Labels are the best solver per instance, taken from the performance
        files via ``_extract_best_algorithms``.

        NOTE(review): feature rows and performance rows are paired purely by
        position; only the row counts are compared. Confirm that the CSV files
        list the instances in the same order.

        Returns:
            Tuple of ((X_train, y_train, X_test, y_test), error_msg). On
            success error_msg is "". On failure the first element is None and
            error_msg describes the problem; exceptions raised while loading
            are caught and reported this way instead of being propagated.
        """
        def clean_data(df):
            # Work in float64 so that very large inputs survive until clipping
            df = df.astype(np.float64)
            # Infinite values are treated like missing values
            df = df.replace([np.inf, -np.inf], np.nan)
            df = df.fillna(0)
            # Clip extreme values to prevent overflow (conservative limits)
            df = df.clip(lower=-1e6, upper=1e6)
            # Convert back to float32 for sklearn compatibility
            return df.astype(np.float32)

        try:
            self.logger.info(f"📖 Loading dataset from {self.dataset_dir}")

            # Load all 4 files
            features_train = pd.read_csv(self.dataset_dir / 'features_train.csv')
            features_test = pd.read_csv(self.dataset_dir / 'features_test.csv')
            performance_train = pd.read_csv(self.dataset_dir / 'performance_train.csv')
            performance_test = pd.read_csv(self.dataset_dir / 'performance_test.csv')

            self.logger.info(f"   Features train: {features_train.shape}")
            self.logger.info(f"   Features test: {features_test.shape}")
            self.logger.info(f"   Performance train: {performance_train.shape}")
            self.logger.info(f"   Performance test: {performance_test.shape}")

            # The training file decides which columns are features
            # (skip the filename column, keep numeric columns only).
            X_train = features_train.iloc[:, 1:].select_dtypes(include=[np.number])
            feature_columns = X_train.columns.tolist()

            # Selecting numeric columns independently in the test file could
            # yield a different column set; reuse the training columns instead.
            missing_in_test = [col for col in feature_columns if col not in features_test.columns]
            if missing_in_test:
                return None, f"Test features are missing training columns: {missing_in_test}"
            X_test = features_test[feature_columns].apply(pd.to_numeric, errors='coerce')

            X_train = clean_data(X_train)
            X_test = clean_data(X_test)

            # Get performance type for this problem ("time" when unknown)
            performance_type = PROBLEM_CONFIGS.get(self.problem, {}).get("performance_type", "time")

            # Extract labels (best algorithm for each instance)
            y_train = self._extract_best_algorithms(performance_train, performance_type)
            y_test = self._extract_best_algorithms(performance_test, performance_type)

            # Verify alignment (row counts only)
            if len(X_train) != len(y_train) or len(X_test) != len(y_test):
                return None, "Feature-performance alignment mismatch"

            self.logger.info(f"✅ Dataset loaded successfully:")
            self.logger.info(f"   Training: {len(X_train)} instances, {X_train.shape[1]} features")
            self.logger.info(f"   Testing: {len(X_test)} instances, {X_test.shape[1]} features")
            self.logger.info(f"   Performance type: {performance_type}")
            # Sorted so that the log line is reproducible between runs
            self.logger.info(f"   Available algorithms: {sorted(set(y_train))}")

            return (X_train, y_train, X_test, y_test), ""

        except Exception as e:
            # Deliberate catch-all: callers receive the failure as a message.
            return None, f"Error loading dataset: {e}"
    
    def _extract_best_algorithms(self, performance_df: pd.DataFrame, performance_type: str = "time") -> np.ndarray:
        """
        Extract best algorithm labels from performance data.
        
        Args:
            performance_df: DataFrame with columns [instance, alg1_perf, alg2_perf, ...]
            performance_type: Either "time" (minimize) or "objective" (minimize)
            
        Returns:
            Array of best algorithm names for each instance
        """
        # Get algorithm performance columns (skip instance name column)
        algo_columns = performance_df.columns[1:].tolist()
        
        self.logger.info(f"🎯 Extracting best algorithms ({performance_type}, minimize=better)")
        self.logger.info(f"   Available algorithms: {algo_columns}")
        
        # Find best algorithm (minimum value for both time and objective)
        best_algorithms = []
        for _, row in performance_df.iterrows():
            algo_values = [row[col] for col in algo_columns]
            
            # Handle missing/invalid values
            valid_values = [(i, val) for i, val in enumerate(algo_values) 
                           if pd.notna(val) and val != float('inf')]
            
            if not valid_values:
                # If no valid values, choose first algorithm as fallback
                best_algorithms.append(algo_columns[0])
            else:
                # Find minimum value (best for both time and objective)
                best_idx = min(valid_values, key=lambda x: x[1])[0]
                best_algorithms.append(algo_columns[best_idx])
        
        # Show algorithm selection distribution
        unique, counts = np.unique(best_algorithms, return_counts=True)
        for alg, count in zip(unique, counts):
            percentage = count / len(best_algorithms) * 100
            self.logger.info(f"   {alg}: {count} instances ({percentage:.1f}%)")
        
        return np.array(best_algorithms)
    
    def _calculate_single_best_solver_accuracy(self, performance_df: pd.DataFrame, 
                                             performance_type: str = "time") -> Tuple[float, str]:
        """
        Calculate accuracy of single best solver (oracle baseline).
        
        This computes what accuracy you'd get by always choosing the algorithm
        that wins the most instances (i.e., the most frequent winner).
        
        Args:
            performance_df: DataFrame with columns [instance, alg1_perf, alg2_perf, ...]
            performance_type: Either "time" (minimize) or "objective" (minimize)
            
        Returns:
            Tuple of (accuracy, best_solver_name)
        """
        # Get the actual best algorithm for each instance
        true_best = self._extract_best_algorithms(performance_df, performance_type)
        
        # Find the algorithm that wins the most instances (most frequent winner)
        winner_counts = pd.Series(true_best).value_counts()
        best_single_solver = winner_counts.index[0]  # Most frequent winner
        single_solver_accuracy = winner_counts.iloc[0] / len(true_best)  # Its win rate
        
        return float(single_solver_accuracy), best_single_solver
    
    def _calculate_ranking_metrics(self, performance_df: pd.DataFrame, predictions: np.ndarray, 
                                 performance_type: str = "time") -> Dict[str, float]:
        """
        Calculate ranking-based metrics for algorithm selector predictions.
        
        Args:
            performance_df: DataFrame with columns [instance, alg1_perf, alg2_perf, ...]
            predictions: Array of predicted best algorithms for each instance
            performance_type: Either "time" (minimize) or "objective" (minimize)
            
        Returns:
            Dictionary with ranking metrics
        """
        algo_columns = performance_df.columns[1:].tolist()
        rankings = []
        
        for i, (_, row) in enumerate(performance_df.iterrows()):
            # Get performance values for this instance
            perf_values = [(j, row[alg]) for j, alg in enumerate(algo_columns) 
                          if pd.notna(row[alg]) and row[alg] != float('inf')]
            
            if not perf_values:
                rankings.append(len(algo_columns))  # Worst possible rank
                continue
                
            # Sort by performance (ascending for time/objective - lower is better)
            sorted_perf = sorted(perf_values, key=lambda x: x[1])
            
            # Create rank mapping (1 = best, 2 = second best, etc.)
            rank_map = {algo_idx: rank + 1 for rank, (algo_idx, _) in enumerate(sorted_perf)}
            
            # Find rank of predicted algorithm
            predicted_algo = predictions[i]
            if predicted_algo in algo_columns:
                predicted_idx = algo_columns.index(predicted_algo)
                rank = rank_map.get(predicted_idx, len(algo_columns))
            else:
                rank = len(algo_columns)  # Worst possible rank for unknown algorithm
                
            rankings.append(rank)
        
        rankings = np.array(rankings)
        
        return {
            'average_rank': float(rankings.mean()),
            'median_rank': float(np.median(rankings)),
            'rank_std': float(rankings.std()),
            'rank1_percentage': float((rankings == 1).mean() * 100),  # % times predicted best
            'top3_percentage': float((rankings <= 3).mean() * 100)   # % times in top 3
        }
    
    def _calculate_single_best_ranking_metrics(self, performance_df: pd.DataFrame, 
                                             single_best_solver: str,
                                             performance_type: str = "time") -> Dict[str, float]:
        """Calculate ranking metrics for always choosing the single best solver."""
        algo_columns = performance_df.columns[1:].tolist()
        if single_best_solver not in algo_columns:
            return {'average_rank': len(algo_columns), 'median_rank': len(algo_columns), 
                   'rank_std': 0.0, 'rank1_percentage': 0.0, 'top3_percentage': 0.0}
        
        # Create predictions array (all instances get single best solver)
        predictions = np.full(len(performance_df), single_best_solver)
        
        return self._calculate_ranking_metrics(performance_df, predictions, performance_type)
    
    def _create_ranking_based_sample_weights(self, performance_df: pd.DataFrame, y_true: np.ndarray,
                                           performance_type: str = "time") -> np.ndarray:
        """
        Create sample weights based on ranking penalties.
        Instances where choosing wrong algorithm has higher ranking penalty get higher weight.
        
        Args:
            performance_df: DataFrame with algorithm performance data
            y_true: True best algorithms for each instance
            performance_type: Either "time" (minimize) or "objective" (minimize)
            
        Returns:
            Array of sample weights (higher weight = more important instance)
        """
        algo_columns = performance_df.columns[1:].tolist()
        sample_weights = []
        
        for i, (_, row) in enumerate(performance_df.iterrows()):
            # Get performance values for this instance
            perf_values = [(j, row[alg]) for j, alg in enumerate(algo_columns) 
                          if pd.notna(row[alg]) and row[alg] != float('inf')]
            
            if not perf_values:
                sample_weights.append(1.0)  # Default weight
                continue
                
            # Sort by performance (ascending - lower is better)
            sorted_perf = sorted(perf_values, key=lambda x: x[1])
            
            # Calculate ranking spread (how much rankings matter for this instance)
            # If performance differences are large, ranking matters more
            perf_values_only = [p[1] for p in sorted_perf]
            perf_range = max(perf_values_only) - min(perf_values_only)
            
            # Weight based on performance spread
            # Higher spread = higher weight (ranking mistakes are more costly)
            if perf_range > 0:
                # Normalize by median performance to get relative importance
                median_perf = np.median(perf_values_only)
                relative_spread = perf_range / median_perf if median_perf > 0 else 1.0
                weight = 1.0 + relative_spread  # Base weight + spread bonus
            else:
                weight = 1.0  # All algorithms perform similarly
                
            sample_weights.append(max(0.1, min(10.0, weight)))  # Clip to reasonable range
        
        return np.array(sample_weights)
    
    def _create_ranking_based_class_weights(self, performance_df: pd.DataFrame, 
                                          performance_type: str = "time", 
                                          present_labels: Optional[set] = None) -> Dict[str, float]:
        """
        Create class weights that favor algorithms with better average rankings.
        Algorithms that typically rank higher get higher weights when training.
        
        Args:
            performance_df: DataFrame with algorithm performance data
            performance_type: Either "time" (minimize) or "objective" (minimize)
            present_labels: Set of algorithm labels actually present in training data
            
        Returns:
            Dictionary mapping algorithm names to weights (only for present algorithms)
        """
        algo_columns = performance_df.columns[1:].tolist()
        
        # Calculate average rank for each algorithm across all instances
        algo_ranks = {alg: [] for alg in algo_columns}
        
        for _, row in performance_df.iterrows():
            # Get performance values for this instance
            perf_values = [(j, row[alg]) for j, alg in enumerate(algo_columns) 
                          if pd.notna(row[alg]) and row[alg] != float('inf')]
            
            if not perf_values:
                continue
                
            # Sort by performance (ascending - lower is better)
            sorted_perf = sorted(perf_values, key=lambda x: x[1])
            
            # Create rank mapping (1 = best, 2 = second best, etc.)
            for rank, (algo_idx, _) in enumerate(sorted_perf):
                alg = algo_columns[algo_idx]
                algo_ranks[alg].append(rank + 1)
        
        # Calculate average rank for each algorithm
        avg_ranks = {}
        for alg in algo_columns:
            if algo_ranks[alg]:
                avg_ranks[alg] = np.mean(algo_ranks[alg])
            else:
                avg_ranks[alg] = len(algo_columns)  # Worst possible rank
        
        # Convert to weights: lower average rank = higher weight
        # Use inverse ranking with smoothing
        max_rank = len(algo_columns)
        class_weights = {}
        
        # Only create weights for algorithms that are actually present in the dataset
        algorithms_to_weight = present_labels if present_labels else algo_columns
        
        for alg in algorithms_to_weight:
            if alg in avg_ranks:  # Make sure algorithm exists in performance data
                # Weight inversely proportional to average rank with smoothing
                # Best algorithm (avg_rank=1) gets highest weight
                # Worst algorithm gets lowest but non-zero weight
                weight = (max_rank + 1 - avg_ranks[alg]) / max_rank
                class_weights[alg] = max(0.1, weight)  # Ensure minimum weight
            
        return class_weights
    
    def _create_ranking_scorer(self, performance_df: pd.DataFrame, 
                             performance_type: str = "time"):
        """
        Build a scorer that rewards a low average rank of the predicted solvers
        rather than classification accuracy.

        The score is the negated 'average_rank' reported by
        _calculate_ranking_metrics, so a larger score means a better ranking.

        NOTE(review): _calculate_ranking_metrics pairs predictions with the
        rows of performance_df by position, so this presumably only scores
        correctly when y_pred covers every row of performance_df in order.
        Confirm against how the scorer is invoked.

        Args:
            performance_df: DataFrame with columns [instance, alg1_perf, alg2_perf, ...]
            performance_type: Passed through to _calculate_ranking_metrics

        Returns:
            Scorer object created with sklearn.metrics.make_scorer
        """
        from sklearn.metrics import make_scorer

        def ranking_score(y_true, y_pred):
            """Return the negated average rank of y_pred (y_true is ignored)."""
            metrics = self._calculate_ranking_metrics(performance_df, y_pred, performance_type)
            mean_rank = metrics['average_rank']
            # Negate: lower rank = better = higher score
            return -mean_rank

        scorer = make_scorer(ranking_score, greater_is_better=True)
        return scorer
    
    def train_random_forest(self, X_train: pd.DataFrame, y_train: np.ndarray,
                          X_test: pd.DataFrame, y_test: np.ndarray,
                          performance_train: pd.DataFrame, performance_test: pd.DataFrame,
                          loss_function: str = "accuracy", 
                          use_grid_search: bool = False,
                          grid_search_cv: int = 3,
                          n_jobs_grid: int = -1) -> Dict[str, Any]:
        """
        Train a Random Forest algorithm selector with optional hyperparameter tuning.
        
        Args:
            loss_function: "accuracy" (standard classification) or "ranking" (optimize average rank)
            use_grid_search: Whether to use grid search for hyperparameter optimization
            grid_search_cv: Number of CV folds for grid search (default: 3)
            n_jobs_grid: Number of parallel jobs for grid search (-1 uses all cores)
        """
        self.logger.info(f"🌲 Training Random Forest selector (loss: {loss_function}, grid_search: {use_grid_search})...")
        
        # Configure class weights based on loss function
        if loss_function == "ranking":
            # Use ranking-aware class weights, only for algorithms present in training data
            present_labels = set(y_train)
            class_weights = self._create_ranking_based_class_weights(
                performance_train, 
                PROBLEM_CONFIGS.get(self.problem, {}).get("performance_type", "time"),
                present_labels
            )
            self.logger.info(f"   Using ranking-based class weights: {class_weights}")
        else:
            # Use balanced class weights for accuracy
            class_weights = 'balanced'
        
        # Apply scaling
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Configure sample weights based on loss function
        sample_weights = None
        if loss_function == "ranking":
            # Use ranking-aware sample weights
            performance_type = PROBLEM_CONFIGS.get(self.problem, {}).get("performance_type", "time")
            sample_weights = self._create_ranking_based_sample_weights(
                performance_train, y_train, performance_type
            )
            self.logger.info(f"   Using ranking-based sample weights (mean: {sample_weights.mean():.2f})")
        
        if use_grid_search:
            self.logger.info(f"🔍 Performing grid search with {grid_search_cv}-fold CV...")
            
            # Define hyperparameter grid
            param_grid = {
                'n_estimators': [100, 200, 300, 500],
                'max_depth': [10, 15, 20, 25, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['sqrt', 'log2', 0.8]
            }
            
            # Calculate and log total combinations
            total_combinations = 1
            for key, values in param_grid.items():
                total_combinations *= len(values)
            total_fits = total_combinations * grid_search_cv
            self.logger.info(f"   Parameter grid: {total_combinations} combinations × {grid_search_cv} CV folds = {total_fits} total fits")
            self.logger.info(f"   Estimated time: ~{total_fits * 2 // 60} minutes (assuming 2s per fit)")
            
            # Create base model
            base_model = RandomForestClassifier(
                random_state=42,
                n_jobs=1,  # Use 1 job per model to allow grid search parallelization
                class_weight=class_weights
            )
            
            # Choose scorer based on loss function
            if loss_function == "ranking":
                # Create custom ranking scorer
                performance_type = PROBLEM_CONFIGS.get(self.problem, {}).get("performance_type", "time")
                scorer = create_ranking_scorer(performance_train, performance_type)
                self.logger.info(f"   Using ranking-based scoring for grid search")
            else:
                scorer = 'accuracy'
                self.logger.info(f"   Using accuracy scoring for grid search")
            
            # Perform grid search with parallel computation
            grid_search = GridSearchCV(
                estimator=base_model,
                param_grid=param_grid,
                cv=grid_search_cv,
                scoring=scorer,
                n_jobs=n_jobs_grid,  # Parallel computation across parameter combinations
                verbose=2,  # More verbose output to show progress
                refit=True
            )
            
            # Fit with sample weights if using ranking loss
            if sample_weights is not None:
                # GridSearchCV doesn't directly support fit_params in recent versions
                # So we'll create a custom approach for sample weights
                self.logger.info("   Note: Sample weights used during final model training")
                self.logger.info("   🚀 Starting grid search...")
                import time
                start_time = time.time()
                grid_search.fit(X_train_scaled, y_train)
                elapsed_time = time.time() - start_time
                self.logger.info(f"   ⏱️  Grid search completed in {elapsed_time/60:.1f} minutes")
                
                # Refit best model with sample weights
                best_model = RandomForestClassifier(**grid_search.best_params_, 
                                                  random_state=42, n_jobs=-1, 
                                                  class_weight=class_weights)
                best_model.fit(X_train_scaled, y_train, sample_weight=sample_weights)
                model = best_model
            else:
                self.logger.info("   🚀 Starting grid search...")
                import time
                start_time = time.time()
                grid_search.fit(X_train_scaled, y_train)
                elapsed_time = time.time() - start_time
                self.logger.info(f"   ⏱️  Grid search completed in {elapsed_time/60:.1f} minutes")
                model = grid_search.best_estimator_
            
            # Log best parameters
            self.logger.info(f"   Best parameters: {grid_search.best_params_}")
            self.logger.info(f"   Best CV score: {grid_search.best_score_:.4f}")
            
            # Store grid search results
            grid_search_results = {
                'best_params': grid_search.best_params_,
                'best_score': float(grid_search.best_score_),
                'cv_results': {
                    'mean_test_score': grid_search.cv_results_['mean_test_score'].tolist(),
                    'params': grid_search.cv_results_['params']
                }
            }
        else:
            # Use default optimized parameters without grid search
            model = RandomForestClassifier(
                n_estimators=300,  # More trees for better performance
                max_depth=20,      # Deeper trees for complex constraint patterns
                min_samples_split=5,
                min_samples_leaf=2,
                max_features='sqrt',
                random_state=42,
                n_jobs=-1,
                class_weight=class_weights
            )
            
            # Train with or without sample weights
            if sample_weights is not None:
                model.fit(X_train_scaled, y_train, sample_weight=sample_weights)
            else:
                model.fit(X_train_scaled, y_train)
            
            grid_search_results = None
        
        # Evaluate
        train_pred = model.predict(X_train_scaled)
        test_pred = model.predict(X_test_scaled)
        
        train_accuracy = accuracy_score(y_train, train_pred)
        test_accuracy = accuracy_score(y_test, test_pred)
        
        # Cross-validation
        cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
        
        # Calculate single best solver baseline
        performance_type = PROBLEM_CONFIGS.get(self.problem, {}).get("performance_type", "time")
        train_baseline_acc, train_best_solver = self._calculate_single_best_solver_accuracy(performance_train, performance_type)
        test_baseline_acc, test_best_solver = self._calculate_single_best_solver_accuracy(performance_test, performance_type)
        
        # Feature importance (top 10)
        feature_importance = dict(zip(X_train.columns, model.feature_importances_))
        top_features = dict(sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:10])
        
        # Algorithm selection distribution
        train_dist = dict(pd.Series(train_pred).value_counts())
        test_dist = dict(pd.Series(test_pred).value_counts())
        
        # Calculate improvement over single best solver
        train_improvement = train_accuracy - train_baseline_acc
        test_improvement = test_accuracy - test_baseline_acc
        
        self.logger.info(f"   Train accuracy: {train_accuracy:.3f}")
        self.logger.info(f"   Test accuracy: {test_accuracy:.3f}")
        self.logger.info(f"   CV mean±std: {cv_scores.mean():.3f}±{cv_scores.std():.3f}")
        self.logger.info(f"📊 Single Best Solver Baseline:")
        self.logger.info(f"   Train baseline ({train_best_solver}): {train_baseline_acc:.3f}")
        self.logger.info(f"   Test baseline ({test_best_solver}): {test_baseline_acc:.3f}")
        self.logger.info(f"   Train improvement: +{train_improvement:.3f} ({train_improvement/train_baseline_acc*100:+.1f}%)")
        self.logger.info(f"   Test improvement: +{test_improvement:.3f} ({test_improvement/test_baseline_acc*100:+.1f}%)")
        
        # Calculate ranking metrics
        train_ranking_metrics = self._calculate_ranking_metrics(performance_train, train_pred, performance_type)
        test_ranking_metrics = self._calculate_ranking_metrics(performance_test, test_pred, performance_type)
        
        # Calculate single best solver ranking metrics for comparison
        train_baseline_ranking = self._calculate_single_best_ranking_metrics(performance_train, train_best_solver, performance_type)
        test_baseline_ranking = self._calculate_single_best_ranking_metrics(performance_test, test_best_solver, performance_type)
        
        # Log ranking metrics
        self.logger.info(f"📊 Ranking Metrics:")
        self.logger.info(f"   ML Selector - Train avg rank: {train_ranking_metrics['average_rank']:.2f}, Test avg rank: {test_ranking_metrics['average_rank']:.2f}")
        self.logger.info(f"   Single Best - Train avg rank: {train_baseline_ranking['average_rank']:.2f}, Test avg rank: {test_baseline_ranking['average_rank']:.2f}")
        self.logger.info(f"   ML Selector - Test top-1: {test_ranking_metrics['rank1_percentage']:.1f}%, top-3: {test_ranking_metrics['top3_percentage']:.1f}%")
        
        return {
            'model': (model, scaler),  # Save both model and scaler
            'train_accuracy': float(train_accuracy),
            'test_accuracy': float(test_accuracy),
            'cv_mean': float(cv_scores.mean()),
            'cv_std': float(cv_scores.std()),
            'single_best_solver_train_accuracy': float(train_baseline_acc),
            'single_best_solver_test_accuracy': float(test_baseline_acc),
            'single_best_solver_train_name': train_best_solver,
            'single_best_solver_test_name': test_best_solver,
            'train_improvement_over_baseline': float(train_improvement),
            'test_improvement_over_baseline': float(test_improvement),
            'top_features': top_features,
            'train_algorithm_distribution': train_dist,
            'test_algorithm_distribution': test_dist,
            'classification_report': classification_report(y_test, test_pred, output_dict=True),
            # Ranking metrics for ML selector
            'train_ranking_metrics': train_ranking_metrics,
            'test_ranking_metrics': test_ranking_metrics,
            # Ranking metrics for single best solver baseline
            'single_best_solver_train_ranking': train_baseline_ranking,
            'single_best_solver_test_ranking': test_baseline_ranking,
            # Grid search results (if used)
            'grid_search_results': grid_search_results
        }
    
    def train_autosklearn(self, X_train: pd.DataFrame, y_train: np.ndarray,
                         X_test: pd.DataFrame, y_test: np.ndarray,
                         performance_train: pd.DataFrame, performance_test: pd.DataFrame,
                         conservative: bool = False, loss_function: str = "accuracy") -> Dict[str, Any]:
        """
        Train an AutoSklearn algorithm selector and evaluate it.

        Features are standardized with a StandardScaler fitted on the training
        split, an AutoSklearnClassifier is fitted on the scaled training data,
        and accuracy, single-best-solver baseline and ranking metrics are
        computed for both the training and the test split.

        Args:
            X_train: Training features.
            y_train: Training labels passed to the classifier.
            X_test: Test features.
            y_test: Test labels used for test accuracy and the classification report.
            performance_train: Raw training performance table, forwarded to the
                baseline and ranking helpers.
            performance_test: Raw test performance table, forwarded to the
                baseline and ranking helpers.
            conservative: If True, use a 300 s budget, holdout resampling
                (80% train), a smaller ensemble and a SMAC run-count limit of 50;
                otherwise use a 600 s budget with 3-fold cross-validation.
            loss_function: "accuracy" (standard classification) or "ranking"
                (optimize average rank).

        Returns:
            ``{'error': ...}`` when AutoSklearn is not available; otherwise a
            dict holding the ``(model, scaler)`` pair under ``'model'`` together
            with accuracy, baseline, distribution, classification-report,
            AutoSklearn and ranking statistics.
        """
        if not AUTOSKLEARN_AVAILABLE:
            return {'error': 'AutoSklearn not available'}

        def _relative_pct(delta, baseline):
            # A baseline accuracy of 0 makes the relative improvement undefined;
            # report "n/a" instead of dividing by zero after a long training run.
            if baseline == 0:
                return "n/a"
            return f"{delta / baseline * 100:+.1f}%"

        selector_type = "AutoSklearn Conservative" if conservative else "AutoSklearn"
        self.logger.info(f"🤖 Training {selector_type} selector (loss: {loss_function})...")

        # Configure AutoSklearn
        time_budget = 300 if conservative else 600  # seconds
        performance_type = PROBLEM_CONFIGS.get(self.problem, {}).get("performance_type", "time")

        # Configure scoring metric based on loss function
        # NOTE(review): scoring_metric is built here but is not handed to the
        # classifier below, so "ranking" currently does not change what
        # AutoSklearn optimizes. Confirm what _create_ranking_scorer returns
        # before wiring it into the classifier.
        if loss_function == "ranking":
            scoring_metric = self._create_ranking_scorer(performance_train, performance_type)
            self.logger.info("   Using ranking-based scoring function")
        else:
            scoring_metric = None  # AutoSklearn default

        # Settings shared by both modes
        shared_kwargs = dict(
            time_left_for_this_task=time_budget,
            per_run_time_limit=30,
            memory_limit=3072,
            delete_tmp_folder_after_terminate=True,
            n_jobs=1,
            seed=42,
        )
        if conservative:
            # Conservative settings for large datasets (matching reference implementation)
            mode_kwargs = dict(
                ensemble_size=20,
                ensemble_nbest=50,
                initial_configurations_via_metalearning=5,
                resampling_strategy='holdout',
                resampling_strategy_arguments={'train_size': 0.8},
                smac_scenario_args={'runcount_limit': 50},
            )
        else:
            # Standard settings for moderate datasets
            mode_kwargs = dict(
                ensemble_size=50,
                ensemble_nbest=200,
                initial_configurations_via_metalearning=25,
                resampling_strategy='cv',
                resampling_strategy_arguments={'folds': 3},
            )
        model = autosklearn.classification.AutoSklearnClassifier(**shared_kwargs, **mode_kwargs)

        # Scale features for AutoSklearn (scaler is fitted on the training split only)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train model
        model.fit(X_train_scaled, y_train)

        # Evaluate
        train_pred = model.predict(X_train_scaled)
        test_pred = model.predict(X_test_scaled)

        train_accuracy = accuracy_score(y_train, train_pred)
        test_accuracy = accuracy_score(y_test, test_pred)

        # Query the fitted ensemble once and reuse the value for logging and results
        ensemble_size = len(model.get_models_with_weights())

        # Calculate single best solver baseline
        train_baseline_acc, train_best_solver = self._calculate_single_best_solver_accuracy(performance_train, performance_type)
        test_baseline_acc, test_best_solver = self._calculate_single_best_solver_accuracy(performance_test, performance_type)

        # Calculate improvement over single best solver
        train_improvement = train_accuracy - train_baseline_acc
        test_improvement = test_accuracy - test_baseline_acc

        self.logger.info(f"   Train accuracy: {train_accuracy:.3f}")
        self.logger.info(f"   Test accuracy: {test_accuracy:.3f}")
        self.logger.info(f"   Ensemble size: {ensemble_size}")
        self.logger.info(f"📊 Single Best Solver Baseline:")
        self.logger.info(f"   Train baseline ({train_best_solver}): {train_baseline_acc:.3f}")
        self.logger.info(f"   Test baseline ({test_best_solver}): {test_baseline_acc:.3f}")
        # Use the "+" sign flag so negative improvements print as "-0.050", not "+-0.050"
        self.logger.info(f"   Train improvement: {train_improvement:+.3f} ({_relative_pct(train_improvement, train_baseline_acc)})")
        self.logger.info(f"   Test improvement: {test_improvement:+.3f} ({_relative_pct(test_improvement, test_baseline_acc)})")

        # Calculate ranking metrics
        train_ranking_metrics = self._calculate_ranking_metrics(performance_train, train_pred, performance_type)
        test_ranking_metrics = self._calculate_ranking_metrics(performance_test, test_pred, performance_type)

        # Calculate single best solver ranking metrics for comparison
        train_baseline_ranking = self._calculate_single_best_ranking_metrics(performance_train, train_best_solver, performance_type)
        test_baseline_ranking = self._calculate_single_best_ranking_metrics(performance_test, test_best_solver, performance_type)

        # Log ranking metrics
        self.logger.info(f"📊 Ranking Metrics:")
        self.logger.info(f"   ML Selector - Train avg rank: {train_ranking_metrics['average_rank']:.2f}, Test avg rank: {test_ranking_metrics['average_rank']:.2f}")
        self.logger.info(f"   Single Best - Train avg rank: {train_baseline_ranking['average_rank']:.2f}, Test avg rank: {test_baseline_ranking['average_rank']:.2f}")
        self.logger.info(f"   ML Selector - Test top-1: {test_ranking_metrics['rank1_percentage']:.1f}%, top-3: {test_ranking_metrics['top3_percentage']:.1f}%")

        return {
            'model': (model, scaler),  # Save both model and scaler
            'train_accuracy': float(train_accuracy),
            'test_accuracy': float(test_accuracy),
            'single_best_solver_train_accuracy': float(train_baseline_acc),
            'single_best_solver_test_accuracy': float(test_baseline_acc),
            'single_best_solver_train_name': train_best_solver,
            'single_best_solver_test_name': test_best_solver,
            'train_improvement_over_baseline': float(train_improvement),
            'test_improvement_over_baseline': float(test_improvement),
            'train_algorithm_distribution': dict(pd.Series(train_pred).value_counts()),
            'test_algorithm_distribution': dict(pd.Series(test_pred).value_counts()),
            'classification_report': classification_report(y_test, test_pred, output_dict=True),
            'autosklearn_stats': {
                'ensemble_size': ensemble_size,
                'time_budget': time_budget
            },
            # Ranking metrics for ML selector
            'train_ranking_metrics': train_ranking_metrics,
            'test_ranking_metrics': test_ranking_metrics,
            # Ranking metrics for single best solver baseline
            'single_best_solver_train_ranking': train_baseline_ranking,
            'single_best_solver_test_ranking': test_baseline_ranking
        }
    
    def train_selector(self, selector_type: str = "random_forest", 
                      loss_function: str = "accuracy",
                      use_grid_search: bool = False,
                      grid_search_cv: int = 3,
                      n_jobs_grid: int = -1) -> Optional[Dict[str, Any]]:
        """
        Train a single algorithm selector and persist the model and its metrics.

        Loads the dataset, dispatches to the trainer matching ``selector_type``,
        pickles the trained model to ``results_dir`` and writes the remaining
        results as JSON next to it.

        Args:
            selector_type: Type of selector ("random_forest", "autosklearn", "autosklearn_conservative")
            loss_function: "accuracy" (standard classification) or "ranking" (optimize average rank)
            use_grid_search: Whether to use grid search for hyperparameter optimization (Random Forest only)
            grid_search_cv: Number of CV folds for grid search
            n_jobs_grid: Number of parallel jobs for grid search

        Returns:
            The results dict (without the ``'model'`` entry, which is saved to
            ``results['model_path']``), or None when the dataset cannot be
            loaded, the selector type is unknown, the trainer reports an error,
            or any exception occurs during training or saving.
        """
        self.logger.info(f"\n{'='*80}")
        self.logger.info(f"🎯 Training {selector_type} selector")
        self.logger.info(f"   Problem: {self.problem}")
        self.logger.info(f"   Extractor: {self.extractor}")
        self.logger.info(f"   Dataset: {self.dataset_dir}")
        self.logger.info(f"{'='*80}")
        
        # Load dataset
        dataset, error = self.load_dataset()
        if dataset is None:
            self.logger.error(f"❌ Failed to load dataset: {error}")
            return None
        
        X_train, y_train, X_test, y_test = dataset
        
        # Also load raw performance data for baseline calculation
        performance_train = pd.read_csv(self.dataset_dir / 'performance_train.csv')
        performance_test = pd.read_csv(self.dataset_dir / 'performance_test.csv')
        
        # Train appropriate model type
        start_time = datetime.now()
        
        try:
            if selector_type == 'random_forest':
                results = self.train_random_forest(X_train, y_train, X_test, y_test, performance_train, performance_test, 
                                                 loss_function, use_grid_search, grid_search_cv, n_jobs_grid)
            elif selector_type == 'autosklearn':
                results = self.train_autosklearn(X_train, y_train, X_test, y_test, performance_train, performance_test, conservative=False, loss_function=loss_function)
            elif selector_type == 'autosklearn_conservative':
                results = self.train_autosklearn(X_train, y_train, X_test, y_test, performance_train, performance_test, conservative=True, loss_function=loss_function)
            else:
                self.logger.error(f"❌ Unknown selector type: {selector_type}")
                return None
            
            # Check for training errors
            if 'error' in results:
                self.logger.error(f"❌ Training failed: {results['error']}")
                return None
            
            training_time = (datetime.now() - start_time).total_seconds()
            
            # Add metadata to results
            results.update({
                'problem': self.problem,
                'extractor': self.extractor,
                'selector_type': selector_type,
                'dataset_dir': str(self.dataset_dir),
                'training_time_seconds': training_time,
                'train_instances': len(X_train),
                'test_instances': len(X_test),
                'feature_count': X_train.shape[1],
                'unique_algorithms': list(set(y_train) | set(y_test)),
                'timestamp': datetime.now().isoformat()
            })
            
            # Save model
            model_filename = f"{self.problem}_{self.extractor}_{selector_type}.pkl"
            model_path = self.results_dir / model_filename
            with open(model_path, 'wb') as f:
                pickle.dump(results['model'], f)
            results['model_path'] = str(model_path)
            
            # Save results
            results_filename = f"{self.problem}_{self.extractor}_{selector_type}_results.json"
            results_path = self.results_dir / results_filename
            
            # Remove model from results dict for JSON serialization
            # (the model is already persisted in the pickle file above)
            results.pop('model')
            
            # Convert numpy types to Python native types for JSON serialization
            def convert_numpy_types(obj):
                if isinstance(obj, np.bool_):
                    return bool(obj)
                if isinstance(obj, np.integer):
                    return int(obj)
                if isinstance(obj, np.floating):
                    return float(obj)
                if isinstance(obj, np.ndarray):
                    return obj.tolist()
                if isinstance(obj, dict):
                    # json.dump rejects numpy scalars as keys, so unwrap those too
                    return {
                        (k.item() if isinstance(k, np.generic) else k): convert_numpy_types(v)
                        for k, v in obj.items()
                    }
                if isinstance(obj, (list, tuple)):
                    # Tuples are emitted as JSON arrays anyway; recurse so their
                    # elements get converted as well
                    return [convert_numpy_types(v) for v in obj]
                return obj
            
            results_serializable = convert_numpy_types(results)
            
            with open(results_path, 'w', encoding='utf-8') as f:
                json.dump(results_serializable, f, indent=2)
            
            self.logger.info(f"✅ Training completed successfully!")
            self.logger.info(f"   Training time: {training_time:.1f}s")
            self.logger.info(f"   Test accuracy: {results['test_accuracy']:.3f}")
            self.logger.info(f"   Model saved: {model_path}")
            self.logger.info(f"   Results saved: {results_path}")
            
            return results
            
        except Exception as e:
            # Keep the best-effort contract (return None) but record the
            # traceback so the failure can be diagnosed from the log.
            self.logger.error(f"❌ Training failed with exception: {e}", exc_info=True)
            return None


def main():
    """Parse command-line arguments, validate them, and train one selector.

    Exits with status 1 when the dataset path is missing or is not a
    directory, when an AutoSklearn selector is requested but AutoSklearn is
    unavailable, when training returns no results, or when any exception is
    raised while creating the trainer or training.
    """
    
    usage_examples = """
EXAMPLE COMMANDS:

Basic Usage:
  python single_dataset_trainer.py --dataset-dir ./src/datasets/vrp/mzn2feat --problem vrp
  python single_dataset_trainer.py --dataset-dir ./src/datasets/car_sequencing/lmtuner20250908123608 --problem car_sequencing

Different Selectors:
  python single_dataset_trainer.py --dataset-dir ./src/datasets/FLECC/mzn2feat --problem FLECC --selector-type random_forest
  python single_dataset_trainer.py --dataset-dir ./src/datasets/vrp/lmtuner20250908115627 --problem vrp --selector-type autosklearn
  python single_dataset_trainer.py --dataset-dir ./src/datasets/car_sequencing/mzn2feat --problem car_sequencing --selector-type autosklearn_conservative

Research Comparison:
  # Compare mzn2feat vs LLM features on same problem
  python single_dataset_trainer.py --dataset-dir ./src/datasets/vrp/mzn2feat --problem vrp --selector-type random_forest
  python single_dataset_trainer.py --dataset-dir ./src/datasets/vrp/lmtuner20250908115627 --problem vrp --selector-type random_forest

Loss Function Comparison:
  # Train with standard accuracy optimization
  python single_dataset_trainer.py --dataset-dir ./src/datasets/car_sequencing/mzn2feat --problem car_sequencing --loss-function accuracy
  
  # Train with ranking-based optimization (minimize average rank)
  python single_dataset_trainer.py --dataset-dir ./src/datasets/car_sequencing/mzn2feat --problem car_sequencing --loss-function ranking
  
  # Compare ranking-optimized LLM features vs accuracy-optimized mzn2feat
  python single_dataset_trainer.py --dataset-dir ./src/datasets/vrp/mzn2feat --problem vrp --loss-function accuracy
  python single_dataset_trainer.py --dataset-dir ./src/datasets/vrp/lmtuner20250908115627 --problem vrp --loss-function ranking

OUTPUT METRICS:
- Test Accuracy: Classification accuracy for solver selection
- Average Ranking: Mean rank of predicted solvers (1=best, lower=better)
- Top-1 Performance: % instances where best solver is predicted
- Top-3 Performance: % instances where prediction is in top 3
- Single Best Baseline: Performance of always choosing most frequent winner
- Training Time: Time to train the selector model
- Loss Function: Shows which objective was optimized during training
    """
    
    parser = argparse.ArgumentParser(
        description="Train algorithm selector on single dataset with comprehensive ranking evaluation",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples
    )
    
    parser.add_argument(
        "--dataset-dir",
        required=True,
        help="Directory containing 4-file dataset structure"
    )
    parser.add_argument(
        "--problem",
        required=True,
        choices=["vrp", "car_sequencing", "FLECC"],
        help="Problem name"
    )
    # Extractor name is auto-extracted from dataset directory name
    parser.add_argument(
        "--selector-type",
        default="random_forest",
        choices=["random_forest", "autosklearn", "autosklearn_conservative"],
        help="Type of selector to train (default: random_forest)"
    )
    parser.add_argument(
        "--results-dir",
        default="single_results",
        help="Directory to save results (default: single_results)"
    )
    parser.add_argument(
        "--loss-function",
        default="accuracy",
        choices=["accuracy", "ranking"],
        help="Loss function to optimize: 'accuracy' (classification accuracy) or 'ranking' (average rank) (default: accuracy)"
    )
    parser.add_argument(
        "--use-grid-search",
        action="store_true",
        help="Use grid search for hyperparameter optimization (only for Random Forest)"
    )
    parser.add_argument(
        "--grid-search-cv",
        type=int,
        default=3,
        help="Number of cross-validation folds for grid search (default: 3)"
    )
    parser.add_argument(
        "--n-jobs-grid",
        type=int,
        default=-1,
        help="Number of parallel jobs for grid search (-1 uses all cores, default: -1)"
    )
    
    args = parser.parse_args()
    
    # Validate dataset directory
    dataset_dir = Path(args.dataset_dir)
    if not dataset_dir.exists():
        print(f"❌ Error: Dataset directory not found: {dataset_dir}")
        sys.exit(1)
    if not dataset_dir.is_dir():
        # A regular file would otherwise pass validation and only fail later,
        # with a less helpful error, when the dataset files are opened.
        print(f"❌ Error: Dataset path is not a directory: {dataset_dir}")
        sys.exit(1)
    
    # Check AutoSklearn availability
    if 'autosklearn' in args.selector_type and not AUTOSKLEARN_AVAILABLE:
        print("❌ Error: AutoSklearn not available")
        sys.exit(1)
    
    # Create trainer and run
    try:
        trainer = SingleDatasetTrainer(
            dataset_dir=args.dataset_dir,
            problem=args.problem,
            results_dir=args.results_dir
        )
        
        results = trainer.train_selector(
            selector_type=args.selector_type,
            loss_function=args.loss_function,
            use_grid_search=args.use_grid_search,
            grid_search_cv=args.grid_search_cv,
            n_jobs_grid=args.n_jobs_grid,
        )
        
        if results is None:
            sys.exit(1)
            
    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()