#!/usr/bin/env python3
"""
Universal Algorithm Selector Trainer for LLM vs mzn2feature Research

This script orchestrates comprehensive algorithm selector training experiments across
multiple problems, feature extractors, and ML algorithms. It serves as the primary
tool for systematic comparison of LLM-generated features vs traditional mzn2feature
approaches in constraint optimization algorithm selection.

RESEARCH ARCHITECTURE:
This refactored version uses SingleDatasetTrainer as the core engine, providing:
- Consistent training methodology across all experiments
- Comprehensive evaluation including accuracy AND ranking metrics  
- Single point of maintenance for algorithmic improvements
- Parallel processing for efficient large-scale experimentation

KEY CAPABILITIES:
- Multi-Problem Support: VRP, car_sequencing, FLECC with different performance metrics
- Multi-Extractor Comparison: mzn2feat (95 features) vs LLM extractors (50 features)  
- Multi-Algorithm Testing: Random Forest, AutoSklearn, AutoSklearn Conservative
- Comprehensive Metrics: Accuracy, ranking analysis, cross-validation, baseline comparison
- Automated Report Generation: CSV summaries with statistical comparisons

EXPECTED DIRECTORY STRUCTURE:
datasets/
├── vrp/
│   ├── mzn2feat/
│   │   ├── features_train.csv
│   │   ├── features_test.csv
│   │   ├── performance_train.csv
│   │   └── performance_test.csv
│   ├── lmtuner20250908115627/    # LLM extractor 1
│   │   └── ... (same 4-file structure)
│   └── lmtuner20250908121942/    # LLM extractor 2
│       └── ... (same 4-file structure)
├── car_sequencing/
│   └── ... (similar structure)
└── FLECC/
    └── ... (similar structure)

USAGE EXAMPLES:
    # Complete experiment: all problems, extractors, selectors
    python universal_selector_trainer.py --datasets-dir ./src/datasets/
    
    # Focus on specific problems for faster testing
    python universal_selector_trainer.py --datasets-dir ./src/datasets/ --problems vrp,car_sequencing
    
    # Compare only Random Forest performance across all extractors
    python universal_selector_trainer.py --datasets-dir ./src/datasets/ --selector-types random_forest
    
    # Generate comparison report from existing results (no training)
    python universal_selector_trainer.py --datasets-dir ./src/datasets/ --comparison-only
    
    # High-throughput parallel processing (adjust based on resources)
    python universal_selector_trainer.py --datasets-dir ./src/datasets/ --max-parallel 8

RESEARCH OUTPUT:
results/
├── comparison_report.csv           # Master comparison across all experiments
├── [problem]_[extractor]_[selector]_results.json  # Detailed individual results
└── models/
    └── [problem]_[extractor]_[selector].pkl       # Trained models

EVALUATION METRICS:
Each experiment provides comprehensive evaluation:
- Test Accuracy: Classification accuracy for solver selection
- Average Ranking: Mean rank of predicted solvers (1=best, lower=better)  
- Top-1/Top-3 Performance: % instances with optimal/near-optimal predictions
- Cross-Validation: 5-fold CV for robust performance estimation
- Single Best Baseline: Performance comparison against always choosing most frequent winner
- Feature Efficiency: Number of features used (mzn2feat: 95, LLM: 50)
- Training Time: Computational cost analysis

This enables answering key research questions:
1. Do LLM-generated features outperform traditional mzn2feature approaches?
2. How consistent are improvements across different problems and selectors?
3. What is the ranking quality of LLM-based predictions beyond binary accuracy?
4. Can fewer, semantically meaningful features achieve better performance?
"""

import argparse
import json
import sys
import pandas as pd
from pathlib import Path
from typing import Dict, List, Optional, Any
import logging
from datetime import datetime
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor, as_completed
import traceback

# Import the single dataset trainer
from single_dataset_trainer import SingleDatasetTrainer


def train_single_selector_worker(datasets_dir: str, problem: str, extractor: str, 
                               selector_type: str, results_dir: str, models_dir: str, 
                               cv_folds: int = 5, loss_function: str = "accuracy",
                               use_grid_search: bool = False, grid_search_cv: int = 3,
                               n_jobs_grid: int = -1) -> Optional[Dict[str, Any]]:
    """
    Train one (problem, extractor, selector) combination in a worker process.

    The heavy lifting is delegated to SingleDatasetTrainer; this function only
    builds the dataset path, forwards the training options and tags the outcome
    with the identifying metadata. Any exception is converted into an error
    dictionary so that a failing experiment never propagates out of the worker.

    Args:
        datasets_dir: Root directory holding ``<problem>/<extractor>/`` datasets.
        problem: Problem name (sub-directory of ``datasets_dir``).
        extractor: Feature extractor name (sub-directory of the problem).
        selector_type: Selector identifier forwarded to ``train_selector``.
        results_dir: Results directory handed to SingleDatasetTrainer.
        models_dir: Accepted for call compatibility; not used by this function.
        cv_folds: Accepted for call compatibility; not used by this function.
        loss_function: Forwarded to ``train_selector``.
        use_grid_search: Forwarded to ``train_selector``.
        grid_search_cv: Forwarded to ``train_selector``.
        n_jobs_grid: Forwarded to ``train_selector``.

    Returns:
        The trainer's result dictionary extended with ``problem``, ``extractor``
        and ``selector_type``; or a dictionary containing an ``error`` key (plus
        a ``traceback`` key when an exception was raised).
    """
    # Identifying metadata attached to every outcome, successful or not.
    task_info = {
        'problem': problem,
        'extractor': extractor,
        'selector_type': selector_type
    }

    try:
        # Keep worker output quiet: only errors are reported from here.
        logging.basicConfig(level=logging.ERROR)

        extractor_path = Path(datasets_dir).joinpath(problem, extractor)

        selector_trainer = SingleDatasetTrainer(
            dataset_dir=str(extractor_path),
            problem=problem,
            results_dir=results_dir
        )

        outcome = selector_trainer.train_selector(
            selector_type=selector_type,
            loss_function=loss_function,
            use_grid_search=use_grid_search,
            grid_search_cv=grid_search_cv,
            n_jobs_grid=n_jobs_grid
        )

        if outcome is not None:
            # Tag the trainer's own result with the metadata the caller expects.
            outcome.update(task_info)
            return outcome

        return {
            'error': f'Training returned None for {problem}/{extractor}/{selector_type}',
            **task_info
        }

    except Exception as exc:
        # Report the failure as data so the parent process can log and continue.
        return {
            'error': f'Training failed: {str(exc)}',
            **task_info,
            'traceback': traceback.format_exc()
        }


class UniversalSelectorTrainer:
    """
    Universal trainer that orchestrates multiple SingleDatasetTrainer instances
    for comprehensive algorithm selector comparison experiments.

    Attributes:
        datasets_dir: Root directory containing ``<problem>/<extractor>/`` datasets.
        results_dir: Directory where the comparison report is written.
        models_dir: ``results_dir / "models"``; passed to the worker processes.
        cv_folds: Cross-validation fold count passed to the worker processes.
        all_results: Result dictionaries collected from successful experiments
            (or preloaded by the caller before generating a report).
    """

    # Files every <problem>/<extractor>/ directory must contain to be usable.
    REQUIRED_FILES = ('features_train.csv', 'features_test.csv',
                      'performance_train.csv', 'performance_test.csv')

    def __init__(self, datasets_dir: str, results_dir: Optional[str] = None, cv_folds: int = 5):
        """
        Create the trainer and make sure its output directories exist.

        Args:
            datasets_dir: Root directory containing the problem sub-directories.
            results_dir: Output directory; defaults to a ``results`` directory
                next to ``datasets_dir`` when empty or ``None``.
            cv_folds: Number of cross-validation folds passed to the workers.
        """
        self.datasets_dir = Path(datasets_dir)
        self.results_dir = Path(results_dir) if results_dir else (self.datasets_dir.parent / "results")
        self.models_dir = self.results_dir / "models"
        self.cv_folds = cv_folds
        self.all_results: List[Dict[str, Any]] = []

        # parents=True so that a nested or not-yet-existing results path works
        # instead of failing with FileNotFoundError.
        self.results_dir.mkdir(parents=True, exist_ok=True)
        self.models_dir.mkdir(parents=True, exist_ok=True)

        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[logging.StreamHandler()]
        )
        self.logger = logging.getLogger(__name__)

    def find_dataset_combinations(self, problems: List[str]) -> List[tuple]:
        """
        Find all valid dataset combinations (problem, extractor) with 4-file structure.

        Args:
            problems: Problem directory names to consider; others are ignored.

        Returns:
            ``(problem, extractor)`` tuples, sorted by path so that the task
            order does not depend on the filesystem's directory ordering.
        """
        wanted = set(problems)
        combinations = []

        for problem_dir in sorted(self.datasets_dir.iterdir()):
            if not problem_dir.is_dir() or problem_dir.name not in wanted:
                continue

            for extractor_dir in sorted(problem_dir.iterdir()):
                if not extractor_dir.is_dir():
                    continue

                # Only directories holding the complete train/test file set qualify.
                if all((extractor_dir / name).exists() for name in self.REQUIRED_FILES):
                    combinations.append((problem_dir.name, extractor_dir.name))

        return combinations

    def train_all_selectors(self, problems: List[str], selector_types: List[str], 
                          max_parallel: int = 3, loss_function: str = "accuracy",
                          use_grid_search: bool = False, grid_search_cv: int = 3,
                          n_jobs_grid: int = 1) -> None:
        """
        Train all selector combinations using parallel processing with optional grid search.

        Every (problem, extractor, selector_type) combination is submitted to a
        process pool. Successful results are appended to ``self.all_results``;
        failures are logged and summarised. A comparison report is generated at
        the end.

        Args:
            problems: Problem names to train on.
            selector_types: Selector identifiers to train for each dataset.
            max_parallel: Maximum number of worker processes (values below 1
                are treated as 1).
            loss_function: Forwarded to each worker.
            use_grid_search: Forwarded to each worker.
            grid_search_cv: Forwarded to each worker.
            n_jobs_grid: Forwarded to each worker.
        """
        # Find all valid combinations
        dataset_combinations = self.find_dataset_combinations(problems)

        if not dataset_combinations:
            self.logger.error("No valid dataset combinations found!")
            return

        # Generate all training tasks
        tasks = [
            (problem, extractor, selector_type)
            for problem, extractor in dataset_combinations
            for selector_type in selector_types
        ]

        # ProcessPoolExecutor rejects max_workers <= 0, so clamp to at least one.
        workers = max(1, max_parallel)

        self.logger.info(f"Found {len(dataset_combinations)} complete dataset combinations")
        self.logger.info(f"Starting {len(tasks)} training experiments with max {workers} parallel jobs")

        # Train in parallel
        failed_tasks = []
        # Counted locally so the summary stays correct even when all_results
        # already contained entries before this call.
        succeeded = 0

        with ProcessPoolExecutor(max_workers=workers) as executor:
            # Submit all tasks
            future_to_task = {}
            for problem, extractor, selector_type in tasks:
                future = executor.submit(
                    train_single_selector_worker,
                    str(self.datasets_dir), problem, extractor, selector_type,
                    str(self.results_dir), str(self.models_dir), self.cv_folds, loss_function,
                    use_grid_search, grid_search_cv, n_jobs_grid
                )
                future_to_task[future] = (problem, extractor, selector_type)

            # Collect results as they complete
            for i, future in enumerate(as_completed(future_to_task)):
                problem, extractor, selector_type = future_to_task[future]

                try:
                    result = future.result()
                    if result and 'error' not in result:
                        self.all_results.append(result)
                        succeeded += 1
                        self.logger.info(f"✅ [{i+1}/{len(tasks)}] Completed: {problem}/{extractor}/{selector_type}")
                    else:
                        error_msg = result.get('error', 'Unknown error') if result else 'No result returned'
                        failed_tasks.append((problem, extractor, selector_type, error_msg))
                        self.logger.error(f"❌ [{i+1}/{len(tasks)}] Failed: {problem}/{extractor}/{selector_type} - {error_msg}")
                        # Keep the worker's traceback available for debugging
                        # without cluttering the normal output.
                        if result and result.get('traceback'):
                            self.logger.debug(result['traceback'])

                except Exception as e:
                    # Raised when the worker process itself failed (e.g. it died
                    # or its result could not be transferred back).
                    failed_tasks.append((problem, extractor, selector_type, str(e)))
                    self.logger.error(f"❌ [{i+1}/{len(tasks)}] Exception: {problem}/{extractor}/{selector_type} - {e}")

        # Summary
        self.logger.info(f"\nTraining completed: {succeeded}/{len(tasks)} successful, {len(failed_tasks)} failed")

        if failed_tasks:
            self.logger.warning("Failed tasks:")
            for problem, extractor, selector_type, error in failed_tasks:
                self.logger.warning(f"  {problem}/{extractor}/{selector_type}: {error}")

        # Generate comparison report
        self.generate_comparison_report()

    def generate_comparison_report(self) -> None:
        """
        Generate comprehensive comparison report from all results.

        Writes ``comparison_report.csv`` into ``results_dir`` and logs summary
        statistics. Results that lack some of the expected keys (``problem``,
        ``extractor``, ``selector_type``, ``test_accuracy``) are still written
        to the CSV; the summaries that depend on a missing column are skipped
        instead of raising.
        """
        if not self.all_results:
            self.logger.warning("No results to generate report from")
            return

        self.logger.info(f"\n{'='*80}")
        self.logger.info("GENERATING COMPARISON REPORT")
        self.logger.info(f"{'='*80}")

        # Convert to DataFrame for easier analysis
        df = pd.DataFrame(self.all_results)

        # Save raw comparison data
        comparison_file = self.results_dir / "comparison_report.csv"
        df.to_csv(comparison_file, index=False)
        self.logger.info(f"\nComparison report saved to: {comparison_file}")

        # Summary statistics
        self.logger.info(f"Total experiments: {len(self.all_results)}")
        summary_columns = (
            ("Problems", 'problem'),
            ("Extractors", 'extractor'),
            ("Selector types", 'selector_type'),
        )
        for label, column in summary_columns:
            if column in df.columns:
                # dropna: rows missing this key would otherwise mix NaN with
                # strings and make sorted() raise TypeError.
                self.logger.info(f"{label}: {sorted(df[column].dropna().unique())}")

        if 'test_accuracy' not in df.columns:
            self.logger.warning("No 'test_accuracy' values in results; skipping accuracy summaries")
            return

        # Work on a numeric copy and ignore rows without a usable accuracy.
        scored = df.assign(test_accuracy=pd.to_numeric(df['test_accuracy'], errors='coerce'))
        scored = scored.dropna(subset=['test_accuracy'])

        # Best performers by problem
        self.logger.info(f"\nBest test accuracy by problem:")
        if 'problem' in scored.columns:
            # sort=False keeps problems in order of first appearance.
            for problem, problem_df in scored.groupby('problem', sort=False):
                best_row = problem_df.loc[problem_df['test_accuracy'].idxmax()]
                best_extractor = best_row.get('extractor', 'unknown')
                best_selector = best_row.get('selector_type', 'unknown')
                self.logger.info(f"  {problem}: {best_row['test_accuracy']:.3f} ({best_extractor}, {best_selector})")

        # Feature extraction comparison
        self.logger.info(f"\nFeature extraction comparison:")
        if 'extractor' not in scored.columns:
            return

        # astype(str) turns missing extractors into 'nan', which matches
        # neither group, so the boolean masks never contain NaN.
        extractor_names = scored['extractor'].astype(str)
        mzn2_df = scored[extractor_names == 'mzn2feat']
        llm_df = scored[extractor_names.str.startswith('lmtuner')]

        if not mzn2_df.empty and not llm_df.empty:
            mzn2_avg = mzn2_df['test_accuracy'].mean()
            llm_avg = llm_df['test_accuracy'].mean()

            self.logger.info(f"  mzn2feat average test accuracy: {mzn2_avg:.3f}")
            self.logger.info(f"  LLM features average test accuracy: {llm_avg:.3f}")
            if mzn2_avg == 0:
                # A relative improvement over a zero baseline is undefined.
                self.logger.info("  Improvement: n/a (mzn2feat average accuracy is 0)")
            else:
                improvement = ((llm_avg - mzn2_avg) / mzn2_avg) * 100
                self.logger.info(f"  Improvement: {improvement:+.1f}%")


def main():
    """
    Command-line entry point.

    Parses the arguments, validates them, and then either trains every
    requested (problem, extractor, selector) combination or, with
    ``--comparison-only``, rebuilds the comparison report from the
    ``*_results.json`` files already present in the results directory.

    Exits with status 1 when the datasets path is missing or not a directory,
    when an unknown selector type is requested, or when ``--max-parallel`` is
    smaller than 1.
    """
    
    usage_examples = """
RESEARCH EXPERIMENT EXAMPLES:

Complete Comparison Study:
  # Train all selectors on all problems and extractors (full experiment)
  python universal_selector_trainer.py --datasets-dir ./src/datasets/
  
  # Results: 27 experiments (3 problems × 3 extractors × 3 selectors)
  # Output: Comprehensive comparison report with ranking metrics

Targeted Research Questions:
  # Q1: Which problems benefit most from LLM features?
  python universal_selector_trainer.py --datasets-dir ./src/datasets/ --problems vrp,car_sequencing
  
  # Q2: How do different selectors perform with LLM features?
  python universal_selector_trainer.py --datasets-dir ./src/datasets/ --selector-types random_forest,autosklearn
  
  # Q3: Fast Random Forest comparison across all extractors
  python universal_selector_trainer.py --datasets-dir ./src/datasets/ --selector-types random_forest

Performance and Resource Management:
  # High-throughput parallel processing (adjust based on CPU cores)
  python universal_selector_trainer.py --datasets-dir ./src/datasets/ --max-parallel 8
  
  # Quick test run on single problem
  python universal_selector_trainer.py --datasets-dir ./src/datasets/ --problems car_sequencing --max-parallel 1

Analysis and Reporting:
  # Generate report from existing results (no training)
  python universal_selector_trainer.py --datasets-dir ./src/datasets/ --comparison-only
  
  # Custom results location
  python universal_selector_trainer.py --datasets-dir ./src/datasets/ --results-dir my_experiment_results

RESEARCH WORKFLOW:
1. Run complete experiment: python universal_selector_trainer.py --datasets-dir ./src/datasets/
2. Analyze results: Check results/comparison_report.csv
3. Generate paper figures from JSON results in results/ directory
4. Reproduce specific configurations as needed

EXPECTED RUNTIME:
- Random Forest: ~30s per experiment
- AutoSklearn: ~10-15 minutes per experiment  
- Full experiment: ~6-8 hours (with parallel processing)
    """
    
    parser = argparse.ArgumentParser(
        description="Universal Algorithm Selector Trainer for LLM vs mzn2feature Research",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples
    )
    
    parser.add_argument('--datasets-dir', type=str, required=True,
                       help='Path to datasets directory containing problem subdirectories')
    parser.add_argument('--problems', type=str, default='vrp,car_sequencing,FLECC',
                       help='Comma-separated list of problems to train (default: all three)')
    parser.add_argument('--selector-types', type=str, 
                       default='random_forest,autosklearn,autosklearn_conservative',
                       help='Comma-separated list of selector types (default: all three)')
    parser.add_argument('--results-dir', type=str, default='results',
                       help='Path to results directory (default: results)')
    parser.add_argument('--cv-folds', type=int, default=5,
                       help='Number of cross-validation folds (default: 5)')
    parser.add_argument('--max-parallel', type=int, default=3,
                       help='Maximum parallel training jobs (default: 3, adjust based on resources)')
    parser.add_argument('--loss-function', type=str, default='accuracy', 
                       choices=['accuracy', 'ranking'],
                       help='Loss function for training: accuracy (default) or ranking')
    parser.add_argument('--use-grid-search', action='store_true',
                       help='Use grid search for hyperparameter optimization (Random Forest only)')
    parser.add_argument('--grid-search-cv', type=int, default=3,
                       help='Number of CV folds for grid search (default: 3)')
    parser.add_argument('--n-jobs-grid', type=int, default=1,
                       help='Number of parallel jobs per grid search (default: 1, higher values may conflict with max-parallel)')
    parser.add_argument('--comparison-only', action='store_true',
                       help='Skip training, only generate comparison report from existing results')
    
    args = parser.parse_args()
    
    # Validate datasets directory
    datasets_dir = Path(args.datasets_dir)
    if not datasets_dir.exists():
        print(f"Error: Datasets directory not found: {datasets_dir}")
        sys.exit(1)
    if not datasets_dir.is_dir():
        # A regular file would pass exists() but cannot be scanned for problems.
        print(f"Error: Datasets path is not a directory: {datasets_dir}")
        sys.exit(1)
    
    # Parse comma-separated lists, ignoring empty entries such as the one
    # produced by a trailing comma ("vrp,").
    problems = [p.strip() for p in args.problems.split(',') if p.strip()]
    selector_types = [s.strip() for s in args.selector_types.split(',') if s.strip()]
    
    # Validate selector types
    valid_types = ['random_forest', 'autosklearn', 'autosklearn_conservative']
    invalid_types = [t for t in selector_types if t not in valid_types]
    if invalid_types:
        print(f"Error: Invalid selector types: {invalid_types}")
        print(f"Valid types: {valid_types}")
        sys.exit(1)
    
    # The process pool needs at least one worker.
    if args.max_parallel < 1:
        print(f"Error: --max-parallel must be at least 1 (got {args.max_parallel})")
        sys.exit(1)
    
    # Create trainer
    trainer = UniversalSelectorTrainer(
        datasets_dir=str(datasets_dir),
        results_dir=args.results_dir,
        cv_folds=args.cv_folds
    )
    
    if args.comparison_only:
        # Load existing results and generate comparison. Files are read in
        # sorted order so the report rows are reproducible between runs.
        for results_file in sorted(trainer.results_dir.glob("*_results.json")):
            try:
                with open(results_file, encoding="utf-8") as f:
                    results = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers malformed JSON and undecodable bytes.
                trainer.logger.warning(f"Could not load {results_file}: {e}")
                continue
            
            if not isinstance(results, dict):
                # Only per-experiment result objects can become report rows.
                trainer.logger.warning(f"Could not load {results_file}: expected a JSON object")
                continue
            
            trainer.all_results.append(results)
        trainer.generate_comparison_report()
    else:
        # Train all selectors
        trainer.train_all_selectors(problems, selector_types, args.max_parallel, args.loss_function,
                                   args.use_grid_search, args.grid_search_cv, args.n_jobs_grid)


# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()