#!/usr/bin/env python3
"""
Comprehensive Evaluation Pipeline for Schema Induction

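This pipeline runs all evaluation scripts in the correct order:
0. Clear the output directory (everything except pipeline_log.txt)
1. Build corpus from test data
2. Reusability evaluation
3. Descriptive fitness evaluation
4. Descriptive coverage evaluation
4.5. Generate embeddings for codes (input to the parsimony evaluation)
5. Parsimony evaluation
6. Consistency/stability evaluation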

Usage:
    python comprehensive_evaluation_pipeline.py --test_data <path> --question <question> --train_corpus <path> [--hierarchical_tree <path>] --output_dir <dir>
"""

import argparse
import json
import os
import shlex
import shutil
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

class ComprehensiveEvaluationPipeline:
    """Runs all evaluation scripts in the correct order"""
    
    def __init__(self, output_dir: str):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.log_file = self.output_dir / "pipeline_log.txt"
        
        # Set environment variable to use VLLM_QWEN_32B_URL_2
        if os.getenv('VLLM_QWEN_32B_URL_2'):
            os.environ['VLLM_QWEN_32B_URL'] = os.getenv('VLLM_QWEN_32B_URL_2')
            self.log(f"🔗 Using VLLM_QWEN_32B_URL_2: {os.getenv('VLLM_QWEN_32B_URL_2')}")
        else:
            self.log("⚠️ VLLM_QWEN_32B_URL_2 not set, using default VLLM_QWEN_32B_URL")
        
        # Initialize log
        self.log("🚀 Starting Comprehensive Evaluation Pipeline")
        self.log(f"📁 Output directory: {self.output_dir}")
        self.log(f"⏰ Timestamp: {self.timestamp}")
        self.log("=" * 60)
    
    def log(self, message: str):
        """Log message to both console and file"""
        timestamp = datetime.now().strftime("%H:%M:%S")
        log_message = f"[{timestamp}] {message}"
        print(log_message)
        
        with open(self.log_file, 'a') as f:
            f.write(log_message + '\n')
    
    def run_command(self, command: list, step_name: str) -> bool:
        """Run a command and log the results"""
        self.log(f"🔄 {step_name}")
        self.log(f"📝 Command: {' '.join(command)}")
        
        try:
            result = subprocess.run(
                command,
                capture_output=True,
                text=True,
                cwd=os.getcwd(),
                env=os.environ
            )
            
            if result.returncode == 0:
                self.log(f"✅ {step_name} completed successfully")
                if result.stdout:
                    self.log(f"📤 Output: {result.stdout.strip()}")
                return True
            else:
                self.log(f"❌ {step_name} failed with return code {result.returncode}")
                if result.stderr:
                    self.log(f"📤 Error: {result.stderr.strip()}")
                return False
                
        except Exception as e:
            self.log(f"❌ {step_name} failed with exception: {str(e)}")
            return False
    
    def clear_results_folder(self):
        """Clear the results folder before running the pipeline"""
        self.log("\n" + "="*60)
        self.log("🧹 CLEARING RESULTS FOLDER")
        self.log("="*60)
        
        try:
            files_removed = 0
            if self.output_dir.exists():
                for file_path in self.output_dir.iterdir():
                    if file_path.is_file() and file_path.name != "pipeline_log.txt":
                        try:
                            file_path.unlink()
                            files_removed += 1
                            self.log(f"🗑️  Removed: {file_path.name}")
                        except Exception as e:
                            self.log(f"⚠️  Could not remove {file_path.name}: {str(e)}")
                
                # Also remove subdirectories if any
                for dir_path in self.output_dir.iterdir():
                    if dir_path.is_dir():
                        try:
                            shutil.rmtree(dir_path)
                            self.log(f"🗑️  Removed directory: {dir_path.name}")
                        except Exception as e:
                            self.log(f"⚠️  Could not remove directory {dir_path.name}: {str(e)}")
            
            self.log(f"✅ Results folder cleared! Removed {files_removed} files")
            
        except Exception as e:
            self.log(f"⚠️  Error clearing results folder: {str(e)}")
            self.log("⚠️  Continuing with pipeline...")
    
    def step1_build_corpus(self, test_data: str, question: str, train_corpus: str, hierarchical_tree: str = None) -> str:
        """Step 1: Build corpus from test data"""
        self.log("\n" + "="*60)
        self.log("📊 STEP 1: BUILD CORPUS")
        self.log("="*60)
        
        output_file = self.output_dir / "test_corpus.parquet"
        summary_file = self.output_dir / "test_corpus_summary.json"
        
        command = [
            "python", "build_corpus.py",
            "--question", question,
            "--train_corpus", train_corpus,
            "--test_data", test_data,
            "--output", str(output_file)
        ]
        
        # Add hierarchical_tree if provided
        if hierarchical_tree:
            command.extend(["--hierarchical_tree", hierarchical_tree, "--use_hierarchical_linkage"])
            self.log(f"🌳 Using hierarchical tree: {hierarchical_tree}")
        else:
            self.log("🌳 No hierarchical tree provided - using default behavior (no parents/grandparents)")
        
        success = self.run_command(command, "Building corpus from test data")
        
        if success and output_file.exists():
            self.log(f"📁 Corpus saved to: {output_file}")
            self.log(f"📁 Summary saved to: {summary_file}")
            return str(output_file)
        else:
            self.log("❌ Failed to build corpus")
            return None
    
    def step2_reusability_eval(self, test_corpus: str, question: str, train_corpus: str, hierarchical_tree: str = None) -> str:
        """Step 2: Reusability evaluation"""
        self.log("\n" + "="*60)
        self.log("📊 STEP 2: REUSABILITY EVALUATION")
        self.log("="*60)
        
        output_file = self.output_dir / "reusability_results.json"
        
        command = [
            "python", "reusability_eval.py",
            "--test_corpus", test_corpus,
            "--question", question,
            "--train_corpus", train_corpus,
            "--output", str(output_file),
        ]
        
        # Add hierarchical_tree if provided (optional for reusability eval)
        if hierarchical_tree:
            command.extend(["--hierarchical_tree", hierarchical_tree])
            self.log(f"🌳 Using hierarchical tree: {hierarchical_tree}")
        else:
            self.log("🌳 No hierarchical tree provided - reusability eval will use default behavior")
        
        success = self.run_command(command, "Running reusability evaluation")
        
        if success and output_file.exists():
            self.log(f"📁 Reusability results saved to: {output_file}")
            return str(output_file)
        else:
            self.log("❌ Failed to run reusability evaluation")
            return None
    
    def step3_descriptive_fitness_eval(self, reusability_results: str, test_data: str) -> str:
        """Step 3: Descriptive fitness evaluation"""
        self.log("\n" + "="*60)
        self.log("📊 STEP 3: DESCRIPTIVE FITNESS EVALUATION")
        self.log("="*60)
        
        output_file = self.output_dir / "descriptive_fitness_results.json"
        
        command = [
            "python", "descriptive_fitness_eval.py",
            "--sample_ratio", "1.0",
            "--results_path", reusability_results,
            "--test_data", test_data,
            "--output", str(output_file)
        ]
        
        success = self.run_command(command, "Running descriptive fitness evaluation")
        
        if success and output_file.exists():
            self.log(f"📁 Descriptive fitness results saved to: {output_file}")
            return str(output_file)
        else:
            self.log("❌ Failed to run descriptive fitness evaluation")
            return None
    
    def step4_descriptive_coverage_eval(self, reusability_results: str, test_data: str) -> str:
        """Step 4: Descriptive coverage evaluation"""
        self.log("\n" + "="*60)
        self.log("📊 STEP 4: DESCRIPTIVE COVERAGE EVALUATION")
        self.log("="*60)
        
        output_file = self.output_dir / "descriptive_coverage_results.json"
        
        command = [
            "python", "descriptive_coverage_eval.py",
            "--sample_ratio", "1.0",
            "--results_path", reusability_results,
            "--test_data", test_data,
            "--output", str(output_file)
        ]
        
        success = self.run_command(command, "Running descriptive coverage evaluation")
        
        if success and output_file.exists():
            self.log(f"📁 Descriptive coverage results saved to: {output_file}")
            return str(output_file)
        else:
            self.log("❌ Failed to run descriptive coverage evaluation")
            return None
    
    def step4_5_generate_embeddings(self, reusability_results: str) -> str:
        """Step 4.5: Generate embeddings for codes"""
        self.log("\n" + "="*60)
        self.log("📊 STEP 4.5: GENERATE EMBEDDINGS FOR CODES")
        self.log("="*60)
        
        output_file = self.output_dir / "code_embeddings.json"
        
        command = [
            "python", "generate_code_embeddings.py",
            "--reusability_results", reusability_results,
            "--output", str(output_file)
        ]
        
        success = self.run_command(command, "Generating embeddings for codes")
        
        if success and output_file.exists():
            self.log(f"📁 Code embeddings saved to: {output_file}")
            return str(output_file)
        else:
            self.log("❌ Failed to generate code embeddings")
            return None

    def step5_parsimony_eval(self, embeddings_results: str) -> str:
        """Step 5: Parsimony evaluation"""
        self.log("\n" + "="*60)
        self.log("📊 STEP 5: PARSIMONY EVALUATION")
        self.log("="*60)
        
        output_file = self.output_dir / "parsimony_results.json"
        
        command = [
            "python", "parsimony_eval.py",
            "--results_file", embeddings_results
        ]
        
        success = self.run_command(command, "Running parsimony evaluation")
        
        if success:
            # Check if parsimony_results.json was created (default output)
            default_output = "parsimony_results.json"
            if os.path.exists(default_output):
                # Move to our output directory
                shutil.move(default_output, output_file)
                self.log(f"📁 Parsimony results saved to: {output_file}")
                return str(output_file)
            else:
                self.log("❌ Parsimony results file not found")
                return None
        else:
            self.log("❌ Failed to run parsimony evaluation")
            return None
    
    def step6_consistency_stability_eval(self, test_corpus: str, train_corpus: str) -> str:
        """Step 6: Consistency/stability evaluation"""
        self.log("\n" + "="*60)
        self.log("📊 STEP 6: CONSISTENCY/STABILITY EVALUATION")
        self.log("="*60)
        
        output_file = self.output_dir / "consistency_stability_results.json"
        
        command = [
            "python", "consistency_stability_eval.py",
            "--test_data", test_corpus,
            "--train_corpus", train_corpus,
            "--output", str(output_file),
        ]
        
        success = self.run_command(command, "Running consistency/stability evaluation")
        
        if success and output_file.exists():
            self.log(f"📁 Consistency/stability results saved to: {output_file}")
            return str(output_file)
        else:
            self.log("❌ Failed to run consistency/stability evaluation")
            return None
    
    def generate_summary_report(self, results: dict):
        """Generate a comprehensive summary report"""
        self.log("\n" + "="*60)
        self.log("📊 GENERATING SUMMARY REPORT")
        self.log("="*60)
        
        summary_file = self.output_dir / "comprehensive_evaluation_summary.json"
        
        # Load all results and create summary
        summary = {
            "pipeline_info": {
                "timestamp": self.timestamp,
                "output_directory": str(self.output_dir),
                "log_file": str(self.log_file)
            },
            "results_files": results,
            "evaluation_summary": {}
        }
        
        # Try to load and summarize each result
        for eval_name, file_path in results.items():
            if file_path and os.path.exists(file_path):
                # Skip non-JSON files (like parquet files)
                if not file_path.endswith(".json"):
                    continue
                try:
                    with open(file_path, 'r') as f:
                        data = json.load(f)
                    
                    # Extract key metrics based on evaluation type
                    if "reusability" in eval_name:
                        if "reusability_metric" in data:
                            summary["evaluation_summary"]["reusability"] = {
                                "metric": data["reusability_metric"],
                                "total_unique_codes": data.get("total_unique_codes", 0),
                                "codes_in_train": data.get("codes_in_train", 0),
                                "unique_train_codes": data.get("unique_train_codes", 0)
                            }
                    
                    elif "fitness" in eval_name:
                        if "average_fitness_score" in data:
                            summary["evaluation_summary"]["descriptive_fitness"] = {
                                "average_score": data.get("average_fitness_score", 0),
                                "score_range": f"{data.get('min_score', 0)}-{data.get('max_score', 0)}",
                                "total_chunks": data.get("total_chunks", 0)
                            }
                    
                    elif "coverage" in eval_name:
                        if "average_coverage_score" in data:
                            summary["evaluation_summary"]["descriptive_coverage"] = {
                                "average_score": data.get("average_coverage_score", 0),
                                "score_range": f"{data.get('min_score', 0)}-{data.get('max_score', 0)}",
                                "total_chunks": data.get("total_chunks", 0)
                            }
                    
                    elif "parsimony" in eval_name:
                        if "parsimony_score" in data:
                            summary["evaluation_summary"]["parsimony"] = {
                                "parsimony_score": data.get("parsimony_score", 0),
                                "total_codes": data.get("total_codes", 0),
                                "unique_codes": data.get("unique_codes", 0)
                            }
                    
                    elif "consistency" in eval_name:
                        if "consistency_stability_metrics" in data:
                            summary["evaluation_summary"]["consistency_stability"] = {
                                "jsd": data["consistency_stability_metrics"].get("jsd", 0),
                                "consistency": data["consistency_stability_metrics"].get("consistency", 0),
                                "stability": data["consistency_stability_metrics"].get("stability", 0),
                                "test_coverage": data["consistency_stability_metrics"].get("test_coverage", 0)
                            }
                
                except Exception as e:
                    self.log(f"⚠️ Could not load {eval_name} results: {str(e)}")
        
        # Save summary
        with open(summary_file, 'w') as f:
            json.dump(summary, f, indent=2)
        
        self.log(f"📁 Summary report saved to: {summary_file}")
        
        # Print summary to console
        self.log("\n" + "="*60)
        self.log("📊 EVALUATION SUMMARY")
        self.log("="*60)
        
        for eval_name, metrics in summary["evaluation_summary"].items():
            self.log(f"\n📈 {eval_name.upper()}:")
            for metric, value in metrics.items():
                self.log(f"   {metric}: {value}")
    
    def run_pipeline(self, test_data: str, question: str, train_corpus: str, hierarchical_tree: str = None):
        """Run the complete evaluation pipeline"""
        results = {}
        
        # Step 0: Clear results folder
        self.clear_results_folder()
        
        # Step 1: Build corpus
        test_corpus = self.step1_build_corpus(test_data, question, train_corpus, hierarchical_tree)
        if not test_corpus:
            self.log("❌ Pipeline failed at Step 1: Build corpus")
            return False
        results["test_corpus"] = test_corpus
        
        # Step 2: Reusability evaluation
        reusability_results = self.step2_reusability_eval(test_corpus, question, train_corpus, hierarchical_tree)
        if not reusability_results:
            self.log("❌ Pipeline failed at Step 2: Reusability evaluation")
            return False
        results["reusability"] = reusability_results
        
        # Step 3: Descriptive fitness evaluation
        fitness_results = self.step3_descriptive_fitness_eval(reusability_results, test_data)
        if not fitness_results:
            self.log("⚠️ Pipeline continued despite Step 3 failure: Descriptive fitness evaluation")
        results["descriptive_fitness"] = fitness_results
        
        # Step 4: Descriptive coverage evaluation
        coverage_results = self.step4_descriptive_coverage_eval(reusability_results, test_data)
        if not coverage_results:
            self.log("⚠️ Pipeline continued despite Step 4 failure: Descriptive coverage evaluation")
        results["descriptive_coverage"] = coverage_results
        
        # Step 4.5: Generate embeddings for codes
        embeddings_results = self.step4_5_generate_embeddings(reusability_results)
        if not embeddings_results:
            self.log("⚠️ Pipeline continued despite Step 4.5 failure: Embedding generation")
        results["code_embeddings"] = embeddings_results
        
        # Step 5: Parsimony evaluation
        parsimony_results = self.step5_parsimony_eval(embeddings_results) if embeddings_results else None
        if not parsimony_results:
            self.log("⚠️ Pipeline continued despite Step 5 failure: Parsimony evaluation")
        results["parsimony"] = parsimony_results
        
        # Step 6: Consistency/stability evaluation
        consistency_results = self.step6_consistency_stability_eval(test_corpus, train_corpus)
        if not consistency_results:
            self.log("⚠️ Pipeline continued despite Step 6 failure: Consistency/stability evaluation")
        results["consistency_stability"] = consistency_results
        
        # Generate summary report
        self.generate_summary_report(results)
        
        self.log("\n" + "="*60)
        self.log("🎉 PIPELINE COMPLETED SUCCESSFULLY!")
        self.log("="*60)
        self.log(f"📁 All results saved in: {self.output_dir}")
        self.log(f"📝 Log file: {self.log_file}")
        
        return True

def main():
    """Command-line entry point.

    Parses the arguments, runs the pipeline once and terminates the process
    with exit status 0 when the pipeline reports success, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description='Comprehensive Evaluation Pipeline for Schema Induction')

    # The three mandatory inputs share the same option shape.
    mandatory_options = (
        ('--test_data', 'Path to test data CSV file'),
        ('--question', 'Question for the evaluation'),
        ('--train_corpus', 'Path to training corpus parquet file'),
    )
    for flag, help_text in mandatory_options:
        parser.add_argument(flag, type=str, required=True, help=help_text)

    parser.add_argument(
        '--hierarchical_tree',
        type=str,
        default=None,
        help='Path to hierarchical tree JSON file (optional - defaults to no parents/grandparents)',
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default='evaluation_results',
        help='Output directory for all results',
    )

    options = parser.parse_args()

    # Create and run pipeline
    runner = ComprehensiveEvaluationPipeline(options.output_dir)
    succeeded = runner.run_pipeline(
        options.test_data,
        options.question,
        options.train_corpus,
        options.hierarchical_tree,
    )

    if not succeeded:
        print("\n❌ Pipeline failed!")
        sys.exit(1)

    print("\n🎉 Pipeline completed successfully!")
    print(f"📁 Results saved in: {options.output_dir}")
    sys.exit(0)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
