#!/usr/bin/env python3
"""
Unified Schema Induction Pipeline with Multi-Iteration Requirement

This unified pipeline requires multi-iteration schema induction for all modes.
Users can specify how many iterations to run for each mode.

Modes:
1. single-qa: Single question answering with multi-iteration pipeline
2. batch-qa: Batch QA with clustering and multi-iteration processing
3. codebook-gen: Generate codebooks using multi-iteration pipeline
4. qa-with-codebook: Use existing codebook for QA (still requires multi-iteration for context)

Usage:
    python unified_pipeline.py --mode single-qa --question "Your question here" --iterations 2
    python unified_pipeline.py --mode batch-qa --questions-file questions.csv --iterations 3
    python unified_pipeline.py --mode codebook-gen --questions-file questions.csv --iterations 2
    python unified_pipeline.py --mode qa-with-codebook --question "Your question" --codebook-path path/to/codebook --iterations 2
"""

# Direct server call function (based on build_corpus.py)
async def generate_answer_direct(question: str, context: str, model_url: str = None) -> str:
    """
    Generate an answer with a direct HTTP call to the chat-completions server.

    No client wrapper is involved: the prompt is POSTed with aiohttp to
    ``<base url>/v1/chat/completions`` (based on AsyncVLLMClient from
    build_corpus.py).

    Args:
        question: The question to answer.
        context: Context text placed in the prompt ahead of the question.
        model_url: Base URL of the server. When empty or None, the value of
            the VLLM_QWEN_32B_URL environment variable is used instead.

    Returns:
        The message content of the first choice in the server's response.

    Raises:
        ValueError: If no URL was passed and VLLM_QWEN_32B_URL is not set.
        Exception: If the request fails, the server replies with a non-200
            status, or the response does not have the expected fields. The
            original error is attached as ``__cause__``.
    """
    import os

    # Use provided model_url or get from environment
    chat_url = model_url or os.environ.get('VLLM_QWEN_32B_URL')
    # The model name is configurable through VLLM_QWEN_32B_MODEL
    model_name = os.environ.get('VLLM_QWEN_32B_MODEL', 'qwen2.5-32b-instruct')

    if not chat_url:
        raise ValueError("VLLM_QWEN_32B_URL environment variable not set")

    # Imported only after the configuration check so that a missing URL is
    # reported as such even where aiohttp is not installed.
    import aiohttp

    # Prepare the prompt
    prompt = f"""Based on the following context, answer the question:

Context:
{context}

Question: {question}

Answer:

IMPORTANT: Do not use thinking mode, step-by-step reasoning, or meta-commentary. Provide a direct answer immediately."""

    # Prepare the request payload
    payload = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "max_tokens": 1000
    }

    # Tolerate a base URL written with a trailing slash
    endpoint = f"{chat_url.rstrip('/')}/v1/chat/completions"

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(endpoint, json=payload) as response:
                if response.status != 200:
                    # Keep a bounded excerpt of the body: it usually says why
                    # the server rejected the request.
                    detail = (await response.text())[:500]
                    raise Exception(
                        f"API request failed with status {response.status}: {detail}"
                    )
                result = await response.json()
                return result["choices"][0]["message"]["content"]
    except Exception as e:
        raise Exception(f"Failed to generate answer: {e}") from e
import os
import sys
import argparse
import asyncio
import json
import pandas as pd
from typing import Dict, List, Any, Optional
from pathlib import Path
from datetime import datetime

# Put the "utils" directory that sits next to this file on sys.path so the
# modules inside it can also be imported by their bare names.
sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))

class UnifiedPipeline:
    """
    Unified pipeline that requires multi-iteration schema induction for all modes
    """
    
    def __init__(self, base_dir: str = None, model_url: str = None):
        """
        Set up the working directory and model endpoint of the pipeline.

        Args:
            base_dir: Directory for temporary files. When None, a
                "temp_files" directory next to this script is used.
            model_url: URL for the VLLM model. When falsy, the value of the
                VLLM_QWEN_32B_URL environment variable is used (which may
                itself be unset, leaving None).
        """
        if base_dir is None:
            # Default location: <directory of this script>/temp_files
            here = os.path.dirname(os.path.abspath(__file__))
            base_dir = os.path.join(here, "temp_files")

        if not model_url:
            model_url = os.environ.get("VLLM_QWEN_32B_URL")

        self.base_dir = base_dir
        self.model_url = model_url

        # Create the working directory up front; an existing one is fine
        os.makedirs(base_dir, exist_ok=True)
    async def run_single_qa(self, question: str, iterations: int = 2, **kwargs) -> Dict[str, Any]:
        """
        Mode 1: answer one question after a multi-iteration pipeline run.

        The multi-iteration schema induction runs first; context is then
        retrieved for the question, flattened to text, capped in length and
        sent to the model together with the question.

        Args:
            question: The question to answer
            iterations: Number of iterations to run
            **kwargs: Extra keyword arguments forwarded unchanged to
                MultiIterationSchemaInduction

        Returns:
            A dict with "success": True plus the answer and the
            multi-iteration result, or "success": False plus an "error"
            string when any step raised.
        """
        for banner_line in (
            "🎯 Mode: Single QA with Multi-Iteration",
            f"   Question: {question}",
            f"   Iterations: {iterations}",
            "=" * 60,
        ):
            print(banner_line)

        try:
            # Step 1: multi-iteration schema induction for this question
            from utils.multi_iteration_schema_induction import MultiIterationSchemaInduction

            induction = MultiIterationSchemaInduction(
                base_temp_dir=self.base_dir,
                max_iterations=iterations,
                model_url=self.model_url,
                **kwargs
            )
            induction_outcome = await induction.run_full_pipeline(question)
            if not induction_outcome.get("success"):
                raise Exception(f"Multi-iteration failed: {induction_outcome.get('error')}")

            # Step 2: retrieve context; the context retriever takes its
            # paths from the graph-based data retriever
            from utils.data_retrieval import GraphBasedDataRetriever
            from utils.context_retrievers import DataRetrievalContextRetriever

            graph_retriever = GraphBasedDataRetriever(base_temp_dir=self.base_dir)
            retriever = DataRetrievalContextRetriever(
                graph_retriever.embeddings_path,
                graph_retriever.topological_graph_dir,
            )
            retrieved = await retriever.retrieve_context(question=question)
            chunks = retrieved.get("chunks", [])

            # Step 3: flatten the chunks into one string for the prompt
            if isinstance(chunks, list):
                context_text = "\n\n".join(str(piece) for piece in chunks)
            else:
                context_text = str(chunks)

            # Cap the context length to prevent API errors (max ~8000 characters)
            char_limit = 8000
            if len(context_text) > char_limit:
                print(f"   ⚠️ Context too long ({len(context_text)} chars), truncating to {char_limit}")
                context_text = context_text[:char_limit] + "..."

            # Step 4: direct server call, no client wrapper
            answer = await generate_answer_direct(question, context_text, self.model_url)
        except Exception as e:
            print(f"❌ Single QA with multi-iteration failed: {e}")
            return {
                "mode": "single-qa",
                "question": question,
                "iterations": iterations,
                "error": str(e),
                "success": False
            }
        else:
            return {
                "mode": "single-qa",
                "question": question,
                "iterations": iterations,
                "multi_iteration_result": induction_outcome,
                "answer": answer,
                "success": True
            }
            
    async def run_batch_qa(self, questions_file: str, iterations: int = 2, **kwargs) -> Dict[str, Any]:
        """
        Mode 2: Batch QA with clustering and multi-iteration processing

        Questions are read from a CSV file, clustered by BatchQAPipeline, and
        the multi-iteration pipeline is run once per representative question.
        The per-representative results are then handed back to
        BatchQAPipeline together with the full question list.

        Args:
            questions_file: Path to a CSV file; the "question" column is used
                when present, otherwise the first column
            iterations: Number of iterations to run per representative
            **kwargs: Extra keyword arguments forwarded unchanged to
                MultiIterationSchemaInduction

        Returns:
            A dict with "success": True and the batch results, or
            "success": False plus an "error" string when any step raised.
        """
        print("🎯 Mode: Batch QA with Multi-Iteration")
        print(f"   Questions file: {questions_file}")
        print(f"   Iterations: {iterations}")
        print("=" * 60)

        try:
            # Read the question list from the CSV
            frame = pd.read_csv(questions_file)
            if 'question' in frame.columns:
                questions = frame['question'].tolist()
            else:
                questions = frame.iloc[:, 0].tolist()

            print(f"   📝 Loaded {len(questions)} questions")

            # Clustering is delegated to batch_qa_pipeline
            from batch_qa_pipeline import BatchQAPipeline

            batch_pipeline = BatchQAPipeline(
                model_url=self.model_url,
                temp_dir=self.base_dir
            )

            cluster_results = await batch_pipeline.cluster_questions(questions)
            representatives = cluster_results['representative_questions']

            print(f"   🎯 Selected {len(representatives)} representative questions")

            # One multi-iteration run per representative, each in its own
            # batch_cluster_<index> directory; results are keyed by index
            per_cluster = {}
            for idx, representative in enumerate(representatives):
                print(f"   🔄 Processing representative {idx+1}/{len(representatives)}: {representative[:50]}...")

                from utils.multi_iteration_schema_induction import MultiIterationSchemaInduction

                cluster_dir = os.path.join(self.base_dir, f"batch_cluster_{idx}")
                induction = MultiIterationSchemaInduction(
                    base_temp_dir=cluster_dir,
                    max_iterations=iterations,
                    model_url=self.model_url,
                    **kwargs
                )
                per_cluster[idx] = await induction.run_full_pipeline(representative)

            # Process remaining questions using cluster context
            batch_results = await batch_pipeline.process_remaining_questions(
                questions, cluster_results, per_cluster
            )

            print("✅ Batch QA with multi-iteration completed successfully!")
            summary = {
                "mode": "batch-qa",
                "questions_file": questions_file,
                "iterations": iterations,
                "total_questions": len(questions),
                "representative_questions": len(representatives),
                "multi_iteration_results": per_cluster,
                "batch_results": batch_results,
                "success": True
            }
            return summary

        except Exception as e:
            print(f"❌ Batch QA with multi-iteration failed: {e}")
            failure = {
                "mode": "batch-qa",
                "questions_file": questions_file,
                "iterations": iterations,
                "error": str(e),
                "success": False
            }
            return failure
    
    async def run_codebook_generation(self, questions_file: str, iterations: int = 2, **kwargs) -> Dict[str, Any]:
        """
        Mode 3: Generate codebooks using multi-iteration pipeline

        Questions are read from a CSV file, clustered by
        CodebookGenerationPipeline, and the multi-iteration pipeline is run
        once per representative question. The latest iteration's data of each
        successful run becomes that cluster's codebook.

        Args:
            questions_file: Path to CSV file with questions; the "question"
                column is used when present, otherwise the first column
            iterations: Number of iterations to run
            **kwargs: Extra keyword arguments forwarded unchanged to
                MultiIterationSchemaInduction

        Returns:
            A dict with "success": True and a "codebooks" mapping keyed by
            cluster index (representatives whose run failed are absent), or
            "success": False plus an "error" string when any step raised.
        """
        print("🎯 Mode: Codebook Generation with Multi-Iteration")
        print(f"   Questions file: {questions_file}")
        print(f"   Iterations: {iterations}")
        print("=" * 60)

        try:
            # Load questions
            questions_df = pd.read_csv(questions_file)
            if 'question' in questions_df.columns:
                questions = questions_df['question'].tolist()
            else:
                questions = questions_df.iloc[:, 0].tolist()

            print(f"   📝 Loaded {len(questions)} questions")

            # Use clustering approach from codebook_generation_pipeline
            from codebook_generation_pipeline import CodebookGenerationPipeline

            codebook_pipeline = CodebookGenerationPipeline(
                model_url=self.model_url,
                temp_dir=self.base_dir
            )

            # Cluster questions and get representatives
            cluster_results = await codebook_pipeline.cluster_questions(questions)
            representative_questions = cluster_results['representative_questions']

            print(f"   🎯 Selected {len(representative_questions)} representative questions")

            # Run multi-iteration pipeline for each representative
            codebooks = {}
            for i, rep_question in enumerate(representative_questions):
                print(f"   🔄 Processing representative {i+1}/{len(representative_questions)}: {rep_question[:50]}...")

                from utils.multi_iteration_schema_induction import MultiIterationSchemaInduction

                pipeline = MultiIterationSchemaInduction(
                    base_temp_dir=os.path.join(self.base_dir, f"codebook_cluster_{i}"),
                    max_iterations=iterations,
                    model_url=self.model_url,
                    **kwargs
                )

                result = await pipeline.run_full_pipeline(rep_question)

                if not result.get("success"):
                    # Best effort: keep going with the other clusters, but say
                    # so instead of dropping this one silently.
                    print(f"   ⚠️ Representative {i+1} failed, no codebook generated: {result.get('error')}")
                    continue

                # Extract codebook from the latest iteration
                latest_iteration = self._latest_iteration_result(result['results'], iterations)
                codebooks[i] = {
                    'question': rep_question,
                    'cluster_id': i,
                    'codebook_data': latest_iteration,
                    'multi_iteration_result': result
                }

            print("✅ Codebook generation with multi-iteration completed successfully!")
            return {
                "mode": "codebook-gen",
                "questions_file": questions_file,
                "iterations": iterations,
                "total_questions": len(questions),
                "representative_questions": len(representative_questions),
                "codebooks": codebooks,
                "success": True
            }

        except Exception as e:
            print(f"❌ Codebook generation with multi-iteration failed: {e}")
            return {
                "mode": "codebook-gen",
                "questions_file": questions_file,
                "iterations": iterations,
                "error": str(e),
                "success": False
            }

    @staticmethod
    def _latest_iteration_result(iteration_results, iterations):
        """Return the entry stored under `iterations`, else the last one recorded.

        NOTE(review): the container type of the pipeline's "results" is not
        visible in this file; both mappings and sequences are handled.
        Falling back to the last entry covers runs that recorded fewer
        entries than requested - confirm that this matches the intended
        meaning of "latest iteration".

        Raises:
            ValueError: If there are no iteration results at all.
        """
        try:
            # Original lookup: the entry at the requested iteration count
            return iteration_results[iterations]
        except (KeyError, IndexError):
            pass

        if isinstance(iteration_results, dict):
            recorded = list(iteration_results.values())
        else:
            recorded = list(iteration_results)
        if not recorded:
            raise ValueError("Multi-iteration pipeline returned no iteration results")
        return recorded[-1]
    
    async def run_qa_with_codebook(self, question: str, codebook_path: str, iterations: int = 2, **kwargs) -> Dict[str, Any]:
        """
        Mode 4: Use existing codebook for QA (still requires multi-iteration for context)

        The codebook is loaded from a JSON file and returned alongside the
        answer. The answer itself is generated from the context retrieved
        after the multi-iteration run; the codebook content is not added to
        the prompt.

        Args:
            question: The question to answer
            codebook_path: Path to an existing codebook (a JSON file)
            iterations: Number of iterations to run for context
            **kwargs: Extra keyword arguments forwarded unchanged to
                MultiIterationSchemaInduction

        Returns:
            A dict with "success": True plus the answer, the loaded codebook
            and the multi-iteration result, or "success": False plus an
            "error" string when any step raised.
        """
        print("🎯 Mode: QA with Existing Codebook + Multi-Iteration Context")
        print(f"   Question: {question}")
        print(f"   Codebook: {codebook_path}")
        print(f"   Iterations: {iterations}")
        print("=" * 60)

        try:
            # Load existing codebook
            if not os.path.exists(codebook_path):
                raise FileNotFoundError(f"Codebook not found: {codebook_path}")

            # Parse it before the expensive multi-iteration run so that an
            # unreadable or malformed codebook fails fast.
            with open(codebook_path, 'r', encoding='utf-8') as f:
                existing_codebook = json.load(f)

            # Run multi-iteration pipeline for context (even with existing codebook)
            from utils.multi_iteration_schema_induction import MultiIterationSchemaInduction

            pipeline = MultiIterationSchemaInduction(
                base_temp_dir=self.base_dir,
                max_iterations=iterations,
                model_url=self.model_url,
                **kwargs
            )

            multi_iteration_result = await pipeline.run_full_pipeline(question)

            if not multi_iteration_result.get("success"):
                raise Exception(f"Multi-iteration failed: {multi_iteration_result.get('error')}")

            from utils.data_retrieval import GraphBasedDataRetriever
            from utils.context_retrievers import DataRetrievalContextRetriever

            # The context retriever takes its paths from the data retriever
            data_retriever = GraphBasedDataRetriever(base_temp_dir=self.base_dir)
            embeddings_path = data_retriever.embeddings_path
            topological_graph_dir = data_retriever.topological_graph_dir
            context_retriever = DataRetrievalContextRetriever(embeddings_path, topological_graph_dir)

            # Retrieve the context for this question, as single-qa mode does.
            # This step was missing, which left `enhanced_context` undefined
            # and made every call fail.
            hrp_result = await context_retriever.retrieve_context(question=question)
            enhanced_context = hrp_result.get("chunks", [])

            # Convert list of chunks to string format
            if isinstance(enhanced_context, list):
                context_text = "\n\n".join([str(chunk) for chunk in enhanced_context])
            else:
                context_text = str(enhanced_context)

            # Limit context length to prevent API errors (max ~8000 characters)
            max_context_length = 8000
            if len(context_text) > max_context_length:
                print(f"   ⚠️ Context too long ({len(context_text)} chars), truncating to {max_context_length}")
                context_text = context_text[:max_context_length] + "..."

            # Generate answer using a direct server call (no client wrapper)
            answer = await generate_answer_direct(question, context_text, self.model_url)

            print("✅ QA with codebook + multi-iteration completed successfully!")
            return {
                "mode": "qa-with-codebook",
                "question": question,
                "codebook_path": codebook_path,
                "iterations": iterations,
                "multi_iteration_result": multi_iteration_result,
                "existing_codebook": existing_codebook,
                "answer": answer,
                "success": True
            }

        except Exception as e:
            print(f"❌ QA with codebook + multi-iteration failed: {e}")
            return {
                "mode": "qa-with-codebook",
                "question": question,
                "codebook_path": codebook_path,
                "iterations": iterations,
                "error": str(e),
                "success": False
            }
    
    async def run_pipeline(self, mode: str, iterations: int = 2, **kwargs) -> Dict[str, Any]:
        """
        Dispatch to the handler of the requested mode and time the run.

        Args:
            mode: One of "single-qa", "batch-qa", "codebook-gen" or
                "qa-with-codebook"
            iterations: Number of iterations to run (required for all modes)
            **kwargs: Mode-specific parameters, forwarded unchanged to the
                mode's handler

        Returns:
            The handler's result dict extended with "duration_seconds",
            "start_time" and "end_time". When the mode is unknown or one of
            its required parameters is missing or empty, a failure dict with
            "error", "duration_seconds" and "success": False instead.
        """
        print("🚀 Starting Unified Schema Induction Pipeline (Multi-Iteration Required)")
        print(f"   Mode: {mode}")
        print(f"   Iterations: {iterations}")
        print(f"   Base directory: {self.base_dir}")
        print("=" * 80)

        # mode -> (parameters that must be truthy, error if not, handler)
        mode_table = {
            "single-qa": (
                ("question",),
                "Question is required for single-qa mode",
                self.run_single_qa,
            ),
            "batch-qa": (
                ("questions_file",),
                "Questions file is required for batch-qa mode",
                self.run_batch_qa,
            ),
            "codebook-gen": (
                ("questions_file",),
                "Questions file is required for codebook-gen mode",
                self.run_codebook_generation,
            ),
            "qa-with-codebook": (
                ("question", "codebook_path"),
                "Question and codebook_path are required for qa-with-codebook mode",
                self.run_qa_with_codebook,
            ),
        }

        started = datetime.now()

        try:
            entry = mode_table.get(mode)
            if entry is None:
                raise ValueError(f"Unknown mode: {mode}")

            needed, complaint, handler = entry
            if not all(kwargs.get(key) for key in needed):
                raise ValueError(complaint)

            result = await handler(iterations=iterations, **kwargs)

            finished = datetime.now()
            elapsed = (finished - started).total_seconds()

            result["duration_seconds"] = elapsed
            result["start_time"] = started.isoformat()
            result["end_time"] = finished.isoformat()

            print(f"\n🎉 Pipeline completed in {elapsed:.2f} seconds!")
            return result

        except Exception as e:
            elapsed = (datetime.now() - started).total_seconds()

            print(f"\n❌ Pipeline failed after {elapsed:.2f} seconds: {e}")
            return {
                "mode": mode,
                "iterations": iterations,
                "error": str(e),
                "duration_seconds": elapsed,
                "success": False
            }


def main():
    """
    Command-line entry point for the unified pipeline.

    Parses the arguments, checks the options the selected mode cannot run
    without, runs that mode and prints a summary. The result is also written
    as JSON when --output is given.

    The process exits with status 1 when a required option is missing, when
    running the mode raises, or when the returned result does not report
    success.
    """
    import argparse
    import asyncio
    import json
    import sys

    parser = argparse.ArgumentParser(description="Unified Schema Induction Pipeline (Multi-Iteration Required)")
    parser.add_argument("--mode", required=True,
                        choices=["single-qa", "batch-qa", "codebook-gen", "qa-with-codebook"],
                        help="Pipeline mode to run")
    parser.add_argument("--iterations", type=int, required=True,
                        help="Number of iterations to run (required for all modes)")
    parser.add_argument("--question", help="Question to answer")
    parser.add_argument("--questions-file", help="Path to CSV file with questions")
    parser.add_argument("--codebook-path", help="Path to existing codebook")
    parser.add_argument("--custom-data-path", help="Path to data file")
    parser.add_argument("--base-dir", default=None,
                        help="Base directory for temporary files (default: main_pipeline/temp_files)")
    parser.add_argument("--model-url", help="URL for the VLLM model")
    parser.add_argument("--output", help="Output file path")

    args = parser.parse_args()

    # Options each mode cannot run without, and the message shown otherwise.
    required_options = {
        "single-qa": (
            ("question",),
            "❌ --question is required for single-qa mode",
        ),
        "batch-qa": (
            ("questions_file",),
            "❌ --questions-file is required for batch-qa mode",
        ),
        "codebook-gen": (
            ("questions_file",),
            "❌ --questions-file is required for codebook-gen mode",
        ),
        "qa-with-codebook": (
            ("question", "codebook_path"),
            "❌ --question and --codebook-path are required for qa-with-codebook mode",
        ),
    }

    # Validate before building the pipeline: its constructor creates the
    # base directory, which should not happen for an invalid invocation.
    option_names, missing_message = required_options[args.mode]
    if not all(getattr(args, name) for name in option_names):
        print(missing_message)
        sys.exit(1)

    # Create pipeline instance
    pipeline = UnifiedPipeline(
        base_dir=args.base_dir,
        model_url=args.model_url
    )

    # Run the appropriate mode
    try:
        if args.mode == "single-qa":
            job = pipeline.run_single_qa(
                args.question, args.iterations,
                custom_data_path=args.custom_data_path)
        elif args.mode == "batch-qa":
            job = pipeline.run_batch_qa(
                args.questions_file, args.iterations,
                custom_data_path=args.custom_data_path)
        elif args.mode == "codebook-gen":
            job = pipeline.run_codebook_generation(
                args.questions_file, args.iterations,
                custom_data_path=args.custom_data_path)
        else:  # "qa-with-codebook"; argparse `choices` rules out anything else
            job = pipeline.run_qa_with_codebook(
                args.question, args.codebook_path, args.iterations,
                custom_data_path=args.custom_data_path)
        result = asyncio.run(job)
    except Exception as e:
        print("❌ Pipeline failed!")
        print(f"   Mode: {args.mode}")
        print(f"   Error: {e}")
        sys.exit(1)

    # Save results if output specified (failed runs too, so the error is kept)
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            # default=str: results may hold values json cannot encode natively
            json.dump(result, f, indent=2, default=str)
        print(f"📁 Results saved to: {args.output}")

    # The mode handlers report failure through the result dict instead of
    # raising, so success must be read from it rather than assumed.
    if not result.get("success"):
        print("❌ Pipeline failed!")
        print(f"   Mode: {result.get('mode', args.mode)}")
        print(f"   Error: {result.get('error', 'Unknown error')}")
        sys.exit(1)

    print("🎉 Pipeline completed successfully!")
    print(f"   Mode: {result.get('mode', 'unknown')}")
    if 'answer' in result:
        print(f"   Answer: {result['answer']}")

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
