#!/usr/bin/env python3
"""
Simple script to run the codebook generation pipeline

This script runs the codebook generation pipeline with default settings
following the batch_qa pattern: clusters questions into groups, picks representatives,
and runs code generation only on representatives.
"""

import os
import sys
import asyncio
from datetime import datetime
from dotenv import load_dotenv

# Add this script's own directory (not the working directory) to the import path
sys.path.append(os.path.dirname(__file__))

from codebook_generation_pipeline import CodebookGenerationPipeline

async def main():
    """Run the codebook generation pipeline with this script's default settings.

    Loads environment variables (via ``load_dotenv``), requires
    ``DEFAULT_EMBEDDING_MODEL`` to be set, builds a
    ``CodebookGenerationPipeline``, awaits ``run_codebook_pipeline()`` and
    prints a summary of the returned result mapping.

    Exits the process with status 1 when ``DEFAULT_EMBEDDING_MODEL`` is unset
    or when the result contains an ``"error"`` key. Any other outcome,
    including a result with neither ``"error"`` nor a truthy ``"success"``,
    returns normally after printing.
    """
    print("🚀 Starting Codebook Generation Pipeline (Batch QA Pattern)")
    print("=" * 60)

    # Load environment variables from .env if present
    load_dotenv()

    embedding_model = os.getenv("DEFAULT_EMBEDDING_MODEL")
    if not embedding_model:
        print("❌ DEFAULT_EMBEDDING_MODEL is not set. Please export it or add it to .env before running.")
        sys.exit(1)
    print(f"🤖 DEFAULT_EMBEDDING_MODEL: {embedding_model}")
    print(f"🔗 VLLM_EMBEDDING_URL: {os.getenv('VLLM_EMBEDDING_URL', '(unset)')}")
    print(f"🔗 VLLM_QWEN_32B_URL: {os.getenv('VLLM_QWEN_32B_URL', '(unset)')}")

    # Single source of truth for the settings: the same mapping feeds both the
    # constructor and the configuration banner, so the two cannot drift apart.
    #
    # NOTE(review): the banner previously hard-coded "cluster_size=1000 (all
    # questions in one cluster)" and the comments described a "large cluster
    # size", while the value actually passed was 10. The runtime value (10) is
    # preserved here; confirm which one was intended.
    pipeline_settings = {
        "chunk_size": 256,
        "overlap": 50,
        "cluster_size": 10,
        "strategy": "llm_enhanced",
        "model": embedding_model,  # strictly from env
        "temp_dir": "temp_files",
        "output_file": "temp_files/generated_codebooks.json",
    }
    pipeline = CodebookGenerationPipeline(**pipeline_settings)

    # Only settings this script actually passes are reported. The old banner
    # also listed retries/batch_size/concurrency, which are never set here.
    print(
        "⚙️  Configuration: "
        f"chunk_size={pipeline_settings['chunk_size']}, "
        f"overlap={pipeline_settings['overlap']}, "
        f"cluster_size={pipeline_settings['cluster_size']}, "
        f"strategy={pipeline_settings['strategy']}"
    )

    # Run the pipeline
    result = await pipeline.run_codebook_pipeline()

    # An "error" key takes precedence over any "success" flag.
    if "error" in result:
        print(f"\n❌ Pipeline failed: {result['error']}")
        sys.exit(1)
    elif result.get("success"):
        print("\n✅ Pipeline completed successfully!")
        print(f"   📊 Total clusters: {result['total_clusters']}")
        print(f"   ✅ Successful clusters: {result['successful_clusters']}")
        print(f"   ❌ Failed clusters: {result['failed_clusters']}")
        print(f"   📝 Total codebooks: {result['total_codebooks']}")
        print(f"   ❓ Total questions: {result['total_questions']}")
        print(f"   📁 Output file: {result['output_file']}")

        if result['failed_clusters'] > 0:
            print(f"   ⚠️  Note: {result['failed_clusters']} clusters have empty codebooks due to processing errors")
    else:
        # Neither an error nor an explicit success: dump the raw result.
        print("\n⚠️  Pipeline completed with unknown status")
        print(f"   Result: {result}")

if __name__ == "__main__":
    # Script entry point: create the coroutine, then let asyncio drive it to
    # completion.
    entry_coroutine = main()
    asyncio.run(entry_coroutine)