#!/usr/bin/env python3
"""
Generate embeddings for codes from reusability results

This script extracts unique codes from reusability results and generates embeddings for them.
"""

import asyncio
import json
import os
import sys
import aiohttp
from typing import Dict, List, Any
import argparse
from dotenv import load_dotenv

# Load environment variables (e.g. VLLM_EMBEDDING_URL, DEFAULT_EMBEDDING_MODEL,
# which EmbeddingGenerator reads via os.getenv) from the main pipeline's .env file.
# NOTE(review): the path is relative, so it presumably resolves against the
# current working directory rather than this file's location — confirm the
# script is always launched from its own directory.
load_dotenv("../../main_pipeline/.env")

class EmbeddingGenerator:
    """Async client that requests embeddings from an HTTP embedding service.

    The service base URL and model name are read from the
    ``VLLM_EMBEDDING_URL`` and ``DEFAULT_EMBEDDING_MODEL`` environment
    variables, falling back to ``http://localhost:8001`` and
    ``qwen3-embed-0.6b``.

    The HTTP session is only created on entering the async context manager,
    so the intended usage is::

        async with EmbeddingGenerator() as generator:
            response = await generator.generate_embeddings(["some text"])
    """

    def __init__(self):
        self.embed_url = os.getenv("VLLM_EMBEDDING_URL", "http://localhost:8001")
        self.embed_model = os.getenv("DEFAULT_EMBEDDING_MODEL", "qwen3-embed-0.6b")
        # Created in __aenter__ and released in __aexit__.
        self.session = None

    async def __aenter__(self):
        """Open the HTTP session and return this generator."""
        self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the HTTP session, if one was opened."""
        if self.session:
            await self.session.close()
            # Drop the closed session so later misuse hits the explicit
            # "no session" error below instead of a half-dead object.
            self.session = None

    async def generate_embeddings(self, texts: List[str]) -> Dict[str, Any]:
        """Request embeddings for ``texts`` from ``{embed_url}/v1/embeddings``.

        Args:
            texts: Strings to embed. An empty list short-circuits without any
                network traffic.

        Returns:
            The decoded JSON body of the response. For empty input, a stub of
            the form ``{"data": [], "usage": {"total_tokens": 0}}``. The HTTP
            status is not inspected here, so an error payload returned by the
            service is handed back to the caller as-is.

        Raises:
            RuntimeError: If called with non-empty input before the context
                manager has been entered (no session is available).
        """
        if not texts:
            return {"data": [], "usage": {"total_tokens": 0}}

        if self.session is None:
            raise RuntimeError(
                "EmbeddingGenerator has no open HTTP session; "
                "use it inside 'async with EmbeddingGenerator() as generator:'"
            )

        payload = {
            'model': self.embed_model,
            'input': texts,
        }

        # Tolerate a trailing slash in the configured base URL so the request
        # path never becomes "//v1/embeddings".
        endpoint = f"{self.embed_url.rstrip('/')}/v1/embeddings"

        async with self.session.post(endpoint, json=payload) as response:
            return await response.json()

async def generate_embeddings_for_codes(reusability_results_file: str, output_file: str):
    """Embed every entry of ``unique_codes`` and save the result as JSON.

    The input JSON is expected to hold the codes under the top-level
    ``unique_codes`` key. Codes are sent to the embedding service in batches
    of 100. A batch that fails (exception or a response without ``data``) is
    reported and skipped, so the output may cover only part of the codes;
    ``total_codes`` versus ``total_embeddings`` in the output shows the gap.

    Args:
        reusability_results_file: Path to the reusability results JSON file.
        output_file: Path of the JSON file to write. It receives the keys
            ``code_embeddings`` (code -> embedding), ``total_codes``,
            ``total_embeddings`` and ``embedding_model``.

    Returns:
        None. Nothing is written when the input contains no unique codes.
    """
    print("🔄 Generating embeddings for codes...")

    # Read as UTF-8 explicitly: codes may contain non-ASCII text and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(reusability_results_file, 'r', encoding='utf-8') as f:
        results = json.load(f)

    unique_codes = results.get('unique_codes', [])

    if not unique_codes:
        print("❌ No unique codes found in reusability results")
        return

    print(f"📊 Found {len(unique_codes)} unique codes to embed")

    # Process in batches to avoid overwhelming the API
    batch_size = 100
    total_batches = (len(unique_codes) + batch_size - 1) // batch_size
    all_embeddings = {}

    async with EmbeddingGenerator() as generator:
        embedding_model = generator.embed_model

        for batch_number, start in enumerate(range(0, len(unique_codes), batch_size), start=1):
            batch = unique_codes[start:start + batch_size]
            print(f"🔄 Processing batch {batch_number}/{total_batches}")

            # Best-effort: any failure inside one batch is reported and the
            # remaining batches are still attempted.
            try:
                response = await generator.generate_embeddings(batch)

                if not isinstance(response, dict) or 'data' not in response:
                    print(f"  ❌ Error in batch response: {response}")
                    continue

                for position, embedding_data in enumerate(response['data']):
                    # Prefer the item's own "index" field (position of the
                    # input it belongs to) over arrival order, so a reordered
                    # response cannot attach an embedding to the wrong code.
                    index = embedding_data.get('index', position)
                    if not isinstance(index, int) or not 0 <= index < len(batch):
                        print(f"  ❌ Skipping embedding with unexpected index: {index}")
                        continue

                    code = batch[index]
                    all_embeddings[code] = embedding_data.get('embedding', [])
                    print(f"  ✅ Generated embedding for: {code[:50]}...")

            except Exception as e:
                print(f"  ❌ Error processing batch: {e}")
                continue

    # Save embeddings
    embeddings_data = {
        "code_embeddings": all_embeddings,
        "total_codes": len(unique_codes),
        "total_embeddings": len(all_embeddings),
        "embedding_model": embedding_model,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(embeddings_data, f, indent=2)

    print(f"✅ Generated embeddings for {len(all_embeddings)} codes")
    print(f"📁 Saved to: {output_file}")

if __name__ == "__main__":
    # Command-line entry point: both paths are mandatory; the embedding
    # coroutine is then driven to completion on a fresh event loop.
    cli = argparse.ArgumentParser(description='Generate embeddings for codes')
    cli.add_argument(
        '--reusability_results',
        type=str,
        required=True,
        help='Path to reusability results JSON file',
    )
    cli.add_argument(
        '--output',
        type=str,
        required=True,
        help='Path to save embeddings JSON file',
    )

    options = cli.parse_args()

    job = generate_embeddings_for_codes(options.reusability_results, options.output)
    asyncio.run(job)
