import asyncio
import numpy as np
import pandas as pd
import json
from typing import Dict, List, Tuple
import sys
import os

class ParsimonyEvaluator:
    """Evaluates parsimony of codes using embeddings from reusability results.

    Parsimony is ``1 - cos`` where ``cos`` is the mean cosine similarity over
    all unordered pairs of usable embeddings. An embedding is "usable" when it
    is neither ``None`` nor empty.
    """

    def __init__(self):
        pass

    @staticmethod
    def _has_values(vec) -> bool:
        """Return True when *vec* is a non-empty sequence.

        Uses ``len`` rather than truthiness so that numpy arrays (whose
        truth value is ambiguous) are accepted as well as plain lists.
        """
        return vec is not None and len(vec) > 0

    def cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
        """Calculate cosine similarity between two vectors.

        Returns 0.0 when either vector is missing/empty or has zero norm,
        since the similarity is undefined in those cases.
        """
        if not self._has_values(vec1) or not self._has_values(vec2):
            return 0.0

        arr1 = np.asarray(vec1)
        arr2 = np.asarray(vec2)

        norm1 = np.linalg.norm(arr1)
        norm2 = np.linalg.norm(arr2)
        if norm1 == 0 or norm2 == 0:
            return 0.0

        # Convert to a builtin float so callers get a plain Python number.
        return float(np.dot(arr1, arr2) / (norm1 * norm2))

    def calculate_parsimony_from_embeddings(self, embeddings: List[List[float]], codes: List[str]) -> Dict:
        """Calculate parsimony metric for a list of code embeddings.

        Formula: parsimony = 1 - cos
        where cos = 2 / (m(m-1)) * sum over pairs i < j of cos(node_i, node_j)
        and m is the number of usable (non-empty) embeddings, i.e. cos is the
        mean similarity over the pairs that could actually be measured.

        Args:
            embeddings: One embedding per code; ``None`` or empty entries are
                skipped.
            codes: Labels aligned with ``embeddings`` (used for progress output).

        Returns:
            A dict with the keys ``parsimony_score``,
            ``average_cosine_similarity``, ``total_pairs``, ``valid_pairs``,
            ``valid_embeddings`` and ``cos_value``. With fewer than two
            embeddings the score is 1.0; when no pair is measurable the score
            is 0.0.
        """
        n = len(embeddings)
        # Count with the same criterion that decides whether a pair is used.
        valid_embeddings = sum(1 for e in embeddings if self._has_values(e))

        if n < 2:
            return {
                'parsimony_score': 1.0,
                'average_cosine_similarity': 0.0,
                'total_pairs': 0,
                'valid_pairs': 0,
                'valid_embeddings': valid_embeddings,
                'cos_value': 0.0,
            }

        print(f"🔄 Calculating parsimony for {n} code embeddings...")

        total_pairs = n * (n - 1) // 2
        cosine_sum = 0.0
        valid_pairs = 0

        print(f"🔄 Calculating cosine similarities for {total_pairs} pairs...")

        for i in range(n):
            for j in range(i + 1, n):
                if self._has_values(embeddings[i]) and self._has_values(embeddings[j]):
                    similarity = self.cosine_similarity(embeddings[i], embeddings[j])
                    cosine_sum += similarity
                    valid_pairs += 1
                    print(f"  📊 Pair {valid_pairs}/{total_pairs}: {codes[i][:30]}... vs {codes[j][:30]}... = {similarity:.4f}")
                else:
                    print("  ⚠️ Skipping pair due to missing embeddings")

        if valid_pairs == 0:
            return {
                'parsimony_score': 0.0,
                'average_cosine_similarity': 0.0,
                'total_pairs': total_pairs,
                'valid_pairs': 0,
                'valid_embeddings': valid_embeddings,
                'cos_value': 0.0,
            }

        # Normalise by the pairs actually measured. Dividing by n(n-1)/2 would
        # treat every skipped pair as similarity 0 and inflate the score
        # whenever some embeddings are missing.
        average_cosine = cosine_sum / valid_pairs
        cos = average_cosine
        parsimony_score = 1 - cos

        return {
            'parsimony_score': parsimony_score,
            'average_cosine_similarity': average_cosine,
            'total_pairs': total_pairs,
            'valid_pairs': valid_pairs,
            'valid_embeddings': valid_embeddings,
            'cos_value': cos,
        }

def extract_embeddings_from_results(results_file: str) -> Tuple[List[str], List[List[float]]]:
    """Extract codes and their embeddings from an embeddings results file.

    The file must be JSON whose top-level object has a ``code_embeddings``
    mapping of code label -> embedding vector.

    Args:
        results_file: Path to the JSON file.

    Returns:
        ``(codes, embeddings)`` as two parallel lists in the mapping's order,
        or ``([], [])`` when the file cannot be read or parsed, or does not
        contain a non-empty ``code_embeddings`` mapping. Problems are reported
        on stdout rather than raised.
    """
    # Keep the try body to the I/O + parse step so structural problems below
    # are reported with a specific message instead of a generic exception.
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(results_file, 'r', encoding='utf-8') as f:
            results = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"❌ Error extracting embeddings: {e}")
        return [], []

    if not isinstance(results, dict):
        print(f"❌ Error extracting embeddings: expected a JSON object, got {type(results).__name__}")
        return [], []

    code_embeddings = results.get('code_embeddings', {})

    if not code_embeddings:
        print("❌ No code embeddings found in results")
        return [], []

    if not isinstance(code_embeddings, dict):
        print(f"❌ Error extracting embeddings: 'code_embeddings' must be an object, got {type(code_embeddings).__name__}")
        return [], []

    print(f"📊 Found {len(code_embeddings)} code embeddings")

    # Keys and values of the same dict iterate in matching order.
    codes = list(code_embeddings)
    embeddings = list(code_embeddings.values())

    return codes, embeddings

async def evaluate_parsimony_from_results(results_file: str) -> Dict:
    """Compute parsimony metrics for the embeddings stored in *results_file*.

    Loads codes and embeddings via ``extract_embeddings_from_results`` and
    hands them to ``ParsimonyEvaluator.calculate_parsimony_from_embeddings``.
    Returns an empty dict when either list comes back empty.
    """
    labels, vectors = extract_embeddings_from_results(results_file)

    # Only the "codes present, embeddings absent" case gets a message.
    if labels and not vectors:
        print("❌ No embeddings found in results")

    if not (labels and vectors):
        return {}

    return ParsimonyEvaluator().calculate_parsimony_from_embeddings(vectors, labels)

if __name__ == "__main__":
    # Command-line entry point: evaluate parsimony for one results file, print
    # a summary and save the metrics to parsimony_results.json.
    import argparse

    parser = argparse.ArgumentParser(description='Evaluate parsimony of codes')
    parser.add_argument('--results_file', type=str, required=True, help='Path to comprehensive evaluation results JSON file')

    args = parser.parse_args()

    def _format_metric(value) -> str:
        """Render a numeric metric with 4 decimals; pass anything else through as text."""
        # A missing key yields the 'N/A' placeholder, which cannot take the
        # ':.4f' format spec, so fall back to plain str() instead of crashing.
        try:
            return f"{value:.4f}"
        except (TypeError, ValueError):
            return str(value)

    async def main():
        """Run the evaluation and report/save the results."""
        results = await evaluate_parsimony_from_results(args.results_file)
        if results:
            print("PARSIMONY EVALUATION RESULTS")
            print("=" * 50)
            print(f"Parsimony Score: {_format_metric(results.get('parsimony_score', 'N/A'))}")
            print(f"Average Cosine Similarity: {_format_metric(results.get('average_cosine_similarity', 'N/A'))}")
            print(f"Total Pairs: {results.get('total_pairs', 'N/A')}")
            print(f"Valid Pairs: {results.get('valid_pairs', 'N/A')}")
            print(f"Valid Embeddings: {results.get('valid_embeddings', 'N/A')}")
            print(f"Cos Value: {_format_metric(results.get('cos_value', 'N/A'))}")

            # Save results to JSON file
            output_file = "parsimony_results.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2)
            print(f"\n✅ Results saved to: {output_file}")
        else:
            print("❌ No parsimony results available")

    asyncio.run(main())
