import json
import random
import numpy as np
from datetime import datetime
from typing import List, Dict, Any, Optional
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI


class RoleMatcher:
                

    def __init__(self, role_data_file: str,
                 openai_client: Optional[OpenAI] = None,
                 use_semantic_matching: bool = True,
                 role_index_dir: Optional[str] = None):
           
        self.role_data_file = Path(role_data_file)
        self.roles_data = []
        self.openai_client = openai_client
        self.use_semantic_matching = use_semantic_matching

                    
        if role_index_dir:
            self.role_index_dir = Path(role_index_dir)
        else:
                                             
            self.role_index_dir = self.role_data_file.parent / "role_index"

                  
        self.role_index_dir.mkdir(parents=True, exist_ok=True)

                  
        self.role_embeddings = []
        self.template_cache = {}             
        self.role_embeddings_file = self.role_index_dir / "role_embeddings.npy"
        self.role_metadata_file = self.role_index_dir / "role_metadata.json"

        self._load_roles_data()
        self._build_topic_index()

                          
        if self.use_semantic_matching and self.openai_client:
            self._precompute_role_embeddings()

    def _load_roles_data(self):
                    
        if not self.role_data_file.exists():
            raise FileNotFoundError(f"Role data file does not exist: {self.role_data_file}")

        try:
            with open(self.role_data_file, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        role_data = json.loads(line)
                        self.roles_data.append(role_data)

        except Exception as e:
            raise

    def _build_topic_index(self):
                           
        self.topic_to_roles = {}

        for role in self.roles_data:
            source_topic = role.get('source_topic', '')
            if source_topic:
                              
                topics = [topic.strip() for topic in source_topic.split('|')]
                for topic in topics:
                    if topic not in self.topic_to_roles:
                        self.topic_to_roles[topic] = []
                    self.topic_to_roles[topic].append(role)

    def find_matching_role(self, template_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
           
        if not self.roles_data:
            return None

                        
        if self.use_semantic_matching and len(self.role_embeddings) > 0:
            return self._semantic_matching(template_data)
        else:
                      
            return self._keyword_matching(template_data)

    def _semantic_matching(self, template_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
           
        try:
                       
            template_text = self._build_template_text_representation(template_data)
            template_id = template_data.get('id', str(hash(template_text)))

                  
            if template_id in self.template_cache:
                template_embedding = self.template_cache[template_id]
            else:
                           
                template_embedding = self._get_embedding(template_text)
                self.template_cache[template_id] = template_embedding

                         
            similarities = cosine_similarity(
                template_embedding.reshape(1, -1),
                self.role_embeddings
            ).flatten()

                      
            best_role_idx = np.argmax(similarities)
            best_similarity = similarities[best_role_idx]

                                
            if best_similarity < 0.3:          
                return self._keyword_matching(template_data)

            best_role = self.roles_data[best_role_idx]

                         
            best_role_copy = best_role.copy()
            best_role_copy['semantic_similarity'] = float(best_similarity)
            best_role_copy['matching_method'] = 'semantic'

            return best_role_copy

        except Exception as e:
            return self._keyword_matching(template_data)

    def _keyword_matching(self, template_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
           
        question_type = template_data.get('question_type', '')
        template_topic = template_data.get('topic', '')

                
        candidate_roles = self._get_candidate_roles(question_type, template_topic)

        if not candidate_roles:
                                
            selected_role = random.choice(self.roles_data) if self.roles_data else None
        else:
                           
            selected_role = max(candidate_roles, key=lambda x: x.get('confidence_score', 0.0))

        if selected_role:
                    
            selected_role_copy = selected_role.copy()
            selected_role_copy['matching_method'] = 'keyword'
            return selected_role_copy

        return None

    def _get_candidate_roles(self, question_type: str, template_topic: str) -> List[Dict[str, Any]]:
           
        candidate_roles = []

                           
        if template_topic:
            topics = [topic.strip() for topic in template_topic.split('|')]
            for topic in topics:
                if topic in self.topic_to_roles:
                    candidate_roles.extend(self.topic_to_roles[topic])

                                      
        if len(candidate_roles) < 3:
            additional_roles = self._fuzzy_match_by_question_type(question_type)
            candidate_roles.extend(additional_roles)

            
        seen_ids = set()
        unique_candidates = []
        for role in candidate_roles:
            role_id = role.get('source_question_id', '')
            if role_id not in seen_ids:
                seen_ids.add(role_id)
                unique_candidates.append(role)

        return unique_candidates

    def _precompute_role_embeddings(self, batch_size: int = 50):
                            

        try:
                        
            if self._load_role_embeddings():
                return


                       
            role_texts = []
            for role in self.roles_data:
                role_text = self._build_role_text_representation(role)
                role_texts.append(role_text)

                      
            embeddings = []
            for i in range(0, len(role_texts), batch_size):
                batch_texts = role_texts[i:i + batch_size]
                batch_embeddings = self._get_embeddings_batch(batch_texts)
                embeddings.extend(batch_embeddings)

                      
                processed = min(i + batch_size, len(role_texts))

                        
            self.role_embeddings = np.array(embeddings)

                      
            self._save_role_embeddings()


        except Exception as e:
            self.use_semantic_matching = False
            self.role_embeddings = []

    def _get_embeddings_batch(self, texts: List[str]) -> List[np.ndarray]:
                      
        try:
            response = self.openai_client.embeddings.create(
                model="text-embedding-3-small",
                input=texts
            )
            return [np.array(item.embedding) for item in response.data]
        except Exception as e:
                            
            return [self._get_embedding(text) for text in texts]

    def _load_role_embeddings(self) -> bool:
           
        try:
                        
            if not self.role_embeddings_file.exists() or not self.role_metadata_file.exists():
                return False

                   
            with open(self.role_metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)

                             
            if not self._validate_metadata(metadata):
                return False

                    
            self.role_embeddings = np.load(self.role_embeddings_file)

            return True

        except Exception as e:
            return False

    def _save_role_embeddings(self):
                            
        try:
                    
            np.save(self.role_embeddings_file, self.role_embeddings)

                      
            metadata = self._generate_metadata()
            with open(self.role_metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)


        except Exception as e:
            print(f"save role embeddings failed: {e}")

    def _validate_metadata(self, metadata: Dict[str, Any]) -> bool:
           
        try:
                    
            if metadata.get('total_roles') != len(self.roles_data):
                return False

                           
            current_mtime = self.role_data_file.stat().st_mtime
            cached_mtime = metadata.get('role_file_mtime')
            if abs(current_mtime - cached_mtime) > 1:           
                return False

                                       
            if len(self.roles_data) >= 10:
                current_sample_ids = [role.get('source_question_id', '') for role in self.roles_data[:10]]
                cached_sample_ids = metadata.get('sample_role_ids', [])
                if current_sample_ids != cached_sample_ids:
                    return False

            return True

        except Exception:
            return False

    def _generate_metadata(self) -> Dict[str, Any]:
           
        metadata = {
            'total_roles': len(self.roles_data),
            'role_file_path': str(self.role_data_file),
            'role_file_mtime': self.role_data_file.stat().st_mtime,
            'embedding_model': 'text-embedding-3-small',
            'embedding_dimension': self.role_embeddings.shape[1] if len(self.role_embeddings) > 0 else 0,
            'created_time': datetime.now().isoformat(),
            'sample_role_ids': [role.get('source_question_id', '') for role in self.roles_data[:10]] if len(self.roles_data) >= 10 else []
        }
        return metadata

    def clear_cache(self):
                      
        try:
            if self.role_embeddings_file.exists():
                self.role_embeddings_file.unlink()

            if self.role_metadata_file.exists():
                self.role_metadata_file.unlink()

                      
            self.role_embeddings = []
            self.template_cache = {}


        except Exception as e:
            print(f"clear cache failed: {e}")

    def rebuild_cache(self):
                        

                
        self.clear_cache()

                         
        if self.use_semantic_matching and self.openai_client:
            self._precompute_role_embeddings()

    def _build_role_text_representation(self, role_data: Dict[str, Any]) -> str:
                                
        components = []

                
        player_desc = role_data.get('player_description', '')
        if player_desc:
            components.append(f"Player characteristics: {player_desc}")

                
        source_content = role_data.get('source_content', '')
        if source_content:
            components.append(f"Original question: {source_content}")

                
        source_topic = role_data.get('source_topic', '')
        if source_topic:
            components.append(f"Related topic: {source_topic}")

        return " ".join(components)

    def _build_template_text_representation(self, template_data: Dict[str, Any]) -> str:
                         
        components = []

                
        template = template_data.get('template', '')
        if template:
            components.append(f"Question template: {template}")

                
        question_type = template_data.get('question_type', '')
        if question_type:
            components.append(f"Question type: {question_type}")

              
        description = template_data.get('description', '')
        if description:
            components.append(f"Description: {description}")

                
        topic = template_data.get('topic', '')
        if topic:
            components.append(f"Topic: {topic}")

        return " ".join(components)

    def _get_embedding(self, text: str) -> np.ndarray:
                       
        try:
            response = self.openai_client.embeddings.create(
                model="text-embedding-3-small",
                input=text
            )
            return np.array(response.data[0].embedding)
        except Exception as e:
                              
            return np.random.rand(1536)                             

    def _fuzzy_match_by_question_type(self, question_type: str) -> List[Dict[str, Any]]:
           
                        
        type_to_topics = {
            'PREORDER_REWARDS': ['Monetization', 'SKU'],
            'REVIEW_QUESTIONS': ['Unlabeled reviews'],
            'TEAM_COOPERATION': ['Friends', 'Co-Op'],
            'HARDWARE_COMPATIBILITY': ['Frame/FPS', 'Graphics Quality', 'Performance Issue'],
            'GAME_CONTENT': ['Game Content', 'Gameplay', 'Quest'],
            'TECHNICAL_ISSUES': ['Bug', 'Crash', 'Server Disconnection'],
            'PURCHASE_ADVICE': ['Value For Money', 'SKU', 'Refund'],
            'GAMEPLAY_EXPERIENCE': ['Gameplay', 'Combat', 'Learning Curve'],
        }

        related_topics = type_to_topics.get(question_type, [])
        candidate_roles = []

        for topic in related_topics:
            if topic in self.topic_to_roles:
                candidate_roles.extend(self.topic_to_roles[topic])

        return candidate_roles

    def get_role_context(self, role_data: Dict[str, Any]) -> str:
           
        if not role_data:
            return ""

        player_description = role_data.get('player_description', '')

        context = f"""
            Player Role Background:
            {player_description}

            Please ask questions and think as this player with their identity and language style.

            **Important Requirements**: When asking questions as this character, you must provide specific details:
            - For hardware issues, specify the exact hardware model
            - For game content, use accurate game terms and names
            - For numerical values, provide specific numbers or ranges
            - **Version-related Note**: Do not directly mention version numbers in questions, use natural expressions (like "recently", "now", etc.), version information is reflected through the question timing
            - Avoid vague expressions and ensure questions have enough information for accurate answers
        """
        return context.strip()

    def get_statistics(self) -> Dict[str, Any]: 
           
        total_roles = len(self.roles_data)
        topic_distribution = {}

        for topic, roles in self.topic_to_roles.items():
            topic_distribution[topic] = len(roles)

               
        sorted_topics = sorted(topic_distribution.items(), key=lambda x: x[1], reverse=True)

        return {
            'total_roles': total_roles,
            'total_topics': len(self.topic_to_roles),
            'topic_distribution': dict(sorted_topics[:10]),           
            'avg_confidence': sum(role.get('confidence_score', 0.0) for role in self.roles_data) / total_roles if total_roles > 0 else 0.0,
            'semantic_matching_enabled': self.use_semantic_matching,
            'embeddings_computed': len(self.role_embeddings) > 0,
            'template_cache_size': len(self.template_cache),
            'role_index_dir': str(self.role_index_dir),
            'embeddings_file_exists': self.role_embeddings_file.exists(),
            'metadata_file_exists': self.role_metadata_file.exists()
        }

    def get_top_similar_roles(self, template_data: Dict[str, Any], top_k: int = 5) -> List[Dict[str, Any]]:
           
        if not self.use_semantic_matching or len(self.role_embeddings) == 0:
            return self._get_candidate_roles(
                template_data.get('question_type', ''),
                template_data.get('topic', '')
            )[:top_k]

        try:
                              
            template_text = self._build_template_text_representation(template_data)
            template_embedding = self._get_embedding(template_text)

                   
            similarities = cosine_similarity(
                template_embedding.reshape(1, -1),
                self.role_embeddings
            ).flatten()

                         
            top_indices = np.argsort(similarities)[::-1][:top_k]

            result = []
            for idx in top_indices:
                role = self.roles_data[idx].copy()
                role['semantic_similarity'] = float(similarities[idx])
                role['rank'] = len(result) + 1
                result.append(role)

            return result

        except Exception as e:
            return []
