import json
from pathlib import Path
from typing import List, Dict, Any
from llama_index.core import (
    VectorStoreIndex,
    Document,
    StorageContext,
    load_index_from_storage
)
from llama_index.core.retrievers import VectorIndexRetriever


class IndexBuilder:
    """Build, persist, load and query per-segment vector indexes.

    The corpus for a segment is the concatenation of
    ``<corpus_path>/segment_timeless/corpus.jsonl`` and
    ``<corpus_path>/segment_<id>/corpus.jsonl`` (either file may be absent).
    Indexes are persisted under ``<output_dir>/segment_<id>_index``.

    After a successful :meth:`build_segment_index` call, ``self.index`` and
    ``self.retriever`` refer to that segment; after a failed call both are
    ``None``.
    """

    def __init__(self, corpus_path: Path, output_dir: Path, similarity_top_k: int = 3):
        """Store configuration; no I/O happens here.

        Args:
            corpus_path: Directory containing the ``segment_*`` corpus folders.
            output_dir: Directory under which segment indexes are persisted.
            similarity_top_k: Number of results the retriever returns per query.
        """
        self.corpus_path = Path(corpus_path)
        self.output_dir = Path(output_dir)
        self.similarity_top_k = similarity_top_k
        self.index = None
        self.retriever = None

    # ------------------------------------------------------------------ helpers

    def _index_dir(self, segment_id: int) -> Path:
        """Return the persistence directory for ``segment_id``."""
        return self.output_dir / f"segment_{segment_id}_index"

    @staticmethod
    def _remove_tree(path: Path) -> None:
        """Recursively delete ``path`` if it exists."""
        if path.exists():
            import shutil
            shutil.rmtree(path)

    def _make_retriever(self, index):
        """Create a retriever over ``index`` using the configured top-k."""
        return VectorIndexRetriever(
            index=index,
            similarity_top_k=self.similarity_top_k
        )

    @staticmethod
    def _parse_record(line: str, source: str, line_num: int):
        """Parse one non-empty JSONL line into ``(doc_id, text, metadata)``.

        Returns ``None`` (after printing a warning) when the line is not valid
        JSON or does not decode to a JSON object, so one bad line cannot abort
        the whole corpus load.

        The metadata starts with ``id`` and ``source`` and is then updated
        with the record's own ``metadata`` mapping (whose keys therefore take
        precedence). A list-valued ``entities`` entry is replaced by
        ``entity_texts``: the non-empty ``text`` values of its dict items.
        """
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"skip {source} line {line_num}: invalid JSON ({e})")
            return None
        if not isinstance(data, dict):
            print(f"skip {source} line {line_num}: expected a JSON object, "
                  f"got {type(data).__name__}")
            return None

        doc_id = data.get('id', f'{source}_doc_{line_num}')
        metadata = {
            'id': doc_id,
            'source': source,
        }

        original_metadata = data.get('metadata')
        # A missing, null or otherwise non-mapping metadata field is treated as empty.
        if isinstance(original_metadata, dict):
            for key, value in original_metadata.items():
                if key == 'entities' and isinstance(value, list):
                    metadata['entity_texts'] = [
                        entity['text']
                        for entity in value
                        if isinstance(entity, dict) and entity.get('text')
                    ]
                else:
                    metadata[key] = value

        text = data.get('contents', data.get('text', ''))
        return doc_id, text, metadata

    # --------------------------------------------------------------- corpus I/O

    def load_corpus_for_segment(self, segment_id: int) -> List[Document]:
        """Load the timeless corpus followed by the corpus of ``segment_id``.

        Missing corpus files are skipped; the result may be empty.
        """
        documents = []
        sources = ("timeless", f"segment_{segment_id}")
        for source in sources:
            # The timeless folder is named "segment_timeless" but tagged "timeless".
            folder = "segment_timeless" if source == "timeless" else source
            corpus_file = self.corpus_path / folder / "corpus.jsonl"
            if corpus_file.exists():
                documents.extend(self._load_corpus_from_file(corpus_file, source))
        return documents

    def _load_corpus_from_file(self, file_path: Path, source: str) -> List[Document]:
        """Read a JSONL corpus file into ``Document`` objects.

        Blank lines are ignored; lines that cannot be parsed into a JSON
        object are reported and skipped. Each document's ``id_`` is the
        record's ``id`` or ``<source>_doc_<line number>`` when absent.
        """
        documents = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                stripped = line.strip()
                if not stripped:
                    continue
                record = self._parse_record(stripped, source, line_num)
                if record is None:
                    continue
                doc_id, text, metadata = record
                doc = Document(text=text, metadata=metadata)
                doc.id_ = doc_id
                documents.append(doc)
        return documents

    # ------------------------------------------------------------ index build

    def build_segment_index(self, segment_id: int, force_rebuild: bool = False) -> bool:
        """Load the persisted index for ``segment_id`` or build it from the corpus.

        Args:
            segment_id: Segment whose index should become active.
            force_rebuild: When true, delete any persisted index and rebuild.

        Returns:
            ``True`` when an index and retriever are ready, ``False`` when the
            corpus is empty or building/persisting failed.
        """
        segment_index_dir = self._index_dir(segment_id)

        # Clear state from any earlier segment first, so a failure below can
        # never leave a retriever for a different segment answering queries.
        self.index = None
        self.retriever = None

        if segment_index_dir.exists():
            if force_rebuild:
                self._remove_tree(segment_index_dir)
            else:
                try:
                    storage_context = StorageContext.from_defaults(
                        persist_dir=str(segment_index_dir))
                    index = load_index_from_storage(storage_context)
                    retriever = self._make_retriever(index)
                except Exception as e:
                    # Fall through and rebuild from the corpus.
                    print(f"load segment {segment_id} index failed: {e}, will rebuild index")
                else:
                    self.index = index
                    self.retriever = retriever
                    return True

        documents = self.load_corpus_for_segment(segment_id)
        if not documents:
            return False

        try:
            index = VectorStoreIndex.from_documents(
                documents,
                show_progress=True
            )
            index.storage_context.persist(persist_dir=str(segment_index_dir))
            retriever = self._make_retriever(index)
        except Exception as e:
            print(f"build segment {segment_id} index failed: {e}")
            return False

        # Commit both attributes together, only once everything succeeded.
        self.index = index
        self.retriever = retriever
        return True

    # -------------------------------------------------------------- retrieval

    def retrieve_documents(self, query: str) -> List[Dict[str, Any]]:
        """Retrieve documents for ``query`` from the active retriever.

        Returns:
            One dict per hit with keys ``id``, ``content``, ``metadata`` and
            ``score`` (``0.0`` when the hit carries no score).

        Raises:
            ValueError: If no index has been built or loaded yet.
        """
        if not self.retriever:
            raise ValueError("Retriever not initialized, please build index first")

        documents = []
        for node in self.retriever.retrieve(query):
            score = getattr(node, 'score', None)
            documents.append({
                'id': node.id_,
                'content': node.get_content(),
                'metadata': node.metadata,
                'score': 0.0 if score is None else score,
            })
        return documents

    def cleanup_index(self, segment_id: int):
        """Delete the persisted index directory of ``segment_id``, if present."""
        self._remove_tree(self._index_dir(segment_id))
