import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings 
from langchain.docstore.document import Document
from PyPDF2 import PdfReader
from config import TOP_K_RAG2, GOOGLE_GEMINI_API_KEY, EMBEDDING_MODEL_NAME  # Add EMBEDDING_MODEL_NAME in config

# -----------------------------
# Ingest PDF into FAISS
# -----------------------------
def ingest_pdf_to_faiss(
    pdf_path,
    faiss_index_path="faiss_index",
    *,
    chunk_size=500,
    chunk_overlap=50,
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
):
    """
    Ingests PDF, splits into chunks, generates embeddings, and stores in FAISS index

    Args:
        pdf_path: Path (or file object) handed to ``PdfReader``.
        faiss_index_path: Directory the index is written to via ``save_local``.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.
        embedding_model_name: HuggingFace model used to embed the chunks. The
            same model must be used when the index is loaded again.

    Returns:
        The FAISS vector store built from the PDF (also persisted to
        ``faiss_index_path``).

    Raises:
        ValueError: If no text could be extracted from the PDF, so there is
            nothing to index.
    """
    # Read PDF. Pages without a text layer (e.g. scanned images) may yield an
    # empty or missing result, so fall back to "" instead of failing on `+`.
    reader = PdfReader(pdf_path)
    full_text = "".join(
        (page.extract_text() or "") + "\n" for page in reader.pages
    )

    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(full_text)

    # Fail early with a clear message: building a FAISS index from zero
    # documents otherwise errors out deep inside the vector store code.
    if not chunks:
        raise ValueError(f"No extractable text found in PDF: {pdf_path!r}")

    # Create Documents
    documents = [Document(page_content=chunk) for chunk in chunks]

    # Embeddings
    # NOTE(review): config exposes EMBEDDING_MODEL_NAME; consider passing it
    # here (and to the loader) once its value is confirmed.
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
    vectorstore = FAISS.from_documents(documents, embedding_model)

    # Save FAISS index
    vectorstore.save_local(faiss_index_path)
    return vectorstore

# -----------------------------
# Load FAISS index
# -----------------------------
def load_faiss_index(
    faiss_index_path="faiss_index",
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
):
    """
    Loads a FAISS index from disk and returns the vector store.

    Args:
        faiss_index_path: Directory containing the saved index.
        embedding_model_name: HuggingFace model used to embed queries. It must
            match the model the index was built with, otherwise similarity
            results are meaningless or the vector dimensions will not match.

    Returns:
        The FAISS vector store loaded from ``faiss_index_path``.
    """
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
    # SECURITY: allow_dangerous_deserialization=True lets the loader unpickle
    # the stored docstore, which can execute arbitrary code. Only point this
    # at index directories this application created itself; never at
    # user-supplied or downloaded paths.
    return FAISS.load_local(
        faiss_index_path,
        embedding_model,
        allow_dangerous_deserialization=True,
    )

# -----------------------------
# Retrieve top-k facts for a question
# -----------------------------
def retrieve_rag2_facts(question_text, vectorstore, k=TOP_K_RAG2):
    """
    Fetch the text of the k chunks most similar to the question.

    Runs ``vectorstore.similarity_search`` with ``question_text`` and ``k``,
    then returns the ``page_content`` of every hit as a list, in the order
    the search returned them.
    """
    return [
        match.page_content
        for match in vectorstore.similarity_search(question_text, k=k)
    ]
