import pandas as pd
from rag2_retrieval import ingest_pdf_to_faiss
from cache import CacheManager
from utils import get_embeddings, normalize_text
from config import FAISS_INDEX_PATH, TOP_K_RAG2, TEXTBOOK_PDF, QUESTIONS_EXCEL

# -----------------------------
# Ingest textbook PDF to FAISS
# -----------------------------
def ingest_rag2_pdf(pdf_path, faiss_index_path=FAISS_INDEX_PATH):
    """Build a FAISS vectorstore from a PDF and return it.

    Delegates all of the work to ``ingest_pdf_to_faiss`` and prints a
    progress message before and after the call.

    Args:
        pdf_path: Path of the PDF to ingest.
        faiss_index_path: Index location forwarded to
            ``ingest_pdf_to_faiss``; defaults to ``FAISS_INDEX_PATH``.
            Presumably the index is persisted there -- confirm in
            ``rag2_retrieval``.

    Returns:
        The object returned by ``ingest_pdf_to_faiss`` (callers in this
        file use it as a vectorstore).
    """
    start_message = f"Ingesting PDF '{pdf_path}' into FAISS..."
    print(start_message)

    store = ingest_pdf_to_faiss(pdf_path, faiss_index_path)

    print("FAISS index created and saved successfully.")
    return store

# -----------------------------
# Preload cold cache for questions
# -----------------------------
def preload_cold_cache(question_excel, cache_manager, vectorstore, top_k=TOP_K_RAG2):
    """Populate the cold cache with retrieved facts for every question.

    Reads ``question_excel`` with pandas and, for each row, normalizes the
    ``Question`` cell, retrieves the ``top_k`` most similar documents from
    ``vectorstore``, embeds each document's text, and hands the result to
    ``cache_manager.add_to_cold``. Progress is printed per question.

    Args:
        question_excel: Anything ``pd.read_excel`` accepts; the sheet must
            contain a ``Question`` column.
        cache_manager: Object exposing ``add_to_cold(question, facts)``,
            where ``facts`` is a list of ``{"fact": ..., "embedding": ...}``
            dicts.
        vectorstore: Object exposing ``similarity_search(query, k=...)``
            whose results carry a ``page_content`` attribute.
        top_k: Number of facts to retrieve per question.

    Raises:
        KeyError: If the sheet has no ``Question`` column.
    """
    df = pd.read_excel(question_excel)

    # Count rows positionally so the progress message does not depend on
    # the DataFrame index holding integers.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        raw_question = row['Question']
        question = normalize_text(str(raw_question))

        # Retrieve top-k facts from RAG2
        facts = vectorstore.similarity_search(question, k=top_k)

        # Embed every fact exactly once; the "fact" key is the one the
        # cache expects.
        facts_with_embeddings = []
        for fact in facts:
            fact_text = fact.page_content
            embedding = get_embeddings([fact_text])[0]  # single embedding
            facts_with_embeddings.append({
                "fact": fact_text,
                "embedding": embedding,
            })

        cache_manager.add_to_cold(question, facts_with_embeddings)
        print(f"Loaded cold cache for question {position}: '{raw_question}' with {len(facts_with_embeddings)} facts.")

    print(f"Cold cache preloaded successfully for all {len(df)} questions.")


if __name__ == "__main__":
    # Step 1: build the FAISS index from the textbook PDF.
    vectorstore = ingest_rag2_pdf(
        pdf_path=TEXTBOOK_PDF,
        faiss_index_path=FAISS_INDEX_PATH,
    )

    # Step 2: create the cache manager that will hold the cold cache.
    cache_manager = CacheManager()

    # Step 3: fill the cold cache with retrieved facts for each question
    # listed in the questions spreadsheet.
    preload_cold_cache(
        question_excel=QUESTIONS_EXCEL,
        cache_manager=cache_manager,
        vectorstore=vectorstore,
    )
