# Build a BM25 database and perform a search

import sqlite3
import json
import os
import time

# --- Configuration ---
# NOTE: both paths below are 'xxx' placeholders and currently identical;
# presumably they must be set to two different real paths before running.
DATA_PATH = 'xxx'  # path to the JSONL file, containing doc chunks
DB_PATH = 'xxx'  # path of the SQLite database file that holds the index
TABLE_NAME = 'wiki_fts'  # name of the FTS5 virtual table
BATCH_SIZE = 10000  # rows buffered in memory before each bulk INSERT


def setup_database():
    """
    Prepare the database and FTS5 table.

    Opens the SQLite file at DB_PATH (sqlite3 creates it when missing) and
    creates the TABLE_NAME FTS5 virtual table if it does not exist yet.
    `title` and `content` are full-text indexed; `chunk_id` and `doc_id` are
    stored UNINDEXED so they can be returned but are never searched.

    An existing database file is reused, not overwritten; a warning is
    printed so the caller knows older data may still be present.

    Raises:
        sqlite3.Error: propagated from connecting or from creating the table.
    """
    print(f"Preparing database: {DB_PATH}")
    if os.path.exists(DB_PATH):
        print(f"Warning: Database file '{DB_PATH}' already exists.")

    create_table_sql = f"""
    CREATE VIRTUAL TABLE IF NOT EXISTS {TABLE_NAME} USING fts5(
        title,
        content,
        chunk_id UNINDEXED,
        doc_id UNINDEXED,
        tokenize = 'porter unicode61'
    );
    """

    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute(create_table_sql)
        conn.commit()
    finally:
        # Close even when table creation fails, so the file handle is not
        # leaked.
        conn.close()
    print("Optimized database and table are ready.")


def index_documents_if_needed():
    """
    Reads data from the JSONL file and builds the FTS5 index only if the table is empty.

    Each non-blank line of DATA_PATH is parsed as one JSON object; its
    'content', 'chunk_id', 'doc_id' and 'title' keys (missing keys default
    to '') become one row of TABLE_NAME. Rows are buffered and written in
    batches of BATCH_SIZE, and a single commit is issued after the whole
    file has been read.

    Lines that are not valid JSON, or that are valid JSON but not an object,
    are reported and skipped. If the table already has rows, or the data
    file does not exist, a message is printed and nothing is indexed.

    Raises:
        sqlite3.Error: propagated from the count query or the inserts.
        OSError: propagated if the data file exists but cannot be read.
    """
    insert_sql = (
        f"INSERT INTO {TABLE_NAME} (content, chunk_id, doc_id, title) "
        "VALUES (?, ?, ?, ?)"
    )

    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        cursor.execute(f"SELECT COUNT(*) FROM {TABLE_NAME}")
        count = cursor.fetchone()[0]

        if count > 0:
            print(
                f"'{TABLE_NAME}' table already contains {count} records. Skipping index process.")
            return

        print("Table is empty, starting to index documents...")
        if not os.path.exists(DATA_PATH):
            print(f"Error: Data file not found at '{DATA_PATH}'")
            return

        start_time = time.time()
        documents_to_insert = []
        doc_count = 0

        with open(DATA_PATH, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    # Blank lines (e.g. a trailing newline) are not records.
                    continue

                # Keep the try body to the one call that can raise
                # JSONDecodeError.
                try:
                    doc = json.loads(stripped)
                except json.JSONDecodeError:
                    print(f"Warning: Skipping unparsable line: {stripped}")
                    continue

                if not isinstance(doc, dict):
                    # Valid JSON but not an object (list, string, number...):
                    # it has no keys to read, so .get() would fail.
                    print(f"Warning: Skipping non-object line: {stripped}")
                    continue

                documents_to_insert.append((
                    doc.get('content', ''),
                    doc.get('chunk_id', ''),
                    doc.get('doc_id', ''),
                    doc.get('title', '')
                ))
                doc_count += 1

                if len(documents_to_insert) >= BATCH_SIZE:
                    cursor.executemany(insert_sql, documents_to_insert)
                    documents_to_insert = []
                    print(f"Indexed {doc_count} documents...")

        # Flush the final, partially filled batch.
        if documents_to_insert:
            cursor.executemany(insert_sql, documents_to_insert)

        conn.commit()
        end_time = time.time()
        print(
            f"\nIndexing complete! A total of {doc_count} new documents were indexed.")
        print(f"Total time taken: {end_time - start_time:.2f} seconds.")
    finally:
        # Runs on every exit path (early return, success, exception) so the
        # connection is never leaked.
        conn.close()


def _build_match_query(query: str) -> str:
    """Build an FTS5 MATCH expression that ORs the terms of *query* on `content`.

    Returns an empty string when *query* contains no terms.
    """
    parts = []
    for term in query.split():
        # Inside an FTS5 string a literal double quote is written as two
        # double quotes; without this a term containing '"' would end the
        # string early and corrupt the expression.
        escaped = term.replace('"', '""')
        # An FTS5 column filter binds to a single phrase only, so it is
        # repeated per term; "content : a OR b" would restrict just "a".
        parts.append(f'content : "{escaped}"')
    return " OR ".join(parts)


def search_improved(query: str, top_k: int = 5) -> list:
    """
    Performs an improved BM25 search.

    The query is split on whitespace and every term is matched as a quoted
    phrase against the `content` column, combined with OR, so a row matches
    when it contains at least one of the terms.

    Args:
        query: Free-text query. Terms may contain any characters, including
            double quotes.
        top_k: Maximum number of rows to return.

    Returns:
        A list of (chunk_id, title, content, score) tuples, best match first.
        FTS5's bm25() gives better matches numerically lower scores, hence
        the ascending sort. Returns [] when the query has no terms or the
        database file does not exist.

    Raises:
        sqlite3.Error: propagated from executing the query.
    """
    print(f"\nExecuting query: '{query}'")

    match_query = _build_match_query(query)
    if not match_query:
        # An empty MATCH expression is an FTS5 syntax error, and no terms
        # cannot match anything anyway.
        return []

    if not os.path.exists(DB_PATH):
        print("Error: Database file not found. Please run the script to create the index first.")
        return []

    sql_query = f"""
    SELECT
        chunk_id,
        title,
        content,
        bm25({TABLE_NAME}) AS score
    FROM {TABLE_NAME}
    WHERE {TABLE_NAME} MATCH ?
    ORDER BY score
    LIMIT ?;
    """

    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute(sql_query, (match_query, top_k))
        return cursor.fetchall()
    finally:
        conn.close()


if __name__ == '__main__':
    # Step 1: make sure the database file and the FTS5 table exist.
    setup_database()

    # Step 2: fill the index from the JSONL file unless it already has rows.
    index_documents_if_needed()

    # Step 3: run a sample query and print the ranked hits.
    test_query = "Ambroise Thomas both Opera composers"
    search_results = search_improved(test_query, top_k=5)

    print("\n--- Search Results ---")
    if not search_results:
        print("No relevant results found.")
    else:
        separator = "-" * 20
        for rank, row in enumerate(search_results, start=1):
            chunk_id, title, content, score = row
            print(f"{rank}. Chunk ID: {chunk_id}")
            print(f"   Title: {title}")
            print(f"   Content: {content}")
            print(f"   BM25 Score: {score:.4f}")
            print(separator)
