from rank_bm25 import BM25Okapi

class BM25Baseline:
    """Keyword-retrieval baseline that ranks documents with Okapi BM25.

    Documents and queries are tokenized the same way: lower-cased and split
    on whitespace. Results are reported by submission number (the keys of
    the ``documents`` mapping passed to the constructor).
    """

    def __init__(self, documents):
        """
        Initializes the BM25 baseline model.

        Args:
            documents (dict): A dictionary where keys are submission numbers
                              and values are the concatenated text of the documents.
                              May be empty, in which case every search
                              returns an empty list.
        """
        # BM25 scores come back as a positional array, so remember which
        # submission number sits at each position of the corpus.
        self.doc_map = dict(enumerate(documents))
        self.corpus = list(documents.values())
        self.tokenized_corpus = [doc.lower().split() for doc in self.corpus]
        # BM25Okapi computes an average document length and fails with a
        # ZeroDivisionError on an empty corpus, so skip building the index
        # when there is nothing to index.
        self.bm25 = BM25Okapi(self.tokenized_corpus) if self.tokenized_corpus else None

    def search(self, query, top_k=5):
        """
        Performs a search using the BM25 model.

        Args:
            query (str): The search query.
            top_k (int): The maximum number of results to return. ``None``
                         returns every document; zero or a negative value
                         returns no results.

        Returns:
            list: ``(submission_number, score)`` tuples ordered by descending
                  BM25 score. Documents with equal scores keep their original
                  corpus order. Empty if the corpus is empty.
        """
        # Guard against an empty index and against non-positive limits; a
        # negative slice bound would otherwise drop results from the tail
        # instead of returning nothing.
        if self.bm25 is None or (top_k is not None and top_k <= 0):
            return []

        tokenized_query = query.lower().split()
        doc_scores = self.bm25.get_scores(tokenized_query)

        # sorted() is stable, so ties stay in corpus order even with
        # reverse=True.
        ranked = sorted(enumerate(doc_scores), key=lambda pair: pair[1], reverse=True)

        # Map corpus positions back to submission numbers, keeping the scores.
        return [(self.doc_map[index], score) for index, score in ranked[:top_k]]
