"""Low-level text processing utilities for the Energy Model.

This module contains functions for sentence segmentation and normalization using
the spaCy library. These helpers are responsible for transforming raw text strings
into a fixed-size list of sentences, which is a required input format for the
attention mechanisms in the main EnergyModel.
"""

from typing import Optional
import spacy

# Shared spaCy pipeline, loaded once when this module is imported and reused by
# the sentence-splitting helpers below. The "en_core_web_sm" model must be
# available for spacy.load to succeed.
# spaCy was chosen over NLTK; rationale:
# https://medium.com/@prabhuss73/spacy-vs-nltk-a-comprehensive-comparison-of-two-popular-nlp-libraries-in-python-b66dc477a689
nlp = spacy.load("en_core_web_sm")


def semantic_sentence_split(text: str) -> list[str]:
    """Segment a text into sentences with the module-level spaCy pipeline.

    Args:
        text (str): Raw text to segment.

    Returns:
        list[str]: The sentences found by spaCy, in document order, each
            stripped of leading and trailing whitespace.

    """
    stripped_sentences = []
    # Run the full pipeline on the text and walk its sentence spans.
    for span in nlp(text).sents:
        stripped_sentences.append(span.text.strip())
    return stripped_sentences


def batch_semantic_sentence_split(texts: list[str]) -> list[list[str]]:
    """Segment several texts into sentences in one pass through ``nlp.pipe``.

    Args:
        texts (list[str]): Raw texts to segment.

    Returns:
        list[list[str]]: One list of whitespace-stripped sentences per input
            text, in the same order as ``texts``.
    """
    sentences_per_text = []
    # nlp.pipe processes the texts as a stream instead of one call per text.
    for parsed in nlp.pipe(texts):
        sentences_per_text.append([span.text.strip() for span in parsed.sents])
    return sentences_per_text


def normalize_sentences(
    sentences: list[str], n_sentences: int, placeholder_emb: Optional[list[float]] = None
) -> list[str | None]:
    """Normalize number of a list of sentences to a fixed length.

    Adjusts the input list of sentences to exactly match the target length by either
    combining sentences (if too many) or adding placeholders (if too few).

    Args:
    sentences (list[str]): List of input sentences to normalize.
    n_sentences (int): Target number of sentences.
    placeholder_emb (None, optional): If None, uses "..." as placeholder text.
        Otherwise, uses None placeholders for embedding replacement. Defaults to None.

    Returns:
    list[str | None]: Normalized list with exactly n_sentences elements.
        Elements are strings for actual sentences or None for embedding placeholders.

    Note:
    When reducing sentences, the last two sentences are repeatedly merged until
    the target length is reached. When extending, either "..." strings or None
    values are appended based on the placeholder_emb parameter.

    """
    if len(sentences) > n_sentences:
        while len(sentences) > n_sentences:
            sentences[-2] = sentences[-2] + " " + sentences[-1]
            sentences.pop()
        return sentences
    if len(sentences) < n_sentences:
        diff = n_sentences - len(sentences)
        if placeholder_emb is None:
            sentences.extend(["..."] * diff)
        else:
            sentences.extend([None] * diff)  # Placeholder: will be replaced with vector
        return sentences
    return sentences
