import re

def chunk_text_char(text: str, max_chars: int = 420, overlap: int = 80):
    """
    Chunk text by fixed character length with overlap (for char mode).

    Args:
        text: Input text; surrounding whitespace is stripped before chunking.
        max_chars: Maximum number of characters taken for each window.
        overlap: Number of characters shared by two consecutive windows.

    Returns:
        A list of non-empty, whitespace-stripped chunks. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If the text has to be split and ``overlap >= max_chars``,
            because the window could then never advance.
    """
    text = text.strip()
    if not text:
        # Never emit an empty chunk (the windowing loop below filters them too).
        return []
    if len(text) <= max_chars:
        return [text]

    step = max_chars - overlap
    if step <= 0:
        # A non-positive step would make the loop below spin forever.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_chars ({max_chars})"
        )

    chunks, i = [], 0
    while i < len(text):
        chunk = text[i:i + max_chars].strip()
        if chunk:
            chunks.append(chunk)
        if i + max_chars >= len(text):
            # This window already reached the end of the text; any further
            # window would only repeat a suffix of it.
            break
        i += step
    return chunks


def chunk_text_word(text: str, max_words: int = 120, overlap: int = 24, lang: str = "zh"):
    """
    Chunk text by word tokens with overlap (for word mode).

    - Uses jieba for Chinese segmentation (any ``lang`` starting with "zh",
      case-insensitive, e.g. "zh" or "zh-CN"); tokens are joined without spaces.
    - Uses regex tokenization for everything else (words and punctuation are
      separate tokens); tokens are joined with a single space.

    Args:
        text: Input text.
        max_words: Maximum number of tokens per chunk.
        overlap: Number of tokens shared by two consecutive chunks.
        lang: Language code selecting the tokenizer.

    Returns:
        A list of non-empty, whitespace-stripped chunks; empty list when the
        text produces no tokens.

    Raises:
        ValueError: If the text produces tokens and ``overlap >= max_words``,
            because the window could then never advance.
    """
    if lang.lower().startswith("zh"):
        import jieba
        words = list(jieba.cut(text))
        joiner = ""  # No spaces between Chinese tokens
    else:
        words = re.findall(r"\w+|[^\w\s]", text, flags=re.UNICODE)  # Include punctuation
        joiner = " "

    if not words:
        return []

    step = max_words - overlap
    if step <= 0:
        # A zero step never terminates; a negative step would silently drop
        # everything after the first chunk.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_words ({max_words})"
        )

    chunks, i = [], 0
    while i < len(words):
        chunk = joiner.join(words[i:i + max_words]).strip()
        if chunk:
            chunks.append(chunk)
        if i + max_words >= len(words):
            # This window already reached the last token; any further window
            # would only repeat a suffix of it.
            break
        i += step
    return chunks


def detect_lang(text: str) -> str:
    """
    Guess the language of `text`.

    Returns 'zh' as soon as one character in the range U+4E00..U+9FFF is
    found; returns 'en' when no such character exists (including for empty
    input).
    """
    for ch in text:
        if '\u4e00' <= ch <= '\u9fff':
            return "zh"
    return "en"


# -------------------- Sentence-based chunking (no overlap) --------------------

# One English sentence: text up to a run of terminators (., !, ?) followed by any
# closing quotes/brackets, or a final fragment that has no terminator at all.
_EN_SENTENCE_RE = re.compile(r'[^.!?]*[.!?]+["\'”’)\]]*|[^.!?]+')


def _split_sentences_en(text: str):
    """
    Split English text into full sentences by punctuation.
    - Keeps the ending punctuation with the sentence; a run such as "..." or
      "?!" stays together instead of becoming separate fragments.
    - Keeps closing quotes/brackets that directly follow the ending punctuation
      attached to that sentence (they are not dropped).
    - Works when there is no space after punctuation.
    - A trailing fragment without ending punctuation is returned as the last
      sentence.

    Returns a list of whitespace-stripped, non-empty sentences ([] for empty
    or whitespace-only input).
    """
    text = text.strip()
    if not text:
        return []
    # Matching whole sentences (instead of splitting on a delimiter) keeps every
    # character of the input; only surrounding whitespace is trimmed.
    candidates = (m.strip() for m in _EN_SENTENCE_RE.findall(text))
    return [s for s in candidates if s]


# One Chinese sentence: text up to a run of terminators (。！？；) followed by any
# closing quotes/brackets, or a final fragment that has no terminator at all.
_ZH_SENTENCE_RE = re.compile(r'[^。！？；]*[。！？；]+[”’」』）】"\')\]]*|[^。！？；]+')


def _split_sentences_zh(text: str):
    """
    Split Chinese text into full sentences by punctuation (。！？；).
    - Keeps the ending punctuation with the sentence; a run such as "？！"
      stays together instead of producing a punctuation-only fragment.
    - Keeps closing quotes/brackets that directly follow the ending punctuation
      attached to that sentence instead of pushing them onto the next one.
    - A trailing fragment without ending punctuation is returned as the last
      sentence.

    Returns a list of whitespace-stripped, non-empty sentences ([] for empty
    or whitespace-only input).
    """
    text = text.strip()
    if not text:
        return []
    candidates = (m.strip() for m in _ZH_SENTENCE_RE.findall(text))
    return [s for s in candidates if s]


def _count_words_en(text: str) -> int:
    """Return how many `\\w+` runs (words; punctuation excluded) occur in `text`."""
    word_runs = re.finditer(r"\w+", text, flags=re.UNICODE)
    return sum(1 for _ in word_runs)


def _count_words_zh(text: str) -> int:
    """
    Approximate Chinese token count using jieba (exclude pure punctuation tokens).

    Tokens are stripped of surrounding whitespace; empty tokens and tokens made
    up only of Unicode punctuation characters (general category ``P*``) are not
    counted. Blank input returns 0 without importing jieba.
    """
    if not text.strip():
        return 0

    import unicodedata

    import jieba

    def is_punctuation_only(token: str) -> bool:
        # Python's `re` has no \p{P} property class (it raises re.error), so
        # classify characters through the Unicode database instead.
        return all(unicodedata.category(ch).startswith("P") for ch in token)

    count = 0
    for raw in jieba.cut(text):
        token = raw.strip()
        if token and not is_punctuation_only(token):
            count += 1
    return count


def chunk_text_sentence(
    text: str,
    lang: str = "en",
    min_words: int = 20,
):
    """
    Chunk by sentences with LOWER bound only (no overlap, no upper bound):
    1) Split into FULL sentences by punctuation.
    2) Start each chunk at the BEGINNING of a sentence (no word overlap).
    3) If the current chunk has fewer than `min_words` words, MERGE subsequent FULL sentences
       until the chunk reaches `min_words` or no more sentences remain.

    Args:
        text: Input text.
        lang: Language code; anything starting with "zh" (case-insensitive) uses
            the Chinese splitter/counter, everything else the English ones.
        min_words: Minimum word count a chunk should reach before it is closed.

    Returns:
        A list of chunks. Merged English sentences are joined with a single
        space; merged Chinese sentences are joined without a separator. The
        last chunk may be shorter than `min_words` when the text runs out.
    """
    if lang.lower().startswith("zh"):
        sents = _split_sentences_zh(text)
        count_words = _count_words_zh
        joiner = ""  # Chinese sentences are not separated by spaces
    else:
        sents = _split_sentences_en(text)
        count_words = _count_words_en
        joiner = " "

    chunks = []
    pending, pending_words = [], 0
    for sent in sents:
        pending.append(sent)
        # Count each sentence once instead of re-tokenising the growing chunk.
        pending_words += count_words(sent)
        if pending_words >= min_words:
            chunks.append(joiner.join(pending))
            pending, pending_words = [], 0

    if pending:
        # Sentences ran out before the lower bound was reached.
        chunks.append(joiner.join(pending))

    return chunks


# -----------------------------------------------------------------

def chunk_text(text: str,
               mode: str = "word",
               lang: str = "auto",
               max_chars: int = 420, overlap_chars: int = 80,
               max_words: int = 120, overlap_words: int = 24,
               min_words: int = 20):
    """
    Single entry point that dispatches to one of the chunkers.

    - mode == "word": word-level chunking with overlap
      (max_words / overlap_words are used).
    - mode == "sentence": sentence-based chunking driven by min_words only;
      max_words and overlap_words are not passed on.
    - any other mode: character-level chunking with overlap
      (max_chars / overlap_chars are used).

    When lang is "auto" the language is resolved with detect_lang(text);
    otherwise the given value is forwarded unchanged.
    """
    resolved_lang = detect_lang(text) if lang == "auto" else lang

    if mode == "word":
        return chunk_text_word(text, max_words, overlap_words, resolved_lang)

    if mode == "sentence":
        return chunk_text_sentence(text, lang=resolved_lang, min_words=min_words)

    # Every remaining mode value is handled by the character chunker.
    return chunk_text_char(text, max_chars, overlap_chars)
