# preprocess_text.py
# Text cleaning, tokenization, and lexicon-based feature extraction.
# Uses Hugging Face tokenizer when available, and provides a small lexicon-based feature extractor.

from typing import List, Dict, Any, Optional
import re
import numpy as np

try:
    from transformers import AutoTokenizer
except Exception:
    AutoTokenizer = None

# Default sentiment lexicons, used by extract_lexicon_features when the caller
# does not pass its own sets. Entries are lowercase because that function
# lowercases the text before matching. These are small example sets only:
# extend or replace them with proper lexicons.
_POS_WORDS = {"good", "happy", "great", "love", "pleasant", "fortunate"}
_NEG_WORDS = {"sad", "angry", "bad", "hate", "unhappy", "pain"}

# Control characters that the regex class ``\s`` does NOT match: the C0 range
# minus \t \n \v \f \r and the \x1c-\x1f separators, DEL (\x7f), and the C1
# range minus NEL (\x85). The whitespace-like ones are deliberately excluded so
# they are still turned into a single space rather than deleted.
_CONTROL_CHARS_RE = re.compile(r"[\x00-\x08\x0e-\x1b\x7f-\x84\x86-\x9f]")


def clean_text(text: str) -> str:
    """
    Simple text cleaning: remove control chars, then normalize whitespace.

    Non-whitespace control characters (e.g. NUL, BEL, ESC, DEL) are deleted.
    Every run of whitespace (including tabs and newlines) is then collapsed
    to a single space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; empty if the input held only whitespace and/or
        control characters.
    """
    # Strip control chars first so that a control char surrounded by spaces
    # does not leave a double space behind after removal.
    text = _CONTROL_CHARS_RE.sub("", text)
    return re.sub(r"\s+", " ", text).strip()

def get_tokenizer(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """
    Load the Hugging Face tokenizer registered under ``model_name``.

    Returns None instead when the module-level ``AutoTokenizer`` is None,
    i.e. when ``transformers`` could not be imported.
    """
    if AutoTokenizer is not None:
        return AutoTokenizer.from_pretrained(model_name)
    return None

def tokenize_texts(texts: List[str], tokenizer=None, max_length: int = 256) -> Dict[str, Any]:
    """
    Clean each text with ``clean_text`` and tokenize the batch.

    With a tokenizer: the cleaned texts are passed to it in one call with
    padding and truncation enabled, ``max_length`` as the cap, and
    ``return_tensors="pt"``; its result is returned unchanged (for a Hugging
    Face tokenizer this is expected to hold 'input_ids' and 'attention_mask').

    Without a tokenizer: each cleaned text is split on whitespace and cut to
    at most ``max_length`` tokens; the result is ``{"tokens": [[...], ...]}``.
    """
    normalized = list(map(clean_text, texts))
    if tokenizer is not None:
        return tokenizer(
            normalized,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
    # Fallback path: plain whitespace tokens, truncated per text.
    return {"tokens": [doc.split()[:max_length] for doc in normalized]}

def extract_lexicon_features(text: str, pos_words: Optional[set] = None, neg_words: Optional[set] = None) -> np.ndarray:
    """
    Compute simple lexicon features: counts and normalized counts of positive / negative words.

    The text is lowercased and split into ``\\w+`` tokens; each token is then
    looked up in the two lexicons. Lexicon entries are matched as-is, so they
    should be lowercase.

    Args:
        text: Input text.
        pos_words: Positive lexicon. ``None`` selects the module default
            ``_POS_WORDS``; an explicitly passed empty set is respected.
        neg_words: Negative lexicon. ``None`` selects the module default
            ``_NEG_WORDS``; an explicitly passed empty set is respected.

    Returns:
        A float32 numpy array with shape (4,) ->
        [pos_count, neg_count, pos_ratio, neg_ratio]. Ratios are counts
        divided by the token count; text without tokens yields all zeros.
    """
    # Compare against None rather than using `or`: an empty set is falsy and
    # would otherwise be silently replaced by the default lexicon.
    if pos_words is None:
        pos_words = _POS_WORDS
    if neg_words is None:
        neg_words = _NEG_WORDS
    tokens = re.findall(r"\w+", text.lower())
    # Floor the denominator at 1 so token-free text does not divide by zero.
    n_tokens = max(1, len(tokens))
    pos_count = sum(1 for t in tokens if t in pos_words)
    neg_count = sum(1 for t in tokens if t in neg_words)
    pos_ratio = pos_count / n_tokens
    neg_ratio = neg_count / n_tokens
    return np.array([pos_count, neg_count, pos_ratio, neg_ratio], dtype=np.float32)

def compute_hlex_hmeta(text: str) -> Dict[str, Any]:
    """
    Build the Phase I ``hlex`` and ``hmeta`` feature vectors for one text.

    Prototype definitions:
      * ``hlex``  - the array returned by ``extract_lexicon_features(text)``.
      * ``hmeta`` - float32 array [n_chars, n_tokens, punctuation_rate], where
        n_tokens counts ``\\w+`` matches and is floored at 1 (so text with no
        word tokens still reports 1), and punctuation_rate is the share of
        characters that are one of ``!?.,;:`` (0 for empty text).

    Returns a dict with keys "hlex" and "hmeta".
    """
    lexicon_vec = extract_lexicon_features(text)
    char_total = len(text)
    word_total = max(1, len(re.findall(r"\w+", text)))
    punct_hits = sum(ch in "!?.,;:" for ch in text)
    # Denominator floored at 1 to avoid dividing by zero on empty text.
    style_vec = np.array(
        [char_total, word_total, punct_hits / max(1, char_total)],
        dtype=np.float32,
    )
    return {"hlex": lexicon_vec, "hmeta": style_vec}
