import re
from collections import Counter
from dataclasses import dataclass, field
from typing import Dict, Iterable, List, Optional


@dataclass
class GenderIdentification:
    """Pronoun-count heuristic that labels English text as "female" or "male".

    Each text is lowercased and split into runs of ASCII letters; occurrences
    of the configured pronouns are counted per group, multiplied by the
    group's weight, and the higher score wins. Ties and texts without any
    pronoun resolve to "female".

    Calling the instance with an iterable of texts returns the proportion of
    texts assigned to each label.

    Pronoun lists should contain single alphabetic words; they are matched
    case-insensitively.
    """

    # Pronouns counted toward the "female" score. Extend as needed.
    female_pronouns: List[str] = field(
        default_factory=lambda: ["she", "her", "hers", "herself"]
    )
    # Pronouns counted toward the "male" score. Extend as needed.
    masculine_pronouns: List[str] = field(
        default_factory=lambda: ["he", "him", "his", "himself"]
    )

    # Multipliers applied to each group's raw pronoun count before the
    # scores are compared. Equal weights mean the raw counts decide.
    weight_feminine: int = 2
    weight_masculine: int = 2

    def _tokenize(self, text: str) -> List[str]:
        """Return the lowercase ASCII-letter runs found in ``text``.

        Apostrophes act as separators, so contractions and quoted words
        still expose the pronoun: "He's" -> ["he", "s"], "'his'" -> ["his"].
        """
        return re.findall(r"[a-z]+", text.lower())

    def _counts_by_group(self, tokens: List[str]) -> Dict[str, int]:
        """Count how many tokens belong to each pronoun group.

        Args:
            tokens: Lowercase tokens as produced by ``_tokenize``.

        Returns:
            A dict with the keys "female" and "male" holding raw counts.
        """
        frequencies = Counter(tokens)

        def occurrences(pronouns: Iterable[str]) -> int:
            # Lowercase to match the lowercased tokens, and de-duplicate so a
            # pronoun listed twice is not counted twice.
            return sum(frequencies[word] for word in {p.lower() for p in pronouns})

        return {
            "female": occurrences(self.female_pronouns),
            "male": occurrences(self.masculine_pronouns),
        }

    def classify_gender_text(self, text: Optional[str]) -> str:
        """Classify a single text as "female" or "male".

        Args:
            text: The text to inspect. Non-string values (including ``None``)
                and blank strings are treated as carrying no signal.

        Returns:
            "male" only when the weighted masculine score is strictly higher;
            otherwise "female" (this includes ties and texts with no pronoun).
        """
        if not isinstance(text, str) or not text.strip():
            return "female"

        counts = self._counts_by_group(self._tokenize(text))
        female_score = counts["female"] * self.weight_feminine
        male_score = counts["male"] * self.weight_masculine

        # "female" is the deliberate fallback on ties or no signal.
        return "male" if male_score > female_score else "female"

    def __call__(self, texts: Iterable[Optional[str]]) -> Dict[str, float]:
        """Return the share of texts classified as each label.

        Args:
            texts: Iterable of texts; it is consumed in a single pass, so a
                generator is fine.

        Returns:
            A dict with the keys "female" and "male" whose values are the
            proportions of texts given that label. Both are 0.0 when
            ``texts`` yields nothing.
        """
        tally = Counter(self.classify_gender_text(text) for text in texts)
        total = sum(tally.values())

        if total == 0:
            return {"female": 0.0, "male": 0.0}

        return {label: tally[label] / total for label in ("female", "male")}
