# metrics.py
# Lightweight implementations / wrappers for metrics used in the paper:
#  - BERTScore (if installed)
#  - Harm Rate: fraction of responses failing compliance
#  - Safety Index: 1 - Harm Rate (simple)
#  - Empathy Score: proxy using lexical empathy heuristics

import logging
import string
from typing import List, Dict, Tuple

import numpy as np

def harm_rate(responses: List[str], constraint_set) -> float:
    """
    Harm Rate: share of responses that the constraint set penalises.

    A response counts as harmful when
    ``constraint_set.constraint_penalty(response)`` is strictly positive.

    Args:
        responses: Texts to check.
        constraint_set: Object exposing ``constraint_penalty(text)``, whose
            result is compared against 0.

    Returns:
        Number of penalised responses divided by the total number of
        responses, or 0.0 when ``responses`` is empty.
    """
    total = len(responses)
    # An empty batch has no violations; also avoids dividing by zero.
    if total == 0:
        return 0.0

    violations = 0
    for response in responses:
        if constraint_set.constraint_penalty(response) > 0:
            violations += 1
    return violations / total

def safety_index(responses: List[str], constraint_set) -> float:
    """
    Safety Index: complement of the harm rate (``1 - harm_rate``).

    Args:
        responses: Texts to check.
        constraint_set: Passed through unchanged to ``harm_rate``.

    Returns:
        ``1.0`` minus the value ``harm_rate`` returns for the same arguments.
    """
    unsafe_fraction = harm_rate(responses, constraint_set)
    return 1.0 - unsafe_fraction

def empathy_score(responses: List[str], references: List[str]) -> float:
    """
    Simple proxy: mean per-response density of empathy-like tokens.

    Each response is lowercased and split on whitespace, and surrounding
    ASCII punctuation is stripped from every token so that forms such as
    "sorry," or "care." still match the lexicon. A response's score is the
    number of distinct empathy words it contains divided by its number of
    distinct tokens (with a floor of 1 on the divisor).

    Real implementations should use learned empathy scorers / human annotations.

    Args:
        responses: Texts to score.
        references: Currently unused by this heuristic; accepted so the
            existing call signature keeps working.

    Returns:
        The mean of the per-response scores, or 0.0 when ``responses`` is
        empty.
    """
    empathy_words = {"understand", "hear", "feel", "sorry", "care", "concern"}

    def score_one(r: str) -> float:
        # Strip leading/trailing punctuation; a bare split() would leave
        # "sorry." and "care," unmatched against the lexicon.
        stripped = (tok.strip(string.punctuation) for tok in r.lower().split())
        # Tokens made only of punctuation become empty and are dropped.
        toks = {tok for tok in stripped if tok}
        # max(1, ...) keeps an empty response from dividing by zero.
        return len(empathy_words & toks) / max(1, len(toks))

    scs = [score_one(r) for r in responses]
    return float(np.mean(scs)) if scs else 0.0

def bertscore_if_available(cands: List[str], refs: List[str]) -> Dict[str, float]:
    """
    Try to compute BERTScore if the package is available. Fallback to zeros.

    The fallback is kept deliberately best-effort, but it is no longer
    silent: a warning is logged whenever zeros are returned, so an all-zero
    result can be told apart from a genuinely computed score.

    Args:
        cands: Candidate texts, passed to ``bert_score.score`` as the first
            argument.
        refs: Reference texts, passed to ``bert_score.score`` as the second
            argument.

    Returns:
        A dict with keys ``"precision"``, ``"recall"`` and ``"f1"``. Each
        value is the mean of the corresponding result from
        ``bert_score.score``, or 0.0 for all three when the package cannot
        be imported or the computation raises.
    """
    log = logging.getLogger(__name__)
    zeros = {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    # Keep the import in its own try so a missing package is reported
    # separately from a failure while scoring.
    try:
        from bert_score import score as bert_score
    except ImportError:
        log.warning("bert_score is not installed; returning zero BERTScore values")
        return zeros
    except Exception:
        # The import can fail for other reasons while the package loads.
        log.warning(
            "Importing bert_score failed; returning zero BERTScore values",
            exc_info=True,
        )
        return zeros

    try:
        P, R, F1 = bert_score(cands, refs, lang="en", verbose=False)
        return {
            "precision": float(P.mean().item()),
            "recall": float(R.mean().item()),
            "f1": float(F1.mean().item()),
        }
    except Exception:
        # Best-effort fallback is preserved, but the cause is logged.
        log.warning(
            "BERTScore computation failed; returning zero BERTScore values",
            exc_info=True,
        )
        return zeros
