"""Utilities for normalizing dataset fields and safe string formatting.

Provides:
- SafeDict: dict subclass that returns empty string for missing keys
- normalize_prompt_fields: map common aliases to 'context' and 'question'

These helpers are shared by benchmarking and extraction scripts to ensure
prompt templates can reference {context} and {question} consistently across
datasets with heterogeneous field names.
"""

from __future__ import annotations

from typing import Any, Dict


class SafeDict(dict):
    """A ``dict`` whose subscript lookups yield ``""`` for absent keys.

    Useful with ``str.format_map`` so that placeholders without a matching
    key render as empty text instead of raising ``KeyError``::

        "{a}-{b}".format_map(SafeDict(a=1))  # -> "1-"

    Only ``d[key]`` lookups are affected; ``key in d`` and ``d.get(key)``
    behave exactly as on a plain ``dict``, and the missing key is not stored.
    """

    def __missing__(self, key: object) -> str:
        """Return the empty-string fallback used for any key not in the dict."""
        return ""


def normalize_prompt_fields(obj: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy with common aliases mapped to 'context' and 'question'.

    For each target key the aliases are tried in priority order, and the first
    one present with a non-None value supplies the text:

    - 'context' from one of: context/input/document/documents/passage/text/content/article
    - 'question' from one of: question/query/instruction/prompt/title/ask

    The chosen value is converted to text: list/tuple values are joined into a
    newline-separated string (each item passed through ``str``); any other
    value is passed through ``str``. This also applies when the value is
    already stored under 'context' / 'question' itself, so those keys hold a
    string whenever a usable source exists.

    Aliases whose value is None are treated as absent, so a later alias can
    provide the text and the literal string "None" is never produced. If no
    alias has a usable value, the target key is left exactly as it was in
    ``obj`` (absent, or None).

    Args:
        obj: Source record. It is not mutated.

    Returns:
        A shallow copy of ``obj`` with 'context' and/or 'question' set as
        described above; all other keys are carried over unchanged.

    Time: O(k) over number of fields (plus the size of any joined value);
    Space: O(k) for the copy.
    """
    # Priority-ordered aliases; the canonical key itself comes first so an
    # explicit 'context' / 'question' always wins over its aliases.
    alias_table = (
        (
            "context",
            ("context", "input", "document", "documents", "passage", "text", "content", "article"),
        ),
        (
            "question",
            ("question", "query", "instruction", "prompt", "title", "ask"),
        ),
    )

    def _to_text(x: Any) -> str:
        # Sequences become one line per item; everything else is stringified.
        if isinstance(x, (list, tuple)):
            return "\n".join(str(v) for v in x)
        return str(x)

    norm: Dict[str, Any] = dict(obj)
    for target, aliases in alias_table:
        for alias in aliases:
            value = obj.get(alias)
            if value is None:
                # Missing or explicitly null: keep looking at later aliases.
                continue
            # Plain assignment (not setdefault) so a list/tuple already stored
            # under the canonical key is normalized to text as well.
            norm[target] = _to_text(value)
            break
    return norm


