from __future__ import annotations

import json
import logging
import math
import os
import re
from typing import Any
from typing import Optional

from google.adk.evaluation.eval_metrics import EvalStatus
from google.adk.evaluation.eval_rubrics import RubricScore
from google.adk.evaluation.evaluator import EvaluationResult
from google.adk.evaluation.evaluator import PerInvocationResult


# Google GenAI model used by the judge when the model env var is unset.
_DEFAULT_JUDGE_MODEL = "gemini-2.5-flash"
# Env var naming the Google GenAI model for the cuisine judge.
_JUDGE_MODEL_ENV = "EVAL_CUISINE_JUDGE_MODEL"
# Google GenAI client, created on first use and cached by _get_judge_client().
_JUDGE_CLIENT: Any = None
# LiteLLM model used by the judge when the LiteLLM model env var is unset.
_DEFAULT_LITELLM_JUDGE_MODEL = "openrouter/google/gemini-2.5-flash"
# Env var naming the LiteLLM model for the cuisine judge.
_LITELLM_MODEL_ENV = "EVAL_CUISINE_JUDGE_LITELLM_MODEL"
# Env vars naming the models (Google GenAI / LiteLLM) for the palatability judge.
_PALATABILITY_JUDGE_MODEL_ENV = "EVAL_PALATABILITY_JUDGE_MODEL"
_PALATABILITY_LITELLM_MODEL_ENV = "EVAL_PALATABILITY_JUDGE_LITELLM_MODEL"
# Env var holding the OpenRouter API key; the LiteLLM path is skipped when it is unset.
_OPENROUTER_API_KEY_ENV = "OPENROUTER_API_KEY"
# Env var overriding the OpenRouter base URL passed to LiteLLM.
_OPENROUTER_API_BASE_ENV = "OPENROUTER_API_BASE"
# A detected age at or below this many years is treated as an infant profile.
_INFANT_MAX_AGE_YEARS = 2.0
# Maps spelled-out plan lengths to a number of days (used by
# _extract_requested_day_count for "<word>-day meal plan" phrasing).
_DAY_WORD_TO_COUNT = {
    "one": 1,
    "three": 3,
    "five": 5,
    "seven": 7,
    "week": 7,
    "weekly": 7,
}

# Module-level logger; judge call failures are reported through it as warnings.
log = logging.getLogger(__name__)


def _extract_text(invocation) -> str:
    if not invocation.final_response or not invocation.final_response.parts:
        return ""
    texts = [part.text for part in invocation.final_response.parts if getattr(part, "text", None)]
    return "\n".join(texts)


def _extract_user_text(invocation) -> str:
    if not invocation.user_content or not invocation.user_content.parts:
        return ""
    texts = [part.text for part in invocation.user_content.parts if getattr(part, "text", None)]
    return "\n".join(texts)


def _read_attr_or_key(obj: Any, name: str, default: Any = None) -> Any:
    if isinstance(obj, dict):
        return obj.get(name, default)
    return getattr(obj, name, default)


def _coerce_positive_int(value: Any) -> Optional[int]:
    if isinstance(value, bool):
        return None
    if isinstance(value, int):
        return value if value > 0 else None
    if isinstance(value, float) and value.is_integer():
        candidate = int(value)
        return candidate if candidate > 0 else None
    if isinstance(value, str):
        stripped = value.strip()
        if stripped.isdigit():
            candidate = int(stripped)
            return candidate if candidate > 0 else None
    return None


def _extract_optimizer_tool_day_count(invocation) -> Optional[int]:
    """Find the ``day_count`` reported by the macro-nutrient tool, if any.

    Walks the invocation's intermediate events and returns the first positive
    ``day_count`` found in a function response named
    ``calculate_average_macro_nutrient_per_day``. Events, parts and payloads
    may be dicts or objects. Returns ``None`` when nothing usable is found.
    """
    intermediate_data = _read_attr_or_key(invocation, "intermediate_data")
    if not intermediate_data:
        return None

    target_tool = "calculate_average_macro_nutrient_per_day"
    for event in _read_attr_or_key(intermediate_data, "invocation_events") or []:
        event_content = _read_attr_or_key(event, "content")
        for part in _read_attr_or_key(event_content, "parts") or []:
            tool_response = _read_attr_or_key(part, "function_response")
            if not tool_response or _read_attr_or_key(tool_response, "name") != target_tool:
                continue

            tool_payload = _read_attr_or_key(tool_response, "response")
            if tool_payload:
                parsed = _coerce_positive_int(_read_attr_or_key(tool_payload, "day_count"))
                if parsed is not None:
                    return parsed

    return None


def _extract_requested_cuisine(user_text: str) -> Optional[str]:
    patterns = [
        r"\bi\s+want\s+to\s+eat\s+([a-z][a-z\s\-]{1,40}?)\s+(?:food|cuisine)\b",
        r"\bi\s+want\s+([a-z][a-z\s\-]{1,40}?)\s+(?:food|cuisine)\b",
        r"\bi\s+(?:would\s+)?like\s+to\s+eat\s+([a-z][a-z\s\-]{1,40}?)\s+(?:food|cuisine)\b",
        r"\bi usually eat\s+([a-z][a-z\s\-]{1,40}?)\s+food\b",
        r"\b(?:prefer|like)\s+(?:to\s+eat\s+)?([a-z][a-z\s\-]{1,40}?)\s+(?:food|cuisine)\b",
        r"\bcuisine\s*(?:is|:)?\s*([a-z][a-z\s\-]{1,40})\b",
    ]
    normalized = user_text.strip().lower()
    for pattern in patterns:
        match = re.search(pattern, normalized)
        if match:
            return re.sub(r"\s+", " ", match.group(1).strip())
    return None


def _extract_requested_day_count(user_text: str) -> Optional[int]:
    normalized = re.sub(r"\s+", " ", user_text.strip().lower())

    digit_patterns = [
        r"\btell\s+me\s+(\d+)\s*day\s+meal\s+plan\b",
        r"\b(\d+)\s*day\s+meal\s+plan\b",
        r"\bmeal\s+plan\s+for\s+(\d+)\s+days?\b",
    ]
    for pattern in digit_patterns:
        match = re.search(pattern, normalized)
        if match:
            return int(match.group(1))

    word_match = re.search(r"\b(one|three|five|seven)[-\s]*day\s+meal\s+plan\b", normalized)
    if word_match:
        return _DAY_WORD_TO_COUNT[word_match.group(1)]

    if re.search(r"\b(?:a|one)\s+week\s+meal\s+plan\b", normalized):
        return 7
    if re.search(r"\bweekly\s+meal\s+plan\b", normalized):
        return 7

    return None


def _extract_age_from_user_text(user_text: str) -> Optional[float]:
    normalized = re.sub(r"\s+", " ", user_text.strip().lower())
    year_patterns = [
        r"\b(\d{1,2})\s*(?:years?\s*old|yrs?\s*old|yo)\b",
        r"\bage\s*(?:is|:|=)?\s*(\d{1,2})\b",
    ]
    for pattern in year_patterns:
        match = re.search(pattern, normalized)
        if not match:
            continue
        try:
            age = float(int(match.group(1)))
        except (TypeError, ValueError):
            continue
        if 0 < age < 120:
            return age

    month_patterns = [
        r"\b(\d{1,2})\s*(?:months?\s*old|mos?\s*old|mo)\b",
        r"\bage\s*(?:is|:|=)?\s*(\d{1,2})\s*months?\b",
    ]
    for pattern in month_patterns:
        match = re.search(pattern, normalized)
        if not match:
            continue
        try:
            months = int(match.group(1))
        except (TypeError, ValueError):
            continue
        if 0 < months < 120:
            return float(months) / 12.0

    return None


def _is_infant_profile(user_text: str) -> bool:
    """Decide whether the user text describes an infant.

    When an age can be extracted it is authoritative: the profile is an
    infant iff the age is at most ``_INFANT_MAX_AGE_YEARS``. Otherwise the
    text is scanned for infant keywords (substring match).
    """
    collapsed = re.sub(r"\s+", " ", user_text.strip().lower())

    detected_age = _extract_age_from_user_text(collapsed)
    if detected_age is None:
        for keyword in ("infant", "baby", "newborn", "new-born"):
            if keyword in collapsed:
                return True
        return False

    return detected_age <= _INFANT_MAX_AGE_YEARS


def _split_response_into_days(response_text: str, requested_days: Optional[int]) -> list[dict[str, Any]]:
    normalized = response_text.replace("\r\n", "\n").replace("\r", "\n")
    day_heading_pattern = re.compile(
        r"(?im)^\s*(?:[#>*-]+\s*)?(?:\*\*|__)?\s*day\s*(\d{1,2})(?:\s*(?:\*\*|__))?\s*(?::|-|–|\)|\.)?\s*(.*)$"
    )
    matches = list(day_heading_pattern.finditer(normalized))
    if not matches:
        if requested_days == 1 and normalized.strip():
            return [{"day_number": 1, "text": normalized.strip()}]
        return []

    day_sections: list[dict[str, Any]] = []
    for index, match in enumerate(matches):
        body_start = match.end()
        body_end = matches[index + 1].start() if index + 1 < len(matches) else len(normalized)
        inline_text = match.group(2).strip()
        body_text = normalized[body_start:body_end].strip()
        text = f"{inline_text}\n{body_text}".strip() if inline_text and body_text else inline_text or body_text
        if text:
            day_sections.append({"day_number": int(match.group(1)), "text": text})

    return day_sections


def _format_day_sections(day_sections: list[dict[str, Any]]) -> str:
    blocks: list[str] = []
    for index, day_section in enumerate(day_sections, start=1):
        day_number = day_section.get("day_number")
        if not isinstance(day_number, int):
            day_number = index
        text = str(day_section.get("text") or "").strip()
        if text:
            blocks.append(f"Day {day_number}:\n{text}")
    return "\n\n".join(blocks)


def _extract_day_scores(payload: dict[str, Any]) -> list[float]:
    day_scores: list[float] = []
    raw_day_scores = payload.get("day_scores")
    if not isinstance(raw_day_scores, list):
        return day_scores

    for item in raw_day_scores:
        candidate: Optional[float] = None
        if isinstance(item, (int, float)):
            candidate = float(item)
        elif isinstance(item, dict):
            raw_score = item.get("alignment_score", item.get("score"))
            if isinstance(raw_score, (int, float)):
                candidate = float(raw_score)
        if candidate is not None:
            day_scores.append(max(0.0, min(1.0, candidate)))

    return day_scores


def _compute_per_day_alignment_score(
    requested_days: Optional[int],
    day_scores: list[float],
    overall_score: Optional[float],
) -> Optional[float]:
    if requested_days is not None and requested_days > 0:
        normalized_scores = day_scores[:requested_days]
        if len(normalized_scores) < requested_days:
            normalized_scores.extend([0.0] * (requested_days - len(normalized_scores)))

        score = sum(normalized_scores) / requested_days
        return max(0.0, min(1.0, score))

    if day_scores:
        return max(0.0, min(1.0, sum(day_scores) / len(day_scores)))

    if isinstance(overall_score, (int, float)):
        return max(0.0, min(1.0, float(overall_score)))

    return None


def _strip_code_fences(text: str) -> str:
    stripped = text.strip()
    if stripped.startswith("```"):
        stripped = re.sub(r"^```[a-zA-Z0-9_-]*\n", "", stripped)
        stripped = re.sub(r"\n```$", "", stripped)
    return stripped.strip()


def _parse_json_safely(text: str) -> Optional[dict[str, Any]]:
    """Parse a JSON object out of judge output without raising on bad JSON.

    Code fences are stripped first. If the cleaned text fails to parse, the
    span from the first ``{`` to the last ``}`` is tried instead. Only a dict
    is ever returned; any other parsed value, or a parse failure, gives
    ``None``.
    """

    def _dict_or_none(value: Any) -> Optional[dict[str, Any]]:
        return value if isinstance(value, dict) else None

    cleaned = _strip_code_fences(text)
    try:
        return _dict_or_none(json.loads(cleaned))
    except Exception:
        # Not valid JSON as a whole; fall through to the embedded-object search.
        pass

    embedded = re.search(r"\{.*\}", cleaned, re.DOTALL)
    if embedded is None:
        return None
    try:
        return _dict_or_none(json.loads(embedded.group(0)))
    except Exception:
        return None


def _get_judge_client() -> Any:
    """Return the shared Google GenAI client, creating it on first use.

    The client is built from ``GOOGLE_API_KEY`` (or ``GEMINI_API_KEY``) and
    cached in ``_JUDGE_CLIENT``. Returns ``None`` when no key is set, when
    ``google.genai`` cannot be imported, or when client construction fails.
    """
    global _JUDGE_CLIENT
    if _JUDGE_CLIENT is None:
        key = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")
        if not key:
            return None
        try:
            # Imported lazily so the module loads without google-genai installed.
            from google import genai

            _JUDGE_CLIENT = genai.Client(api_key=key)
        except Exception:
            return None
    return _JUDGE_CLIENT


def _llm_judge_generate_text_via_litellm(
    prompt: str,
    *,
    litellm_model_env: str = _LITELLM_MODEL_ENV,
) -> Optional[str]:
    """Send ``prompt`` to the judge model through LiteLLM / OpenRouter.

    Args:
        prompt: Full judge prompt, sent as a single user message.
        litellm_model_env: Name of the env var holding the LiteLLM model id;
            ``_DEFAULT_LITELLM_JUDGE_MODEL`` is used when it is unset.

    Returns:
        The first choice's message content when it is a non-blank string.
        ``None`` when the OpenRouter key is missing, ``litellm`` cannot be
        imported, the call fails (logged as a warning), or the response has
        no usable content.
    """
    api_key = os.getenv(_OPENROUTER_API_KEY_ENV)
    if not api_key:
        return None

    try:
        # Imported lazily so the module loads without litellm installed.
        from litellm import completion
    except Exception:
        return None

    request = {
        "model": os.getenv(litellm_model_env, _DEFAULT_LITELLM_JUDGE_MODEL),
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
        "api_key": api_key,
        "api_base": os.getenv(_OPENROUTER_API_BASE_ENV, "https://openrouter.ai/api/v1"),
    }
    try:
        response = completion(**request)
    except Exception as exc:
        log.warning("LiteLLM judge call failed: %s", exc)
        return None

    choices = getattr(response, "choices", None)
    message = getattr(choices[0], "message", None) if choices else None
    content = getattr(message, "content", None) if message else None
    if isinstance(content, str) and content.strip():
        return content
    return None


def _llm_judge_generate_text(
    prompt: str,
    *,
    model_env: str = _JUDGE_MODEL_ENV,
    litellm_model_env: str = _LITELLM_MODEL_ENV,
) -> Optional[str]:
    """Get the judge model's raw text reply for ``prompt``.

    LiteLLM / OpenRouter is tried first. If that yields nothing, the Google
    GenAI client is used as a fallback.

    Args:
        prompt: Full judge prompt.
        model_env: Name of the env var holding the Google GenAI model id;
            ``_DEFAULT_JUDGE_MODEL`` is used when it is unset.
        litellm_model_env: Name of the env var holding the LiteLLM model id.

    Returns:
        The reply text, or ``None`` when neither backend produced any text
        (no client available, call failure, or empty response).
    """
    text = _llm_judge_generate_text_via_litellm(
        prompt,
        litellm_model_env=litellm_model_env,
    )
    if text:
        return text

    client = _get_judge_client()
    if client is None:
        return None

    model_name = os.getenv(model_env, _DEFAULT_JUDGE_MODEL)
    try:
        resp = client.models.generate_content(
            model=model_name,
            contents=[prompt],
            config={"temperature": 0},
        )
    except Exception as exc:
        log.warning("Google GenAI judge call failed: %s", exc)
        return None

    text = getattr(resp, "text", None) or getattr(resp, "output_text", None)
    if not text:
        candidates = getattr(resp, "candidates", None) or []
        if candidates:
            content = getattr(candidates[0], "content", None)
            # ``parts`` and each part's ``text`` can exist but be None (for
            # example on non-text parts); joining those raised TypeError.
            parts = getattr(content, "parts", None) or []
            part_texts = (getattr(part, "text", None) for part in parts)
            text = "".join(piece for piece in part_texts if isinstance(piece, str))

    return text or None


def _llm_judges_per_day_cuisine_alignment(
    requested_cuisine: str,
    requested_days: Optional[int],
    response_text: str,
    optimizer_day_count: Optional[int] = None,
    is_infant_profile: bool = False,
) -> tuple[Optional[float], list[RubricScore]]:
    """Ask the LLM judge how well each meal-plan day follows the cuisine.

    Args:
        requested_cuisine: Cuisine the user asked for.
        requested_days: Number of days the user asked for, if known.
        response_text: The agent's meal-plan response.
        optimizer_day_count: Day count reported by the optimizer tool, if
            any. It is included in the prompt and recorded as a rubric; the
            score itself is computed from ``requested_days``.
        is_infant_profile: Adds a relaxed-interpretation instruction to the
            prompt when true.

    Returns:
        ``(score, rubric_scores)``. ``score`` is ``None`` (with an empty
        list) when the judge returns nothing or unparseable JSON, and
        ``None`` with the rubrics collected so far when the payload holds no
        usable score.
    """
    parsed_days = _split_response_into_days(response_text, requested_days)
    if parsed_days:
        response_label = "Parsed meal plan days:\n"
        response_payload = _format_day_sections(parsed_days)
    else:
        response_label = "Meal plan response:\n"
        response_payload = response_text

    requested_days_text = "unknown" if requested_days is None else str(requested_days)
    optimizer_day_count_text = "unknown" if optimizer_day_count is None else str(optimizer_day_count)

    infant_relaxation_instruction = ""
    if is_infant_profile:
        infant_relaxation_instruction = (
            "For infant profiles, apply a relaxed cuisine interpretation: consider infant-safe, simplified, "
            "or mildly adapted versions of the requested cuisine as aligned when the overall cuisine direction "
            "is still recognizable. Do not require strict authenticity, full spice intensity, or adult-style dish complexity. "
        )

    grading_instructions = (
        "You are grading whether a meal-plan response follows the requested cuisine on a per-day basis. "
        "Return ONLY strict JSON with this schema: "
        '{"detected_day_count": <integer>, "day_scores": [<number 0 to 1>], "reason": "short"}. '
        "Use this rubric for each detected day: 1.0 = strongly aligned, 0.7 = mostly aligned, "
        "0.4 = mixed/partially aligned, 0.0 = not aligned. "
        f"{infant_relaxation_instruction}"
        "Score each distinct meal-plan day in order. Ignore summaries, shopping lists, and notes. "
        "If a day is present but too vague to judge, score it 0.0. "
    )
    request_context = (
        f"Requested cuisine: {requested_cuisine}\n"
        f"Expected plan days: {requested_days_text}\n\n"
        f"Optimizer tool day_count (authoritative when provided): {optimizer_day_count_text}\n\n"
        f"{response_label}"
        f"{response_payload}"
    )
    prompt = grading_instructions + request_context

    judge_reply = _llm_judge_generate_text(
        prompt,
        model_env=_JUDGE_MODEL_ENV,
        litellm_model_env=_LITELLM_MODEL_ENV,
    )
    payload = _parse_json_safely(judge_reply) if judge_reply else None
    if not payload:
        return None, []

    day_scores = _extract_day_scores(payload)
    rubric_scores: list[RubricScore] = [
        RubricScore(
            rubricId=f"day_{day_index}_cuisine_alignment_score",
            score=day_score,
            rationale="Per-day cuisine alignment score returned by the LLM judge.",
        )
        for day_index, day_score in enumerate(day_scores, start=1)
    ]

    # Record the day counts that were in play, for later inspection.
    day_count_rubrics = (
        ("requested_day_count", requested_days, "Day count requested by the user prompt."),
        (
            "optimizer_day_count",
            optimizer_day_count,
            "Day count from optimizer tool output (authoritative when available).",
        ),
    )
    for rubric_id, day_count, why in day_count_rubrics:
        if day_count is not None and day_count > 0:
            rubric_scores.append(RubricScore(rubricId=rubric_id, score=float(day_count), rationale=why))

    def _finish(rubric_id: str, value: float, why: str) -> tuple[Optional[float], list[RubricScore]]:
        # Record the final score as a rubric and return it with all rubrics.
        rubric_scores.append(RubricScore(rubricId=rubric_id, score=value, rationale=why))
        return value, rubric_scores

    overall_hint = payload.get("overall_score")
    if not isinstance(overall_hint, (int, float)):
        overall_hint = payload.get("alignment_score")
    computed_score = _compute_per_day_alignment_score(
        requested_days=requested_days,
        day_scores=day_scores,
        overall_score=float(overall_hint) if isinstance(overall_hint, (int, float)) else None,
    )
    if computed_score is not None:
        return _finish(
            "computed_per_day_alignment_score",
            computed_score,
            "Computed by _compute_per_day_alignment_score using per-day scores and requested day count.",
        )

    # NOTE(review): this branch looks unreachable -- a numeric alignment_score
    # is already passed above as the overall-score hint, which makes
    # computed_score non-None. Kept to preserve the original flow.
    alignment_score = payload.get("alignment_score")
    if isinstance(alignment_score, (int, float)):
        return _finish(
            "fallback_alignment_score",
            max(0.0, min(1.0, float(alignment_score))),
            "Fallback alignment_score from judge payload.",
        )

    # Backward compatibility for boolean-style responses.
    follows = payload.get("follows_cuisine")
    if isinstance(follows, bool):
        return _finish(
            "fallback_follows_cuisine",
            1.0 if follows else 0.0,
            "Boolean follows_cuisine fallback from judge payload.",
        )
    if isinstance(follows, str):
        verdicts = {"true": 1.0, "yes": 1.0, "1": 1.0, "false": 0.0, "no": 0.0, "0": 0.0}
        verdict = verdicts.get(follows.strip().lower())
        if verdict is not None:
            return _finish(
                "fallback_follows_cuisine",
                verdict,
                "String follows_cuisine fallback from judge payload.",
            )

    return None, rubric_scores


def _llm_judges_meal_plan_palatability(
    requested_cuisine: Optional[str],
    requested_days: Optional[int],
    response_text: str,
    is_infant_profile: bool = False,
) -> tuple[Optional[float], list[RubricScore]]:
    """Ask the LLM judge how palatable each meal-plan day is.

    Args:
        requested_cuisine: Cuisine the user asked for; shown to the judge as
            context ("unspecified" when missing).
        requested_days: Number of days the user asked for, if known.
        response_text: The agent's meal-plan response.
        is_infant_profile: Adds an infant-specific instruction to the prompt
            when true.

    Returns:
        ``(score, rubric_scores)``. ``score`` is ``None`` (with an empty
        list) when the judge returns nothing or unparseable JSON, and
        ``None`` with the rubrics collected so far when the payload holds no
        usable score.
    """
    parsed_days = _split_response_into_days(response_text, requested_days)
    if parsed_days:
        response_label = "Parsed meal plan days:\n"
        response_payload = _format_day_sections(parsed_days)
    else:
        response_label = "Meal plan response:\n"
        response_payload = response_text

    requested_days_text = "unknown" if requested_days is None else str(requested_days)
    requested_cuisine_text = requested_cuisine or "unspecified"

    infant_instruction = ""
    if is_infant_profile:
        infant_instruction = (
            "For infant profiles, judge palatability for the intended infant consumer: favor simple, gentle, age-appropriate, "
            "repeatable meals and do not penalize the plan for lacking adult-level complexity or strong seasoning. "
        )

    grading_instructions = (
        "You are grading the palatability of a meal-plan response on a per-day basis. "
        "Return ONLY strict JSON with this schema: "
        '{"detected_day_count": <integer>, "day_scores": [<number 0 to 1>], "reason": "short"}. '
        "Use this rubric for each detected day: 1.0 = highly appealing and coherent meals with good variety and realistic pairings, "
        "0.7 = generally appealing with minor issues, 0.4 = edible but weak, repetitive, awkward, or poorly paired, "
        "0.0 = clearly unappealing, incoherent, or unrealistic as a meal plan. "
        f"{infant_instruction}"
        "Judge likely eating appeal based on ingredient combinations, meal coherence, variety, and realism. "
        "Do not score nutrition accuracy here. Do not require gourmet quality. Ignore summaries, shopping lists, and notes. "
        "If a day is too vague to judge, score it 0.0. "
    )
    request_context = (
        f"Requested cuisine context: {requested_cuisine_text}\n"
        f"Expected plan days: {requested_days_text}\n\n"
        f"{response_label}"
        f"{response_payload}"
    )
    prompt = grading_instructions + request_context

    judge_reply = _llm_judge_generate_text(
        prompt,
        model_env=_PALATABILITY_JUDGE_MODEL_ENV,
        litellm_model_env=_PALATABILITY_LITELLM_MODEL_ENV,
    )
    payload = _parse_json_safely(judge_reply) if judge_reply else None
    if not payload:
        return None, []

    day_scores = _extract_day_scores(payload)
    rubric_scores: list[RubricScore] = [
        RubricScore(
            rubricId=f"day_{day_index}_palatability_score",
            score=day_score,
            rationale="Per-day palatability score returned by the LLM judge.",
        )
        for day_index, day_score in enumerate(day_scores, start=1)
    ]

    def _finish(rubric_id: str, value: float, why: str) -> tuple[Optional[float], list[RubricScore]]:
        # Record the final score as a rubric and return it with all rubrics.
        rubric_scores.append(RubricScore(rubricId=rubric_id, score=value, rationale=why))
        return value, rubric_scores

    overall_hint = payload.get("overall_score")
    if not isinstance(overall_hint, (int, float)):
        overall_hint = payload.get("palatability_score")

    computed_score = _compute_per_day_alignment_score(
        requested_days=requested_days,
        day_scores=day_scores,
        overall_score=float(overall_hint) if isinstance(overall_hint, (int, float)) else None,
    )
    if computed_score is not None:
        return _finish(
            "computed_per_day_palatability_score",
            computed_score,
            "Computed from per-day palatability scores and requested day count.",
        )

    # NOTE(review): this branch looks unreachable -- a numeric
    # palatability_score is already passed above as the overall-score hint,
    # which makes computed_score non-None. Kept to preserve the original flow.
    palatability_score = payload.get("palatability_score")
    if isinstance(palatability_score, (int, float)):
        return _finish(
            "fallback_palatability_score",
            max(0.0, min(1.0, float(palatability_score))),
            "Fallback palatability_score from judge payload.",
        )

    return None, rubric_scores


def _score_cuisine_metric_invocations(eval_metric, actual_invocations, threshold=0.5):
    """Score every invocation for cuisine alignment and aggregate the results.

    Per invocation:
      * an empty response scores 0.0 and fails;
      * no detectable cuisine in the user text is NOT_EVALUATED;
      * otherwise the LLM judge's score is compared with the threshold, or
        the invocation is NOT_EVALUATED when the judge gives no score.

    The overall score is the mean of the scored invocations. The threshold
    comes from ``eval_metric.criterion`` when one is set, else ``threshold``.
    """
    if eval_metric and eval_metric.criterion is not None:
        threshold = float(eval_metric.criterion.threshold)

    def _result(invocation, score, status, **extra):
        return PerInvocationResult(
            actual_invocation=invocation,
            score=score,
            eval_status=status,
            **extra,
        )

    per_results: list[PerInvocationResult] = []
    for actual_invocation in actual_invocations:
        user_text = _extract_user_text(actual_invocation)
        response_text = _extract_text(actual_invocation)
        requested_cuisine = _extract_requested_cuisine(user_text)
        requested_days = _extract_requested_day_count(user_text)
        optimizer_day_count = _extract_optimizer_tool_day_count(actual_invocation)
        is_infant_profile = _is_infant_profile(user_text)

        if not response_text.strip():
            per_results.append(_result(actual_invocation, 0.0, EvalStatus.FAILED))
            continue

        if not requested_cuisine:
            # Nothing to align against, so the metric does not apply.
            per_results.append(_result(actual_invocation, None, EvalStatus.NOT_EVALUATED))
            continue

        alignment_score, rubric_scores = _llm_judges_per_day_cuisine_alignment(
            requested_cuisine=requested_cuisine,
            requested_days=requested_days,
            response_text=response_text,
            optimizer_day_count=optimizer_day_count,
            is_infant_profile=is_infant_profile,
        )
        if alignment_score is None:
            score, status = None, EvalStatus.NOT_EVALUATED
        else:
            score = float(alignment_score)
            status = EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
        per_results.append(
            _result(actual_invocation, score, status, rubric_scores=rubric_scores or None)
        )

    if not per_results:
        return EvaluationResult(overall_score=0.0, overall_eval_status=EvalStatus.FAILED)

    scored_results = [result.score for result in per_results if result.score is not None]
    if not scored_results:
        return EvaluationResult(overall_score=None, overall_eval_status=EvalStatus.NOT_EVALUATED)

    overall_score = sum(scored_results) / len(scored_results)
    if overall_score >= threshold:
        overall_status = EvalStatus.PASSED
    else:
        overall_status = EvalStatus.FAILED
    return EvaluationResult(
        overall_score=overall_score,
        overall_eval_status=overall_status,
        per_invocation_results=per_results,
    )


def _score_palatability_metric_invocations(eval_metric, actual_invocations, threshold=0.5):
    """Score every invocation for palatability and aggregate the results.

    Per invocation:
      * an empty response scores 0.0 and fails;
      * otherwise the LLM judge's score is compared with the threshold, or
        the invocation is NOT_EVALUATED when the judge gives no score.

    The overall score is the mean of the scored invocations. The threshold
    comes from ``eval_metric.criterion`` when one is set, else ``threshold``.
    """
    if eval_metric and eval_metric.criterion is not None:
        threshold = float(eval_metric.criterion.threshold)

    def _result(invocation, score, status, **extra):
        return PerInvocationResult(
            actual_invocation=invocation,
            score=score,
            eval_status=status,
            **extra,
        )

    per_results: list[PerInvocationResult] = []
    for actual_invocation in actual_invocations:
        user_text = _extract_user_text(actual_invocation)
        response_text = _extract_text(actual_invocation)
        requested_cuisine = _extract_requested_cuisine(user_text)
        requested_days = _extract_requested_day_count(user_text)
        is_infant_profile = _is_infant_profile(user_text)

        if not response_text.strip():
            per_results.append(_result(actual_invocation, 0.0, EvalStatus.FAILED))
            continue

        palatability_score, rubric_scores = _llm_judges_meal_plan_palatability(
            requested_cuisine=requested_cuisine,
            requested_days=requested_days,
            response_text=response_text,
            is_infant_profile=is_infant_profile,
        )
        if palatability_score is None:
            score, status = None, EvalStatus.NOT_EVALUATED
        else:
            score = float(palatability_score)
            status = EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
        per_results.append(
            _result(actual_invocation, score, status, rubric_scores=rubric_scores or None)
        )

    if not per_results:
        return EvaluationResult(overall_score=0.0, overall_eval_status=EvalStatus.FAILED)

    scored_results = [result.score for result in per_results if result.score is not None]
    if not scored_results:
        return EvaluationResult(overall_score=None, overall_eval_status=EvalStatus.NOT_EVALUATED)

    overall_score = sum(scored_results) / len(scored_results)
    if overall_score >= threshold:
        overall_status = EvalStatus.PASSED
    else:
        overall_status = EvalStatus.FAILED
    return EvaluationResult(
        overall_score=overall_score,
        overall_eval_status=overall_status,
        per_invocation_results=per_results,
    )


def non_empty_response_metric(
    eval_metric,
    actual_invocations,
    expected_invocations=None,
    conversation_scenario=None,
):
    """Metric that checks each invocation produced a non-blank response.

    An invocation scores 1.0 when its final response has non-whitespace text
    and 0.0 otherwise. The overall score is the mean across invocations. The
    pass threshold is 1.0 unless ``eval_metric.criterion`` supplies one.
    ``expected_invocations`` and ``conversation_scenario`` are unused.
    """
    del expected_invocations, conversation_scenario

    if eval_metric and eval_metric.criterion is not None:
        threshold = float(eval_metric.criterion.threshold)
    else:
        threshold = 1.0

    def _status_for(value):
        return EvalStatus.PASSED if value >= threshold else EvalStatus.FAILED

    per_results: list[PerInvocationResult] = []
    for actual_invocation in actual_invocations:
        has_text = bool(_extract_text(actual_invocation).strip())
        score = 1.0 if has_text else 0.0
        per_results.append(
            PerInvocationResult(
                actual_invocation=actual_invocation,
                score=score,
                eval_status=_status_for(score),
            )
        )

    if not per_results:
        return EvaluationResult(overall_score=0.0, overall_eval_status=EvalStatus.FAILED)

    score_total = sum(result.score for result in per_results if result.score is not None)
    overall_score = score_total / len(per_results)

    return EvaluationResult(
        overall_score=overall_score,
        overall_eval_status=_status_for(overall_score),
        per_invocation_results=per_results,
    )


def cuisine_alignment_llm_judge_metric(
    eval_metric,
    actual_invocations,
    expected_invocations=None,
    conversation_scenario=None,
):
    """Public cuisine-alignment metric entry point.

    Discards ``expected_invocations`` and ``conversation_scenario`` and
    delegates to ``_score_cuisine_metric_invocations``, returning its result.
    """
    # Both arguments are accepted as part of the signature but never used.
    del expected_invocations, conversation_scenario

    return _score_cuisine_metric_invocations(eval_metric, actual_invocations)


def per_day_cuisine_alignment_llm_judge_metric(
    eval_metric,
    actual_invocations,
    expected_invocations=None,
    conversation_scenario=None,
):
    """Public per-day cuisine-alignment metric entry point.

    Discards ``expected_invocations`` and ``conversation_scenario`` and
    delegates to ``_score_cuisine_metric_invocations``, returning its result.
    """
    # Both arguments are accepted as part of the signature but never used.
    del expected_invocations, conversation_scenario

    return _score_cuisine_metric_invocations(eval_metric, actual_invocations)


def per_day_palatability_llm_judge_metric(
    eval_metric,
    actual_invocations,
    expected_invocations=None,
    conversation_scenario=None,
):
    """Public per-day palatability metric entry point.

    Discards ``expected_invocations`` and ``conversation_scenario`` and
    delegates to ``_score_palatability_metric_invocations``, returning its
    result.
    """
    # Both arguments are accepted as part of the signature but never used.
    del expected_invocations, conversation_scenario

    return _score_palatability_metric_invocations(eval_metric, actual_invocations)
