from __future__ import annotations

import argparse
from datetime import datetime
import json
import math
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any


# Directory one level above the directory that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# Default path of the category samples JSON file under ROOT/eval.
DEFAULT_SAMPLES_PATH = ROOT / "eval" / "category_samples.json"
# Default JSONL sample files under ROOT/eval.
# NOTE(review): file names suggest one DRI and one MFP sample set — confirm against the loader.
DEFAULT_SAMPLE_SOURCES = [
	ROOT / "eval" / "dri.jsonl",
	ROOT / "eval" / "mfp.jsonl",
]

# Agent name -> that agent's root directory (a same-named folder under ROOT).
# _resolve_eval_history_dir / _resolve_trace_dir look for eval and trace folders below these roots.
AGENT_CONFIGS = {
	"NoLLMCanadaDRIPlanner": ROOT / "NoLLMCanadaDRIPlanner",
	"NoLLMMFPNutritionPlanner": ROOT / "NoLLMMFPNutritionPlanner",
	"BaselineCanadaDRIPlanner": ROOT / "BaselineCanadaDRIPlanner",
	"BaselineMFPNutritionPlanner": ROOT / "BaselineMFPNutritionPlanner",
	"OptimizerCanadaDRIPlanner": ROOT / "OptimizerCanadaDRIPlanner",
	"OptimizerMFPNutritionPlanner": ROOT / "OptimizerMFPNutritionPlanner",
}

# Three-element tuples: a group label followed by two agent names.
# NOTE(review): the names suggest (label, baseline agent, optimizer agent) — confirm at the use site.
AGENT_COMPARISON_GROUPS = [
	(
		"Canada Health DRI",
		"BaselineCanadaDRIPlanner",
		"OptimizerCanadaDRIPlanner",
	),
	(
		"MyFitnessPal",
		"BaselineMFPNutritionPlanner",
		"OptimizerMFPNutritionPlanner",
	),
]

# Names of the agents whose name contains "CanadaDRI" (every such key in AGENT_CONFIGS).
DRI_AGENTS = {
	"NoLLMCanadaDRIPlanner",
	"BaselineCanadaDRIPlanner",
	"OptimizerCanadaDRIPlanner",
}

# Names of the agents whose name contains "MFP" (every such key in AGENT_CONFIGS).
MFP_AGENTS = {
	"NoLLMMFPNutritionPlanner",
	"BaselineMFPNutritionPlanner",
	"OptimizerMFPNutritionPlanner",
}

# Ordered (factor, category) pairs written in the lower-case, space-separated key form that
# _canonical_key produces. Every pair here also appears as a key of DISPLAY_LABELS.
# NOTE(review): presumably the row order of the generated results table — confirm at the use site.
TABLE_ROW_ORDER = [
	("overall", "baseline"),
	("age", "young"),
	("age", "old"),
	("activity level", "inactive"),
	("activity level", "low active"),
	("activity level", "active"),
	("activity level", "very active"),
	("sex", "male"),
	("sex", "female"),
	("pregnancy status", "not pregnant"),
	("pregnancy status", "pregnant"),
	("gestation stage", "first half (<20 weeks)"),
	("gestation stage", "second half (>=20 weeks)"),
	("lactation status", "none"),
	("lactation status", "0-6 months postpartum"),
	("lactation status", "7-12 months postpartum"),
	("prepregnancy bmi class", "uw"),
	("prepregnancy bmi class", "nw"),
	("prepregnancy bmi class", "ow"),
	("prepregnancy bmi class", "ob"),
]

# All six agent names from AGENT_CONFIGS in a fixed order: NoLLM agents first, then each
# Baseline/Optimizer pair (DRI before MFP).
# NOTE(review): presumably the default processing/column order — confirm at the use site.
DEFAULT_AGENT_ORDER = [
	"NoLLMCanadaDRIPlanner",
	"NoLLMMFPNutritionPlanner",
	"BaselineCanadaDRIPlanner",
	"OptimizerCanadaDRIPlanner",
	"BaselineMFPNutritionPlanner",
	"OptimizerMFPNutritionPlanner",
]

# Row-by-row layout entries of the form (row kind, LaTeX label, key):
#   * "group" rows carry only a heading label and use None as the key;
#   * "data" rows carry a (factor, category) key in the same form as TABLE_ROW_ORDER.
# Labels are LaTeX source: "\\quad" sequences indent nested rows and "--" is an en dash.
LATEX_TABLE_LAYOUT = [
	("data", "All", ("overall", "baseline")),
	("group", "Age", None),
	("data", "\\quad -Young", ("age", "young")),
	("data", "\\quad -Old", ("age", "old")),
	("group", "Activity level", None),
	("data", "\\quad -Inactive", ("activity level", "inactive")),
	("data", "\\quad -Low active", ("activity level", "low active")),
	("data", "\\quad -Active", ("activity level", "active")),
	("data", "\\quad -Very active", ("activity level", "very active")),
	("group", "Sex", None),
	("data", "\\quad -Male", ("sex", "male")),
	("data", "\\quad -Female", ("sex", "female")),
	("group", "\\quad\\quad Pregnancy status", None),
	("data", "\\quad\\quad\\quad -Not pregnant", ("pregnancy status", "not pregnant")),
	("data", "\\quad\\quad\\quad -Pregnant", ("pregnancy status", "pregnant")),
	("group", "\\quad\\quad Gestation stage", None),
	("data", "\\quad\\quad\\quad -First half ($<20$ weeks)", ("gestation stage", "first half (<20 weeks)")),
	("data", "\\quad\\quad\\quad -Second half ($\\geq 20$ weeks)", ("gestation stage", "second half (>=20 weeks)")),
	("group", "\\quad\\quad Lactation status", None),
	("data", "\\quad\\quad\\quad -None", ("lactation status", "none")),
	("data", "\\quad\\quad\\quad -0--6 months postpartum", ("lactation status", "0-6 months postpartum")),
	("data", "\\quad\\quad\\quad -7--12 months postpartum", ("lactation status", "7-12 months postpartum")),
	("group", "\\quad\\quad Prepregnancy BMI class", None),
	("data", "\\quad\\quad\\quad -UW", ("prepregnancy bmi class", "uw")),
	("data", "\\quad\\quad\\quad -NW", ("prepregnancy bmi class", "nw")),
	("data", "\\quad\\quad\\quad -OW", ("prepregnancy bmi class", "ow")),
	("data", "\\quad\\quad\\quad -OB", ("prepregnancy bmi class", "ob")),
]

# (factor, category) key -> (factor display label, category display label).
# Keys use the lower-case canonical form; values are the title-cased strings shown to readers.
# NOTE(review): "Prepregnancy Bmi Class" spells the acronym as "Bmi" while LATEX_TABLE_LAYOUT
# uses "BMI" — confirm whether that difference is intended.
DISPLAY_LABELS = {
	("overall", "baseline"): ("Overall", "Baseline"),
	("age", "young"): ("Age", "Young"),
	("age", "old"): ("Age", "Old"),
	("activity level", "inactive"): ("Activity Level", "Inactive"),
	("activity level", "low active"): ("Activity Level", "Low Active"),
	("activity level", "active"): ("Activity Level", "Active"),
	("activity level", "very active"): ("Activity Level", "Very Active"),
	("pregnancy status", "not pregnant"): ("Pregnancy Status", "Not Pregnant"),
	("pregnancy status", "pregnant"): ("Pregnancy Status", "Pregnant"),
	("gestation stage", "first half (<20 weeks)"): ("Gestation Stage", "First Half (<20 weeks)"),
	("gestation stage", "second half (>=20 weeks)"): ("Gestation Stage", "Second Half (>=20 weeks)"),
	("lactation status", "none"): ("Lactation Status", "None"),
	("lactation status", "0-6 months postpartum"): ("Lactation Status", "0--6 Months Postpartum"),
	("lactation status", "7-12 months postpartum"): ("Lactation Status", "7--12 Months Postpartum"),
	("prepregnancy bmi class", "uw"): ("Prepregnancy Bmi Class", "UW"),
	("prepregnancy bmi class", "nw"): ("Prepregnancy Bmi Class", "NW"),
	("prepregnancy bmi class", "ow"): ("Prepregnancy Bmi Class", "OW"),
	("prepregnancy bmi class", "ob"): ("Prepregnancy Bmi Class", "OB"),
	("sex", "male"): ("Sex", "Male"),
	("sex", "female"): ("Sex", "Female"),
}

# Canonical cuisine name -> list of lower-case spellings accepted for it.
# Only "middle eastern" currently has aliases beyond its own name.
# NOTE(review): presumably matched against normalized text when scoring cuisine alignment — confirm at the use site.
CUISINE_ALIASES = {
	"american": ["american"],
	"chinese": ["chinese"],
	"indian": ["indian"],
	"italian": ["italian"],
	"japanese": ["japanese"],
	"korean": ["korean"],
	"mediterranean": ["mediterranean"],
	"mexican": ["mexican"],
	"middle eastern": ["middle eastern", "middle-eastern", "levant", "levantine", "arabic"],
	"thai": ["thai"],
}

# Canonical nutrient key (underscore form) -> list of lower-case, space-separated spellings
# accepted for it, covering both "fibre" and "fiber".
NUTRIENT_ALIASES = {
	"protein": ["protein"],
	"carbohydrates": ["carbohydrates", "carbs"],
	"total_fat": ["total fat", "fat", "fats"],
	"total_fibre": ["total fibre", "fibre", "total fiber", "fiber"],
	"calories": ["calories", "energy"],
}


def _normalize_text(value: str) -> str:
	text = value.lower()
	text = text.replace("\u2013", "-").replace("\u2014", "-")
	text = re.sub(r"\s+", " ", text)
	return text.strip()


def _canonical_key(text: str) -> str:
	"""Return the normalized form of ``text`` with underscores turned into spaces."""
	normalized = _normalize_text(text)
	return normalized.replace("_", " ")


def _normalize_activity_category(value: str) -> str:
	"""Map an activity-level label onto one of the four table categories.

	The label is canonicalized first; recognised synonyms collapse to
	"inactive", "low active", "active" or "very active". Any other label is
	returned in its canonical form unchanged.
	"""
	canonical = _canonical_key(value)
	synonym_groups = (
		("inactive", ("sedentary", "inactive")),
		("low active", ("lightly active", "low active")),
		("active", ("moderately active", "active")),
		("very active", ("extra active", "very active")),
	)
	for category, synonyms in synonym_groups:
		if canonical in synonyms:
			return category
	return canonical


def _to_years(age: Any, age_unit: Any) -> float | None:
	"""Convert an age to years.

	Returns None when ``age`` cannot be parsed as a number. A falsy
	``age_unit`` is treated as "years"; only "month"/"months" trigger a
	division by 12, every other unit leaves the value untouched.
	"""
	years = _to_float(age)
	if years is None:
		return None
	unit_label = _canonical_key(str(age_unit or "years"))
	given_in_months = unit_label in ("month", "months")
	return years / 12.0 if given_in_months else years


def _to_weight_kg(weight: Any, unit: Any) -> float | None:
	"""Convert a weight to kilograms.

	Returns None when ``weight`` cannot be parsed as a number. A falsy
	``unit`` is treated as "kg"; pound spellings are divided by 2.2046226218
	and every other unit leaves the value untouched.
	"""
	amount = _to_float(weight)
	if amount is None:
		return None
	unit_label = _canonical_key(str(unit or "kg"))
	given_in_pounds = unit_label in ("lb", "lbs", "pound", "pounds")
	return amount / 2.2046226218 if given_in_pounds else amount


def _to_height_cm(sample: dict[str, Any]) -> float | None:
	"""Return the sample's height in centimetres, or None when it cannot be determined.

	Reads ``sample["height"]`` and ``sample["height_unit"]``. A missing, None
	or empty unit is treated as centimetres, mirroring how ``_to_years`` and
	``_to_weight_kg`` default their units. Supported units:

	* centimetres ("cm", "centimeter(s)", "centimetre(s)")
	* metres ("m", "meter(s)", "metre(s)")
	* inches ("in", "inch", "inches")
	* feet, optionally combined with ``sample["height_inches"]``
	  ("ft", "feet", "foot", "ft in", "feet inch(es)")

	Returns None when the height is not numeric or the unit is not recognised.
	"""
	height_value = _to_float(sample.get("height"))
	if height_value is None:
		return None
	# `or "cm"` instead of a .get() default: an explicit None/"" unit must also
	# fall back to centimetres rather than becoming the bogus unit "none".
	height_unit = _canonical_key(str(sample.get("height_unit") or "cm"))
	if height_unit in {"cm", "centimeter", "centimeters", "centimetre", "centimetres"}:
		return height_value
	if height_unit in {"m", "meter", "meters", "metre", "metres"}:
		return height_value * 100.0
	if height_unit in {"in", "inch", "inches"}:
		return height_value * 2.54
	# _canonical_key already turns "ft_in" into "ft in", so only the spaced form is listed.
	if height_unit in {"ft in", "feet inches", "feet inch", "ft", "feet", "foot"}:
		inches = _to_float(sample.get("height_inches")) or 0.0
		return (height_value * 30.48) + (inches * 2.54)
	return None


def _prepregnancy_bmi_class(sample: dict[str, Any]) -> str | None:
	"""Return the prepregnancy BMI class of ``sample``.

	A non-blank ``prepregnancy_bmi_class`` string on the sample wins and is
	returned in canonical form. Otherwise the BMI is computed from the
	prepregnancy weight and the height and bucketed: below 18.5 -> "uw",
	below 25 -> "nw", below 30 -> "ow", anything else -> "ob". Returns None
	when weight or height is unavailable or the height is not positive.
	"""
	declared = sample.get("prepregnancy_bmi_class")
	if isinstance(declared, str) and declared.strip():
		return _canonical_key(declared)

	weight_kg = _to_weight_kg(sample.get("prepregnancy_weight"), sample.get("prepregnancy_weight_unit"))
	height_cm = _to_height_cm(sample)
	if weight_kg is None or height_cm is None or height_cm <= 0:
		return None

	height_m = height_cm / 100.0
	bmi = weight_kg / (height_m ** 2)
	# Exclusive upper limits, checked in ascending order.
	for upper_limit, label in ((18.5, "uw"), (25.0, "nw"), (30.0, "ow")):
		if bmi < upper_limit:
			return label
	return "ob"


def _derive_factor_categories_from_sample(sample: dict[str, Any]) -> list[tuple[str, str]]:
	"""Derive the (factor, category) pairs that describe ``sample``.

	Pairs are canonicalized, de-duplicated and returned in the order they are
	derived: age, activity level, sex, pregnancy status, then (for pregnant
	samples only) gestation stage and prepregnancy BMI class, and finally
	lactation status. Factors whose source field is missing or blank are
	omitted, except pregnancy status which is always emitted.
	"""
	pairs: list[tuple[str, str]] = []

	def record(factor: str, category: str | None) -> None:
		# Skip empty categories and keep the first occurrence of each pair.
		if category:
			pair = (_canonical_key(factor), _canonical_key(category))
			if pair not in pairs:
				pairs.append(pair)

	years = _to_years(sample.get("age"), sample.get("age_unit"))
	if years is not None:
		# Strictly older than 50 counts as "old".
		record("age", "old" if years > 50.0 else "young")

	activity_label = sample.get("activity_level")
	if isinstance(activity_label, str) and activity_label.strip():
		record("activity level", _normalize_activity_category(activity_label))

	sex_label = sample.get("sex")
	if isinstance(sex_label, str) and sex_label.strip():
		record("sex", sex_label)

	pregnant = bool(sample.get("is_pregnant", False))
	record("pregnancy status", "pregnant" if pregnant else "not pregnant")

	if pregnant:
		weeks = _to_float(sample.get("gestation_weeks"))
		if weeks is not None:
			stage = "first half (<20 weeks)" if weeks < 20 else "second half (>=20 weeks)"
			record("gestation stage", stage)

		bmi_label = _prepregnancy_bmi_class(sample)
		if bmi_label is not None:
			record("prepregnancy bmi class", bmi_label)

	lactation_label = sample.get("lactation_status")
	if isinstance(lactation_label, str) and lactation_label.strip():
		canonical = _canonical_key(lactation_label)
		lactation_synonyms = {
			"none": "none",
			"no": "none",
			"not lactating": "none",
			"0-6 months postpartum": "0-6 months postpartum",
			"0 to 6 months postpartum": "0-6 months postpartum",
			"0 6 months postpartum": "0-6 months postpartum",
			"7-12 months postpartum": "7-12 months postpartum",
			"7 to 12 months postpartum": "7-12 months postpartum",
			"7 12 months postpartum": "7-12 months postpartum",
		}
		# Unknown labels pass through in their canonical form.
		record("lactation status", lactation_synonyms.get(canonical, canonical))

	return pairs


@dataclass
class MetricBounds:
	"""Optional lower/upper bound pair for one nutrient or calorie target.

	Both sides default to None. The extractors in this module set both sides to
	the same value for an exact target and leave ``upper`` as None for a
	minimum-only target (total fibre).
	"""
	# Lower bound, or None when no lower bound is known.
	lower: float | None = None
	# Upper bound, or None when no upper bound is known.
	upper: float | None = None


@dataclass
class CaseMetrics:
	"""Flat record of the values collected for one agent evaluation case.

	NOTE(review): the code that populates this record is outside the visible
	part of the file; the field notes below are inferred from the field names
	and the extractor helpers in this module — confirm against the constructor call.
	"""
	# Identification of the case.
	agent: str
	eval_id: str
	profile_sentence: str
	# presumably the eval-history file timestamp (see _parse_history_timestamp) — TODO confirm
	eval_timestamp: float
	# (factor, category) pairs, presumably in the form _derive_factor_categories_from_sample returns
	factor_categories: list[tuple[str, str]]
	# presumably the value from _extract_requested_cuisine; None when no cuisine was found
	requested_cuisine: str | None
	# Target bounds per nutrient.
	calories_target: MetricBounds
	protein_target: MetricBounds
	carbohydrates_target: MetricBounds
	total_fat_target: MetricBounds
	total_fibre_target: MetricBounds
	# presumably canonical nutrient key -> achieved value (shape of _extract_nutrient_values) — TODO confirm
	achieved: dict[str, float]
	# presumably min/max over the per-day calorie values — TODO confirm
	min_daily_calories: float | None
	max_daily_calories: float | None
	# Quality scores; None when unavailable.
	cuisine_alignment_score: float | None
	palatability_score: float | None
	# presumably the values from _extract_latency_seconds / _extract_tool_retries — TODO confirm
	latency_seconds: float | None
	tool_retries: int | None


def _parse_history_timestamp(path: Path) -> float:
	match = re.search(r"_(\d+(?:\.\d+)?)\.evalset_result\.json$", path.name)
	if not match:
		return path.stat().st_mtime
	try:
		return float(match.group(1))
	except ValueError:
		return path.stat().st_mtime


def _extract_parts_text(content: dict[str, Any] | None) -> str:
	if not isinstance(content, dict):
		return ""
	parts = content.get("parts")
	if not isinstance(parts, list):
		return ""
	texts = [part.get("text", "") for part in parts if isinstance(part, dict) and part.get("text")]
	return "\n".join(texts).strip()


def _extract_user_text(case_result: dict[str, Any]) -> str:
	"""Return the user message text of the first invocation of ``case_result``.

	Reads ``eval_metric_result_per_invocation[0].actual_invocation.user_content``
	and returns "" when there are no invocations or no text.
	"""
	per_invocation = case_result.get("eval_metric_result_per_invocation") or []
	if per_invocation:
		first_actual = (per_invocation[0] or {}).get("actual_invocation") or {}
		return _extract_parts_text(first_actual.get("user_content"))
	return ""


def _extract_final_response_text(case_result: dict[str, Any]) -> str:
	"""Return the final response text of the first invocation of ``case_result``.

	Reads ``eval_metric_result_per_invocation[0].actual_invocation.final_response``
	and returns "" when there are no invocations or no text.
	"""
	per_invocation = case_result.get("eval_metric_result_per_invocation") or []
	if per_invocation:
		first_actual = (per_invocation[0] or {}).get("actual_invocation") or {}
		return _extract_parts_text(first_actual.get("final_response"))
	return ""


def _iter_invocation_events(case_result: dict[str, Any]) -> list[dict[str, Any]]:
	invocations = case_result.get("eval_metric_result_per_invocation") or []
	if not invocations:
		return []
	actual = (invocations[0] or {}).get("actual_invocation") or {}
	intermediate = actual.get("intermediate_data") or {}
	events = intermediate.get("invocation_events")
	return events if isinstance(events, list) else []


def _extract_timestamp_from_part_metadata(part: dict[str, Any]) -> float | None:
	"""Return the first numeric timestamp found in ``part["part_metadata"]``.

	Known timestamp keys are tried in a fixed order and the first one that
	parses as a number wins. Returns None when the metadata is not a dict or
	none of the keys holds a number.
	"""
	metadata = part.get("part_metadata")
	if isinstance(metadata, dict):
		timestamp_keys = (
			"creation_timestamp",
			"timestamp",
			"created_at",
			"generated_at",
			"response_timestamp",
			"event_timestamp",
		)
		# Lazy generator: stops parsing as soon as one key yields a number.
		parsed = (_to_float(metadata.get(name)) for name in timestamp_keys)
		return next((stamp for stamp in parsed if stamp is not None), None)
	return None


def _extract_final_message_timestamp(case_result: dict[str, Any], eval_timestamp: float) -> float | None:
	"""Return the best available timestamp for the final response.

	Sources, in priority order:

	1. a timestamp in the part metadata of the final response, scanning the
	   parts from last to first;
	2. the latest timestamp among the invocation events;
	3. ``eval_timestamp``, which the caller supplies as the fallback value.
	"""
	# 1) Explicit timestamps on the final response parts.
	per_invocation = case_result.get("eval_metric_result_per_invocation") or []
	if per_invocation:
		first_actual = (per_invocation[0] or {}).get("actual_invocation") or {}
		response = first_actual.get("final_response")
		response_parts = response.get("parts") if isinstance(response, dict) else None
		if isinstance(response_parts, list):
			for part in response_parts[::-1]:
				if not isinstance(part, dict):
					continue
				stamp = _extract_timestamp_from_part_metadata(part)
				if stamp is not None:
					return stamp

	# 2) Latest timestamp carried by any invocation event.
	event_stamps = [
		stamp
		for stamp in (
			_to_float(event.get("creation_timestamp") or event.get("timestamp") or event.get("event_timestamp"))
			for event in _iter_invocation_events(case_result)
			if isinstance(event, dict)
		)
		if stamp is not None
	]

	# 3) Otherwise fall back to the eval-history timestamp.
	return max(event_stamps) if event_stamps else eval_timestamp


def _extract_latency_seconds(case_result: dict[str, Any], eval_timestamp: float) -> float | None:
	"""Return the time between the invocation's creation and its final message.

	The start is ``actual_invocation["creation_timestamp"]`` of the first
	invocation; the end comes from ``_extract_final_message_timestamp``.
	Returns None when there are no invocations, either timestamp is missing,
	or the difference is negative.
	"""
	per_invocation = case_result.get("eval_metric_result_per_invocation") or []
	if not per_invocation:
		return None
	first_actual = (per_invocation[0] or {}).get("actual_invocation") or {}

	started_at = _to_float(first_actual.get("creation_timestamp"))
	if started_at is None:
		return None
	finished_at = _extract_final_message_timestamp(case_result, eval_timestamp)
	if finished_at is None:
		return None

	elapsed = finished_at - started_at
	return None if elapsed < 0 else elapsed


def _extract_tool_retries(case_result: dict[str, Any]) -> int | None:
	"""Count repeated tool calls in the invocation events.

	Every ``function_call`` part with a non-blank string name is collected; the
	result is the number of calls minus the number of distinct names, i.e. how
	many calls repeated a name that was already used. Returns None when there
	are no events at all and 0 when no tool was called.
	"""
	events = _iter_invocation_events(case_result)
	if not events:
		return None

	called_names: list[str] = []
	for event in events:
		if not isinstance(event, dict):
			continue
		content = event.get("content")
		if not isinstance(content, dict):
			continue
		parts = content.get("parts")
		if not isinstance(parts, list):
			continue
		for part in parts:
			call = part.get("function_call") if isinstance(part, dict) else None
			name = call.get("name") if isinstance(call, dict) else None
			if isinstance(name, str) and name.strip():
				called_names.append(name)

	repeated = len(called_names) - len(set(called_names))
	return max(0, repeated)


def _extract_function_responses(case_result: dict[str, Any], function_name: str) -> list[dict[str, Any]]:
	"""Collect the response payloads of one tool from the invocation events.

	Scans every part of every event for a ``function_response`` dict whose
	``name`` equals ``function_name`` and returns the dict-valued ``response``
	payloads in event order. Malformed events and parts are skipped.
	"""
	payloads: list[dict[str, Any]] = []
	for event in _iter_invocation_events(case_result):
		if not isinstance(event, dict):
			continue
		content = event.get("content")
		if not isinstance(content, dict):
			continue
		parts = content.get("parts")
		if not isinstance(parts, list):
			continue
		for part in parts:
			reply = part.get("function_response") if isinstance(part, dict) else None
			if not isinstance(reply, dict):
				continue
			if reply.get("name") == function_name:
				body = reply.get("response")
				if isinstance(body, dict):
					payloads.append(body)
	return payloads


def _calculator_response_order_key(response: dict[str, Any]) -> tuple[int, float]:
	"""Build the sort key used to rank calculator responses (smaller is better).

	The key is ``(rank, score)``: rank is 0 for feasible responses and 1
	otherwise; score is ``feasibility_score`` when numeric, else 0.0 for
	feasible responses and infinity for infeasible ones.
	"""
	feasible = bool(response.get("is_feasible", False))
	score = _to_float(response.get("feasibility_score"))
	if score is None:
		score = 0.0 if feasible else float("inf")
	rank = 0 if feasible else 1
	return (rank, score)


def _select_best_calculator_response(responses: list[dict[str, Any]]) -> dict[str, Any] | None:
	"""Return the response with the smallest order key, or None if there is none.

	Non-dict entries are ignored. On ties the later response wins because the
	comparison is "less than or equal".
	"""
	chosen: dict[str, Any] | None = None
	chosen_key: tuple[int, float] | None = None
	for candidate in responses:
		if isinstance(candidate, dict):
			key = _calculator_response_order_key(candidate)
			if chosen_key is None or key <= chosen_key:
				chosen, chosen_key = candidate, key
	return chosen


def _to_float(value: Any) -> float | None:
	if isinstance(value, (int, float)):
		return float(value)
	if isinstance(value, str):
		try:
			return float(value.replace(",", "").strip())
		except ValueError:
			return None
	return None


def _extract_requested_cuisine(user_text: str) -> str | None:
	"""Return the cuisine the user asked for, or None when none is mentioned.

	The normalized text is searched for "i want to eat X food",
	"i usually eat X food" and "prefer/like X food|cuisine", in that order;
	the first match's X is returned in normalized form.
	"""
	normalized = _normalize_text(user_text)
	cuisine_patterns = (
		r"\bi want to eat\s+([a-z][a-z\s\-]{1,40}?)\s+food\b",
		r"\bi usually eat\s+([a-z][a-z\s\-]{1,40}?)\s+food\b",
		r"\b(?:prefer|like)\s+([a-z][a-z\s\-]{1,40}?)\s+(?:food|cuisine)\b",
	)
	for cuisine_pattern in cuisine_patterns:
		hit = re.search(cuisine_pattern, normalized)
		if hit:
			return _normalize_text(hit.group(1))
	return None


def _extract_dri_bounds(case_result: dict[str, Any]) -> dict[str, MetricBounds]:
	"""Build target bounds from the last ``calculate_health_canada_dri`` response.

	Returns a dict with the keys "calories", "protein", "carbohydrates",
	"total_fat" and "total_fibre"; entries stay empty ``MetricBounds()`` when
	the response does not provide them.

	* Calories: exact target from ``calories.eer_kcal`` (or a scalar
	  ``calories``), overridden by an EER value in ``recommended_g_per_day``
	  when present.
	* Protein / carbohydrates / fat: lower and upper from the macronutrient
	  ranges table.
	* Total fibre: minimum only, from the recommended-per-day table.
	"""
	nutrient_keys = ("calories", "protein", "carbohydrates", "total_fat", "total_fibre")
	bounds = {nutrient: MetricBounds() for nutrient in nutrient_keys}

	responses = _extract_function_responses(case_result, "calculate_health_canada_dri")
	if not responses:
		return bounds
	dri = responses[-1]

	# Calories may be a nested dict carrying the EER or a bare number.
	calories_field = dri.get("calories")
	if isinstance(calories_field, dict):
		energy = _to_float(calories_field.get("eer_kcal"))
	else:
		energy = _to_float(calories_field)
	if energy is not None:
		bounds["calories"] = MetricBounds(lower=energy, upper=energy)

	# An EER in the recommended-per-day table takes precedence over the value above.
	per_day = dri.get("recommended_g_per_day")
	if isinstance(per_day, dict):
		energy_override = _to_float(
			per_day.get("calories eer_kcal")
			or per_day.get("calories_eer_kcal")
			or per_day.get("eer_kcal")
		)
		if energy_override is not None:
			bounds["calories"] = MetricBounds(lower=energy_override, upper=energy_override)

	macro_ranges = dri.get("macronutrient_ranges_g_per_day")
	if not isinstance(macro_ranges, dict):
		macro_ranges = dri.get("recommended_macronutrient_ranges_g_per_day")
	if isinstance(macro_ranges, dict):
		source_to_target = {"Protein": "protein", "Carbohydrates": "carbohydrates", "Fats": "total_fat"}
		for source_name, target_key in source_to_target.items():
			entry = macro_ranges.get(source_name)
			if isinstance(entry, dict):
				bounds[target_key] = MetricBounds(
					lower=_to_float(entry.get("lower")),
					upper=_to_float(entry.get("upper")),
				)

	fibre_source = dri.get("macronutrient_recommended_g_per_day")
	if not isinstance(fibre_source, dict):
		fibre_source = dri.get("recommended_g_per_day")
	if isinstance(fibre_source, dict):
		fibre_labels = ("Total fibre", "Total fiber", "total fibre", "total fiber")
		parsed = (_to_float(fibre_source.get(label)) for label in fibre_labels)
		fibre_floor = next((amount for amount in parsed if amount is not None), None)
		if fibre_floor is not None:
			# Fibre is a minimum-only target.
			bounds["total_fibre"] = MetricBounds(lower=fibre_floor, upper=None)
	return bounds


def _extract_mfp_bounds(case_result: dict[str, Any]) -> dict[str, MetricBounds]:
	"""Build target bounds from the last ``calculate_mfp_macros`` response.

	Returns a dict with the keys "calories", "protein", "carbohydrates",
	"total_fat" and "total_fibre". Each value found is stored as an exact
	target (lower == upper); entries stay empty ``MetricBounds()`` when the
	response does not provide them.

	Key aliases are tried in order and the first one that parses as a number
	wins. Unlike an ``a or b`` chain this keeps a legitimate target of 0 and
	still tries later aliases when an earlier one holds an unparsable value.
	"""

	def first_number(source: dict[str, Any], *keys: str) -> float | None:
		# First alias whose value parses as a number; 0 counts as a value.
		for key in keys:
			number = _to_float(source.get(key))
			if number is not None:
				return number
		return None

	bounds = {
		"calories": MetricBounds(),
		"protein": MetricBounds(),
		"carbohydrates": MetricBounds(),
		"total_fat": MetricBounds(),
		"total_fibre": MetricBounds(),
	}
	mfp_responses = _extract_function_responses(case_result, "calculate_mfp_macros")
	if not mfp_responses:
		return bounds

	mfp = mfp_responses[-1]
	calories = first_number(mfp, "target_calories_kcal", "calories", "calories_kcal")
	if calories is not None:
		bounds["calories"] = MetricBounds(lower=calories, upper=calories)

	macros = mfp.get("macros_g_per_day")
	if isinstance(macros, dict):
		macro_aliases = (
			("protein", ("protein",)),
			("carbohydrates", ("carbohydrates", "carbs")),
			("total_fat", ("total_fat", "fat")),
		)
		for target_key, aliases in macro_aliases:
			amount = first_number(macros, *aliases)
			if amount is not None:
				bounds[target_key] = MetricBounds(lower=amount, upper=amount)

	fibre = first_number(mfp, "target_fibre_g", "fibre_g", "fiber_g")
	if fibre is not None:
		bounds["total_fibre"] = MetricBounds(lower=fibre, upper=fibre)

	return bounds


def _extract_bounds_from_summary_table(case_result: dict[str, Any]) -> dict[str, MetricBounds]:
	"""Parse target bounds from a pipe-delimited table in the final response text.

	Every line containing "|" with at least three non-empty cells is treated as
	a table row: the first cell is the nutrient label and the second cell is
	the target. Two or more numbers in the target give (lower, upper); a single
	number gives an exact target, except for fibre where it is a minimum only.
	Rows with unknown labels (headers, separators) or without numbers are
	ignored. Returns empty ``MetricBounds()`` entries for nutrients not found.

	Numbers may use "," as a thousands separator ("2,000" is 2000, not the two
	numbers 2 and 0), and Markdown emphasis around the label ("**Protein**")
	is ignored.
	NOTE(review): a range written as "100,200" without a space is read as the
	single number 100200 — confirm that agents never emit ranges in that form.
	"""
	bounds = {
		"calories": MetricBounds(),
		"protein": MetricBounds(),
		"carbohydrates": MetricBounds(),
		"total_fat": MetricBounds(),
		"total_fibre": MetricBounds(),
	}
	text = _extract_final_response_text(case_result)
	if not text:
		return bounds

	label_to_key = {
		"protein": "protein",
		"carbs": "carbohydrates",
		"carbohydrates": "carbohydrates",
		"fat": "total_fat",
		"fats": "total_fat",
		"total fat": "total_fat",
		"calories": "calories",
		"energy": "calories",
		"fibre": "total_fibre",
		"fiber": "total_fibre",
		"total fibre": "total_fibre",
		"total fiber": "total_fibre",
	}
	# Thousands-grouped numbers must be tried first so "2,000" is one token.
	number_pattern = re.compile(r"\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?")

	for raw_line in text.splitlines():
		line = raw_line.strip()
		if "|" not in line:
			continue
		cells = [cell.strip() for cell in line.split("|") if cell.strip()]
		if len(cells) < 3:
			continue

		# Drop Markdown emphasis/code markers wrapped around the label.
		label = _canonical_key(cells[0].strip("*_` "))
		key = label_to_key.get(label)
		if key is None:
			# Header rows, separator rows and unrelated parameters end up here.
			continue

		numbers = [float(token.replace(",", "")) for token in number_pattern.findall(cells[1])]
		if not numbers:
			continue

		if len(numbers) >= 2:
			bounds[key] = MetricBounds(lower=numbers[0], upper=numbers[1])
		elif key == "total_fibre":
			bounds[key] = MetricBounds(lower=numbers[0], upper=None)
		else:
			bounds[key] = MetricBounds(lower=numbers[0], upper=numbers[0])

	return bounds


def _extract_target_bounds(case_result: dict[str, Any]) -> dict[str, MetricBounds]:
	"""Return target bounds from the best available source.

	Prefers the DRI calculator response, then the MFP calculator response, and
	falls back to parsing the summary table in the final response text.
	"""
	structured_sources = (
		("calculate_health_canada_dri", _extract_dri_bounds),
		("calculate_mfp_macros", _extract_mfp_bounds),
	)
	for tool_name, extractor in structured_sources:
		if _extract_function_responses(case_result, tool_name):
			return extractor(case_result)
	return _extract_bounds_from_summary_table(case_result)


def _extract_nutrient_values(payload: dict[str, Any]) -> dict[str, float]:
	"""Read nutrient amounts from ``payload`` under their canonical keys.

	For each canonical nutrient the aliases are tried in order and the first
	one holding a number is used. Nutrients without any numeric alias are left
	out of the result.
	"""
	alias_table = (
		("protein", ("protein",)),
		("carbohydrates", ("carbohydrates", "carbs")),
		("total_fat", ("total_fat", "fat", "fats")),
		("total_fibre", ("total_fibre", "total_fiber", "fibre", "fiber")),
		("calories", ("calories", "energy", "target_calories_kcal")),
	)
	found: dict[str, float] = {}
	for canonical, aliases in alias_table:
		parsed = (_to_float(payload.get(alias)) for alias in aliases)
		amount = next((number for number in parsed if number is not None), None)
		if amount is not None:
			found[canonical] = amount
	return found


def _extract_achieved_from_optimize_quantity(case_result: dict[str, Any]) -> dict[str, float]:
	"""Return the achieved nutrient values reported by ``optimize_quantity``.

	Responses are examined from newest to oldest. Within a response the known
	summary sub-dicts are tried in order, then the response's own top-level
	keys. The first non-empty result wins; {} is returned when nothing matches.
	"""
	summary_keys = (
		"average_macro_nutrient_from_calculated_quantity_per_day",
		"achieved_targets_per_day",
		"achieved_macros_per_day",
		"achieved_daily_macros",
		"average_daily_macros",
	)
	responses = _extract_function_responses(case_result, "optimize_quantity")
	for response in responses[::-1]:
		for summary_key in summary_keys:
			summary = response.get(summary_key)
			if isinstance(summary, dict):
				values = _extract_nutrient_values(summary)
				if values:
					return values
		# Last resort: nutrient keys directly on the response.
		if isinstance(response, dict):
			values = _extract_nutrient_values(response)
			if values:
				return values
	return {}


def _extract_achieved_from_mfp_optimizer(case_result: dict[str, Any]) -> dict[str, float]:
	"""Return the achieved nutrient values reported by ``optimize_quantity_for_mfp_targets``.

	Responses are examined from newest to oldest. Within a response the known
	summary sub-dicts are tried in order (``achieved_targets_per_day`` first),
	then the response's own top-level keys. The first non-empty result wins;
	{} is returned when nothing matches.
	"""
	summary_keys = (
		"achieved_targets_per_day",
		"average_macro_nutrient_from_calculated_quantity_per_day",
		"achieved_macros_per_day",
		"achieved_daily_macros",
		"average_daily_macros",
	)
	responses = _extract_function_responses(case_result, "optimize_quantity_for_mfp_targets")
	for response in responses[::-1]:
		for summary_key in summary_keys:
			summary = response.get(summary_key)
			if isinstance(summary, dict):
				values = _extract_nutrient_values(summary)
				if values:
					return values
		# Last resort: nutrient keys directly on the response.
		if isinstance(response, dict):
			values = _extract_nutrient_values(response)
			if values:
				return values
	return {}


def _extract_achieved_from_macro_average(case_result: dict[str, Any]) -> dict[str, float]:
	"""Return the achieved nutrient values from ``calculate_average_macro_nutrient_per_day``.

	Uses the best-ranked calculator response (falling back to the last one),
	tries its known summary sub-dicts in order and finally the response's own
	top-level keys. Returns {} when the tool was never called.
	"""
	responses = _extract_function_responses(case_result, "calculate_average_macro_nutrient_per_day")
	if not responses:
		return {}

	chosen = _select_best_calculator_response(responses) or responses[-1]
	summary_keys = (
		"average_macro_nutrient_per_day",
		"average_macro_nutrient_from_calculated_quantity_per_day",
	)
	for summary_key in summary_keys:
		summary = chosen.get(summary_key)
		if isinstance(summary, dict):
			values = _extract_nutrient_values(summary)
			if values:
				return values

	if isinstance(chosen, dict):
		return _extract_nutrient_values(chosen)
	return {}


def _extract_achieved_from_structured(agent_name: str, case_result: dict[str, Any]) -> dict[str, float]:
	"""Return achieved nutrient values using the extractors suited to ``agent_name``.

	Each agent has an ordered chain of extractors; the first non-empty result
	is returned, otherwise the (empty) result of the last extractor in the
	chain. Unknown agents try macro average, MFP optimizer and
	``optimize_quantity``, in that order.
	"""
	from_quantity = _extract_achieved_from_optimize_quantity
	from_mfp = _extract_achieved_from_mfp_optimizer
	from_average = _extract_achieved_from_macro_average

	extractor_chains = {
		"NoLLMCanadaDRIPlanner": (from_quantity, from_average),
		"NoLLMMFPNutritionPlanner": (from_mfp, from_average),
		"OptimizerCanadaDRIPlanner": (from_quantity,),
		"OptimizerMFPNutritionPlanner": (from_mfp,),
		"BaselineCanadaDRIPlanner": (from_average,),
		"BaselineMFPNutritionPlanner": (from_average,),
	}
	chain = extractor_chains.get(agent_name, (from_average, from_mfp, from_quantity))

	values: dict[str, float] = {}
	for extractor in chain:
		values = extractor(case_result)
		if values:
			break
	return values


def _to_dict(value: Any) -> dict[str, Any]:
	if isinstance(value, dict):
		return value
	if isinstance(value, str):
		try:
			parsed = json.loads(value)
		except json.JSONDecodeError:
			return {}
		return parsed if isinstance(parsed, dict) else {}
	return {}


def _is_product_record(value: Any) -> bool:
	if not isinstance(value, dict):
		return False
	if "index" not in value:
		return False
	return "name" in value or "nutrition" in value or "nutrition_100g" in value or "cost" in value


def _collect_products(value: Any, out: dict[int, dict[str, Any]]) -> None:
	"""Recursively gather product records from ``value`` into ``out``.

	Product records are stored under their integer index (later records with
	the same index overwrite earlier ones) and are not searched further.
	Lists and other dicts are walked recursively; anything else is ignored.
	"""
	if _is_product_record(value):
		position = _to_float(value.get("index"))
		if position is not None:
			out[int(position)] = value
	elif isinstance(value, list):
		for element in value:
			_collect_products(element, out)
	elif isinstance(value, dict):
		for child in value.values():
			_collect_products(child, out)


def _extract_products_by_index(case_result: dict[str, Any]) -> dict[int, dict[str, Any]]:
	"""Index every product returned by ``find_ingredient`` calls by its product index.

	For each response the ``result`` field is searched when present, otherwise
	the whole response.
	"""
	catalogue: dict[int, dict[str, Any]] = {}
	for response in _extract_function_responses(case_result, "find_ingredient"):
		if isinstance(response, dict):
			result = response.get("result")
			search_root = response if result is None else result
			_collect_products(search_root, catalogue)
	return catalogue


def _extract_item_entries(value: Any, out: list[tuple[int, float]]) -> None:
	"""Recursively collect ``(product index, quantity)`` pairs from ``value``.

	A dict with a numeric "index" and a positive quantity under "qty",
	"quantity" or "quantity_g" (first numeric one wins) contributes one pair
	and is not searched further. Lists and all other dicts are walked
	recursively; non-container values are ignored.
	"""
	if isinstance(value, list):
		for element in value:
			_extract_item_entries(element, out)
		return
	if not isinstance(value, dict):
		return

	position = _to_float(value.get("index"))
	if position is not None:
		parsed = (_to_float(value.get(name)) for name in ("qty", "quantity", "quantity_g"))
		amount = next((number for number in parsed if number is not None), None)
		if amount is not None and amount > 0:
			out.append((int(position), amount))
			return

	# No usable entry at this level: keep looking in the nested values.
	for child in value.values():
		_extract_item_entries(child, out)


def _calories_per_100g(product: dict[str, Any]) -> float | None:
	"""Return the calorie value from a product's nutrition data, or None.

	Uses ``nutrition_100g`` when it yields a non-empty dict, otherwise
	``nutrition``. The first numeric value among the known calorie keys is
	returned.
	"""
	facts = _to_dict(product.get("nutrition_100g")) or _to_dict(product.get("nutrition"))
	if not facts:
		return None
	calorie_keys = ("calories", "kcal", "energy_kcal", "energy", "target_calories_kcal")
	parsed = (_to_float(facts.get(name)) for name in calorie_keys)
	return next((number for number in parsed if number is not None), None)


def _daily_calories_from_quantities(
	quantity_payload: Any,
	products_by_index: dict[int, dict[str, Any]],
) -> list[float]:
	"""Compute calorie totals from item quantities and product calorie data.

	Each item contributes ``calories_per_100g * quantity / 100``. When any key
	of ``quantity_payload`` starts with "day", one total per value is returned
	(values without computable calories are skipped). Otherwise, or when no
	per-day total could be computed, a single total over the whole payload is
	returned. Returns [] for a non-dict/empty payload or when no item can be
	matched to a product with calorie data.
	"""
	if not (isinstance(quantity_payload, dict) and quantity_payload):
		return []

	def calories_for(structure: Any) -> float | None:
		# Sum the calories of every resolvable item found inside ``structure``.
		entries: list[tuple[int, float]] = []
		_extract_item_entries(structure, entries)
		if not entries:
			return None
		running_total = 0.0
		matched = False
		for product_index, quantity in entries:
			product = products_by_index.get(product_index)
			if not product:
				continue
			per_100g = _calories_per_100g(product)
			if per_100g is None:
				continue
			running_total += per_100g * (quantity / 100.0)
			matched = True
		return running_total if matched else None

	if any(str(key).strip().lower().startswith("day") for key in quantity_payload):
		per_day_totals = [
			total
			for total in (calories_for(day_structure) for day_structure in quantity_payload.values())
			if total is not None
		]
		if per_day_totals:
			return per_day_totals

	overall = calories_for(quantity_payload)
	return [overall] if overall is not None else []


def _extract_daily_calories_from_payload(payload: dict[str, Any], products_by_index: dict[int, dict[str, Any]]) -> list[float]:
	"""Return the per-day calorie totals described by one tool response.

	Sources are tried in order and the first one producing values wins:

	1. item quantities (``calculated_quantity_per_day`` /
	   ``average_calculated_quantity_per_day``) combined with product data;
	2. ``per_day_macro_nutrient_from_calculated_quantity`` — one calorie value
	   per day;
	3. ``nutrition_per_meal`` — per day, the sum of the meals' calories.

	Calorie values are read from "calories", "energy" or
	"target_calories_kcal", taking the first key that holds a number. A value
	of 0 is a valid reading, so a 0-calorie day or meal is kept rather than
	treated as missing. Returns [] when no source yields a value.
	"""

	def read_calories(entry: dict[str, Any]) -> float | None:
		# First calorie alias that parses as a number; 0 counts as a value.
		for key in ("calories", "energy", "target_calories_kcal"):
			number = _to_float(entry.get(key))
			if number is not None:
				return number
		return None

	# Preferred: compute per-day calories from ingredient quantities and nutrition_100g.
	for quantity_key in ("calculated_quantity_per_day", "average_calculated_quantity_per_day"):
		quantity_payload = payload.get(quantity_key)
		quantity_based = _daily_calories_from_quantities(quantity_payload, products_by_index)
		if quantity_based:
			return quantity_based

	# Baseline-style payload: per day macro summary with direct calories per day.
	per_day = payload.get("per_day_macro_nutrient_from_calculated_quantity")
	if isinstance(per_day, dict):
		values: list[float] = []
		for day_payload in per_day.values():
			if not isinstance(day_payload, dict):
				continue
			cal = read_calories(day_payload)
			if cal is not None:
				values.append(cal)
		if values:
			return values

	# Optimizer-style payload: per day, per meal nutrition with calories per meal.
	nutrition_per_meal = payload.get("nutrition_per_meal")
	if isinstance(nutrition_per_meal, dict):
		values = []
		for day_payload in nutrition_per_meal.values():
			if not isinstance(day_payload, dict):
				continue
			day_total = 0.0
			found_any = False
			for meal_payload in day_payload.values():
				if not isinstance(meal_payload, dict):
					continue
				cal = read_calories(meal_payload)
				if cal is None:
					continue
				day_total += cal
				found_any = True
			if found_any:
				values.append(day_total)
		if values:
			return values

	return []


def _extract_daily_calorie_values(agent_name: str, case_result: dict[str, Any]) -> list[float]:
	"""Return per-day calorie values for one evaluated case.

	Candidate tool responses are gathered according to the agent family
	(macro-average calculator, ``optimize_quantity``,
	``optimize_quantity_for_mfp_targets``) and then scanned from the last
	gathered to the first; the first payload yielding values is used.
	Returns ``[]`` when no candidate yields anything.
	"""
	macro_average_agents = {
		"BaselineCanadaDRIPlanner",
		"BaselineMFPNutritionPlanner",
		"NoLLMCanadaDRIPlanner",
		"NoLLMMFPNutritionPlanner",
	}
	tool_by_agents = (
		({"OptimizerCanadaDRIPlanner", "NoLLMCanadaDRIPlanner"}, "optimize_quantity"),
		({"OptimizerMFPNutritionPlanner", "NoLLMMFPNutritionPlanner"}, "optimize_quantity_for_mfp_targets"),
	)

	candidates: list[dict[str, Any]] = []
	products_by_index = _extract_products_by_index(case_result)

	if agent_name in macro_average_agents:
		calculator_outputs = _extract_function_responses(case_result, "calculate_average_macro_nutrient_per_day")
		preferred = _select_best_calculator_response(calculator_outputs)
		if preferred is not None:
			candidates.append(preferred)

	for agents, tool_name in tool_by_agents:
		if agent_name in agents:
			candidates.extend(_extract_function_responses(case_result, tool_name))

	# Walk backwards so the most recently gathered response is tried first.
	for candidate in candidates[::-1]:
		if isinstance(candidate, dict):
			daily = _extract_daily_calories_from_payload(candidate, products_by_index)
			if daily:
				return daily

	return []


def _resolve_eval_history_dir(agent_name: str) -> Path:
	"""Locate the eval-history directory of *agent_name*.

	The agent root comes from ``AGENT_CONFIGS``; the first existing directory
	among ``.adk/eval_history``, ``eval_history``, ``.adk/eval`` and ``eval``
	is returned.

	Raises:
		KeyError: If *agent_name* is not a key of ``AGENT_CONFIGS``.
		FileNotFoundError: If none of the candidate directories exists.
	"""
	base = AGENT_CONFIGS[agent_name]
	search_order = [
		base / ".adk" / "eval_history",
		base / "eval_history",
		base / ".adk" / "eval",
		base / "eval",
	]
	found = next((folder for folder in search_order if folder.exists() and folder.is_dir()), None)
	if found is None:
		raise FileNotFoundError(f"No eval directory found for {agent_name}. Checked: {', '.join(str(path) for path in search_order)}")
	return found


def _resolve_trace_dir(agent_name: str) -> Path | None:
	"""Locate the trace directory of *agent_name*, if it has one.

	Checks ``.adk/traces`` and then ``traces`` under the agent root taken from
	``AGENT_CONFIGS``. Returns the first that is an existing directory, or
	``None`` when neither is.
	"""
	base = AGENT_CONFIGS[agent_name]
	search_order = (base / ".adk" / "traces", base / "traces")
	return next((folder for folder in search_order if folder.exists() and folder.is_dir()), None)


def _parse_iso_timestamp(value: Any) -> float | None:
	"""Convert an ISO-8601 string into POSIX seconds.

	A trailing ``Z`` is accepted as UTC. Strings without any offset are
	interpreted as UTC (the values fed to this helper come from
	``created_at_utc`` fields), so the result does not depend on the timezone
	of the machine running the report.

	Args:
		value: Candidate timestamp; anything other than a non-blank string is
			rejected.

	Returns:
		Seconds since the epoch, or ``None`` when *value* is not a parsable
		ISO-8601 string.
	"""
	from datetime import timezone

	if not isinstance(value, str) or not value.strip():
		return None
	text = value.strip()
	# ``datetime.fromisoformat`` only understands the "Z" suffix from Python 3.11 on.
	if text.endswith(("Z", "z")):
		text = text[:-1] + "+00:00"
	try:
		parsed = datetime.fromisoformat(text)
	except ValueError:
		return None
	if parsed.tzinfo is None:
		# Naive timestamps would otherwise be read as local time by ``timestamp()``.
		parsed = parsed.replace(tzinfo=timezone.utc)
	return parsed.timestamp()


def _trace_sort_timestamp(path: Path, payload: dict[str, Any]) -> float:
	"""Return the timestamp used to order trace files.

	Uses the payload's ``created_at_utc`` when it parses, otherwise the
	modification time of *path*.
	"""
	recorded = _parse_iso_timestamp(payload.get("created_at_utc"))
	return path.stat().st_mtime if recorded is None else recorded


def _timing_from_evaluation_trace(payload: dict[str, Any]) -> tuple[str, dict[str, float | None]] | None:
	"""Read the eval id and the recorded timing fields of an evaluation-phase trace."""
	eval_id = payload.get("eval_id")
	if not isinstance(eval_id, str) or not eval_id:
		return None
	return eval_id, {
		"user_message_timestamp": _to_float(payload.get("user_message_timestamp")),
		"final_message_timestamp": _to_float(payload.get("final_message_timestamp")),
		"latency_seconds": _to_float(payload.get("latency_seconds")),
	}


def _timing_from_inference_trace(payload: dict[str, Any]) -> tuple[str, dict[str, float | None]] | None:
	"""Derive the eval id and timing of an inference-phase trace.

	The start is the first inference's ``creation_timestamp``; the end is the
	trace's ``created_at_utc``. Latency is only reported when both are known
	and the difference is non-negative.
	"""
	inference_result = payload.get("inference_result")
	if not isinstance(inference_result, dict):
		return None
	eval_id = inference_result.get("eval_case_id")
	if not isinstance(eval_id, str) or not eval_id:
		return None

	user_message_timestamp = None
	inferences = inference_result.get("inferences")
	if isinstance(inferences, list) and inferences and isinstance(inferences[0], dict):
		user_message_timestamp = _to_float(inferences[0].get("creation_timestamp"))

	final_message_timestamp = _parse_iso_timestamp(payload.get("created_at_utc"))
	latency_seconds = None
	if user_message_timestamp is not None and final_message_timestamp is not None:
		latency = final_message_timestamp - user_message_timestamp
		if latency >= 0:
			latency_seconds = latency

	return eval_id, {
		"user_message_timestamp": user_message_timestamp,
		"final_message_timestamp": final_message_timestamp,
		"latency_seconds": latency_seconds,
	}


def _load_latest_trace_timing_by_eval_id(agent_name: str) -> dict[str, dict[str, float | None]]:
	"""Collect the most recent trace timing per eval id for *agent_name*.

	Every ``*.trace.json`` below the agent's trace directory is read. Files
	that cannot be read or decoded, or whose top-level JSON value is not an
	object, are skipped. A trace without a ``phase`` is treated as an
	``evaluation`` trace; phases other than ``evaluation`` and ``inference``
	are ignored. For each phase the entry with the greatest sort timestamp is
	kept (ties go to the later path in sorted order).

	Per eval id the result prefers, in order: inference timing with a latency,
	evaluation timing with a latency, any inference timing, any evaluation
	timing.

	Returns:
		Mapping of eval id to a dict with ``user_message_timestamp``,
		``final_message_timestamp`` and ``latency_seconds``; empty when the
		agent has no trace directory.
	"""
	trace_dir = _resolve_trace_dir(agent_name)
	if trace_dir is None:
		return {}

	latest_eval: dict[str, tuple[float, dict[str, float | None]]] = {}
	latest_inference: dict[str, tuple[float, dict[str, float | None]]] = {}
	for path in sorted(trace_dir.rglob("*.trace.json")):
		try:
			payload = json.loads(path.read_text(encoding="utf-8"))
		except (OSError, ValueError):
			# Unreadable, badly encoded or malformed trace: best-effort skip.
			continue
		if not isinstance(payload, dict):
			continue

		phase = payload.get("phase", "evaluation")
		if phase == "evaluation":
			extracted, bucket = _timing_from_evaluation_trace(payload), latest_eval
		elif phase == "inference":
			extracted, bucket = _timing_from_inference_trace(payload), latest_inference
		else:
			continue
		if extracted is None:
			continue

		eval_id, timing = extracted
		sort_ts = _trace_sort_timestamp(path, payload)
		current = bucket.get(eval_id)
		if current is None or sort_ts >= current[0]:
			bucket[eval_id] = (sort_ts, timing)

	combined: dict[str, dict[str, float | None]] = {}
	# Sorted so the resulting dict has a reproducible key order across runs.
	for eval_id in sorted(set(latest_eval) | set(latest_inference)):
		inference_timing = latest_inference.get(eval_id, (0.0, {}))[1]
		eval_timing = latest_eval.get(eval_id, (0.0, {}))[1]
		if _to_float(inference_timing.get("latency_seconds")) is not None:
			combined[eval_id] = inference_timing
		elif _to_float(eval_timing.get("latency_seconds")) is not None:
			combined[eval_id] = eval_timing
		elif inference_timing:
			combined[eval_id] = inference_timing
		elif eval_timing:
			combined[eval_id] = eval_timing

	return combined


def _extract_float_near_label(text: str, label_patterns: list[str]) -> float | None:
	"""Return the first number captured by any of *label_patterns* in *text*.

	Patterns are searched case-insensitively in the given order. Group 1 of a
	match is taken as the number, with commas removed before conversion. A
	match whose capture is not a valid float is skipped. Returns ``None`` when
	no pattern produces a number.
	"""
	hits = (re.search(expression, text, re.IGNORECASE) for expression in label_patterns)
	for hit in hits:
		if hit is None:
			continue
		digits = hit.group(1).replace(",", "")
		try:
			parsed = float(digits)
		except ValueError:
			continue
		return parsed
	return None


def _extract_achieved_from_text(case_result: dict[str, Any]) -> dict[str, float]:
	"""Recover achieved nutrient values from the agent's final response text.

	Two strategies are combined:

	* Labelled values such as ``Calories: 2,150`` or ``**Protein** | 120``
	  found anywhere in the text. These take precedence.
	* Rows of a markdown table whose first cell names a nutrient; the third
	  cell is read as the achieved value and only fills keys the labelled
	  search did not produce.

	Numbers may use thousands separators (``2,150``); the separators are
	dropped before conversion instead of truncating the value at the comma.

	Returns:
		Mapping with any of the keys ``calories``, ``protein``,
		``carbohydrates``, ``total_fat`` and ``total_fibre``; empty when there
		is no final response text or nothing could be parsed.
	"""
	text = _extract_final_response_text(case_result)
	if not text:
		return {}

	# Either comma-grouped thousands ("1,234,567") or plain digits, with an
	# optional fraction. The lookahead keeps "12,3456" from matching as grouped.
	number = r"((?:[0-9]{1,3}(?:,[0-9]{3})+(?![0-9])|[0-9]+)(?:\.[0-9]+)?)"

	table_keys = {
		"protein": "protein",
		"carbs": "carbohydrates",
		"carbohydrates": "carbohydrates",
		"fat": "total_fat",
		"fats": "total_fat",
		"total fat": "total_fat",
		"calories": "calories",
		"energy": "calories",
		"fibre": "total_fibre",
		"fiber": "total_fibre",
		"total fibre": "total_fibre",
		"total fiber": "total_fibre",
	}

	# Fallback for agents that report nutrients in a markdown summary table.
	table_achieved: dict[str, float] = {}
	for raw_line in text.splitlines():
		line = raw_line.strip()
		if "|" not in line:
			continue
		cells = [cell.strip() for cell in line.split("|") if cell.strip()]
		if len(cells) < 3:
			continue
		# Skip separator rows ("---") and header cells.
		if cells[0].strip("-").strip() in {"", "parameter", "target", "actual"}:
			continue

		achieved_key = table_keys.get(_canonical_key(cells[0]))
		if achieved_key is None:
			continue
		# NOTE(review): assumes the third column holds the achieved value - confirm against agent output.
		match = re.search(number, cells[2])
		if match is None:
			continue
		actual_value = _to_float(match.group(1).replace(",", ""))
		if actual_value is None:
			continue
		table_achieved[achieved_key] = actual_value

	# Optional bold markers around the label, a ":" or "|" separator, then the value.
	value_lead_in = r"(?:\*\*)?\s*[:|]\s*\*{0,2}~?"
	labels = {
		"calories": [r"(?:calories|energy)"],
		"protein": [r"protein"],
		"carbohydrates": [r"carbohydrates", r"carbs"],
		"total_fat": [r"(?:total\s+fat|fat|fats)"],
		"total_fibre": [r"(?:total\s+fibre|fibre|total\s+fiber|fiber)"],
	}
	achieved: dict[str, float] = {}
	for key, label_expressions in labels.items():
		label_patterns = [r"(?:\*\*)?" + label + value_lead_in + number for label in label_expressions]
		value = _extract_float_near_label(text, label_patterns)
		if value is not None:
			achieved[key] = value

	# Table rows only fill the gaps left by the labelled search.
	for key, value in table_achieved.items():
		achieved.setdefault(key, value)
	return achieved



def _extract_metric_rubric_scores(metric: dict[str, Any]) -> list[dict[str, Any]]:
	"""Return the dict entries of ``metric["details"]["rubric_scores"]``.

	Yields ``[]`` when ``details`` is not a dict or ``rubric_scores`` is not a
	list; non-dict list items are dropped.
	"""
	details = metric.get("details")
	rubric_scores = details.get("rubric_scores") if isinstance(details, dict) else None
	if isinstance(rubric_scores, list):
		return [entry for entry in rubric_scores if isinstance(entry, dict)]
	return []


def _extract_cuisine_score_from_metric(metric: dict[str, Any]) -> float | None:
	"""Return the cuisine-alignment score carried by one metric result.

	For ``per_day_cuisine_alignment_score`` the mean of the rubric scores
	whose id matches ``day_<n>_cuisine_alignment_score`` is returned when any
	exist. Otherwise the metric's own ``score`` is used, limited to the range
	0.0-1.0; ``None`` when that score is missing.
	"""
	if metric.get("metric_name") == "per_day_cuisine_alignment_score":
		per_day: list[float] = []
		for entry in _extract_metric_rubric_scores(metric):
			rubric_id = entry.get("rubric_id")
			if isinstance(rubric_id, str) and re.fullmatch(r"day_\d+_cuisine_alignment_score", rubric_id):
				day_value = _to_float(entry.get("score"))
				if day_value is not None:
					per_day.append(day_value)
		if per_day:
			return sum(per_day) / len(per_day)

	overall = _to_float(metric.get("score"))
	return None if overall is None else max(0.0, min(1.0, overall))


def _extract_cuisine_alignment_score(case_result: dict[str, Any]) -> float | None:
	"""Pick the cuisine-alignment score of a case from its overall metrics.

	The first ``per_day_cuisine_alignment_score`` metric with a usable score
	is returned immediately. Failing that, the last usable
	``cuisine_alignment_score`` is returned, or ``None`` when there is none.
	"""
	accepted = {"cuisine_alignment_score", "per_day_cuisine_alignment_score"}
	overall_only: float | None = None
	for metric in case_result.get("overall_eval_metric_results") or []:
		if not isinstance(metric, dict):
			continue
		name = metric.get("metric_name")
		if name not in accepted:
			continue
		value = _extract_cuisine_score_from_metric(metric)
		if value is None:
			continue
		if name == "per_day_cuisine_alignment_score":
			return value
		overall_only = value
	return overall_only


def _extract_palatability_score(case_result: dict[str, Any]) -> float | None:
	"""Pick the palatability score of a case from its overall metrics.

	A ``per_day_palatability_score`` metric with at least one rubric score
	whose id matches ``day_<n>_palatability_score`` yields the mean of those
	scores right away. Otherwise the metric's own ``score`` (limited to
	0.0-1.0) is remembered, and the last remembered value is returned once all
	metrics are seen; ``None`` when nothing usable was found.
	"""
	accepted = {"palatability_score", "per_day_palatability_score"}
	remembered: float | None = None
	for metric in case_result.get("overall_eval_metric_results") or []:
		if not isinstance(metric, dict):
			continue
		name = metric.get("metric_name")
		if name not in accepted:
			continue

		if name == "per_day_palatability_score":
			per_day: list[float] = []
			for entry in _extract_metric_rubric_scores(metric):
				rubric_id = entry.get("rubric_id")
				if isinstance(rubric_id, str) and re.fullmatch(r"day_\d+_palatability_score", rubric_id):
					day_value = _to_float(entry.get("score"))
					if day_value is not None:
						per_day.append(day_value)
			if per_day:
				return sum(per_day) / len(per_day)

		overall = _to_float(metric.get("score"))
		if overall is not None:
			remembered = max(0.0, min(1.0, overall))

	return remembered



def _load_rows_from_samples_path(samples_path: Path) -> list[dict[str, Any]]:
	"""Load sample rows (dicts) from a ``.jsonl`` or JSON file.

	* ``.jsonl``: each non-blank line is decoded; only dict values are kept.
	* anything else: the file is decoded as one JSON document; the dict items
	  of a top-level list are returned, any other document yields ``[]``.

	When *samples_path* does not exist and equals ``DEFAULT_SAMPLES_PATH``,
	the rows of every ``DEFAULT_SAMPLE_SOURCES`` entry are concatenated
	instead.

	Raises:
		FileNotFoundError: If a non-default path does not exist.
	"""
	if not samples_path.exists():
		if samples_path != DEFAULT_SAMPLES_PATH:
			raise FileNotFoundError(f"Sample source not found: {samples_path}")
		merged: list[dict[str, Any]] = []
		for source in DEFAULT_SAMPLE_SOURCES:
			merged.extend(_load_rows_from_samples_path(source))
		return merged

	raw_text = samples_path.read_text(encoding="utf-8")
	if samples_path.suffix == ".jsonl":
		stripped_lines = (entry.strip() for entry in raw_text.splitlines())
		records = (json.loads(entry) for entry in stripped_lines if entry)
		return [record for record in records if isinstance(record, dict)]

	document = json.loads(raw_text)
	if not isinstance(document, list):
		return []
	return [item for item in document if isinstance(item, dict)]


def _load_sample_lookup(samples_path: Path) -> dict[str, dict[str, Any]]:
	"""Index sample rows by their normalized ``profile_sentence``.

	Rows whose ``profile_sentence`` is not a string are left out; when several
	rows normalize to the same key the last one wins.
	"""
	return {
		_normalize_text(row["profile_sentence"]): row
		for row in _load_rows_from_samples_path(samples_path)
		if isinstance(row.get("profile_sentence"), str)
	}


def _latest_case_results(history_dir: Path) -> dict[str, tuple[float, dict[str, Any]]]:
	"""Return the newest case result per eval id found in *history_dir*.

	Every ``*.evalset_result.json`` file directly inside *history_dir* is read
	in sorted path order. Files that cannot be read or decoded, whose
	top-level JSON value is not an object, or whose ``eval_case_results`` is
	not a list are skipped. The file timestamp comes from
	``_parse_history_timestamp``; for equal timestamps the later file wins.

	Returns:
		Mapping of eval id to ``(timestamp, case_result)``.
	"""
	latest: dict[str, tuple[float, dict[str, Any]]] = {}
	for path in sorted(history_dir.glob("*.evalset_result.json")):
		try:
			payload = json.loads(path.read_text(encoding="utf-8"))
		except (OSError, ValueError):
			# Unreadable, badly encoded or malformed history file: best-effort skip.
			continue
		# Valid JSON need not be an object; a list or scalar has no ``.get``.
		if not isinstance(payload, dict):
			continue
		case_results = payload.get("eval_case_results")
		if not isinstance(case_results, list):
			continue
		timestamp = _parse_history_timestamp(path)
		for case_result in case_results:
			if not isinstance(case_result, dict):
				continue
			eval_id = case_result.get("eval_id")
			if not isinstance(eval_id, str):
				continue
			current = latest.get(eval_id)
			if current is None or timestamp >= current[0]:
				latest[eval_id] = (timestamp, case_result)
	return latest


def _build_case_metrics(
	agent_name: str,
	case_result: dict[str, Any],
	sample_lookup: dict[str, dict[str, Any]],
	eval_timestamp: float,
	trace_timing_by_eval_id: dict[str, dict[str, float | None]] | None = None,
) -> CaseMetrics | None:
	"""Assemble the ``CaseMetrics`` record for one evaluated case.

	Args:
		agent_name: Agent that produced *case_result*.
		case_result: Raw eval-case result.
		sample_lookup: Sample rows keyed by normalized profile sentence.
		eval_timestamp: Timestamp of the eval run, stored on the record and
			passed to the latency fallback.
		trace_timing_by_eval_id: Optional trace timings; a non-negative
			``latency_seconds`` found here takes precedence over the value
			derived from *case_result*.

	Returns:
		The populated record, or ``None`` when the case has no user text.
	"""
	profile_sentence = _extract_user_text(case_result)
	if not profile_sentence:
		return None

	sample = sample_lookup.get(_normalize_text(profile_sentence)) or {}
	bounds = _extract_target_bounds(case_result)

	# Structured tool output first; the response text is only a fallback.
	achieved = _extract_achieved_from_structured(agent_name, case_result) or _extract_achieved_from_text(case_result)

	requested_cuisine = _extract_requested_cuisine(profile_sentence)
	if not requested_cuisine:
		requested_cuisine = _normalize_text(str(sample.get("food_type", ""))) or None

	factor_categories: list[tuple[str, str]] = []
	for entry in sample.get("categories", []):
		if not isinstance(entry, dict):
			continue
		factor, category = entry.get("factor"), entry.get("category")
		if not (isinstance(factor, str) and isinstance(category, str)):
			continue
		canonical_factor = _canonical_key(factor)
		canonical_category = _canonical_key(category)
		if canonical_factor == "activity level":
			canonical_category = _normalize_activity_category(canonical_category)
		factor_categories.append((canonical_factor, canonical_category))
	if not factor_categories:
		factor_categories = _derive_factor_categories_from_sample(sample)

	daily_calories = _extract_daily_calorie_values(agent_name, case_result)
	if daily_calories:
		min_daily_calories, max_daily_calories = min(daily_calories), max(daily_calories)
	else:
		min_daily_calories = max_daily_calories = None

	eval_id = str(case_result.get("eval_id", ""))
	trace_timing = (trace_timing_by_eval_id or {}).get(eval_id, {})
	latency_seconds = None
	if isinstance(trace_timing, dict):
		latency_seconds = _to_float(trace_timing.get("latency_seconds"))
	# A missing or negative trace latency falls back to the case-derived one.
	if latency_seconds is None or latency_seconds < 0:
		latency_seconds = _extract_latency_seconds(case_result, eval_timestamp)

	return CaseMetrics(
		agent=agent_name,
		eval_id=eval_id,
		profile_sentence=profile_sentence,
		eval_timestamp=eval_timestamp,
		factor_categories=factor_categories,
		requested_cuisine=requested_cuisine,
		calories_target=bounds["calories"],
		protein_target=bounds["protein"],
		carbohydrates_target=bounds["carbohydrates"],
		total_fat_target=bounds["total_fat"],
		total_fibre_target=bounds["total_fibre"],
		achieved=achieved,
		min_daily_calories=min_daily_calories,
		max_daily_calories=max_daily_calories,
		cuisine_alignment_score=_extract_cuisine_alignment_score(case_result),
		palatability_score=_extract_palatability_score(case_result),
		latency_seconds=latency_seconds,
		tool_retries=_extract_tool_retries(case_result),
	)


def _is_not_met(value: float | None, bounds: MetricBounds) -> bool | None:
	"""Tell whether *value* falls outside *bounds*.

	Returns ``None`` when *value* is missing or neither bound is set,
	``True`` when the value is below ``bounds.lower`` or above
	``bounds.upper``, and ``False`` otherwise.
	"""
	if value is None:
		return None
	floor, ceiling = bounds.lower, bounds.upper
	if floor is None and ceiling is None:
		return None
	too_low = floor is not None and value < floor
	too_high = ceiling is not None and value > ceiling
	return too_low or too_high


def _outside_range_abs_deviation(value: float | None, bounds: MetricBounds) -> float | None:
	"""Return how far *value* lies outside the range described by *bounds*.

	The result is the distance to the violated bound, ``0.0`` when the value
	is within range, and ``None`` when *value* is missing or no bound is set.
	"""
	if value is None:
		return None
	floor, ceiling = bounds.lower, bounds.upper
	if floor is None and ceiling is None:
		return None
	if floor is not None and value < floor:
		return floor - value
	if ceiling is not None and value > ceiling:
		return value - ceiling
	return 0.0


def _abs_error_from_target(value: float | None, target: MetricBounds) -> float | None:
	"""Return ``|value - reference|`` for a point target.

	The reference is ``target.lower`` when set, otherwise ``target.upper``.
	``None`` is returned when *value* or both bounds are missing.
	"""
	if value is None:
		return None
	reference = target.upper if target.lower is None else target.lower
	return None if reference is None else abs(value - reference)


def _calorie_error(value: float | None, target: MetricBounds) -> float | None:
	"""Return the absolute distance between *value* and ``target.lower``.

	``None`` when either the value or the lower bound is missing.
	"""
	if value is None or target.lower is None:
		return None
	return abs(value - target.lower)


def _case_feasible(case: CaseMetrics) -> bool | None:
	"""Decide whether a case's achieved nutrition counts as feasible.

	DRI agents: calorie error at most 10, fibre error at most 5, and protein,
	carbohydrate and fat all inside their ranges.
	MFP agents: calorie error at most 10, protein and fat error at most 5,
	carbohydrate error at most 2.

	Returns ``None`` when any required measurement is unavailable or the
	agent belongs to neither family.
	"""
	achieved = case.achieved

	if case.agent in DRI_AGENTS:
		calorie_gap = _calorie_error(achieved.get("calories"), case.calories_target)
		protein_excess = _outside_range_abs_deviation(achieved.get("protein"), case.protein_target)
		carb_excess = _outside_range_abs_deviation(achieved.get("carbohydrates"), case.carbohydrates_target)
		fat_excess = _outside_range_abs_deviation(achieved.get("total_fat"), case.total_fat_target)
		fibre_gap = _abs_error_from_target(achieved.get("total_fibre"), case.total_fibre_target)
		measurements = (calorie_gap, protein_excess, carb_excess, fat_excess, fibre_gap)
		if any(measurement is None for measurement in measurements):
			return None
		within_tolerance = calorie_gap <= 10.0 and fibre_gap <= 5.0
		inside_ranges = protein_excess == 0.0 and carb_excess == 0.0 and fat_excess == 0.0
		return within_tolerance and inside_ranges

	if case.agent in MFP_AGENTS:
		calorie_gap = _calorie_error(achieved.get("calories"), case.calories_target)
		protein_gap = _abs_error_from_target(achieved.get("protein"), case.protein_target)
		carb_gap = _abs_error_from_target(achieved.get("carbohydrates"), case.carbohydrates_target)
		fat_gap = _abs_error_from_target(achieved.get("total_fat"), case.total_fat_target)
		measurements = (calorie_gap, protein_gap, carb_gap, fat_gap)
		if any(measurement is None for measurement in measurements):
			return None
		return calorie_gap <= 10.0 and protein_gap <= 5.0 and carb_gap <= 2.0 and fat_gap <= 5.0

	return None


def _pct(part: int, whole: int) -> float | None:
	"""Return *part* as a percentage of *whole*, or ``None`` when *whole* is not positive."""
	return 100.0 * part / whole if whole > 0 else None


def _format_metric(value: float | None, decimals: int = 1) -> str:
	"""Render *value* with a fixed number of decimals; ``"--"`` for ``None`` or NaN."""
	if value is None:
		return "--"
	if math.isnan(value):
		return "--"
	return format(value, f".{decimals}f")


def _cuisine_score_bucket(score: float) -> str:
	"""Map a cuisine score to its summary bucket key.

	Buckets: below 0.25, from 0.25 up to (not including) 0.5, from 0.5 up to
	and including 0.75, and everything else.
	"""
	if score < 0.5:
		return "cuisine_lt_025_pct" if score < 0.25 else "cuisine_025_05_pct"
	return "cuisine_05_075_pct" if score <= 0.75 else "cuisine_gt_075_pct"


def _summarize_group(cases: list[CaseMetrics]) -> dict[str, Any]:
	"""Aggregate the table metrics for one group of cases.

	Each mean is taken over the cases for which the metric could be computed.
	The feasible rate is taken over cases with a definite verdict. The cuisine
	bucket percentages are taken over cases that have a cuisine score. Any
	aggregate with no contributing case is ``None``; ``n`` is the group size.
	"""

	def _mean_defined(samples: list[float | None]) -> float | None:
		kept = [sample for sample in samples if sample is not None]
		return sum(kept) / len(kept) if kept else None

	calorie_mean = _mean_defined([_calorie_error(case.achieved.get("calories"), case.calories_target) for case in cases])
	protein_mean = _mean_defined([_outside_range_abs_deviation(case.achieved.get("protein"), case.protein_target) for case in cases])
	carb_mean = _mean_defined([_outside_range_abs_deviation(case.achieved.get("carbohydrates"), case.carbohydrates_target) for case in cases])
	fat_mean = _mean_defined([_outside_range_abs_deviation(case.achieved.get("total_fat"), case.total_fat_target) for case in cases])
	fibre_mean = _mean_defined([_abs_error_from_target(case.achieved.get("total_fibre"), case.total_fibre_target) for case in cases])

	verdicts = [verdict for verdict in (_case_feasible(case) for case in cases) if verdict is not None]
	feasible_pct = _pct(sum(1 for verdict in verdicts if verdict), len(verdicts)) if verdicts else None

	scores = [case.cuisine_alignment_score for case in cases if case.cuisine_alignment_score is not None]
	bucket_names = ("cuisine_lt_025_pct", "cuisine_025_05_pct", "cuisine_05_075_pct", "cuisine_gt_075_pct")
	tally = dict.fromkeys(bucket_names, 0)
	for score in scores:
		tally[_cuisine_score_bucket(score)] += 1

	summary: dict[str, Any] = {
		"n": len(cases),
		"calorie_error_kcal": calorie_mean,
		"protein_outside_range_mad_g": protein_mean,
		"carb_outside_range_mad_g": carb_mean,
		"fat_outside_range_mad_g": fat_mean,
		"fibre_mae_g": fibre_mean,
		"feasible_pct": feasible_pct,
	}
	for name in bucket_names:
		summary[name] = _pct(tally[name], len(scores)) if scores else None
	return summary


def _group_cases(cases: list[CaseMetrics]) -> dict[tuple[str, str], list[CaseMetrics]]:
	"""Bucket cases by each of their ``(factor, category)`` pairs.

	A case appears once in every bucket named in its ``factor_categories``.
	"""
	buckets: dict[tuple[str, str], list[CaseMetrics]] = {}
	for case in cases:
		for pair in case.factor_categories:
			if pair in buckets:
				buckets[pair].append(case)
			else:
				buckets[pair] = [case]
	return buckets


def _unique_cases_from_grouped(grouped: dict[tuple[str, str], list[CaseMetrics]]) -> list[CaseMetrics]:
	"""Flatten *grouped* into one entry per eval id.

	The first case seen for an eval id is kept; cases with an empty eval id
	are dropped. This lets aggregate rows count each evaluated profile once.
	"""
	first_seen: dict[str, CaseMetrics] = {}
	for members in grouped.values():
		for case in members:
			if case.eval_id:
				first_seen.setdefault(case.eval_id, case)
	return list(first_seen.values())


def _summarize_row(grouped: dict[tuple[str, str], list[CaseMetrics]], key: tuple[str, str]) -> dict[str, Any]:
	"""Summarize the cases belonging to one table row.

	The ``("overall", "baseline")`` row aggregates every unique eval case
	across all buckets (no subgroup double-counting); any other key
	summarizes just that bucket, which may be empty.
	"""
	is_overall = key == ("overall", "baseline")
	members = _unique_cases_from_grouped(grouped) if is_overall else grouped.get(key, [])
	return _summarize_group(members)


def _render_markdown(agent_name: str, grouped: dict[tuple[str, str], list[CaseMetrics]]) -> str:
	"""Render the per-agent summary as a markdown table.

	One row is emitted per ``TABLE_ROW_ORDER`` entry, labelled via
	``DISPLAY_LABELS`` (title-cased factor/category when no label is
	registered). The text ends with a blank line.
	"""
	metric_columns = (
		"calorie_error_kcal",
		"protein_outside_range_mad_g",
		"carb_outside_range_mad_g",
		"fat_outside_range_mad_g",
		"fibre_mae_g",
		"feasible_pct",
		"cuisine_lt_025_pct",
		"cuisine_025_05_pct",
		"cuisine_05_075_pct",
		"cuisine_gt_075_pct",
	)
	output = [
		f"## {agent_name}",
		"",
		"| Input factor | Category | N | Cal. error (kcal) | Protein outside-range MAD (g) | Carb outside-range MAD (g) | Fat outside-range MAD (g) | Fibre MAE (g) | Feasible (%) | Cuisine <0.25 (%) | Cuisine 0.25-0.5 (%) | Cuisine 0.5-0.75 (%) | Cuisine >0.75 (%) |",
		"| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
	]
	for factor, category in TABLE_ROW_ORDER:
		key = (_canonical_key(factor), _canonical_key(category))
		summary = _summarize_row(grouped, key)
		display_factor, display_category = DISPLAY_LABELS.get(key, (factor.title(), category.title()))
		cells = [str(display_factor), str(display_category), str(summary["n"])]
		cells.extend(_format_metric(summary[column]) for column in metric_columns)
		output.append("| " + " | ".join(cells) + " |")
	output.append("")
	return "\n".join(output)


def _empty_summary() -> dict[str, Any]:
	"""Return a fresh summary for a group with no cases: ``n`` is 0, every metric ``None``."""
	metric_names = (
		"calorie_error_kcal",
		"protein_outside_range_mad_g",
		"carb_outside_range_mad_g",
		"fat_outside_range_mad_g",
		"fibre_mae_g",
		"feasible_pct",
		"cuisine_lt_025_pct",
		"cuisine_025_05_pct",
		"cuisine_05_075_pct",
		"cuisine_gt_075_pct",
	)
	blank: dict[str, Any] = {"n": 0}
	blank.update(dict.fromkeys(metric_names))
	return blank


def _format_pair(ano_value: float | None, llm_value: float | None, *, higher_is_better: bool = False) -> str:
	"""Format two competing values as ``"<first> / <second>"`` for LaTeX.

	When both values are real numbers and differ, the better one is wrapped
	in ``\\textbf{}``: the larger when *higher_is_better*, else the smaller.
	"""
	left = _format_metric(ano_value)
	right = _format_metric(llm_value)

	comparable = (
		ano_value is not None
		and llm_value is not None
		and not math.isnan(ano_value)
		and not math.isnan(llm_value)
	)
	if not comparable or ano_value == llm_value:
		return f"{left} / {right}"

	first_wins = ano_value > llm_value if higher_is_better else ano_value < llm_value
	if first_wins:
		left = f"\\textbf{{{left}}}"
	else:
		right = f"\\textbf{{{right}}}"
	return f"{left} / {right}"


def _format_n_cell(ano_n: int, llm_n: int) -> str:
	"""Render the sample-size cell: a single number when both counts agree, else ``"a/b"``."""
	return str(ano_n) if ano_n == llm_n else f"{ano_n}/{llm_n}"


def _get_agent_summary(
	summaries_by_agent: dict[str, dict[tuple[str, str], dict[str, Any]]],
	agent_name: str | None,
	key: tuple[str, str],
) -> dict[str, Any]:
	"""Look up the summary of *key* for *agent_name*.

	Falls back to an empty summary when no agent is given, the agent has no
	summaries, or the agent has no entry for *key*.
	"""
	if not agent_name:
		return _empty_summary()
	placeholder = _empty_summary()
	per_agent = summaries_by_agent.get(agent_name, {})
	return per_agent.get(key, placeholder)


def _resolve_pair_agents(available_agents: list[str]) -> tuple[str | None, str | None]:
	"""Choose the two agents to compare side by side.

	``BaselineCanadaDRIPlanner`` and ``OptimizerCanadaDRIPlanner`` are
	preferred for the first and second slot. A slot whose preferred agent is
	unavailable is filled with the first available agent that does not
	already occupy the other slot, so the pair never names the same agent
	twice.

	Returns:
		``(first, second)``; a slot is ``None`` when no distinct agent is left
		to fill it.
	"""
	ano = "BaselineCanadaDRIPlanner" if "BaselineCanadaDRIPlanner" in available_agents else None
	llm_only = "OptimizerCanadaDRIPlanner" if "OptimizerCanadaDRIPlanner" in available_agents else None

	if ano is None:
		# Skip the agent already chosen for the second slot; comparing an
		# agent against itself carries no information.
		ano = next((candidate for candidate in available_agents if candidate != llm_only), None)
	if llm_only is None:
		llm_only = next((candidate for candidate in available_agents if candidate != ano), None)

	return ano, llm_only


def _render_latex_table_rows(summaries_by_agent: dict[str, dict[tuple[str, str], dict[str, Any]]]) -> str:
	"""Render the LaTeX body rows comparing baseline and optimizer agents.

	For each ``AGENT_COMPARISON_GROUPS`` entry with at least one summarized
	agent, a heading row is emitted followed by the rows described by
	``LATEX_TABLE_LAYOUT``. Every row is followed by ``\\hline``. Metric cells
	hold ``baseline / optimizer`` with the better value in bold.
	"""
	# (template slot, summary key, whether larger values are better)
	metric_cells = (
		("cal", "calorie_error_kcal", False),
		("protein", "protein_outside_range_mad_g", False),
		("carb", "carb_outside_range_mad_g", False),
		("fat", "fat_outside_range_mad_g", False),
		("fibre", "fibre_mae_g", False),
		("feasible", "feasible_pct", True),
		("lt_025", "cuisine_lt_025_pct", False),
		("r_025_05", "cuisine_025_05_pct", False),
		("r_05_075", "cuisine_05_075_pct", True),
		("gt_075", "cuisine_gt_075_pct", True),
	)
	row_template = "{label} & {n} & {cal} & {protein} & {carb} & {fat} & {fibre} & {feasible} & {lt_025} & {r_025_05} & {r_05_075} & {gt_075} \\\\"

	rendered: list[str] = []
	for heading, baseline_agent, optimizer_agent in AGENT_COMPARISON_GROUPS:
		has_any = baseline_agent in summaries_by_agent or optimizer_agent in summaries_by_agent
		if not has_any:
			continue
		rendered.append("\\multicolumn{{12}}{{l}}{{\\textbf{{{heading}}}}} \\\\".format(heading=heading))
		rendered.append("\\hline")

		for row_type, label, key_pair in LATEX_TABLE_LAYOUT:
			if row_type == "group":
				rendered.append("{label} & & & & & & & & & & & \\\\".format(label=label))
				rendered.append("\\hline")
				continue

			assert key_pair is not None
			key = (_canonical_key(key_pair[0]), _canonical_key(key_pair[1]))
			baseline_summary = _get_agent_summary(summaries_by_agent, baseline_agent, key)
			optimizer_summary = _get_agent_summary(summaries_by_agent, optimizer_agent, key)

			sample_sizes = _format_n_cell(int(baseline_summary["n"]), int(optimizer_summary["n"]))
			cells = {
				slot: _format_pair(baseline_summary[metric], optimizer_summary[metric], higher_is_better=prefers_high)
				for slot, metric, prefers_high in metric_cells
			}
			rendered.append(row_template.format(label=label, n=sample_sizes, **cells))
			rendered.append("\\hline")

	return "\n".join(rendered)


def _bucket_rate(values: list[float], *, lower: float | None, upper: float | None, include_lower: bool, include_upper: bool) -> float | None:
	"""Return the percentage of *values* that fall inside a bucket.

	A ``None`` bound leaves that side open; ``include_lower`` and
	``include_upper`` decide whether a value equal to the bound counts.
	Returns ``None`` for an empty *values* list.
	"""
	if not values:
		return None

	def in_bucket(value: float) -> bool:
		if lower is not None:
			below = value < lower if include_lower else value <= lower
			if below:
				return False
		if upper is not None:
			above = value > upper if include_upper else value >= upper
			if above:
				return False
		return True

	matching = [value for value in values if in_bucket(value)]
	return _pct(len(matching), len(values))


def _html_cell_for_bucket(values: list[float], *, lower: float | None, upper: float | None, include_lower: bool, include_upper: bool) -> str:
	"""Render the share of *values* inside a bucket as ``"12.3%"``, or ``"--"`` when undefined."""
	share = _bucket_rate(
		values,
		lower=lower,
		upper=upper,
		include_lower=include_lower,
		include_upper=include_upper,
	)
	return "--" if share is None else f"{share:.1f}%"


def _percentile(values: list[float], percentile: float) -> float | None:
	"""Return the requested percentile of *values* using linear interpolation.

	The rank is ``percentile / 100 * (len(values) - 1)`` over the sorted
	values; a fractional rank interpolates between its two neighbours.
	Returns ``None`` for an empty list and the sole element for a
	single-item list.
	"""
	if not values:
		return None
	count = len(values)
	if count == 1:
		return values[0]

	ordered = sorted(values)
	rank = (percentile / 100.0) * (count - 1)
	below = int(math.floor(rank))
	above = int(math.ceil(rank))
	if below == above:
		return ordered[below]
	fraction = rank - below
	return ordered[below] + (ordered[above] - ordered[below]) * fraction


def _html_cell_for_scalar(value: float | None, suffix: str = "") -> str:
	"""Render *value* with one decimal followed by *suffix*; ``"--"`` for ``None`` or NaN."""
	if value is None:
		return "--"
	if math.isnan(value):
		return "--"
	return format(value, ".1f") + suffix


def _case_calorie_deviation(case: CaseMetrics) -> float | None:
	"""Return the calorie error of *case*: achieved ``calories`` against its calorie target."""
	achieved_calories = case.achieved.get("calories")
	return _calorie_error(achieved_calories, case.calories_target)


def _case_protein_deviation(case: CaseMetrics) -> float | None:
	"""Return how far the achieved ``protein`` of *case* lies outside its protein range."""
	achieved_protein = case.achieved.get("protein")
	return _outside_range_abs_deviation(achieved_protein, case.protein_target)


def _case_carb_deviation(case: CaseMetrics) -> float | None:
	"""Return how far the achieved ``carbohydrates`` of *case* lie outside their range."""
	achieved_carbs = case.achieved.get("carbohydrates")
	return _outside_range_abs_deviation(achieved_carbs, case.carbohydrates_target)


def _case_fat_deviation(case: CaseMetrics) -> float | None:
	"""Return how far the achieved ``total_fat`` of *case* lies outside its fat range."""
	achieved_fat = case.achieved.get("total_fat")
	return _outside_range_abs_deviation(achieved_fat, case.total_fat_target)


def _case_fibre_deviation(case: CaseMetrics) -> float | None:
	"""Return the absolute error of the achieved ``total_fibre`` of *case* against its fibre target."""
	achieved_fibre = case.achieved.get("total_fibre")
	return _abs_error_from_target(achieved_fibre, case.total_fibre_target)


def _render_html_bucket_table(cases_by_agent: dict[str, list[CaseMetrics]], ordered_agents: list[str]) -> str:
	"""Render a standalone HTML page comparing agents across metric buckets.

	The table contains, in order: one row per bucket specification (each cell
	formatted by ``_html_cell_for_bucket``), overall latency p50/p95 rows, and
	latency p50/p95 rows split by tool-retry count. There is one column per
	entry of ``ordered_agents``.

	Args:
		cases_by_agent: Case metrics keyed by agent name; an agent missing
			from the mapping is treated as having no cases.
		ordered_agents: Agent names in column order; also used as the column
			headers.

	Returns:
		The complete HTML document as one newline-joined string.
	"""
	# Function-scope import keeps this block self-contained (no new
	# module-level dependency is required).
	from html import escape

	# Retry counts that get their own rows (bucket share and latency split).
	retry_buckets = (0, 1, 2)

	def present(candidates) -> list[float]:
		# Drop cases for which a metric could not be computed.
		return [candidate for candidate in candidates if candidate is not None]

	metric_values: dict[str, dict[str, list[float]]] = {}
	latency_by_retry_bucket: dict[str, dict[int, list[float]]] = {}
	for agent_name in ordered_agents:
		cases = cases_by_agent.get(agent_name, [])
		metric_values[agent_name] = {
			"calorie": present(_case_calorie_deviation(case) for case in cases),
			"protein": present(_case_protein_deviation(case) for case in cases),
			"carb": present(_case_carb_deviation(case) for case in cases),
			"fat": present(_case_fat_deviation(case) for case in cases),
			"fibre": present(_case_fibre_deviation(case) for case in cases),
			"min_daily_calorie": present(case.min_daily_calories for case in cases),
			"max_daily_calorie": present(case.max_daily_calories for case in cases),
			"cuisine": present(case.cuisine_alignment_score for case in cases),
			"palatability": present(case.palatability_score for case in cases),
			"latency": present(case.latency_seconds for case in cases),
			"tool_retries": [float(case.tool_retries) for case in cases if case.tool_retries is not None],
		}
		# Only retry counts that are actually rendered are collected; cases
		# with any other retry count (or no latency) are left out of the split.
		latency_buckets: dict[int, list[float]] = {bucket: [] for bucket in retry_buckets}
		for case in cases:
			if case.tool_retries in latency_buckets and case.latency_seconds is not None:
				latency_buckets[case.tool_retries].append(case.latency_seconds)
		latency_by_retry_bucket[agent_name] = latency_buckets

	# Each spec: (row label, metric key, lower bound, upper bound,
	# lower bound inclusive?, upper bound inclusive?). ``None`` means unbounded.
	row_specs = [
		("Calorie deviation <=10 kcal", "calorie", 0.0, 10.0, True, True),
		("Calorie deviation 10-50 kcal", "calorie", 10.0, 50.0, False, True),
		("Calorie deviation 50-100 kcal", "calorie", 50.0, 100.0, False, True),
		("Calorie deviation >100 kcal", "calorie", 100.0, None, False, False),
		("Protein deviation <=5 g", "protein", 0.0, 5.0, True, True),
		("Protein deviation 5-25 g", "protein", 5.0, 25.0, False, True),
		("Protein deviation 25-50 g", "protein", 25.0, 50.0, False, True),
		("Protein deviation >50 g", "protein", 50.0, None, False, False),
		("Carbohydrate deviation <=2 g", "carb", 0.0, 2.0, True, True),
		("Carbohydrate deviation 2-5 g", "carb", 2.0, 5.0, False, True),
		("Carbohydrate deviation 5-50 g", "carb", 5.0, 50.0, False, True),
		("Carbohydrate deviation >50 g", "carb", 50.0, None, False, False),
		("Fat deviation <=5 g", "fat", 0.0, 5.0, True, True),
		("Fat deviation 5-10 g", "fat", 5.0, 10.0, False, True),
		("Fat deviation 10-25 g", "fat", 10.0, 25.0, False, True),
		("Fat deviation >25 g", "fat", 25.0, None, False, False),
		("Dietary fiber deviation <=5 g", "fibre", 0.0, 5.0, True, True),
		("Dietary fiber deviation 5-10 g", "fibre", 5.0, 10.0, False, True),
		("Dietary fiber deviation 10-15 g", "fibre", 10.0, 15.0, False, True),
		("Dietary fiber deviation >15 g", "fibre", 15.0, None, False, False),
		("Min daily calorie <1000 kcal", "min_daily_calorie", None, 1000.0, False, False),
		("Min daily calorie 1000-1400 kcal", "min_daily_calorie", 1000.0, 1400.0, True, True),
		("Min daily calorie >1400 kcal", "min_daily_calorie", 1400.0, None, False, False),
		("Max daily calorie >3800 kcal", "max_daily_calorie", 3800.0, None, False, False),
		("Max daily calorie 3400-3800 kcal", "max_daily_calorie", 3400.0, 3800.0, True, True),
		("Max daily calorie <3400 kcal", "max_daily_calorie", None, 3400.0, False, False),
		("Cuisine alignment score <0.25", "cuisine", None, 0.25, False, False),
		("Cuisine alignment score 0.25-0.50", "cuisine", 0.25, 0.5, True, False),
		("Cuisine alignment score 0.50-0.75", "cuisine", 0.5, 0.75, True, False),
		("Cuisine alignment score >=0.75", "cuisine", 0.75, None, True, False),
		("Palatability score <0.25", "palatability", None, 0.25, False, False),
		("Palatability score 0.25-0.50", "palatability", 0.25, 0.5, True, False),
		("Palatability score 0.50-0.75", "palatability", 0.5, 0.75, True, True),
		("Palatability score >0.75", "palatability", 0.75, None, False, False),
	]
	# Exact-match buckets for each rendered retry count.
	row_specs.extend(
		(f"Number of tool retries ={bucket}", "tool_retries", float(bucket), float(bucket), True, True)
		for bucket in retry_buckets
	)

	headers = ["Metric Bucket", *ordered_agents]
	lines = [
		"<!doctype html>",
		"<html lang=\"en\">",
		"<head>",
		"  <meta charset=\"utf-8\" />",
		"  <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />",
		"  <title>Deviation and Cuisine Alignment Buckets</title>",
		"  <style>",
		"    body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; margin: 24px; color: #1f2937; }",
		"    h1 { font-size: 20px; margin-bottom: 12px; }",
		"    table { border-collapse: collapse; width: 100%; max-width: 1200px; }",
		"    th, td { border: 1px solid #d1d5db; padding: 8px 10px; text-align: right; }",
		"    th:first-child, td:first-child { text-align: left; }",
		"    thead th { background: #f3f4f6; }",
		"    tbody tr:nth-child(odd) { background: #fafafa; }",
		"  </style>",
		"</head>",
		"<body>",
		"  <h1>Deviation and Cuisine Alignment Buckets</h1>",
		"  <table>",
		"    <thead>",
		"      <tr>",
	]
	# Headers include caller-supplied agent names, so escape them.
	lines.extend(f"        <th>{escape(header)}</th>" for header in headers)
	lines.extend([
		"      </tr>",
		"    </thead>",
		"    <tbody>",
	])

	def emit_row(label: str, cells: list[str]) -> None:
		# Labels contain "<", "<=", ">" and ">=", which must be escaped to
		# produce valid HTML text content.
		lines.append("      <tr>")
		lines.append(f"        <td>{escape(label)}</td>")
		lines.extend(f"        <td>{cell}</td>" for cell in cells)
		lines.append("      </tr>")

	for label, metric_key, lower, upper, include_lower, include_upper in row_specs:
		emit_row(
			label,
			[
				_html_cell_for_bucket(
					metric_values[agent_name][metric_key],
					lower=lower,
					upper=upper,
					include_lower=include_lower,
					include_upper=include_upper,
				)
				for agent_name in ordered_agents
			],
		)

	# Overall latency percentiles across all cases with a recorded latency.
	for label, rank in (("Latency p50 (s)", 50.0), ("Latency p95 (s)", 95.0)):
		emit_row(
			label,
			[
				_html_cell_for_scalar(_percentile(metric_values[agent_name]["latency"], rank))
				for agent_name in ordered_agents
			],
		)

	# Latency percentiles split by retry count: p50 row then p95 row per bucket.
	for retry_bucket in retry_buckets:
		for rank_name, rank in (("p50", 50.0), ("p95", 95.0)):
			emit_row(
				f"Latency {rank_name} (s) for tool retries ={retry_bucket}",
				[
					_html_cell_for_scalar(_percentile(latency_by_retry_bucket[agent_name][retry_bucket], rank))
					for agent_name in ordered_agents
				],
			)

	lines.extend([
		"    </tbody>",
		"  </table>",
		"</body>",
		"</html>",
	])
	return "\n".join(lines)


def parse_args() -> argparse.Namespace:
	"""Define the command-line interface and parse the process arguments.

	Options:
		--agents: one or more agent names, restricted to the keys of
			``AGENT_CONFIGS``; defaults to ``DEFAULT_AGENT_ORDER``.
		--samples-path: path to the samples file; defaults to
			``DEFAULT_SAMPLES_PATH``.
		--output-json: optional path for structured JSON output.
		--format: stdout format, one of "latex" (default), "markdown", "html".
		--output-html: optional path for HTML output.
	"""
	cli = argparse.ArgumentParser(
		description="Compute LaTeX results-table metrics from ADK eval_history outputs.",
	)
	agent_choices = sorted(AGENT_CONFIGS)
	cli.add_argument("--agents", nargs="+", choices=agent_choices, default=DEFAULT_AGENT_ORDER, help="Agents to evaluate.")
	cli.add_argument("--samples-path", type=Path, default=DEFAULT_SAMPLES_PATH, help="Path to category_samples.json.")
	cli.add_argument("--output-json", type=Path, help="Optional path to write structured JSON output.")
	cli.add_argument("--format", choices=["latex", "markdown", "html"], default="latex", help="Output format for stdout.")
	cli.add_argument("--output-html", type=Path, help="Optional path to write HTML output.")
	return cli.parse_args()


def main() -> None:
	"""Run the evaluation report end to end.

	For every requested agent: locate its eval history, load the latest case
	results and trace timings, build per-case metrics, group them, and
	summarise each row of ``TABLE_ROW_ORDER``. Afterwards optionally write the
	JSON summary and the HTML bucket table to disk, then print the report in
	the format selected by ``--format``.
	"""
	args = parse_args()
	sample_lookup = _load_sample_lookup(args.samples_path)

	json_output: dict[str, Any] = {}
	markdown_sections: list[str] = []
	summaries_by_agent: dict[str, dict[tuple[str, str], dict[str, Any]]] = {}
	cases_by_agent: dict[str, list[CaseMetrics]] = {}

	def eval_id_of(entry: Any) -> str:
		# Entries are (history_timestamp, case_result) pairs; order by eval id.
		return str(entry[1].get("eval_id", ""))

	for agent_name in args.agents:
		history_dir = _resolve_eval_history_dir(agent_name)
		trace_timing_by_eval_id = _load_latest_trace_timing_by_eval_id(agent_name)
		latest = _latest_case_results(history_dir)

		candidate_metrics = (
			_build_case_metrics(
				agent_name,
				case_result,
				sample_lookup,
				history_timestamp,
				trace_timing_by_eval_id=trace_timing_by_eval_id,
			)
			for history_timestamp, case_result in sorted(latest.values(), key=eval_id_of)
		)
		# Cases for which no metrics could be built are dropped.
		cases: list[CaseMetrics] = [metrics for metrics in candidate_metrics if metrics is not None]

		grouped = _group_cases(cases)
		cases_by_agent[agent_name] = cases

		# The same summary is stored twice: under a "factor::category" string
		# for JSON output and under the tuple key for the LaTeX renderer.
		summary_for_json: dict[str, Any] = {}
		summary_by_key: dict[tuple[str, str], dict[str, Any]] = {}
		for factor, category in TABLE_ROW_ORDER:
			row_key = (_canonical_key(factor), _canonical_key(category))
			row_summary = _summarize_row(grouped, row_key)
			summary_for_json[f"{row_key[0]}::{row_key[1]}"] = row_summary
			summary_by_key[row_key] = row_summary
		json_output[agent_name] = summary_for_json
		summaries_by_agent[agent_name] = summary_by_key
		markdown_sections.append(_render_markdown(agent_name, grouped))

	if args.output_json:
		args.output_json.write_text(json.dumps(json_output, indent=2), encoding="utf-8")

	# The HTML table is rendered once and shared by the file and stdout outputs.
	html_text: str | None = None
	if args.format == "html" or args.output_html:
		html_text = _render_html_bucket_table(cases_by_agent, args.agents)
	if args.output_html:
		args.output_html.write_text(html_text, encoding="utf-8")

	if args.format == "markdown":
		report = "\n\n".join(markdown_sections)
	elif args.format == "html":
		assert html_text is not None
		report = html_text
	else:
		report = _render_latex_table_rows(summaries_by_agent)
	print(report)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
	main()