from __future__ import annotations

from typing import Dict
import numpy as np


def evaluate_policy(agent, env, episodes: int = 5, deterministic: bool = True) -> Dict[str, float]:
    """Roll out ``agent`` in ``env`` and summarise per-episode statistics.

    Each episode starts with ``env.reset()`` and runs until ``env.step``
    reports termination or truncation. The reward, the optional ``"cost"``
    entry of the step ``info`` mapping, and the step count are accumulated
    per episode.

    Args:
        agent: Object exposing ``select_action(obs, deterministic=...)``.
        env: Object whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns ``(obs, reward, terminated, truncated,
            info)``, where ``info`` supports ``.get``.
        episodes: Number of episodes to run. A value ``<= 0`` runs none.
        deterministic: Forwarded unchanged to ``agent.select_action``.

    Returns:
        A dict with the keys ``eval/return_mean``, ``eval/return_std``,
        ``eval/cost_mean`` and ``eval/len_mean``. If no episode was run,
        every value is NaN.
    """
    returns = []
    costs = []
    lengths = []
    for _ in range(episodes):
        obs, _info = env.reset()
        done = False
        ep_ret = 0.0
        ep_cost = 0.0
        ep_len = 0
        while not done:
            act = agent.select_action(obs, deterministic=deterministic)
            obs, rew, term, trunc, info = env.step(act)
            # Coerce so `done` is always a plain bool, whatever scalar type
            # the environment uses for its flags.
            done = bool(term) or bool(trunc)
            ep_ret += float(rew)
            # Steps that report no cost count as zero cost.
            ep_cost += float(info.get("cost", 0.0))
            ep_len += 1
        returns.append(ep_ret)
        costs.append(ep_cost)
        lengths.append(ep_len)

    if not returns:
        # np.mean / np.std on an empty list emit RuntimeWarnings and yield NaN
        # as a side effect; return the same NaN values explicitly instead.
        nan = float("nan")
        return {
            "eval/return_mean": nan,
            "eval/return_std": nan,
            "eval/cost_mean": nan,
            "eval/len_mean": nan,
        }

    return {
        "eval/return_mean": float(np.mean(returns)),
        "eval/return_std": float(np.std(returns)),
        "eval/cost_mean": float(np.mean(costs)),
        "eval/len_mean": float(np.mean(lengths)),
    }

