from typing import Dict
from operator import itemgetter
import numpy as np

try:
    import gymnasium as gym
except Exception:
    import gym


def evaluate(
    agent, env: "gym.Env", num_episodes: int, episode_length: int
) -> Dict[str, np.ndarray]:
    """Roll out ``agent`` on a vectorized ``env`` and average the results.

    Each of the ``num_episodes`` rounds resets ``env`` and steps all of its
    ``env.num_envs`` sub-environments in lockstep until every one of them has
    reported done at least once, or ``episode_length`` steps have been taken
    (at least one step is always taken when there is an unfinished
    sub-environment).

    Args:
        agent: Object exposing ``sample_actions(observations, temperature)``;
            it is queried with ``temperature=0.0``.
        env: Vectorized environment exposing ``num_envs``, ``reset()``
            returning the observations, and ``step(actions)`` returning
            ``(observations, rewards, dones, infos)`` where ``infos`` is
            indexable per sub-environment.
        num_episodes: Number of evaluation rounds to average over.
        episode_length: Maximum number of steps per round.

    Returns:
        A dict with key ``"return"`` mapping to the per-sub-environment
        return averaged over rounds. If the first sub-environment's final
        ``infos`` entry has a ``"solved"`` or ``"is_success"`` key, the dict
        also has ``"success"``: that value, taken from the last step of each
        round, averaged over rounds per sub-environment.
    """
    n_seeds = env.num_envs
    aggregate_dict = {"return": []}
    for _ in range(num_episodes):
        observations = env.reset()
        # Sticky mask: once a sub-environment has reported done it stays
        # finished for the rest of the round, even if the environment clears
        # the flag afterwards (e.g. because it auto-resets).
        finished = np.zeros(n_seeds, dtype=bool)
        rets, length = np.zeros(n_seeds), 0
        infos = None  # stays None when the loop body never runs
        while not finished.all():
            actions = agent.sample_actions(observations, temperature=0.0)
            observations, rewards, dones, infos = env.step(actions)
            # Count rewards only for sub-environments that had not finished
            # before this step; the terminating step itself still counts.
            rets += np.where(finished, 0.0, rewards)
            finished = np.logical_or(finished, dones)
            length += 1
            if length >= episode_length:
                break

        aggregate_dict["return"].append(rets)

        if infos is None or len(infos) == 0:
            continue

        # for myosuite and gymnasium-robotics
        # "solved" takes precedence over "is_success" when both are present.
        first_info = infos[0]
        success_key = next(
            (key for key in ("solved", "is_success") if key in first_info),
            None,
        )
        if success_key is not None:
            aggregate_dict.setdefault("success", []).append(
                [info[success_key] for info in infos]
            )

    # calculate mean over rounds, keeping one value per sub-environment
    return {k: np.array(v).mean(axis=0) for k, v in aggregate_dict.items()}
