"""
Based on PureJaxRL Implementation of PPO.
"""
import os
from pathlib import Path
import pickle
import jax
import jax.numpy as jnp
import flax.linen as nn
import numpy as np
import optax
import pyrallis
from typing import NamedTuple
from flax.training.train_state import TrainState
import distrax
from flax import struct

import jax.scipy.stats

from dataclasses import asdict, dataclass

from src.agents.actors import ActorWithConditionalCritic, ScannedRNN, ActorCriticRNN
from src.envs import make_env
from src.envs.log_wrapper import LogWrapper

import distrax


import wandb
import pickle


@dataclass
class TrainConfig:
    """Settings for a training run.

    The annotated fields are the user-facing options; `__post_init__` then
    derives the batch sizes and update counts that depend on them.
    """

    # --- Logging (wandb) and checkpointing ---
    project: str = "JaxZSC"
    mode: str = "disabled"  # Literal["online", "offline", "disabled"]
    entity: str = ""
    checkpoint_path: str = "checkpoints"
    checkpoint_freq: int = 25  # Checkpoint every N updates

    # --- Environment (LBF) ---
    env_name: str = "lbf"

    # --- Actor-critic network ---
    activation: str = "tanh"
    fc_dim_size: int = 256
    gru_hidden_dim: int = 256

    embedding_layers: int = 2
    actor_layers: int = 4
    critic_layers: int = 4

    use_layernorm: bool = True

    other_agent_prediction: bool = True
    moa_coef: float = 1.0

    # --- Optimisation / PPO ---
    seed: int = 42
    lr: float = 1e-3
    anneal_lr: bool = True
    num_envs: int = 512

    num_steps_per_env: int = 128
    num_steps_per_update: int = 128

    total_timesteps: float = 5e7
    update_epochs: int = 6
    num_minibatches: int = 8
    gamma: float = 0.99
    gae_lambda: float = 0.95
    clip_eps: float = 0.2
    ent_coef: float = 0.01
    vf_coef: float = 1.0
    max_grad_norm: float = 0.5

    # --- SFL ---
    # Rollout length multiplier: how many times more steps to roll out than
    # the max_steps.
    sfl_rollout_factor: int = 10
    sfl_buffer_size: int = 128
    sfl_batch_size: int = 128
    sfl_num_batches: int = 1
    sfl_buffer_refresh_freq: int = 4
    sfl_num_envs_to_sample: int = 512

    # Options: gaussian-weighted-standarddev, variance-x-mean, variance,
    # mean-return, success-rate-over-global-median
    learnability_function: str = "variance"
    sample_with_dense_rewards: bool = False

    num_devices: int = 1

    eval_against_pop: bool = True

    def __post_init__(self):
        """Fill in the quantities derived from the user-set fields."""
        # Two agents per environment, so twice as many actors as environments.
        self.num_agents = 2
        self.num_actors = self.num_envs * self.num_agents
        self.sfl_num_envs_to_generate = (
            self.num_envs - self.sfl_num_envs_to_sample
        )

        self.num_inner_updates = (
            self.num_steps_per_env // self.num_steps_per_update
        )
        self.num_envs_per_device = self.num_envs // self.num_devices

        # Transitions gathered by all actors for one update, split evenly
        # across the minibatches.
        transitions_per_update = self.num_actors * self.num_steps_per_update
        self.minibatch_size = transitions_per_update // self.num_minibatches

        self.total_timesteps_per_device = (
            self.total_timesteps // self.num_devices
        )
        steps_per_meta_update = (
            self.num_envs_per_device * self.num_steps_per_env
        )
        self.num_meta_updates = round(
            self.total_timesteps_per_device / steps_per_meta_update
        )
        self.num_outer_steps = (
            self.num_meta_updates // self.sfl_buffer_refresh_freq
        )

        # Without a learnability score the rollout factor is forced to 2.
        if self.learnability_function == "none":
            self.sfl_rollout_factor = 2

        print('num inner updates', self.num_inner_updates)


def mask_and_norm(probs, mask, eps=1e-8):
    """Zero out masked entries of `probs` and renormalise along the last axis.

    When the surviving mass of a row is not positive, the row falls back to a
    uniform distribution over the entries where `mask` is set. A row whose
    mask is entirely zero therefore comes back as all zeros.

    Args:
        probs: Probabilities; the last axis is the one being normalised.
        mask: Multiplicative mask broadcastable against `probs`.
        eps: Lower bound on the normalising denominator.

    Returns:
        Array with the broadcast shape of `probs` and `mask`.
    """
    masked = probs * mask
    mass = jnp.sum(masked, axis=-1, keepdims=True)

    # Fallback: equal weight on every unmasked entry (the denominator is
    # floored at 1 so an empty mask does not divide by zero).
    support_size = mask.sum(-1, keepdims=True)
    fallback = mask / jnp.maximum(support_size, 1.0)

    renormalised = masked / jnp.maximum(mass, eps)
    return jnp.where(mass > 0, renormalised, fallback)


class RolloutStats(struct.PyTreeNode):
    """Running totals carried through the single-episode rollout loops."""
    # Sum of the rewards accumulated so far; starts at 0.0.
    reward: jax.Array = jnp.asarray(0.0)
    # Number of environment steps taken so far; starts at 0.
    length: jax.Array = jnp.asarray(0)


class PartnerParametersWithBias(struct.PyTreeNode):
    """Randomised partner-behaviour parameters, as built by
    `sample_partner_parameters` and consumed by `rollout_nsteps`."""
    # Mixing weight between the policy and the biased random distribution.
    epsilon: jnp.float32
    # Flag selecting which of the two agents acts with the epsilon-mixture.
    # NOTE(review): `sample_partner_parameters` stores a bool array here even
    # though the annotation says int32.
    epsilon_agent: jnp.int32
    bias_mask: jnp.ndarray  # shape (ACTION_SPACE_SIZE,)


def sample_partner_parameters(num_agents, rng: jax.random.PRNGKey) -> PartnerParametersWithBias:
    """Draw one random set of partner parameters.

    `epsilon` is uniform on [0, 1), `epsilon_agent` is a fair coin flip, and
    the action-bias vector is, with probability 0.5, a Dirichlet sample and
    otherwise the uniform distribution over the actions.

    Args:
        num_agents: Accepted for interface compatibility; not used.
        rng: PRNG key, split five ways (the first sub-key is not consumed).

    Returns:
        A `PartnerParametersWithBias` holding the sampled values.
    """
    ACTION_SPACE_SIZE = 6
    DIRICHLET_ALPHA = 1.0

    sub_keys = jax.random.split(rng, 5)
    _, eps_key, agent_key, flag_key, dirichlet_key = sub_keys

    mix_weight = jax.random.uniform(
        eps_key, shape=(), minval=0.0, maxval=1.0)
    agent_coin = jax.random.bernoulli(agent_key)

    # Fair coin deciding between a Dirichlet-sampled bias and a flat one.
    apply_bias = jax.random.bernoulli(flag_key, p=0.5)
    concentration = DIRICHLET_ALPHA * jnp.ones(ACTION_SPACE_SIZE)
    action_bias = jax.lax.cond(
        apply_bias,
        lambda: jax.random.dirichlet(dirichlet_key, alpha=concentration),
        lambda: jnp.ones((ACTION_SPACE_SIZE,)) / ACTION_SPACE_SIZE,
    )

    return PartnerParametersWithBias(
        epsilon=mix_weight,
        epsilon_agent=jnp.asarray(agent_coin, dtype=bool),
        bias_mask=action_bias,
    )


def rollout_nsteps(
    rng: jax.Array,
    env,
    partner_params: PartnerParametersWithBias,
    train_state: TrainState,
    init_hstate: jax.Array,
    num_steps: int = 400,
    sfl_rollout_factor: int = 2,
    use_dense_rewards: bool = False,
    dense_rewards_coeff: jax.Array = jnp.asarray(0.0),
):
    """Roll the policy out for exactly `num_steps` environment steps.

    Both agents are driven by `train_state`. One of them (selected by
    `partner_params.epsilon_agent`) samples from an epsilon-mixture of the
    masked policy and the masked `partner_params.bias_mask`; the other samples
    from the masked policy alone. The scan has a fixed length and does not
    stop at episode boundaries: whenever `done["__all__"]` is set, the
    accumulated return is written into the next slot of `episode_return` and
    the accumulator restarts from zero.

    Args:
        rng: PRNG key; split once into the reset key and the scan key.
        env: Environment providing `reset`, `step`, `get_avail_actions`,
            `agents` and `num_agents`.
        partner_params: Epsilon, agent selector and action bias for this
            rollout.
        train_state: Supplies `apply_fn` and `params` for the policy network.
        init_hstate: Initial recurrent state handed to `apply_fn`.
        num_steps: Number of scan iterations (environment steps).
        sfl_rollout_factor: Number of slots in the per-episode return buffer.
        use_dense_rewards: If true, `dense_rewards_coeff` times the summed
            `info["shaped_reward"]` of both agents is added at every step.
        dense_rewards_coeff: Scale of the dense term (0.0 by default, in
            which case the term contributes nothing).

    Returns:
        `(stats, env_state)`: the final `RolloutEpisodeStats` and the final
        environment state.
    """

    class RolloutEpisodeStats(struct.PyTreeNode):
        # Return accumulated in the episode currently in progress.
        reward: jax.Array = jnp.asarray(0.0)
        # One slot per finished episode, indexed by `episode_counter`.
        # NOTE(review): if more than `sfl_rollout_factor` episodes finish,
        # the write index leaves the buffer; JAX's default scatter handling
        # should drop such writes -- TODO confirm this is the intent.
        episode_return: jax.Array = jnp.zeros((sfl_rollout_factor,))
        length: jax.Array = jnp.asarray(0)
        episode_counter: jax.Array = jnp.asarray(0)
        done: jax.Array = jnp.asarray(False)

    def _push_history(history, newest):
        """Drop the oldest entry along axis 1 and store `newest` last."""
        shifted = history.at[:, :-1].set(history[:, 1:])
        return shifted.at[:, -1].set(newest)

    def _sample_e3t(pi_ego, key, epsilon, bias_mask, avail_mask):
        """Sample from the epsilon-mix of masked policy and masked bias."""
        probs_ego = mask_and_norm(pi_ego.probs, avail_mask)
        probs_rand = mask_and_norm(bias_mask, avail_mask)
        probs_mix = (1.0 - epsilon) * probs_ego + epsilon * probs_rand
        # Final safety pass after mixing.
        probs_mix = mask_and_norm(probs_mix, avail_mask)
        return distrax.Categorical(probs=probs_mix).sample(seed=key)

    def _sample_base(pi_ego, key, avail_mask):
        """Sample from the policy after re-applying the availability mask."""
        probs_masked = mask_and_norm(pi_ego.probs, avail_mask)
        return distrax.Categorical(probs=probs_masked).sample(seed=key)

    def _env_step(carry, unused):
        rng, env_state, stats, last_obs, last_done, hstate, past_5_sa_pairs = carry
        rng, rng_action, rng_step = jax.random.split(rng, 3)

        obs_batch = batchify(last_obs, env.agents, 2)

        avail_actions = jax.lax.stop_gradient(
            env.get_avail_actions(env_state.env_state))
        avail_actions_batch = batchify(
            {
                "agent_0": avail_actions["agent_0"].astype(jnp.float32),
                "agent_1": avail_actions["agent_1"].astype(jnp.float32),
            },
            env.agents, 2)

        # The network sees the history as it was *before* this step's action.
        batched_sa_pairs = batchify_nested_dics(
            past_5_sa_pairs, env.agents, (1, 2, 5))
        ac_in = (
            obs_batch[np.newaxis, :],
            last_done[np.newaxis, :],
            batched_sa_pairs,
            avail_actions_batch[np.newaxis, :],
        )
        hstate, pi, _value, _other_pi = train_state.apply_fn(
            train_state.params, hstate, ac_in)

        # Broadcast the per-rollout epsilon and bias to one row per agent.
        num_actions = avail_actions_batch.shape[-1]
        bias_mask_b = jnp.broadcast_to(
            partner_params.bias_mask, (2, num_actions))
        epsilon_b = jnp.broadcast_to(partner_params.epsilon, (2, 1))

        # Each agent only ever uses one of the two samples, so both heads
        # can share the same action key.
        e3t_action = _sample_e3t(
            pi, rng_action, epsilon_b, bias_mask_b, avail_actions_batch)
        base_action = _sample_base(pi, rng_action, avail_actions_batch)

        # Row 0 follows `epsilon_agent`, row 1 its negation, so exactly one
        # agent acts with the epsilon-mixture.
        use_e3t = jnp.array(
            [partner_params.epsilon_agent, ~partner_params.epsilon_agent],
            dtype=bool,
        )  # (2,)
        action = jnp.where(use_e3t, e3t_action, base_action).squeeze()  # (2,)

        env_act = unbatchify(action, env.agents, 1, env.num_agents)
        env_act = {k: v.flatten().squeeze() for k, v in env_act.items()}

        # Record the observation acted upon together with the chosen action.
        past_5_sa_pairs = {
            agent: {
                'obs': _push_history(
                    past_5_sa_pairs[agent]['obs'], last_obs[agent]),
                'action': _push_history(
                    past_5_sa_pairs[agent]['action'], env_act[agent]),
            }
            for agent in ('agent_0', 'agent_1')
        }

        # STEP ENV
        obsv, env_state, reward, done, info = env.step(
            rng_step, env_state, env_act)

        done_flag = done["__all__"]
        final_episode_return = stats.reward + reward["agent_0"]
        if use_dense_rewards:
            final_episode_return = final_episode_return + dense_rewards_coeff * (
                info["shaped_reward"]["agent_0"] +
                info["shaped_reward"]["agent_1"]
            )

        # Store the return only when the episode just finished.
        new_episode_return = jax.lax.cond(
            done_flag,
            lambda: stats.episode_return.at[stats.episode_counter].set(
                final_episode_return),
            lambda: stats.episode_return,
        )

        # Restart the accumulator when the episode just finished.
        new_reward = jax.lax.cond(
            done_flag,
            lambda: jnp.array(0.0),
            lambda: final_episode_return,
        )

        stats = stats.replace(
            reward=new_reward,
            length=stats.length + 1,
            done=done_flag,
            episode_counter=stats.episode_counter + done_flag.astype(jnp.int32),
            episode_return=new_episode_return,
        )
        done = batchify(done, env.agents, 2)
        carry = (rng, env_state, stats, obsv,
                 done.squeeze(), hstate, past_5_sa_pairs)
        return carry, None

    key, key_r = jax.random.split(rng)
    obs, state = env.reset(key_r)

    # Seed the history with five copies of the first observation and a
    # constant action id of 4.
    # NOTE(review): 4 is presumably the no-op action -- confirm against env.
    past_5_sa_pairs = {
        agent: {
            'obs': obs[agent][:, None].repeat(5, axis=1),
            'action': jnp.ones((1, 5)) * 4,
        }
        for agent in ('agent_0', 'agent_1')
    }

    # Seed the scan with `key`, not `rng`: `rng` was already consumed by the
    # split above and JAX keys must not be reused after splitting.
    init_carry = (key, state, RolloutEpisodeStats(), obs,
                  jnp.array([False, False]), init_hstate, past_5_sa_pairs)

    final_carry, _ = jax.lax.scan(
        _env_step, init_carry, None, length=num_steps)

    return final_carry[2], final_carry[1]


def rollout(rng, env, network, params, hidden_size):
    """Play one episode with `network` controlling both agents.

    The loop runs until every agent reports done.

    Args:
        rng: PRNG key; split once into the reset key and the loop key.
        env: Environment providing `reset`, `step`, `get_avail_actions`,
            `agents` and `num_agents`.
        network: Module whose `apply(params, hstate, ac_in)` returns
            `(hstate, pi, value, other_pi)`.
        params: Parameters for `network`.
        hidden_size: Recurrent state size passed to
            `ScannedRNN.initialize_carry`.

    Returns:
        `(episode_return, episode_length)`, where the return is the sum of
        `reward["agent_0"]` over the episode.
    """
    def _push_history(history, newest):
        """Drop the oldest entry along axis 1 and store `newest` last."""
        shifted = history.at[:, :-1].set(history[:, 1:])
        return shifted.at[:, -1].set(newest)

    def _cond_fn(carry):
        done = carry[-1]
        # Continue while at least one agent is not done.
        return jnp.logical_not(done).any()

    def _body_fn(carry):
        rng, env_state, stats, last_obs, hstate, past_5_sa_pairs, done = carry

        rng, rng_action, rng_step = jax.random.split(rng, 3)

        obs_batch = batchify(last_obs, env.agents, 2)
        # The network sees the history as it was *before* this step's action.
        batched_sa_pairs = batchify_nested_dics(
            past_5_sa_pairs, env.agents, (1, 2, 5))
        avail = env.get_avail_actions(env_state.env_state)
        avail_batch = batchify(
            {
                "agent_0": avail["agent_0"].astype(jnp.float32),
                "agent_1": avail["agent_1"].astype(jnp.float32),
            },
            env.agents, 2)

        ac_in = (
            obs_batch[np.newaxis, :],
            done[np.newaxis, :],
            batched_sa_pairs,
            avail_batch[np.newaxis, :],
        )

        hstate, pi, _value, _other_pi = network.apply(params, hstate, ac_in)
        action = pi.sample(seed=rng_action).squeeze()

        env_act = unbatchify(action, env.agents, 1, env.num_agents)
        env_act = {k: v.flatten().squeeze() for k, v in env_act.items()}

        # Record the observation acted upon together with the chosen action.
        past_5_sa_pairs = {
            agent: {
                'obs': _push_history(
                    past_5_sa_pairs[agent]['obs'], last_obs[agent]),
                'action': _push_history(
                    past_5_sa_pairs[agent]['action'], env_act[agent]),
            }
            for agent in ('agent_0', 'agent_1')
        }

        obsv, env_state, reward, done, info = env.step(
            rng_step, env_state, env_act
        )

        stats = stats.replace(
            reward=stats.reward + reward["agent_0"],
            length=stats.length + 1
        )
        done = batchify(done, env.agents, 2)
        return (rng, env_state, stats, obsv, hstate,
                past_5_sa_pairs, done.squeeze())

    key, key_r = jax.random.split(rng)
    obs, state = env.reset(key_r)

    # Seed the history with five copies of the first observation and a
    # constant action id of 4.
    # NOTE(review): 4 is presumably the no-op action -- confirm against env.
    past_5_sa_pairs = {
        agent: {
            'obs': obs[agent][:, None].repeat(5, axis=1),
            'action': jnp.ones((1, 5)) * 4,
        }
        for agent in ('agent_0', 'agent_1')
    }

    # One recurrent state row per agent (two agents, hard-coded).
    init_hstate = ScannedRNN.initialize_carry(2, hidden_size)

    # Seed the loop with `key`, not `rng`: `rng` was already consumed by the
    # split above and JAX keys must not be reused after splitting.
    init_carry = (key, state, RolloutStats(), obs,
                  init_hstate, past_5_sa_pairs, jnp.array([False, False]))

    final_carry = jax.lax.while_loop(_cond_fn, _body_fn, init_val=init_carry)
    return final_carry[2].reward.squeeze(), final_carry[2].length.squeeze()


def rollout_single_l(rng, env, network, params, other_network, other_params, hidden_size, popsize):
    """Play one episode with the ego network as agent_0.

    `network` (recurrent, fed the 5-step history) controls agent_0 while
    `other_network` controls agent_1. The loop runs until every agent
    reports done.

    Args:
        rng: PRNG key; split once into the reset key and the loop key.
        env: Environment providing `reset`, `step`, `get_avail_actions` and
            `agents`.
        network: Ego module; `apply(params, hstate, ac_in)` returns a
            4-tuple starting with `(hstate, pi, ...)`.
        params: Parameters for `network`.
        other_network: Partner module; `apply(other_params, inputs)` returns
            `(pi, _)`.
        other_params: Parameters for `other_network`.
        hidden_size: Recurrent state size passed to
            `ScannedRNN.initialize_carry`.
        popsize: Length of the all-zero vector passed as the partner's
            second input.

    Returns:
        `(episode_return, episode_length)`, where the return is the sum of
        `reward["agent_0"]` over the episode.
    """
    def _push_history(history, newest):
        """Drop the oldest entry along axis 1 and store `newest` last."""
        shifted = history.at[:, :-1].set(history[:, 1:])
        return shifted.at[:, -1].set(newest)

    def _cond_fn(carry):
        done = carry[-1]
        # Continue while at least one agent is not done.
        return jnp.logical_not(done).any()

    def _body_fn(carry):
        rng, env_state, stats, last_obs, hstate, past_5_sa_pairs, done = carry

        rng, rng_action, rng_o_action, rng_step = jax.random.split(rng, 4)

        # Add a leading axis to the ego history before feeding the network.
        in_past_sa_pairs = jax.tree.map(
            lambda x: x[np.newaxis], past_5_sa_pairs["agent_0"])

        avail = env.get_avail_actions(env_state.env_state)
        ac_in = (
            last_obs["agent_0"][np.newaxis, np.newaxis, :],
            # Column 0 of the done vector is the ego agent (agent_0).
            done[np.newaxis, ...][:, 0:1],
            in_past_sa_pairs,
            avail["agent_0"].astype(jnp.float32)[np.newaxis, np.newaxis, :],
        )
        hstate, pi, _, _ = network.apply(params, hstate, ac_in)
        ego_action = pi.sample(seed=rng_action).squeeze()

        partner_pi, _ = other_network.apply(
            other_params,
            (
                last_obs["agent_1"][np.newaxis, :],
                jnp.zeros(popsize)[np.newaxis, :],
                avail["agent_1"].astype(jnp.float32)[np.newaxis, :],
            )
        )
        partner_action = partner_pi.sample(seed=rng_o_action).squeeze()

        env_act = {
            "agent_0": ego_action,
            "agent_1": partner_action,
        }

        # Only the ego agent keeps a history.
        past_5_sa_pairs = {
            'agent_0': {
                'obs': _push_history(
                    past_5_sa_pairs['agent_0']['obs'], last_obs['agent_0']),
                'action': _push_history(
                    past_5_sa_pairs['agent_0']['action'], env_act['agent_0']),
            },
        }

        obsv, env_state, reward, done, info = env.step(
            rng_step, env_state, env_act
        )

        stats = stats.replace(
            reward=stats.reward + reward["agent_0"],
            length=stats.length + 1
        )
        done = batchify(done, env.agents, 2)
        return (rng, env_state, stats, obsv, hstate,
                past_5_sa_pairs, done.squeeze())

    key, key_r = jax.random.split(rng)
    obs, state = env.reset(key_r)

    # Seed the history with five copies of the first observation and a
    # constant action id of 4.
    # NOTE(review): 4 is presumably the no-op action -- confirm against env.
    past_5_sa_pairs = {
        'agent_0': {
            'obs': obs['agent_0'][:, None].repeat(5, axis=1),
            'action': jnp.ones((1, 5)) * 4,
        },
    }

    # A single recurrent state row: only the ego agent is recurrent here.
    init_hstate = ScannedRNN.initialize_carry(1, hidden_size)

    # Seed the loop with `key`, not `rng`: `rng` was already consumed by the
    # split above and JAX keys must not be reused after splitting.
    init_carry = (key, state, RolloutStats(), obs,
                  init_hstate, past_5_sa_pairs, jnp.array([False, False]))

    final_carry = jax.lax.while_loop(_cond_fn, _body_fn, init_val=init_carry)
    return final_carry[2].reward.squeeze(), final_carry[2].length.squeeze()


def rollout_single_r(rng, env, network, params, other_network, other_params, hidden_size, popsize):
    """Play one episode with the ego network as agent_1.

    Mirror image of `rollout_single_l`: `network` (recurrent, fed the 5-step
    history) controls agent_1 while `other_network` controls agent_0. The
    loop runs until every agent reports done.

    Args:
        rng: PRNG key; split once into the reset key and the loop key.
        env: Environment providing `reset`, `step`, `get_avail_actions` and
            `agents`.
        network: Ego module; `apply(params, hstate, ac_in)` returns
            `(hstate, pi, value, other_pi)`.
        params: Parameters for `network`.
        other_network: Partner module; `apply(other_params, inputs)` returns
            `(pi, _)`.
        other_params: Parameters for `other_network`.
        hidden_size: Recurrent state size passed to
            `ScannedRNN.initialize_carry`.
        popsize: Length of the all-zero vector passed as the partner's
            second input.

    Returns:
        `(episode_return, episode_length)`, where the return is the sum of
        `reward["agent_0"]` over the episode.
    """
    def _push_history(history, newest):
        """Drop the oldest entry along axis 1 and store `newest` last."""
        shifted = history.at[:, :-1].set(history[:, 1:])
        return shifted.at[:, -1].set(newest)

    def _cond_fn(carry):
        done = carry[-1]
        # Continue while at least one agent is not done.
        return jnp.logical_not(done).any()

    def _body_fn(carry):
        rng, env_state, stats, last_obs, hstate, past_5_sa_pairs, done = carry

        rng, rng_action, rng_o_action, rng_step = jax.random.split(rng, 4)

        # Add a leading axis to the ego history before feeding the network.
        in_past_sa_pairs = jax.tree.map(
            lambda x: x[np.newaxis], past_5_sa_pairs["agent_1"])
        avail = env.get_avail_actions(env_state.env_state)
        ac_in = (
            last_obs["agent_1"][np.newaxis, np.newaxis, :],
            # The ego agent is agent_1, so feed it its own done flag
            # (column 1), not agent_0's. Assumes `env.agents` is ordered
            # (agent_0, agent_1), as the rest of this file does.
            done[np.newaxis, ...][:, 1:2],
            in_past_sa_pairs,
            avail["agent_1"].astype(jnp.float32)[np.newaxis, np.newaxis, :],
        )

        hstate, pi, _value, _other_pi = network.apply(params, hstate, ac_in)
        ego_action = pi.sample(seed=rng_action).squeeze()

        partner_pi, _ = other_network.apply(
            other_params,
            (
                last_obs["agent_0"][np.newaxis, :],
                jnp.zeros(popsize)[np.newaxis, :],
                avail["agent_0"].astype(jnp.float32)[np.newaxis, :],
            )
        )
        partner_action = partner_pi.sample(seed=rng_o_action).squeeze()

        env_act = {
            "agent_0": partner_action,
            "agent_1": ego_action,
        }

        # Only the ego agent keeps a history.
        past_5_sa_pairs = {
            'agent_1': {
                'obs': _push_history(
                    past_5_sa_pairs['agent_1']['obs'], last_obs['agent_1']),
                'action': _push_history(
                    past_5_sa_pairs['agent_1']['action'], env_act['agent_1']),
            },
        }

        obsv, env_state, reward, done, info = env.step(
            rng_step, env_state, env_act
        )

        # NOTE(review): the return is accumulated from agent_0's reward even
        # though the ego agent is agent_1 -- fine if rewards are shared;
        # confirm against the environment.
        stats = stats.replace(
            reward=stats.reward + reward["agent_0"],
            length=stats.length + 1
        )
        done = batchify(done, env.agents, 2)
        return (rng, env_state, stats, obsv, hstate,
                past_5_sa_pairs, done.squeeze())

    key, key_r = jax.random.split(rng)
    obs, state = env.reset(key_r)

    # Seed the history with five copies of the first observation and a
    # constant action id of 4.
    # NOTE(review): 4 is presumably the no-op action -- confirm against env.
    past_5_sa_pairs = {
        'agent_1': {
            'obs': obs['agent_1'][:, None].repeat(5, axis=1),
            'action': jnp.ones((1, 5)) * 4,
        },
    }

    # A single recurrent state row: only the ego agent is recurrent here.
    init_hstate = ScannedRNN.initialize_carry(1, hidden_size)

    # Seed the loop with `key`, not `rng`: `rng` was already consumed by the
    # split above and JAX keys must not be reused after splitting.
    init_carry = (key, state, RolloutStats(), obs,
                  init_hstate, past_5_sa_pairs, jnp.array([False, False]))

    final_carry = jax.lax.while_loop(_cond_fn, _body_fn, init_val=init_carry)
    return final_carry[2].reward.squeeze(), final_carry[2].length.squeeze()


def rollout_both_ways(eval_rng, env, network, params, partner_pop_actor, partner_pop_params, gru_hidden_dim, popsize):
    """Evaluate the ego network in both seats and average the returns.

    `eval_rng` is a batch of PRNG keys; every key is used for one episode
    with the ego network as agent_0 (`rollout_single_l`) and one with it as
    agent_1 (`rollout_single_r`). All other arguments are shared across the
    batch.

    Returns:
        Scalar mean over all returns from both seatings.
    """
    # Only the key is batched; everything else is broadcast.
    batched_over_keys = (0,) + (None,) * 7
    shared_args = (eval_rng, env, network, params, partner_pop_actor,
                   partner_pop_params, gru_hidden_dim, popsize)

    returns_as_agent_0, _ = jax.vmap(
        rollout_single_l, in_axes=batched_over_keys)(*shared_args)
    returns_as_agent_1, _ = jax.vmap(
        rollout_single_r, in_axes=batched_over_keys)(*shared_args)

    both_seats = jnp.array([returns_as_agent_0, returns_as_agent_1])
    return both_seats.mean()


class Transition(NamedTuple):
    """Record of per-step training data, one named array per quantity.

    NOTE(review): the code that fills and consumes this tuple lies outside
    the visible part of the file; field meanings below are inferred from the
    names only and should be confirmed there.
    """
    # Presumably the episode-level ("__all__") and per-agent done flags.
    global_done: jnp.ndarray
    done: jnp.ndarray
    action: jnp.ndarray
    value: jnp.ndarray
    reward: jnp.ndarray
    log_prob: jnp.ndarray
    obs: jnp.ndarray
    info: jnp.ndarray
    entropy: jnp.ndarray
    hstate: jnp.ndarray
    other_action: jnp.ndarray
    past_sa_pairs: jnp.ndarray
    avail_actions: jnp.ndarray


def batchify(x: dict, agent_list, num_actors):
    """Stack the per-agent entries of `x` into a `(num_actors, -1)` array.

    Entries are taken in the order given by `agent_list`; keys of `x` that
    are not listed there are ignored.
    """
    per_agent = tuple(x[agent] for agent in agent_list)
    stacked = jnp.stack(per_agent)
    return jnp.reshape(stacked, (num_actors, -1))


def batchify_nested_dics(x: dict, agent_list, shape):
    """Stack matching leaves of the per-agent trees and reshape each one.

    For every leaf position, the leaves of the agents in `agent_list` are
    stacked along a new leading axis and the result is reshaped to
    `(*shape, -1)`.
    """
    def _stack_then_reshape(*leaves):
        return jnp.stack(leaves).reshape((*shape, -1))

    per_agent_trees = [x[agent] for agent in agent_list]
    return jax.tree.map(_stack_then_reshape, *per_agent_trees)


def unbatchify(x: jnp.ndarray, agent_list, num_envs, num_actors):
    """Split a batched array back into one entry per agent.

    `x` is reshaped to `(num_actors, num_envs, -1)` and the i-th slice along
    the first axis is assigned to the i-th name in `agent_list`.
    """
    grid = jnp.reshape(x, (num_actors, num_envs, -1))
    per_agent = {}
    for row, agent in enumerate(agent_list):
        per_agent[agent] = grid[row]
    return per_agent


def make_update_fn(config, env, network, partner_pop_actor, partner_pop_params, pop_size):
    """Build the ``train_loop`` closure used for one buffer-refresh cycle.

    Args:
        config: training configuration; hyper-parameters are read from its
            attributes (e.g. ``sfl_*``, ``num_envs``, ``num_actors``,
            ``gamma``, ``clip_eps``, ``learnability_function``).
        env: wrapped environment exposing ``agents``, ``num_agents``,
            ``reset``, ``step``, ``get_avail_actions`` and ``env.time_limit``.
        network: actor-critic module; ``network.apply(params, hstate, ac_in)``
            must return ``(hstate, pi, value, pred_pi)``.
        partner_pop_actor: forwarded to ``rollout_both_ways`` for evaluation.
        partner_pop_params: partner parameters, vmapped over their leading
            axis during population evaluation.
        pop_size: forwarded to ``rollout_both_ways``.

    Returns:
        ``train_loop(meta_state, outer_idx)``.
    """
    def train_loop(
        meta_state: tuple, outer_idx,
    ):
        """Refresh the partner buffer once, then run the meta steps.

        Args:
            meta_state: 4-tuple ``(rng, train_state, past_5_sa_pairs,
                update_steps)``.
            outer_idx: index of this outer iteration; used to offset the
                update indices handed to the meta steps.

        Returns:
            ``(meta_state, (loss_info, learnability_info))``. Note that the
            returned ``meta_state`` is the 6-tuple scan carry ``(rng,
            train_state, sfl_buffer, past_5_sa_pairs, learnability_info,
            update_steps)``, not the 4-tuple that was passed in.
        """

        def _sample_learnability_buffer(rng, train_state):
            """Sample candidate partners, score them and keep the top ones.

            Returns the ``config.sfl_buffer_size`` highest-scoring partner
            parameter sets and an info dict with their scores.
            """

            def _batch_step(unused, batch_rng):
                """Sample one batch of partner parameters and roll them out."""
                ruleset_rng, rollout_rng = jax.random.split(batch_rng)
                # sample rulesets
                ruleset_rng = jax.random.split(
                    ruleset_rng, num=config.sfl_batch_size)
                partner_params = jax.vmap(
                    sample_partner_parameters, in_axes=(None, 0))(
                        config.num_agents, ruleset_rng)

                rollout_rng = jax.random.split(
                    rollout_rng, num=config.sfl_batch_size)
                # The final rollout state is not needed for scoring.
                rollout_stats, _ = jax.vmap(rollout_nsteps, in_axes=(0, None, 0, None, None, None, None, None))(
                    rollout_rng,
                    env,
                    partner_params,
                    train_state,
                    jnp.zeros((config.num_agents, config.gru_hidden_dim)),
                    # It can end earlier... how do we deal with this?
                    env.env.time_limit * config.sfl_rollout_factor,
                    config.sfl_rollout_factor,
                    config.sample_with_dense_rewards,
                )
                return None, (partner_params, rollout_stats)

            batch_rng = jax.random.split(rng, num=config.sfl_num_batches)
            _, (partner_params, rollout_stats) = jax.lax.scan(
                _batch_step, None, batch_rng)

            # Every branch reduces the episode returns over their last axis
            # to obtain one score per candidate.
            if config.learnability_function == "gaussian-weighted-standarddev":
                mean_return = jnp.mean(
                    rollout_stats.episode_return, axis=-1)
                std_return = jnp.std(
                    rollout_stats.episode_return, axis=-1)
                # Global stats
                global_mean = jnp.mean(mean_return)
                global_std = jnp.std(mean_return) + 1e-8  # prevent div-by-zero
                # Gaussian weight centered on global mean
                gaussian_weight = jax.scipy.stats.norm.pdf(
                    mean_return, loc=global_mean, scale=global_std)
                # Generalised learnability
                learnability = (std_return * gaussian_weight).squeeze()
            elif config.learnability_function == "variance":
                learnability = jnp.var(
                    rollout_stats.episode_return, axis=-1).squeeze()
            elif config.learnability_function == "variance-x-mean":
                mean_return = jnp.mean(rollout_stats.episode_return, axis=-1)
                std_return = jnp.std(rollout_stats.episode_return, axis=-1)
                learnability = (std_return * mean_return).squeeze()
            elif config.learnability_function == "mean-return":
                mean_return = jnp.mean(rollout_stats.episode_return, axis=-1)
                learnability = mean_return.squeeze()
            elif config.learnability_function == "success-rate-over-global-median":
                returns = rollout_stats.episode_return
                global_median = jnp.median(returns)
                success = returns > global_median
                success_rate = jnp.mean(
                    success, axis=-1).squeeze()
                learnability = success_rate * (1 - success_rate)
            elif config.learnability_function == "coefficent-of-variation":
                mean_return = jnp.mean(rollout_stats.episode_return, axis=-1)
                std_return = jnp.std(rollout_stats.episode_return, axis=-1)
                learnability = (std_return / (mean_return + 1e-8)).squeeze()
            elif config.learnability_function == "entropy":
                returns = rollout_stats.episode_return
                hist_bins = 10
                # Shared bin range so all candidates are binned identically.
                hist_range = (jnp.min(returns), jnp.max(returns))

                def _row_histogram(row):
                    row_hist, _ = jnp.histogram(
                        row, bins=hist_bins, range=hist_range, density=True)
                    return row_hist

                # jnp.histogram has no `axis` argument, so histogram each
                # candidate's returns (last axis) separately via vmap and
                # restore the leading dimensions afterwards.
                hist = jax.vmap(_row_histogram)(
                    returns.reshape((-1, returns.shape[-1])))
                hist = hist.reshape(returns.shape[:-1] + (hist_bins,))
                entropy = -jnp.sum(hist * jnp.log(hist + 1e-8), axis=-1)
                learnability = entropy.squeeze()
            elif config.learnability_function == "cole-rank-based":
                mean_return = jnp.mean(rollout_stats.episode_return, axis=-1)
                # higher ranks = harder partners
                ranks = jnp.argsort(jnp.argsort(-mean_return))
                learnability = (ranks + 1).astype(jnp.float32)
                learnability = learnability.squeeze()
            elif config.learnability_function == "cole-inverse-mean-return":
                mean_return = jnp.mean(rollout_stats.episode_return, axis=-1)
                learnability = 1.0 / (mean_return + 1e-8)
                learnability = learnability.squeeze()
            elif config.learnability_function == "none":
                # Random scores => random Top-K. Keeps the existing argsort/top-K code
                num_candidates = config.sfl_num_batches * config.sfl_batch_size
                rng, sel_rng = jax.random.split(rng)
                learnability = jax.random.uniform(
                    sel_rng, (num_candidates,), dtype=jnp.float32)
            else:
                # Fail loudly instead of hitting an unbound `learnability`.
                raise ValueError(
                    f"Unknown learnability_function: {config.learnability_function!r}")

            print('rollout stats', rollout_stats)
            # Merge the (num_batches, batch_size) leading axes into one
            # candidate axis.
            flat_partner_params = jax.tree.map(
                lambda x: x.reshape((-1,) + x.shape[2:]), partner_params)
            print('flat rulesets', flat_partner_params)

            # argsort is ascending, so the tail holds the highest scores.
            top_learnability = jnp.argsort(
                learnability)[-config.sfl_buffer_size:]
            top_partner_params = jax.tree.map(
                lambda x: x.at[top_learnability].get(), flat_partner_params)

            info = {
                "buffer_learnability_scores": learnability.at[top_learnability].get(),
                # "top_gaussian_weight": gaussian_weight.at[top_learnability].get(),
                # "all_gaussian_weight": gaussian_weight,
            }

            return top_partner_params, info

        def _meta_step(meta_state, update_idx):
            """Reset the environments and run ``config.num_inner_updates`` PPO updates.

            Partner parameters for this step are a concatenation of freshly
            sampled ones and ones drawn uniformly from the buffer.
            """
            rng, train_state, sfl_buffer, past_5_sa_pairs, learnability_info, update_steps = meta_state

            # sample rulesets for this meta update
            rng, _rng1, _rng2, _rng3 = jax.random.split(rng, num=4)

            ruleset_gen_rng = jax.random.split(
                _rng1, num=config.sfl_num_envs_to_generate)
            partner_params_gen = jax.vmap(
                sample_partner_parameters, in_axes=(None, 0))(config.num_agents, ruleset_gen_rng)

            # sample from sfl buffer
            partner_params_idxs = jax.random.randint(
                _rng2, (config.sfl_num_envs_to_sample,), 0, config.sfl_buffer_size)
            sampled_partner_params = jax.tree.map(
                lambda x: x.at[partner_params_idxs].get(), sfl_buffer)

            partner_params = jax.tree.map(lambda x, y: jnp.concatenate(
                [x, y], axis=0), partner_params_gen, sampled_partner_params)

            reset_rng = jax.random.split(_rng3, num=config.num_envs_per_device)
            obsv, env_state = jax.vmap(env.reset, in_axes=(0,))(reset_rng)

            init_hstate = ScannedRNN.initialize_carry(
                config.num_actors, config.gru_hidden_dim)

            def _update_step(update_runner_state, _):
                """Collect a rollout, run the PPO epochs, evaluate and log."""
                # COLLECT TRAJECTORIES
                runner_state, update_steps, learnability_info = update_runner_state

                def _env_step(runner_state, unused):
                    """Select actions for all actors and advance every environment once."""
                    train_state, env_state, last_obs, last_done, hstate, rng, update_step, epsilon_agent, past_5_sa_pairs = runner_state
                    # NOTE: this subkey is replaced by the next split before
                    # it is consumed; the split is kept so the random stream
                    # is unchanged.
                    rng, _rng = jax.random.split(rng)
                    obs_batch = batchify(
                        last_obs, env.agents, config.num_actors)

                    avail_actions = jax.vmap(
                        env.get_avail_actions)(env_state.env_state)
                    avail_actions = jax.lax.stop_gradient(avail_actions)
                    avail_actions_0 = avail_actions["agent_0"].astype(
                        jnp.float32)
                    avail_actions_1 = avail_actions["agent_1"].astype(
                        jnp.float32)

                    avail_actions_batch = batchify(
                        {"agent_0": avail_actions_0, "agent_1": avail_actions_1},
                        env.agents, config.num_actors)

                    def get_e3t_action(args):
                        """ε-mix between masked policy and masked Dirichlet-biased random,
                        then mask again after mixing."""
                        pi_ego, key, epsilon, bias_mask, avail_mask = args
                        # both inputs are masked & normalized on the valid support
                        probs_ego = mask_and_norm(pi_ego.probs, avail_mask)
                        probs_rand = mask_and_norm(bias_mask,   avail_mask)

                        probs_mix = (1.0 - epsilon) * \
                            probs_ego + epsilon * probs_rand
                        probs_mix = mask_and_norm(
                            probs_mix, avail_mask)  # final safety pass

                        pi_e3t = distrax.Categorical(probs=probs_mix)
                        a = pi_e3t.sample(seed=key)
                        lp = pi_e3t.log_prob(a)
                        ent = pi_e3t.entropy()
                        return a, lp, ent

                    def get_base_action(args):
                        """Sample from the *masked* policy head."""
                        pi_ego, key, avail_mask = args
                        # re-mask to be safe even if the network already masked logits
                        probs_masked = mask_and_norm(pi_ego.probs, avail_mask)
                        pi_masked = distrax.Categorical(probs=probs_masked)

                        a = pi_masked.sample(seed=key)
                        lp = pi_masked.log_prob(a)
                        ent = pi_masked.entropy()
                        return a, lp, ent

                    batched_sa_pairs = batchify_nested_dics(
                        past_5_sa_pairs, env.agents, (1, config.num_actors, 5))
                    # A leading axis of size 1 is added to every input.
                    ac_in = (
                        obs_batch[np.newaxis, :],
                        last_done[np.newaxis, :],
                        batched_sa_pairs,
                        avail_actions_batch[np.newaxis, :]
                    )
                    hstate, pi, value, pred_pi = network.apply(
                        train_state.params, hstate, ac_in)

                    # Partner parameters are per environment; duplicate them
                    # so they line up with the agent-major actor axis.
                    stacked_epsilon = jnp.concatenate(
                        [partner_params.epsilon, partner_params.epsilon])
                    bias_mask_stacked = jnp.concatenate(
                        [partner_params.bias_mask, partner_params.bias_mask])

                    eps_b = stacked_epsilon[jnp.newaxis, :, jnp.newaxis]
                    bias_b = bias_mask_stacked[jnp.newaxis, :, :]
                    mask_b = avail_actions_batch[jnp.newaxis, :, :]

                    rng, _rng = jax.random.split(rng)
                    # Both samplers share one key; only one of the two
                    # results is kept per actor by the `where` below.
                    e3t_action, e3t_log_prob, e3t_entropy = get_e3t_action(
                        (pi, _rng, eps_b, bias_b, mask_b))
                    base_action, base_log_prob, base_entropy = get_base_action(
                        (pi, _rng, mask_b))

                    # First half of the actor axis (agent_0) uses the flag,
                    # second half (agent_1) uses its negation.
                    epsilon_agent_both = jnp.concatenate(
                        [partner_params.epsilon_agent, ~partner_params.epsilon_agent], axis=0)
                    action = jnp.where(epsilon_agent_both,
                                       e3t_action, base_action)
                    action = action.squeeze()

                    log_prob = jnp.where(
                        epsilon_agent_both, e3t_log_prob, base_log_prob).squeeze()
                    entropy = jnp.where(epsilon_agent_both,
                                        e3t_entropy, base_entropy).squeeze()

                    env_act = unbatchify(
                        action, env.agents, config.num_envs, env.num_agents
                    )
                    env_act = {k: v.squeeze() for k, v in env_act.items()}

                    # Update state-action pairs: shift the window one slot to
                    # the left and write the newest entry into the last slot.
                    past_5_sa_pairs['agent_0']['obs'] = past_5_sa_pairs[
                        'agent_0']['obs'].at[:, :-1, :].set(past_5_sa_pairs['agent_0']['obs'][:, 1:, :])
                    past_5_sa_pairs['agent_0']['obs'] = past_5_sa_pairs[
                        'agent_0']['obs'].at[:, - 1, :].set(last_obs['agent_0'])
                    past_5_sa_pairs['agent_0']['action'] = past_5_sa_pairs[
                        'agent_0']['action'].at[:, :-1].set(past_5_sa_pairs['agent_0']['action'][:, 1:])
                    past_5_sa_pairs['agent_0']['action'] = past_5_sa_pairs[
                        'agent_0']['action'].at[:, -1].set(env_act['agent_0'])

                    past_5_sa_pairs['agent_1']['obs'] = past_5_sa_pairs[
                        'agent_1']['obs'].at[:, :-1, :].set(past_5_sa_pairs['agent_1']['obs'][:, 1:, :])
                    past_5_sa_pairs['agent_1']['obs'] = past_5_sa_pairs[
                        'agent_1']['obs'].at[:, -1, :].set(last_obs['agent_1'])
                    past_5_sa_pairs['agent_1']['action'] = past_5_sa_pairs[
                        'agent_1']['action'].at[:, :-1].set(past_5_sa_pairs['agent_1']['action'][:, 1:])
                    past_5_sa_pairs['agent_1']['action'] = past_5_sa_pairs[
                        'agent_1']['action'].at[:, -1].set(env_act['agent_1'])

                    # STEP ENV
                    rng, _rng = jax.random.split(rng)
                    rng_step = jax.random.split(_rng, config.num_envs)
                    obsv, env_state, reward, done, info = jax.vmap(
                        env.step, in_axes=(0, 0, 0)
                    )(rng_step, env_state, env_act)

                    info = jax.tree.map(lambda x: x.reshape(
                        (config.num_actors)), info)
                    done_batch = batchify(
                        done, env.agents, config.num_actors).squeeze()
                    # Each actor's "other" action is its partner's action.
                    other_action = jnp.concatenate([
                        env_act["agent_1"], env_act["agent_0"]
                    ], axis=-1)
                    # NOTE(review): the window stored below already contains
                    # this step's obs/action, while the network call above
                    # received the window *before* the update -- confirm the
                    # loss-time inputs are meant to differ from rollout time.
                    transition = Transition(
                        jnp.tile(done["__all__"], env.num_agents),
                        last_done,
                        action.squeeze(),
                        value.squeeze(),
                        batchify(reward, env.agents,
                                 config.num_actors).squeeze(),
                        log_prob.squeeze(),
                        obs_batch,
                        info,
                        entropy,
                        hstate,
                        other_action.squeeze(),
                        batchify_nested_dics(
                            past_5_sa_pairs, env.agents, (config.num_actors, 5)),
                        avail_actions_batch,
                    )
                    runner_state = (train_state, env_state, obsv,
                                    done_batch, hstate, rng, update_step, epsilon_agent, past_5_sa_pairs)
                    return runner_state, transition

                # Hidden state at the start of the rollout (index -3 of the
                # 7-tuple); the loss re-runs the network from it.
                initial_hstate = runner_state[-3]
                (train_state, env_state, obsv,
                 done_batch, hstate, past_5_sa_pairs, rng) = runner_state
                # sample which agent we'll increase beta to
                # DO we really sample this here?
                # Split first so the sampling key is never reused afterwards.
                rng, _rng = jax.random.split(rng)
                epsilon_agent = jax.random.bernoulli(
                    _rng, shape=(config.num_envs,))

                runner_state = (train_state, env_state, obsv, done_batch,
                                hstate, rng, update_steps, epsilon_agent, past_5_sa_pairs)
                runner_state, traj_batch = jax.lax.scan(
                    _env_step, runner_state, None, config.num_steps_per_update
                )

                # CALCULATE ADVANTAGE
                train_state, env_state, last_obs, last_done, hstate, rng, update_steps, beta_agent, past_5_sa_pairs = runner_state
                last_obs_batch = batchify(
                    last_obs, env.agents, config.num_actors)
                batched_sa_pairs = batchify_nested_dics(
                    past_5_sa_pairs, env.agents, (1, config.num_actors, 5))

                last_avail = jax.vmap(env.get_avail_actions)(
                    env_state.env_state)
                last_avail = jax.lax.stop_gradient(last_avail)
                last_avail_batch = batchify(
                    {"agent_0": last_avail["agent_0"].astype(jnp.float32),
                     "agent_1": last_avail["agent_1"].astype(jnp.float32)},
                    env.agents, config.num_actors
                )

                ac_in = (
                    last_obs_batch[jnp.newaxis, :],
                    last_done[jnp.newaxis, :],
                    batched_sa_pairs,
                    last_avail_batch[jnp.newaxis, :],
                )
                # Value of the observation after the last step, used to
                # bootstrap the advantage recursion.
                _, _, last_val, _ = network.apply(
                    train_state.params, hstate, ac_in)
                last_val = last_val.squeeze()

                def _calculate_gae(traj_batch, last_val):
                    """Return ``(advantages, value targets)`` via GAE(lambda)."""
                    def _get_advantages(gae_and_next_value, transition):
                        gae, next_value = gae_and_next_value
                        # `transition.done` holds the flag carried in from the
                        # *previous* step, so bootstrapping must be masked
                        # with the done flag produced by *this* step, which
                        # is stored in `global_done`.
                        done, value, reward = (
                            transition.global_done,
                            transition.value,
                            transition.reward,
                        )
                        delta = reward + config.gamma * \
                            next_value * (1 - done) - value
                        gae = (
                            delta
                            + config.gamma *
                            config.gae_lambda * (1 - done) * gae
                        )
                        return (gae, value), gae

                    _, advantages = jax.lax.scan(
                        _get_advantages,
                        (jnp.zeros_like(last_val), last_val),
                        traj_batch,
                        reverse=True,
                        unroll=16,
                    )
                    return advantages, advantages + traj_batch.value

                advantages, targets = _calculate_gae(traj_batch, last_val)

                # UPDATE NETWORK
                def _update_epoch(update_state, unused):
                    """Run one pass over all minibatches of the collected rollout."""
                    def _update_minbatch(train_state, batch_info):
                        """Apply one gradient step on a single minibatch."""
                        init_hstate, traj_batch, advantages, targets = batch_info

                        def _loss_fn(params, traj_batch, gae, targets):
                            """Clipped PPO loss plus the other-agent NLL term."""
                            # RERUN NETWORK
                            _, pi, value, other_pi = network.apply(
                                params,
                                jax.tree.map(
                                    lambda h: h.squeeze(), init_hstate),
                                (traj_batch.obs, traj_batch.done,
                                 traj_batch.past_sa_pairs, traj_batch.avail_actions),
                            )
                            log_prob = pi.log_prob(traj_batch.action)

                            # CALCULATE VALUE LOSS
                            value_pred_clipped = traj_batch.value + (
                                value - traj_batch.value
                            ).clip(-config.clip_eps, config.clip_eps)
                            value_losses = jnp.square(value - targets)
                            value_losses_clipped = jnp.square(
                                value_pred_clipped - targets)
                            value_loss = (
                                0.5 * jnp.maximum(value_losses,
                                                  value_losses_clipped).mean()
                            )

                            # CALCULATE ACTOR LOSS
                            logratio = log_prob - traj_batch.log_prob
                            ratio = jnp.exp(logratio)
                            # Normalise advantages within the minibatch.
                            gae = (gae - gae.mean()) / (gae.std() + 1e-8)
                            loss_actor1 = ratio * gae
                            loss_actor2 = (
                                jnp.clip(
                                    ratio,
                                    1.0 - config.clip_eps,
                                    1.0 + config.clip_eps,
                                )
                                * gae
                            )
                            loss_actor = -jnp.minimum(loss_actor1, loss_actor2)
                            loss_actor = loss_actor.mean()
                            entropy = pi.entropy().mean()

                            def calc_moa_loss(args):
                                other_pi, other_action = args
                                other_log_prob = other_pi.log_prob(
                                    other_action)
                                # NLL loss for other agent
                                other_loss = -other_log_prob
                                other_loss = other_loss.mean()
                                return other_loss

                            def dummy_moa_loss(x): return jnp.array(0.0)
                            moa_loss = jax.lax.cond(
                                config.other_agent_prediction, calc_moa_loss, dummy_moa_loss, (other_pi, traj_batch.other_action))

                            approx_kl = ((ratio - 1) - logratio).mean()
                            clip_frac = jnp.mean(
                                jnp.abs(ratio - 1) > config.clip_eps)

                            total_loss = (
                                loss_actor
                                + config.moa_coef * moa_loss
                                + config.vf_coef * value_loss
                                - config.ent_coef * entropy
                            )
                            return total_loss, (value_loss, loss_actor, entropy, ratio, approx_kl, clip_frac)

                        grad_fn = jax.value_and_grad(_loss_fn, has_aux=True)
                        total_loss, grads = grad_fn(
                            train_state.params, traj_batch, advantages, targets
                        )
                        train_state = train_state.apply_gradients(grads=grads)
                        return train_state, total_loss

                    (
                        train_state,
                        init_hstate,
                        traj_batch,
                        advantages,
                        targets,
                        rng,
                    ) = update_state
                    rng, _rng = jax.random.split(rng)
                    batch_size = config.minibatch_size * config.num_minibatches
                    assert (
                        batch_size == config.num_steps_per_update * config.num_actors
                    ), "batch size must be equal to number of steps * number of actors per population member"

                    init_hstate = jax.tree.map(lambda h: jnp.reshape(
                        h, (1, config.num_actors, -1)), init_hstate)
                    batch = (
                        init_hstate,
                        traj_batch,
                        advantages.squeeze(),
                        targets.squeeze(),
                    )
                    # Shuffle along the actor axis (axis 1) only, so each
                    # actor's time sequence stays intact for the RNN.
                    permutation = jax.random.permutation(
                        _rng, config.num_actors)

                    shuffled_batch = jax.tree_util.tree_map(
                        lambda x: jnp.take(x, permutation, axis=1), batch
                    )

                    # Split the actor axis into `num_minibatches` groups and
                    # move the minibatch axis to the front for the scan.
                    minibatches = jax.tree_util.tree_map(
                        lambda x: jnp.swapaxes(
                            jnp.reshape(
                                x,
                                [x.shape[0], config.num_minibatches, -1]
                                + list(x.shape[2:]),
                            ),
                            1,
                            0,
                        ),
                        shuffled_batch,
                    )
                    train_state, total_loss = jax.lax.scan(
                        _update_minbatch, train_state, minibatches
                    )
                    update_state = (
                        train_state,
                        jax.tree.map(lambda h: h.squeeze(), init_hstate),
                        traj_batch,
                        advantages,
                        targets,
                        rng,
                    )
                    return update_state, total_loss

                update_state = (
                    train_state,
                    initial_hstate,
                    traj_batch,
                    advantages,
                    targets,
                    rng,
                )
                update_state, loss_info = jax.lax.scan(
                    _update_epoch, update_state, None, config.update_epochs
                )
                train_state = update_state[0]
                # Continue from the key left over by the update epochs so
                # the eval keys below do not coincide with keys already
                # consumed for minibatch shuffling.
                rng = update_state[-1]

                # Run evals
                rng, _rng = jax.random.split(rng, 2)
                eval_rng = jax.random.split(_rng, 100)
                eval_returns, _ = jax.vmap(rollout, in_axes=(0, None, None, None, None))(
                    eval_rng, env, network, train_state.params, config.gru_hidden_dim)

                if config.eval_against_pop:
                    rng, _rng = jax.random.split(rng, 2)
                    eval_rng = jax.random.split(_rng, 100)
                    # First vmap over partners and then run 100 games
                    eval_pop_returns = jax.vmap(
                        rollout_both_ways,
                        in_axes=(None, None, None, None, None, 0, None, None)
                    )(
                        eval_rng, env, network, train_state.params, partner_pop_actor,
                        partner_pop_params, config.gru_hidden_dim, pop_size)
                else:
                    eval_pop_returns = jnp.asarray(0.0)

                metric = jax.tree.map(
                    lambda x: x.reshape(
                        (config.num_steps_per_update,
                         config.num_envs, env.num_agents)
                    ),
                    traj_batch.info,
                )

                # Keep only the last rollout step and average it.
                metric = jax.tree.map(lambda x: x[-1, ...].mean(), metric)
                # Importance ratio of the first minibatch of the first epoch.
                ratio_0 = loss_info[1][3].at[0, 0].get().mean()
                loss_info = jax.tree.map(lambda x: x.mean(), loss_info)
                metric["loss"] = {
                    "total_loss": loss_info[0],
                    "value_loss": loss_info[1][0],
                    "actor_loss": loss_info[1][1],
                    "entropy": loss_info[1][2],
                    "ratio": loss_info[1][3],
                    "ratio_0": ratio_0,
                    "approx_kl": loss_info[1][4],
                    "clip_frac": loss_info[1][5],
                }
                metric["eval"] = {
                    "eval_sp_return": eval_returns.mean(),
                    "eval_pop_returns": eval_pop_returns.mean(),
                }
                hist, _ = jnp.histogram(
                    sfl_buffer.epsilon, bins=5, range=(0, 1))
                metric["partner_params"] = {
                    "buffer_learnability_scores": learnability_info["buffer_learnability_scores"].mean(),
                    "sfl_buffer.epsilon_histogram": hist,
                    "epsilon_mean": partner_params.epsilon.mean(),
                    "epsilon_var": partner_params.epsilon.var(),
                    "epsilon_max": partner_params.epsilon.max(),
                    "epsilon_min": partner_params.epsilon.min(),
                    "epsilon_agent_mean": partner_params.epsilon_agent.mean(),
                    "epsilon_agent_1": (partner_params.epsilon_agent == 1).mean(),
                    "epsilon_agent_0": (partner_params.epsilon_agent == 0).mean(),
                    "sfl_buffer.epsilon": sfl_buffer.epsilon.mean(),
                    "sfl_buffer.max": sfl_buffer.epsilon.max(),
                    "sfl_buffer.min": sfl_buffer.epsilon.min(),
                    "sfl_buffer.bias_mask_0": sfl_buffer.bias_mask[..., 0].mean(),
                    "sfl_buffer.bias_mask_1": sfl_buffer.bias_mask[..., 1].mean(),
                    "sfl_buffer.bias_mask_2": sfl_buffer.bias_mask[..., 2].mean(),
                    "sfl_buffer.bias_mask_3": sfl_buffer.bias_mask[..., 3].mean(),
                    "sfl_buffer.bias_mask_4": sfl_buffer.bias_mask[..., 4].mean(),
                    "sfl_buffer.bias_mask_5": sfl_buffer.bias_mask[..., 5].mean(),
                }
                # metric["reward_shaping_factor"] = jnp.maximum(
                #     0.0, 1.0 - (update_steps / config.num_reward_shaping_updates))

                def callback(metric):
                    wandb.log(metric)

                update_steps = update_steps + 1
                valid_mask = metric["returned_episode"]
                returns = metric["returned_episode_returns"]

                # Convert mask to float (1.0 where valid, 0.0 where not)
                mask = valid_mask.astype(jnp.float32)

                # Masked sum and count
                masked_sum = (returns * mask).sum()
                count = mask.sum()

                # Avoid divide-by-zero
                mean_return = jnp.where(count > 0, masked_sum / count, 0.0)
                metric["mean_return"] = mean_return
                # Reduces every leaf to a scalar (this includes the epsilon
                # histogram stored above).
                metric = jax.tree.map(lambda x: x.mean(), metric)

                metric["update_step"] = update_steps
                metric["env_step"] = update_steps * \
                    config.num_steps_per_update * config.num_envs
                # Log from inside the traced computation.
                jax.debug.callback(callback, metric)

                runner_state = (train_state, env_state, last_obs,
                                last_done, hstate, past_5_sa_pairs, rng)
                return (runner_state, update_steps, learnability_info), metric

            runner_state = (
                train_state,
                env_state,
                obsv,
                jnp.zeros((config.num_actors), dtype=bool),
                init_hstate,
                past_5_sa_pairs,
                rng)
            # The update counter starts at the index supplied by the scan.
            update_state = (runner_state, update_idx, learnability_info)
            (runner_state, update_steps, learnability_info), loss_info = jax.lax.scan(
                _update_step, update_state, None, config.num_inner_updates)

            # `past_5_sa_pairs` is the value this meta step received, not the
            # one advanced by the rollouts; environments are reset at the
            # start of every meta step.
            meta_state = (
                runner_state[-1], runner_state[0], sfl_buffer, past_5_sa_pairs, learnability_info, update_steps)
            return meta_state, loss_info

        rng, train_state, past_5_sa_pairs, update_steps = meta_state
        rng, _rng = jax.random.split(rng)
        sfl_buffer, learnability_info = _sample_learnability_buffer(
            _rng, train_state)

        # Not invoked anywhere in this function.
        def __buffer_callback(x):
            info, step = x
            wandb.log(info, step=step)

        # Global update indices covered by this outer iteration.
        inner_idx = jnp.arange(config.sfl_buffer_refresh_freq) + \
            (outer_idx)*config.sfl_buffer_refresh_freq
        rng, _rng = jax.random.split(rng)
        meta_state, loss_info = jax.lax.scan(
            _meta_step, (_rng, train_state, sfl_buffer, past_5_sa_pairs, learnability_info, update_steps), inner_idx, config.sfl_buffer_refresh_freq)
        return meta_state, (loss_info, learnability_info)

    return train_loop


def get_run_string(config: TrainConfig):
    """Return the run/group identifier derived from the SFL settings.

    The identifier embeds the learnability function name, the buffer size
    (``K``) and the sampling batch size (``Ba``) between fixed tags.
    """
    learnability_name = config.learnability_function
    buffer_size = config.sfl_buffer_size
    batch_size = config.sfl_batch_size
    return f"FF_RNN_DPD_WBIAS_IPPO_{learnability_name}_K_{buffer_size}_Ba_{batch_size}_LBF"


@pyrallis.wrap()
def train(config: TrainConfig):
    """Train a recurrent actor-critic on LBF and periodically checkpoint it.

    The function sets up wandb, verifies up front that the checkpoint
    directory is writable, builds the network, optimizer and vectorised
    environments, and then repeatedly calls the jitted outer update produced
    by ``make_update_fn``. At checkpoint steps the current parameters are
    evaluated with ``rollout`` and pickled to disk.

    Args:
        config: Run configuration. The function is decorated with
            ``pyrallis.wrap()``, so it is invoked without arguments from the
            script entry point.

    Returns:
        A dict with ``"runner_state"`` (the meta state after the last outer
        step) and ``"metrics"`` (the loss info returned by the last outer
        step, or ``None`` when ``config.num_outer_steps`` is 0).
    """
    ##### WANDB and other setup #####
    tags = [
        "FF",
        "RNN",
        "DPD",
        "IPPO",
        "W/Bias",
        "LBF",
        config.learnability_function,
    ]

    group_string = get_run_string(config)
    run_string = f"{group_string}_SEED_{config.seed}"

    run = wandb.init(
        project=config.project,
        group=group_string,
        mode=config.mode,
        config=asdict(config),
        save_code=True,
        tags=tags,
    )

    # Prefix the human-readable run string with the sweep id when the run is
    # part of a sweep, otherwise with the name wandb generated for the run.
    if run.sweep_id is not None:
        run.name = run.sweep_id + "___" + run_string
    else:
        run.name = run.name + "___" + run_string

    print("XPID ID name:")
    print(run.name)
    print("-------------")

    #### Setup and check saving before training ####
    if config.checkpoint_path is not None:
        save_dir = os.path.join(config.checkpoint_path, run.name)
        # Make sure we can write the checkpoint later _before_ we wait 1 day for training!
        os.makedirs(save_dir, exist_ok=True)
        config_dict = asdict(config)
        with open(f"{save_dir}/config.pckl", 'wb') as f:
            pickle.dump(config_dict, f)

    env = make_env("lbf")
    env = LogWrapper(env, replace_info=False)

    def linear_schedule(count):
        """Learning rate for optimizer step ``count``.

        ``count`` is integer-divided by the number of optimizer steps taken
        per meta update, and the rate decays linearly with that quotient
        relative to ``config.num_meta_updates``.
        """
        total_inner_updates = config.num_minibatches * \
            config.update_epochs * config.num_inner_updates
        frac = 1.0 - (count // total_inner_updates) / config.num_meta_updates
        return config.lr * frac

    rng = jax.random.PRNGKey(config.seed)

    # INIT NETWORK
    network = ActorCriticRNN(
        env.action_space("agent_0").n,
        gru_hidden_dim_size=config.gru_hidden_dim,
        fc_dim_size=config.fc_dim_size,
        embedding_layers=config.embedding_layers,
        actor_layers=config.actor_layers,
        critic_layers=config.critic_layers,
        other_agent_prediction=config.other_agent_prediction,
        use_layernorm=config.use_layernorm,
        env_has_avail_actions=True,
    )
    init_x = jnp.zeros(env.observation_space("agent_0").shape)
    init_x = init_x.flatten()

    # Zero-filled state/action history. It is only used to build the dummy
    # input for ``network.init``; the entries are overwritten with values
    # derived from the real reset observations further below.
    past_5_sa_pairs = {
        'agent_0': {
            'obs': jnp.zeros((1, config.num_envs, 5, init_x.shape[0])),
            'action': jnp.zeros((1, config.num_envs, 5, 1))
        },
        'agent_1': {
            'obs': jnp.zeros((1, config.num_envs, 5, init_x.shape[0])),
            'action': jnp.zeros((1, config.num_envs, 5, 1))
        }
    }

    batched_sa_pairs = batchify_nested_dics(
        past_5_sa_pairs, env.agents, (1, config.num_actors, 5))

    # Dummy network input with a leading axis of size 1.
    # NOTE(review): the trailing (1, num_actors, 6) array is presumably the
    # available-action mask for 6 actions -- confirm against ActorCriticRNN.
    init_x = (
        jnp.zeros(
            (1, config.num_actors, init_x.shape[0])
        ),
        jnp.zeros((1, config.num_actors)),
        batched_sa_pairs,
        jnp.zeros((1, config.num_actors, 6))
    )
    init_hstate = ScannedRNN.initialize_carry(
        config.num_actors, config.gru_hidden_dim)

    rng, _rng = jax.random.split(rng)
    network_params = network.init(_rng, init_hstate, init_x)

    if config.anneal_lr:
        tx = optax.chain(
            optax.clip_by_global_norm(config.max_grad_norm),
            optax.adam(learning_rate=linear_schedule, eps=1e-5),
        )
    else:
        tx = optax.chain(optax.clip_by_global_norm(
            config.max_grad_norm), optax.adam(config.lr, eps=1e-5))

    train_state = TrainState.create(
        apply_fn=network.apply,
        params=network_params,
        tx=tx,
    )

    if config.eval_against_pop:
        pop_dir = Path("eval_populations/FF_BRDiv_LBF")
        partner_pop_params = []
        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at population files you trust.
        for file in pop_dir.iterdir():
            if "param" in file.name:
                # Close each file promptly instead of leaking the handle.
                with open(file, "rb") as f:
                    params = pickle.load(f)
                partner_pop_params.append(params["actor_params"])

        path_config = pop_dir / "config.pckl"

        with open(path_config, "rb") as f:
            other_config = pickle.load(f)

        # Stack the per-partner parameter pytrees along a new leading axis.
        partner_pop_params = jax.tree.map(
            lambda *x: jnp.stack(x), *partner_pop_params)
        partner_pop_actor = ActorWithConditionalCritic(
            env.action_space(env.agents[0]).n)

        pop_size = other_config["partner_pop_size"]
        # pop_size = 1
    else:
        partner_pop_params = None
        partner_pop_actor = None
        pop_size = 0

    # INIT UPDATE FUNCTION
    _update_step = make_update_fn(
        config, env, network, partner_pop_actor, partner_pop_params, pop_size)
    jitted_update_step = jax.jit(_update_step)

    # INIT EVAL ROLLOUT FUNCTION
    jitted_rollout = rollout  # config is static

    # INIT ENV
    rng, _rng = jax.random.split(rng)
    reset_rng = jax.random.split(_rng, config.num_envs)

    obsv, env_state = jax.vmap(env.reset, in_axes=(0,))(reset_rng)

    # Fill past s-a: repeat each agent's reset observation 5 times and use the
    # constant 4 as the filler action.
    # NOTE(review): action id 4 is presumably a no-op placeholder -- confirm.
    past_5_sa_pairs['agent_0']['obs'] = obsv[
        'agent_0'][:, None, :].repeat(5, axis=1)
    past_5_sa_pairs['agent_0']['action'] = jnp.ones(
        (config.num_envs, 5)) * 4
    past_5_sa_pairs['agent_1']['obs'] = obsv[
        'agent_1'][:, None, :].repeat(5, axis=1)
    past_5_sa_pairs['agent_1']['action'] = jnp.ones(
        (config.num_envs, 5)) * 4

    update_steps = 0
    # None is a placeholder for the SFL buffer
    meta_state = (rng, train_state, None, past_5_sa_pairs, update_steps)

    print(config.num_outer_steps)

    # Bound up front so the return below is well defined even when the loop
    # body never runs (num_outer_steps == 0).
    loss_info = None

    for i in range(config.num_outer_steps):
        print(i)
        # Only rng, train state and the update counter are carried over from
        # the previous outer step; the history built above is passed in again.
        meta_state, (loss_info, learnability_info) = jitted_update_step(
            (meta_state[0], meta_state[1], past_5_sa_pairs, meta_state[-1]), i)
        train_state = meta_state[1]

        params = train_state.params

        # Remarkably, saving is among the most expensive operations
        if config.checkpoint_path is not None:
            # Checkpoint every ``checkpoint_freq`` outer steps (skipping step
            # 0) and always on the final step.
            if (i % config.checkpoint_freq == 0 and i != 0) or i == config.num_outer_steps - 1:
                env = make_env("lbf")
                env = LogWrapper(env, replace_info=False)

                network = ActorCriticRNN(
                    env.action_space("agent_0").n,
                    gru_hidden_dim_size=config.gru_hidden_dim,
                    fc_dim_size=config.fc_dim_size,
                    embedding_layers=config.embedding_layers,
                    actor_layers=config.actor_layers,
                    critic_layers=config.critic_layers,
                    other_agent_prediction=config.other_agent_prediction,
                    use_layernorm=config.use_layernorm,
                    env_has_avail_actions=True,
                )
                # The same ``rng`` key is passed at every checkpoint.
                total_r, total_l = jitted_rollout(
                    rng, env, network, params, config.gru_hidden_dim)

                path = f"{save_dir}/"
                os.makedirs(path, exist_ok=True)
                payload = {"actor_params": params}
                # Context managers guarantee the checkpoints are flushed and
                # closed before training continues.
                with open(path + f"params_{i}_{total_r}.pt", "wb") as f:
                    pickle.dump(payload, f)
                # Also overwrite the fixed-name file with the latest params.
                with open(path + "params.pt", "wb") as f:
                    pickle.dump(payload, f)
                print(
                    f"Saved params for agent with total reward {path}params_{i}_{total_r}.pt", total_r)

    return {"runner_state": meta_state, "metrics": loss_info}


# Script entry point. ``train`` is decorated with ``pyrallis.wrap()``, so it
# is called here without arguments.
if __name__ == "__main__":
    train()
