# This code is adapted from VERL
# https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/core_algos.py
# The original copyright is reproduced below:
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import defaultdict
from enum import StrEnum
from functools import wraps
from typing import Callable, List, Literal, Optional, Tuple, Union

import numpy as np
import ray
import torch
from jaxtyping import Float
from loguru import logger
from omegaconf import DictConfig

from skyrl_train.training_batch import TrainingInputBatch

# Import cloudpickle for function serialization
try:
    import cloudpickle
except ImportError:
    # Fallback to pickle if cloudpickle is not available
    import pickle as cloudpickle


# TODO (erictang000): unused right now, but will be useful as we add more algorithm support
class AdaptiveKLController:
    """
    Adaptive KL controller described in the paper:
    https://arxiv.org/pdf/1909.08593.pdf
    """

    def __init__(self, init_kl_coef, target, horizon):
        self.value = init_kl_coef
        self.target = target
        self.horizon = horizon

    def update(self, current, n_steps):
        target = self.target
        proportional_error = np.clip(current / target - 1, -0.2, 0.2)
        mult = 1 + proportional_error * n_steps / self.horizon
        self.value *= mult


class FixedKLController:
    """Fixed KL controller."""

    def __init__(self, kl_coef):
        self.value = kl_coef

    def update(self, current, n_steps):
        pass


def get_kl_controller(algorithm_cfg: DictConfig):
    if algorithm_cfg.kl_ctrl.type == "fixed":
        return FixedKLController(kl_coef=algorithm_cfg.kl_loss_coef)
    elif algorithm_cfg.kl_ctrl.type == "adaptive":
        if algorithm_cfg.kl_ctrl.horizon <= 0:
            raise ValueError(f"horizon must be larger than 0. Got {algorithm_cfg.kl_ctrl.horizon}")
        return AdaptiveKLController(
            init_kl_coef=algorithm_cfg.kl_loss_coef,
            target=algorithm_cfg.kl_ctrl.kl_target,
            horizon=algorithm_cfg.kl_ctrl.horizon,
        )
    else:
        raise ValueError(f"Invalid KL controller type: {algorithm_cfg.kl_ctrl.type}")


def masked_mean(tensor: torch.Tensor, mask: Optional[torch.Tensor], dim: Optional[int] = None) -> torch.Tensor:
    if mask is None:
        return tensor.mean(axis=dim)
    return (tensor * mask).sum(axis=dim) / mask.sum(axis=dim).clamp(min=1.0)


@torch.no_grad()
def compute_approx_kl(
    log_probs: torch.Tensor,
    log_probs_base: torch.Tensor,
    loss_mask: Optional[torch.Tensor] = None,
    kl_estimator_type: str = "k3",
) -> torch.Tensor:
    """
    Compute the approximate KL divergence between two distributions.
    Schulman blog: http://joschu.net/blog/kl-approx.html

    Args:
        log_probs: Log probabilities of the new distribution.
        log_probs_base: Log probabilities of the base distribution.
        action_mask: Mask for actions.
    """
    if kl_estimator_type == "k1":
        kld = log_probs - log_probs_base
    elif kl_estimator_type == "abs":
        kld = (log_probs - log_probs_base).abs()
    elif kl_estimator_type == "k2":
        kld = 0.5 * (log_probs - log_probs_base).square()
    # J. Schulman. Approximating kl divergence, 2020.
    # URL http://joschu.net/blog/kl-approx.html.
    elif kl_estimator_type == "k3":
        kl = log_probs_base - log_probs
        # For numerical stability
        kl = torch.clamp(kl, min=-20, max=20)
        ratio = torch.exp(kl)
        kld = (ratio - kl - 1).contiguous()
        kld = torch.clamp(kld, min=-10, max=10)
    else:
        raise ValueError(f"Invalid KL estimator type: {kl_estimator_type}")

    if loss_mask is not None:
        kld = kld * loss_mask
    return kld


@torch.no_grad()
def normalize_advantages_dict(data: TrainingInputBatch) -> TrainingInputBatch:
    """Normalizes the advantages in the data batch.

    Expects:
        - `["advantages"]`: Float[torch.Tensor, "batch_size seqlen"]
        - `["response_mask"]`: Float[torch.Tensor, "batch_size seqlen"]
    """
    advantages: Float[torch.Tensor, "batch_size seqlen"] = data["advantages"]
    response_masks: Float[torch.Tensor, "batch_size seqlen"] = data["response_mask"]
    num_actions: float = response_masks.sum()
    # mean
    mean: float = advantages.mean()
    # std
    std: float = ((advantages - mean).pow(2) * response_masks).sum()
    rstd: float = (std / num_actions).clamp(min=1e-8).rsqrt()

    data["advantages"] = (advantages - mean) * rstd
    return data


def masked_var(values, mask, unbiased=True):
    """Compute variance of tensor with masked values."""
    mean = masked_mean(values, mask)
    centered_values = values - mean
    variance = masked_mean(centered_values**2, mask)
    if unbiased:
        mask_sum = mask.sum()
        if mask_sum == 0:
            raise ValueError("At least one element in the mask has to be 1.")
        # note that if mask_sum == 1, then there is a division by zero issue
        # to avoid it you just need to use a larger minibatch_size
        if mask_sum == 1:
            raise ValueError("The sum of the mask is one, which can cause a division by zero.")
        bessel_correction = mask_sum / (mask_sum - 1)
        variance = variance * bessel_correction
    return variance


def masked_whiten(values, mask, shift_mean=True):
    """Whiten values with masked values."""
    mean, var = masked_mean(values, mask), masked_var(values, mask)
    whitened = (values - mean) * torch.rsqrt(var + 1e-8)
    if not shift_mean:
        whitened += mean
    return whitened


def ppo_critic_loss(
    values: torch.Tensor,
    old_values: torch.Tensor,
    returns: torch.Tensor,
    config: DictConfig,
    loss_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[float]]:
    if config.value_clip is not None:
        values_clipped = old_values + (values - old_values).clamp(-config.value_clip, config.value_clip)
        surr1 = (values_clipped - returns) ** 2
        surr2 = (values - returns) ** 2
        loss = torch.max(surr1, surr2)
        clipfrac = masked_mean((surr1 > surr2).float(), loss_mask).mean().detach().item()
    else:
        clipfrac = None
        loss = (values - returns) ** 2

    loss = masked_mean(loss, loss_mask, dim=-1).mean()
    return 0.5 * loss, clipfrac


# Shared registry actor class for both policy loss and advantage estimator registries
@ray.remote
class RegistryActor:
    """Shared Ray actor for managing function registries across processes."""

    def __init__(self):
        self.registry = {}

    def register(self, name: str, func_serialized: bytes):
        """Register a serialized function."""
        self.registry[name] = func_serialized

    def get(self, name: str):
        """Get a serialized function by name."""
        return self.registry.get(name)

    def list_available(self):
        """List all available function names."""
        return list(self.registry.keys())

    def unregister(self, name: str):
        """Unregister a function by name."""
        return self.registry.pop(name, None)


class BaseFunctionRegistry:
    """Base class for function registries with Ray actor synchronization."""

    # Subclasses should override these class attributes
    _actor_name = None
    _function_type = "Function"

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        cls._functions = {}
        cls._ray_actor = None
        cls._synced_to_actor = False

    @classmethod
    def _get_or_create_actor(cls):
        """Get or create the Ray actor for managing the registry using get_if_exists."""
        if not ray.is_initialized():
            raise Exception("Ray is not initialized, cannot create registry actor")

        if cls._ray_actor is None:
            # Use get_if_exists to create actor only if it doesn't exist
            cls._ray_actor = RegistryActor.options(name=cls._actor_name, get_if_exists=True).remote()
        return cls._ray_actor

    @classmethod
    def _sync_local_to_actor(cls):
        """Sync all local functions to Ray actor."""
        if cls._synced_to_actor:
            return
        if not ray.is_initialized():
            raise Exception("Ray is not initialized, cannot sync with actor")

        try:
            actor = cls._get_or_create_actor()
            if actor is not None:
                for name, func in cls._functions.items():
                    func_serialized = cloudpickle.dumps(func)
                    ray.get(actor.register.remote(name, func_serialized))
                cls._synced_to_actor = True
        except Exception as e:
            logger.error(f"Error syncing {cls._function_type} to actor: {e}")
            raise e

    @classmethod
    def sync_with_actor(cls):
        """Sync local registry with Ray actor if Ray is available."""
        # Only try if Ray is initialized
        if not ray.is_initialized():
            raise Exception("Ray is not initialized, cannot sync with actor")

        # First check if the actor is still alive
        # NOTE(Charlie): This is mainly for unit tests, where we run multiple unit tests in the
        # same Python process, and each unit test has ray init/shutdown. This makes cls's attributes
        # outdated (e.g. the _ray_actor points to a stale actor in the previous ray session).
        try:
            _ = ray.get_actor(cls._actor_name)  # this raises exception if the actor is stale
        except ValueError:
            cls._ray_actor = None
            cls._synced_to_actor = False

        # First, sync our local functions to the actor
        cls._sync_local_to_actor()

        actor = cls._get_or_create_actor()
        if actor is None:
            return

        available = ray.get(actor.list_available.remote())

        # Sync any new functions from actor to local registry
        for name in available:
            if name not in cls._functions:
                func_serialized = ray.get(actor.get.remote(name))
                if func_serialized is not None:
                    # Deserialize the function
                    try:
                        func = cloudpickle.loads(func_serialized)
                        cls._functions[name] = func
                    except Exception as e:
                        # If deserialization fails, skip this function
                        logger.error(f"Error deserializing {name} from actor: {e}")
                        raise e

    @classmethod
    def register(cls, name: Union[str, StrEnum], func: Callable):
        """Register a function.

        If ray is initialized, this function will get or create a named ray actor (RegistryActor)
        for the registry, and sync the registry to the actor.

        If ray is not initalized, the function will be stored in the local registry only.

        To make sure all locally registered functions are available to all ray processes,
        call sync_with_actor() after ray.init().

        Args:
            name: Name of the function to register. Can be a string or a StrEnum.
            func: Function to register.

        Raises:
            ValueError: If the function is already registered.
        """
        # Convert enum to string if needed
        # note: StrEnum is not cloudpickleable: https://github.com/cloudpipe/cloudpickle/issues/558
        if isinstance(name, StrEnum):
            name = name.value

        if name in cls._functions:
            raise ValueError(f"{cls._function_type} '{name}' already registered")

        # Always store in local registry first
        cls._functions[name] = func

        # Try to sync with Ray actor if Ray is initialized
        if ray.is_initialized():
            actor = cls._get_or_create_actor()
            if actor is not None:
                # Serialize the function using cloudpickle
                func_serialized = cloudpickle.dumps(func)
                ray.get(actor.register.remote(name, func_serialized))

    @classmethod
    def get(cls, name: str) -> Callable:
        """Get a function by name.

        If ray is initialized, this function will first sync the local registry with the RegistryActor.
        Then it will return the function if it is found in the registry.

        Args:
            name: Name of the function to get. Can be a string or a StrEnum.

        Returns:
            The function if it is found in the registry.
        """
        # Try to sync with actor first if Ray is available
        if ray.is_initialized():
            cls.sync_with_actor()

        if name not in cls._functions:
            available = list(cls._functions.keys())
            raise ValueError(f"Unknown {cls._function_type.lower()} '{name}'. Available: {available}")
        return cls._functions[name]

    @classmethod
    def list_available(cls) -> List[str]:
        """List all registered functions."""
        # Try to sync with actor first if Ray is available
        if ray.is_initialized():
            cls.sync_with_actor()
        return list(cls._functions.keys())

    @classmethod
    def unregister(cls, name: Union[str, StrEnum]):
        """Unregister a function. Useful for testing."""
        # Convert enum to string if needed
        if isinstance(name, StrEnum):
            name = name.value

        # Try to sync with actor first to get any functions that might be in the actor but not local
        if ray.is_initialized():
            cls.sync_with_actor()

        # Track if we found the function anywhere
        found_locally = name in cls._functions
        found_in_actor = False

        # Remove from local registry if it exists
        if found_locally:
            del cls._functions[name]

        # Try to remove from Ray actor if Ray is available
        if ray.is_initialized():
            actor = cls._get_or_create_actor()
            if actor is not None:
                # Check if it exists in actor first
                available_in_actor = ray.get(actor.list_available.remote())
                if name in available_in_actor:
                    found_in_actor = True
                    ray.get(actor.unregister.remote(name))

        # Only raise error if the function wasn't found anywhere
        if not found_locally and not found_in_actor:
            raise ValueError(f"{cls._function_type} '{name}' not registered")

    @classmethod
    def reset(cls):
        """Resets the registry (useful for testing purposes)."""
        if ray.is_initialized() and cls._ray_actor is not None:
            try:
                actor = ray.get_actor(cls._actor_name)  # this raises exception if the actor is stale
                ray.kill(actor)
            except ValueError:
                pass  # Actor may already be gone
        cls._functions.clear()
        cls._ray_actor = None
        cls._synced_to_actor = False

    @classmethod
    def repopulate(cls):
        """Repopulate the registry with the default functions."""
        cls.reset()
        cls.register(cls._function_type, cls._function_type)


class AdvantageEstimator(StrEnum):
    GAE = "gae"
    GRPO = "grpo"
    RLOO = "rloo"
    REINFORCE_PP = "reinforce++"
    LOG_REINFORCE = "log_reinforce"
    CANCEL_REINFORCE = "cancel_reinforce"
    REJECTION_SAMPLING = "rejection_sampling"
    A_REINFORCE = "a_reinforce"


class AdvantageEstimatorRegistry(BaseFunctionRegistry):
    """
    Registry for advantage estimator functions.

    This registry allows users to register custom advantage estimators without modifying
    the skyrl_train package. Custom estimators can be registered by calling
    AdvantageEstimatorRegistry.register() directly or by using the @register_advantage_estimator
    decorator.

    See examples/algorithms/custom_advantage_estimator for a simple example of how to
    register and use custom advantage estimators.
    """

    _actor_name = "advantage_estimator_registry"
    _function_type = "advantage estimator"

    @classmethod
    def repopulate_registry(cls):
        ae_avail = set(cls.list_available())
        ae_types = {
            "grpo": [AdvantageEstimator.GRPO, compute_grpo_outcome_advantage],
            "gae": [AdvantageEstimator.GAE, compute_gae_advantage_return],
            "rloo": [AdvantageEstimator.RLOO, compute_rloo_outcome_advantage],
            "reinforce++": [AdvantageEstimator.REINFORCE_PP, compute_reinforce_plus_plus_outcome_advantage],
            "log_reinforce": [AdvantageEstimator.LOG_REINFORCE, compute_log_reinforce_outcome_advantage],
            "cancel_reinforce": [AdvantageEstimator.CANCEL_REINFORCE, compute_cancel_reinforce_outcome_advantage],
            "rejection_sampling": [AdvantageEstimator.REJECTION_SAMPLING, compute_rejection_sampling_outcome_advantage],
            "a_reinforce": [AdvantageEstimator.A_REINFORCE, compute_a_reinforce_outcome_advantage],
        }

        for ae_name, (ae_type, ae_func) in ae_types.items():
            if ae_name not in ae_avail:
                cls.register(ae_type, ae_func)


class PolicyLossType(StrEnum):
    REGULAR = "regular"
    DUAL_CLIP = "dual_clip"
    GSPO = "gspo"
    CISPO = "cispo"
    CLIP_COV = "clip_cov"
    KL_COV = "kl_cov"


class PolicyLossRegistry(BaseFunctionRegistry):
    """
    Registry for policy loss functions.

    This registry allows users to register custom policy loss functions without modifying
    the skyrl_train package. Custom functions can be registered by calling
    PolicyLossRegistry.register() directly or by using the @register_policy_loss
    decorator.

    See examples/algorithms/custom_policy_loss for a simple example of how to
    register and use custom policy loss functions.
    """

    _actor_name = "policy_loss_registry"
    _function_type = "policy loss"

    @classmethod
    def repopulate_registry(cls):
        """Repopulate the registry with default policy loss functions."""
        pl_avail = set(cls.list_available())
        pl_types = {
            "regular": [PolicyLossType.REGULAR, ppo_policy_loss],
            "dual_clip": [PolicyLossType.DUAL_CLIP, ppo_policy_loss],
            "gspo": [PolicyLossType.GSPO, gspo_policy_loss],
            "clip_cov": [PolicyLossType.CLIP_COV, compute_policy_loss_clip_cov],
            "kl_cov": [PolicyLossType.KL_COV, compute_policy_loss_kl_cov],
        }

        for pl_name, (pl_type, pl_func) in pl_types.items():
            if pl_name not in pl_avail:
                cls.register(pl_type, pl_func)


def register_advantage_estimator(name: Union[str, AdvantageEstimator]):
    """Decorator to register an advantage estimator function."""

    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            return func(*args, **kwargs)

        AdvantageEstimatorRegistry.register(name, wrapper)
        return wrapper

    return decorator


def register_policy_loss(name: Union[str, PolicyLossType]):
    """Decorator to register a policy loss function."""

    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            return func(*args, **kwargs)

        PolicyLossRegistry.register(name, wrapper)
        return wrapper

    return decorator


def sync_registries():
    """Sync the registries with the ray actor once ray is initialized"""
    if not ray.is_initialized():
        raise ValueError("Ray is not initialized, cannot sync registries")
    PolicyLossRegistry.sync_with_actor()
    AdvantageEstimatorRegistry.sync_with_actor()
    logger.info("Synced registries to ray actor")


def _safe_exp_delta(delta: torch.Tensor, clip: float = 20.0, out_dtype=None) -> torch.Tensor:
    """
    Clamp the delta before exponentiating to avoid potential overflow.
    """
    y = torch.clamp(delta.to(torch.float32), -clip, clip).exp()
    return y.to(out_dtype or delta.dtype)


@register_policy_loss(PolicyLossType.REGULAR)
@register_policy_loss(PolicyLossType.DUAL_CLIP)
def ppo_policy_loss(
    log_probs: torch.Tensor,
    old_log_probs: torch.Tensor,
    advantages: torch.Tensor,
    config: DictConfig,
    loss_mask: Optional[torch.Tensor] = None,
    rollout_logprobs: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, float]:
    assert config.policy_loss_type in ["regular", "dual_clip"], "loss_type must be either 'regular' or 'dual_clip'"
    loss_reduction = config.loss_reduction
    assert loss_reduction in [
        "token_mean",
        "sequence_mean",
        "seq_mean_token_sum_norm",
    ], "loss_reduction must be either 'token_mean', 'sequence_mean', or 'seq_mean_token_sum_norm'"

    ratio = _safe_exp_delta(log_probs - old_log_probs, clip=20.0, out_dtype=log_probs.dtype)
    surr1 = ratio * advantages
    surr2 = ratio.clamp(1 - config.eps_clip_low, 1 + config.eps_clip_high) * advantages
    loss = -torch.min(surr1, surr2)
    clip_ratio = masked_mean((-surr2 > -surr1).float(), loss_mask).mean().detach().item()
    clip_pg_losses1 = loss
    if config.policy_loss_type == "dual_clip":
        pg_losses3 = -advantages * config.clip_ratio_c
        clip_pg_losses2 = torch.min(pg_losses3, clip_pg_losses1)
        loss = torch.where(advantages < 0, clip_pg_losses2, clip_pg_losses1)

    if config.use_tis:
        from loguru import logger as logger_

        logger_.debug(f"Using TIS with dtype: {rollout_logprobs.dtype}")
        # Apply truncated importance sampling -> https://fengyao.notion.site/off-policy-rl
        tis_imp_ratio = _safe_exp_delta(old_log_probs - rollout_logprobs, clip=20.0, out_dtype=log_probs.dtype)
        tis_imp_ratio = torch.clamp(tis_imp_ratio, max=config.tis_imp_ratio_cap)
        loss = loss * tis_imp_ratio

    loss = reduce_loss(loss, loss_mask, loss_reduction, config.max_seq_len)
    return loss, clip_ratio


@register_policy_loss(PolicyLossType.GSPO)
def gspo_policy_loss(
    log_probs: torch.Tensor,
    old_log_probs: torch.Tensor,
    advantages: torch.Tensor,
    config: DictConfig,
    loss_mask: Optional[torch.Tensor] = None,
    rollout_logprobs: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, float]:
    """
    GSPO (Group Sequence Policy Optimization) policy loss function,
    as proposed in https://arxiv.org/abs/2507.18071.

    This implements sequence-level importance sampling instead of token-level importance sampling.
    The key difference is that importance weights are computed at the sequence level and then
    applied uniformly across all tokens in the sequence. This can lead to more stable training
    dynamics by reducing the variance in clipping behavior within sequences.

    The variant of GSPO used here is GSPO-token, a generalization which allows for token-level
    advantages [equations 14 and 15 in the paper].
    """
    # GSPO must use sequence_mean reduction
    loss_reduction = config.loss_reduction
    if loss_reduction != "sequence_mean":
        # The GSPO paper uses sequence_mean reduction; there's no reason
        # why a user couldn't use token_mean reduction, but
        # it's not clear whether it would be stable or not.
        from loguru import logger as logger_  # have to do lazy import to avoid pickling error

        logger_.warning(f"With GSPO it's recommended to use 'sequence_mean' loss reduction; got {loss_reduction}")

    # Compute log ratios
    log_ratio = log_probs - old_log_probs

    # Key GSPO innovation: sequence-level importance sampling
    # Instead of using per-token ratios, compute sequence-averaged ratios
    log_importance_weights = masked_mean(log_ratio, loss_mask, dim=-1).unsqueeze(-1)

    # s_i,t(θ) = sg[s_i(θ)] · π_θ(y_i,t|x, y_i,<t) / sg[π_θ(y_i,t|x, y_i,<t)]
    # In log space: log(s_i,t(θ)) = sg[log(s_i(θ))] + log_probs - sg[log_probs]
    # note: we put the addition at the end to avoid precision issues,
    # per https://github.com/volcengine/verl/pull/2775#discussion_r2241500280
    log_token_importance_weights = log_probs - log_probs.detach() + log_importance_weights.detach()
    # clip to avoid overflow
    log_token_importance_weights = torch.clamp(log_token_importance_weights, max=10)
    ratio = torch.exp(log_token_importance_weights)

    # Standard PPO surrogate objective with sequence-level importance weights
    surr1 = ratio * advantages
    surr2 = ratio.clamp(1 - config.eps_clip_low, 1 + config.eps_clip_high) * advantages
    loss = -torch.min(surr1, surr2)

    # Compute clipping ratio for monitoring
    clip_ratio = masked_mean((-surr2 > -surr1).float(), loss_mask).mean().detach().item()

    loss = reduce_loss(loss, loss_mask, loss_reduction, config.max_seq_len)

    return loss, clip_ratio


@register_policy_loss(PolicyLossType.CISPO)
def compute_policy_loss_cispo(
    log_probs: torch.Tensor,
    old_log_probs: torch.Tensor,
    advantages: torch.Tensor,
    config: DictConfig,
    loss_mask: Optional[torch.Tensor] = None,
    rollout_logprobs: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, float]:
    """Implementation of CISPO (Clipped IS-weight Policy Optimization) loss function,
    as proposed in https://arxiv.org/abs/2506.13585.

    Instead of clipping the importance sampling ratio in the loss directly, as done
    in PPO loss, CISPO clips the importance sampling ratio in the policy gradient
    update. This means the model can still learn from samples whose importance sampling
    ratio is clipped in CISPO, as opposed to PPO where these samples have zero
    gradient and are essentially ignored.
    """
    ratio = _safe_exp_delta(log_probs - old_log_probs, clip=20.0, out_dtype=log_probs.dtype)
    clamped_ratio = torch.clamp(ratio, 1 - config.cispo.cispo_eps_clip_low, 1 + config.cispo.cispo_eps_clip_high)
    loss = -advantages * clamped_ratio.detach() * log_probs

    is_clipped = (ratio < 1 - config.cispo.cispo_eps_clip_low) | (ratio > 1 + config.cispo.cispo_eps_clip_high)
    clip_ratio = masked_mean(is_clipped.float(), loss_mask).mean().detach().item()

    loss = reduce_loss(loss, loss_mask, config.loss_reduction, config.max_seq_len)
    return loss, clip_ratio


@register_policy_loss(PolicyLossType.CLIP_COV)
def compute_policy_loss_clip_cov(
    log_probs: torch.Tensor,
    old_log_probs: torch.Tensor,
    advantages: torch.Tensor,
    config: DictConfig,
    loss_mask: Optional[torch.Tensor] = None,
    rollout_logprobs: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, float]:
    """Clip-Cov policy loss function implementation.

    Adapted from https://github.com/PRIME-RL/Entropy-Mechanism-of-RL/blob/main/verl/trainer/ppo/core_algos.py

    This method combines standard PPO clipping with covariance-based clipping
    to provide more stable training dynamics.
    """
    # Extract config parameters with defaults
    clip_cov_ratio = config.clip_cov.clip_ratio
    clip_cov_lb = config.clip_cov.clip_cov_lb
    clip_cov_ub = config.clip_cov.clip_cov_ub

    negative_approx_kl = log_probs - old_log_probs
    ratio = torch.exp(negative_approx_kl)

    pg_losses1 = -advantages * ratio

    pg_losses2 = -advantages * torch.clamp(ratio, 1 - config.eps_clip_low, 1 + config.eps_clip_high)
    clip_by_origin = (pg_losses2 > pg_losses1) & (loss_mask > 0)

    # Compute covariance for clipping decision
    cov_all = (advantages - masked_mean(advantages, loss_mask)) * (
        log_probs - masked_mean(log_probs.detach(), loss_mask)
    )
    cov_all[loss_mask == 0] = -torch.inf
    cov_all[clip_by_origin] = -torch.inf

    # Determine number of tokens to clip based on clip_ratio
    clip_num = max(int(clip_cov_ratio * loss_mask.sum().item()), 1)
    top_k_idx = (cov_all < clip_cov_ub) & (cov_all > clip_cov_lb) & (loss_mask > 0)
    top_k_idx = torch.nonzero(top_k_idx)

    if len(top_k_idx) > 0:
        perm = torch.randperm(len(top_k_idx))
        top_k_idx = top_k_idx[perm[: min(clip_num, len(top_k_idx))]]
    else:
        top_k_idx = torch.empty((0, 2), device=cov_all.device, dtype=torch.long)

    # Create correction mask
    corr = torch.ones_like(advantages)
    if len(top_k_idx) > 0:
        corr[top_k_idx[:, 0], top_k_idx[:, 1]] = 0

    # Compute clip fraction for monitoring
    clip_frac = masked_mean((corr == 0).float(), loss_mask)

    # Apply correction mask to losses
    pg_losses = torch.maximum(pg_losses1, pg_losses2) * corr
    pg_loss = reduce_loss(
        loss=pg_losses,
        loss_mask=loss_mask,
        loss_reduction=config.loss_reduction,
        max_seq_len=config.max_seq_len,
    )

    return pg_loss, clip_frac.item()


@register_policy_loss(PolicyLossType.KL_COV)
def compute_policy_loss_kl_cov(
    log_probs: torch.Tensor,
    old_log_probs: torch.Tensor,
    advantages: torch.Tensor,
    config: DictConfig,
    loss_mask: Optional[torch.Tensor] = None,
    rollout_logprobs: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, float]:
    """KL-Cov policy loss function implementation.

    Adapted from https://github.com/PRIME-RL/Entropy-Mechanism-of-RL/blob/main/verl/trainer/ppo/core_algos.py

    Uses covariance-based selection to apply KL regularization to a subset of tokens.
    """
    kl_cov_frac = config.kl_cov.kl_cov_frac  # This should be a percentage (e.g., 0.2 for 20%)
    ppo_kl_coef = config.kl_cov.ppo_kl_coef

    negative_approx_kl = log_probs - old_log_probs
    abs_kl = negative_approx_kl.abs()
    ratio = torch.exp(negative_approx_kl)

    pg_losses1 = -advantages * ratio
    pg_losses_kl = -advantages * ratio + ppo_kl_coef * abs_kl
    pg_losses = pg_losses1.clone()

    all_valid = loss_mask > 0
    all_valid_idx = torch.nonzero(all_valid.reshape(-1), as_tuple=True)[0]
    all_valid_adv = advantages[all_valid].detach().reshape(-1).cpu()
    all_valid_logp = log_probs[all_valid].detach().reshape(-1).cpu()

    if len(all_valid_adv) > 0:
        cov_lst_all = (all_valid_adv - all_valid_adv.mean()) * (all_valid_logp - all_valid_logp.mean())
        # Use percentage-based selection like the reference implementation
        k_percent_nums = max(1, int(len(cov_lst_all) * kl_cov_frac))

        if k_percent_nums > 0:
            large_cov_idxs = torch.topk(cov_lst_all, min(k_percent_nums, len(cov_lst_all)), largest=True).indices

            if len(large_cov_idxs) > 0:
                large_cov_idxs = all_valid_idx[large_cov_idxs]
                pg_losses[
                    large_cov_idxs // advantages.shape[1],
                    large_cov_idxs % advantages.shape[1],
                ] = pg_losses_kl[
                    large_cov_idxs // advantages.shape[1],
                    large_cov_idxs % advantages.shape[1],
                ]

    pg_loss = reduce_loss(
        loss=pg_losses,
        loss_mask=loss_mask,
        loss_reduction=config.loss_reduction,
        max_seq_len=config.max_seq_len,
    )

    # NOTE (sumanthrh): Since the pg clip ratio is not applicable for KL-COV so we just use 0.0
    return pg_loss, 0.0


def reduce_loss(
    loss: torch.Tensor,
    loss_mask: Optional[torch.Tensor],
    loss_reduction: Literal["token_mean", "sequence_mean", "seq_mean_token_sum_norm"],
    max_seq_len: Optional[int] = None,
) -> torch.Tensor:
    if loss_reduction == "token_mean":
        # sum over *all* valid tokens, divide by total valid-token count
        loss = masked_mean(loss, loss_mask)
    elif loss_reduction == "sequence_mean":
        # per-sequence token-mean (dim=-1), then batch-mean
        loss = masked_mean(loss, loss_mask, dim=-1).mean()
    elif loss_reduction == "seq_mean_token_sum_norm":
        # per-sequence token-sum, normalized by the max sequence length, then batch mean
        # this is the Dr. GRPO loss reduction to avoid length bias by normalizing by a constant
        assert max_seq_len is not None, "max_seq_len must be provided for seq_mean_token_sum_norm loss reduction"
        # NOTE: max_seq_len is computed as cfg.generator.max_input_length + cfg.generator.sampling_params.max_generate_length by default
        if loss_mask is not None:
            seq_losses = torch.sum(loss * loss_mask, dim=-1) / max_seq_len
        else:
            # If no mask, assume all tokens are valid
            seq_losses = torch.sum(loss, dim=-1) / max_seq_len
        loss = torch.mean(seq_losses)
    else:
        raise ValueError(f"Invalid loss reduction type: {loss_reduction}")
    return loss


# NOTE (erictang000): below ported from verl
@register_advantage_estimator(AdvantageEstimator.REINFORCE_PP)
def compute_reinforce_plus_plus_outcome_advantage(
    token_level_rewards: torch.Tensor,
    response_mask: torch.Tensor,
    gamma: float,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Compute advantage for REINFORCE++.
    This implementation is based on the paper: https://arxiv.org/abs/2501.03262

    Args:
        - token_level_rewards: Float[torch.Tensor, "batch_size seqlen"]
        - response_mask: Float[torch.Tensor, "batch_size seqlen"]

    Returns:
        - advantages: Float[torch.Tensor, "batch_size seqlen"]
        - returns: Float[torch.Tensor, "batch_size seqlen"]
    """
    with torch.no_grad():
        returns = torch.zeros_like(token_level_rewards)
        running_return = 0

        for t in reversed(range(token_level_rewards.shape[1])):
            running_return = token_level_rewards[:, t] + gamma * running_return
            returns[:, t] = running_return
            # Reset after EOS
            running_return = running_return * response_mask[:, t]

        advantages = masked_whiten(returns, response_mask)
        advantages = advantages * response_mask

    return advantages, returns


@register_advantage_estimator(AdvantageEstimator.RLOO)
def compute_rloo_outcome_advantage(
    token_level_rewards: torch.Tensor,
    response_mask: torch.Tensor,
    index: np.ndarray,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Compute advantage for RLOO based on https://arxiv.org/abs/2402.14740

    This advantage estimator is also used in LOOP (https://arxiv.org/pdf/2502.01600),
    and was originally introduced in "Buy 4 REINFORCE Samples, Get a Baseline for Free!"
    (https://openreview.net/pdf?id=r1lgTGL5DE).

    Args:
        - token_level_rewards: Float[torch.Tensor, "batch_size seqlen"]
        - response_mask: Float[torch.Tensor, "batch_size seqlen"]
        - index: np.ndarray (batch_size)

    Returns:
        - advantages: Float[torch.Tensor, "batch_size seqlen"]
        - returns: Float[torch.Tensor, "batch_size seqlen"]
    """
    from loguru import logger as logger_

    scores = token_level_rewards.sum(dim=-1)

    id2score = defaultdict(list)
    id2mean = {}

    with torch.no_grad():
        bsz = scores.shape[0]
        for i in range(bsz):
            id2score[index[i]].append(scores[i])
        for idx in id2score:
            if len(id2score[idx]) == 1:
                id2mean[idx] = torch.tensor(0.0, device=scores.device)
            elif len(id2score[idx]) > 1:
                id2mean[idx] = torch.mean(torch.stack(id2score[idx]))
        for i in range(bsz):
            response_num = len(id2score[index[i]])
            if response_num > 1:
                factor = response_num / (response_num - 1)
                scores[i] = (scores[i] - id2mean[index[i]]) * factor
            else:
                # if there's only one response, set the advantage to 0
                logger_.warning(f"Only one response for prompt index {index[i]}, setting advantage to 0")
                scores[i] = 0.0
        scores = scores.unsqueeze(-1) * response_mask

    return scores, scores


@register_advantage_estimator(AdvantageEstimator.GAE)
def compute_gae_advantage_return(
    token_level_rewards: Float[torch.Tensor, "batch_size seqlen"],
    values: Float[torch.Tensor, "batch_size seqlen"],
    response_mask: Float[torch.Tensor, "batch_size seqlen"],
    gamma: float,
    lambd: float,
    **kwargs,
) -> Tuple[Float[torch.Tensor, "batch_size seqlen"], Float[torch.Tensor, "batch_size seqlen"]]:
    """
    Compute advantage and return for GAE.

    Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py
    """
    with torch.no_grad():
        lastgaelam = 0
        advantages_reversed = []
        gen_len = token_level_rewards.shape[-1]

        for t in reversed(range(gen_len)):
            nextvalues = values[:, t + 1] if t < gen_len - 1 else 0.0
            delta = token_level_rewards[:, t] + gamma * nextvalues - values[:, t]
            lastgaelam = delta + gamma * lambd * lastgaelam
            advantages_reversed.append(lastgaelam)
        advantages = torch.stack(advantages_reversed[::-1], dim=1)

        returns = advantages + values
        advantages = masked_whiten(advantages, response_mask)
    return advantages, returns


@register_advantage_estimator(AdvantageEstimator.GRPO)
def compute_grpo_outcome_advantage(
    token_level_rewards: torch.Tensor,
    response_mask: torch.Tensor,
    index: np.ndarray,
    epsilon: float = 1e-6,
    grpo_norm_by_std: bool = True,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Compute advantage for GRPO, operating only on Outcome reward (with only one scalar reward for each response).

    Expects:
        - token_level_rewards: Float[torch.Tensor, "batch_size seqlen"]
        - response_mask: Float[torch.Tensor, "batch_size seqlen"]
        - index: np.ndarray (batch_size)
        - epsilon: float
        - grpo_norm_by_std: bool

    Returns:
        - advantages: Float[torch.Tensor, "batch_size seqlen"]
        - returns: Float[torch.Tensor, "batch_size seqlen"]
    """
    # this assumes response-level rewards
    scores = token_level_rewards.sum(dim=-1)

    id2score = defaultdict(list)
    id2mean = {}
    id2std = {}

    with torch.no_grad():
        bsz = scores.shape[0]
        for i in range(bsz):
            id2score[index[i]].append(scores[i])
        for idx in id2score:
            if len(id2score[idx]) == 1:
                id2mean[idx] = torch.tensor(0.0)
                id2std[idx] = torch.tensor(1.0)
            elif len(id2score[idx]) > 1:
                id2mean[idx] = torch.mean(torch.tensor(id2score[idx]))
                id2std[idx] = torch.std(torch.tensor([id2score[idx]]))
            else:
                raise ValueError(f"no score in prompt index: {idx}")
        for i in range(bsz):
            if grpo_norm_by_std:
                scores[i] = (scores[i] - id2mean[index[i]]) / (id2std[index[i]] + epsilon)
            else:
                scores[i] = scores[i] - id2mean[index[i]]
        scores = scores.unsqueeze(-1) * response_mask

    return scores, scores


@register_advantage_estimator(AdvantageEstimator.LOG_REINFORCE)
def compute_log_reinforce_outcome_advantage(
    token_level_rewards: torch.Tensor,
    response_mask: torch.Tensor,
    index: np.ndarray,
    epsilon: float = 1e-6,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Compute advantage for log-REINFORCE, operating only on Outcome reward (with only one scalar reward for each response).

    For positive rewards: A^+ = (1 - rho_hat) / rho_hat
    For negative rewards: A^- = - 1
    where rho_hat = M^+ / M is the ratio of positive rewards for the M rollouts per prompt.

    Expects:
        - token_level_rewards: Float[torch.Tensor, "batch_size seqlen"]
        - response_mask: Float[torch.Tensor, "batch_size seqlen"]
        - index: np.ndarray (batch_size)
        - epsilon: float (small constant to avoid division by zero)
        - config.log_reinforce_zero_rho_advantage: float (advantage value when rho ≈ 0, default -1)
        - config.log_reinforce_warmup_steps: int (number of steps to use 0 for zero_rho_advantage, default 0)
        - global_step: int (current training step, passed via kwargs)

    Returns:
        - advantages: Float[torch.Tensor, "batch_size seqlen"]
        - returns: Float[torch.Tensor, "batch_size seqlen"]
    """
    # this assumes response-level rewards
    scores = token_level_rewards.sum(dim=-1)

    # Get zero_rho_advantage from config, default to -1
    config = kwargs.get('config', None)
    global_step = kwargs.get('global_step', 0)
    zero_rho_advantage = -1.0
    warmup_steps = 0
    if config is not None:
        zero_rho_advantage = getattr(config, 'log_reinforce_zero_rho_advantage', -1.0)
        warmup_steps = getattr(config, 'log_reinforce_warmup_steps', 0)

    # During warmup, use 0 for zero_rho_advantage
    if global_step <= warmup_steps:
        zero_rho_advantage = 0.0

    id2scores = defaultdict(list)
    id2indices = defaultdict(list)

    with torch.no_grad():
        bsz = scores.shape[0]

        # Group scores by prompt index
        for i in range(bsz):
            id2scores[index[i]].append(scores[i])
            id2indices[index[i]].append(i)

        # Compute advantages
        advantages = torch.zeros_like(scores)
        # Collect rho_hat values for logging
        rho_hat_list = []

        for idx in id2scores:
            prompt_scores = torch.tensor(id2scores[idx])
            prompt_indices = id2indices[idx]

            # Compute rho_hat = M^+ / M (ratio of positive rewards)
            M = len(prompt_scores)
            M_plus = (prompt_scores > 0).sum().item()
            rho_hat = M_plus / M

            # Store original rho_hat for logging
            rho_hat_list.append(rho_hat)

            # If all responses are wrong (rho_hat = 0), use configurable advantage value
            if rho_hat < epsilon:
                for i, batch_idx in enumerate(prompt_indices):
                    advantages[batch_idx] = zero_rho_advantage
                continue

            # Compute advantages
            for i, batch_idx in enumerate(prompt_indices):
                if prompt_scores[i] > 0:
                    # Positive reward: A^+ = (1 - rho_hat) / rho_hat
                    advantages[batch_idx] = (1.0 - rho_hat) / rho_hat
                else:
                    # Negative reward: A^- = -1
                    advantages[batch_idx] = -1.0

        # Expand advantages to token level
        advantages = advantages.unsqueeze(-1) * response_mask

        # Store rho_hat values in kwargs for retrieval
        if 'metadata_out' in kwargs:
            kwargs['metadata_out']['rho_hat_values'] = rho_hat_list

    return advantages, advantages


@register_advantage_estimator(AdvantageEstimator.CANCEL_REINFORCE)
def compute_cancel_reinforce_outcome_advantage(
    token_level_rewards: torch.Tensor,
    response_mask: torch.Tensor,
    index: np.ndarray,
    epsilon: float = 1e-6,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Compute advantage for cancel-REINFORCE, operating only on Outcome reward (with only one scalar reward for each response).

    For positive rewards: A^+ = 1 / rho_hat
    For negative rewards: A^- = - 1 / (1 - rho_hat)
    where rho_hat = M^+ / M is the ratio of positive rewards for the M rollouts per prompt.

    Expects:
        - token_level_rewards: Float[torch.Tensor, "batch_size seqlen"]
        - response_mask: Float[torch.Tensor, "batch_size seqlen"]
        - index: np.ndarray (batch_size)
        - epsilon: float (small constant to avoid division by zero)

    Returns:
        - advantages: Float[torch.Tensor, "batch_size seqlen"]
        - returns: Float[torch.Tensor, "batch_size seqlen"]
    """
    # this assumes response-level rewards
    scores = token_level_rewards.sum(dim=-1)

    id2scores = defaultdict(list)
    id2indices = defaultdict(list)

    with torch.no_grad():
        bsz = scores.shape[0]

        # Group scores by prompt index
        for i in range(bsz):
            id2scores[index[i]].append(scores[i])
            id2indices[index[i]].append(i)

        # Compute advantages
        advantages = torch.zeros_like(scores)

        for idx in id2scores:
            prompt_scores = torch.tensor(id2scores[idx])
            prompt_indices = id2indices[idx]

            # Compute rho_hat = M^+ / M (ratio of positive rewards)
            M = len(prompt_scores)
            M_plus = (prompt_scores > 0).sum().item()
            rho_hat = M_plus / M

            # Avoid division by zero
            if rho_hat < epsilon:
                rho_hat = epsilon
            if (1.0 - rho_hat) < epsilon:
                # If rho_hat is very close to 1, set it slightly below 1
                rho_hat = 1.0 - epsilon

            # Compute advantages
            for i, batch_idx in enumerate(prompt_indices):
                if prompt_scores[i] > 0:
                    # Positive reward: A^+ = 1 / rho_hat
                    advantages[batch_idx] = 1.0 / rho_hat
                else:
                    # Negative reward: A^- = 1 / (1 - rho_hat)
                    advantages[batch_idx] = - 1.0 / (1.0 - rho_hat)

        # Expand advantages to token level
        advantages = advantages.unsqueeze(-1) * response_mask

    return advantages, advantages


@register_advantage_estimator(AdvantageEstimator.REJECTION_SAMPLING)
def compute_rejection_sampling_outcome_advantage(
    token_level_rewards: torch.Tensor,
    response_mask: torch.Tensor,
    index: np.ndarray,
    epsilon: float = 1e-6,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Compute advantage for rejection sampling, which only reinforces positive outcomes.
    Sets A^- = 0 for negative rewards (no gradient signal for bad outcomes).

    For positive rewards: A^+ = 1 / rho_hat
    For negative rewards: A^- = 0
    where rho_hat = M^+ / M is the ratio of positive rewards for the M rollouts per prompt.

    Expects:
        - token_level_rewards: Float[torch.Tensor, "batch_size seqlen"]
        - response_mask: Float[torch.Tensor, "batch_size seqlen"]
        - index: np.ndarray (batch_size)
        - epsilon: float (small constant to avoid division by zero)

    Returns:
        - advantages: Float[torch.Tensor, "batch_size seqlen"]
        - returns: Float[torch.Tensor, "batch_size seqlen"]
    """
    # this assumes response-level rewards
    scores = token_level_rewards.sum(dim=-1)

    id2scores = defaultdict(list)
    id2indices = defaultdict(list)

    with torch.no_grad():
        bsz = scores.shape[0]

        # Group scores by prompt index
        for i in range(bsz):
            id2scores[index[i]].append(scores[i])
            id2indices[index[i]].append(i)

        # Compute advantages
        advantages = torch.zeros_like(scores)

        for idx in id2scores:
            prompt_scores = torch.tensor(id2scores[idx])
            prompt_indices = id2indices[idx]

            # Compute rho_hat = M^+ / M (ratio of positive rewards)
            M = len(prompt_scores)
            M_plus = (prompt_scores > 0).sum().item()
            rho_hat = M_plus / M

            # Avoid division by zero
            if rho_hat < epsilon:
                rho_hat = epsilon

            # Compute advantages
            for i, batch_idx in enumerate(prompt_indices):
                if prompt_scores[i] > 0:
                    # Positive reward: A^+ = 1 / rho_hat
                    advantages[batch_idx] = 1.0 / rho_hat
                else:
                    # Negative reward: A^- = 0
                    advantages[batch_idx] = 0.0

        # Expand advantages to token level
        advantages = advantages.unsqueeze(-1) * response_mask

    return advantages, advantages


@register_advantage_estimator(AdvantageEstimator.A_REINFORCE)
def compute_a_reinforce_outcome_advantage(
    token_level_rewards: torch.Tensor,
    response_mask: torch.Tensor,
    index: np.ndarray,
    epsilon: float = 1e-6,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Compute advantage for A-REINFORCE with adaptive advantage scaling based on rho_hat.

    Generalized formula that works for both binary and continuous rewards:

    For rho < 0.5:
        A_i = (r_i - rho) / (2 * rho * (1 - rho))
    For rho >= 0.5:
        A_i = (r_i - rho) / sqrt(rho * (1 - rho))

    where rho = mean reward across M rollouts per prompt.

    Note: For binary rewards (r_i ∈ {0, 1}), this simplifies to:
      - When r_i = 1 and rho < 0.5: A_i = (1 - rho) / (2 * rho * (1 - rho)) = 1 / (2 * rho) = 0.5/rho
      - When r_i = 0 and rho < 0.5: A_i = -rho / (2 * rho * (1 - rho)) = -1 / (2 * (1 - rho)) = -0.5/(1-rho)
      (matches original implementation)

    Expects:
        - token_level_rewards: Float[torch.Tensor, "batch_size seqlen"]
        - response_mask: Float[torch.Tensor, "batch_size seqlen"]
        - index: np.ndarray (batch_size)
        - epsilon: float (small constant to avoid division by zero)
        - config.a_reinforce_zero_rho_advantage: float (advantage value when rho ≈ 0, default -0.15)
        - config.a_reinforce_warmup_steps: int (number of steps to use 0 for zero_rho_advantage, default 0)
        - global_step: int (current training step, passed via kwargs)

    Returns:
        - advantages: Float[torch.Tensor, "batch_size seqlen"]
        - returns: Float[torch.Tensor, "batch_size seqlen"]
    """
    # this assumes response-level rewards
    scores = token_level_rewards.sum(dim=-1)

    # Get zero_rho_advantage from config, default to -0.15
    config = kwargs.get('config', None)
    global_step = kwargs.get('global_step', 0)
    zero_rho_advantage = -0.15
    warmup_steps = 0
    if config is not None:
        zero_rho_advantage = getattr(config, 'a_reinforce_zero_rho_advantage', -0.15)
        warmup_steps = getattr(config, 'a_reinforce_warmup_steps', 0)

    # During warmup, use 0 for zero_rho_advantage
    if global_step <= warmup_steps:
        zero_rho_advantage = 0.0

    id2scores = defaultdict(list)
    id2indices = defaultdict(list)

    with torch.no_grad():
        bsz = scores.shape[0]

        # Group scores by prompt index
        for i in range(bsz):
            id2scores[index[i]].append(scores[i])
            id2indices[index[i]].append(i)

        # Compute advantages
        advantages = torch.zeros_like(scores)
        # Collect rho values for logging
        rho_list = []

        for idx in id2scores:
            prompt_scores = torch.tensor(id2scores[idx])
            prompt_indices = id2indices[idx]

            # Compute rho = mean reward across M rollouts
            M = len(prompt_scores)
            rho = prompt_scores.mean().item()

            # Store original rho for logging
            rho_list.append(rho)

            # If all responses are wrong (rho ≈ 0), use configurable advantage value
            if rho < epsilon:
                for i, batch_idx in enumerate(prompt_indices):
                    advantages[batch_idx] = zero_rho_advantage
                continue

            # Avoid division by zero for rho very close to 1
            if (1.0 - rho) < epsilon:
                # If rho is very close to 1, set it slightly below 1
                rho = 1.0 - epsilon

            # Compute advantages based on rho threshold
            for i, batch_idx in enumerate(prompt_indices):
                r_i = prompt_scores[i].item()

                if rho < 0.5:
                    # Low success rate: A_i = (r_i - rho) / (2 * rho * (1 - rho))
                    denominator = 2.0 * rho * (1.0 - rho)
                    advantages[batch_idx] = (r_i - rho) / denominator
                else:
                    # High success rate: A_i = (r_i - rho) / sqrt(rho * (1 - rho))
                    denominator = torch.sqrt(torch.tensor(rho * (1.0 - rho))).item()
                    advantages[batch_idx] = (r_i - rho) / denominator

        # Expand advantages to token level
        advantages = advantages.unsqueeze(-1) * response_mask

        # Store rho values in kwargs for retrieval
        if 'metadata_out' in kwargs:
            kwargs['metadata_out']['rho_values'] = rho_list

    return advantages, advantages


def repopulate_all_registries():
    PolicyLossRegistry.repopulate_registry()
    AdvantageEstimatorRegistry.repopulate_registry()


def compute_advantages_and_returns(
    token_level_rewards: torch.Tensor,
    response_mask: torch.Tensor,
    index: np.ndarray,
    adv_estimator: AdvantageEstimator,
    config: DictConfig,
    values: Optional[torch.Tensor] = None,
    grpo_norm_by_std: bool = True,
    gamma=1.0,
    lambd=1.0,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor, Optional[dict]]:
    estimator_func = AdvantageEstimatorRegistry.get(adv_estimator)

    # Create a metadata dictionary for the estimator to populate
    metadata_out = {}
    kwargs['metadata_out'] = metadata_out

    advantages, returns = estimator_func(
        token_level_rewards=token_level_rewards,
        response_mask=response_mask,
        index=index,
        values=values,
        grpo_norm_by_std=grpo_norm_by_std,
        gamma=gamma,
        lambd=lambd,
        config=config,
        **kwargs,
    )

    return advantages, returns, metadata_out
