# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Core functions to implement PPO algorithms.
The functions implemented in this file should be used by trainers with different distributed strategies to
implement PPO.
"""

import numpy as np
import torch
from collections import defaultdict
import pdb

import verl.utils.torch_functional as verl_F


class AdaptiveKLController:
    """
    Adaptive KL controller described in the paper:
    https://arxiv.org/pdf/1909.08593.pdf

    The coefficient is exposed as ``value`` and is rescaled multiplicatively
    on every call to :meth:`update`.
    """

    def __init__(self, init_kl_coef, target_kl, horizon):
        # Current KL coefficient; mutated in place by update().
        self.value = init_kl_coef
        # KL value the controller steers towards.
        self.target = target_kl
        # Divisor applied to n_steps when scaling the correction.
        self.horizon = horizon

    def update(self, current_kl, n_steps):
        """Rescale ``value`` according to how far ``current_kl`` is from the target.

        Args:
            current_kl: the observed KL value.
            n_steps: number of steps the observation covers; the correction
                is scaled by ``n_steps / horizon``.
        """
        relative_error = current_kl / self.target - 1
        # The relative error is bounded to +/-0.2 before it is applied.
        bounded_error = np.clip(relative_error, -0.2, 0.2)
        self.value *= 1 + bounded_error * n_steps / self.horizon


class FixedKLController:
    """Fixed KL controller: ``value`` never changes after construction."""

    def __init__(self, kl_coef):
        # Constant KL coefficient.
        self.value = kl_coef

    def update(self, current_kl, n_steps):
        """Accept the same arguments as the adaptive controller and do nothing."""
        return None


def get_kl_controller(config):
    """Build the KL controller selected by ``config.critic.kl_ctrl``.

    Args:
        config: object exposing ``critic.kl_ctrl`` with a ``type`` field
            (``'fixed'`` or ``'adaptive'``) and ``kl_coef``; the adaptive
            controller additionally reads ``target_kl`` and ``horizon``.

    Returns:
        A ``FixedKLController`` or an ``AdaptiveKLController``.

    Raises:
        ValueError: if ``type`` is neither ``'fixed'`` nor ``'adaptive'``.
        AssertionError: if the adaptive controller is requested with a
            non-positive ``horizon``.
    """
    kl_cfg = config.critic.kl_ctrl
    if kl_cfg.type == 'fixed':
        kl_ctrl = FixedKLController(kl_coef=kl_cfg.kl_coef)
    elif kl_cfg.type == 'adaptive':
        # The horizon lives under config.critic.kl_ctrl like every other field
        # read here; the check previously looked at config.kl_ctrl instead.
        assert kl_cfg.horizon > 0, f'horizon must be larger than 0. Got {kl_cfg.horizon}'
        kl_ctrl = AdaptiveKLController(init_kl_coef=kl_cfg.kl_coef,
                                       target_kl=kl_cfg.target_kl,
                                       horizon=kl_cfg.horizon)
    else:
        raise ValueError('Unknown kl_ctrl type')

    return kl_ctrl


def compute_gae_advantage_return(token_level_rewards: torch.Tensor, values: torch.Tensor, eos_mask: torch.Tensor,
                                 gamma: torch.Tensor, lam: torch.Tensor):
    """Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py

    Walks the response backwards, accumulating the discounted TD errors into
    the Generalized Advantage Estimate (https://arxiv.org/abs/1506.02438).

    Args:
        token_level_rewards: `(torch.Tensor)`
            shape: (bs, response_length)
        values: `(torch.Tensor)`
            shape: (bs, response_length)
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length). [EOS] mask. The token after [EOS] have mask zero.
            Only used for the final whitening step.
        gamma: `(float)`
            discount factor used in RL
        lam: `(float)`
            lambda value of GAE

    Returns:
        advantages: `(torch.Tensor)`
            shape: (bs, response_length); passed through `verl_F.masked_whiten`
        Returns: `(torch.Tensor)`
            shape: (bs, response_length); un-whitened advantages plus `values`

    """
    with torch.no_grad():
        horizon = token_level_rewards.shape[-1]
        running_gae = 0
        per_step_advantages = [None] * horizon

        for t in range(horizon - 1, -1, -1):
            # Past the final position the bootstrap value is taken as zero.
            if t < horizon - 1:
                next_value = values[:, t + 1]
            else:
                next_value = 0.0
            td_error = token_level_rewards[:, t] + gamma * next_value - values[:, t]
            running_gae = td_error + gamma * lam * running_gae
            per_step_advantages[t] = running_gae
        advantages = torch.stack(per_step_advantages, dim=1)

        # Returns are taken before whitening so they stay on the value scale.
        returns = advantages + values
        advantages = verl_F.masked_whiten(advantages, eos_mask)
    return advantages, returns

# NOTE(sgm): this implementation only considers outcome supervision, where the reward is a scalar.
def compute_grpo_outcome_advantage(token_level_rewards: torch.Tensor,
                                   eos_mask: torch.Tensor,
                                   index: torch.Tensor,
                                   epsilon: float = 1e-6):
    """
    Compute advantage for GRPO, operating only on Outcome reward
    (with only one scalar reward for each response).

    Each response's summed reward is normalized with the mean and standard
    deviation of all responses sharing the same entry in `index`. A group
    containing a single response uses mean 0 and std 1.

    Args:
        token_level_rewards: `(torch.Tensor)`
            shape: (bs, response_length)
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length)
        index: group identifier of each response, length bs. May be a tensor
            or any sequence of hashable ids; tensor entries are converted to
            Python scalars so that equal ids land in the same group.
        epsilon: `(float)`
            added to the group std before dividing.

    Returns:
        advantages: `(torch.Tensor)`
            shape: (bs, response_length)
        Returns: `(torch.Tensor)`
            shape: (bs, response_length); the same tensor as `advantages`
    """
    response_length = token_level_rewards.shape[-1]
    scores = token_level_rewards.sum(dim=-1)

    with torch.no_grad():
        bsz = scores.shape[0]

        # torch.Tensor hashes by identity, so a raw tensor element would make
        # every sample its own group; use the underlying scalar as the key.
        group_keys = []
        for i in range(bsz):
            key = index[i]
            group_keys.append(key.item() if isinstance(key, torch.Tensor) else key)

        id2score = defaultdict(list)
        for i, key in enumerate(group_keys):
            id2score[key].append(scores[i])

        id2mean = {}
        id2std = {}
        for key, members in id2score.items():
            if len(members) == 1:
                id2mean[key] = torch.tensor(0.0)
                id2std[key] = torch.tensor(1.0)
            else:
                stacked = torch.stack(members)
                id2mean[key] = torch.mean(stacked)
                id2std[key] = torch.std(stacked)

        # All group statistics are final before any score is overwritten.
        for i, key in enumerate(group_keys):
            scores[i] = (scores[i] - id2mean[key]) / (id2std[key] + epsilon)
        scores = scores.unsqueeze(-1).tile([1, response_length]) * eos_mask

    return scores, scores

def _calculate_group_relative_scores(scores: torch.Tensor, index: torch.Tensor, epsilon: float):
    """Normalize each score by the mean/std of the scores sharing its group id.

    Args:
        scores: per-sample scalar scores, shape (bs,). Overwritten in place.
        index: group identifier of each sample, length bs. May be a tensor or
            any sequence of hashable ids; tensor entries are converted to
            Python scalars so that equal ids land in the same group.
        epsilon: added to the group std before dividing.

    Returns:
        The same `scores` tensor, now holding `(score - mean) / (std + epsilon)`
        per sample. A group with a single member uses mean 0 and std 1.
    """
    with torch.no_grad():
        bsz = scores.shape[0]

        # torch.Tensor hashes by identity, so a raw tensor element would make
        # every sample its own group; use the underlying scalar as the key.
        group_keys = []
        for i in range(bsz):
            key = index[i]
            group_keys.append(key.item() if isinstance(key, torch.Tensor) else key)

        id2score = defaultdict(list)
        for i, key in enumerate(group_keys):
            id2score[key].append(scores[i])

        id2mean = {}
        id2std = {}
        for key, members in id2score.items():
            if len(members) == 1:
                id2mean[key] = torch.tensor(0.0)
                id2std[key] = torch.tensor(1.0)
            else:
                stacked = torch.stack(members)
                id2mean[key] = torch.mean(stacked)
                id2std[key] = torch.std(stacked)

        # All group statistics are final before any score is overwritten.
        for i, key in enumerate(group_keys):
            scores[i] = (scores[i] - id2mean[key]) / (id2std[key] + epsilon)
    return scores

def compute_tool_grpo_advantage(
    turn_level_rewards: torch.Tensor,
    outcome_level_rewards: torch.Tensor,
    turn_mask: torch.Tensor,
    eos_mask: torch.Tensor,
    index: torch.Tensor,
    lambda_coeff: float = 0.0,
    epsilon: float = 1e-6):
    """Compute GRPO advantages with a turn bonus restricted to multi-turn samples.

    The outcome advantage is group-normalized over all samples. The turn
    advantage is group-normalized only among samples whose `turn_mask`
    contains a value greater than 1; all other samples get a turn advantage
    of zero.

    Args:
        turn_level_rewards: shape (bs, response_length); summed over the last dim.
        outcome_level_rewards: shape (bs, response_length); summed over the last dim.
        turn_mask: shape (bs, response_length); turn id of each token. Tokens
            with id 1 receive `lambda_coeff * adv_turn + adv_outcome`, tokens
            with id 2 receive `adv_outcome`, any other id receives 0.
        eos_mask: shape (bs, response_length); multiplied into the result.
        index: group id of each sample, length bs.
        lambda_coeff: weight of the turn advantage on turn-1 tokens.
        epsilon: forwarded to the group normalization.

    Returns:
        (advantages, advantages, adv_turn, adv_outcome) where `advantages` is
        (bs, response_length) and `adv_turn` / `adv_outcome` are (bs,).
    """
    seq_len = turn_level_rewards.shape[-1]
    per_sample_turn = turn_level_rewards.sum(dim=-1)        # (bs,)
    per_sample_outcome = outcome_level_rewards.sum(dim=-1)  # (bs,)
    with torch.no_grad():
        # Outcome advantage is normalized across every sample of a group.
        adv_outcome = _calculate_group_relative_scores(per_sample_outcome, index, epsilon)  # (bs,)

        # Turn advantage starts at zero and is only filled in for samples that
        # actually reached a second turn.
        adv_turn = torch.zeros_like(per_sample_turn)  # (bs,)
        is_multi_turn = torch.max(turn_mask, dim=1)[0] > 1  # (bs,)

        if torch.any(is_multi_turn):
            # Row positions of the multi-turn samples inside the batch.
            multi_rows = is_multi_turn.nonzero(as_tuple=True)[0]  # (num_multi_turn,)

            # Normalize the turn scores among the multi-turn samples only, then
            # scatter the result back into the full-size tensor.
            adv_turn[multi_rows] = _calculate_group_relative_scores(
                scores=per_sample_turn[multi_rows],
                index=index[multi_rows],
                epsilon=epsilon,
            )

        # Broadcast the per-sample values over the token dimension.
        turn_per_token = adv_turn.unsqueeze(-1).expand(-1, seq_len)
        outcome_per_token = adv_outcome.unsqueeze(-1).expand(-1, seq_len)

        first_turn_adv = lambda_coeff * turn_per_token + outcome_per_token
        second_turn_adv = outcome_per_token

        # Tokens outside turns 1 and 2 keep an advantage of zero.
        advantages = torch.where(turn_mask == 1, first_turn_adv, torch.zeros_like(turn_level_rewards))
        advantages = torch.where(turn_mask == 2, second_turn_adv, advantages)

        # Zero out padding tokens.
        advantages = advantages * eos_mask

    return advantages, advantages, adv_turn, adv_outcome

def compute_tool_grpo_v2_advantage(
    turn_level_rewards: torch.Tensor,
    outcome_level_rewards: torch.Tensor,
    turn_mask: torch.Tensor,
    eos_mask: torch.Tensor,
    index: torch.Tensor,
    gamma: float = 0.0,
    epsilon: float = 1e-6):
    """Compute GRPO advantages where multi-turn samples mix turn and scaled outcome advantage.

    The outcome advantage is group-normalized over all samples. The turn
    advantage is group-normalized only among samples whose `turn_mask`
    contains a value greater than 1; all other samples get a turn advantage
    of zero.

    Args:
        turn_level_rewards: shape (bs, response_length); summed over the last dim.
        outcome_level_rewards: shape (bs, response_length); summed over the last dim.
        turn_mask: shape (bs, response_length); turn id of each token. Turn-1
            tokens of multi-turn samples receive `adv_turn + gamma * adv_outcome`;
            turn-1 tokens of other samples and all turn-2 tokens receive
            `adv_outcome`; any other id receives 0.
        eos_mask: shape (bs, response_length); multiplied into the result.
        index: group id of each sample, length bs.
        gamma: weight of the outcome advantage on turn-1 tokens of multi-turn samples.
        epsilon: forwarded to the group normalization.

    Returns:
        (advantages, advantages, adv_turn, adv_outcome) where `advantages` is
        (bs, response_length) and `adv_turn` / `adv_outcome` are (bs,).
    """
    seq_len = turn_level_rewards.shape[-1]
    per_sample_turn = turn_level_rewards.sum(dim=-1)        # (bs,)
    per_sample_outcome = outcome_level_rewards.sum(dim=-1)  # (bs,)
    with torch.no_grad():
        # Outcome advantage is normalized across every sample of a group.
        adv_outcome = _calculate_group_relative_scores(per_sample_outcome, index, epsilon)  # (bs,)

        # Turn advantage starts at zero and is only filled in for samples that
        # actually reached a second turn.
        adv_turn = torch.zeros_like(per_sample_turn)  # (bs,)
        is_multi_turn = torch.max(turn_mask, dim=1)[0] > 1  # (bs,)

        if torch.any(is_multi_turn):
            # Row positions of the multi-turn samples inside the batch.
            multi_rows = is_multi_turn.nonzero(as_tuple=True)[0]  # (num_multi_turn,)

            # Normalize the turn scores among the multi-turn samples only, then
            # scatter the result back into the full-size tensor.
            adv_turn[multi_rows] = _calculate_group_relative_scores(
                scores=per_sample_turn[multi_rows],
                index=index[multi_rows],
                epsilon=epsilon,
            )

        # Broadcast the per-sample values over the token dimension.
        turn_per_token = adv_turn.unsqueeze(-1).expand(-1, seq_len)
        outcome_per_token = adv_outcome.unsqueeze(-1).expand(-1, seq_len)
        multi_turn_per_token = is_multi_turn.unsqueeze(-1).expand(-1, seq_len)

        # Multi-turn samples use the mixed advantage on their first turn;
        # direct answers fall back to the plain outcome advantage.
        mixed_adv = turn_per_token + outcome_per_token * gamma
        first_turn_adv = torch.where(multi_turn_per_token, mixed_adv, outcome_per_token)
        second_turn_adv = outcome_per_token

        # Tokens outside turns 1 and 2 keep an advantage of zero.
        advantages = torch.where(turn_mask == 1, first_turn_adv, torch.zeros_like(turn_level_rewards))
        advantages = torch.where(turn_mask == 2, second_turn_adv, advantages)

        # Zero out padding tokens.
        advantages = advantages * eos_mask

    return advantages, advantages, adv_turn, adv_outcome


def compute_mt_grpo_advantage(
    turn_level_rewards: torch.Tensor,
    outcome_level_rewards: torch.Tensor,
    turn_mask: torch.Tensor,
    eos_mask: torch.Tensor,
    index: torch.Tensor,
    lambda_coeff: float = 0.0,
    epsilon: float = 1e-6):
    """
    Compute advantage for MT-GRPO using turn-level credit assignment.

    Both the turn and the outcome advantage are group-normalized over all
    samples of the batch (no restriction to multi-turn samples).

    Args:
        turn_level_rewards (torch.Tensor):
            Rewards for the first turn's actions (e.g., tool use). Shape: (bs, response_length)
        outcome_level_rewards (torch.Tensor):
            Rewards for the final outcome of the trajectory. Shape: (bs, response_length)
        turn_mask (torch.Tensor):
            Mask indicating which turn each token belongs to.
            e.g., 1 for turn 1, 2 for turn 2. Tokens with any other value
            receive an advantage of 0. Shape: (bs, response_length)
        eos_mask (torch.Tensor):
            Mask to ignore padding tokens. Shape: (bs, response_length)
        index (torch.Tensor):
            Tensor of prompt indices to group responses. Shape: (bs,)
        lambda_coeff (float):
            Coefficient to weight the turn advantage in the first turn.
        epsilon (float):
            Small value for numerical stability in division.

    Returns:
        advantages (torch.Tensor):
            The final computed advantages for each token. Shape: (bs, response_length)
        returns (torch.Tensor):
            Same as advantages, to match the common function signature. Shape: (bs, response_length)
        adv_turn (torch.Tensor):
            Group-normalized turn advantage per sample. Shape: (bs,)
        adv_outcome (torch.Tensor):
            Group-normalized outcome advantage per sample. Shape: (bs,)
    """
    seq_len = turn_level_rewards.shape[-1]
    per_sample_turn = turn_level_rewards.sum(dim=-1)        # (bs,)
    per_sample_outcome = outcome_level_rewards.sum(dim=-1)  # (bs,)
    with torch.no_grad():
        # One group-relative advantage per reward type.
        adv_outcome = _calculate_group_relative_scores(per_sample_outcome, index, epsilon)  # (bs,)
        adv_turn = _calculate_group_relative_scores(per_sample_turn, index, epsilon)        # (bs,)

        # Broadcast the per-sample values over the token dimension.
        turn_per_token = adv_turn.unsqueeze(-1).expand(-1, seq_len)
        outcome_per_token = adv_outcome.unsqueeze(-1).expand(-1, seq_len)

        first_turn_adv = lambda_coeff * turn_per_token + outcome_per_token
        second_turn_adv = outcome_per_token

        # Tokens outside turns 1 and 2 keep an advantage of zero.
        advantages = torch.where(turn_mask == 1, first_turn_adv, torch.zeros_like(turn_level_rewards))
        advantages = torch.where(turn_mask == 2, second_turn_adv, advantages)

        # Zero out padding tokens.
        advantages = advantages * eos_mask

    return advantages, advantages, adv_turn, adv_outcome

def compute_rloo_outcome_advantage(token_level_rewards: torch.Tensor,
                                   eos_mask: torch.Tensor,
                                   index: torch.Tensor,
                                   epsilon: float = 1e-6):
    """
    Compute advantage for RLOO based on https://arxiv.org/abs/2402.14740

    For a group of n > 1 responses, each summed reward s becomes
    `s * n / (n - 1) - mean * n / (n - 1)`, i.e. the reward minus the mean of
    the other responses in the group. A group with a single response keeps
    its reward unchanged.

    Args:
        token_level_rewards: `(torch.Tensor)`
            shape: (bs, response_length)
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length)
        index: group identifier of each response, length bs. May be a tensor
            or any sequence of hashable ids; tensor entries are converted to
            Python scalars so that equal ids land in the same group.
        epsilon: `(float)`
            unused; kept for signature parity with the other estimators.

    Returns:
        advantages: `(torch.Tensor)`
            shape: (bs, response_length)
        Returns: `(torch.Tensor)`
            shape: (bs, response_length); the same tensor as `advantages`
    """
    response_length = token_level_rewards.shape[-1]
    scores = token_level_rewards.sum(dim=-1)

    with torch.no_grad():
        bsz = scores.shape[0]

        # torch.Tensor hashes by identity, so a raw tensor element would make
        # every sample its own group; use the underlying scalar as the key.
        group_keys = []
        for i in range(bsz):
            key = index[i]
            group_keys.append(key.item() if isinstance(key, torch.Tensor) else key)

        id2score = defaultdict(list)
        for i, key in enumerate(group_keys):
            id2score[key].append(scores[i])

        # Only groups with more than one member need a mean; singleton scores
        # are left untouched below.
        id2mean = {
            key: torch.mean(torch.stack(members))
            for key, members in id2score.items()
            if len(members) > 1
        }

        # All group means are final before any score is overwritten.
        for i, key in enumerate(group_keys):
            response_num = len(id2score[key])
            if response_num > 1:
                scores[i] = scores[i] * response_num / (response_num -
                                                        1) - id2mean[key] * response_num / (response_num - 1)
        scores = scores.unsqueeze(-1).tile([1, response_length]) * eos_mask

    return scores, scores


def compute_reinforce_plus_plus_outcome_advantage(token_level_rewards: torch.Tensor, eos_mask: torch.Tensor,
                                                  gamma: torch.Tensor):
    """
    Compute advantage for REINFORCE++.
    This implementation is based on the paper: https://arxiv.org/abs/2501.03262

    Returns are the discounted reward-to-go accumulated from the last position
    backwards; advantages are those returns passed through
    `verl_F.masked_whiten` and multiplied by `eos_mask`.

    Args:
        token_level_rewards: `(torch.Tensor)`
            shape: (bs, response_length)
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length)
        gamma: discount factor applied to the running return at every step.

    Returns:
        advantages: `(torch.Tensor)`
            shape: (bs, response_length)
        Returns: `(torch.Tensor)`
            shape: (bs, response_length)
    """

    with torch.no_grad():
        returns = torch.zeros_like(token_level_rewards)
        carried = 0

        final_step = token_level_rewards.shape[1] - 1
        for t in range(final_step, -1, -1):
            carried = token_level_rewards[:, t] + gamma * carried
            returns[:, t] = carried
            # Masked positions zero the carry so nothing from after EOS leaks
            # into earlier tokens.
            carried = carried * eos_mask[:, t]

        advantages = verl_F.masked_whiten(returns, eos_mask) * eos_mask

    return advantages, returns


def compute_remax_outcome_advantage(token_level_rewards: torch.Tensor, reward_baselines: torch.Tensor,
                                    eos_mask: torch.Tensor):
    """
    Compute advantage for ReMax, operating only on Outcome reward
    (with only one scalar reward for each response).
    This implementation is based on the paper: https://arxiv.org/abs/2310.10505

    Returns are the reward-to-go of the masked rewards (a reversed cumulative
    sum along the response dimension); advantages subtract the per-sample
    baseline on unmasked positions.

    Args:
        token_level_rewards: `(torch.Tensor)`
            shape: (bs, response_length)
        reward_baselines: `(torch.Tensor)`
            shape: (bs,)
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length)

    Returns:
        advantages: `(torch.Tensor)`
            shape: (bs, response_length)
        Returns: `(torch.Tensor)`
            shape: (bs, response_length)
    """
    with torch.no_grad():
        masked_rewards = token_level_rewards * eos_mask
        # flip -> cumsum -> flip yields the suffix sum at every position.
        returns = masked_rewards.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
        # (bs, 1) broadcasts against (bs, response_length); no explicit tile needed.
        advantages = returns - reward_baselines.unsqueeze(-1) * eos_mask

    return advantages, returns


def compute_rewards(token_level_scores, old_log_prob, ref_log_prob, kl_ratio):
    """Subtract the scaled log-prob gap `(old_log_prob - ref_log_prob) * kl_ratio` from the scores."""
    return token_level_scores - (old_log_prob - ref_log_prob) * kl_ratio


def compute_policy_loss(old_log_prob,
                        log_prob,
                        advantages,
                        eos_mask,
                        cliprange=None,
                        cliprange_low=None,
                        cliprange_high=None,
                        turn_mask=None,
                        control_level=None,
                        control_alpha=1):
    """Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1122

    Args:
        old_log_prob: `(torch.Tensor)`
            shape: (bs, response_length)
        log_prob: `(torch.Tensor)`
            shape: (bs, response_length)
        advantages: `(torch.Tensor)`
            shape: (bs, response_length)
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length)
        cliprange: (float)
            The clip range used in PPO. See https://arxiv.org/abs/1707.06347
        cliprange_low: (float)
            The lower clip range used in PPO. Falls back to `cliprange` when None.
        cliprange_high: (float)
            The higher clip range used in PPO. Falls back to `cliprange` when None.
        turn_mask: `(torch.Tensor)`
            shape: (bs, response_length). Turn id of every token; only read
            when `control_level` is 'turn_level' or 'tool_level'.
        control_level: (str)
            How the per-token loss is reduced. 'turn_level' averages the
            per-turn mean losses; 'tool_level' adds `control_alpha` times the
            mean loss of non-final turns to the mean loss of each sequence's
            final turn; anything else is a plain masked mean over all tokens.
        control_alpha: (float)
            Weight of the non-final-turn loss in 'tool_level' mode.
    Returns:
        pg_loss: `a scalar torch.Tensor`
            policy gradient loss computed via PPO
        pg_clipfrac: (float)
            the fraction of policy gradient loss being clipped
        ppo_kl: (float)
            the estimated KL divergence between the latest updating policy and the old sampling policy
    """
    log_ratio = log_prob - old_log_prob
    ratio = torch.exp(log_ratio)
    ppo_kl = verl_F.masked_mean(-log_ratio, eos_mask)

    unclipped_loss = -advantages * ratio
    # Asymmetric bounds default to the symmetric cliprange.
    if cliprange_low is None:
        cliprange_low = cliprange
    if cliprange_high is None:
        cliprange_high = cliprange
    clipped_loss = -advantages * torch.clamp(ratio, 1.0 - cliprange_low, 1.0 + cliprange_high)
    token_loss = torch.max(unclipped_loss, clipped_loss)

    if control_level == 'turn_level':
        # The largest turn id doubles as the number of turns.
        num_turns = torch.max(turn_mask).int()

        total_turn_loss = 0.0
        for turn_id in range(1, num_turns + 1):
            # Valid (non-padding) tokens that belong to this turn.
            turn_tokens = (turn_mask == turn_id).float() * eos_mask
            total_turn_loss += verl_F.masked_mean(token_loss, turn_tokens)

        # Mean of the per-turn mean losses.
        pg_loss = total_turn_loss / num_turns

    elif control_level == 'tool_level':
        last_turn_id, _ = torch.max(turn_mask, dim=1, keepdim=True)
        in_some_turn = (turn_mask > 0)
        # Tokens of each sequence's final turn, (bs, response_length).
        last_turn_tokens = in_some_turn & (turn_mask == last_turn_id)
        # Tokens of every earlier turn, (bs, response_length).
        earlier_turn_tokens = in_some_turn & (turn_mask < last_turn_id)

        earlier_weights = earlier_turn_tokens * eos_mask
        if earlier_weights.sum() == 0:
            # No sequence has an earlier turn: contribute nothing rather than
            # averaging over an empty mask.
            intermediate_turn_loss = torch.tensor(0.0)
        else:
            intermediate_turn_loss = verl_F.masked_mean(token_loss, earlier_weights)
        last_turn_loss = verl_F.masked_mean(token_loss, last_turn_tokens * eos_mask)

        pg_loss = control_alpha * intermediate_turn_loss + last_turn_loss
    else:
        pg_loss = verl_F.masked_mean(token_loss, eos_mask)

    was_clipped = torch.gt(clipped_loss, unclipped_loss).float()
    pg_clipfrac = verl_F.masked_mean(was_clipped, eos_mask)
    return pg_loss, pg_clipfrac, ppo_kl


def compute_entropy_loss(logits, eos_mask):
    """Compute Categorical entropy loss

    Args:
        logits: `(torch.Tensor)`
            shape: (bs, response_length, vocab_size)
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length)

    Returns:
        entropy: a scalar torch.Tensor; the masked mean of the per-token entropy

    """
    per_token_entropy = verl_F.entropy_from_logits(logits)  # (bs, response_len)
    return verl_F.masked_mean(per_token_entropy, mask=eos_mask)


def compute_value_loss(vpreds, returns, values, eos_mask, cliprange_value):
    """Compute the value loss. Copied from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1151

    Args:
        vpreds (`torch.FloatTensor`):
            Predicted values of the value head, shape (`batch_size`, `response_length`)
        returns: (`torch.FloatTensor`):
            Ground truth returns, shape (`batch_size`, `response_length`)
        values (`torch.FloatTensor`):
            Old values of value head, shape (`batch_size`, `response_length`)
        eos_mask: mask selecting the tokens that contribute to the means.
        cliprange_value: half-width of the band around `values` that the
            predictions are clipped to.

    Returns:
        vf_loss: a scalar (`torch.FloatTensor`):
            value function loss
        vf_clipfrac: a float
            The ratio of vf being clipped

    """
    lower_bound = values - cliprange_value
    upper_bound = values + cliprange_value
    clipped_preds = verl_F.clip_by_value(vpreds, lower_bound, upper_bound)

    raw_sq_err = (vpreds - returns)**2
    clipped_sq_err = (clipped_preds - returns)**2

    # Take the larger of the two errors per token.
    worst_sq_err = torch.max(raw_sq_err, clipped_sq_err)
    vf_loss = 0.5 * verl_F.masked_mean(worst_sq_err, eos_mask)

    clip_was_worse = torch.gt(clipped_sq_err, raw_sq_err).float()
    vf_clipfrac = verl_F.masked_mean(clip_was_worse, eos_mask)
    return vf_loss, vf_clipfrac


def kl_penalty(logprob: torch.FloatTensor, ref_logprob: torch.FloatTensor, kl_penalty) -> torch.FloatTensor:
    """Compute KL divergence given logprob and ref_logprob.
    Copied from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1104

    Args:
        logprob: log-probabilities under the current policy.
        ref_logprob: log-probabilities under the reference policy.
        kl_penalty: estimator name. "kl" is the plain difference, "abs" its
            absolute value, "mse" half its square, and "low_var_kl" the
            estimator `exp(d) - d - 1` with `d = ref_logprob - logprob`,
            clamped to [-10, 10].

    Returns:
        A tensor with the element-wise penalty.

    Raises:
        NotImplementedError: for "full" and for any unrecognized name.
    """
    mode = kl_penalty

    if mode in ("kl", "abs", "mse"):
        gap = logprob - ref_logprob
        if mode == "kl":
            return gap
        if mode == "abs":
            return gap.abs()
        return 0.5 * gap.square()

    # J. Schulman. Approximating kl divergence, 2020.
    # URL http://joschu.net/blog/kl-approx.html.
    if mode == 'low_var_kl':
        neg_gap = ref_logprob - logprob
        estimate = (torch.exp(neg_gap) - neg_gap - 1).contiguous()
        return torch.clamp(estimate, min=-10, max=10)

    # "full" would need the logits of every vocabulary token for both
    # policies; like any unknown name it is not implemented.
    raise NotImplementedError
