{
    "sourceFile": "verl/trainer/core_algos.py",
    "activeCommit": 0,
    "commits": [
        {
            "activePatchIndex": 2,
            "patches": [
                {
                    "date": 1758800813949,
                    "content": "Index: \n===================================================================\n--- \n+++ \n"
                },
                {
                    "date": 1758800821110,
                    "content": "Index: \n===================================================================\n--- \n+++ \n@@ -494,18 +494,9 @@\n     cliprange: float,\n     # aug_status = \"normal\",\n ) -> Tuple[torch.Tensor, float, float]:\n       \n-    # is_normal = torch.tensor([s == \"normal\" for s in aug_status], \n-    #                         device=advantages.device, \n-    #                         dtype=torch.bool).unsqueeze(1)\n \n-            #根据状态选择不同的优势值 \n-    # negative_approx_kl  = torch.where(is_normal.any(), log_prob - old_log_prob, log_prob)\n-    # if aug_status == \"normal\":\n-    #     negative_approx_kl = log_prob - old_log_prob\n-    # else:\n-    #     negative_approx_kl = log_prob\n     # clamp the ratio before exp to avoid nan\n     # see: https://github.com/pytorch/pytorch/issues/10729\n     negative_approx_kl = log_prob - old_log_prob\n     ratio = torch.exp(negative_approx_kl)\n"
                },
                {
                    "date": 1758800829197,
                    "content": "Index: \n===================================================================\n--- \n+++ \n@@ -506,11 +506,9 @@\n \n     pg_losses = -advantages * ratio\n     pg_losses2 = -advantages * clipped_ratio\n     pg_loss = torch.max(pg_losses, pg_losses2)\n-    # pg_loss = VF.masked_mean(torch.max(pg_losses, pg_losses2), eos_mask)\n-    # pg_loss2 = VF.masked_mean( pg_losses, eos_mask)\n-    # pg_loss=  torch.where(is_normal.any(), pg_loss1,pg_loss2)\n+\n     #统计被裁剪样本比例，用于调试更新幅度\n     pg_clipfrac = VF.masked_mean(torch.gt(pg_losses2, pg_losses).float(), eos_mask)\n     return pg_loss, pg_clipfrac, ppo_kl\n \n"
                }
            ],
            "date": 1758800813949,
            "name": "Commit-0",
            "content": "# Copyright 2022 The HuggingFace Team\n# Copyright 2024 Bytedance Ltd. and/or its affiliates\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nCore functions to implement PPO algorithms.\nThe function implemented in this file should be used by trainer with different distributed strategies to\nimplement PPO\n\"\"\"\n\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom typing import TYPE_CHECKING, Tuple\n\nimport numpy as np\nimport torch\n\nfrom ..utils import torch_functional as VF\n\n\nif TYPE_CHECKING:\n    from .config import AlgorithmConfig\n\n\nclass KLController(ABC):\n    @abstractmethod\n    def update(self, current_kl: float, n_steps: int) -> None: ...\n\n\nclass AdaptiveKLController(KLController):\n    \"\"\"\n    Adaptive KL controller described in the paper:\n    https://arxiv.org/pdf/1909.08593.pdf\n    \"\"\"\n\n    def __init__(self, init_kl_coef: float, target_kl: float, horizon: float):\n        self.value = init_kl_coef\n        self.target = target_kl\n        self.horizon = horizon\n\n    def update(self, current_kl: float, n_steps: int) -> None:\n        target = self.target\n        proportional_error = np.clip(current_kl / target - 1, -0.2, 0.2)\n        mult = 1 + proportional_error * n_steps / self.horizon\n        self.value *= mult\n\n\nclass FixedKLController(KLController):\n    \"\"\"Fixed KL controller.\"\"\"\n\n    def __init__(self, init_kl_coef: float):\n        self.value = init_kl_coef\n\n    def update(self, current_kl: float, n_steps: int) -> None:\n        pass\n\n\ndef get_kl_controller(algorithm_config: \"AlgorithmConfig\") -> KLController:\n    if algorithm_config.kl_type == \"fixed\":\n        kl_ctrl = FixedKLController(init_kl_coef=algorithm_config.kl_coef)\n    elif algorithm_config.kl_type == \"adaptive\":\n        assert algorithm_config.kl_horizon > 0, f\"horizon must be larger than 0. Got {algorithm_config.kl_horizon}.\"\n        kl_ctrl = AdaptiveKLController(\n            init_kl_coef=algorithm_config.kl_coef,\n            target_kl=algorithm_config.kl_target,\n            horizon=algorithm_config.kl_horizon,\n        )\n    else:\n        raise ValueError(f\"Unknown kl type: {algorithm_config.kl_type}.\")\n\n    return kl_ctrl\n\n\n@torch.no_grad()\ndef compute_gae_advantage_return(\n    token_level_rewards: torch.Tensor,\n    values: torch.Tensor,\n    eos_mask: torch.Tensor,\n    gamma: torch.Tensor,\n    lam: torch.Tensor,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py\n\n    Args:\n        token_level_rewards: `(torch.Tensor)`\n            shape: (bs, response_length)\n        values: `(torch.Tensor)`\n            shape: (bs, response_length)\n        eos_mask: `(torch.Tensor)`\n            shape: (bs, response_length). [EOS] mask. The token after [EOS] have mask zero.\n        gamma: `(float)`\n            discounted factor used in RL\n        lam: `(float)`\n            lambda value when computing Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438)\n\n    Returns:\n        advantages: `(torch.Tensor)`\n            shape: (bs, response_length)\n        Returns: `(torch.Tensor)`\n            shape: (bs, response_length)\n\n    \"\"\"\n    lastgaelam = 0\n    advantages_reversed = []\n    gen_len = token_level_rewards.shape[-1]\n    for t in reversed(range(gen_len)):\n        nextvalues = values[:, t + 1] if t < gen_len - 1 else 0.0\n        delta = token_level_rewards[:, t] + gamma * nextvalues - values[:, t]\n        lastgaelam = delta + gamma * lam * lastgaelam\n        advantages_reversed.append(lastgaelam)\n\n    advantages = torch.stack(advantages_reversed[::-1], dim=1)\n    returns = advantages + values\n    advantages = VF.masked_whiten(advantages, eos_mask)\n    return advantages, returns\n\n\n# NOTE(sgm): this implementation only consider outcome supervision, where the reward is a scalar.\n@torch.no_grad()\ndef compute_grpo_outcome_advantage(\n    token_level_rewards: torch.Tensor, eos_mask: torch.Tensor, index: torch.Tensor, epsilon: float = 1e-6\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"\n    Compute advantage for GRPO, operating only on Outcome reward\n    (with only one scalar reward for each response).\n    Args:\n        token_level_rewards: `(torch.Tensor)`\n            shape: (bs, response_length)\n        eos_mask: `(torch.Tensor)`\n            shape: (bs, response_length)\n    index:标识不同prompt组的索引\n    Returns:\n        advantages: `(torch.Tensor)`\n            shape: (bs, response_length)\n        Returns: `(torch.Tensor)`\n            shape: (bs, response_length)\n       Args:  \n        token_level_rewards: `(torch.Tensor)`  \n            shape: (bs, response_length)  \n            # 包含了每个 token 可能的奖励。在 outcome supervision 的情况下，  \n            # 通常只有一个非零值，位于响应序列的末尾，代表整个序列的标量奖励。  \n        eos_mask: `(torch.Tensor)`  \n            shape: (bs, response_length)  \n            # 结束符 (End-Of-Sequence) 掩码。值为 1 的位置表示有效的响应 token，  \n            # 通常在实际的 EOS token 处为 1，之后为 0。  \n            # GRPO 论文中提到，优势被放置在 EOS token 的位置。 \n            # 这里是responsemask，掩码出来的是响应部分的 token。 \n        index: `(torch.Tensor)`  \n            shape: (bs,)  \n            # 每个样本的提示 (prompt) 索引。用于对具有相同提示的响应进行分组，  \n            # 以便在同一提示下对它们的得分进行归一化。  \n        epsilon: `(float)`  \n            # 一个小的常数，用于防止在归一化时除以零（如果标准差为零）。  \n\n    Returns:  \n        advantages: `(torch.Tensor)`  \n            shape: (bs, response_length)  \n            # 计算得到的优势值。在 outcome supervision 的情况下，这通常是归一化后的标量奖励，  \n            # 扩展到响应序列的长度，并由 eos_mask 掩码。  \n        Returns: `(torch.Tensor)`  \n            shape: (bs, response_length)  \n            # 在这个特定的 GRPO outcome 实现中，返回值 (Returns) 与优势值 (advantages) 相同。  \n            # 这是因为 GRPO 的 outcome 奖励直接作为优势，没有使用值函数进行基线扣除或 GAE 计算。  \n    \n    \"\"\"\n    response_length = token_level_rewards.shape[-1]\n    scores = token_level_rewards.sum(dim=-1)\n    id2score = defaultdict(list)\n    id2mean, id2std = {}, {}\n\n    bsz = scores.shape[0]\n    for i in range(bsz):\n        id2score[index[i]].append(scores[i])\n\n    for idx in id2score:\n        if len(id2score[idx]) == 1:\n            id2mean[idx] = torch.tensor(0.0)\n            id2std[idx] = torch.tensor(1.0)\n        elif len(id2score[idx]) > 1:\n            id2mean[idx] = torch.mean(torch.tensor(id2score[idx]))\n            id2std[idx] = torch.std(torch.tensor([id2score[idx]]))\n        else:\n            raise ValueError(f\"no score in prompt index: {idx}\")\n\n    for i in range(bsz):\n        scores[i] = (scores[i] - id2mean[index[i]]) / (id2std[index[i]] + epsilon)\n\n    scores = scores.unsqueeze(-1).tile([1, response_length]) * eos_mask\n    return scores, scores\n\n@torch.no_grad()\ndef compute_grpo_outcome_advantage_hallucinations(\n    token_level_rewards: torch.Tensor, eos_mask: torch.Tensor, index: torch.Tensor,hallucinations: torch.Tensor, epsilon: float = 1e-6\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"\n    Compute advantage for GRPO, operating only on Outcome reward\n    (with only one scalar reward for each response).\n    Args:\n        token_level_rewards: `(torch.Tensor)`\n            shape: (bs, response_length)\n        eos_mask: `(torch.Tensor)`\n            shape: (bs, response_length)\n    index:标识不同prompt组的索引\n    Returns:\n        advantages: `(torch.Tensor)`\n            shape: (bs, response_length)\n        Returns: `(torch.Tensor)`\n            shape: (bs, response_length)\n    \"\"\"\n    response_length = token_level_rewards.shape[-1]\n    scores = token_level_rewards.sum(dim=-1)\n    id2score = defaultdict(list)\n    id2mean, id2std = {}, {}\n\n    bsz = scores.shape[0]\n    for i in range(bsz):\n        id2score[index[i]].append(scores[i])\n\n    for idx in id2score:\n        if len(id2score[idx]) == 1:\n            id2mean[idx] = torch.tensor(0.0)\n            id2std[idx] = torch.tensor(1.0)\n        elif len(id2score[idx]) > 1:\n            id2mean[idx] = torch.mean(torch.tensor(id2score[idx]))\n            id2std[idx] = torch.std(torch.tensor([id2score[idx]]))\n        else:\n            raise ValueError(f\"no score in prompt index: {idx}\")\n\n    for i in range(bsz):\n        scores[i] = (scores[i] - id2mean[index[i]]) / (id2std[index[i]] + epsilon)\n\n    scores = scores.unsqueeze(-1).tile([1, response_length])*hallucinations * eos_mask\n\n    return scores, scores\n@torch.no_grad()\ndef compute_grpo_outcome_advantage_aug(\n    status,token_level_rewards: torch.Tensor, eos_mask: torch.Tensor, index: torch.Tensor, epsilon: float = 1e-6\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"\n    Compute advantage for GRPO, operating only on Outcome reward\n    (with only one scalar reward for each response).\n    Args:\n        token_level_rewards: `(torch.Tensor)`\n            shape: (bs, response_length)\n        eos_mask: `(torch.Tensor)`\n            shape: (bs, response_length)\n    index:标识不同prompt组的索引\n    Returns:\n        advantages: `(torch.Tensor)`\n            shape: (bs, response_length)\n        Returns: `(torch.Tensor)`\n            shape: (bs, response_length)\n    \n    \"\"\"\n    response_length = token_level_rewards.shape[-1]\n    scores = token_level_rewards.sum(dim=-1)\n    raw_scores = scores.clone()  # 保存原始得分用于后续计算\n    \n    # 原始分组逻辑（基于index）\n    id2score = defaultdict(list)\n    id2mean, id2std = {}, {}\n    \n    bsz = scores.shape[0]\n    for i in range(bsz):\n        id2score[index[i]].append(scores[i])\n    \n    for idx in id2score:\n        if len(id2score[idx]) == 1:\n            id2mean[idx] = torch.tensor(0.0)\n            id2std[idx] = torch.tensor(1.0)\n        elif len(id2score[idx]) > 1:\n            id2mean[idx] = torch.mean(torch.tensor(id2score[idx]))\n            id2std[idx] = torch.std(torch.tensor([id2score[idx]]))\n        else:\n            raise ValueError(f\"no score in prompt index: {idx}\")\n    \n    for i in range(bsz):\n        scores[i] = (scores[i] - id2mean[index[i]]) / (id2std[index[i]] + epsilon)\n    \n    group_scores = scores.unsqueeze(-1).expand(-1, response_length) * eos_mask\n    \n    # 2. 新增逻辑：计算基于(index, status)复合分组的归一化得分\n    comp2score = defaultdict(list)\n    comp2mean, comp2std = {}, {}\n    \n    # 构建复合分组字典\n    for i in range(bsz):\n        comp_key = (index[i], status[i])  # 使用(index, status)作为复合键\n        comp2score[comp_key].append(raw_scores[i])  # 使用原始得分\n    \n    # 计算每个复合分组的统计量\n    for comp_key in comp2score:\n        group_scores_list = comp2score[comp_key]\n        if len(group_scores_list) == 1:\n            comp2mean[comp_key] = torch.tensor(0.0)\n            comp2std[comp_key] = torch.tensor(1.0)\n        else:\n            scores_tensor = torch.stack(group_scores_list)\n            comp2mean[comp_key] = torch.mean(scores_tensor)\n            comp2std[comp_key] = torch.std(scores_tensor)\n    \n    # 计算复合分组归一化得分\n    comp_scores = torch.zeros_like(raw_scores)\n    for i in range(bsz):\n        comp_key = (index[i], status[i])\n        comp_scores[i] = (raw_scores[i] - comp2mean[comp_key]) / (comp2std[comp_key] + epsilon)\n    \n    comp_scores = comp_scores.unsqueeze(-1).expand(-1, response_length) * eos_mask\n    \n    # 返回原始分组得分（两份）和复合分组得分（两份）\n    return group_scores, group_scores, comp_scores, comp_scores\n\n\n\n@torch.no_grad()\ndef compute_rloo_outcome_advantage(\n    token_level_rewards: torch.Tensor, eos_mask: torch.Tensor, index: torch.Tensor, epsilon: float = 1e-6\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"\n    Compute advantage for RLOO based on https://arxiv.org/abs/2402.14740\n    Args:\n        token_level_rewards: `(torch.Tensor)`\n            shape: (bs, response_length)\n        eos_mask: `(torch.Tensor)`\n            shape: (bs, response_length)\n\n    Returns:\n        advantages: `(torch.Tensor)`\n            shape: (bs, response_length)\n        Returns: `(torch.Tensor)`\n            shape: (bs, response_length)\n    \"\"\"\n    response_length = token_level_rewards.shape[-1]\n    scores = token_level_rewards.sum(dim=-1)\n\n    id2score = defaultdict(list)\n    id2mean = {}\n    bsz = scores.shape[0]\n    for i in range(bsz):\n        id2score[index[i]].append(scores[i])\n\n    for idx in id2score:\n        if len(id2score[idx]) == 1:\n            id2mean[idx] = torch.tensor(0.0)\n        elif len(id2score[idx]) > 1:\n            id2mean[idx] = torch.mean(torch.tensor(id2score[idx]))\n        else:\n            raise ValueError(f\"no score in prompt index: {idx}.\")\n\n    for i in range(bsz):\n        response_num = len(id2score[index[i]])\n        if response_num > 1:\n            scores[i] = scores[i] * response_num / (response_num - 1) - id2mean[index[i]] * response_num / (\n                response_num - 1\n            )\n\n    scores = scores.unsqueeze(-1).tile([1, response_length]) * eos_mask\n    return scores, scores\n\n\n@torch.no_grad()\ndef compute_reinforce_plus_plus_outcome_advantage(\n    token_level_rewards: torch.Tensor, eos_mask: torch.Tensor, gamma: torch.Tensor\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"\n    Compute advantage for REINFORCE++.\n    This implementation is based on the paper: https://arxiv.org/abs/2501.03262\n    Args:\n        token_level_rewards: `(torch.Tensor)`\n            shape: (bs, response_length)\n        eos_mask: `(torch.Tensor)`\n            shape: (bs, response_length)\n\n    Returns:\n        advantages: `(torch.Tensor)`\n            shape: (bs, response_length)\n        Returns: `(torch.Tensor)`\n            shape: (bs, response_length)\n    \"\"\"\n    returns = torch.zeros_like(token_level_rewards)\n    running_return = 0\n    for t in reversed(range(token_level_rewards.shape[1])):\n        running_return = token_level_rewards[:, t] + gamma * running_return\n        returns[:, t] = running_return\n        # Reset after EOS\n        running_return = running_return * eos_mask[:, t]\n\n    advantages = VF.masked_whiten(returns, eos_mask)\n    advantages = advantages * eos_mask\n    return advantages, returns\n\n\n@torch.no_grad()\ndef compute_remax_outcome_advantage(\n    token_level_rewards: torch.Tensor, reward_baselines: torch.Tensor, eos_mask: torch.Tensor\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"\n    Compute advantage for ReMax, operating only on Outcome reward\n    This implementation is based on the paper: https://arxiv.org/abs/2310.10505\n\n    (with only one scalar reward for each response).\n    Args:\n        token_level_rewards: `(torch.Tensor)`\n            shape: (bs, response_length)\n        reward_baselines: `(torch.Tensor)`\n            shape: (bs,)\n        eos_mask: `(torch.Tensor)`\n            shape: (bs, response_length)\n\n    Returns:\n        advantages: `(torch.Tensor)`\n            shape: (bs, response_length)\n        Returns: `(torch.Tensor)`\n            shape: (bs, response_length)\n    \"\"\"\n    response_length = token_level_rewards.shape[-1]\n    # scores = token_level_rewards.sum(dim=-1)\n    returns = (token_level_rewards * eos_mask).flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])\n    advantages = returns - reward_baselines.unsqueeze(-1).tile([1, response_length]) * eos_mask\n    return advantages, returns\n\n\ndef compute_rewards(\n    token_level_scores: torch.Tensor,\n    old_log_prob: torch.Tensor,\n    ref_log_prob: torch.Tensor,\n    kl_ratio: float,\n) -> torch.Tensor:\n    kl = old_log_prob - ref_log_prob\n    return token_level_scores - kl * kl_ratio\n\n\ndef compute_policy_loss1(\n    old_log_prob: torch.Tensor,\n    log_prob: torch.Tensor,\n    advantages: torch.Tensor,\n    eos_mask: torch.Tensor,\n    cliprange: float,\n    # aug_status = \"normal\",\n) -> Tuple[torch.Tensor, float, float]:\n    \"\"\"Compute the policy loss.\n\n    Adapted from https://github.com/huggingface/trl/blob/v0.15.0/trl/trainer/ppo_trainer.py#L568\n\n    Args:\n        old_log_prob: `(torch.Tensor)`\n            shape: (bs, response_length)\n        log_prob: `(torch.Tensor)`\n            shape: (bs, response_length)\n        advantages: `(torch.Tensor)`\n            shape: (bs, response_length)\n        eos_mask: `(torch.Tensor)`\n            shape: (bs, response_length)\n        cliprange: (float)\n            The clip range used in PPO. See https://arxiv.org/abs/1707.06347\n\n    Returns:\n        pg_loss: `a scalar torch.Tensor`\n            policy gradient loss computed via PPO\n        pg_clipfrac: (float)\n            a float number indicating the fraction of policy gradient loss being clipped\n    \"\"\"\n\n    # clamp the ratio before exp to avoid nan\n    # see: https://github.com/pytorch/pytorch/issues/10729\n    negative_approx_kl = log_prob - old_log_prob\n    ratio = torch.exp(negative_approx_kl)\n    # clipped_ratio = torch.exp(torch.clamp(negative_approx_kl, torch.log(1.0 - cliprange), torch.log(1.0 + cliprange)))\n    clipped_ratio = torch.exp(torch.clamp(negative_approx_kl, np.log(1.0 - cliprange), np.log(1.0 + cliprange)))\n    ppo_kl = VF.masked_mean(-negative_approx_kl, eos_mask)\n\n    pg_losses = -advantages * ratio\n    pg_losses2 = -advantages * clipped_ratio\n    pg_loss = VF.masked_mean(torch.max(pg_losses, pg_losses2), eos_mask)\n\n    pg_clipfrac = VF.masked_mean(torch.gt(pg_losses2, pg_losses).float(), eos_mask)\n    return pg_loss, pg_clipfrac, ppo_kl\n\ndef compute_policy_loss(\n    old_log_prob: torch.Tensor,\n    log_prob: torch.Tensor,\n    advantages: torch.Tensor,\n    eos_mask: torch.Tensor,\n    cliprange: float,\n    # aug_status = \"normal\",\n) -> Tuple[torch.Tensor, float, float]:\n      \n    # is_normal = torch.tensor([s == \"normal\" for s in aug_status], \n    #                         device=advantages.device, \n    #                         dtype=torch.bool).unsqueeze(1)\n\n            #根据状态选择不同的优势值 \n    # negative_approx_kl  = torch.where(is_normal.any(), log_prob - old_log_prob, log_prob)\n    # if aug_status == \"normal\":\n    #     negative_approx_kl = log_prob - old_log_prob\n    # else:\n    #     negative_approx_kl = log_prob\n    # clamp the ratio before exp to avoid nan\n    # see: https://github.com/pytorch/pytorch/issues/10729\n    negative_approx_kl = log_prob - old_log_prob\n    ratio = torch.exp(negative_approx_kl)\n    # clipped_ratio = torch.exp(torch.clamp(negative_approx_kl, torch.log(1.0 - cliprange), torch.log(1.0 + cliprange)))\n    clipped_ratio = torch.exp(torch.clamp(negative_approx_kl, np.log(1.0 - cliprange), np.log(1.0 + cliprange)))\n    ppo_kl = VF.masked_mean(-negative_approx_kl, eos_mask)\n\n    pg_losses = -advantages * ratio\n    pg_losses2 = -advantages * clipped_ratio\n    pg_loss = torch.max(pg_losses, pg_losses2)\n    # pg_loss = VF.masked_mean(torch.max(pg_losses, pg_losses2), eos_mask)\n    # pg_loss2 = VF.masked_mean( pg_losses, eos_mask)\n    # pg_loss=  torch.where(is_normal.any(), pg_loss1,pg_loss2)\n    #统计被裁剪样本比例，用于调试更新幅度\n    pg_clipfrac = VF.masked_mean(torch.gt(pg_losses2, pg_losses).float(), eos_mask)\n    return pg_loss, pg_clipfrac, ppo_kl\n\n\ndef compute_entropy_loss(logits: torch.Tensor, eos_mask: torch.Tensor) -> torch.Tensor:\n    \"\"\"Compute categorical entropy loss.\n\n    Adapted from https://github.com/huggingface/trl/blob/v0.15.0/trl/trainer/ppo_trainer.py#L582\n\n    Args:\n        logits: `(torch.Tensor)`\n            shape: (bs, response_length, vocab_size)\n        eos_mask: `(torch.Tensor)`\n            shape: (bs, response_length)\n\n    Returns:\n        entropy: a scalar torch.Tensor\n\n    \"\"\"\n    # compute entropy\n    entropy = VF.entropy_from_logits(logits)  # (bs, response_len)\n    entropy_loss = VF.masked_mean(entropy, mask=eos_mask)\n    return entropy_loss\n\n\ndef compute_value_loss(\n    vpreds: torch.Tensor,\n    returns: torch.Tensor,\n    values: torch.Tensor,\n    eos_mask: torch.Tensor,\n    cliprange_value: float,\n) -> Tuple[torch.Tensor, float]:\n    \"\"\"Compute the value loss.\n\n    Copied from https://github.com/huggingface/trl/blob/v0.15.0/trl/trainer/ppo_trainer.py#L556\n\n    Args:\n        vpreds (`torch.FloatTensor`):\n            Predicted values of the value head, shape (`batch_size`, `response_length`)\n        returns: (`torch.FloatTensor`):\n            Ground truth returns, shape (`batch_size`, `response_length`)\n        values (`torch.FloatTensor`):\n            Old values of value head, shape (`batch_size`, `response_length`)\n        eos_mask: `(torch.Tensor)`\n            shape: (bs, response_length)\n        cliprange_value: (float)\n            The clip range for value net used in PPO. See https://arxiv.org/abs/1707.06347\n\n    Returns:\n        vf_loss: a scalar (`torch.FloatTensor`):\n            value function loss\n        vf_clipfrac: a float\n            The ratio of vf being clipped\n    \"\"\"\n    vpredclipped = VF.clip_by_value(vpreds, values - cliprange_value, values + cliprange_value)\n    vf_losses1 = torch.square(vpreds - returns)\n    vf_losses2 = torch.square(vpredclipped - returns)\n    vf_loss = 0.5 * VF.masked_mean(torch.max(vf_losses1, vf_losses2), eos_mask)\n    vf_clipfrac = VF.masked_mean(torch.gt(vf_losses2, vf_losses1).float(), eos_mask)\n    return vf_loss, vf_clipfrac\n\n\ndef kl_penalty(logprob: torch.FloatTensor, ref_logprob: torch.FloatTensor, kl_penalty: str) -> torch.Tensor:\n    \"\"\"Compute KL divergence given logprob and ref_logprob.\n    Copied from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1104\n\n    Args:\n        logprob: torch.Tensor\n        ref_logprob: torch.Tensor\n\n    Returns:\n        kl_div: torch.Tensor\n    \"\"\"\n    if kl_penalty == \"kl\":\n        return logprob - ref_logprob\n\n    if kl_penalty == \"abs\":\n        return (logprob - ref_logprob).abs()\n\n    if kl_penalty == \"mse\":\n        return 0.5 * (logprob - ref_logprob).square()\n\n    # J. Schulman. Approximating kl divergence, 2020.\n    # # URL http://joschu.net/blog/kl-approx.html.\n    if kl_penalty == \"low_var_kl\":\n        kl = ref_logprob - logprob\n        ratio = torch.exp(kl)\n        kld = (ratio - kl - 1).contiguous()\n        return torch.clamp(kld, min=-10, max=10)\n\n    if kl_penalty == \"full\":\n        # so, here logprob and ref_logprob should contain the logits for every token in vocabulary\n        raise NotImplementedError\n\n    raise NotImplementedError\n"
        }
    ]
}