# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Core functions to implement PPO algorithms.
The functions implemented in this file should be used by trainers with different distributed strategies to
implement PPO.
"""

import numpy as np
import torch
from collections import defaultdict
from scipy.stats import kendalltau, spearmanr
import verl.utils.torch_functional as verl_F
from verl import DataProto
import random

class AdaptiveKLController:
    """
    KL coefficient controller that adapts to the observed KL divergence, see:
    https://arxiv.org/pdf/1909.08593.pdf
    """

    def __init__(self, init_kl_coef, target_kl, horizon):
        # `value` is the live coefficient; `target` and `horizon` only steer `update`.
        self.horizon = horizon
        self.target = target_kl
        self.value = init_kl_coef

    def update(self, current_kl, n_steps):
        """Rescale ``self.value`` in place from the gap between ``current_kl`` and the target.

        Args:
            current_kl: the KL value observed for the latest batch.
            n_steps: number of steps the observation covers; scales the correction.
        """
        # Relative deviation from the target, bounded to +/-20% per update.
        relative_gap = current_kl / self.target - 1
        bounded_gap = np.clip(relative_gap, -0.2, 0.2)
        self.value *= 1 + bounded_gap * n_steps / self.horizon


class FixedKLController:
    """KL controller whose coefficient never changes."""

    def __init__(self, kl_coef):
        self.value = kl_coef

    def update(self, current_kl, n_steps):
        """Do nothing; both arguments are accepted and ignored, ``self.value`` is left untouched."""
        return None


def get_kl_controller(config):
    """Build the KL controller described by ``config.critic.kl_ctrl``.

    Args:
        config: object exposing ``config.critic.kl_ctrl`` with a ``type`` field
            ('fixed' or 'adaptive') and ``kl_coef``; the adaptive type additionally
            reads ``target_kl`` and ``horizon``.

    Returns:
        A ``FixedKLController`` or an ``AdaptiveKLController``.

    Raises:
        AssertionError: if the adaptive controller is requested with ``horizon <= 0``.
        ValueError: if ``type`` is neither 'fixed' nor 'adaptive'.
    """
    kl_cfg = config.critic.kl_ctrl

    if kl_cfg.type == 'fixed':
        return FixedKLController(kl_coef=kl_cfg.kl_coef)

    if kl_cfg.type == 'adaptive':
        # Validate the same field that is passed to the controller below
        # (the check used to read `config.kl_ctrl.horizon`, a different path).
        assert kl_cfg.horizon > 0, f'horizon must be larger than 0. Got {kl_cfg.horizon}'
        return AdaptiveKLController(init_kl_coef=kl_cfg.kl_coef,
                                    target_kl=kl_cfg.target_kl,
                                    horizon=kl_cfg.horizon)

    raise ValueError('Unknown kl_ctrl type')


def compute_gae_advantage_return(token_level_rewards: torch.Tensor, values: torch.Tensor, eos_mask: torch.Tensor,
                                 gamma: torch.Tensor, lam: torch.Tensor):
    """Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py

    Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438) computed
    backwards over the response tokens, followed by masked whitening of the advantages.

    Args:
        token_level_rewards: `(torch.Tensor)`
            shape: (bs, response_length)
        values: `(torch.Tensor)`
            shape: (bs, response_length)
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length). [EOS] mask. The tokens after [EOS] have mask zero.
        gamma: `(float)`
            discount factor used in RL
        lam: `(float)`
            lambda value of the GAE recursion

    Returns:
        advantages: `(torch.Tensor)`
            shape: (bs, response_length), whitened with `eos_mask`
        Returns: `(torch.Tensor)`
            shape: (bs, response_length), un-whitened advantages plus `values`

    """
    with torch.no_grad():
        num_steps = token_level_rewards.shape[-1]
        running_gae = 0
        collected = []  # filled from the last step to the first

        for step in range(num_steps - 1, -1, -1):
            # There is no value estimate beyond the final position.
            if step < num_steps - 1:
                next_value = values[:, step + 1]
            else:
                next_value = 0.0
            td_error = token_level_rewards[:, step] + gamma * next_value - values[:, step]
            running_gae = td_error + gamma * lam * running_gae
            collected.append(running_gae)

        collected.reverse()
        advantages = torch.stack(collected, dim=1)

        # Returns are taken before whitening changes the scale of the advantages.
        returns = advantages + values
        advantages = verl_F.masked_whiten(advantages, eos_mask)
    return advantages, returns


def grpo_scores(scores, response_length, eos_mask, epsilon, index, hybrid_indexs, inter_adv_special_token_only=False):
    """Normalize per-response scores within each prompt group and spread them over tokens.

    Each score becomes ``(score - group_mean) / (group_std + epsilon)`` where the group
    is every row sharing the same ``index`` entry; a group with a single row uses mean 0
    and std 1. ``scores`` is modified in place during normalization.

    Args:
        scores: 1-D tensor with one scalar score per response.
        response_length: number of token positions to tile each score over.
        eos_mask: mask multiplied into the tiled scores.
        epsilon: added to the group std to avoid division by zero.
        index: per-row group identifier (used as a dict key).
        hybrid_indexs: per-row 0/1 flag; only read when ``inter_adv_special_token_only``
            is set, in which case exactly half of the rows must be 1.
        inter_adv_special_token_only: if True, keep the score only on the leading
            special-token positions of each response.

    Returns:
        Tensor of shape (bs, response_length).
    """
    per_prompt = defaultdict(list)
    prompt_mean = {}
    prompt_std = {}

    with torch.no_grad():
        num_rows = scores.shape[0]
        for row in range(num_rows):
            per_prompt[index[row]].append(scores[row])

        for uid, members in per_prompt.items():
            group_size = len(members)
            if group_size == 1:
                # A lone sample cannot be standardized; leave it unchanged.
                prompt_mean[uid] = torch.tensor(0.0)
                prompt_std[uid] = torch.tensor(1.0)
            elif group_size > 1:
                prompt_mean[uid] = torch.mean(torch.tensor(members))
                prompt_std[uid] = torch.std(torch.tensor([members]))
            else:
                raise ValueError(f"no score in prompt index: {uid}")

        for row in range(num_rows):
            uid = index[row]
            scores[row] = (scores[row] - prompt_mean[uid]) / (prompt_std[uid] + epsilon)

        if inter_adv_special_token_only:
            print("using inter_adv_special_token_only", inter_adv_special_token_only)
            assert hybrid_indexs.float().mean() == 0.5
            # Only the lengths of these lists are used here.
            # NOTE(review): compute_grpo_outcome_advantage validates a 5-token no-think
            # prefix, while only 3 positions are kept here -- confirm this is intended.
            think_ids = [13708, 766, 397]
            no_think_ids = [27, 2152, 5854]
            tiled = scores.unsqueeze(-1).tile([1, response_length])
            is_think = hybrid_indexs.bool().unsqueeze(-1)  # [bs, 1]

            think_prefix = torch.zeros_like(tiled, dtype=torch.float32, device=tiled.device)
            think_prefix[:, :len(think_ids)] = 1.0

            no_think_prefix = torch.zeros_like(tiled, dtype=torch.float32, device=tiled.device)
            no_think_prefix[:, :len(no_think_ids)] = 1.0

            # Pick, per row, the prefix mask that matches the row's mode.
            special_token_mask = torch.where(is_think, think_prefix, no_think_prefix)
            scores = tiled * special_token_mask * eos_mask
        else:
            scores = scores.unsqueeze(-1).tile([1, response_length]) * eos_mask

    return scores



def rloo_scores(scores, response_length, eos_mask, epsilon, index, hybrid_indexs, inter_adv_special_token_only=False):
    """Apply a leave-one-out baseline within each prompt group and spread scores over tokens.

    For a group of ``n > 1`` rows sharing the same ``index`` entry, each score becomes
    ``score * n / (n - 1) - group_mean * n / (n - 1)``; single-row groups are left
    unchanged. ``scores`` is modified in place. ``epsilon`` is accepted but not used.

    Args:
        scores: 1-D tensor with one scalar score per response.
        response_length: number of token positions to tile each score over.
        eos_mask: mask multiplied into the tiled scores.
        epsilon: unused.
        index: per-row group identifier (used as a dict key).
        hybrid_indexs: per-row 0/1 flag; only read when ``inter_adv_special_token_only``
            is set, in which case exactly half of the rows must be 1.
        inter_adv_special_token_only: if True, keep the score only on the leading
            special-token positions of each response.

    Returns:
        Tensor of shape (bs, response_length).
    """
    per_prompt = defaultdict(list)
    prompt_mean = {}

    print("===> using rl alg RLOO.")

    with torch.no_grad():
        num_rows = scores.shape[0]
        for row in range(num_rows):
            per_prompt[index[row]].append(scores[row])

        for uid, members in per_prompt.items():
            group_size = len(members)
            if group_size == 1:
                prompt_mean[uid] = torch.tensor(0.0)
            elif group_size > 1:
                prompt_mean[uid] = torch.mean(torch.tensor(members))
            else:
                raise ValueError(f"no score in prompt index: {uid}")

        for row in range(num_rows):
            uid = index[row]
            group_size = len(per_prompt[uid])
            if group_size <= 1:
                # No other samples to build a baseline from.
                continue
            scores[row] = scores[row] * group_size / (group_size - 1) - prompt_mean[uid] * group_size / (group_size - 1)

        if inter_adv_special_token_only:
            print("using inter_adv_special_token_only", inter_adv_special_token_only)
            assert hybrid_indexs.float().mean() == 0.5
            # Only the lengths of these lists are used here.
            think_ids = [13708, 766, 397]
            no_think_ids = [27, 2152, 5854]
            tiled = scores.unsqueeze(-1).tile([1, response_length])
            is_think = hybrid_indexs.bool().unsqueeze(-1)  # [bs, 1]

            think_prefix = torch.zeros_like(tiled, dtype=torch.float32, device=tiled.device)
            think_prefix[:, :len(think_ids)] = 1.0

            no_think_prefix = torch.zeros_like(tiled, dtype=torch.float32, device=tiled.device)
            no_think_prefix[:, :len(no_think_ids)] = 1.0

            # Pick, per row, the prefix mask that matches the row's mode.
            special_token_mask = torch.where(is_think, think_prefix, no_think_prefix)
            scores = tiled * special_token_mask * eos_mask
        else:
            scores = scores.unsqueeze(-1).tile([1, response_length]) * eos_mask

    return scores


def find_last_one_indices(eos_mask: torch.Tensor) -> torch.Tensor:
    """Return, for every row of a 2-D 0/1 mask, the column index of its last 1.

    A row that contains no 1 yields the last column index, because ``argmax`` over
    an all-zero row returns 0.
    """
    num_cols = eos_mask.size(1)
    # Flip each row so the last 1 becomes the first one; argmax then returns the
    # position of that first 1 measured from the right-hand end.
    offset_from_end = torch.flip(eos_mask, dims=[1]).float().argmax(dim=1)
    return num_cols - 1 - offset_from_end

def reinforce_plus_plus_scores(scores, response_length, eos_mask, epsilon, index, hybrid_indexs, inter_adv_special_token_only=False, gamma=1.0):
    """Turn per-response scalar scores into whitened, discounted token-level advantages.

    Each score is placed on the last unmasked token of its response, discounted
    returns are accumulated right-to-left with factor ``gamma``, and the result is
    whitened over the masked positions.

    Args:
        scores: 1-D tensor with one scalar score per response.
        response_length: unused; kept for signature parity with the other estimators.
        eos_mask: (bs, response_length) 0/1 mask of valid response tokens.
        epsilon: unused.
        index: unused.
        hybrid_indexs: unused.
        inter_adv_special_token_only: unused.
        gamma: discount factor.

    Returns:
        Tensor of shape (bs, response_length) with the masked advantages.
    """
    print("===> using rl alg reinforce_plus_plus.")

    # The reward/return buffers must be floating point. Building them with
    # zeros_like(eos_mask) inherits an integer dtype when the mask is integral,
    # which silently truncates the discounted returns. A floating-point mask keeps
    # its own dtype so existing behaviour is unchanged in that case.
    if eos_mask.is_floating_point():
        reward_dtype = eos_mask.dtype
    elif scores.is_floating_point():
        reward_dtype = scores.dtype
    else:
        reward_dtype = torch.float32

    with torch.no_grad():
        new_token_level_rewards = torch.zeros_like(eos_mask, dtype=reward_dtype)
        reward_positions = find_last_one_indices(eos_mask)

        bsz = scores.shape[0]
        for i in range(bsz):
            new_token_level_rewards[i, reward_positions[i]] = scores[i]

        returns = torch.zeros_like(new_token_level_rewards)
        running_return = 0

        for t in reversed(range(new_token_level_rewards.shape[1])):
            running_return = new_token_level_rewards[:, t] + gamma * running_return
            returns[:, t] = running_return
            # Reset after EOS
            running_return = running_return * eos_mask[:, t]

        advantages = verl_F.masked_whiten(returns, eos_mask)
        advantages = advantages * eos_mask

    return advantages


# NOTE(sgm): this implementation only considers outcome supervision, where the reward is a scalar.
def compute_grpo_outcome_advantage(token_level_rewards: torch.Tensor,
                                   rm_scores: torch.Tensor,
                                   eos_mask: torch.Tensor,
                                   index: torch.Tensor,
                                   hybrid_indexs: torch.Tensor,
                                   epsilon: float = 1e-6,
                                   group_inner_alpha: float = 1.0,
                                   group_inter_alpha: float = 1.0,
                                   margin: float = 0.2,
                                   format_error_reward: float = -5.0,
                                   data: DataProto = None,
                                   inter_adv_special_token_only: bool = False,
                                   adv_estimator: str = "grpo"):
    """
    Compute hybrid (think / no-think) advantages from outcome rewards
    (with only one scalar reward for each response).

    The batch is first trimmed by dropping randomly chosen prompt groups, then every
    tensor argument is re-read from the trimmed ``data``; the values passed in for
    ``token_level_rewards``, ``eos_mask`` and ``hybrid_indexs`` are therefore not used
    beyond that point, and ``rm_scores`` / ``index`` are only used to enumerate the
    prompt groups. For every prompt the better mode is selected, each response gets a
    binary "inter" score (is it in the better mode) and a binary "inner" score (is it
    not the lowest-scored response of its mode), and both are converted to advantages
    with the estimator named by ``adv_estimator``.

    Args:
        token_level_rewards: `(torch.Tensor)`
            shape: (bs, response_length)
        rm_scores: `(torch.Tensor)`
            shape: (bs, response_length)
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length)
        index: per-row prompt identifier.
        hybrid_indexs: per-row mode flag, 0 = no-think, 1 = think.
        epsilon: numerical-stability term forwarded to the estimators.
        group_inner_alpha: weight of the inner-group advantage.
        group_inter_alpha: weight of the inter-group advantage.
        margin: bonus added to the no-think average for "general" prompts.
        format_error_reward: only echoed in ``meta_info``.
        data: batch container; required despite the ``None`` default.
        inter_adv_special_token_only: forwarded to the inter-group estimator.
        adv_estimator: "grpo", "rloo" or "reinforce_plus_plus".

    Returns:
        advantages: `(torch.Tensor)`
            shape: (bs', response_length), where bs' is the trimmed batch size
        Returns: `(torch.Tensor)`
            the same tensor as ``advantages``
        meta_info: dict of logging statistics
        data: the trimmed batch
    """
    # make training stable for multi-nodes: keep a number of prompt groups that is a
    # multiple of 8 by dropping `coff` randomly chosen groups.
    bsz = rm_scores.shape[0]
    all_keys = {}
    for i in range(bsz):
        all_keys[index[i]] = 1

    coff = len(all_keys) % 8

    # random.sample needs a real sequence; passing a dict view raises TypeError on
    # Python >= 3.11, so materialize the keys first.
    drop_key = random.sample(list(all_keys), coff)
    # presumably 4 responses per prompt (two per mode, see the length-2 asserts below)
    record_coff = len(drop_key) * 4

    selected_indexs = torch.ones_like(data.batch['rm_scores'].sum(dim=-1))
    for i in range(bsz):
        if index[i] in drop_key:
            selected_indexs[i] = 0

    data = data.select_by_index(selected_indexs)

    ### recompute these metrics from the trimmed batch
    ability = data.non_tensor_batch['ability']
    token_level_rewards = data.batch['token_level_rewards']
    rm_scores = data.batch['rm_scores']
    assert rm_scores.shape[0] % 32 == 0
    index = data.non_tensor_batch['uid']
    responses = data.batch['responses']
    response_length = responses.size(-1)
    attention_mask = data.batch['attention_mask']
    eos_mask = attention_mask[:, -response_length:]
    hybrid_indexs = data.batch['hybrid_indexs']

    response_length = rm_scores.shape[-1]
    scores = rm_scores.sum(dim=-1)
    rule_scores = token_level_rewards.sum(dim=-1)

    true_response_len = eos_mask.sum(dim=-1)

    # Rank correlation between response length and reward-model score (logging only).
    kendall_corr, _ = kendalltau(true_response_len.tolist(), scores.tolist())
    spearman_corr, _ = spearmanr(true_response_len.tolist(), scores.tolist())

    # hybrid score
    # Token-id prefixes every think / no-think response is expected to start with.
    think_ids = [13708, 766, 397]
    no_think_ids = [27, 2152, 5854, 766, 397]
    id2score_think = defaultdict(list)
    id2score_no_think = defaultdict(list)
    id2score_math_rm_score_think = defaultdict(list)
    id2score_math_rm_score_no_think = defaultdict(list)
    id2ability = {}

    bsz = scores.shape[0]
    for i in range(bsz):
        uid = index[i]

        if uid in id2ability:
            # Previously a bare `==` expression with no effect; every row of a prompt
            # must carry the same ability because the selection below is keyed on it.
            assert id2ability[uid] == ability[i], \
                f"inconsistent ability for uid {uid}: {id2ability[uid]} vs {ability[i]}"
        else:
            id2ability[uid] = ability[i]

        if hybrid_indexs[i] == 0:
            is_think = False
        elif hybrid_indexs[i] == 1:
            is_think = True
        else:
            raise AssertionError(f"hybrid_indexs[{i}] must be 0 or 1, got {hybrid_indexs[i]}")

        outcome_bucket = id2score_think if is_think else id2score_no_think
        if ability[i] == "general":
            # General prompts are judged by the reward-model score.
            outcome_bucket[uid].append(scores[i].item())
        elif ability[i] == "math":
            # Math prompts are judged by the rule score; the reward-model score is
            # kept separately and used when the rule scores tie.
            outcome_bucket[uid].append(rule_scores[i].item())
            rm_bucket = id2score_math_rm_score_think if is_think else id2score_math_rm_score_no_think
            rm_bucket[uid].append(scores[i].item())
        else:
            raise AssertionError(f"unknown ability {ability[i]!r} at row {i}")

        # check the token id is right
        expected_prefix = think_ids if is_think else no_think_ids
        actual_prefix = responses[i].tolist()[:len(expected_prefix)]
        assert actual_prefix == expected_prefix, \
            f"responses {responses[i]} do not start with the prefix expected for hybrid index {hybrid_indexs[i]}"

    index2hybrid = {}
    index2minscore_for_better_one = {}
    index2minscore_for_wrose_one = {}
    ratio_think_is_better = []

    for key in id2score_no_think.keys():
        assert len(id2score_no_think[key]) == len(id2score_think[key])

        is_math = id2ability[key] == "math"
        avg_think = np.mean(id2score_think[key])
        avg_no_think = np.mean(id2score_no_think[key])

        if id2ability[key] == "general":
            avg_no_think = avg_no_think + margin

        # Reference pair of each mode; for math, fall back to the reward-model scores
        # when the two rule scores of that mode are identical.
        no_think_pair = id2score_no_think[key]
        if is_math and (id2score_no_think[key][0] == id2score_no_think[key][1]):
            no_think_pair = id2score_math_rm_score_no_think[key]

        think_pair = id2score_think[key]
        if is_math and (id2score_think[key][0] == id2score_think[key][1]):
            think_pair = id2score_math_rm_score_think[key]

        # No-think wins when strictly better, or on a non-zero tie for math prompts.
        if (avg_think < avg_no_think) or (avg_think == avg_no_think and avg_think != 0.0 and is_math):
            # select no think
            index2hybrid[key] = 0
            index2minscore_for_better_one[key] = no_think_pair
            index2minscore_for_wrose_one[key] = think_pair
            ratio_think_is_better.append(0)
        else:
            # select think
            index2hybrid[key] = 1
            index2minscore_for_better_one[key] = think_pair
            index2minscore_for_wrose_one[key] = no_think_pair
            ratio_think_is_better.append(1)

    # reformat scores
    group_inner_scores = torch.zeros_like(scores)
    group_inter_scores = torch.zeros_like(scores)

    with torch.no_grad():
        bsz = scores.shape[0]
        for i in range(bsz):
            uid = index[i]
            in_better_mode = bool(hybrid_indexs[i] == index2hybrid[uid])
            group_inter_scores[i] = 1 if in_better_mode else 0

            if in_better_mode:
                reference_pair = index2minscore_for_better_one[uid]
            else:
                reference_pair = index2minscore_for_wrose_one[uid]

            assert len(reference_pair) == 2
            # Identical scores leave no "lower" response to single out.
            assert reference_pair[0] != reference_pair[1], \
                f"tied scores {reference_pair} for uid {uid}"

            # NOTE(review): for math prompts whose two rule scores differ, the
            # reference pair holds rule scores while scores[i] is the reward-model
            # score -- confirm this comparison is intended.
            group_inner_scores[i] = 1 if scores[i] != min(reference_pair) else 0

        group_inter_scores_sum = group_inter_scores.sum().item()
        group_inner_scores_sum = group_inner_scores.sum().item()

        if adv_estimator == "grpo":
            group_inter_scores = grpo_scores(group_inter_scores, response_length, eos_mask, epsilon, index, hybrid_indexs, inter_adv_special_token_only=inter_adv_special_token_only)
            group_inner_scores = grpo_scores(group_inner_scores, response_length, eos_mask, epsilon, index, hybrid_indexs)

        elif adv_estimator == "rloo":
            group_inter_scores = rloo_scores(group_inter_scores, response_length, eos_mask, epsilon, index, hybrid_indexs, inter_adv_special_token_only=inter_adv_special_token_only)
            group_inner_scores = rloo_scores(group_inner_scores, response_length, eos_mask, epsilon, index, hybrid_indexs)

        elif adv_estimator == "reinforce_plus_plus":
            group_inter_scores = grpo_scores(group_inter_scores, response_length, eos_mask, epsilon, index, hybrid_indexs, inter_adv_special_token_only=inter_adv_special_token_only)
            group_inter_scores = verl_F.masked_whiten(group_inter_scores, eos_mask)
            group_inner_scores = reinforce_plus_plus_scores(group_inner_scores, response_length, eos_mask, epsilon, index, hybrid_indexs, gamma=0.99)
        else:
            raise AssertionError(f"unknown adv_estimator: {adv_estimator}")

        final_scores = group_inner_scores * group_inner_alpha + group_inter_scores * group_inter_alpha

    meta_info = {
        "ratio_of_think_is_better": np.mean(ratio_think_is_better),
        "avg_reward_for_think": np.mean(list(id2score_think.values())),
        "avg_reward_for_no_think": np.mean(list(id2score_no_think.values())),
        "kendall_corr": kendall_corr,
        "spearman_corr": spearman_corr,
        "group_inter_scores": group_inter_scores_sum / group_inter_scores.shape[0],
        "group_inner_alpha": group_inner_alpha,
        "group_inter_alpha": group_inter_alpha,
        "format_error_reward": format_error_reward,
        "group_inner_scores": group_inner_scores_sum / group_inner_scores.shape[0],
        "bsz": data.batch['token_level_scores'].shape[0],
        "coff": record_coff
    }

    return final_scores, final_scores, meta_info, data


def compute_rloo_outcome_advantage(token_level_rewards: torch.Tensor,
                                   rm_scores: torch.Tensor,
                                   eos_mask: torch.Tensor,
                                   index: torch.Tensor,
                                   hybrid_indexs: torch.Tensor,
                                   epsilon: float = 1e-6):
    """
    Compute advantage for RLOO based on https://arxiv.org/abs/2402.14740

    Reward-model scores are first replaced by a discrete hybrid score per response:
    0 if the response is not in the better mode of its prompt, 2 if it holds the top
    score of the better mode, 1 otherwise; the per-response format score (-1 or 0) is
    added on top. A leave-one-out baseline is then applied within each prompt group.

    Args:
        token_level_rewards: `(torch.Tensor)`
            shape: (bs, response_length); each row must sum to -1 or 0
        rm_scores: `(torch.Tensor)`
            shape: (bs, response_length)
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length)
        index: per-row prompt identifier.
        hybrid_indexs: per-row mode flag, 0 = no-think, 1 = think.
        epsilon: unused.

    Returns:
        advantages: `(torch.Tensor)`
            shape: (bs, response_length)
        Returns: `(torch.Tensor)`
            the same tensor as ``advantages``
        meta_info: dict of logging statistics

    Raises:
        AssertionError: on an invalid format score, an invalid mode flag, or a prompt
            with a different number of think and no-think responses.
    """
    response_length = rm_scores.shape[-1]
    scores = rm_scores.sum(dim=-1)
    format_scores = token_level_rewards.sum(dim=-1)
    # Build the allowed values on the same device/dtype as the scores being checked.
    allowed_format_scores = torch.tensor([-1, 0], dtype=format_scores.dtype, device=format_scores.device)
    assert torch.all(torch.isin(format_scores, allowed_format_scores)), \
        "format_scores contains values other than -1 or 0."

    true_response_len = eos_mask.sum(dim=-1)

    # Rank correlation between response length and reward-model score (logging only).
    kendall_corr, _ = kendalltau(true_response_len.tolist(), scores.tolist())
    spearman_corr, _ = spearmanr(true_response_len.tolist(), scores.tolist())

    # hybrid score
    id2score_think = defaultdict(list)
    id2score_no_think = defaultdict(list)
    bsz = scores.shape[0]
    for i in range(bsz):
        if hybrid_indexs[i] == 0:
            id2score_no_think[index[i]].append(scores[i].item())
        elif hybrid_indexs[i] == 1:
            id2score_think[index[i]].append(scores[i].item())
        else:
            raise AssertionError(f"hybrid_indexs[{i}] must be 0 or 1, got {hybrid_indexs[i]}")

    index2hybrid = {}
    index2topscore = {}  # highest score inside the selected mode of each prompt
    ratio_think_is_better = []
    for key in id2score_no_think.keys():
        assert len(id2score_no_think[key]) == len(id2score_think[key])
        avg_think = np.mean(id2score_think[key])
        avg_no_think = np.mean(id2score_no_think[key])

        if avg_think <= avg_no_think:
            # select no think (ties go to no-think)
            index2hybrid[key] = 0
            index2topscore[key] = max(id2score_no_think[key])
            ratio_think_is_better.append(0)
        else:
            # select think
            index2hybrid[key] = 1
            index2topscore[key] = max(id2score_think[key])
            ratio_think_is_better.append(1)

    id2score = defaultdict(list)
    id2mean = {}

    with torch.no_grad():
        # reformat scores into {0, 1, 2} plus the format score
        for i in range(bsz):
            if hybrid_indexs[i] != index2hybrid[index[i]]:
                scores[i] = 0
            elif scores[i] != index2topscore[index[i]]:
                scores[i] = 1
            else:
                scores[i] = 2
            scores[i] += format_scores[i]

        # leave-one-out baseline inside each prompt group
        for i in range(bsz):
            id2score[index[i]].append(scores[i])
        for idx in id2score:
            if len(id2score[idx]) == 1:
                id2mean[idx] = torch.tensor(0.0)
            elif len(id2score[idx]) > 1:
                id2mean[idx] = torch.mean(torch.tensor(id2score[idx]))
            else:
                raise ValueError(f"no score in prompt index: {idx}")
        for i in range(bsz):
            response_num = len(id2score[index[i]])
            if response_num > 1:
                scores[i] = scores[i] * response_num / (response_num -
                                                        1) - id2mean[index[i]] * response_num / (response_num - 1)
        scores = scores.unsqueeze(-1).tile([1, response_length]) * eos_mask

    meta_info = {
        "ratio_of_think_is_better": np.mean(ratio_think_is_better),
        "avg_reward_for_think": np.mean(list(id2score_think.values())),
        "avg_reward_for_no_think": np.mean(list(id2score_no_think.values())),
        "kendall_corr": kendall_corr,
        "spearman_corr": spearman_corr,
        "format_correct_ratio": torch.sum(format_scores == 0.).item() / format_scores.shape[0],
    }

    return scores, scores, meta_info


def compute_reinforce_plus_plus_outcome_advantage(token_level_rewards: torch.Tensor, rm_scores: torch.Tensor, index: torch.Tensor, attention_mask: torch.Tensor, eos_mask: torch.Tensor,
                                                  position_ids: torch.Tensor, gamma: torch.Tensor, hybrid_indexs: torch.Tensor):
    """
    Compute advantage for REINFORCE++.
    This implementation is based on the paper: https://arxiv.org/abs/2501.03262

    Reward-model scores are first replaced by a discrete hybrid reward per response:
    0 if the response is not in the better mode of its prompt, 2 if it holds the top
    score of the better mode, 1 otherwise; the per-response format score (-5 or 0) is
    added on top. That reward is written at the position where ``rm_scores`` is
    non-zero, discounted returns are accumulated right-to-left, and the returns are
    whitened over the masked positions.

    Args:
        token_level_rewards: `(torch.Tensor)`
            shape: (bs, response_length); each row must sum to -5 or 0
        rm_scores: `(torch.Tensor)`
            shape: (bs, response_length); exactly one non-zero entry per row
        index: per-row prompt identifier.
        attention_mask: unused.
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length)
        position_ids: unused.
        gamma: discount factor.
        hybrid_indexs: per-row mode flag, 0 = no-think, 1 = think.

    Returns:
        advantages: `(torch.Tensor)`
            shape: (bs, response_length)
        Returns: `(torch.Tensor)`
            shape: (bs, response_length)
        meta_info: dict of logging statistics

    Raises:
        AssertionError: on an invalid format score, an invalid mode flag, a prompt with
            a different number of think and no-think responses, or a row of
            ``rm_scores`` without exactly one non-zero entry.
    """

    response_length = rm_scores.shape[-1]
    scores = rm_scores.sum(dim=-1)
    format_scores = token_level_rewards.sum(dim=-1)
    # Build the allowed values on the same device/dtype as the scores being checked.
    allowed_format_scores = torch.tensor([-5, 0], dtype=format_scores.dtype, device=format_scores.device)
    assert torch.all(torch.isin(format_scores, allowed_format_scores)), \
        "format_scores contains values other than -5 or 0."

    true_response_len = eos_mask.sum(dim=-1)

    # Rank correlation between response length and reward-model score (logging only).
    kendall_corr, _ = kendalltau(true_response_len.tolist(), scores.tolist())
    spearman_corr, _ = spearmanr(true_response_len.tolist(), scores.tolist())

    # hybrid score
    id2score_think = defaultdict(list)
    id2score_no_think = defaultdict(list)
    bsz = scores.shape[0]
    for i in range(bsz):
        if hybrid_indexs[i] == 0:
            id2score_no_think[index[i]].append(scores[i].item())
        elif hybrid_indexs[i] == 1:
            id2score_think[index[i]].append(scores[i].item())
        else:
            raise AssertionError(f"hybrid_indexs[{i}] must be 0 or 1, got {hybrid_indexs[i]}")

    index2hybrid = {}
    index2topscore = {}  # highest score inside the selected mode of each prompt
    ratio_think_is_better = []
    for key in id2score_no_think.keys():
        assert len(id2score_no_think[key]) == len(id2score_think[key])
        avg_think = np.mean(id2score_think[key])
        avg_no_think = np.mean(id2score_no_think[key])

        if avg_think <= avg_no_think:
            # select no think (ties go to no-think)
            index2hybrid[key] = 0
            index2topscore[key] = max(id2score_no_think[key])
            ratio_think_is_better.append(0)
        else:
            # select think
            index2hybrid[key] = 1
            index2topscore[key] = max(id2score_think[key])
            ratio_think_is_better.append(1)

    new_token_level_rewards = torch.zeros_like(token_level_rewards)

    with torch.no_grad():
        # reformat scores: write the discrete reward where the rm score was located
        for i in range(bsz):
            indices = torch.nonzero(rm_scores[i] != 0)
            # nonzero() on a 1-D row returns shape (count, 1): the count is dim 0.
            # The former check on shape[-1] was always true.
            assert indices.shape[0] == 1, \
                f"expected exactly one non-zero rm_score position in row {i}, got indices {indices}"
            indice = indices[0].item()
            if hybrid_indexs[i] != index2hybrid[index[i]]:
                new_token_level_rewards[i, indice] = 0
            elif scores[i] != index2topscore[index[i]]:
                new_token_level_rewards[i, indice] = 1
            else:
                new_token_level_rewards[i, indice] = 2
            new_token_level_rewards[i, indice] += format_scores[i]

        returns = torch.zeros_like(new_token_level_rewards)
        running_return = 0

        for t in reversed(range(new_token_level_rewards.shape[1])):
            running_return = new_token_level_rewards[:, t] + gamma * running_return
            returns[:, t] = running_return
            # Reset after EOS
            running_return = running_return * eos_mask[:, t]

        advantages = verl_F.masked_whiten(returns, eos_mask)
        advantages = advantages * eos_mask

    meta_info = {
        "ratio_of_think_is_better": np.mean(ratio_think_is_better),
        "avg_reward_for_think": np.mean(list(id2score_think.values())),
        "avg_reward_for_no_think": np.mean(list(id2score_no_think.values())),
        "kendall_corr": kendall_corr,
        "spearman_corr": spearman_corr,
        "format_correct_ratio": torch.sum(format_scores == 0.).item() / format_scores.shape[0],
    }

    return advantages, returns, meta_info


def compute_remax_outcome_advantage(token_level_rewards: torch.Tensor, reward_baselines: torch.Tensor,
                                    eos_mask: torch.Tensor):
    """
    Compute advantage for ReMax, operating only on Outcome reward
    (with only one scalar reward for each response).
    This implementation is based on the paper: https://arxiv.org/abs/2310.10505

    Args:
        token_level_rewards: `(torch.Tensor)`
            shape: (bs, response_length)
        reward_baselines: `(torch.Tensor)`
            shape: (bs,)
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length)

    Returns:
        advantages: `(torch.Tensor)`
            shape: (bs, response_length)
            masked reward-to-go minus the per-sample baseline on masked-in positions
        Returns: `(torch.Tensor)`
            shape: (bs, response_length)
            reverse cumulative sum (reward-to-go) of the masked rewards
    """
    with torch.no_grad():
        # Reward-to-go: flip, cumulative-sum, flip back so that position t holds
        # the sum of the masked rewards from t to the end of the response.
        returns = (token_level_rewards * eos_mask).flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
        # Broadcasting the (bs, 1) baseline against the mask replaces the explicit
        # tile; the baseline is only subtracted where eos_mask is non-zero.
        advantages = returns - reward_baselines.unsqueeze(-1) * eos_mask

    return advantages, returns


def compute_rewards(token_level_scores, old_log_prob, ref_log_prob, kl_ratio):
    """Penalise token-level scores by a scaled log-probability gap.

    Args:
        token_level_scores: scores to be penalised
        old_log_prob: log-probabilities the penalty is measured from
        ref_log_prob: log-probabilities subtracted from ``old_log_prob``
        kl_ratio: multiplier applied to ``old_log_prob - ref_log_prob``

    Returns:
        ``token_level_scores - (old_log_prob - ref_log_prob) * kl_ratio``
    """
    penalty = (old_log_prob - ref_log_prob) * kl_ratio
    return token_level_scores - penalty



def agg_loss(loss_mat: torch.Tensor, loss_mask: torch.Tensor, loss_agg_mode: str):
    """
    Reduce a per-token loss matrix to a single scalar.
    Args:
        loss_mat: `(torch.Tensor)`
            shape: (bs, response_length)
        loss_mask: `(torch.Tensor)`
            shape: (bs, response_length)
        loss_agg_mode: (str) one of
            "token-mean": masked mean over all tokens in the batch
            "seq-mean-token-sum": masked sum per sequence, then mean over sequences
            "seq-mean-token-mean": masked mean per sequence, then mean over sequences
            "seq-mean-token-sum-norm": masked sum over the whole batch divided by
                the last dimension of `loss_mask`
    Returns:
        loss: `a scalar torch.Tensor`
            aggregated loss
    Raises:
        ValueError: if `loss_agg_mode` is not one of the modes above
    """
    if loss_agg_mode == "token-mean":
        return verl_F.masked_mean(loss_mat, loss_mask)

    sequence_modes = ("seq-mean-token-sum", "seq-mean-token-mean", "seq-mean-token-sum-norm")
    if loss_agg_mode not in sequence_modes:
        raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}")

    # Every remaining mode starts from the masked per-sequence token sum.
    per_seq_sum = (loss_mat * loss_mask).sum(dim=-1)

    if loss_agg_mode == "seq-mean-token-sum":
        return per_seq_sum.mean()

    if loss_agg_mode == "seq-mean-token-mean":
        tokens_per_seq = loss_mask.sum(dim=-1)
        return (per_seq_sum / tokens_per_seq).mean()

    # "seq-mean-token-sum-norm": the divisor (loss_mask.shape[-1]) should ideally
    # be constant throughout training to well-replicate the DrGRPO paper.
    # TODO: Perhaps add user-defined normalizer argument to agg_loss to ensure
    # divisor stays constant throughout.
    return per_seq_sum.sum() / loss_mask.shape[-1]


def compute_policy_loss(old_log_prob, log_prob, advantages, eos_mask, cliprange, DrGRPO = True):
    """Clipped PPO policy-gradient loss.

    Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1122

    Args:
        old_log_prob: `(torch.Tensor)`
            shape: (bs, response_length)
        log_prob: `(torch.Tensor)`
            shape: (bs, response_length)
        advantages: `(torch.Tensor)`
            shape: (bs, response_length)
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length)
        cliprange: (float)
            The clip range used in PPO. See https://arxiv.org/abs/1707.06347
        DrGRPO: (bool)
            When truthy, the per-token loss is aggregated with `agg_loss` in
            "seq-mean-token-sum-norm" mode; otherwise a masked mean over
            tokens is used.

    Returns:
        pg_loss: `a scalar torch.Tensor`
            policy gradient loss computed via PPO
        pg_clipfrac: (float)
            a float number indicating the fraction of policy gradient loss being clipped
        ppo_kl:
            masked mean of `old_log_prob - log_prob`

    """
    log_ratio = log_prob - old_log_prob
    ppo_kl = verl_F.masked_mean(-log_ratio, eos_mask)

    ratio = torch.exp(log_ratio)
    clipped_ratio = torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)

    unclipped_losses = -advantages * ratio
    clipped_losses = -advantages * clipped_ratio
    # Pessimistic bound: take the larger of the two losses at every token.
    token_losses = torch.max(unclipped_losses, clipped_losses)

    if not DrGRPO:
        pg_loss = verl_F.masked_mean(token_losses, eos_mask)
    else:
        print("===> using DrGRPO.")
        pg_loss = agg_loss(loss_mat=token_losses, loss_mask=eos_mask, loss_agg_mode="seq-mean-token-sum-norm")

    # A token counts as clipped when the clipped loss is strictly the larger one.
    was_clipped = torch.gt(clipped_losses, unclipped_losses).float()
    pg_clipfrac = verl_F.masked_mean(was_clipped, eos_mask)

    return pg_loss, pg_clipfrac, ppo_kl


def compute_entropy_loss(logits, eos_mask):
    """Masked mean of the per-token categorical entropy.

    Args:
        logits: `(torch.Tensor)`
            shape: (bs, response_length, vocab_size)
        eos_mask: `(torch.Tensor)`
            shape: (bs, response_length)

    Returns:
        entropy: a scalar torch.Tensor

    """
    # Per-token entropy, (bs, response_len), reduced under the mask.
    token_entropy = verl_F.entropy_from_logits(logits)
    return verl_F.masked_mean(token_entropy, mask=eos_mask)


def compute_value_loss(vpreds, returns, values, eos_mask, cliprange_value):
    """Clipped value-function loss.

    Copied from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1151

    Args:
        vpreds (`torch.FloatTensor`):
            Predicted values of the value head, shape (`batch_size`, `response_length`)
        returns: (`torch.FloatTensor`):
            Ground truth returns, shape (`batch_size`, `response_length`)
        values (`torch.FloatTensor`):
            Old values of value head, shape (`batch_size`, `response_length`)
        eos_mask:
            Mask passed to the masked means used for the loss and the clip fraction
        cliprange_value:
            Half-width of the band around `values` that `vpreds` is clipped to

    Returns:
        vf_loss: a scalar (`torch.FloatTensor`):
            value function loss
        vf_clipfrac: a float
            The ratio of vf being clipped

    """
    lower_bound = values - cliprange_value
    upper_bound = values + cliprange_value
    clipped_preds = verl_F.clip_by_value(vpreds, lower_bound, upper_bound)

    raw_sq_err = (vpreds - returns)**2
    clipped_sq_err = (clipped_preds - returns)**2

    # Element-wise worst case of the clipped and unclipped squared errors.
    worst_sq_err = torch.max(raw_sq_err, clipped_sq_err)
    vf_loss = 0.5 * verl_F.masked_mean(worst_sq_err, eos_mask)

    clip_hits = torch.gt(clipped_sq_err, raw_sq_err).float()
    vf_clipfrac = verl_F.masked_mean(clip_hits, eos_mask)
    return vf_loss, vf_clipfrac


def kl_penalty(logprob: torch.FloatTensor, ref_logprob: torch.FloatTensor, kl_penalty) -> torch.FloatTensor:
    """Compute a per-element KL penalty between `logprob` and `ref_logprob`.
    Copied from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1104

    Args:
        logprob: log-probabilities under the current policy
        ref_logprob: log-probabilities under the reference policy
        kl_penalty: (str) estimator to use:
            "kl": `logprob - ref_logprob`
            "abs": `|logprob - ref_logprob|`
            "mse": `0.5 * (logprob - ref_logprob) ** 2`
            "low_var_kl": `exp(d) - d - 1` with `d = ref_logprob - logprob`,
                where both `d` and the result are clamped to [-10, 10]
            "full": not implemented

    Returns:
        Tensor of penalties with the broadcast shape of the two inputs.

    Raises:
        NotImplementedError: for "full" and for any unrecognised `kl_penalty`.
    """
    if kl_penalty == "kl":
        return logprob - ref_logprob

    if kl_penalty == "abs":
        return (logprob - ref_logprob).abs()

    if kl_penalty == "mse":
        return 0.5 * (logprob - ref_logprob).square()

    # J. Schulman. Approximating kl divergence, 2020.
    # URL http://joschu.net/blog/kl-approx.html.
    if kl_penalty == 'low_var_kl':
        # torch.clamp is out-of-place, so its result must be kept. Clamping the
        # log-ratio before exponentiating keeps exp() finite and avoids the
        # inf - inf = NaN that an infinite log-ratio would otherwise produce.
        log_ratio = torch.clamp(ref_logprob - logprob, min=-10, max=10)
        ratio = torch.exp(log_ratio)
        kld = (ratio - log_ratio - 1).contiguous()
        return torch.clamp(kld, min=-10, max=10)

    if kl_penalty == "full":
        # so, here logprob and ref_logprob should contain the logits for every token in vocabulary
        raise NotImplementedError

    raise NotImplementedError
