
"""
FSDP Reward Model.
"""

import torch
from tensordict import TensorDict
from verl import DataProto
from verl.utils.torch_functional import pad_sequence_to_length
from verl.workers.reward_model.base import BasePPORewardModel

class FSDPRewardModel(BasePPORewardModel):
    """Reward model worker that assigns one scalar reward per sequence.

    The wrapped ``reward_model_module`` is called on the batch, the score at
    the last attended token of each sequence is taken as the sequence reward,
    and that reward is written at the last attended position of the original
    sequence (all other positions are zero).

    When ``rm_tokenizer`` is given, the batch is first decoded with
    ``sft_tokenizer``, rewritten to the reward model's chat markers and
    re-tokenized with ``rm_tokenizer`` before scoring.
    """

    # Number of extra tokens the re-encoded sequence may use on top of the
    # original sequence length before it is truncated.
    _RM_SEQLEN_MARGIN = 128

    def __init__(self, config, reward_model_module, sft_tokenizer=None, rm_tokenizer=None):
        """Store the module and tokenizers; offload parameters if configured.

        Args:
            config: Worker config; only ``param_offload`` is read here
                (missing attribute is treated as ``False``).
            reward_model_module: Callable module accepting ``input_ids``,
                ``attention_mask`` and ``position_ids`` keyword arguments.
            sft_tokenizer: Tokenizer used to decode the incoming batch when
                re-encoding is enabled.
            rm_tokenizer: Tokenizer of the reward model. Re-encoding is
                enabled iff this is not ``None``.
        """
        super().__init__(config)
        self.reward_model_module = reward_model_module
        # Tracks where the parameters currently live; flipped by the
        # offload/load helpers below.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.sft_tokenizer = sft_tokenizer
        self.rm_tokenizer = rm_tokenizer
        self.use_different_tokenizer = rm_tokenizer is not None

        if getattr(self.config, "param_offload", False):
            self.offload_params_to_cpu()

    def re_encode_by_rm_tokenizer(self, data: DataProto) -> "tuple[DataProto, dict]":
        """Re-tokenize the batch with the reward model tokenizer.

        ``data.batch`` is modified in place: ``input_ids``, ``attention_mask``
        and ``position_ids`` are replaced by their re-encoded counterparts,
        each of width ``original_seqlen + _RM_SEQLEN_MARGIN``.

        Args:
            data: Batch holding ``input_ids``, ``attention_mask`` and
                ``position_ids``.

        Returns:
            A ``(data, ori_values)`` tuple where ``ori_values`` maps the three
            replaced keys to the original tensors so the caller can restore
            them.

        Raises:
            AssertionError: If no reward model tokenizer was configured.
        """
        assert self.use_different_tokenizer, 're-encode need rm tokenizer not be None!'
        input_ids = data.batch['input_ids']
        attention_mask = data.batch['attention_mask']
        position_ids = data.batch['position_ids']
        ori_values = {'input_ids': input_ids, 'attention_mask': attention_mask, 'position_ids': position_ids}

        device = input_ids.device
        ori_seqlen = input_ids.size(1) + self._RM_SEQLEN_MARGIN
        rm_ids_rows = []
        rm_mask_rows = []
        print_decode = True  # only the first sample is printed, for debugging
        for seq_ids, seq_mask in zip(input_ids, attention_mask):
            # Keep the span from the first to the last attended token.
            non_zero_indices = torch.nonzero(seq_mask).view(-1)
            begin_pos, end_pos = non_zero_indices[0].item(), non_zero_indices[-1].item()
            valid_id = seq_ids[begin_pos:end_pos + 1]
            decode_result = self.sft_tokenizer.decode(valid_id)
            # Rewrite the decoded chat markers into the [INST] ... [/INST] form.
            decode_with_rm_chat = decode_result.replace("<|user|>\n", "[INST] ").replace(
                "</s>\n<|assistant|>\n", " [/INST]").replace("</s> \n<|assistant|>\n", " [/INST]") + "</s>"
            if print_decode and (not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0):
                print(f'device {self.device}: sft decode result:\n{decode_result}\n \
                        \ndevice {self.device}: sft decode result with rm chat template:\n{decode_with_rm_chat}\n\n'
                     )
                print_decode = False
            rm_input_ids = self.rm_tokenizer(decode_with_rm_chat,
                                             return_tensors='pt')['input_ids'][0].to(device)
            rm_attention_mask = torch.ones_like(rm_input_ids, device=device)
            cur_seqlen = rm_input_ids.shape[-1]
            if cur_seqlen > ori_seqlen:
                # Too long for the fixed width: keep the leading tokens only.
                print(f'warning: rm encode seqlen {cur_seqlen} > sft encode seqlen {ori_seqlen}')
                rm_input_ids = rm_input_ids[:ori_seqlen]
                rm_attention_mask = rm_attention_mask[:ori_seqlen]
            else:
                rm_input_ids = pad_sequence_to_length(rm_input_ids, ori_seqlen, self.rm_tokenizer.pad_token_id)
                rm_attention_mask = pad_sequence_to_length(rm_attention_mask, ori_seqlen, 0)
            rm_ids_rows.append(rm_input_ids)
            rm_mask_rows.append(rm_attention_mask)

        input_ids_for_rm = torch.stack(rm_ids_rows, dim=0)
        attention_mask_for_rm = torch.stack(rm_mask_rows, dim=0)
        # Every row uses the same 0..ori_seqlen-1 positions, so build it once.
        position_ids_for_rm = torch.arange(0, ori_seqlen, device=device).unsqueeze(0).repeat(len(rm_ids_rows), 1)

        data.batch['input_ids'] = input_ids_for_rm
        data.batch['attention_mask'] = attention_mask_for_rm
        data.batch['position_ids'] = position_ids_for_rm
        return data, ori_values

    @torch.no_grad()
    def compute_reward(self, data: DataProto) -> DataProto:
        """Score a batch and return token-level rewards for the responses.

        Args:
            data: Batch holding ``input_ids``, ``attention_mask``,
                ``position_ids`` and ``responses``.

        Returns:
            A ``DataProto`` whose batch contains ``rm_scores``: the last
            ``response_length`` columns of a tensor that is zero everywhere
            except at each sequence's last attended position, which carries
            the sequence reward.
        """
        param_offload = getattr(self.config, "param_offload", False)
        if param_offload:
            self.load_params_to_cuda()

        try:
            ori_values = None
            if self.use_different_tokenizer:
                data, ori_values = self.re_encode_by_rm_tokenizer(data)

            input_ids = data.batch['input_ids']
            attention_mask = data.batch['attention_mask']
            position_ids = data.batch['position_ids']
            responses = data.batch['responses']
            batch_size = responses.size(0)
            response_length = responses.size(1)

            # Forward pass (gradients are already disabled by the decorator).
            output = self.reward_model_module(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids
            )
            # The per-token scores are expected as (bs, seq_len); a trailing
            # singleton dimension (bs, seq_len, 1) is accepted and dropped so
            # the 2-D gather below works for both layouts.
            logits = output.logits if hasattr(output, "logits") else output
            if logits.dim() == 3 and logits.size(-1) == 1:
                logits = logits.squeeze(-1)

            # The cumulative sum of the mask stops growing after the last
            # attended token, so its (first) argmax is that token's index.
            ends = attention_mask.cumsum(dim=-1).argmax(dim=-1).view(-1, 1)
            rewards = torch.gather(logits, dim=1, index=ends)

            if ori_values is not None:
                # Restore the caller's tensors and place the reward on the
                # original (SFT-tokenized) layout.
                data.batch.update(ori_values)
                input_ids = ori_values['input_ids']
                attention_mask = ori_values['attention_mask']
                position_ids = ori_values['position_ids']

            token_level_rewards = rewards.expand(attention_mask.shape[0], attention_mask.shape[1])
            # Attended position with the largest position id receives the reward.
            eos_mask_idx = torch.argmax(position_ids * attention_mask, dim=-1)
            eos_mask = torch.zeros_like(attention_mask)
            eos_mask[torch.arange(batch_size, device=eos_mask.device), eos_mask_idx] = 1
            token_level_rewards = token_level_rewards * eos_mask
            token_level_rewards = token_level_rewards[:, -response_length:]
        finally:
            # Release GPU memory even when scoring fails part-way through.
            if param_offload:
                self.offload_params_to_cpu()
            else:
                torch.cuda.empty_cache()

        batch = TensorDict({'rm_scores': token_level_rewards}, batch_size=input_ids.shape[0])
        return DataProto(batch=batch)

    def offload_params_to_cpu(self):
        """Move the module's parameters to CPU if they are marked as on CUDA."""
        if self.device == torch.device('cuda'):
            for _name, param in self.reward_model_module.named_parameters():
                param.data = param.data.to('cpu', non_blocking=True)
            # Non-blocking device-to-host copies may return before the data
            # has landed in host memory; wait so the CPU copies are valid.
            torch.cuda.synchronize()
            self.device = torch.device('cpu')
            torch.cuda.empty_cache()

    def load_params_to_cuda(self):
        """Move the module's parameters to the current CUDA device.

        Does nothing when the parameters are not marked as on CPU, or when
        CUDA is unavailable (there is nowhere to load them to).
        """
        if not torch.cuda.is_available():
            return
        if self.device == torch.device('cpu'):
            for _name, param in self.reward_model_module.named_parameters():
                param.data = param.data.to(torch.cuda.current_device(), non_blocking=True)
            self.device = torch.device('cuda')