# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib.util
import os
import sys
from abc import ABC, abstractmethod
from collections import defaultdict
from functools import partial
from typing import Callable, Dict, List, Optional, Tuple, TypedDict

import torch
from transformers import PreTrainedTokenizer

from ...protocol import DataProto
from .config import RewardConfig


class RewardScore(TypedDict):
    """Score dictionary produced by a reward function for one response."""

    # Scalar reward; SequentialFunctionRewardManager writes this value into
    # the reward tensor at the last valid response token.
    overall: float
    # Optional component scores (may be None). Presumably a format-compliance
    # score and an answer-accuracy score — confirm against the reward function.
    format: Optional[float]
    accuracy: Optional[float]


# Per-sample reward function: SequentialFunctionRewardManager calls it as
# reward_fn(response_str, ground_truth) and expects one RewardScore back.
SequentialRewardFunction = Callable[[str|dict, str], RewardScore]

# Batched reward function: receives parallel lists and returns one score
# dictionary per sample.
# NOTE(review): BatchFunctionRewardManager invokes its reward function with
# three lists (responses, ground truths, prompts), whereas this alias declares
# only two parameters — confirm which signature is intended.
BatchRewardFunction = Callable[[List[str|dict], List[str]], List[RewardScore]]


class FunctionRewardManager(ABC):
    """Base class for reward managers backed by a user-supplied Python function.

    The constructor imports the file at ``config.reward_function`` as a module
    named ``custom_reward_fn``, looks up ``config.reward_function_name`` in it
    and stores that callable, pre-bound with ``config.reward_function_kwargs``,
    as ``self.reward_fn``. Subclasses implement :meth:`compute_reward`.
    """

    def __init__(self, config: RewardConfig, tokenizer: PreTrainedTokenizer):
        """Load the configured reward function and keep the config and tokenizer.

        Args:
            config: Reward configuration. The attributes ``reward_function``
                (path to a Python source file), ``reward_function_name`` and
                ``reward_function_kwargs`` are read here.
            tokenizer: Tokenizer stored on the instance for use by subclasses.

        Raises:
            ValueError: If ``config.reward_function`` is ``None``.
            FileNotFoundError: If the reward function file does not exist.
            RuntimeError: If the file cannot be imported as a Python module.
            AttributeError: If the module does not define the requested name.
        """
        if config.reward_function is None:
            raise ValueError("Reward function is not provided.")

        if not os.path.exists(config.reward_function):
            raise FileNotFoundError(f"Reward function file {config.reward_function} not found.")

        module_name = "custom_reward_fn"
        spec = importlib.util.spec_from_file_location(module_name, config.reward_function)
        # spec_from_file_location returns None when no loader matches the file
        # (for instance a suffix that is not a Python source suffix).
        if spec is None or spec.loader is None:
            raise RuntimeError(
                f"Failed to load reward function: cannot import {config.reward_function} as a Python module."
            )

        module = importlib.util.module_from_spec(spec)
        # Register the module before executing it (as in the importlib recipe
        # for importing a source file), so code in the file that looks itself
        # up through sys.modules keeps working.
        not_loaded = object()
        previous = sys.modules.get(module_name, not_loaded)
        sys.modules[module_name] = module
        try:
            spec.loader.exec_module(module)
        except Exception as e:
            # Do not leave a half-initialised module behind: put back whatever
            # was registered under this name before the failed import.
            if previous is not_loaded:
                sys.modules.pop(module_name, None)
            else:
                sys.modules[module_name] = previous
            raise RuntimeError(f"Failed to load reward function: {e}") from e

        if not hasattr(module, config.reward_function_name):
            raise AttributeError(f"Module {module} does not have function {config.reward_function_name}.")

        reward_fn = getattr(module, config.reward_function_name)
        print(f"Using reward function `{config.reward_function_name}` from `{config.reward_function}`.")
        # A missing (None) kwargs mapping is treated as "no extra arguments".
        self.reward_fn = partial(reward_fn, **(config.reward_function_kwargs or {}))
        self.config = config
        self.tokenizer = tokenizer

    @abstractmethod
    def compute_reward(self, data: DataProto) -> Tuple[torch.Tensor, Dict[str, List[float]]]:
        """Compute reward for a batch of data."""
        ...


class SequentialFunctionRewardManager(FunctionRewardManager):
    """Reward manager that scores responses one at a time."""

    reward_fn: SequentialRewardFunction

    def compute_reward(self, data: DataProto) -> Tuple[torch.Tensor, Dict[str, List[float]]]:
        """Score every response in ``data`` with the per-sample reward function.

        Each response is truncated to its valid length (the row sum of
        ``data.batch["response_mask"]``), decoded with the tokenizer and passed
        to ``self.reward_fn`` together with the matching entry of
        ``data.non_tensor_batch["ground_truth"]``.

        Args:
            data: Batch providing ``batch["responses"]``,
                ``batch["response_mask"]`` and ``non_tensor_batch["ground_truth"]``.

        Returns:
            A tuple ``(reward_tensor, reward_metrics)``. ``reward_tensor`` is a
            float32 tensor shaped like ``data.batch["responses"]`` that is zero
            everywhere except at each row's last valid response token, which
            holds ``score["overall"]``. ``reward_metrics`` maps every key of the
            returned score dictionaries to the list of per-sample values.
        """
        response_ids = data.batch["responses"]
        response_length = data.batch["response_mask"].sum(dim=-1)
        # An empty response has length 0; without the clamp, index -1 would
        # wrap around to the final (padding) column. Clamping matches the
        # behaviour of BatchFunctionRewardManager.
        last_token_indices = (response_length - 1).clamp(min=0)

        reward_tensor = torch.zeros_like(response_ids, dtype=torch.float32)
        reward_metrics = defaultdict(list)
        for i in range(len(data)):
            valid_response_ids = response_ids[i][: response_length[i]]
            response_str = self.tokenizer.decode(
                valid_response_ids, skip_special_tokens=self.config.skip_special_tokens
            )
            ground_truth = data.non_tensor_batch["ground_truth"][i]

            score = self.reward_fn(response_str, ground_truth)
            # Only the overall score drives training; all keys are logged.
            reward_tensor[i, last_token_indices[i]] = score["overall"]
            for key, value in score.items():
                reward_metrics[key].append(value)

        return reward_tensor, reward_metrics


class BatchFunctionRewardManager(FunctionRewardManager):
    """Reward manager that scores the whole batch with one reward-function call."""

    reward_fn: BatchRewardFunction

    def compute_reward(self, data: DataProto) -> Tuple[Dict[str, torch.Tensor], Dict[str, List[float]]]:
        """Score all responses at once and build one reward tensor per score name.

        Every response is truncated to its valid length (the row sum of
        ``data.batch["response_mask"]``) and decoded. The reward function is
        then called once as ``reward_fn(responses, ground_truths, prompts)`` and
        must return one score dictionary per sample, e.g.
        ``[{"format": 1.0, "bleu1": 0.8}, {"format": 0.0, "bleu1": 0.6}]``.

        Note that, unlike the base-class signature, the first returned element
        is a dictionary of tensors rather than a single tensor.

        Args:
            data: Batch providing ``batch["responses"]``,
                ``batch["response_mask"]``, ``non_tensor_batch["ground_truth"]``
                and ``non_tensor_batch["prompt"]``.

        Returns:
            A tuple ``(reward_tensors_dict, reward_metrics)``.
            ``reward_tensors_dict`` maps each score name to a float32 tensor
            shaped like ``data.batch["responses"]`` that is zero except at each
            row's last valid response token, which holds that sample's score.
            ``reward_metrics`` maps each score name to the list of raw
            per-sample values, for logging. Both are empty when the reward
            function returns nothing.

        Raises:
            ValueError: If the reward function does not return exactly one
                score per sample, or if a score name is missing from some of
                the returned dictionaries.
        """
        response_ids = data.batch["responses"]
        response_length = data.batch["response_mask"].sum(dim=-1)
        batch_size = len(data)

        response_str, ground_truth, prompts = [], [], []
        for i in range(batch_size):
            valid_response_ids = response_ids[i][: response_length[i]]
            response_str.append(
                self.tokenizer.decode(valid_response_ids, skip_special_tokens=self.config.skip_special_tokens)
            )
            ground_truth.append(data.non_tensor_batch["ground_truth"][i])
            prompts.append(data.non_tensor_batch["prompt"][i])

        scores_list = self.reward_fn(response_str, ground_truth, prompts)
        if not scores_list:
            return {}, {}

        if len(scores_list) != batch_size:
            raise ValueError(
                f"Reward function returned {len(scores_list)} scores for a batch of {batch_size} samples."
            )

        # Transpose the list of dictionaries into {name: [value per sample]}.
        scores_by_name = defaultdict(list)
        for score_dict in scores_list:
            for key, value in score_dict.items():
                scores_by_name[key].append(value)

        device = response_ids.device
        row_indices = torch.arange(batch_size, device=device)
        # Column of the last valid token per row; clamped so that an empty
        # response maps to column 0 instead of wrapping around to -1.
        last_token_indices = (response_length - 1).clamp(min=0).to(device=device, dtype=torch.long)

        reward_tensors_dict = {}
        reward_metrics = {}
        for name, values in scores_by_name.items():
            # A name absent from some dictionaries would shift later samples'
            # scores onto the wrong rows, so reject it explicitly.
            if len(values) != batch_size:
                raise ValueError(
                    f"Reward `{name}` has {len(values)} values for a batch of {batch_size} samples; "
                    "every score dictionary must contain the same keys."
                )

            reward_metrics[name] = list(values)

            # Write all scores of this component in a single indexed assignment.
            reward_tensor = torch.zeros_like(response_ids, dtype=torch.float32)
            reward_tensor[row_indices, last_token_indices] = torch.tensor(
                values, dtype=torch.float32, device=device
            )
            reward_tensors_dict[name] = reward_tensor

        return reward_tensors_dict, reward_metrics
