import os
import numpy as np

# Upper bound used when shaped rewards are clipped (default: 1).
# The historical variable name is misspelled ("REWRARD"). It is still read
# first so that existing deployments behave exactly as before; the correctly
# spelled REWARD_SHAPING_CLIP_HIGHER is accepted as a fallback so that setting
# the obvious name is no longer silently ignored.
clip_higher = float(
    os.environ.get(
        "REWRARD_SHAPING_CLIP_HIGHER",
        os.environ.get("REWARD_SHAPING_CLIP_HIGHER", "1"),
    )
)

def reward_shaping(args, conv, final_reward, gamma):
    """
    Assign a discounted per-turn reward to every turn of a successful trajectory.

    Only trajectories with ``final_reward == 1`` are processed; for any other
    value ``conv`` is returned untouched (no ``reward`` attribute is written).

    For a processed trajectory of ``n`` turns, turn ``i`` receives
    ``gamma ** (n - 1 - i) * final_reward``, so the last turn keeps the full
    reward and earlier turns are discounted progressively.

    Args:
        args: Object providing ``step_auxiliary_reward`` (must be 0) and
            ``step_reward_shaping_clip`` (truthy to clip the rewards to
            ``[0, clip_higher]``).
        conv: Sequence of turn objects (the dialogue history). Each element
            gets its ``reward`` attribute set in place.
        final_reward (float): Final task reward (e.g. 1 for success,
            0 or -1 for failure).
        gamma (float): Discount factor (0 < gamma <= 1).

    Returns:
        The same ``conv`` object that was passed in, mutated in place.

    Raises:
        AssertionError: If the trajectory is successful and
            ``args.step_auxiliary_reward`` is non-zero.
    """
    if final_reward != 1:  # only correct trajectories are shaped
        return conv

    if args.step_auxiliary_reward != 0:
        # The per-turn bonus (step_auxiliary_reward for turns with a truthy
        # tool_status) is intentionally disabled. Raise explicitly instead of
        # `assert False`, which is stripped under `python -O` and would
        # silently re-enable the disabled path.
        raise AssertionError(
            "Non-zero step_auxiliary_reward is not supported for reward "
            "shaping; set step_auxiliary_reward to 0."
        )

    # Exponent = number of turns remaining after each turn (n-1, ..., 1, 0).
    turns_remaining = np.arange(len(conv))[::-1]
    # Force a float array so integer gamma/final_reward never yield int rewards.
    total_rewards = (np.power(gamma, turns_remaining) * final_reward).astype(float)

    if args.step_reward_shaping_clip:
        # Clip to [0, clip_higher] so large rewards cannot skew training.
        total_rewards = np.clip(total_rewards, 0, clip_higher)

    for conv_sample, reward in zip(conv, total_rewards):
        conv_sample.reward = reward

    return conv
    