import torch
import numpy as np

# def reward_func(queries, prompts, labels):
#     # queries is prompts + responses
#     # labels is answers
#     print(queries)
#     return torch.randn(len(queries))


# Example per-rollout scores for one group.
scores = [1, 1, 1.1, 1.1, 1.1, 1, 1]

# Small constant added to the denominator when normalizing.
STD_EPSILON = 1e-4

# Compute group-relative rewards (zero-sum within the group): every score is
# centered on the group mean and scaled by the group standard deviation.
mean_score = np.mean(scores)

# Evaluate the standard deviation once. When it is not strictly positive
# (e.g. all scores are identical) fall back to 1.0 so the centered scores are
# left essentially unscaled.
group_std = np.std(scores)
std_score = group_std if group_std > 0 else 1.0

# Normalized score for each rollout, in the same order as `scores`.
rollout_records = [
    (score - mean_score) / (std_score + STD_EPSILON) for score in scores
]
print(f"GRPO Rewards: {rollout_records}, mean score: {mean_score:.4f}, std: {std_score:.4f}")