import json
import math
import os
import re
import sys

# Directory holding this file, and the location two levels above it.
pwd_path = os.path.dirname(__file__)
root_path = os.path.join(pwd_path, "../../")
print(f"pwd_path: {pwd_path}")

# Put root_path first on sys.path before the project-local import below
# (presumably this is where the `utils` package lives -- confirm against
# the repository layout).
sys.path.insert(0, root_path)
from utils.MATH_evaluator_list import MATHEvaluator

class Reward():
    """Bundle of rule-based reward functions for scoring model completions.

    Three reward components are available and are selected by name:

    * ``'format'``      -- 0.5 when the completion starts with
      ``<think>\\n...\\n</think>``, otherwise 0.0.
    * ``'correctness'`` -- 2.0 when the evaluator accepts the text that follows
      the last ``</think>``, otherwise 0.0.
    * ``'length'``      -- a value in [0, 1] based on the token count.

    The ``*_reward_func`` methods take ``completions`` where each item is
    indexable and ``completion[0]['content']`` is the generated text; the
    ``*_reward_func_string`` methods take the plain strings directly.
    """

    # Historical location of the bad-case log. It stays the default so existing
    # setups keep logging; pass ``bad_case_path=None`` to switch logging off.
    DEFAULT_BAD_CASE_PATH = "/home/cwy/project/LLM_Inference/Curriculum_Reinforced_learning/temp/basdcase.jsonl"

    # The response must *start* with the think block; DOTALL lets '.' span
    # the newlines inside the reasoning and the answer.
    _FORMAT_PATTERN = re.compile(r"<think>\n.*\n</think>.*", flags=re.DOTALL)

    def __init__(self, reward_funcs:list[str], tokenizer=None, max_completion_length=6*1024,
                 bad_case_path=DEFAULT_BAD_CASE_PATH):
        """Select the reward functions to combine.

        Args:
            reward_funcs: Names of the components to use; each must be one of
                ``'format'``, ``'correctness'`` or ``'length'``.
            tokenizer: Object with an ``encode(text)`` method whose result has
                a length. When ``None`` the length reward is always 0.
            max_completion_length: Token budget used to shape the length
                reward.
            bad_case_path: JSONL file that receives responses failing the
                format check, or ``None`` to disable that logging.

        Raises:
            ValueError: If a name in ``reward_funcs`` is unknown.
        """
        self.tokenizer = tokenizer
        self.max_completion_length = max_completion_length
        self.bad_case_path = bad_case_path
        self.evaluator = MATHEvaluator()
        reward_func_mapping = {
            'format': self.format_reward_func,
            'correctness': self.correctness_reward_func,
            'length': self.length_reward_func,
        }
        selected_reward_funcs = []
        for name in reward_funcs:
            if name not in reward_func_mapping:
                raise ValueError(f"Invalid reward function name: {name}")
            selected_reward_funcs.append(reward_func_mapping[name])
        self.selected_reward_funcs = selected_reward_funcs

    @staticmethod
    def _contents(completions) -> list:
        """Return ``completion[0]['content']`` for every completion."""
        return [completion[0]['content'] for completion in completions]

    def check(self, pred_ans, real_ans):
        """Score predictions against references with the configured evaluator.

        Returns whatever ``self.evaluator.score`` returns; callers in this
        class iterate over it and use each element as a truth value.
        """
        return self.evaluator.score(pred_ans, real_ans)

    def _log_bad_cases(self, responses) -> None:
        """Append responses that failed the format check to the JSONL log.

        Logging is best-effort: it must never cost the caller its reward. If
        the file cannot be written, one message is printed and logging is
        disabled for the rest of this instance's lifetime.
        """
        path = self.bad_case_path
        if not path or not responses:
            return
        lines = [
            json.dumps({"response": response, "reward": 0.0}, ensure_ascii=False) + "\n"
            for response in responses
        ]
        try:
            with open(path, "a", encoding="utf-8") as f:
                f.writelines(lines)
        except OSError as e:
            print(f"Bad-case logging disabled, cannot write to {path}: {e}")
            self.bad_case_path = None

    def format_reward_func_string(self, responses, **kwargs) -> list[float]:
        """Reward responses shaped like ``<think>\\n<text>\\n</think><text>``.

        Args:
            responses: Generated texts.

        Returns:
            0.5 for each response that matches the format from its first
            character, 0.0 otherwise. Non-matching responses are additionally
            appended to ``self.bad_case_path`` when logging is enabled.
        """
        rewards = [0.5 if self._FORMAT_PATTERN.match(r) else 0.0 for r in responses]
        self._log_bad_cases([r for r, rew in zip(responses, rewards) if rew != 0.5])
        return rewards

    # Reward functions
    def correctness_reward_func_string(self, responses, answer, **kwargs) -> list[float]:
        """Return 2.0 for each response the evaluator accepts, else 0.0.

        Args:
            responses: Answer texts to grade.
            answer: Reference answers handed to the evaluator unchanged.
        """
        return [2.0 if ok else 0.0 for ok in self.check(responses, answer)]

    def _length_reward(self, n_tokens) -> float:
        """Map a token count onto the [0, 1] length reward.

        With ``M = self.max_completion_length``, ``lo = M // 3`` and
        ``hi = M * 2 // 3``:

        * ``n <= 0`` or ``n > M``  -> 0
        * ``0 < n < lo``           -> ``sin((n / lo) * pi / 2)`` (rises to 1)
        * ``lo <= n <= hi``        -> 1
        * ``hi < n <= M``          -> ``sin(((M - n) / (M - hi)) * pi / 2)``
          (falls back to 0)
        """
        limit = self.max_completion_length
        ideal_min = limit // 3
        ideal_max = limit * 2 // 3
        if n_tokens <= 0 or n_tokens > limit:
            return 0.0
        if n_tokens < ideal_min:
            return math.sin((n_tokens / ideal_min) * (math.pi / 2))
        if n_tokens <= ideal_max:
            return 1.0
        return math.sin(((limit - n_tokens) / (limit - ideal_max)) * (math.pi / 2))

    # Length Reward
    def length_reward_func_string(self, responses, **kwargs) -> list[float]:
        """Reward responses whose token count sits in the middle third of the budget.

        The length is ``len(self.tokenizer.encode(response))``; see
        ``_length_reward`` for the exact curve.

        Returns:
            One reward in [0, 1] per response, or all zeros when no tokenizer
            was configured.
        """
        if self.tokenizer is None:
            return [0.0] * len(responses)
        return [
            self._length_reward(len(self.tokenizer.encode(response)))
            for response in responses
        ]

    def format_reward_func(self, completions, **kwargs) -> list[float]:
        """Format reward computed on the content of each completion."""
        return self.format_reward_func_string(self._contents(completions))

    def correctness_reward_func(self, completions, answer, **kwargs) -> list[float]:
        """Correctness reward computed on the text after the last ``</think>``.

        A completion without a closing ``</think>`` is graded as an empty
        answer.
        """
        responses = []
        for content in self._contents(completions):
            _, closing_tag, tail = content.rpartition("</think>")
            responses.append(tail if closing_tag else "")
        return self.correctness_reward_func_string(responses, answer)

    def length_reward_func(self, completions, **kwargs) -> list[float]:
        """Length reward computed on the content of each completion."""
        return self.length_reward_func_string(self._contents(completions))

    def reward(self, completions, answer, **kwargs) -> list[float]:
        """Sum the selected reward components for every completion.

        A component that raises is reported on stdout and left out of the sum
        (best-effort), so the total can be lower than expected when that
        happens.

        Returns:
            One total per completion. If no component produced a result
            (none selected, or all failed) every total is 0.0.
        """
        per_func_rewards = []  # list[list[float]]
        for func in self.selected_reward_funcs:
            try:
                per_func_rewards.append(func(completions=completions, answer=answer))
            except Exception as e:
                # Deliberately broad: one failing component must not abort scoring.
                print(f"Error calculating reward: {e}")
        if not per_func_rewards:
            return [0.0] * len(completions)
        return [sum(sample_rewards) for sample_rewards in zip(*per_func_rewards)]


if __name__=="__main__":
    # Manual smoke test; runs only when this file is executed as a script.
    # Expected completion format: "<think>\n<any text>\n</think><any text>"

    # Sample responses. The line breaks inside the literals are part of the
    # strings and are kept exactly as they were.
    inputs=[
        """<think>
    This is the reasoning section.
    </think>
    <think>
    This is the answer section.
    </think>
    """,
        """<think>\nThis is the reasoning section.\n</think>this is answer""",
        """<think>\nThis is the reasoning section.\n</think>this is answer""",
        """<think>\nOkay, so I have this problem where there's an electron microscope magnifying a piece of tissu
e, and the diameters of three layers are given—1 cm, 1.5 cm, and 2 cm. When magnified 1000 times, what is the actua
l combined diameter of all three layers? The hint says the answer is 0.0045 cm. Hmm, I need to figure out how that 
works.\n\nFirst off, the electron microscope magnifies everything 1000 times. So, if there's a diameter measured un
der normal conditions, the actual size is found by dividing that by 1000. That makes sense because the magnificatio
n scales down the observed size—so a larger magnification means we get smaller actual objects.\n\nSo, for each laye
r, I need to find the actual diameter by dividing each measured diameter by 1000. Then, I can add up those three ac
tual diameters to get the combined diameter.\n\nLet me write this out step by step:\n\n1. Take the first diameter: 
1 cm. Dividing by 1000 gives us 0.001 cm. That's the actual diameter of the first layer.\n2. The second layer is 1.
5 cm. Dividing this by 1000 gives 0.0015 cm.\n3. The third layer is 2 cm. Dividing by 1000 gives 0.002 cm.\n\nNow, 
I have the actual diameters for each layer: 0.001 cm, 0.0015 cm, and 0.002 cm. To find the combined diameter, I nee
d to add these three together.\n\nLet me add them step by step:\n\nFirst, 0.001 + 0.0015. Let's see, that should be
 0.0025 cm.\n\nThen, adding 0.0025 and 0.002. Hmm, 0.0025 + 0.002 is 0.0045 cm.\n\nSo, the total combined actual di
ameter is 0.0045 cm. That matches the hint given! So, dividing each measured diameter by 1000 and adding them up gi
ves me the actual combined size.\n\nWait, let me just verify if I did that right. Is the magnification factor 1000,
 so I need to divide each by 1000. Yes, that seems right.\n\nAlternatively, I can think of it as:\n\nEach division 
by 1000 changes the unit from centimeters to micrometers or millimeters? Wait, actually, 1 centimeter is 10,000 mic
rometers or 1,000,000 nanometers. But in any case, since all the original measurements are in centimeters, and we'r
e just scaling them down, dividing by 1000 is just adjusting the measurement.\n\nTherefore, each original cm become
s 0.001 cm, each mm becomes 0.0001 cm, etc. But in this problem, all are in cm, so we just scale them down by divis
ion.\n\nCalculating:\n\n1 cm → 0.001 cm actual\n1.5 cm → 0.0015 cm actual\n2 cm → 0.002 cm actual\n\nAdding them:\n
\n0.001 + 0.0015 = 0.0025\n0.0025 + 0.002 = 0.0045\n\nYes, that adds up. So, 0.0045 cm is indeed the combined actua
l diameter. The hint confirms it, so I think I'm correct. Therefore, my final answer is 0.0045.\n\n**Final Answer**
\nThe actual combined diameter of all three layers is \\boxed{0.0045} centimeters.\n</think>\n\nTo find the actual 
combined diameter of all three layers when magnified 1000 times by an electron microscope, we need to divide each m
easured diameter by 1000 and then sum the results.\n\n1. The first layer: \\(1 \\text{ cm} \\div 1000 = 0.001 \\tex
t{ cm}\\)\n2. The second layer: \\(1.5 \\text{ cm} \\div 1000 = 0.0015 \\text{ cm}\\)\n3. The third layer: \\(2 \\t
ext{ cm} \\div 1000 = 0.002 \\text{ cm}\\)\n\nNext, we add these actual diameters together:\n\n\\[\n0.001 + 0.0015 
+ 0.002 = 0.0045\n\\]\n\nThus, the actual combined diameter of all three layers is \\(\\boxed{0.0045}\\) centimeter
s."""
    ]

    # No tokenizer is passed, so the length component would score 0 here.
    reward=Reward(reward_funcs=["format","correctness","length"])
    # The label names the method that is actually called (it used to say
    # "format_reward_func" while printing the correctness scores).
    print(f"correctness_reward_func_string:{reward.correctness_reward_func_string(inputs, ['1']*4)}")

