from verl.tools.utils.req_alipay_dsv3 import dsv3_infer_alipay as dsv3_infer

# Prompt sent to the judge model. It is filled in with ``str.format`` using
# the three named placeholders ``{question}``, ``{ground_truth}`` and
# ``{model_output}``, so any literal brace added to this text later must be
# doubled (``{{`` / ``}}``).
# The prompt instructs the judge to finish its reply with a line of the form
# ``Rating: [[<score>]]``; that line is what the score-extraction regex in
# this module looks for, so the two must be kept in sync.
PROMPT_TEMPLATE = '''[Question]
{question}

[Gold Answer]
{ground_truth}

[The Start of Assistant's Predicted Answer]
{model_output}
[The End of Assistant's Predicted Answer]

[System]
We would like to request your feedback on the performance of the AI assistant in response to the user question displayed above according to the gold answer. Please use the following listed aspects and their descriptions as evaluation criteria:
    - Accuracy and Hallucinations: The assistant's answer is semantically consistent with the gold answer; The numerical value and order need to be accurate, and there should be no hallucinations.
    - Completeness: Referring to the reference answers, the assistant's answer should contain all the key points needed to answer the user's question; further elaboration on these key points can be omitted.
Please rate whether this answer is suitable for the question. Please note that the gold answer can be considered as a correct answer to the question.

The assistant receives an overall score on a scale of 1 to 100, where a higher score indicates better overall performance.
Please note that if the assistant's answer and the gold answer fully meet the above criteria, its overall rating should be the full marks (100).
Please first provide a comprehensive explanation of your evaluation, avoiding any potential bias.
Then, output a line indicating the score of the Assistant.

PLEASE OUTPUT WITH THE FOLLOWING FORMAT, WHERE THE SCORE IS A SCALE OF 1 TO 100 BY STRICTLY FOLLOWING THIS FORMAT: "[[score]]", FOR EXAMPLE "Rating: [[100]]":
<start output>
Evaluation evidence: your evluation explanation here, no more than 100 words
Rating: [[score]]
<end output>

Now, start your evaluation:'''


import re
import json
import numpy as np
import time
import string
from collections import Counter

def extract_score(response_text):
    """Parse the judge's numeric rating out of its free-text reply.

    The first occurrence of ``Rating: [[<digits>]]`` (one to three digits,
    optional whitespace after the colon) is used.

    Args:
        response_text: Raw reply from the judge model.

    Returns:
        The rating as an ``int`` in ``0..100``, or ``None`` when the reply is
        not a string, contains no rating line, or the number is above 100.
    """
    # A non-string reply (e.g. ``None`` from a failed request) cannot contain
    # a rating; report "no score" instead of letting ``re.search`` raise.
    if not isinstance(response_text, str):
        return None

    match = re.search(r'Rating:\s*\[\[([0-9]{1,3})\]\]', response_text)
    if match is None:
        return None

    score = int(match.group(1))
    # The pattern admits up to three digits, so values such as 999 would
    # otherwise pass through and yield a reward far above 1.0 once scaled.
    # Treat anything above the documented maximum as an unparsable reply.
    if score > 100:
        return None
    return score

def model_score(question, ground_truth, model_output):
    """Ask the judge model to rate ``model_output`` against ``ground_truth``.

    The prompt is built from ``PROMPT_TEMPLATE`` and sent through
    ``dsv3_infer``. The request is tried up to five times; an attempt counts
    as failed when the call (or the parsing of its reply) raises, or when no
    rating can be extracted from the reply. One second is slept between
    failed attempts.

    Args:
        question: Text substituted for ``{question}`` in the prompt.
        ground_truth: Text substituted for ``{ground_truth}``.
        model_output: Text substituted for ``{model_output}``.

    Returns:
        The rating returned by ``extract_score`` on the first successful
        attempt, or ``0`` if every attempt failed.
    """
    judge_prompt = PROMPT_TEMPLATE.format(
        question=question,
        ground_truth=ground_truth,
        model_output=model_output,
    )

    total_attempts = 5
    attempt = 1
    while attempt <= total_attempts:
        rating = None
        try:
            rating = extract_score(dsv3_infer(judge_prompt))
        except Exception as err:
            # Best-effort: any failure of the remote call or of parsing is
            # reported and treated like a reply without a rating.
            print(f"[Attempt {attempt}] Error: {err}")

        if rating is not None:
            return rating

        # Back off before trying again, but not after the final attempt.
        if attempt != total_attempts:
            time.sleep(1.0)
        attempt += 1

    print("Warning: Failed to obtain a valid score after all retries.")
    return 0



def compute_score(model_output, ground_truth, extra_info):
    """Return the judge-based reward for one model answer.

    Args:
        model_output: The answer produced by the model being evaluated.
        ground_truth: The reference (gold) answer.
        extra_info: Mapping that must contain the key ``"question"``; its
            value is passed to the judge as the user question.

    Returns:
        ``0.01 * model_score(...)``, i.e. the judge's rating rescaled by a
        factor of one hundred. This is ``0.0`` when ``model_score`` falls
        back to ``0`` after exhausting its retries.

    Raises:
        KeyError: If ``extra_info`` has no ``"question"`` entry.
    """
    # Pass by keyword: ``model_score`` takes (question, ground_truth,
    # model_output), which is the opposite order from this function's own
    # parameters. Passing positionally swapped the gold answer and the
    # prediction in the judge prompt.
    judge_rating = model_score(
        question=extra_info["question"],
        ground_truth=ground_truth,
        model_output=model_output,
    )
    acc_score = 0.01 * judge_rating
    return acc_score

