from openai import AssistantEventHandler, OpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from utils.helper import get_rating_score


'''
deployments structure

{
    assistants : [(names,context)],
    finetune : [(names,context)],
    base : [(names,context)]
}

Structured this way because the different categories of LLMs were created in
different resource groups in Azure.
'''

# Scores a single prompt; returns the score together with the two reward lists.
def final_eval(deployment, cat, prompt, task, all_rewards_sum, all_rewards_diag):
    """Score one prompt with a category-specific rubric and record the result.

    The element at index 1 of ``deployment`` is used as the context. It is
    handed to ``get_rating_score`` together with ``prompt`` and the selected
    rubric; the value that comes back is appended to ``all_rewards_sum``
    (the list is mutated in place) and printed.

    Rubric selection: ``cat == "small"`` gets the short rubric; every other
    category value gets the detailed, banded rubric.

    Args:
        deployment: Indexable whose second element is the context.
        cat: Deployment category used to pick the rubric.
        prompt: The prompt forwarded to ``get_rating_score``.
        task: Accepted for interface compatibility; not used here.
        all_rewards_sum: List that receives the new score.
        all_rewards_diag: Returned unchanged; not modified here.

    Returns:
        Tuple ``(score, all_rewards_sum, all_rewards_diag)``.
    """
    if cat == "small":
        # Compact rubric: only asks for the final score line.
        rubric = ("Your job is to rate the user's summary of a medical report based on the provided report and provide a numeric score strictly between 0 to 100. "
                  "Use scores above 90 sparingly if the summary is exceptionally good. The output format should be Score: (numeric score).")
    else:
        # Detailed rubric with explicit score bands, used for all other categories.
        rubric = ("Your job is to rate the user's summary of a medical report based on the medical report provided and provide a numeric score which is strictly between 0 to 100 based on correctness and completeness. "
                  "Use the following rubric: 90–100: Fully correct and comprehensive, 70–89: Mostly correct (small omissions allowed), 50–69: Some correctness but significant gaps, "
                  "30–49: Limited correctness, and 10–29 or 0–9 for incorrect outputs. After you decide the best score, explain your reasoning briefly, then output only: Score: <numeric score>. "
                  "Be thorough and do not inflate scores.")

    score = get_rating_score(deployment[1], prompt, rubric)
    all_rewards_sum.append(score)
    # Label and value go on separate lines, as two consecutive prints would.
    print("reward:", score, sep="\n")
    return score, all_rewards_sum, all_rewards_diag