# -*- coding: utf-8 -*-
"""
Created on Sun Mar 16 13:14:41 2025

@author: baran
"""

# -*- coding: utf-8 -*-
"""
Created on Tue Feb  4 10:51:22 2025

@author: baran
"""

from client import get_client
from openai import AssistantEventHandler, OpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
import re
from sum_call import get_summary
from transformers import AutoTokenizer

import tiktoken
'''
deployments structure

{
    assistants : [(names,context)],
    finetune : [(names,context)],
    base : [(names,context)]
}

Structured this way because the different categories of LLMs were created in
different resource groups in Azure.
'''

# Shared opening of both judge system prompts (the leading space is intentional:
# the strings are reproduced exactly as they were originally sent).
_JUDGE_PROMPT_HEAD = (
    " Your job is to rate the user's summary of a medical report based on"
    " the medical report provided and provide a numeric score which is"
    " strictly between 0 to 100 based on correctness and completeness."
)

# Rubric-based judge prompt, used for every category except "small"/"llama".
_RUBRIC_JUDGE_PROMPT = _JUDGE_PROMPT_HEAD + (
    " Use the following rubric:"
    " 90–100: Fully correct and comprehensive,"
    " 70–89: Mostly correct, may have small omissions,"
    " 50–69: Some correctness, but significant gaps or minor errors,"
    " 30–49: Limited correctness, missing major points or containing major errors,"
    " 10–29: Largely incorrect or irrelevant,"
    " 0–9: Completely nonsensical or irrelevant."
    " After you decide the best score, explain your reasoning briefly,"
    " then output only: Score: <the numeric score>."
    " Be thorough and do not inflate scores."
)

# Shorter judge prompt, used for the "small" and "llama" categories.
_SPARING_JUDGE_PROMPT = _JUDGE_PROMPT_HEAD + (
    " Use scores above 90 sparingly, if the summary is exceptionally good"
    " based on the content of the report."
    " The output format should be Score: (the score)."
)

_SPARING_JUDGE_CATS = frozenset({"small", "llama"})

# Categories whose diagnosis client is fetched under the shared "finetune" key.
_FINETUNE_CATS = frozenset({"finetune_med", "finetune_med_new", "finetune_tele"})

# Which model's tokenizer is used to measure the summary length for each category.
_ARM_TO_LLM = {
    "base": "gpt-3.5-turbo",
    "assistants": "gpt-3.5-turbo",
    "finetune_med": "gpt-4",
    "finetune_tele": "gpt-4",
    "finetune_med_new": "gpt-4",
    "llama": "llama-13b",
}

# Tokenizer used for categories missing from _ARM_TO_LLM (e.g. "small"); it is the
# same encoding the diagnosis path already uses for its length measurement.
_DEFAULT_TOKENIZER_LLM = "gpt-3.5-turbo"

# Model names that tiktoken is asked to resolve; anything else uses the llama tokenizer.
_OPENAI_TOKENIZER_LLMS = frozenset({"gpt-3.5-turbo", "gpt-4"})

_SCORE_PATTERN = re.compile(r"Score:\s*(\d+)")

_VALID_TASKS = ("summary", "diagnosis")


def _count_summary_tokens(cat, summary):
    """Return the token length of ``summary`` under the tokenizer mapped to ``cat``."""
    llm_name = _ARM_TO_LLM.get(cat, _DEFAULT_TOKENIZER_LLM)
    if llm_name in _OPENAI_TOKENIZER_LLMS:
        return len(tiktoken.encoding_for_model(llm_name).encode(summary))
    # Only load the llama tokenizer when it is actually needed.
    llama_tok = AutoTokenizer.from_pretrained("openlm-research/open_llama_13b")
    return len(llama_tok(summary, truncation=True, padding=False)["input_ids"])


def _parse_score(reply):
    """Return the first ``Score: <n>`` value in ``reply`` if it lies in 0..100, else None."""
    match = _SCORE_PATTERN.search(reply or "")  # a reply of None is treated as "no score"
    if match is None:
        return None
    value = int(match.group(1))
    return value if 0 <= value <= 100 else None


def _run_diagnosis(client, name, context, prompt):
    """Send ``context``/``prompt`` to deployment ``name`` and return the reply text."""
    if hasattr(client, "chat"):  # OpenAI SDK (AzureOpenAI client)
        response = client.chat.completions.create(
            model=name,
            messages=[
                {"role": "system", "content": context},
                {"role": "user", "content": prompt},
            ],
            max_tokens=2048,
            temperature=0.8,
            top_p=0.1,
        )
    else:  # Azure SDK (ChatCompletionsClient client)
        from azure.ai.inference.models import SystemMessage, UserMessage

        response = client.complete(
            model=name,
            messages=[
                SystemMessage(content=context),
                UserMessage(content=prompt),
            ],
            max_tokens=2048,
            temperature=0.8,
            top_p=0.1,
        )
    return response.choices[0].message.content


def final_eval(deployment, cat, prompt, task, all_rewards_sum, all_rewards_diag,
               summary, *, max_attempts=None):
    """Score a summary with a GPT-4o judge, or run a diagnosis completion.

    ``task == 'summary'``: a judge client obtained via ``get_client("base")`` is
    asked (model ``"gpt-4o"``) to rate ``prompt`` against ``deployment[1]``. The
    reply is searched for ``Score: <n>``; the call is repeated until a score in
    0..100 is found. The accepted score is appended to ``all_rewards_sum``.

    ``task == 'diagnosis'``: the client for ``cat`` (the three ``finetune_*``
    categories share the ``"finetune"`` client) is sent ``deployment[1]`` as the
    system message and ``prompt`` as the user message. The reply is printed and a
    fixed dummy score of 10 is returned.

    Args:
        deployment: Pair ``(name, context)``; ``name`` is the model/deployment
            used for diagnosis calls, ``context`` the system/context text.
        cat: Category key. Selects the diagnosis client, the judge prompt
            variant and the tokenizer used for the length measurement.
        prompt: User message for diagnosis; the text being rated for summary.
        task: ``'summary'`` or ``'diagnosis'``.
        all_rewards_sum: List that receives each accepted summary score.
        all_rewards_diag: Returned unchanged.
        summary: Text whose token count is returned for the summary task.
        max_attempts: Optional cap on judge calls for the summary task.
            ``None`` (default) retries without limit, as before.

    Returns:
        ``(score, out_len, all_rewards_sum, all_rewards_diag)``. For summary,
        ``out_len`` is the token count of ``summary``; for diagnosis it is the
        token count of ``prompt`` under the gpt-3.5-turbo encoding.

    Raises:
        ValueError: ``task`` is not supported (previously this looped forever),
            or ``max_attempts`` is given and is less than 1.
        RuntimeError: ``max_attempts`` judge calls produced no valid score.
    """
    # Validate before touching any client so bad input fails fast and for free.
    if task not in _VALID_TASKS:
        raise ValueError(
            f"unsupported task {task!r}; expected 'summary' or 'diagnosis'"
        )
    if max_attempts is not None and max_attempts < 1:
        raise ValueError("max_attempts must be None or a positive integer")

    name = deployment[0]
    context = deployment[1]

    if task == 'diagnosis':
        client_cat = "finetune" if cat in _FINETUNE_CATS else cat
        client = get_client(client_cat)
        # Handled uniformly for every category; cat == "small" used to hang here
        # because its branch had no diagnosis case.
        message = _run_diagnosis(client, name, context, prompt)
        print(f"Diagnosis prediction: {message}")
        out_len = len(tiktoken.encoding_for_model("gpt-3.5-turbo").encode(prompt))  # or shared tokenizer
        return 10, out_len, all_rewards_sum, all_rewards_diag  # dummy score, for compatibility

    # --- summary task ---
    # The length does not depend on the judge reply, so measure it once up front
    # instead of reloading every tokenizer on each retry.
    out_len = _count_summary_tokens(cat, summary)

    system_prompt = (
        _SPARING_JUDGE_PROMPT if cat in _SPARING_JUDGE_CATS else _RUBRIC_JUDGE_PROMPT
    )
    judge_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f'The context is: {context} output is: {prompt}.'},
    ]
    # NOTE(review): assumes get_client("base") can be reused across retries — confirm.
    judge_client = get_client("base")

    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1
        response = judge_client.chat.completions.create(
            model="gpt-4o",
            messages=judge_messages,
        )
        number = _parse_score(response.choices[0].message.content)
        if number is None:
            continue  # missing or out-of-range score: ask the judge again
        # Only accepted scores are recorded; rejected ones no longer leak into the history.
        all_rewards_sum.append(number)
        print(f"summary reward: {number}")
        return number, out_len, all_rewards_sum, all_rewards_diag

    raise RuntimeError(
        f"judge returned no valid score after {max_attempts} attempt(s)"
    )

# deployment = ("Assistant","You are an medical diagnosis agent whose primary goal is to give diagnosis based on medical reports. For experimentation purposes only.")
# prompt = """{"question": "What is the purpose of the Nmfaf_3daDataManagement_Deconfigure service operation? [3GPP Release 18]",
# 		"option 1": "To configure the MFAF to map data or analytics received by the MFAF to out-bound notification endpoints",
# 		"option 2": "To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints",
# 		"option 3": "To supply data or analytics from the MFAF to notification endpoints",
# 		"option 4": "To fetch data or analytics from the MFAF based on fetch instructions",
# 		"answer": "option 2: To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints",
# 		"explanation": "The Nmfaf_3daDataManagement_Deconfigure service operation is used to stop mapping data or analytics received by the MFAF to one or more out-bound notification endpoints.",
# 		"category": "Standards specifications"}"""

# print(final_eval(deployment, "assistants", prompt))