# -*- coding: utf-8 -*-
"""
Created on Sun Mar 16 13:14:41 2025

@author: baran
"""

# -*- coding: utf-8 -*-
"""
Created on Tue Feb  4 10:51:22 2025

@author: baran
"""

from client import get_client
from openai import AssistantEventHandler, OpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
import re
from sum_call import get_summary
'''
deployments structure

{
    assistants : [(names,context)],
    finetune : [(names,context)],
    base : [(names,context)]
}

Structured this way because the different categories of LLMs were created in
different resource groups in Azure.
'''

# Judge configuration shared by every deployment category.
_JUDGE_MODEL = "gpt-4o"

# The judge is instructed to finish its reply with "Score: <n>".
_SCORE_PATTERN = re.compile(r"Score:\s*(\d+)")

# System prompt used for every category except "small": explicit rubric.
_RUBRIC_SYSTEM_PROMPT = (
    " Your job is to rate the user's summary of a medical report based on"
    " the medical report provided and provide a numeric score which is"
    " strictly between 0 to 100 based on correctness and completeness."
    " Use the following rubric:"
    " 90–100: Fully correct and comprehensive,"
    " 70–89: Mostly correct, may have small omissions,"
    " 50–69: Some correctness, but significant gaps or minor errors,"
    " 30–49: Limited correctness, missing major points or containing major errors,"
    " 10–29: Largely incorrect or irrelevant,"
    " 0–9: Completely nonsensical or irrelevant."
    " After you decide the best score, explain your reasoning briefly,"
    " then output only: Score: <the numeric score>."
    " Be thorough and do not inflate scores."
)

# System prompt used for the "small" category: no rubric, terse output format.
_SPARING_SYSTEM_PROMPT = (
    " Your job is to rate the user's summary of a medical report based on"
    " the medical report provided and provide a numeric score which is"
    " strictly between 0 to 100 based on correctness and completeness."
    " Use scores above 90 sparingly, if the summary is exceptionally good"
    " based on the content of the report."
    " The output format should be Score: (the score)."
)


def _extract_score(text):
    """Return the first ``Score: <n>`` value in *text* if it lies in 0..100, else None."""
    # The API may return no content at all; treat that as "no score".
    match = _SCORE_PATTERN.search(text or "")
    if match is None:
        return None
    value = int(match.group(1))
    return value if 0 <= value <= 100 else None


def final_eval(deployment, cat, prompt, task, all_rewards_sum, all_rewards_diag,
               *, max_attempts=None):
    """Ask the judge model to score a summary and record the reward.

    The judge is queried repeatedly until its reply contains a
    ``Score: <n>`` marker with ``n`` between 0 and 100 inclusive.

    Args:
        deployment: Sequence whose element 1 is the context text sent to the
            judge as "The context is: ...". Element 0 (the deployment name)
            is not used.
        cat: Deployment category. ``"small"`` selects the prompt without a
            rubric; every other value selects the rubric prompt.
        prompt: Text sent to the judge as "output is: ...", i.e. the summary
            being rated.
        task: Must be ``'summary'``; no other task is implemented.
        all_rewards_sum: List that the accepted score is appended to. Only
            validated (0..100) scores are appended.
        all_rewards_diag: Returned unchanged.
        max_attempts: Optional upper bound on judge calls. ``None`` (the
            default) retries until a valid score is obtained.

    Returns:
        Tuple ``(score, all_rewards_sum, all_rewards_diag)``.

    Raises:
        ValueError: If ``task`` is not ``'summary'`` or ``max_attempts`` is
            less than 1.
        RuntimeError: If ``max_attempts`` judge calls produced no valid score.
    """
    # Guard first: previously any other task never updated the score and the
    # retry loop spun forever.
    if task != 'summary':
        raise ValueError(
            f"unsupported task {task!r}: only 'summary' is implemented"
        )
    if max_attempts is not None and max_attempts < 1:
        raise ValueError("max_attempts must be at least 1 or None")

    # NOTE(review): the client returned here is never used. The call is kept
    # only in case get_client validates `cat` or performs setup — confirm and
    # drop it if not.
    get_client(cat)

    context = deployment[1]
    system_prompt = (
        _SPARING_SYSTEM_PROMPT if cat == "small" else _RUBRIC_SYSTEM_PROMPT
    )
    judge_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f'The context is: {context} output is: {prompt}.'},
    ]

    # Scoring always goes through the "base" client, whatever `cat` is.
    judge = get_client("base")

    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1
        response = judge.chat.completions.create(
            model=_JUDGE_MODEL,
            messages=judge_messages,
        )
        score = _extract_score(response.choices[0].message.content)
        if score is None:
            # Missing marker or out-of-range value: ask the judge again.
            continue

        all_rewards_sum.append(score)
        print("reward:")
        print(score)
        return score, all_rewards_sum, all_rewards_diag

    raise RuntimeError(
        f"judge returned no valid score after {max_attempts} attempt(s)"
    )

# deployment = ("Assistant","You are an medical diagnosis agent whose primary goal is to give diagnosis based on medical reports. For experimentation purposes only.")
# prompt = """{"question": "What is the purpose of the Nmfaf_3daDataManagement_Deconfigure service operation? [3GPP Release 18]",
# 		"option 1": "To configure the MFAF to map data or analytics received by the MFAF to out-bound notification endpoints",
# 		"option 2": "To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints",
# 		"option 3": "To supply data or analytics from the MFAF to notification endpoints",
# 		"option 4": "To fetch data or analytics from the MFAF based on fetch instructions",
# 		"answer": "option 2: To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints",
# 		"explanation": "The Nmfaf_3daDataManagement_Deconfigure service operation is used to stop mapping data or analytics received by the MFAF to one or more out-bound notification endpoints.",
# 		"category": "Standards specifications"}"""

# print(final_eval(deployment, "assistants", prompt))