from client import get_client
from openai import AssistantEventHandler, OpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
import re
'''
deployments structure

{
    assistants : [(names,context)],
    finetune : [(names,context)],
    base : [(names,context)]
}

Structured this way because the different categories of LLMs were created in different
resource groups in Azure.
'''

#output is just the score for specific prompt
def final_eval(deployment, cat, prompt,task):
    client = get_client(cat)
    name = deployment[0]
    context = deployment[1]
    number = -5000
    while int(number) < 0 or int(number) > 100:
        #print(number)
        if cat == 'assistants':
            assistant = client.beta.assistants.create(
                name="Diagnosis Summarizer",
                instructions=context,
                tools=[{"type": "file_search"}],
                model=name
            )
    
            vector_store = client.beta.vector_stores.create(name="Diagnosis Reports")
            
            f = open("data/input.txt", "w")
            f.write(prompt)
            f.close()
    
            # Ready the files for upload to OpenAI
            file_paths = ["data/input.txt"]
            file_streams = [open(path, "rb") for path in file_paths]
            
            # Use the upload and poll SDK helper to upload the files, add them to the vector store,
            # and poll the status of the file batch for completion.
            file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
            vector_store_id=vector_store.id, files=file_streams
            )
            # print(file_batch.status)
            # print(file_batch.file_counts)
    
            assistant = client.beta.assistants.update(
                assistant_id=assistant.id,
                tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
            )
    
            # Upload the user provided file to OpenAI
            message_file = client.files.create(
            file=open("data/input.txt", "rb"), purpose="assistants"
            )
            
            # Create a thread and attach the file to the message
            thread = client.beta.threads.create(
            messages=[
                {
                "role": "user",
                "content": prompt,
                # Attach the new file to the message.
                "attachments": [
                    { "file_id": message_file.id, "tools": [{"type": "file_search"}] }
                ],
                }
            ]
            )
            
            class EventHandler(AssistantEventHandler):
                def on_message_done(self, message) -> None:
                    # print a citation to the file searched
                    message_content = message.content[0].text
                    annotations = message_content.annotations
                    citations = []
                    for index, annotation in enumerate(annotations):
                        message_content.value = message_content.value.replace(
                            annotation.text, f"[{index}]"
                        )
                        if file_citation := getattr(annotation, "file_citation", None):
                            cited_file = client.files.retrieve(file_citation.file_id)
                            citations.append(f"[{index}] {cited_file.filename}")
    
                    # print(message_content.value)
                    # print("\n".join(citations))
                    f = open("data/outputs.txt", "w", encoding="utf-8")
                    f.write(message_content.value)
                    f.close()
                    # print(message.model_dump_json(indent=2))
    
            with client.beta.threads.runs.stream(
                thread_id=thread.id,
                assistant_id=assistant.id,
                instructions="Please address the user as Jane Doe. The user has a premium account.",
                event_handler=EventHandler(),
            ) as stream:
                stream.until_done()
            d = open("data/outputs.txt","r", encoding="utf-8")
            message = d.read()
            d.close()
    
            new_client = get_client("base")
            if task == 'summary':
                new_message=[
                    {"role": "system", "content": " Your job is to rate the user's summary of a medical report based on the medical report provided and provide a numeric score which is strictly between 0 to 100 based on correctness and completeness. Use the following rubric: 90–100: Fully correct and comprehensive, 70–89: Mostly correct, may have small omissions, 50–69: Some correctness, but significant gaps or minor errors, 30–49: Limited correctness, missing major points or containing major errors, 10–29: Largely incorrect or irrelevant, 0–9: Completely nonsensical or irrelevant. After you decide the best score, explain your reasoning briefly, then output only: Score: <the numeric score>. Be thorough and do not inflate scores."},
                    #{"role": "system", "content": "You are a strict output evaluator giving ratings between 0-100. Your job is to rate the user's summary of a medical report based on the medical report provided and provide a numeric score which is strictly between 0 to 100 based on correctness and completeness. Use the following rubric: 90–100: Fully correct and comprehensive, 70–89: Mostly correct, may have small omissions, 50–69: Some correctness, but significant gaps or minor errors, 30–49: Limited correctness, missing major points or containing major errors, 10–29: Largely incorrect or irrelevant, 0–9: Completely nonsensical or irrelevant. After you decide the best score, explain your reasoning briefly, then output only: Score: <the numeric score>. Be thorough and do not inflate scores. Always follow the rubric precisely.Verify that the score you assign is between 0 and 100."},
                    {"role": "user", "content": f'The prompt is: {prompt} context is: {context} output is: {message}.' }
                ]
            else:
                new_message=[
                    {"role": "system", "content": "Your job is to rate the user's diagnosis of a patient based on the findings of the summarized medical report and provide a numeric score which is strictly between 0 to 100 based on correctness and completeness. Use the following rubric: 90–100: Fully correct and comprehensive, 70–89: Mostly correct, may have small omissions, 50–69: Some correctness, but significant gaps or minor errors, 30–49: Limited correctness, missing major points or containing major errors, 10–29: Largely incorrect or irrelevant, 0–9: Completely nonsensical or irrelevant. After you decide the best score, explain your reasoning briefly, then output only: Score: <the numeric score>. Be thorough and do not inflate scores."},
                    #{"role": "system", "content": "You are a strict output evaluator giving ratings between 0-100. Your job is to rate the user's diagnosis of a patient based on the findings of the summarized medical report and provide a numeric score which is strictly between 0 to 100 based on correctness and completeness. Use the following rubric: 90–100: Fully correct and comprehensive, 70–89: Mostly correct, may have small omissions, 50–69: Some correctness, but significant gaps or minor errors, 30–49: Limited correctness, missing major points or containing major errors, 10–29: Largely incorrect or irrelevant, 0–9: Completely nonsensical or irrelevant. After you decide the best score, explain your reasoning briefly, then output only: Score: <the numeric score>. Be thorough and do not inflate scores. Always follow the rubric precisely. Verify that the score you assign is between 0 and 100."},
                    {"role": "user", "content": f'The prompt is: {prompt} context is: {context} output is: {message}.' }
                ]
            response = new_client.chat.completions.create(
                model= "gpt-4o",
                messages=new_message#[
                    #{"role": "system", "content": "You are a llm output generation evaluator with an ouput score from 0 to 100. The score is determined on how relevant and correct the message answers the prompt in regard to the context. The output format should be Score: (the score)."},
                    #{"role": "user", "content": f'The prompt is: {prompt} context is: {context} output is: {message}.' }
                #]
            )
            score = response.choices[0].message.content
            # number = ""
            # # for char in score:
            # #     if char.isdigit():
            # #         number += char
            # match = re.search(r"Score:\s*(\d+)", score)
            # if match:
            #     number = int(match.group(1))
            # if number == "":
            #     number = -5000
            # print(score)
        elif cat == "small":
            response = client.complete(
                messages=[
                    #SystemMessage(content=context),
                    UserMessage(content=prompt)
                ],
                max_tokens=50000,
                temperature=0.8,
                top_p=0.1,
                model=name
            )

            message = response.choices[0].message.content

            if task == 'summary':
                new_message=[
                    {"role": "system", "content": " Your job is to rate the user's summary of a medical report based on the medical report provided and provide a numeric score which is strictly between 0 to 100 based on correctness and completeness. Use scores above 90 sparingly, if the summary is exceptionally good based on the content of the report. The output format should be Score: (the score)."},
                    #{"role": "system", "content": "You are a strict output evaluator giving ratings between 0-100. Your job is to rate the user's summary of a medical report based on the medical report provided and provide a numeric score which is strictly between 0 to 100 based on correctness and completeness. Use the following rubric: 90–100: Fully correct and comprehensive, 70–89: Mostly correct, may have small omissions, 50–69: Some correctness, but significant gaps or minor errors, 30–49: Limited correctness, missing major points or containing major errors, 10–29: Largely incorrect or irrelevant, 0–9: Completely nonsensical or irrelevant. After you decide the best score, explain your reasoning briefly, then output only: Score: <the numeric score>. Be thorough and do not inflate scores. Always follow the rubric precisely.Verify that the score you assign is between 0 and 100."},
                    {"role": "user", "content": f'The prompt is: {prompt} context is: {context} output is: {message}.' }
                ]
            else:
                new_message=[
                    {"role": "system", "content": "Your job is to rate the user's diagnosis of a patient based on the findings of the summarized medical report and provide a numeric score which is strictly between 0 to 100 based on correctness and completeness. Use scores above 90 sparingly, if the answer is exceptionally good. The output format should be Score: (the score)."},
                    #{"role": "system", "content": "You are a strict output evaluator giving ratings between 0-100. Your job is to rate the user's diagnosis of a patient based on the findings of the summarized medical report and provide a numeric score which is strictly between 0 to 100 based on correctness and completeness. Use the following rubric: 90–100: Fully correct and comprehensive, 70–89: Mostly correct, may have small omissions, 50–69: Some correctness, but significant gaps or minor errors, 30–49: Limited correctness, missing major points or containing major errors, 10–29: Largely incorrect or irrelevant, 0–9: Completely nonsensical or irrelevant. After you decide the best score, explain your reasoning briefly, then output only: Score: <the numeric score>. Be thorough and do not inflate scores. Always follow the rubric precisely.Verify that the score you assign is between 0 and 100."},
                    {"role": "user", "content": f'The prompt is: {prompt} context is: {context} output is: {message}.' }
                ]
            new_client = get_client("base")
            response = new_client.chat.completions.create(
                model= "gpt-4o",
                messages=new_message#[
                    #{"role": "system", "content": "You are a llm output generation evaluator with an ouput score from 0 to 100. The output format should be Score: (the score)."},
                    #{"role": "user", "content": f'The prompt is: {prompt} context is: {context} output is: {message}.' }
                #]
            )
            score = response.choices[0].message.content
            # number = ""
            # # for char in score:
            # #     if char.isdigit():
            # #         number += char
            # match = re.search(r"Score:\s*(\d+)", score)
            # if match:
            #     number = int(match.group(1))
            # if number == "":
            #     number = -5000
        
        else:
    
            response = client.chat.completions.create(
                model= name,
                messages=[
                    #{"role": "system", "content": context},
                    {"role": "user", "content": prompt}
                ]
            )
            message = response.choices[0].message.content
            #print(message)
            if task == 'summary':
                new_message=[
                    {"role": "system", "content": " Your job is to rate the user's summary of a medical report based on the medical report provided and provide a numeric score which is strictly between 0 to 100 based on correctness and completeness. Use the following rubric: 90–100: Fully correct and comprehensive, 70–89: Mostly correct, may have small omissions, 50–69: Some correctness, but significant gaps or minor errors, 30–49: Limited correctness, missing major points or containing major errors, 10–29: Largely incorrect or irrelevant, 0–9: Completely nonsensical or irrelevant. After you decide the best score, explain your reasoning briefly, then output only: Score: <the numeric score>. Be thorough and do not inflate scores."},
                    #{"role": "system", "content": "You are a strict output evaluator giving ratings between 0-100. Your job is to rate the user's summary of a medical report based on the medical report provided and provide a numeric score which is strictly between 0 to 100 based on correctness and completeness. Use the following rubric: 90–100: Fully correct and comprehensive, 70–89: Mostly correct, may have small omissions, 50–69: Some correctness, but significant gaps or minor errors, 30–49: Limited correctness, missing major points or containing major errors, 10–29: Largely incorrect or irrelevant, 0–9: Completely nonsensical or irrelevant. After you decide the best score, explain your reasoning briefly, then output only: Score: <the numeric score>. Be thorough and do not inflate scores. Always follow the rubric precisely.Verify that the score you assign is between 0 and 100."},
                    {"role": "user", "content": f'The prompt is: {prompt} context is: {context} output is: {message}.' }
                ]
            else:
                new_message=[
                    {"role": "system", "content": "Your job is to rate the user's diagnosis of a patient based on the findings of the summarized medical report and provide a numeric score which is strictly between 0 to 100 based on correctness and completeness. Use the following rubric: 90–100: Fully correct and comprehensive, 70–89: Mostly correct, may have small omissions, 50–69: Some correctness, but significant gaps or minor errors, 30–49: Limited correctness, missing major points or containing major errors, 10–29: Largely incorrect or irrelevant, 0–9: Completely nonsensical or irrelevant. After you decide the best score, explain your reasoning briefly, then output only: Score: <the numeric score>. Be thorough and do not inflate scores."},
                    #{"role": "system", "content": "You are a strict output evaluator giving ratings between 0-100. Your job is to rate the user's diagnosis of a patient based on the findings of the summarized medical report and provide a numeric score which is strictly between 0 to 100 based on correctness and completeness. Use the following rubric: 90–100: Fully correct and comprehensive, 70–89: Mostly correct, may have small omissions, 50–69: Some correctness, but significant gaps or minor errors, 30–49: Limited correctness, missing major points or containing major errors, 10–29: Largely incorrect or irrelevant, 0–9: Completely nonsensical or irrelevant. After you decide the best score, explain your reasoning briefly, then output only: Score: <the numeric score>. Be thorough and do not inflate scores. Always follow the rubric precisely.Verify that the score you assign is between 0 and 100."},
                    {"role": "user", "content": f'The prompt is: {prompt} context is: {context} output is: {message}.' }
                ]
            new_client = get_client("base")
            response = new_client.chat.completions.create(
                model= "gpt-4o",
                messages=new_message#[
                    #{"role": "system", "content": "You are a llm output generation evaluator with an ouput score from 0 to 100. The output format should be Score: (the score)."},
                    #{"role": "user", "content": f'The prompt is: {prompt} context is: {context} output is: {message}.' }
                #]
            )
            score = response.choices[0].message.content
        #print(score)
        number = ""
        # for char in score:
        #     if char.isdigit():
        #         number += char
        match = re.search(r"Score:\s*(\d+)", score)
        if match:
            number = int(match.group(1))
        if number == "":
            number = -5000
    #print(number)
    return int(number)

# deployment = ("Assistant","You are an medical diagnosis agent whose primary goal is to give diagnosis based on medical reports. For experimentation purposes only.")
# prompt = """{"question": "What is the purpose of the Nmfaf_3daDataManagement_Deconfigure service operation? [3GPP Release 18]",
# 		"option 1": "To configure the MFAF to map data or analytics received by the MFAF to out-bound notification endpoints",
# 		"option 2": "To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints",
# 		"option 3": "To supply data or analytics from the MFAF to notification endpoints",
# 		"option 4": "To fetch data or analytics from the MFAF based on fetch instructions",
# 		"answer": "option 2: To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints",
# 		"explanation": "The Nmfaf_3daDataManagement_Deconfigure service operation is used to stop mapping data or analytics received by the MFAF to one or more out-bound notification endpoints.",
# 		"category": "Standards specifications"}"""

# print(final_eval(deployment, "assistants", prompt, "diagnosis"))