from client import get_client
from openai import AssistantEventHandler, OpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
import re
'''
deployments variable structure

{
    assistants : [(names,context)],
    finetune : [(names,context)],
    base : [(names,context)]
}

Structured this way because the different categories of LLMs were created in
different resource groups in Azure.

opt_eval builds (and prints) a dict with the model name as key and the evaluation
score as value.
'''

# Deployment name of the judge model that grades every candidate answer.
_JUDGE_MODEL = "gpt-4o"

# The judge is told to finish with "Score: <n>"; the first such occurrence is used.
_SCORE_PATTERN = re.compile(r"Score:\s*(\d+)")

# Rubric text shared by both judge prompts.
_RUBRIC_TAIL = (
    " and provide a numeric score which is strictly between 0 to 100 based on"
    " correctness and completeness. Use the following rubric:"
    " 90–100: Fully correct and comprehensive,"
    " 70–89: Mostly correct, may have small omissions,"
    " 50–69: Some correctness, but significant gaps or minor errors,"
    " 30–49: Limited correctness, missing major points or containing major errors,"
    " 10–29: Largely incorrect or irrelevant,"
    " 0–9: Completely nonsensical or irrelevant."
    " After you decide the best score, explain your reasoning briefly,"
    " then output only: Score: <the numeric score>."
    " Be thorough and do not inflate scores."
)

# The leading space of the summary prompt is kept on purpose so the text sent to
# the judge is unchanged.
_SUMMARY_JUDGE_PROMPT = (
    " Your job is to rate the user's summary of a medical report based on the"
    " medical report provided" + _RUBRIC_TAIL
)
_DIAGNOSIS_JUDGE_PROMPT = (
    "Your job is to rate the user's diagnosis of a patient based on the findings"
    " of the summarized medical report" + _RUBRIC_TAIL
)

# Category keys that share the "finetune" client, mapped to the alias that
# `selected` may use to refer to them.
_FINETUNE_ALIASES = {"finetune_med": "Med", "finetune_tele": "Tele"}


def _judge_score(prompt, context, message, task):
    """Have the judge model grade `message`; return the parsed int or None."""
    system_prompt = _SUMMARY_JUDGE_PROMPT if task == 'summary' else _DIAGNOSIS_JUDGE_PROMPT
    judge = get_client("base")
    response = judge.chat.completions.create(
        model=_JUDGE_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f'The prompt is: {prompt} context is: {context} output is: {message}.'},
        ],
    )
    # `content` can be None on some responses; treat that as "no score found".
    verdict = response.choices[0].message.content or ""
    found = _SCORE_PATTERN.search(verdict)
    return int(found.group(1)) if found else None


def _run_assistant(client, name, context, prompt):
    """Answer `prompt` with a file-search assistant on deployment `name`."""
    assistant = client.beta.assistants.create(
        name="Diagnosis Summarizer",
        instructions=context,
        tools=[{"type": "file_search"}],
        model=name,
    )
    vector_store = client.beta.vector_stores.create(name="Diagnosis Reports")

    # Explicit UTF-8 so non-ASCII prompts do not depend on the platform default
    # (the output file below is already written as UTF-8).
    with open("data/input.txt", "w", encoding="utf-8") as handle:
        handle.write(prompt)

    # Upload the prompt file into the vector store and wait for the batch to
    # finish; the handle is closed as soon as the upload returns.
    with open("data/input.txt", "rb") as stream:
        client.beta.vector_stores.file_batches.upload_and_poll(
            vector_store_id=vector_store.id, files=[stream]
        )

    assistant = client.beta.assistants.update(
        assistant_id=assistant.id,
        tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
    )

    with open("data/input.txt", "rb") as stream:
        message_file = client.files.create(file=stream, purpose="assistants")

    # Create a thread whose single user message carries the prompt and the file.
    thread = client.beta.threads.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
                "attachments": [
                    {"file_id": message_file.id, "tools": [{"type": "file_search"}]}
                ],
            }
        ]
    )

    answers = []

    # Defined here (not at module level) so the openai base class is only
    # needed when an assistant is actually run.
    class EventHandler(AssistantEventHandler):
        def on_message_done(self, message) -> None:
            content = message.content[0].text
            # Replace each file-search annotation marker with a numbered tag.
            for index, annotation in enumerate(content.annotations):
                content.value = content.value.replace(annotation.text, f"[{index}]")
            answers.append(content.value)
            # Still persisted for anything that inspects the file afterwards.
            with open("data/outputs.txt", "w", encoding="utf-8") as out:
                out.write(content.value)

    # NOTE(review): per the OpenAI API reference, run-level `instructions`
    # override the assistant's own instructions, so `context` is presumably not
    # in effect for this run. The text looks like a quickstart leftover - confirm
    # before changing, since it affects the evaluated output.
    with client.beta.threads.runs.stream(
        thread_id=thread.id,
        assistant_id=assistant.id,
        instructions="Please address the user as Jane Doe. The user has a premium account.",
        event_handler=EventHandler(),
    ) as stream:
        stream.until_done()

    # NOTE(review): the assistant, vector store and uploaded file are never
    # deleted, so every attempt leaves remote resources behind.

    # Use the answer captured in memory rather than re-reading outputs.txt, which
    # could still hold a previous run's answer if no message completed.
    return answers[-1] if answers else ""


def _generate_output(cat, client, name, context, prompt):
    """Produce the candidate answer for one deployment of category `cat`."""
    if cat == 'assistants':
        return _run_assistant(client, name, context, prompt)
    if cat == "small":
        response = client.complete(
            messages=[UserMessage(content=prompt)],
            max_tokens=2048,
            temperature=0.8,
            top_p=0.1,
            model=name,
        )
    else:
        response = client.chat.completions.create(
            model=name,
            messages=[{"role": "user", "content": prompt}],
        )
    return response.choices[0].message.content


def _score_deployment(cat, client, name, context, prompt, task):
    """Generate and judge until the judge returns a score within 0..100."""
    # NOTE: unbounded, like the original loop - a judge that never emits a valid
    # "Score: <n>" keeps this retrying (and calling the services) indefinitely.
    while True:
        message = _generate_output(cat, client, name, context, prompt)
        score = _judge_score(prompt, context, message, task)
        if score is not None and 0 <= score <= 100:
            return score


def opt_eval(deployments, prompt,task,selected,avg_array,t):
    """Score every deployment on `prompt` and report the regret of `selected`.

    Each deployment answers `prompt`; a judge model ("gpt-4o" on the "base"
    client) grades the answer from 0 to 100. Generation and grading are repeated
    per deployment until an in-range score is parsed.

    Args:
        deployments: dict keyed by category. The "assistants" value is an
            iterable of (model name, context) pairs; every other value is a
            single (model name, context) pair. The keys "finetune_med" and
            "finetune_tele" use the "finetune" client; any other key is passed
            to `get_client` unchanged.
        prompt: text sent to each deployment (written to data/input.txt for
            assistants).
        task: 'summary' selects the summary rubric; anything else selects the
            diagnosis rubric.
        selected: category key, or the alias "Med"/"Tele" for the two finetune
            keys, whose score becomes the reward.
        avg_array: dict of model name -> running mean score, updated in place.
            Every evaluated model name must already be present.
        t: number of earlier scores already folded into each running mean.

    Returns:
        (best score - reward, reward, avg_array), where best score is the
        highest score of this call.

    Raises:
        ValueError: if no evaluated category matched `selected`.
    """
    result = {}
    reward = None
    for cat in deployments:
        # The alias is derived per category, so it cannot leak from a finetune
        # category into the ones processed after it.
        label = _FINETUNE_ALIASES.get(cat, cat)
        client = get_client("finetune" if cat in _FINETUNE_ALIASES else cat)
        entries = deployments[cat] if cat == 'assistants' else [deployments[cat]]
        for entry in entries:
            name = entry[0]
            context = entry[1]
            score = _score_deployment(cat, client, name, context, prompt, task)
            # Only an accepted score enters the running mean; rejected attempts
            # must not count as extra observations.
            avg_array[name] = (avg_array[name] * t + score) / (t + 1)
            result[name] = score
            if selected in (cat, label):
                reward = score

    print(result)
    print("avg:")
    print(avg_array)
    # Every stored score is within 0..100, so 0 is a safe floor for "no scores".
    best_score = max(result.values(), default=0)
    if reward is None:
        raise ValueError("Reward was not calculated for the selected deployment.")
    return int(best_score)-int(reward),int(reward),avg_array

# deployment = {'assistants' : [("Assistant", "You are an medical diagnosis agent whose primary goal is to give diagnosis based on medical reports. For experimentation purposes only.")], 'finetune' : [("Med","You are an medical diagnosis agent whose primary goal is to give diagnosis based on medical reports. For experimentation purposes only.")], 'base' : []}
# prompt = """{"question": "What is the purpose of the Nmfaf_3daDataManagement_Deconfigure service operation? [3GPP Release 18]",
# 		"option 1": "To configure the MFAF to map data or analytics received by the MFAF to out-bound notification endpoints",
# 		"option 2": "To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints",
# 		"option 3": "To supply data or analytics from the MFAF to notification endpoints",
# 		"option 4": "To fetch data or analytics from the MFAF based on fetch instructions",
# 		"answer": "option 2: To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints",
# 		"explanation": "The Nmfaf_3daDataManagement_Deconfigure service operation is used to stop mapping data or analytics received by the MFAF to one or more out-bound notification endpoints.",
# 		"category": "Standards specifications"}"""

# print(opt_eval(deployment, prompt))