"""
Abstain and Answer evaluation
"""
import numpy as np  
import openai
import json
import argparse
import os
from tenacity import retry, stop_after_attempt, wait_chain, wait_fixed
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from llm_evaluate import TEMPLATE
import argparse


from tqdm import tqdm

# Command-line interface: --question_type selects which question set is
# evaluated; it is used below to build the input and output file names.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--question_type",
    default="who_are_you",
    type=str,
)
args = parser.parse_args()


# OpenAI initialization.
# Take the key from the OPENAI_API_KEY environment variable when it is set so
# that real credentials never need to be written into this file; the literal
# "API_KEY" is only a placeholder fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "API_KEY")
# Maximum number of attempts passed to tenacity's stop_after_attempt.
STOP_AFTER_ATTEMPT = 10

@retry(
    stop=stop_after_attempt(STOP_AFTER_ATTEMPT),
    # Wait chain of fixed delays: 3 s, 3 s, then 5 s.
    wait=wait_chain(wait_fixed(3), wait_fixed(3), wait_fixed(5)),
)
def chat_gpt_call_vanilla(content):
    """Send ``content`` as one user message and return the reply text.

    The request goes to the chat-completions endpoint with a fixed system
    message. Failing calls are retried by the tenacity decorator, up to
    STOP_AFTER_ATTEMPT attempts.

    Args:
        content: Prompt text placed in the user message.

    Returns:
        The message content of the first returned choice, stripped of
        leading and trailing whitespace.
    """
    conversation = [
        {"role": "system", "content": "Please follow the instructions carefully."},
        {"role": "user", "content": content},
    ]

    request_options = dict(
        model='gpt-4.1-mini',
        messages=conversation,
        # Near-zero sampling parameters.
        temperature=.0000000000000000000001,
        top_p=.0000000000000000000001,
        # 800 is a placeholder limit; adjust when needed.
        max_tokens=800,
        frequency_penalty=0,
        presence_penalty=0,
        # Stop sequence; adjust it to match whatever end marker the prompt
        # asks the model to emit.
        stop=["# END"],
    )

    completion = openai.chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# Models evaluated when the caller does not pass its own list.
_DEFAULT_MODEL_NAMES = (
    "moonshotai/kimi-k2",
    "openai/gpt-4.1-mini",
    "openai/gpt-4.1",
    "openai/gpt-5",
    "anthropic/claude-sonnet-4",
    "qwen/qwen3-235b-a22b-2507",
    "google/gemini-2.5-flash",
    "x-ai/grok-4",
    "z-ai/glm-4.5",
    "deepseek/deepseek-chat-v3-0324",
)


def _write_results(output_file, results):
    """Overwrite ``output_file`` with ``results`` serialized as a JSON array."""
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4)
        f.write('\n')


# NOTE: deliberately not wrapped in @retry. Individual requests are already
# retried inside chat_gpt_call_vanilla; retrying this whole function would
# repeat every API call made so far.
def evaluate_answers(input_file, output_file, model_names=None):
    """Evaluate each model's answers and store the results as JSON.

    For every model name, each entry of ``data[model_name]`` is formatted into
    TEMPLATE and sent to ``chat_gpt_call_vanilla``; the replies are collected
    in order. The output file is rewritten as a complete, valid JSON array
    after each model, so partial progress remains parseable if a later model
    fails.

    Args:
        input_file: Path to a JSON file holding an object that maps each
            model name to an iterable of that model's answers.
        output_file: Path of the JSON file to write. Any existing content is
            replaced.
        model_names: Optional iterable of model names to evaluate. Defaults
            to the built-in list in ``_DEFAULT_MODEL_NAMES``.

    Raises:
        KeyError: If a requested model name is missing from the input data.
    """
    if model_names is None:
        model_names = _DEFAULT_MODEL_NAMES

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    results = []
    # Start from a valid empty array so output left by an earlier run is
    # never mixed with the results of this one.
    _write_results(output_file, results)

    for model_name in model_names:
        model_answer = data[model_name]

        evaluations = []
        for question in tqdm(model_answer, desc=f"Evaluating {model_name}"):
            # Construct the prompt
            prompt = TEMPLATE.format(model_name=model_name, model_answer=question)

            answer = chat_gpt_call_vanilla(prompt)
            evaluations.append(answer)
            print(answer)

        results.append({
            "model_name": model_name,
            "model_answer": model_answer,
            "evaluation": evaluations,
        })
        # Checkpoint: rewrite the whole array after each model so the file is
        # always well-formed JSON (no trailing comma, no unterminated list).
        _write_results(output_file, results)
     
# Run configuration assembled from the parsed command-line arguments.
cfg = dict(question_type=args.question_type)

if __name__ == "__main__":
    # Results are written under evaluation/<question_type>/; create the
    # directory if it does not exist yet.
    dir_path = f"evaluation/{args.question_type}"
    os.makedirs(dir_path, exist_ok=True)

    # The input file name is derived from the question type.
    answers_path = f'{args.question_type}_detailed_existence_10_regulated.json'
    report_path = f'{dir_path}/{args.question_type}.json'
    evaluate_answers(answers_path, report_path)




