import json
import os
from openai import OpenAI
from tqdm import tqdm

# Fall back to the placeholder only when no key is exported in the
# environment; an unconditional assignment here would clobber a real key.
# Prefer exporting OPENAI_API_KEY over writing a real key into this file.
os.environ.setdefault("OPENAI_API_KEY", "YOUR_API_KEY")
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))  # Shared client used by run_inference

# Define a function to run inference on the model
def run_inference(model, Question, Input, Standard_Answer, Model_Answer):
    """Ask a judge model whether the extracted answer matches the standard answer.

    Args:
        model: Name of the chat model to use as judge.
        Question: Text of the question.
        Input: Input options accompanying the question.
        Standard_Answer: Reference answer to the question.
        Model_Answer: Answer extracted from the evaluated model's output.

    Returns:
        The integer the judge replied with (the prompt asks for 1 when the
        answers are consistent and 0 otherwise). A reply that is missing or
        is not a bare integer is counted as 0.
    """
    # Build the judging prompt
    prompt = f"""Below are two answers to a physics question.\n\n Question is {Question}\n\n Input Options:{Input}\n\n Standard Answer: {Standard_Answer} is the standard answer to the question, and \n\n Model Answer:{Model_Answer} is the answer extracted from a model's output to this question.  Determine whether these two answers are consistent.
    Please note that only when the [Model_answer] completely matches the [Standard Answer] means they are consistent. For non-multiple-choice questions, if the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
    If they are consistent, Judgement is 1; if they are different, Judgement is 0. Just give the final jusgement in 1 or 0 nothing else.
    """
    # Run the model on the prompt
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a expert assistant for you to evaluate and compare the final answer of solution of problem to ground truth answer."},
            {"role": "user", "content": prompt}
        ]
    )
    # Extract the verdict from the model's reply
    return _parse_judgement(completion.choices[0].message.content)


def _parse_judgement(content):
    """Convert the judge's raw reply text into an int, defaulting to 0."""
    # The message content can be None; calling .strip() on it would raise
    # AttributeError and abort the whole evaluation run.
    if content is None:
        return 0
    try:
        return int(content.strip())
    except ValueError:
        # Anything other than a bare integer is treated as "not consistent".
        return 0

# Placeholder paths: set INPUT_PATH to the JSON file with the questions and
# extracted answers, and OUTPUT_PATH to the JSON file that collects the
# judged items. Both must be filled in before running the script.
INPUT_PATH = ''
OUTPUT_PATH = ''
# Number of judged items to accumulate between writes to OUTPUT_PATH
SAVE_EVERY = 10


def _flush_results(pending, path):
    """Append the items in *pending* to the JSON list stored at *path*.

    The file is created when it does not exist yet. Nothing is written when
    *pending* is empty.
    """
    if not pending:
        return
    try:
        with open(path, 'r') as file:
            existing_results = json.load(file)
    except FileNotFoundError:
        # First flush of a fresh run: there are no earlier results to keep.
        existing_results = []
    existing_results.extend(pending)
    with open(path, 'w') as file:
        json.dump(existing_results, file, indent=4)


# Load the JSON file containing the questions and input
with open(INPUT_PATH, 'r') as file:
    data = json.load(file)

save_results = []
# Iterate through each response in the JSON file
for idx, item in enumerate(tqdm(data, desc="Evaluating responses")):
    Question = item['question']
    Input = item['input']
    Standard_Answer = item['answer']
    Model_Answer = item['extracted_answer']

    # Evaluate the model's response
    response = run_inference("gpt-4o", Question, Input, Standard_Answer, Model_Answer)
    print(response)

    # Save the evaluation result
    item['judgement'] = response
    save_results.append(item)

    # Write the accumulated results to disk every SAVE_EVERY responses
    if (idx + 1) % SAVE_EVERY == 0:
        _flush_results(save_results, OUTPUT_PATH)
        save_results = []

# Write the final partial batch; without this the last len(data) % SAVE_EVERY
# judgements would never reach the output file.
_flush_results(save_results, OUTPUT_PATH)
save_results = []
 