import json
import os
import time

from openai import OpenAI


def api_predict(messages=None, model="gpt-4o"):
    """Send a chat conversation to the model and return the reply text.

    Args:
        messages: List of chat message dicts (``{"role": ..., "content": ...}``).
            When omitted, the module-level ``conversation`` variable is used,
            which is what this function originally relied on; a ``NameError``
            is raised if that global has not been defined.
        model: Name of the chat model to query.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    if messages is None:
        # Backward-compatible fallback to the global the original code read.
        messages = conversation
    response = client.chat.completions.create(
        model=model,
        messages=messages,
    )
    return response.choices[0].message.content

# Endpoint and credentials come from the environment so that secrets are never
# committed with the source. Set OPENAI_API_KEY (and optionally
# OPENAI_BASE_URL / GROK_API_KEY) before running this script.
base_url = os.environ.get("OPENAI_BASE_URL", "http://180.184.175.69:3000/v1")
# NOTE: despite its name this holds an API key, not a URL; the name is kept
# for compatibility. It is not used anywhere in the visible part of this file.
grok_url = os.environ.get("GROK_API_KEY")
api_key = os.environ.get("OPENAI_API_KEY")
# NOTE(review): the OpenAI client is expected to raise at construction time
# when no API key is available - confirm against the installed SDK version.
client = OpenAI(base_url=base_url, api_key=api_key)
# client = AsyncOpenAI(base_url=base_url, api_key=api_key)

from openai import OpenAI

# client = OpenAI()

def LLM_judge(infer_answer, question_answer):
    """Ask the chat model whether an inferred answer matches the reference.

    Args:
        infer_answer: The answer produced by the system under evaluation.
        question_answer: The reference answer to compare against.

    Returns:
        ``True`` when the model's verdict is "true" (case-insensitive, ignoring
        surrounding whitespace, quotes, backticks and a trailing period);
        ``False`` for any other reply, including an empty one.
    """
    prompt = [
        {
            "role": "system",
            "content": "You are a helpful assistant that determines whether the inferred answer matches the correct answer. If the error range of the number is less than 1, we can consider it correct. Return 'True' if they match, and 'False' if they do not."
        },
        {
            "role": "user",
            "content": f"Inferred Answer: {infer_answer}\nCorrect Answer: {question_answer}\nDo these answers match? Return only 'True' or 'False'."
        }
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=prompt,
        max_tokens=1024,
    )

    # The message content may be None; treat that as "no verdict".
    content = response.choices[0].message.content or ""
    # The prompt itself writes the verdict as 'True' in quotes, so replies such
    # as "'True'" or "True." must be accepted as well as the bare word.
    verdict = content.strip().strip("'\"`.").strip().lower()
    return verdict == "true"

def run_generated_code(result_python_code, initialized_variables):
    """Execute generated source code and call the function it defines.

    The code is executed in a fresh namespace. Only functions defined by the
    executed code itself are considered, so callables it merely imports (for
    example ``from math import sqrt``) are never mistaken for the solver.
    Among those, the first function whose signature accepts
    ``initialized_variables`` as keyword arguments is called; if none does,
    the first defined function is called so the resulting ``TypeError``
    reaches the caller.

    Args:
        result_python_code: Python source text expected to define a function.
        initialized_variables: Mapping of keyword-argument names to values.

    Returns:
        Whatever the selected function returns.

    Raises:
        IndexError: If the executed code defines no function.
        Exception: Anything raised while executing the code or the function.
    """
    import inspect

    # SECURITY: exec() runs arbitrary code with this process's privileges.
    # Only feed it code from a trusted source or run inside a sandbox.
    exec_env = {}
    exec(result_python_code, exec_env)

    candidates = [
        obj
        for obj in exec_env.values()
        if inspect.isfunction(obj) and obj.__globals__ is exec_env
    ]
    if not candidates:
        raise IndexError("generated code does not define any function")

    for func in candidates:
        try:
            inspect.signature(func).bind(**initialized_variables)
        except TypeError:
            # Signature does not accept these variables; try the next one.
            continue
        return func(**initialized_variables)

    # No signature matched: call the first function so that the caller sees
    # the TypeError describing the missing or unexpected arguments.
    return candidates[0](**initialized_variables)

def remove_code_block_markers(text):
    """Return *text* without the lines that open or close a fenced code block.

    A line is dropped when, ignoring surrounding whitespace, it begins with
    three backticks. All other lines are kept unchanged and re-joined with
    newline characters.
    """
    kept = []
    for row in text.split("\n"):
        if row.strip().startswith("```"):
            continue
        kept.append(row)
    return "\n".join(kept)


if __name__ == "__main__":
    # Evaluation driver: for every stored inference result, execute its
    # generated reference code to obtain the expected answer, ask the LLM
    # judge whether the model's response matches it, and print statistics.

    file_path = "./datasets/EncycloBench_raw.jsonl"
    infer_result_path = "./results_temp_result_mar9.json"

    with open(infer_result_path, "r", encoding="utf-8") as f:
        infer_results = json.load(f)

    # Raw benchmark questions (one JSON object per line). They are loaded but
    # not used further below. Blank lines are skipped instead of crashing.
    with open(file_path, "r", encoding="utf-8") as f:
        init_questions = [json.loads(line) for line in f if line.strip()]

    total_tests = 0
    correct_count = 0
    missing_variables_count = 0
    judge_error_count = 0

    for total_tests, infer_result in enumerate(infer_results, start=1):
        ans_py_code = remove_code_block_markers(infer_result["python_code"])
        infer_ans = infer_result["response"]
        ans_variables = infer_result["init_variables"]

        try:
            question_answer = run_generated_code(ans_py_code, ans_variables)
            if isinstance(question_answer, list):
                question_answer = ",".join(str(item) for item in question_answer)
            else:
                question_answer = str(question_answer)
        except (Exception, SystemExit) as exc:
            # SystemExit is caught because generated code may call exit();
            # KeyboardInterrupt is deliberately left to stop the run.
            print(f"Test {total_tests}: Missing variables")
            print(f"  ({type(exc).__name__}: {exc})")
            missing_variables_count += 1
            continue

        try:
            is_correct = LLM_judge(infer_ans, question_answer)
        except Exception as exc:
            # A failing judge call is not a "missing variables" problem, so
            # it is reported and counted separately.
            print(f"Test {total_tests}: Judge failed ({type(exc).__name__}: {exc})")
            judge_error_count += 1
            continue

        print(f"Test {total_tests}: Correct = {is_correct}")
        if is_correct:
            correct_count += 1

    print("\nFinal Statistics:")
    print(f"Total Tests: {total_tests}")
    print(f"Correct Answers: {correct_count}")
    print(f"Missing Variables: {missing_variables_count}")
    print(f"Judge Errors: {judge_error_count}")
    if total_tests > 0:
        accuracy = correct_count / total_tests * 100
        print(f"Accuracy: {accuracy:.2f}%")
    else:
        print("No tests were run.")