from openai import OpenAI
import os

# Client for the model under evaluation, reached through DashScope's
# OpenAI-compatible endpoint.
client = OpenAI(
    # The key is read from the DASHSCOPE_API_KEY environment variable. If that
    # variable is not configured, replace this argument with your Alibaba Cloud
    # Model Studio (Bailian) API key: api_key="sk-xxx"
    api_key=os.getenv("DASHSCOPE_API_KEY", ""),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)


import json

# Client used for the judge model. The key is read from the OPENAI_API_KEY
# environment variable; an unset variable falls back to the empty string.
client2 = OpenAI(api_key=os.getenv("OPENAI_API_KEY", ""))

# Paths of the two line-delimited JSON datasets evaluated below. They are left
# empty here and must be filled in before running (open("") fails).
file_path_1, file_path_2 = "", ""

# Running tallies per dataset: answers judged correct / answers judged at all.
correct_1, total_1 = 0, 0
correct_2, total_2 = 0, 0

# Pass 1: evaluate deepseek-r1-0528 on the "true_belief" items of file_path_1.
# Every non-blank line is parsed as a JSON object; the keys read from it are
# 'story_type', 'containers', 'story', 'question' and 'answer'.
# (A disabled alternative in the original used client.responses.create with
# model "o3-2025-04-16" instead of the streamed chat completion.)
with open(file_path_1, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        # NOTE(review): this breaks on the very first line, so the pass
        # currently processes nothing and total_1 stays 0. Confirm whether
        # disabling this pass is intentional or a leftover line cap.
        if i == 0:
            break
        if not line.strip():
            continue

        item = json.loads(line)
        if item['story_type'] != "true_belief":
            continue

        # 'containers' may be a list of choices or an already-joined string.
        choices_text = item["containers"]
        if isinstance(choices_text, list):
            choices_text = ", ".join(choices_text)

        entire_instruction = f"Story: {item['story']} Question: {item['question']} Choices: {choices_text}"
        answer = item['answer']

        messages = [{"role": "user", "content": entire_instruction}]
        completion = client.chat.completions.create(
            model="deepseek-r1-0528",
            messages=messages,
            stream=True
        )

        # Accumulate only the final-answer text from the stream; any
        # reasoning_content carried by the deltas is deliberately ignored.
        answer_parts = []
        for chunk in completion:
            # Defensive: skip chunks that carry no choices (e.g. usage-only).
            if not chunk.choices:
                continue
            piece = getattr(chunk.choices[0].delta, "content", None)
            if piece:
                answer_parts.append(piece)
        model_output = "".join(answer_parts)

        # Grade the answer with gpt-4o as the judge.
        judge_instruction = f"The correct answer is {answer}. The model output is {model_output}. Determine whether the model output is correct. If it is correct, output True If it is incorrect, output False"

        judge_response = client2.responses.create(
            model="gpt-4o-2024-08-06",
            input=judge_instruction,
            temperature = 0
        )

        print(judge_response.output_text)

        # Normalise the verdict so that "True.", "true" or surrounding
        # whitespace are not silently scored as incorrect.
        verdict = judge_response.output_text.strip().lower()
        total_1 += 1
        if verdict.startswith("true"):
            correct_1 += 1

# Pass 2: evaluate deepseek-r1-0528 on the "true_belief" items of file_path_2.
# Every non-blank line is parsed as a JSON object; the keys read from it are
# 'story_type', 'containers', 'story', 'question' and 'answer'.
# (A disabled alternative in the original used client.responses.create with
# model "o3-2025-04-16" instead of the streamed chat completion.)
with open(file_path_2, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        # Only the first 54 lines of the file (indices 0-53) are considered.
        if i >= 54:
            break
        if not line.strip():
            continue

        item = json.loads(line)
        if item['story_type'] != "true_belief":
            continue

        # 'containers' may be a list of choices or an already-joined string.
        choices_text = item["containers"]
        if isinstance(choices_text, list):
            choices_text = ", ".join(choices_text)

        entire_instruction = f"Story: {item['story']} Question: {item['question']} Choices: {choices_text}"
        answer = item['answer']

        messages = [{"role": "user", "content": entire_instruction}]
        completion = client.chat.completions.create(
            model="deepseek-r1-0528",
            messages=messages,
            stream=True
        )

        # Per-item separator header; the reasoning text itself is not printed.
        print("\n" + "=" * 20 + "思考过程" + "=" * 20)

        # Accumulate only the final-answer text from the stream; any
        # reasoning_content carried by the deltas is deliberately ignored.
        answer_parts = []
        for chunk in completion:
            # Defensive: skip chunks that carry no choices (e.g. usage-only).
            if not chunk.choices:
                continue
            piece = getattr(chunk.choices[0].delta, "content", None)
            if piece:
                answer_parts.append(piece)
        model_output = "".join(answer_parts)

        # Grade the answer with gpt-4o as the judge.
        judge_instruction = f"The correct answer is {answer}. The model output is {model_output}. Determine whether the model output is correct. If it is correct, output True If it is incorrect, output False"

        judge_response = client2.responses.create(
            model="gpt-4o-2024-08-06",
            input=judge_instruction,
            temperature = 0
        )

        print(judge_response.output_text)

        # Normalise the verdict so that "True.", "true" or surrounding
        # whitespace are not silently scored as incorrect.
        verdict = judge_response.output_text.strip().lower()
        total_2 += 1
        if verdict.startswith("true"):
            correct_2 += 1

            
# Final report: raw counts for each dataset, then accuracy to four decimals.
# A pass that judged no items has no defined accuracy; report NaN instead of
# raising ZeroDivisionError, which would discard the other dataset's results.
accuracy_1 = correct_1 / total_1 if total_1 else float("nan")
accuracy_2 = correct_2 / total_2 if total_2 else float("nan")
print(correct_1)
print(total_1)
print(correct_2)
print(total_2)
print(f"{accuracy_1:.4f}")
print(f"{accuracy_2:.4f}")