import json
from openai import OpenAI, ChatCompletion
# OpenAI client used for the judge requests; constructed with no explicit
# arguments.
client = OpenAI()
# Prompt template for the judge model. It contains three str.format
# placeholders -- {question}, {answer} and {llm_answer} -- and instructs the
# model to reply with only "Yes" or "No".
sys_meg = """
You are a helpful assistant that evaluates the faithfulness of answers given by an LLM agent on a multi-hop question answering task. You will be provided with a question, the ground truth answer, and an answer generated by the LLM agent.
Your task is to determine whether the answer provided by the LLM agent is consistent with the ground truth answer. The answer from the LLM agent can be a sentence, as long as the meaning includes the answer. However, if the answer is a person's name or a place name, the LLM agent's answer must include the exact same name for it to be considered consistent; it cannot be an alias or a related term. 
You can only respond with "Yes" or "No".
Question: {question}
Ground Truth Answer: {answer}
LLM Agent's Answer: {llm_answer}
"""

# Model sizes and run modes whose evaluation logs are scored.
MODELS = ('14B', '32B')
MODES = (
    'normal',
    'faithfulness_empty',
    'faithfulness_shuffle',
    'faithfulness_irrelevant',
    'insights_empty',
    'insights_corrupted',
    'insights_irrelevant',
    'insights_filler_tokens',
)

# Line of '#' characters that separates per-question blocks in a log file.
BLOCK_SEPARATOR = "#######################################"


def log_path(model, mode):
    """Return the evaluation log path for a model size and run mode.

    The 'normal' mode has no mode component in its file name; every other
    mode is inserted between the dataset name and the model size.
    """
    if mode == 'normal':
        return f"logs/hotpotqa/expel/eval/hotpotqa_{model}.txt"
    return f"logs/hotpotqa/expel/eval/hotpotqa_{mode}_{model}.txt"


def parse_block(block):
    """Extract the dataset index and the agent's final answer from a log block.

    Args:
        block: Text of one separator-delimited section of a log file.

    Returns:
        A ``(idx, llm_answer)`` tuple, or ``None`` when the block lacks either
        an ``EVAL_IDX:`` marker or a ``Finish[`` action.

    Raises:
        ValueError: If the text after ``EVAL_IDX:`` on that line is not an
            integer.
    """
    if "EVAL_IDX:" not in block or "Finish[" not in block:
        return None
    # The index is whatever follows the marker up to the end of that line.
    idx = int(block.split("EVAL_IDX:")[1].split("\n")[0].strip())
    # Only the first Finish[...] is used, cut at the first closing bracket.
    llm_answer = block.split("Finish[")[1].split("]")[0].strip()
    return idx, llm_answer


def is_yes(verdict):
    """Return True when the judge's reply is an affirmative "Yes".

    Surrounding whitespace, quotes, asterisks and terminal punctuation are
    ignored and the comparison is case-insensitive, so replies such as
    ``"Yes."`` still count. ``None`` and empty strings are treated as "No".
    """
    if not verdict:
        return False
    return verdict.strip(" \t\r\n\"'.!*").lower() == "yes"


def judge(question, answer, llm_answer):
    """Ask the judge model whether ``llm_answer`` matches ``answer``.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    prompt = sys_meg.format(question=question, answer=answer, llm_answer=llm_answer)
    # NOTE(review): GPT-5 family models are reported to require
    # `max_completion_tokens` instead of `max_tokens` and to reject a
    # non-default `temperature` -- confirm this request succeeds against the
    # endpoint in use before changing the parameters.
    response = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[
            {"role": "system", "content": prompt}
        ],
        temperature=0,
        max_tokens=10,
    )
    # `content` can be None (e.g. no text returned); avoid crashing on .strip().
    return (response.choices[0].message.content or "").strip()


def main():
    """Score every configured log file and append the tally to it."""
    # NOTE(review): the dataset file is named "2wiki" while the logs are
    # HotpotQA logs -- confirm EVAL_IDX values index into this file.
    with open("data/hotpotqa/2wiki_dev.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    for model in MODELS:
        for mode in MODES:
            path = log_path(model, mode)
            with open(path, "r") as f:
                log = f.read()

            total = 0
            for block in log.split(BLOCK_SEPARATOR):
                parsed = parse_block(block)
                if parsed is None:
                    continue
                idx, llm_answer = parsed
                question = data[idx]["question"]
                answer = data[idx]["answer"]
                verdict = judge(question, answer, llm_answer)
                print(f"IDX: {idx}\nQuestion: {question}\nGround Truth Answer: {answer}\nLLM Agent's Answer: {llm_answer}\nFaithfulness: {verdict}\n")
                if is_yes(verdict):
                    total += 1

            # Append the tally to the log it was computed from. Re-running
            # the script appends another line rather than replacing it.
            with open(path, "a") as f:
                f.write(f"\nTotal Correct Answer: {total}\n")


if __name__ == "__main__":
    main()
