'''
Author: swchen
Date: 2025-09-17 15:07:11
LastEditors: swchen
LastEditTime: 2025-09-25 17:24:08
FilePath: /SupervisorAgent/smolagents/examples/smolagents_benchmark_drop/eval.py
Description: Uses an LLM judge to mark each prediction in JSONL result files as correct or incorrect.

Copyright (c) 2025 by Shaowen Chen, All Rights Reserved. 
'''
# EXAMPLE COMMAND: from folder examples/smolagents_benchmark_drop, run: python eval.py --file-path-1 run1.jsonl --file-path-res-1 run1_eval.jsonl --file-path-2 run2.jsonl --file-path-res-2 run2_eval.jsonl
import argparse
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import Any


import json
from pathlib import Path
import requests
append_answer_lock = threading.Lock()


def load_jsonl_multiline(path: str) -> list:
    """Read a file of concatenated JSON values, each possibly spanning several lines.

    The file is decoded in a single pass with ``json.JSONDecoder.raw_decode``,
    so a pretty-printed (indented) record is parsed once instead of being
    re-parsed after every line. Values may be separated by any amount of
    whitespace, including none.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        The decoded values in file order. If the file contains malformed or
        truncated JSON, decoding stops there, a warning is emitted, and the
        values decoded so far are returned.
    """
    import warnings

    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    decoder = json.JSONDecoder()
    records = []
    pos = 0
    end = len(text)
    while True:
        # raw_decode() does not skip leading whitespace on its own.
        while pos < end and text[pos].isspace():
            pos += 1
        if pos >= end:
            break
        try:
            record, pos = decoder.raw_decode(text, pos)
        except json.JSONDecodeError as exc:
            # Do not drop the remainder silently: say where parsing stopped.
            warnings.warn(
                f"{path}: stopped parsing at character {exc.pos} ({exc.msg}); "
                f"{len(records)} record(s) loaded",
                stacklevel=2,
            )
            break
        records.append(record)
    return records


def append_answer(entry: dict, jsonl_file: str) -> None:
    """Append ``entry`` to ``jsonl_file`` as one indented JSON object.

    The parent directory is created when missing. The write happens while
    holding the module-level ``append_answer_lock``. The entry is passed
    through ``to_builtin_type`` and serialized with a 2-space indent and
    non-ASCII characters kept as-is, followed by a newline. Afterwards the
    resolved path of the file is printed.

    Args:
        entry: The record to write.
        jsonl_file: Path of the file to append to.
    """
    jsonl_path = Path(jsonl_file)
    jsonl_path.parent.mkdir(parents=True, exist_ok=True)

    with append_answer_lock:
        with open(jsonl_file, "a", encoding="utf-8") as fp:
            serialized = json.dumps(
                to_builtin_type(entry),
                ensure_ascii=False,
                indent=2,
            )
            fp.write(serialized + "\n")

    assert jsonl_path.exists(), "File not found!"
    print("Answer exported to file:", jsonl_path.resolve())
    

def to_builtin_type(obj):
    """Recursively replace integer wrappers and NumPy scalars with Python builtins.

    Dicts and lists are rebuilt with their values/elements converted. Any
    other object is returned unchanged unless it is:

    - an instance of a class literally named ``Integer`` (converted with
      ``int()``; the original author's note mentions pyarrow/pandas/numpy
      integer wrappers), or
    - a NumPy boolean, integer or floating scalar (converted with ``bool()``,
      ``int()`` or ``float()`` respectively).

    Args:
        obj: The value to convert.

    Returns:
        The converted structure; dict and list containers are new objects.
    """
    import sys

    # A NumPy scalar can only exist if numpy has already been imported, so a
    # single sys.modules lookup replaces an import attempt per visited node
    # (which is a full failed import search each time when numpy is absent).
    np = sys.modules.get("numpy")

    def convert(value):
        if isinstance(value, dict):
            return {k: convert(v) for k, v in value.items()}
        if isinstance(value, list):
            return [convert(item) for item in value]
        if type(value).__name__ == "Integer":
            return int(value)
        if np is not None:
            if isinstance(value, np.bool_):
                return bool(value)
            if isinstance(value, np.integer):
                return int(value)
            if isinstance(value, np.floating):
                return float(value)
        return value

    return convert(obj)


def compare_res(input1, input2, question):
    """Ask an LLM judge whether two answers to ``question`` are equivalent.

    A single user message containing the prompt below is POSTed to the
    ``/chat/completions`` route of the configured endpoint. Configuration is
    read from the environment so that no credential lives in the source:

    - ``LLM_API_KEY`` (required): bearer token sent in the Authorization header.
    - ``LLM_BASE_URL`` (optional): defaults to ``https://zjuapi.com/v1``.
    - ``LLM_MODEL_NAME`` (optional): defaults to ``gpt-4.1``.

    Args:
        input1: First answer; interpolated into the prompt.
        input2: Second answer; interpolated into the prompt.
        question: The question both answers respond to.

    Returns:
        ``True`` if the model's reply, after stripping surrounding whitespace,
        backticks, quotes, asterisks and periods, equals ``"true"``
        case-insensitively; ``False`` for any other reply; ``None`` when the
        request fails, the status code is not 200, or the response body does
        not have the expected shape.

    Raises:
        RuntimeError: If ``LLM_API_KEY`` is not set.
    """
    api_key = os.environ.get("LLM_API_KEY")
    if not api_key:
        raise RuntimeError(
            "LLM_API_KEY environment variable is not set; it is required to call the judge model"
        )
    base_url = os.environ.get("LLM_BASE_URL", "https://zjuapi.com/v1").rstrip("/")
    model_name = os.environ.get("LLM_MODEL_NAME", "gpt-4.1")

    prompt = f"""# Answer Equivalence Task

Determine if two answers are semantically equivalent regardless of formatting differences. Consider numerical expressions, text answers, boolean values, and various representations. Here is the question:
{question}

**Rules:**
- Same meaning/value = True
- Different meanings/values = False
- Different units = False (e.g., "3 m" ≠ "3 cm")
- Ignore formatting, case, and minor wording differences
- Consider mathematical equivalence
- Consider semantic equivalence for text answers

**Examples:**
**Numerical:**
- `1/2` vs `0.5` → True
- `\\frac{{3}}{{4}}` vs `75%` → True  
- `\\sqrt{{16}}` vs `4` → True
- `3.5 m` vs `3.5 cm` → False
- `2^3` vs `8` → True
- `π` vs `3.14159` → True

**Text/Boolean:**
- `Yes` vs `True` → True
- `No` vs `False` → True
- `correct` vs `right` → True
- `New York` vs `NYC` → True
- `cat` vs `dog` → False

**Lists/Sets:**
- `[1,2,3]` vs `[3,1,2]` → True (if order doesn't matter)

**Case Insensitive:**
- `LONDON` vs `london` → True
- `DNA` vs `dna` → True

**Output format:** Just return `True` or `False`

**Compare:**
Input1: `{input1}`
Input2: `{input2}`

**Answer:**"""

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 50,
        "temperature": 0.1,
    }

    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block the evaluation loop forever.
        response = requests.post(
            f"{base_url}/chat/completions",
            json=payload,
            headers=headers,
            timeout=60,
        )
    except requests.RequestException as exc:
        print(f"compare_res: request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"compare_res: unexpected HTTP status {response.status_code}")
        return None

    try:
        content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError):
        print("compare_res: response body does not have the expected shape")
        return None
    if not isinstance(content, str):
        return None

    # The prompt itself shows the answer in backticks, so tolerate replies
    # such as "`True`" or "True." instead of reading them as False.
    verdict = content.strip().strip("`'\"*.").strip().lower()
    return verdict == "true"


def parse_args():
    """Parse the command-line options of this script.

    Four string options are defined, each defaulting to the empty string:
    ``--file-path-1``, ``--file-path-2``, ``--file-path-res-1`` and
    ``--file-path-res-2``.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    path_options = (
        ("--file-path-1", "Path to the input JSONL file."),
        ("--file-path-2", "Path to the second input JSONL file."),
        ("--file-path-res-1", "Path to the output JSONL file with results."),
        ("--file-path-res-2", "Path to the second output JSONL file with results."),
    )
    parser = argparse.ArgumentParser()
    for flag, description in path_options:
        parser.add_argument(flag, type=str, default="", help=description)
    return parser.parse_args()


if __name__ == "__main__":
    # Judge every record of each supplied input file and append it, with an
    # added "is_correct" field, to the matching result file.
    args = parse_args()

    # (input file, result file) pairs. All options default to "", so a pair
    # whose input path was not supplied is skipped rather than opened.
    jobs = [
        (src, dst)
        for src, dst in (
            (args.file_path_1, args.file_path_res_1),
            (args.file_path_2, args.file_path_res_2),
        )
        if src
    ]
    if not jobs:
        raise SystemExit("No input file given: pass --file-path-1 and/or --file-path-2")
    # Validate result paths up front, before any judge request is made.
    for src, dst in jobs:
        if not dst:
            raise SystemExit(f"No result file given for input file {src!r}")

    # Load every input before judging so an unreadable file fails fast.
    loaded = [(load_jsonl_multiline(src), dst) for src, dst in jobs]

    for records, dst in loaded:
        for r in records:
            r["is_correct"] = compare_res(r["prediction"], r["true_answer"], r["question"])
            append_answer(r, dst)