#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
compare_judgement_multi.py

For several models, report how well each model's final judgement matches the
human-annotated label in one chosen category file (e.g. coding.json,
knowledge.json, math.json).
"""

import json
from pathlib import Path

# ========= Configuration: edit as needed ========= #
# Ground-truth file whose items carry a "label" (and an "id").
# NOTE: no path is filled in, so this is Path() (the current directory);
# point it at the real label file before running.
GROUND_TRUTH_PATH = Path()

# Category keyword; must be one of:
# "coding", "knowledge", "math", "reasoning", "roleplay", "writing".
KEYWORD = "knowledge"
# ================================================== #

# Model result directories (the original note mentions 19 of them).
# Each directory is expected to contain a "<KEYWORD>.json" file.
# NOTE: the list is empty here; add one Path per model.
MODEL_DIRS = []



def load_json(path: Path):
    """Return the decoded contents of the UTF-8 JSON file at ``path``.

    The callers in this script iterate over the result, i.e. they expect
    the file's top-level value to be an array.
    """
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)


def evaluate_model(truth_dict, model_file: Path):
    """Score one model's judgements against the ground-truth labels.

    Args:
        truth_dict: Mapping from item id (as ``str``) to the human label.
        model_file: JSON file containing an array of objects. Every object
            must have an ``"id"``; the model's verdict is read from
            ``"judge_result"``, falling back to the alternative spelling
            ``"judge result"``.

    Returns:
        ``None`` when ``model_file`` does not exist (a ``[Skip]`` line is
        printed). Otherwise a tuple ``(right, wrong, accuracy)`` where
        ``accuracy`` is a percentage and is ``0.0`` if no item was scored.

    Raises:
        KeyError: If an object in ``model_file`` has no ``"id"`` key.
    """
    if not model_file.exists():
        print(f"[Skip] 文件不存在: {model_file}")
        return None

    records = load_json(model_file)

    right_number = 0
    scored = 0

    for item in records:
        # Ids are normalised to str so that 1 and "1" refer to the same item.
        label = truth_dict.get(str(item["id"]))
        if label is None:
            # No usable ground-truth label for this id: leave it out of the
            # statistics entirely.
            continue

        # Fall back to the alternative key only when the primary one is
        # absent or null. A plain `a or b` would also discard legitimate
        # falsy verdicts such as 0, False or "" and count them as wrong.
        verdict = item.get("judge_result")
        if verdict is None:
            verdict = item.get("judge result")

        scored += 1
        if verdict == label:
            right_number += 1

    wrong_number = scored - right_number
    acc = (right_number / scored) * 100 if scored > 0 else 0.0

    return right_number, wrong_number, acc


def main():
    """Print every configured model's accuracy on the ``KEYWORD`` category.

    Loads the reference labels from ``GROUND_TRUTH_PATH``, then scores
    ``<model_dir>/<KEYWORD>.json`` for each entry of ``MODEL_DIRS`` with
    ``evaluate_model`` and prints one summary line per model. Models for
    which ``evaluate_model`` returns ``None`` are skipped.
    """
    # 1. Load the reference labels, keyed by stringified id.
    #    Path(...) is a no-op for Path objects and lets the configured value
    #    be a plain string as well.
    truth_dict = {
        str(entry["id"]): entry["label"]
        for entry in load_json(Path(GROUND_TRUTH_PATH))
    }

    # 2. Score every model.
    print(f"\n=== 多模型比较结果 ({KEYWORD}.json) ===")
    for model_dir in MODEL_DIRS:
        # Entries written as plain strings would otherwise fail on the `/`
        # join and on `.name` below.
        model_dir = Path(model_dir)
        result = evaluate_model(truth_dict, model_dir / f"{KEYWORD}.json")
        if result is None:
            continue
        right, wrong, acc = result
        print(f"{model_dir.name} 的 {KEYWORD} 文件 ACC 为: {acc:.2f}% "
              f"(Right={right}, Wrong={wrong})")


# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
