import json
import re
import requests
import os
from collections import OrderedDict

# ============================= Verdict 推断函数 ============================= #
def infer_verdict(text: str) -> str:
    """
    Infer the verdict tag contained in *text*.

    Lookup order (the first rule that matches wins):
      1) An explicit ``Verdict: [[A]]`` style line (matched case-insensitively).
      2) A bare ``[[A]]`` / ``[[B]]`` / ``[[C]]`` tag anywhere in the text.
      3) Keyword heuristic: count phrases of the form
         "assistant a ... better/..." versus "assistant b ... better/...".

    When a rule finds several tags, the last one in the text is used.

    Args:
        text: Judge output to scan. ``None`` or an empty string is treated
            as containing no verdict.

    Returns:
        Always one of the canonical upper-case tags ``'[[A]]'``,
        ``'[[B]]'`` or ``'[[C]]'``. ``'[[C]]'`` (tie) is the fallback when
        nothing can be inferred or the heuristic counts are equal.
    """
    if not text:
        return "[[C]]"

    # 1) Explicit "Verdict: [[X]]".
    explicit = re.findall(r"verdict\s*:\s*\[\[([ABC])\]\]", text, flags=re.I)
    if explicit:
        # re.I also lets a lower-case letter through ("verdict: [[a]]");
        # normalise so callers only ever see the three canonical tags.
        return f"[[{explicit[-1].upper()}]]"

    # 2) Bare [[X]] tag.
    bare = re.findall(r"\[\[([ABC])\]\]", text)
    if bare:
        return f"[[{bare[-1]}]]"

    # 3) Keyword heuristic.
    lower = text.lower()
    keywords = r"(better|wins|outperforms|superior|stronger|more\s+accurate|" \
               r"more\s+detailed|more\s+helpful|more\s+comprehensive|more\s+robust)"
    # The \b after the letter keeps "assistant answers ..." / "assistant both ..."
    # from being counted as a mention of Assistant A / Assistant B.
    # [^.] keeps a match from running past the end of the sentence.
    a_hits = len(re.findall(r"assistant\s+a\b[^.]{0,80}?\b" + keywords, lower))
    b_hits = len(re.findall(r"assistant\s+b\b[^.]{0,80}?\b" + keywords, lower))

    if a_hits > b_hits:
        return "[[A]]"
    if b_hits > a_hits:
        return "[[B]]"
    return "[[C]]"   # default: tie


# ============================== OpenAI Chat 调用 ============================ #
def query(prompt: str, model_name: str, timeout: float = 600.0):
    """
    Send a single-turn chat-completion request and return the model output.

    The endpoint and credential are read from the environment:
      - ``JUDGE_API_URL``: full URL of the chat-completions endpoint.
      - ``JUDGE_API_KEY``: API key, sent as ``Authorization: Bearer <key>``.

    NOTE(review): in this file the original ``url`` assignment and the
    ``Authorization`` header had no value at all. The environment-variable
    names and the Bearer scheme are choices made here - confirm them
    against the actual deployment.

    Args:
        prompt: Text sent as the single user message.
        model_name: Value of the ``model`` field in the request body.
        timeout: Seconds to wait for the HTTP request before giving up.
            Generous by default; without any timeout a stalled connection
            would block the run forever.

    Returns:
        A tuple ``(think, resp, cost)``:
          - ``think``: stripped ``reasoning_content`` of the first choice,
            or ``""`` when the field is absent or null.
          - ``resp``: stripped ``content`` of the first choice (``""`` when
            the field is null).
          - ``cost``: dict with ``prompt_tokens``, ``completion_tokens`` and
            ``total_tokens`` copied from the response's ``usage`` object.

    Raises:
        RuntimeError: if either environment variable is unset or empty.
        requests.HTTPError: if the server answers with an error status.
        KeyError / IndexError: if the response body lacks the expected fields.
    """
    url = os.environ.get("JUDGE_API_URL")
    api_key = os.environ.get("JUDGE_API_KEY")
    if not url or not api_key:
        raise RuntimeError(
            "JUDGE_API_URL and JUDGE_API_KEY must be set in the environment"
        )

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}]
    }

    http_rsp = requests.post(url, headers=headers, json=data, timeout=timeout)
    # Fail with the HTTP status instead of an opaque KeyError on an error payload.
    http_rsp.raise_for_status()
    rsp = http_rsp.json()
    msg = rsp["choices"][0]["message"]

    # `or ""` also covers an explicit JSON null, which .get(..., "") does not.
    think = (msg.get("reasoning_content") or "").strip()
    resp = (msg["content"] or "").strip()

    cost = {k: rsp["usage"][k] for k in ("prompt_tokens",
                                         "completion_tokens",
                                         "total_tokens")}
    return think, resp, cost


# =================================== Judge ================================= #
class Judge:
    """Pairwise judge: asks a model to compare two answers and extracts its verdicts."""

    def __init__(self, model_name: str):
        """Store the model name that is passed to ``query`` on every call."""
        self.model_name = model_name

    @staticmethod
    def generate_prompt(question: str, ans_a: str, ans_b: str) -> str:
        """
        Build the judging prompt for one question and two candidate answers.

        The prompt instructs the model to:
          - end its reasoning_content with a `Verdict: [[X]]` line
          - end its main content with the same `[[X]]` tag
        """
        return f"""
[System]

You are an impartial judge comparing two AI assistants.  
**Write in *two* parts**:

1. **reasoning_content** (returned via the `reasoning_content` field):  
   • Provide a detailed comparative analysis.  
   • **At the very end of this field, on a new line, write exactly:**  
     `Verdict: [[A]]`, `Verdict: [[B]]`, or `Verdict: [[C]]`.

2. **main response content** (normal `content` field):  
   • Give a concise explanation of your decision.  
   • **End with the same tag** `[[A]]`, `[[B]]`, or `[[C]]`.

**Verdict rules:**  
- [[A]] if Assistant A is better, [[B]] if Assistant B is better, [[C]] for a tie.

**Evaluation Guidelines:**  
- Helpfulness, relevance, accuracy, depth, creativity, detail.  
- Compare answers directly and be objective.  
- Avoid position/length/name biases.

[User Question]
{question}

[Assistant A's Answer]
{ans_a}

[Assistant B's Answer]
{ans_b}

[Your Evaluation]
"""

    def __call__(self, question: str, ans_a: str, ans_b: str):
        """
        Judge one pair of answers.

        Returns a dict with the (possibly tag-suffixed) reasoning text and
        response text, the relation derived from each of them
        ("A>B" / "B>A" / "A=B"), and the token usage reported by ``query``.
        """
        reasoning, answer, usage = query(
            self.generate_prompt(question, ans_a, ans_b),
            self.model_name,
        )

        # The two texts are labelled independently, so their verdicts may differ.
        reasoning_tag = infer_verdict(reasoning)   # [[A]] / [[B]] / [[C]]
        answer_tag = infer_verdict(answer)

        # Ensure each text ends with its own tag; append it only when absent.
        if not reasoning.rstrip().endswith(reasoning_tag):
            reasoning = f"{reasoning}\n\n{reasoning_tag}"
        if not answer.rstrip().endswith(answer_tag):
            answer = f"{answer}\n\n{answer_tag}"

        # Tag -> relation between the two assistants.
        relation = {"[[A]]": "A>B", "[[B]]": "B>A", "[[C]]": "A=B"}

        return {
            "judge_thinking":  reasoning,
            "thinking_result": relation[reasoning_tag],
            "judge_response":  answer,
            "judge result":    relation[answer_tag],
            "cost_tokens":     usage
        }


# ================================= 主流程 =================================== #
def main():
    """
    Judge every record of a JSON file and write the verdicts to a new file.

    Command line: ``INPUT_JSON OUTPUT_DIR``

    NOTE(review): in this file the original ``input_path`` and
    ``output_dir`` assignments had no value, so both are taken from the
    command line here - confirm this matches how the script is launched.

    The input must hold either one record or a list of records. Each record
    needs the keys ``question``, ``response_A`` and ``response_B``; ``id``,
    ``model_A`` and ``model_B`` are copied through when present. The result
    list is written to ``OUTPUT_DIR/<input stem>_judged.json`` and every
    record is also printed as it is produced.
    """
    import argparse  # local import: only the script entry point needs it

    parser = argparse.ArgumentParser(
        description="Run the pairwise judge over a JSON file of answer pairs."
    )
    parser.add_argument("input_path", help="JSON file with the records to judge")
    parser.add_argument("output_dir", help="directory for the *_judged.json file")
    args = parser.parse_args()

    input_path = args.input_path
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # Single source of truth for the model name used and recorded below.
    model_name = "deepseek-r1"
    judge = Judge(model_name)
    results = []

    with open(input_path, encoding="utf-8") as f:
        data = json.load(f)
    # Accept a single record as well as a list of records.
    data = data if isinstance(data, list) else [data]

    for item in data:
        rec = judge(item["question"], item["response_A"], item["response_B"])

        ordered = OrderedDict()
        ordered["id"]              = item.get("id", "")
        ordered["question"]        = item.get("question", "")
        ordered["response_a"]      = item.get("response_A", "")
        ordered["response_b"]      = item.get("response_B", "")
        ordered["model_A"]         = item.get("model_A", "")
        ordered["model_B"]         = item.get("model_B", "")
        ordered["judge model"]     = judge.model_name
        ordered["judge_thinking"]  = rec["judge_thinking"]
        ordered["thinking_result"] = rec["thinking_result"]   # A>B / B>A / A=B
        ordered["judge_response"]  = rec["judge_response"]
        ordered["judge result"]    = rec["judge result"]      # A>B / B>A / A=B
        ordered["cost_tokens"]     = rec["cost_tokens"]

        results.append(ordered)
        print(json.dumps(ordered, ensure_ascii=False, indent=2))

    # Build the name from the stem rather than str.replace(".json", ...):
    # replace() rewrites every ".json" occurrence and leaves the name
    # unchanged (risking an overwrite of the input) when there is none.
    stem = os.path.splitext(os.path.basename(input_path))[0]
    out_path = os.path.join(output_dir, f"{stem}_judged.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    print(f"\n✅ 评估结果已保存到：{out_path}")


# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
