#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import re
import requests
import os
import time
from collections import OrderedDict
import traceback

def infer_wrong_type(text: str) -> str:
    """
    Extract the error-type label from a model reply.

    Scans ``text`` (case-insensitively) for occurrences of
    ``Error type detected: [[N]]`` where N is a digit from 1 to 6 and
    returns the label of the last occurrence as ``"[[N]]"``. When no such
    occurrence exists, ``"[[0]]"`` is returned to mean "not recognized".
    """
    last_digit = None
    # Walk every match and keep overwriting, so the final match wins.
    for hit in re.finditer(r"error\s*type[s]?\s*detected\s*:\s*\[\[([1-6])\]\]", text, flags=re.I):
        last_digit = hit.group(1)

    if last_digit is None:
        return "[[0]]"
    return f"[[{last_digit}]]"

def query(prompt: str, model_name: str):
    """
    Send one single-turn chat request and return the reply, token usage and timing.

    A JSON body holding ``model_name`` and a single user message with
    ``prompt`` is POSTed to ``url`` with ``requests.post``; the response is
    decoded as JSON and printed.

    Returns:
        A tuple ``(resp, cost, elapsed)``:
        - ``resp``: the stripped ``content`` of the first choice's message;
        - ``cost``: a dict with the ``prompt_tokens``, ``completion_tokens``
          and ``total_tokens`` entries of the response's ``usage`` field;
        - ``elapsed``: wall-clock seconds spent in the request (including
          JSON decoding), rounded to two decimals.

    No error handling is done here: a response that lacks the ``choices`` or
    ``usage`` fields read below fails on the corresponding subscript lookup.
    """
    # NOTE(review): the endpoint URL is missing in this copy of the file;
    # it must be filled in before the function can run.
    url = 
    headers = {
        # NOTE(review): the Authorization header value is missing as well.
        "Authorization": 
        "Content-Type": "application/json"
    }
    data = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}]
    }

    t0 = time.time()
    # NOTE(review): no timeout is passed to requests.post and the HTTP status
    # is not checked before decoding the body — confirm this is intended.
    rsp = requests.post(url, headers=headers, json=data).json()
    t1 = time.time()
    # Dump the raw decoded response to stdout.
    print(rsp)
    msg = rsp["choices"][0]["message"]
    resp = msg["content"].strip()
    cost = {k: rsp["usage"][k] for k in ("prompt_tokens",
                                         "completion_tokens",
                                         "total_tokens")}
    elapsed = round(t1 - t0, 2)  # seconds, rounded to two decimals
    return resp, cost, elapsed


class WrongTypeJudge:
    """
    Ask a chat model which single error type dominates a judge's reasoning.

    An instance is bound to one model name. Calling the instance with a
    ``judge_response`` text builds the classification prompt, sends it via
    ``query`` and returns the model's analysis together with the extracted
    ``[[N]]`` label, the token usage and the request time.
    """

    # Prompt text sent to the model; ``{judge_response}`` is the only
    # placeholder and is filled in by ``generate_prompt``.
    _PROMPT_TEMPLATE = """
You are an expert in evaluating model judgment processes. 
Given the "judge_response" content from a model's reasoning process, please analyze and identify the **single most important error type** it contains — the one that most likely causes the model’s judgment to diverge from human evaluation. 

### Error Types and Examples

[[1]] Misunderstanding the Question or Requirements
--Misinterpreting the problem statement (e.g., confusing “remove exactly k characters” with “remove up to k characters” in coding).
--Focusing on the wrong aspect (e.g., summarizing the process of natural selection when the user asked for its impact on evolution).
--Mistaking the format or scope required (e.g., answering a single-turn reasoning question as if it were multi-turn).

[[2]] Incorrect or Confused Evaluation Criteria
--Judging only based on answer correctness, ignoring completeness of explanation or reasoning.
--Overvaluing writing style or structure in an essay, while neglecting content relevance.
--Focusing on the mathematical notation or formatting rather than the correctness of the solution steps.
--Prioritizing “creativity” or “roleplay immersion” in character responses over whether the reply fulfills the user’s request.

[[3]] Overlooking Important Details or Substantive Errors
--Failing to notice a critical bug in code (e.g., off-by-one or failing special cases).
--Missing a calculation error, units mismatch, or a crucial logical misstep in an answer.
--Ignoring that a key requirement (e.g., “must use exactly k removals” or “must address the friend’s introversion directly”) is not met.
--Not spotting factual mistakes or unsupported claims in knowledge or reasoning responses.

[[4]] Superficial Features or Format Bias
--Rewarding longer, more detailed, or more formally structured answers even if they are incorrect or less relevant.
--Preferring responses with markdown/LaTeX/visualization, or creative style, regardless of whether these contribute to accuracy or helpfulness.
--Assuming that the presence of step-by-step reasoning or detailed explanations guarantees correctness, without verifying the logic.

[[5]] Logical, Reasoning, or Factual Errors
--Failing to identify logical gaps in an answer’s reasoning chain.
--Accepting answers with unjustified assumptions or circular logic.
--Overlooking an answer that skips critical steps or draws conclusions not supported by the evidence provided.
--Missing when a response in roleplay/writing introduces factual inconsistencies with the established context or scenario.

[[6]] Partial Comparison or Missing Key Contrasts
--Only comparing surface features (e.g., length, style, structure) and missing substantive differences in accuracy or depth.
--Neglecting to contrast core elements, such as which answer better addresses the user’s real need or solves the root problem.
--Ignoring which response better anticipates objections or edge cases, focusing instead on irrelevant differences.


### Instructions:
Given the following "judge_response" content, identify **only one error type**: the most critical one.

### Please output using the following format:

Error type detected: [[编号]]

[[编号]] 错误类型
解释及文本证据

### Now analyze this "judge_response":

{judge_response}
"""

    def __init__(self, model_name: str):
        """Remember which model the classification requests are sent to."""
        self.model_name = model_name

    @staticmethod
    def generate_prompt(judge_response: str) -> str:
        """Return the classification prompt with ``judge_response`` embedded at the end."""
        return WrongTypeJudge._PROMPT_TEMPLATE.format(judge_response=judge_response)

    def __call__(self, judge_response: str):
        """
        Classify one judge response.

        Returns a dict with keys ``judge_analysis`` (the model reply, with the
        extracted label appended when the reply does not already end with
        it), ``wrong_types`` (the ``[[N]]`` label), ``cost_tokens`` and
        ``time`` (both passed through from ``query``).
        """
        reply, token_usage, seconds = query(
            self.generate_prompt(judge_response),
            self.model_name,
        )

        label = infer_wrong_type(reply)
        # Guarantee that the stored analysis always finishes with the label.
        analysis = reply if reply.rstrip().endswith(label) else reply + f"\n\n{label}"

        return {
            "judge_analysis": analysis,
            "wrong_types": label,
            "cost_tokens": token_usage,
            "time": seconds,
        }


def main(input_path, output_dir, start_id=1):
    """
    Classify every record of a JSON file and write one result file per record.

    Args:
        input_path: Path to a JSON file holding either a list of records or a
            single record. Each record is read through its ``id`` (optional)
            and ``judge_response`` keys.
        output_dir: Directory for the result files; created if missing. Each
            record is written to ``id:<id>.json`` inside it.
        start_id: Id of the record to start from (compared as a string). When
            no record has this id, a warning is printed and processing starts
            from the first record.

    Records whose output file already exists are skipped, so an interrupted
    run can simply be restarted. Processing stops at the first record that
    fails; the error and traceback are printed.
    """
    os.makedirs(output_dir, exist_ok=True)
    judge = WrongTypeJudge("deepseek-ai/DeepSeek-R1")

    with open(input_path, encoding="utf-8") as f:
        data = json.load(f)
    data = data if isinstance(data, list) else [data]

    # Map each record id to its position so a run can resume from any id.
    # Records without an "id" use the same positional fallback (index + 1)
    # as the loop below, instead of raising KeyError before any work starts.
    id_to_idx = {str(item.get("id", idx + 1)): idx for idx, item in enumerate(data)}
    start_idx = id_to_idx.get(str(start_id))
    if start_idx is None:
        # Make the fallback visible rather than silently restarting at 0.
        print(f"⚠️ 未找到 start_id={start_id}，将从第一条开始处理")
        start_idx = 0

    for idx, item in enumerate(data[start_idx:], start=start_idx):
        cur_id = item.get("id", idx + 1)
        output_file = os.path.join(output_dir, f"id:{cur_id}.json")
        if os.path.exists(output_file):
            print(f"文件已存在，跳过 id={cur_id}")
            continue

        try:
            rec = judge(item["judge_response"])
            ordered = OrderedDict()
            ordered["id"]             = cur_id
            ordered["judge_response"] = item.get("judge_response", "")
            ordered["wrong_types"]    = rec["wrong_types"]  # [[1]] ~ [[6]]
            ordered["judge_analysis"] = rec["judge_analysis"]
            ordered["cost_tokens"]    = rec["cost_tokens"]
            ordered["time"]           = rec["time"]         # seconds

            # Serialize before opening the file: if serialization fails, no
            # truncated output file is left behind for later runs to skip.
            payload = json.dumps(ordered, ensure_ascii=False, indent=4)
            with open(output_file, "w", encoding="utf-8") as out:
                out.write(payload)
            print(f"✅ id={cur_id} 已保存到 {output_file}")

        except Exception as e:
            # Top-level boundary: report the failure and stop, so the run can
            # be resumed later from this record.
            print(f"❌ id={cur_id} 处理失败: {e}")
            traceback.print_exc()
            break


if __name__ == "__main__":
    # ===== Set your input path, output directory and starting id here =====
    # NOTE(review): the values of input_path and output_dir are missing in
    # this copy of the file; they must be filled in before running.
    input_path =       
    output_dir = 
    start_id = "1"                                    

    main(input_path, output_dir, start_id)
