import json
import re
import requests
import os
from collections import OrderedDict
import traceback

def infer_verdict(text: str) -> str:
    """Extract the judge's verdict label from free-form text.

    Resolution order:

    1. The last explicit ``Verdict: [[X]]`` marker (matched case-insensitively).
    2. The last bare ``[[X]]`` marker (upper-case letters only).
    3. A keyword heuristic: count sentences in which "assistant a" or
       "assistant b" is followed, within 80 non-period characters, by a
       praise phrase such as "better" or "more detailed"; the side with
       more hits wins.

    Args:
        text: Raw judge output to scan.

    Returns:
        One of ``"[[A]]"``, ``"[[B]]"`` or ``"[[C]]"``. ``"[[C]]"`` (tie) is
        also the result when nothing conclusive is found.
    """
    m = re.findall(r"verdict\s*:\s*\[\[([ABC])\]\]", text, flags=re.I)
    if m:
        # re.I also lets the captured letter be lower case; normalise it so
        # callers always receive one of the three canonical labels.
        return f"[[{m[-1].upper()}]]"
    m = re.findall(r"\[\[([ABC])\]\]", text)
    if m:
        return f"[[{m[-1]}]]"
    lower = text.lower()
    keywords = r"(better|wins|outperforms|superior|stronger|more\s+accurate|" \
               r"more\s+detailed|more\s+helpful|more\s+comprehensive|more\s+robust)"
    # The \b after the letter keeps words that merely start with "a"/"b"
    # (e.g. "assistant answers better") from being counted as a vote.
    a_hits = len(re.findall(r"assistant\s+a\b[^.]{0,80}?\b" + keywords, lower))
    b_hits = len(re.findall(r"assistant\s+b\b[^.]{0,80}?\b" + keywords, lower))
    if a_hits > b_hits:
        return "[[A]]"
    if b_hits > a_hits:
        return "[[B]]"
    return "[[C]]"

def query(prompt: str, model_name: str):
    # Send `prompt` as a single user message to the judge model and split the
    # reply into its two marked sections.
    #
    # Returns a 3-tuple (judge_thinking, judge_response, cost):
    #   - judge_thinking: text between the PART 1 markers, or "No PART 1 found"
    #   - judge_response: text between the PART 2 markers, or "No PART 2 found"
    #   - cost: dict with prompt/completion/total token counts, or {} when the
    #     response carries no usable "usage" entry
    # Raises RuntimeError when the HTTP body is not JSON or when
    # choices[0].message.content is missing from it.
    #
    # NOTE(review): the endpoint URL and the Authorization header value have no
    # value in this view of the file; both must be supplied before this runs.
    url = 
    headers = {
        "Authorization": 
        "Content-Type": "application/json"
    }
    # Request payload; the shape looks like an OpenAI-compatible
    # chat-completions request — confirm against the actual endpoint.
    data = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 4096,
        "temperature": 0.1
    }
    # NOTE(review): no timeout is passed, and the HTTP status code is not
    # checked before the body is parsed.
    response = requests.post(url, headers=headers, json=data)
    try:
        rsp = response.json()
    except Exception:
        # Show the raw body so a non-JSON reply can be diagnosed.
        print("❌ Invalid JSON response:", response.text)
        raise RuntimeError("Response is not valid JSON.")

    try:
        content = rsp["choices"][0]["message"]["content"].strip()
    except Exception:
        raise RuntimeError("Missing `choices[0].message.content` in response.")

    # Echo the full model reply to stdout.
    print(content)

    # Extract PART 1 and PART 2
    # DOTALL lets each section span multiple lines; the non-greedy group stops
    # at the first matching END marker.
    part1_match = re.search(r"=== START PART 1 ===(.*?)=== END PART 1 ===", content, flags=re.DOTALL)
    part2_match = re.search(r"=== START PART 2 ===(.*?)=== END PART 2 ===", content, flags=re.DOTALL)

    # A missing section is replaced by a placeholder string rather than raising.
    judge_thinking = part1_match.group(1).strip() if part1_match else "No PART 1 found"
    judge_response = part2_match.group(1).strip() if part2_match else "No PART 2 found"

    # Token accounting is best-effort: any failure yields an empty dict.
    # NOTE(review): the bare `except:` also catches KeyboardInterrupt/SystemExit.
    try:
        cost = {k: rsp["usage"][k] for k in ("prompt_tokens", "completion_tokens", "total_tokens")}
    except:
        cost = {}

    return judge_thinking, judge_response, cost

class Judge:
    """Pairwise LLM judge: compares two answers to one question.

    Calling an instance builds the evaluation prompt, sends it to the model
    named at construction time, and returns the two judged sections together
    with the verdict derived from each.
    """

    def __init__(self, model_name: str):
        """Remember which model the judging requests are sent to."""
        self.model_name = model_name

    @staticmethod
    def generate_prompt(question: str, ans_a: str, ans_b: str) -> str:
        """Return the judging prompt with the question and both answers filled in."""
        prompt_text = f"""
    [System]
    Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. 
    
    You must output TWO FULLY SEPARATE SECTIONS using the following EXACT format:

    === START PART 1 ===  
    - **You MUST enclose your **Chain of Thought** here — do NOT summarizeor or compress your reasoning.
    - In this section, describe how you reasoned about which assistant performed better.
    - Include all intermediate steps, calculations, and considerations.
    - **Do NOT summarize or repeat their answers.**  
    - Focus on evaluation criteria: clarity, accuracy, depth, relevance, logical flow, etc.
    - At the end of this section, write exactly: `Verdict: [[A]]`, `[[B]]`, or `[[C]]` on its own line. ("[[A]]" means assistant A is better;"[[B]]" means assistant B is better;"[[C]]" means a tie)
    === END PART 1 ===

    === START PART 2 ===  
    - This part represents your final answer to the user after analyzing both responses.
    - Provide a detailed, structured analysis by evaluation criteria.
    - Make your decision and clearly state which assistant is better and why.
    - At the end, also write exactly: `Verdict: [[A]]`, `[[B]]`, or `[[C]]` on its own line. ("[[A]]" means assistant A is better;"[[B]]" means assistant B is better;"[[C]]" means a tie)
    === END PART 2 ===

    **Evaluation Guidelines:**
    - Choose the assistant that better follows instructions and answers the question
    - Consider: helpfulness, relevance, accuracy, depth, creativity, and detail
    - Compare responses directly
    - Avoid position/length/name biases
    - Be objective
    - Each part must have one verdict.
    
    **Output Format**
    === START PART 1 ===
    (Here is your internal reasoning process or COT,do not forget your veridct.)
    === END PART 1 ===

    === START PART 2 ===
    (Here is your response content to the user,do not forget your veridct.)
    === END PART 2 ===

    [User Question]  
    {question}

    [Assistant A's Answer]  
    {ans_a}

    [Assistant B's Answer]  
    {ans_b}

    [Your Evaluation]
   
    """
        return prompt_text

    def __call__(self, question: str, ans_a: str, ans_b: str):
        """Judge one question/answer pair.

        Returns a dict with the keys ``judge_thinking``, ``thinking_result``,
        ``judge_response``, ``judge result`` and ``cost_tokens``. The two
        result entries are one of ``"A>B"``, ``"B>A"`` or ``"A=B"``.
        """
        judge_prompt = self.generate_prompt(question, ans_a, ans_b)
        thinking_text, response_text, token_usage = query(judge_prompt, self.model_name)

        thinking_label = infer_verdict(thinking_text)
        response_label = infer_verdict(response_text)

        # Make sure each section ends with the verdict that was inferred for it.
        if not thinking_text.rstrip().endswith(thinking_label):
            thinking_text = f"{thinking_text}\n\nVerdict: {thinking_label}"
        if not response_text.rstrip().endswith(response_label):
            response_text = f"{response_text}\n\nVerdict: {response_label}"

        # Bracket label -> comparison string stored in the result record.
        outcome_by_label = {"[[A]]": "A>B", "[[B]]": "B>A", "[[C]]": "A=B"}
        thinking_outcome = outcome_by_label[thinking_label]
        response_outcome = outcome_by_label[response_label]

        result = {}
        result["judge_thinking"] = thinking_text
        result["thinking_result"] = thinking_outcome
        result["judge_response"] = response_text
        result["judge result"] = response_outcome
        result["cost_tokens"] = token_usage
        return result

def main(input_path, output_dir, start_id=1):
    """Judge every record of a JSON dataset and save one result file per record.

    The input file holds either a list of records or a single record (which
    is wrapped into a one-element list). Each record is read through the keys
    ``id``, ``question``, ``response_A``, ``response_B``, ``model_A`` and
    ``model_B``. Results are written to ``<output_dir>/id:<id>.json``;
    records whose result file already exists are skipped, so an interrupted
    run can be resumed.

    Args:
        input_path: Path of the JSON dataset to read.
        output_dir: Directory for the per-record result files (created if
            missing).
        start_id: Id of the first record to process, compared as a string.
            When no record has this id, processing starts at the first record.

    Processing stops at the first record that fails; the error and traceback
    are printed rather than re-raised.
    """
    os.makedirs(output_dir, exist_ok=True)
    judge = Judge("deepseek-r1")
    with open(input_path, encoding="utf-8") as f:
        data = json.load(f)
    data = data if isinstance(data, list) else [data]

    # Use the same positional fallback as the loop below, so records without
    # an "id" key no longer raise KeyError before any work is done.
    id_to_idx = {str(item.get("id", idx + 1)): idx for idx, item in enumerate(data)}
    start_idx = id_to_idx.get(str(start_id), 0)

    for idx, item in enumerate(data[start_idx:], start=start_idx):
        cur_id = item.get("id", idx + 1)
        # NOTE(review): ':' is not a valid file-name character on Windows; the
        # name is kept unchanged so existing result directories still resume.
        output_file = os.path.join(output_dir, f"id:{cur_id}.json")
        if os.path.exists(output_file):
            print(f"File exists, skipping id={cur_id}")
            continue

        try:
            rec = judge(item["question"], item["response_A"], item["response_B"])
            ordered = OrderedDict()
            ordered["id"]              = cur_id
            ordered["question"]        = item.get("question", "")
            ordered["response_a"]      = item.get("response_A", "")
            ordered["response_b"]      = item.get("response_B", "")
            ordered["model_A"]         = item.get("model_A", "")
            ordered["model_B"]         = item.get("model_B", "")
            # Record the model actually used instead of repeating the literal.
            ordered["judge model"]     = judge.model_name
            ordered["judge_thinking"]  = rec["judge_thinking"]
            ordered["thinking_result"] = rec["thinking_result"]
            ordered["judge_response"]  = rec["judge_response"]
            ordered["judge result"]    = rec["judge result"]
            ordered["cost_tokens"]     = rec["cost_tokens"]

            # Write to a temporary file and rename it into place: a run that
            # is interrupted mid-write must not leave a truncated result file
            # that the "File exists" check above would then skip forever.
            tmp_file = output_file + ".tmp"
            with open(tmp_file, "w", encoding="utf-8") as f:
                json.dump(ordered, f, ensure_ascii=False, indent=4)
            os.replace(tmp_file, output_file)
            print(f"✅ id={cur_id} saved to {output_file}")

        except Exception as e:
            print(f"❌ id={cur_id} processing failed: {e}")
            traceback.print_exc()
            break

if __name__ == "__main__":
    # Script entry point: set the three values below, then run the judging loop.
    # === Customize input path, output directory, and start ID ===
    # NOTE(review): input_path and output_dir have no value in this view of
    # the file; both must be filled in before the script can run.
    input_path = 
    output_dir = 
    # Id of the record to start from; main() compares ids as strings.
    start_id = "50"

    main(input_path, output_dir, start_id)
