import json
import re
import requests
import argparse
import os
from collections import OrderedDict

def query(prompt, model_name, timeout=600):
    """Send one user message to the chat endpoint and split the reply.

    The endpoint URL and API key are read from the ``JUDGE_API_URL`` and
    ``JUDGE_API_KEY`` environment variables so that no credential lives in
    the source file.

    Args:
        prompt: Text sent as the single ``user`` message.
        model_name: Value placed in the ``model`` field of the request body.
        timeout: Seconds passed to ``requests.post`` as its timeout; without
            one a stalled connection would block the whole run.

    Returns:
        A ``(think, action, cost_tokens)`` tuple. ``think`` is the text
        before the first ``</think>`` tag (with ``<think>`` removed),
        ``action`` is everything after that tag, and ``cost_tokens`` holds the
        ``prompt_tokens``, ``completion_tokens`` and ``total_tokens`` values
        copied from the response's ``usage`` object.

    Raises:
        RuntimeError: If either environment variable is missing or empty.
        requests.HTTPError: If the server answers with an error status.
        KeyError, IndexError: If the response body lacks the expected
            ``choices`` / ``usage`` fields.
    """
    url = os.environ.get("JUDGE_API_URL")
    api_key = os.environ.get("JUDGE_API_KEY")
    if not url or not api_key:
        raise RuntimeError(
            "Set the JUDGE_API_URL and JUDGE_API_KEY environment variables "
            "before calling query()."
        )

    headers = {
        # NOTE(review): the original header value was blank in the source;
        # a Bearer token is assumed here - confirm against the provider.
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    data = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
    }

    http_response = requests.post(url, headers=headers, json=data, timeout=timeout)
    # Fail with the HTTP error itself rather than a KeyError on 'choices' below.
    http_response.raise_for_status()
    response = http_response.json()
    result = response['choices'][0]['message']['content']
    print(response)
    # Print the extracted reply to help with debugging.
    print("API Response: ", result)

    # Separate the reasoning from the final answer at the first </think> tag.
    # partition keeps *everything* after that tag, so a verdict that follows a
    # second, stray </think> is not lost.
    head, closing_tag, tail = result.partition("</think>")
    if closing_tag:
        think = head.replace("<think>", "").strip()
        action = tail.strip()
    else:
        # No </think> tag: treat the whole reply as the answer.
        print("Warning: Response format did not include </think> tag.")
        think = "No thinking found in the response."
        action = result.strip()

    usage = response['usage']
    cost_tokens = {
        'prompt_tokens': usage['prompt_tokens'],
        'completion_tokens': usage['completion_tokens'],
        'total_tokens': usage['total_tokens'],
    }
    return think, action, cost_tokens


class Judge:
    """Pairwise judge that asks a model which of two answers is better."""

    # Verdict letter found in the reply -> label stored under 'judge result'.
    _VERDICT_LABELS = {'A': 'A>B', 'B': 'B>A', 'C': 'A=B'}

    # Matches the bracketed verdict markers requested by the prompt.
    _VERDICT_PATTERN = r"\[\[([ABC])\]\]"

    _PROMPT_TEMPLATE = """
    [System]
    Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. 
    
    **Output Format Requirements:**
    1. First, write your internal thinking process wrapped in <think> and </think> tags
    2. Then, provide your final judgment outside the think tags using exactly one of these formats:
       - "[[A]]" if assistant A is better
       - "[[B]]" if assistant B is better
       - "[[C]]" for a tie
    
    **Evaluation Guidelines:**
    - Choose the assistant that better follows instructions and answers the question
    - Consider: helpfulness, relevance, accuracy, depth, creativity, and detail
    - Compare responses directly
    - Avoid position/length/name biases
    - Be objective
    
    [User Question]
    {question}
    
    [Assistant A's Answer]
    {answer_a}
    
    [Assistant B's Answer]
    {answer_b}
    
    [Your Evaluation]
    <think>
    [Place your comparative analysis and reasoning here]
    </think>
    """

    def __init__(self, model_name):
        """Remember which model is used as the judge."""
        self.model_name = model_name

    def get_judge(self, prompt):
        """Forward *prompt* to ``query`` using this judge's model name."""
        return query(prompt, self.model_name)

    def generate_prompt(self, question, answer_a, answer_b):
        """Fill the evaluation template with the question and both answers."""
        fields = {
            'question': question,
            'answer_a': answer_a,
            'answer_b': answer_b,
        }
        return self._PROMPT_TEMPLATE.format(**fields)

    def __call__(self, question, answer_a, answer_b):
        """Judge one answer pair and return a record describing the outcome.

        The record's 'judge result' is 'A>B', 'B>A' or 'A=B' according to the
        last ``[[A]]`` / ``[[B]]`` / ``[[C]]`` marker in the judge's answer,
        or an empty string when no marker is present.
        """
        record = dict(question=question, response_a=answer_a, response_b=answer_b)
        record['judge model'] = self.model_name

        thinking, reply, usage = self.get_judge(
            self.generate_prompt(question, answer_a, answer_b)
        )
        record.update(
            judge_thinking=thinking,
            judge_response=reply,
            cost_tokens=usage,
        )

        # Only the last marker counts, in case the reply mentions several.
        verdicts = re.findall(self._VERDICT_PATTERN, reply)
        record['judge result'] = self._VERDICT_LABELS[verdicts[-1]] if verdicts else ''

        return record


def main(argv=None):
    """Judge every response pair in a JSON file and save the verdicts.

    The input file must contain one dictionary or a list of dictionaries.
    Each dictionary is read for the keys ``question``, ``response_A``,
    ``response_B``, ``model_A`` and ``model_B`` (missing keys default to an
    empty string). Results are written to ``<input stem>_judged.json`` inside
    the output directory.

    Args:
        argv: Command-line arguments to parse; ``None`` makes argparse read
            ``sys.argv``, so a bare ``main()`` call behaves as a script entry.

    Raises:
        ValueError: If the input JSON is neither a dictionary nor a list of
            dictionaries.
    """
    parser = argparse.ArgumentParser(
        description="Compare pairs of model responses with an LLM judge."
    )
    parser.add_argument('--input', required=True,
                        help="Path to the input JSON file.")
    parser.add_argument('--output-dir', default=None,
                        help="Directory for the result file "
                             "(default: the input file's directory).")
    parser.add_argument('--model', default="deepseek-r1",
                        help="Name of the judge model.")
    args = parser.parse_args(argv)

    # Input/output locations come from the command line instead of being
    # hard-coded in the source.
    input_path = args.input
    output_dir = args.output_dir or os.path.dirname(os.path.abspath(input_path))
    os.makedirs(output_dir, exist_ok=True)

    model_name = args.model
    method = Judge(model_name)
    result = []

    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if isinstance(data, dict):
        data = [data]
    elif not isinstance(data, list):
        raise ValueError("Input JSON must be a dictionary or list of dictionaries.")

    # Validate every element up front so a malformed entry is reported before
    # any API call is made.
    for index, item in enumerate(data):
        if not isinstance(item, dict):
            raise ValueError(
                f"Input JSON must be a dictionary or list of dictionaries "
                f"(item {index} is {type(item).__name__})."
            )

    for item in data:
        question = item.get('question', '')
        answer_a = item.get('response_A', '')
        answer_b = item.get('response_B', '')

        record_point = method(question, answer_a, answer_b)

        record_point['model_A'] = item.get('model_A', '')
        record_point['model_B'] = item.get('model_B', '')

        # Re-emit the fields in a fixed order for the output file.
        ordered_record = OrderedDict()
        ordered_record['question'] = record_point.get('question', '')
        ordered_record['response_a'] = record_point.get('response_a', '')
        ordered_record['response_b'] = record_point.get('response_b', '')
        ordered_record['model_A'] = record_point.get('model_A', '')
        ordered_record['model_B'] = record_point.get('model_B', '')
        ordered_record['judge model'] = record_point.get('judge model', '')
        ordered_record['judge_thinking'] = record_point.get('judge_thinking', '')
        ordered_record['judge_response'] = record_point.get('judge_response', '')
        ordered_record['judge result'] = record_point.get('judge result', '')
        ordered_record['cost_tokens'] = record_point.get('cost_tokens', {})

        result.append(ordered_record)
        print(json.dumps(ordered_record, ensure_ascii=False, indent=2))

    # Derive the output name from the stem only: str.replace('.json', ...)
    # returned the name unchanged for inputs without a '.json' suffix, which
    # could overwrite the input file when both share a directory.
    input_stem = os.path.splitext(os.path.basename(input_path))[0]
    output_filename = f"{input_stem}_judged.json"
    output_path = os.path.join(output_dir, output_filename)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=4)

    print(f"\n✅ 评估结果已保存到：{output_path}")

# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()