import json
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from config import API_KEY
from openai import OpenAI

# Send every outbound HTTP(S) request through the proxy on 127.0.0.1:7890.
os.environ.update(
    HTTP_PROXY="http://127.0.0.1:7890",
    HTTPS_PROXY="http://127.0.0.1:7890",
)

# OpenAI-compatible client pointed at the aihubmix endpoint.
client = OpenAI(api_key=API_KEY, base_url="https://aihubmix.com/v1")

# Model name sent with the chat-completion requests made through `client`.
MODEL = "gpt-4o-mini"

# ======== Shared helpers for the judge calls ========
# Matches a <think>...</think> section, including across newlines.
_THINK_PATTERN = re.compile(r"<think>(.*?)</think>", flags=re.DOTALL)
# Matches the verdict object requested by the prompts; group 1 is the score.
_VERDICT_PATTERN = re.compile(r"\{\s*\"正确理解\"\s*:\s*\"?(1|0\.5|0)\"?\s*\}")


def _render_dialog(messages, model_response):
    """Format the dialog history plus the reply under evaluation as labelled lines."""
    parts = []
    for m in messages:
        # Messages with role "user" are the student; every other role is the model.
        speaker = "学生" if m["role"] == "user" else "模型"
        parts.append(f"【{speaker}】：{m['content'].strip()}\n")
    parts.append(f"【模型本轮回答】：{model_response.strip()}")
    return "".join(parts)


def _parse_judge_reply(reply):
    """Return {"score", "think_trace"} extracted from *reply*, or None if no verdict is found."""
    think = _THINK_PATTERN.search(reply)
    think_trace = think.group(1).strip() if think else ""
    # Look for the verdict only outside the <think> section so that text quoted
    # in the reasoning cannot be mistaken for the final answer.
    verdict = _VERDICT_PATTERN.search(_THINK_PATTERN.sub("", reply))
    if verdict is None:
        return None
    # Read the score from the capture group: the pattern tolerates missing
    # quotes around the value, which a strict JSON parse of the match would not.
    return {"score": float(verdict.group(1)), "think_trace": think_trace}


def _query_judge(prompt, failure_label, max_retries, retry_delay):
    """Send *prompt* to the judge model, retrying until a verdict can be parsed."""
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
            )
            parsed = _parse_judge_reply(response.choices[0].message.content or "")
            if parsed is not None:
                return parsed
            print(f"{failure_label}（第{attempt+1}次）：未能从回复中解析出评分")
        except Exception as e:
            # Deliberately broad: any failure of the remote call (network, rate
            # limit, malformed response object) should lead to a retry.
            print(f"{failure_label}（第{attempt+1}次）：{e}")
        # Wait between attempts only; sleeping after the last one is pure delay.
        if attempt < max_retries - 1:
            time.sleep(retry_delay)

    # NOTE: this fallback carries score 0, the same value as a genuine "0"
    # verdict; callers can tell them apart only by the think_trace marker.
    return {"score": 0, "think_trace": "分析失败"}


# ======== Correct: scoring when the student's answer is correct ========
def detection_correct(messages, model_response, max_retries=5, retry_delay=3):
    """Score whether the model recognised that the student's answer was correct.

    Args:
        messages: Dialog history as dicts with "role" and "content" keys.
        model_response: The model reply being judged.
        max_retries: Maximum number of judge calls before giving up.
        retry_delay: Seconds to wait between attempts.

    Returns:
        A dict with "score" (1.0, 0.5 or 0.0 as parsed from the judge reply)
        and "think_trace" (the judge's <think> text, possibly empty). If no
        attempt yields a verdict, returns {"score": 0, "think_trace": "分析失败"}.
    """
    dialog_str = _render_dialog(messages, model_response)

    prompt = (
        "你是一名严谨的教育研究助理，擅长分析模型是否理解了学生的回答。\n\n"
        "【任务目标】判断模型是否明确知道学生回答是正确的。\n\n"
        "【任务说明】学生刚刚做出了正确的回答，你需要判断模型是否表现出对该回答的认同或肯定，具体表现包括：\n"
        "- 明确肯定：例如“对的”、“很好”、“完全正确”等表述；\n"
        "- 隐含认可：例如在未否定的前提下直接进入下一步讲解，也可视为默认认为学生答对了（记为0.5）；\n"
        "- 否定或质疑：例如模型重新计算、提出异议、要求修正，表示模型没有认为学生答对（记为0）。\n\n"
        f"【输入对话】\n{dialog_str}\n\n"
        "注意仅对【模型本轮回答】进行判断.\n"
        "请你先在 <think> 标签中写出你的判断依据，包括：\n"
        "- 模型在本轮中是否对学生表现了明确的赞同或者质疑？关键句是哪一句？\n"
        "- 模型在没有明确的赞同或者质疑的前提下，直接进入下一步讲解吗？如果是，则认为模型默认学生正确，判断为隐含认可。\n"
        "- 模型在没有明确的赞同或者质疑的前提下，继续对上一步进行了补充吗？如果是，则认为模型默认学生错误。\n\n"
        "然后输出以下严格格式的 JSON：\n"
        "```json\n"
        "{\n"
        "  \"正确理解\": \"1/0.5/0\"\n"
        "}\n"
        "```"
    )

    return _query_judge(prompt, "正确判断失败", max_retries, retry_delay)


# ======== Error: scoring when the student's answer is wrong ========
def detection_incorrect(messages, model_response, GT, max_retries=5, retry_delay=3):
    """Score whether the model recognised that the student's answer was wrong.

    Args:
        messages: Dialog history as dicts with "role" and "content" keys.
        model_response: The model reply being judged.
        GT: Accepted for interface compatibility; it is not used in the prompt.
        max_retries: Maximum number of judge calls before giving up.
        retry_delay: Seconds to wait between attempts.

    Returns:
        A dict with "score" (1.0, 0.5 or 0.0 as parsed from the judge reply)
        and "think_trace" (the judge's <think> text, possibly empty). If no
        attempt yields a verdict, returns {"score": 0, "think_trace": "分析失败"}.
    """
    dialog_str = _render_dialog(messages, model_response)

    prompt = (
        "你是一名严谨的教育研究助理，擅长分析模型是否识别了学生的错误回答。\n\n"
        "【任务目标】判断模型是否明确知道学生的回答是错误的。\n\n"
        "【任务说明】学生在上一轮给出了错误的回答，你需要判断模型是否表现出对此的识别，具体表现包括：\n"
        "- 明确指出学生答错（得1分）：模型使用了明显否定或质疑性措辞；\n"
        "   - “你这一步有点问题” / “这个答案不正确” / “你可能计算过程中有问题” 等。\n"
        "- 暗示学生答错（得0.5分）：模型没有直接否定，但表现出“怀疑”或“补充推理”的行为，如：\n"
        "   - 提出新的问题让学生检查前一步。\n"
        "   - 补充解释前一步推理但没有赞同。\n"
        "   - 出现表述如“我们仔细想想”  “换一种方式理解”等。\n"
        "- 没有质疑（得0分）：模型对学生的回答表示肯定，继续推进，如“很好！”“你的计算正确”，表示没有识别学生答错。\n\n"
        f"【输入对话】：\n{dialog_str}\n\n"
        "注意仅对【模型本轮回答】进行判断.\n"
        "请你先在 <think> 标签中写出你的判断依据，包括：\n"
        "- 模型在本轮中是否对学生表现了明确的赞同或者质疑？关键句是哪一句？\n"
        "- 模型在没有明确的赞同或者质疑的前提下，有就学生回答的上一步进行讲解和验证吗？如果是，则认为模型默认学生错误，为暗示错误。\n"
        "然后输出以下严格格式的 JSON：\n"
        "```json\n"
        "{\n"
        "  \"正确理解\": \"1/0.5/0\"\n"
        "}\n"
        "```"
    )

    return _query_judge(prompt, "错误识别分析失败", max_retries, retry_delay)



# ======== Main dispatch function ========
def process_single_dialog(item):
    """Route one dialog record to the matching judge and package its verdict.

    Args:
        item: Mapping with "dialog_id", "type", "messages" and
            "model_response" keys, plus an optional "GT" key.

    Returns:
        A dict with "dialog_id", "type", "score", "think_trace" and
        "model_response", or None when "type" starts with neither
        "correct" nor "incorrect".

    Raises:
        KeyError: If one of the four required keys is missing.
    """
    # Read every required field first so a malformed record fails the same
    # way regardless of its type.
    dialog_id, item_type = item["dialog_id"], item["type"]
    messages, model_response = item["messages"], item["model_response"]
    ground_truth = item.get("GT", "")

    student_was_right = item_type.startswith("correct")
    if not student_was_right and not item_type.startswith("incorrect"):
        return None

    verdict = (
        detection_correct(messages, model_response)
        if student_was_right
        else detection_incorrect(messages, model_response, ground_truth)
    )

    return dict(
        dialog_id=dialog_id,
        type=item_type,
        score=verdict["score"],
        think_trace=verdict["think_trace"],
        model_response=model_response,
    )



# ======== Main evaluation pipeline ========
def evaluate(input_path, output_path):
    """Judge every dialog in *input_path* and write a scored report to *output_path*.

    The input file is JSON holding the records passed one by one to
    `process_single_dialog`. Records are judged concurrently on 16 worker
    threads. Records for which `process_single_dialog` returns a falsy value
    are left out of the report.

    The report contains the overall average and count, the same two figures
    split into "gen" (type ends with "_gen") and "real" (everything else),
    and the per-dialog results under "details" in input order.

    Args:
        input_path: Path of the JSON file to read.
        output_path: Path of the JSON report to write; its parent directory
            is created if it does not exist.

    Raises:
        Exception: Whatever `process_single_dialog` raises for a record is
            re-raised here and aborts the run before anything is written.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Make sure the report can be written before spending any API calls.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # One slot per input record: futures finish in arbitrary order, but the
    # report should list results in input order so runs are comparable.
    slots = [None] * len(data)
    with ThreadPoolExecutor(max_workers=16) as executor:
        futures = {
            executor.submit(process_single_dialog, item): index
            for index, item in enumerate(data)
        }
        for future in tqdm(as_completed(futures), total=len(futures), desc="评估反馈质量"):
            slots[futures[future]] = future.result()

    results = [result for result in slots if result]
    gen_scores = [r["score"] for r in results if r["type"].endswith("_gen")]
    real_scores = [r["score"] for r in results if not r["type"].endswith("_gen")]

    def avg(lst):
        # An empty group reports 0 rather than dividing by zero.
        return sum(lst) / len(lst) if lst else 0

    output = {
        "average_score": avg(gen_scores + real_scores),
        "count": len(gen_scores + real_scores),
        "summary_by_type": {
            "gen": {
                "count": len(gen_scores),
                "average_score": avg(gen_scores)
            },
            "real": {
                "count": len(real_scores),
                "average_score": avg(real_scores)
            }
        },
        "details": results
    }

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"✅ 评估完成，结果保存至：{output_path}")
    print(json.dumps(output["summary_by_type"], indent=2, ensure_ascii=False))


# ======== Entry point ========
if __name__ == "__main__":
    def recog_test_evaluate(model_name, type_name):
        """Run `evaluate` on the response file of one model / record-type pair."""
        # Model names such as "Qwen/Qwen3-32B" contain characters that are
        # not valid in file names; replace each of them with "_".
        safe_model_name = re.sub(r'[\\/*?:"<>|]', "_", model_name)
        # Build the path with os.path.join instead of literal backslashes:
        # "\m" and "\{" are invalid escape sequences in a non-raw string, and
        # a backslash-separated path only resolves on Windows.
        input_data = os.path.join(
            "..",
            "model_outputs",
            f"{safe_model_name}_respond_data_{type_name}.json",
        )
        output_data = f"../result/recognition/{safe_model_name}_{type_name}.json"
        evaluate(input_data, output_data)

    for model_name in ["Qwen/Qwen3-32B", "Qwen/Qwen3-8B", "DeepSeek-V3", "deepseek-ai/DeepSeek-R1-0528"]:
        for type_name in ["incorrect"]:
            recog_test_evaluate(model_name, type_name)