{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fca142c",
   "metadata": {},
   "outputs": [],
   "source": [
    "eval_prompt = \"\"\"\n",
    "You are a careful, unbiased and strict evaluator.\n",
    "\n",
    "Given a user question and an assistant answer, evaluate the overall quality of the answer.\n",
    "Consider the following aspects:\n",
    "- Correctness and factual accuracy\n",
    "- Helpfulness and relevance to the question\n",
    "- Clarity and completeness\n",
    "- Following the user’s instructions\n",
    "\n",
    "Provide a single numeric score from 1 to 10, where:\n",
    "1 = very poor answer\n",
    "10 = excellent answer\n",
    "\n",
    "Do not provide any explanation. Output only the number.\n",
    "\n",
    "User Question:\n",
    "{question}\n",
    "\n",
    "Assistant Answer:\n",
    "{answer}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8771c0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from openai import OpenAI\n",
    "\n",
    "def get_gpt4_eval(prompt):\n",
    "    client = OpenAI(\n",
    "        base_url=\"https://api.ai-gaochao.cn/v1\",\n",
    "        api_key=\"\"\n",
    "    )\n",
    "\n",
    "    messages = [\n",
    "        {\"role\": \"system\", \"content\": \"You are a helpful assistant to help with evaluating answers.\"},\n",
    "        {\"role\": \"user\", \"content\": prompt}\n",
    "    ]\n",
    "\n",
    "    completion = client.chat.completions.create(\n",
    "        model=\"gpt-4o-mini\",\n",
    "        messages=messages\n",
    "    )\n",
    "    return completion.choices[0].message.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "941123ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "data1 =pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs/magpie/qwen2.5-7b.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d3d9038",
   "metadata": {},
   "outputs": [],
   "source": [
    "retry = 3\n",
    "for idx,row in data1.iterrows():\n",
    "    question = row['chat_history'][0]\n",
    "    answer = row[\"output\"][0]\n",
    "    prompt = eval_prompt.format(question=question, answer=answer)\n",
    "    scores =[]\n",
    "    for _ in range(retry):\n",
    "        score = get_gpt4_eval(prompt)\n",
    "        scores.append(score)\n",
    "    mean_score = sum([int(s) for s in scores])/len(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5783b11",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
    "\n",
    "# ========================\n",
    "# 可配置参数\n",
    "# ========================\n",
    "MAX_WORKERS = 32\n",
    "RETRY = 3\n",
    "CHECKPOINT_EVERY = 200\n",
    "\n",
    "test_llm = [\"qwen3-think\"]\n",
    "\n",
    "\n",
    "# ========================\n",
    "# 单条样本评分函数\n",
    "# ========================\n",
    "def eval_one_row(row, retry, eval_prompt):\n",
    "    question = row[\"chat_history\"][0]\n",
    "    answer = row[\"output\"][0]\n",
    "    answer_split = answer.split(\"</think>\")\n",
    "    if len(answer_split) != 2:\n",
    "        return None\n",
    "    prompt = eval_prompt.format(question=question, answer=answer_split[1])\n",
    "\n",
    "    scores = []\n",
    "    for _ in range(retry):\n",
    "        score = get_gpt4_eval(prompt)  # 允许抛异常\n",
    "        scores.append(int(score))\n",
    "\n",
    "    return sum(scores) / len(scores)\n",
    "\n",
    "\n",
    "# ========================\n",
    "# 主评测循环\n",
    "# ========================\n",
    "for model_idx, llm in enumerate(test_llm, 1):\n",
    "\n",
    "    print(f\"\\n[{model_idx}/{len(test_llm)}] Evaluating LLM: {llm}\")\n",
    "\n",
    "    candidate_model_file_path = (\n",
    "        f\"/home/-/-/pac/zeroeval/result_dirs/magpie/{llm}.json\"\n",
    "    )\n",
    "\n",
    "    # ---------- 读数据 ----------\n",
    "    data1 = pd.read_json(candidate_model_file_path)\n",
    "\n",
    "    # ---------- 初始化字段（断点续跑友好） ----------\n",
    "    if \"gpt4_score\" not in data1.columns:\n",
    "        data1[\"gpt4_score\"] = None\n",
    "    if \"gpt4_fail_reason\" not in data1.columns:\n",
    "        data1[\"gpt4_fail_reason\"] = None\n",
    "\n",
    "    # ---------- 只算还没算过的 ----------\n",
    "    pending_df = data1[data1[\"gpt4_score\"].isna()]\n",
    "    print(f\"Pending samples: {len(pending_df)} / {len(data1)}\")\n",
    "\n",
    "    if len(pending_df) == 0:\n",
    "        print(\"✔ All samples already evaluated, skipping.\")\n",
    "        continue\n",
    "\n",
    "    scores = {}\n",
    "    fail_logs = {}\n",
    "\n",
    "    completed = 0\n",
    "    futures = {}\n",
    "\n",
    "    # ========================\n",
    "    # 并行执行 + 稳定进度条\n",
    "    # ========================\n",
    "    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:\n",
    "\n",
    "        for idx, row in pending_df.iterrows():\n",
    "            futures[\n",
    "                executor.submit(eval_one_row, row, RETRY, eval_prompt)\n",
    "            ] = idx\n",
    "\n",
    "        with tqdm(\n",
    "            total=len(futures),\n",
    "            desc=f\"Scoring {llm}\",\n",
    "            leave=True\n",
    "        ) as pbar:\n",
    "\n",
    "            for future in as_completed(futures):\n",
    "                idx = futures[future]\n",
    "\n",
    "                try:\n",
    "                    scores[idx] = future.result()\n",
    "                except Exception as e:\n",
    "                    scores[idx] = None\n",
    "                    fail_logs[idx] = str(e)\n",
    "\n",
    "                completed += 1\n",
    "                pbar.update(1)\n",
    "\n",
    "                # ---------- checkpoint ----------\n",
    "                if completed % CHECKPOINT_EVERY == 0:\n",
    "                    data1.loc[scores.keys(), \"gpt4_score\"] = (\n",
    "                        pd.Series(scores)\n",
    "                    )\n",
    "                    data1.loc[fail_logs.keys(), \"gpt4_fail_reason\"] = (\n",
    "                        pd.Series(fail_logs)\n",
    "                    )\n",
    "\n",
    "                    data1.to_json(\n",
    "                        candidate_model_file_path,\n",
    "                        orient=\"records\",\n",
    "                        indent=2,\n",
    "                        force_ascii=False\n",
    "                    )\n",
    "\n",
    "    # ========================\n",
    "    # 最终写盘（保险）\n",
    "    # ========================\n",
    "    data1.loc[scores.keys(), \"gpt4_score\"] = pd.Series(scores)\n",
    "    data1.loc[fail_logs.keys(), \"gpt4_fail_reason\"] = pd.Series(fail_logs)\n",
    "\n",
    "    data1.to_json(\n",
    "        candidate_model_file_path,\n",
    "        orient=\"records\",\n",
    "        indent=2,\n",
    "        force_ascii=False\n",
    "    )\n",
    "\n",
    "    print(\n",
    "        f\"✔ Finished {llm}: \"\n",
    "        f\"{len(scores)} done, {len(fail_logs)} failed\"\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba17b237",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_json(\"/home/-/-/pac/zeroeval/result_dirs/magpie/qwen3-think.json\")\n",
    "# data.drop([\"gpt4_score\",\"gpt4_fail_reason\"], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8770c6d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90c2783e",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_json(\"/home/-/-/pac/zeroeval/result_dirs/magpie/qwen3-think.json\", orient='records', indent=2,force_ascii=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67334b8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "data1.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8c66f93",
   "metadata": {},
   "outputs": [],
   "source": [
    "data1.iloc[0,1][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "785a5916",
   "metadata": {},
   "outputs": [],
   "source": [
    "data1.iloc[0,3][0]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "vllm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
