{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mr_eval.utils.utils import *\n",
    "import os\n",
    "\n",
    "def list_jsonl_files(folder_path):\n",
    "    \"\"\"\n",
    "    列举文件夹中的所有 .jsonl 文件\n",
    "    Args:\n",
    "        folder_path (str): 文件夹路径\n",
    "    Returns:\n",
    "        List[str]: 所有 .jsonl 文件的路径\n",
    "    \"\"\"\n",
    "    return [f for f in os.listdir(folder_path) if f.endswith(\".jsonl\")]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "## Model names\n",
    "# qwen_prm7b=\"[Qwen2.5-Math-PRM-7B](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B)\",\n",
    "#     qwen_prm72b=\"[Qwen2.5-Math-PRM-72B](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B)\",\n",
    "#  {\"Name\": \"Pure-PRM-7B\", \"Source\": \"https://huggingface.co/jinachris/Qwen2.5-Math-7B-PRM800K\", \"Class\": \"PRM\"}\n",
    "prm_model_name_dict = dict(\n",
    "    skyworkprm_1_5B=\"\\\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B}{Skywork-PRM-1.5B}\",\n",
    "    skyworkprm_7B=\"\\\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B}{Skywork-PRM-7B}\",\n",
    "    llemma7b_prm_prm800k=\"\\\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf}{Llemma-PRM800k-7B}\",\n",
    "    llemma7b_prm_metamath=\"\\\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf}{Llemma-MetaMath-7B}\",\n",
    "    llemma7b_oprm_prm800k=\"\\\\href{https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf}{Llemma-oprm-7B}\",\n",
    "    mathminos_mistral=\"\\\\href{https://github.com/KbsdJames/MATH-Minos}{MATHMinos-Mistral-7B}\",\n",
    "    mathshepherd=\"\\\\href{https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm}{MathShepherd-Mistral-7B}\",\n",
    "    reasoneval7b=\"\\\\href{https://huggingface.co/GAIR/ReasonEval-7B}{ReasonEval-7B}\",\n",
    "    reasoneval34b=\"\\\\href{https://huggingface.co/GAIR/ReasonEval-34B}{ReasonEval-34B}\",\n",
    "    llama3_1_8b_prm_mistral=\"\\\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data}{RLHFlow-PRM-Mistral-8B}\",\n",
    "    llama3_1_8b_prm_deepseek=\"\\\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data}{RLHFlow-PRM-Deepseek-8B}\",\n",
    "    qwen_prm7b=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B}\",\n",
    "    qwen_prm72b=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B}{Qwen2.5-Math-PRM-72B}\",\n",
    "    pure_prm_7b=\"\\\\href{https://huggingface.co/jinachris/Qwen2.5-Math-7B-PRM800K}{Pure-PRM-7B}\",\n",
    ")\n",
    "\n",
    "close_model_name_dict = dict(\n",
    "    gpt4o=\"\\\\href{https://openai.com/index/hello-gpt-4o/}{GPT-4o}\",\n",
    "    o1mini=\"\\\\href{https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/}{o1-mini}$^\\dagger$\",\n",
    "    \n",
    "    gemini_2_flash=\"\\\\href{https://deepmind.google/technologies/gemini/flash/}{Gemini-2.0-flash-exp}\",\n",
    "    gemini_2_thinking=\"\\\\href{https://ai.google.dev/gemini-api/docs/thinking-mode}{Gemini-2.0-thinking-exp-1219}\",\n",
    "    # ds_v3=\"\\\\href{https://github.com/deepseek-ai/DeepSeek-V3}{DeepSeek-V3}\",\n",
    "    \n",
    "    \n",
    ")\n",
    "    \n",
    "open_model_name_dict = dict(\n",
    "    # o1preview=\"\\\\href{https://openai.com/index/introducing-openai-o1-preview/}{o1-preview}$^\\dagger$\",\n",
    "    \n",
    "    metamath_7b=\"\\\\href{https://huggingface.co/meta-math/MetaMath-7B-V1.0}{MetaMath-7B}\",\n",
    "    metamath_13b=\"\\\\href{https://huggingface.co/meta-math/MetaMath-13B-V1.0}{MetaMath-13B}\",\n",
    "    # metamath_70b=\"\\\\href{https://huggingface.co/meta-math/MetaMath-70B-V1.0}{MetaMath-70B}\",\n",
    "    # qwen25_7b=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct}{Qwen2.5-Math-7B}\",\n",
    "    qwen25_72b=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B}{Qwen2.5-Math-72B}\",\n",
    "    qwen_qwq=\"\\\\href{https://huggingface.co/Qwen/QwQ-32B-Preview}{QwQ-Preview-32B}\",\n",
    "    # r1_distill_llama_8b=\"\\\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B}{R1-Distill-Llama3.1-8B}\",\n",
    "    r1_distill_llama_70b=\"\\\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B}{R1-Distill-Llama3.1-70B}\",\n",
    "    r1_distill_qwen_7b=\"\\\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B}{R1-Distill-Qwen-7B}\",\n",
    "    ds_r1=\"\\\\href{https://github.com/deepseek-ai/DeepSeek-R1}{DeepSeek-R1}$^\\dagger$\"\n",
    "    # r1_distill_qwen_32b=\"\\\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B}{R1-Distill-Qwen-32B}\",\n",
    "    # wizardmath_7b=\"\\\\href{https://huggingface.co/WizardLMTeam/WizardMath-7B-V1.0}{izardMath-7B}\",\n",
    ")\n",
    "\n",
    "\n",
    "\n",
    "classification_name_dict = dict(\n",
    "    domain_inconsistency=\"DC.\",\n",
    "    redundency=\"NR.\",\n",
    "    multi_solutions=\"MS.\",\n",
    "    deception=\"DR.\",\n",
    "    confidence=\"CI.\",\n",
    "    step_contradiction=\"SC.\",\n",
    "    circular=\"NCL.\",\n",
    "    missing_condition=\"PS.\",\n",
    "    counterfactual=\"ES.\"\n",
    ")\n",
    "\n",
    "classification_parallel_dict = dict(\n",
    "    simplicity=dict(\n",
    "        redundency=\"NR.\",\n",
    "        circular=\"NCL.\",\n",
    "    ),\n",
    "    soundness=dict(\n",
    "        counterfactual=\"ES.\",\n",
    "        step_contradiction=\"SC.\",\n",
    "        domain_inconsistency=\"DC.\",\n",
    "        confidence=\"CI.\",\n",
    "    ),\n",
    "    sensitivity=dict(\n",
    "        missing_condition=\"PS.\",\n",
    "        deception=\"DR.\",\n",
    "        multi_solutions=\"MS.\",\n",
    "    )\n",
    ")\n",
    "classifications = [\"redundency\", \"circular\", \"counterfactual\", \"step_contradiction\", \"domain_inconsistency\",  \"confidence\", \"missing_condition\", \"deception\", \"multi_solutions\", ]\n",
    "metrics = [\"f1\", \"negative_f1\", \"total_step_acc\", \"correct_step_acc\", \"wrong_step_acc\", \"first_error_acc\", \"similarity\",]\n",
    "\n",
    "## File paths\n",
    "res_dir = \"/mnt/petrelfs/songmingyang/code/reasoning/MR_Hallucination/mr_eval/scripts/logs/prmtest_classified\"\n",
    "vllm_res_dir = \"/mnt/petrelfs/songmingyang/code/reasoning/MR_Hallucination/mr_eval/scripts/logs/prmtest_classified/vllm\"\n",
    "\n",
    "res_files = list_jsonl_files(res_dir)\n",
    "vllm_res_files = list_jsonl_files(vllm_res_dir)\n",
    "res_names = [f.split(\".\")[0] for f in res_files+vllm_res_files]\n",
    "res_paths = [os.path.join(res_dir, f) for f in res_files]\n",
    "vllm_res_paths = [os.path.join(vllm_res_dir, f) for f in vllm_res_files]\n",
    "res_paths += vllm_res_paths\n",
    "file_dict = dict(zip(res_names, res_paths))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_res_dict(file_dict,model_lists=None):\n",
    "    res_dict = {}\n",
    "    if not model_lists:\n",
    "        for model_name, file_path in file_dict.items():\n",
    "            res_dict[model_name] = process_jsonl(file_path)[-1]\n",
    "    else:\n",
    "        for model_name in model_lists:\n",
    "            file_path = file_dict[model_name]\n",
    "            res_dict[model_name] = process_jsonl(file_path)[-1]\n",
    "    return res_dict\n",
    "\n",
    "\n",
    "def get_prmscore_from_current_res_dict(res_dict,classification=None):\n",
    "    '''\n",
    "    Get PRM score from model level dict\n",
    "    '''\n",
    "    if not classification:\n",
    "        prm_score = res_dict[\"total_hallucination_results\"]['f1'] * 0.5 + res_dict[\"total_hallucination_results\"]['negative_f1'] * 0.5\n",
    "    else:\n",
    "        if classification in [\"multi_solutions\"]:\n",
    "            prm_score = res_dict[\"hallucination_type_results\"]['f1'][classification]\n",
    "        else:\n",
    "            prm_score = res_dict[\"hallucination_type_results\"]['f1'][classification] * 0.5 + res_dict[\"hallucination_type_results\"]['negative_f1'][classification] * 0.5\n",
    "    prm_score = prm_score if prm_score > 0 else 0\n",
    "    return prm_score\n",
    "\n",
    "\n",
    "def get_avg_prmscore_from_current_res_dict(res_dict,classifications):\n",
    "    '''\n",
    "    Get AVG PRM score from model level dict\n",
    "    '''\n",
    "    assert classifications\n",
    "    res = [get_prmscore_from_current_res_dict(res_dict,classification) for classification in classifications]\n",
    "    return sum(res) / len(res)\n",
    "    \n",
    "\n",
    "def get_res_str(model_dict,classification_dict,res_dict):\n",
    "    res_str = \"\"\n",
    "    # current_classification_dict = classification_dict[classification_name]\n",
    "    avg_res_list = []\n",
    "    # avg_qualify_list = []\n",
    "    for idx,(model_name, model_display_name) in enumerate(model_dict.items()):\n",
    "        temp_str = f\"{model_display_name}\"\n",
    "        current_res_dict = res_dict[model_name]\n",
    "        prm_score = get_prmscore_from_current_res_dict(current_res_dict)\n",
    "        \n",
    "        all_model_scores = sorted([get_prmscore_from_current_res_dict(res) for res in res_dict.values()],reverse=True)\n",
    "        if idx == 0:\n",
    "            avg_res_list.append(sum(all_model_scores) / len(all_model_scores))\n",
    "        if prm_score == max(all_model_scores):\n",
    "            temp_str += f\" & \\\\textbf{{{prm_score * 100:.1f}}}\"\n",
    "        elif prm_score == all_model_scores[1]:\n",
    "            temp_str += f\" & \\\\underline{{{prm_score * 100:.1f}}}\"\n",
    "        else:\n",
    "            temp_str += f\" & {prm_score * 100:.1f}\" if prm_score > 0 else \" & - \"\n",
    "        # qualify = current_res_dict[\"validitiy_rate\"]\n",
    "        # avg_qualify_list.append(qualify)\n",
    "        # temp_str += f\" & {qualify * 100:.0f}\"\n",
    "        \n",
    "        for big_classification, current_classification_dict in classification_dict.items():\n",
    "            all_avt = sorted([get_avg_prmscore_from_current_res_dict(res,list(current_classification_dict.keys())) for res in res_dict.values()], reverse=True)\n",
    "            avg = []\n",
    "            for classification, display_classification_name in current_classification_dict.items():\n",
    "                prm_score = get_prmscore_from_current_res_dict(current_res_dict, classification)\n",
    "                \n",
    "                all_prm_scores = sorted([get_prmscore_from_current_res_dict(res,classification) for res in res_dict.values()], reverse=True)\n",
    "                if idx == 0:\n",
    "                    avg_res_list.append(sum(all_prm_scores) / len(all_prm_scores))\n",
    "                avg.append(prm_score)\n",
    "                if prm_score == max(all_prm_scores):\n",
    "                    temp_str += f\" & \\\\textbf{{{prm_score * 100:.1f}}}\"\n",
    "                elif prm_score == all_prm_scores[1]:\n",
    "                    temp_str += f\" & \\\\underline{{{prm_score * 100:.1f}}}\"\n",
    "                else:\n",
    "                    temp_str += f\" & {prm_score * 100:.1f}\" if prm_score > 0 else \" & - \"\n",
    "            avg_score = sum(avg) / len(avg)\n",
    "            if avg_score == max(all_avt):\n",
    "                temp_str += f\" & \\\\textbf{{{avg_score * 100:.1f}}}\"\n",
    "            elif avg_score == all_avt[1]:\n",
    "                temp_str += f\" & \\\\underline{{{avg_score * 100:.1f}}}\"\n",
    "            else:\n",
    "                temp_str += f\" & {avg_score * 100:.1f}\" if avg_score > 0 else \" & - \"\n",
    "            if idx == 0:\n",
    "                avg_res_list.append(sum(all_avt) / len(all_avt))\n",
    "        temp_str += \"\\\\\\\\\\n\"\n",
    "        res_str += temp_str\n",
    "    # avg_qualify = sum(avg_qualify_list) / len(avg_qualify_list)\n",
    "    avg_res_str = f\"\\\\cellcolor{{gray!10}} \\\\textbf{{Avg.}} \"\n",
    "    # & \\\\cellcolor{{gray!10}} {avg_res_list[0]*100:.1f}  & \\\\cellcolor{{gray!10}} {avg_qualify*100:.0f}\n",
    "    for res in avg_res_list:\n",
    "        avg_res_str += f\"& \\\\cellcolor{{gray!10}} {res * 100:.1f} \"\n",
    "    avg_res_str += \"\\\\\\\\\\n\"\n",
    "    res_str += avg_res_str\n",
    "    \n",
    "    return res_str\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\hline \\multicolumn{14}{c}{\\textit{\\textbf{Open-source Process Level Reward Models}}} \\\\   \\hline \n",
      "\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B}{Skywork-PRM-1.5B} & 61.1 & 52.0 & 56.4 & 54.2 & 64.8 & 64.9 & 63.3 & 66.5 & 64.9 & 57.5 & 63.3 & 91.1 & 70.7\\\\\n",
      "\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B}{Skywork-PRM-7B} & 65.1 & \\underline{56.4} & \\textbf{62.8} & \\textbf{59.6} & 69.4 & 67.1 & \\underline{67.7} & 69.9 & 68.5 & \\textbf{60.9} & 65.8 & 93.2 & 73.3\\\\\n",
      "\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf}{Llemma-PRM800k-7B} & 52.0 & 49.3 & 53.4 & 51.4 & 56.4 & 47.1 & 46.7 & 53.3 & 50.9 & 51.0 & 53.5 & 93.6 & 66.0\\\\\n",
      "\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf}{Llemma-MetaMath-7B} & 50.5 & 50.2 & 50.5 & 50.3 & 51.9 & 47.6 & 44.4 & 52.1 & 49.0 & 50.5 & 51.3 & 96.0 & 66.0\\\\\n",
      "\\href{https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf}{Llemma-oprm-7B} & 50.3 & 48.7 & 49.3 & 49.0 & 54.2 & 46.8 & 44.5 & 53.5 & 49.8 & 49.2 & 51.3 & 91.8 & 64.1\\\\\n",
      "\\href{https://github.com/KbsdJames/MATH-Minos}{MATHMinos-Mistral-7B} & 54.2 & 48.8 & 54.0 & 51.4 & 57.0 & 52.1 & 50.7 & 57.8 & 54.4 & 52.8 & 55.8 & 91.1 & 66.5\\\\\n",
      "\\href{https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm}{MathShepherd-Mistral-7B} & 47.0 & 44.0 & 50.3 & 47.1 & 49.4 & 44.5 & 41.3 & 47.7 & 45.7 & 47.2 & 48.6 & 86.1 & 60.7\\\\\n",
      "\\href{https://huggingface.co/GAIR/ReasonEval-7B}{ReasonEval-7B} & 60.1 & \\textbf{61.0} & 50.1 & \\underline{55.6} & 62.1 & 65.9 & 61.5 & 66.0 & 63.9 & 55.7 & 58.0 & 99.5 & 71.1\\\\\n",
      "\\href{https://huggingface.co/GAIR/ReasonEval-34B}{ReasonEval-34B} & 60.5 & 54.8 & 48.1 & 51.5 & 66.4 & 60.3 & 57.8 & 67.5 & 63.0 & 57.7 & 64.3 & 97.2 & 73.1\\\\\n",
      "\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data}{RLHFlow-PRM-Mistral-8B} & 54.4 & 46.1 & 47.3 & 46.7 & 56.6 & 55.1 & 54.4 & 63.8 & 57.5 & 51.5 & 56.2 & 97.9 & 68.5\\\\\n",
      "\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data}{RLHFlow-PRM-Deepseek-8B} & 54.2 & 46.4 & 48.9 & 47.6 & 55.7 & 55.0 & 53.2 & 66.2 & 57.5 & 49.0 & 55.4 & \\textbf{99.8} & 68.1\\\\\n",
      "\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B} & \\underline{65.5} & 49.0 & 55.1 & 52.1 & \\underline{71.8} & 67.3 & 66.3 & \\underline{78.5} & \\underline{71.0} & 57.6 & 69.1 & \\underline{99.7} & 75.5\\\\\n",
      "\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B}{Qwen2.5-Math-PRM-72B} & \\textbf{68.2} & 50.4 & \\underline{58.8} & 54.6 & \\textbf{73.7} & \\textbf{71.1} & \\textbf{72.2} & \\textbf{78.6} & \\textbf{73.9} & \\underline{60.3} & \\textbf{71.2} & 99.4 & \\textbf{77.0}\\\\\n",
      "\\href{https://huggingface.co/jinachris/Qwen2.5-Math-7B-PRM800K}{Pure-PRM-7B} & 65.3 & 49.2 & 55.2 & 52.2 & 71.1 & \\underline{68.8} & 64.0 & 76.9 & 70.2 & 60.3 & \\underline{69.2} & 98.0 & \\underline{75.8}\\\\\n",
      "\\cellcolor{gray!10} \\textbf{Avg.} & \\cellcolor{gray!10} 57.7 & \\cellcolor{gray!10} 50.5 & \\cellcolor{gray!10} 52.9 & \\cellcolor{gray!10} 51.7 & \\cellcolor{gray!10} 61.5 & \\cellcolor{gray!10} 58.1 & \\cellcolor{gray!10} 56.3 & \\cellcolor{gray!10} 64.2 & \\cellcolor{gray!10} 60.0 & \\cellcolor{gray!10} 54.4 & \\cellcolor{gray!10} 59.5 & \\cellcolor{gray!10} 95.3 & \\cellcolor{gray!10} 69.7 \\\\\n",
      "\\hline \\multicolumn{14}{c}{\\textit{\\textbf{Open LLMs, Prompted as Critic Models}}} \\\\   \\hline \n",
      "\\href{https://huggingface.co/meta-math/MetaMath-7B-V1.0}{MetaMath-7B} & 49.7 & 48.9 & 46.9 & 47.9 & 47.3 & 48.9 & 48.4 & 48.8 & 48.3 & 46.5 & 48.3 & 98.0 & 64.2\\\\\n",
      "\\href{https://huggingface.co/meta-math/MetaMath-13B-V1.0}{MetaMath-13B} & 49.4 & 50.3 & 44.4 & 47.3 & 47.8 & 47.4 & 49.4 & 48.1 & 48.2 & 49.0 & 48.1 & 99.5 & 65.5\\\\\n",
      "\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B}{Qwen2.5-Math-72B} & 57.4 & 55.3 & 54.9 & 55.1 & 55.5 & \\underline{71.6} & 58.1 & 59.1 & 61.1 & 47.4 & 53.8 & \\textbf{100.0} & 67.1\\\\\n",
      "\\href{https://huggingface.co/Qwen/QwQ-32B-Preview}{QwQ-Preview-32B} & \\underline{63.6} & \\underline{57.2} & \\underline{55.6} & \\underline{56.4} & \\underline{67.4} & \\textbf{72.3} & \\underline{66.2} & \\underline{66.9} & \\underline{68.2} & \\underline{57.8} & \\underline{62.7} & \\textbf{100.0} & \\underline{73.5}\\\\\n",
      "\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B}{R1-Distill-Llama3.1-70B} & 57.5 & 49.5 & 48.1 & 48.8 & 61.4 & 65.5 & 65.8 & 61.1 & 63.4 & 48.8 & 54.1 & 100.0 & 67.6\\\\\n",
      "\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B}{R1-Distill-Qwen-7B} & 52.6 & 32.9 & 37.9 & 35.4 & 47.3 & 54.1 & 48.4 & 48.0 & 49.4 & 45.6 & 46.8 & \\textbf{100.0} & 64.1\\\\\n",
      "\\href{https://github.com/deepseek-ai/DeepSeek-R1}{DeepSeek-R1}$^\\dagger$ & \\textbf{67.8} & \\textbf{63.0} & \\textbf{62.7} & \\textbf{62.9} & \\textbf{68.2} & 68.5 & \\textbf{73.5} & \\textbf{75.4} & \\textbf{71.4} & \\textbf{63.3} & \\textbf{68.0} & \\textbf{100.0} & \\textbf{77.1}\\\\\n",
      "\\cellcolor{gray!10} \\textbf{Avg.} & \\cellcolor{gray!10} 56.8 & \\cellcolor{gray!10} 51.0 & \\cellcolor{gray!10} 50.1 & \\cellcolor{gray!10} 50.5 & \\cellcolor{gray!10} 56.4 & \\cellcolor{gray!10} 61.2 & \\cellcolor{gray!10} 58.5 & \\cellcolor{gray!10} 58.2 & \\cellcolor{gray!10} 58.6 & \\cellcolor{gray!10} 51.2 & \\cellcolor{gray!10} 54.5 & \\cellcolor{gray!10} 99.6 & \\cellcolor{gray!10} 68.5 \\\\\n",
      "\\hline \\multicolumn{14}{c}{\\textit{\\textbf{Proprietary LLMs, Prompted as Critic Models}}} \\\\   \\hline \n",
      "\\href{https://openai.com/index/hello-gpt-4o/}{GPT-4o} & 66.8 & 57.0 & 62.4 & 59.7 & 72.0 & \\underline{69.7} & 70.7 & 71.1 & 70.9 & \\textbf{62.5} & 65.7 & 99.2 & \\textbf{75.8}\\\\\n",
      "\\href{https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/}{o1-mini}$^\\dagger$ & \\underline{68.8} & 65.6 & \\underline{63.7} & \\underline{64.6} & \\textbf{74.5} & 67.7 & \\textbf{73.8} & \\textbf{72.3} & \\textbf{72.1} & \\underline{61.8} & 64.8 & \\textbf{100.0} & \\underline{75.5}\\\\\n",
      "\\href{https://deepmind.google/technologies/gemini/flash/}{Gemini-2.0-flash-exp} & 66.0 & \\underline{67.2} & 58.1 & 62.7 & 70.4 & 65.7 & 66.0 & 67.3 & 67.3 & 61.8 & \\textbf{66.2} & 98.2 & 75.4\\\\\n",
      "\\href{https://ai.google.dev/gemini-api/docs/thinking-mode}{Gemini-2.0-thinking-exp-1219} & \\textbf{68.8} & \\textbf{68.5} & \\textbf{63.8} & \\textbf{66.2} & \\underline{72.9} & \\textbf{71.3} & \\underline{71.0} & \\underline{71.8} & \\underline{71.8} & 60.3 & \\underline{65.7} & \\underline{99.8} & 75.3\\\\\n",
      "\\cellcolor{gray!10} \\textbf{Avg.} & \\cellcolor{gray!10} 67.6 & \\cellcolor{gray!10} 64.6 & \\cellcolor{gray!10} 62.0 & \\cellcolor{gray!10} 63.3 & \\cellcolor{gray!10} 72.4 & \\cellcolor{gray!10} 68.6 & \\cellcolor{gray!10} 70.4 & \\cellcolor{gray!10} 70.7 & \\cellcolor{gray!10} 70.5 & \\cellcolor{gray!10} 61.6 & \\cellcolor{gray!10} 65.6 & \\cellcolor{gray!10} 99.3 & \\cellcolor{gray!10} 75.5 \\\\\n",
      "\n"
     ]
    }
   ],
   "source": [
    "res_str = \"\"\n",
    "\n",
    "## PRMs\n",
    "model_type_panel=\"\\hline \\multicolumn{14}{c}{\\\\textit{\\\\textbf{Open-source Process Level Reward Models}}} \\\\\\\\   \\hline \\n\"\n",
    "res_dict = get_res_dict(file_dict,model_lists=list(prm_model_name_dict.keys()))\n",
    "prm_str = get_res_str(prm_model_name_dict, classification_parallel_dict, res_dict,)\n",
    "res_str += model_type_panel + prm_str\n",
    "\n",
    "## Open Models\n",
    "model_type_panel= \"\\hline \\multicolumn{14}{c}{\\\\textit{\\\\textbf{Open LLMs, Prompted as Critic Models}}} \\\\\\\\   \\hline \\n\"\n",
    "res_dict = get_res_dict(file_dict,model_lists=list(open_model_name_dict.keys()))\n",
    "open_str = get_res_str(open_model_name_dict, classification_parallel_dict, res_dict,)\n",
    "res_str += model_type_panel + open_str\n",
    "\n",
    "## Close Models\n",
    "model_type_panel= \"\\hline \\multicolumn{14}{c}{\\\\textit{\\\\textbf{Proprietary LLMs, Prompted as Critic Models}}} \\\\\\\\   \\hline \\n\"\n",
    "res_dict = get_res_dict(file_dict,model_lists=list(close_model_name_dict.keys()))\n",
    "close_str = get_res_str(close_model_name_dict, classification_parallel_dict, res_dict,)\n",
    "res_str += model_type_panel + close_str\n",
    "\n",
    "\n",
    "print(res_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Markdown CHART"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Model names\n",
    "prm_model_name_dict = dict(\n",
    "    skyworkprm_1_5B=\"[Skywork-PRM-1.5B](https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B)\",\n",
    "    skyworkprm_7B=\"[Skywork-PRM-7B](https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B)\",\n",
    "    llemma7b_prm_prm800k=\"[Llemma-PRM800k-7B](https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf)\",\n",
    "    llemma7b_prm_metamath=\"[Llemma-MetaMath-7B](https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf)\",\n",
    "    llemma7b_oprm_prm800k=\"[Llemma-oprm-7B](https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf)\",\n",
    "    mathminos_mistral=\"[MATHMinos-Mistral-7B](https://github.com/KbsdJames/MATH-Minos)\",\n",
    "    mathshepherd=\"[MathShepherd-Mistral-7B](https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm)\",\n",
    "    reasoneval7b=\"[ReasonEval-7B](https://huggingface.co/GAIR/ReasonEval-7B)\",\n",
    "    llama3_1_8b_prm_mistral=\"[RLHFlow-PRM-Mistral-8B](https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data)\",\n",
    "    llama3_1_8b_prm_deepseek=\"[RLHFlow-PRM-Deepseek-8B](https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data)\",\n",
    "    reasoneval34b=\"[ReasonEval-34B](https://huggingface.co/GAIR/ReasonEval-34B)\",\n",
    "    qwen_prm7b=\"[Qwen2.5-Math-PRM-7B](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B)\",\n",
    "    qwen_prm72b=\"[Qwen2.5-Math-PRM-72B](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B)\",\n",
    "    pure_prm=\"[Pure-PRM](https://huggingface.co/jinachris/Qwen2.5-Math-7B-PRM800K)\",\n",
    ")\n",
    "\n",
    "close_model_name_dict = dict(\n",
    "    gpt4o=\"[GPT-4o](https://openai.com/index/hello-gpt-4o/)\",\n",
    "    o1mini=\"[o1-mini](https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/)\\\\$^\\dagger$\",\n",
    "    \n",
    "    gemini_2_flash=\"[Gemini-2.0-flash-exp](https://deepmind.google/technologies/gemini/flash/)\",\n",
    "    gemini_2_thinking=\"[Gemini-2.0-thinking-exp-1219](https://ai.google.dev/gemini-api/docs/thinking-mode)\",\n",
    ")\n",
    "\n",
    "open_model_name_dict = dict(\n",
    "    qwen_qwq=\"[QwQ-Preview-32B](https://huggingface.co/Qwen/QwQ-32B-Preview)\",\n",
    "    metamath_7b=\"[MetaMath-7B](https://huggingface.co/meta-math/MetaMath-7B-V1.0)\",\n",
    "    metamath_13b=\"[MetaMath-13B](https://huggingface.co/meta-math/MetaMath-13B-V1.0)\",\n",
    "    metamath_70b=\"[MetaMath-70B](https://huggingface.co/meta-math/MetaMath-70B-V1.0)\",\n",
    "    qwen25_7b=\"[Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct)\",\n",
    "    qwen25_72b=\"[Qwen2.5-Math-72B](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B)\",\n",
    "    r1_distill_llama_8b=\"[R1-Distill-Llama3.1-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)\",\n",
    "    r1_distill_llama_70b=\"[R1-Distill-Llama3.1-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B)\",\n",
    "    r1_distill_qwen_7b=\"[R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B)\",\n",
    "    r1_distill_qwen_32b=\"[R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B)\",\n",
    "    wizardmath_7b=\"[WizardMath-7B](https://huggingface.co/WizardLMTeam/WizardMath-7B-V1.0)\"\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_res_str(model_dict, classification_dict, res_dict):\n",
    "    res_str = \"\"\n",
    "    avg_res_list = []\n",
    "\n",
    "    # 表头部分\n",
    "    header_row = \"| Model | Overall\"\n",
    "    separator_row = \"|-------|-------\"\n",
    "    for big_classification, current_classification_dict in classification_dict.items():\n",
    "        for classification, display_classification_name in current_classification_dict.items():\n",
    "            header_row += f\"| {display_classification_name} \"\n",
    "            separator_row += \"|-------\"\n",
    "        header_row += f\"| Avg ({big_classification}) \"\n",
    "        separator_row += \"|-------\"\n",
    "    header_row += \" |\\n\"\n",
    "    separator_row += \" |\\n\"\n",
    "    res_str += header_row + separator_row\n",
    "\n",
    "    # 数据部分\n",
    "    for idx, (model_name, model_display_name) in enumerate(model_dict.items()):\n",
    "        temp_str = f\"| {model_display_name} \"\n",
    "        current_res_dict = res_dict[model_name]\n",
    "\n",
    "        # 计算 PRM Score\n",
    "        prm_score = get_prmscore_from_current_res_dict(current_res_dict)\n",
    "        all_model_scores = sorted([get_prmscore_from_current_res_dict(res) for res in res_dict.values()], reverse=True)\n",
    "        if idx == 0:\n",
    "            avg_res_list.append(sum(all_model_scores) / len(all_model_scores))\n",
    "        if prm_score == max(all_model_scores):\n",
    "            temp_str += f\"| **{prm_score * 100:.1f}** \"\n",
    "        elif prm_score == all_model_scores[1]:\n",
    "            temp_str += f\"| _{prm_score * 100:.1f}_ \"\n",
    "        else:\n",
    "            temp_str += f\"| {prm_score * 100:.1f} \"\n",
    "\n",
    "        # 分类指标部分\n",
    "        for big_classification, current_classification_dict in classification_dict.items():\n",
    "            all_avt = sorted([get_avg_prmscore_from_current_res_dict(res, list(current_classification_dict.keys())) for res in res_dict.values()], reverse=True)\n",
    "            avg = []\n",
    "            for classification, display_classification_name in current_classification_dict.items():\n",
    "                prm_score = get_prmscore_from_current_res_dict(current_res_dict, classification)\n",
    "                all_prm_scores = sorted([get_prmscore_from_current_res_dict(res, classification) for res in res_dict.values()], reverse=True)\n",
    "                if idx == 0:\n",
    "                    avg_res_list.append(sum(all_prm_scores) / len(all_prm_scores))\n",
    "                avg.append(prm_score)\n",
    "                if prm_score == max(all_prm_scores):\n",
    "                    temp_str += f\"| **{prm_score * 100:.1f}** \"\n",
    "                elif prm_score == all_prm_scores[1]:\n",
    "                    temp_str += f\"| _{prm_score * 100:.1f}_ \"\n",
    "                else:\n",
    "                    temp_str += f\"| {prm_score * 100:.1f} \"\n",
    "\n",
    "            # 分类指标的平均分\n",
    "            avg_score = sum(avg) / len(avg)\n",
    "            if avg_score == max(all_avt):\n",
    "                temp_str += f\"| **{avg_score * 100:.1f}** \"\n",
    "            elif avg_score == all_avt[1]:\n",
    "                temp_str += f\"| _{avg_score * 100:.1f}_ \"\n",
    "            else:\n",
    "                temp_str += f\"| {avg_score * 100:.1f} \"\n",
    "            if idx == 0:\n",
    "                avg_res_list.append(sum(all_avt) / len(all_avt))\n",
    "\n",
    "        # 行结束\n",
    "        temp_str += \"\\n\"\n",
    "        res_str += temp_str\n",
    "\n",
    "    # 平均行\n",
    "    avg_res_str = \"| **Avg.** \"\n",
    "    for res in avg_res_list:\n",
    "        avg_res_str += f\"| **{res * 100:.1f}** \"\n",
    "    avg_res_str += \"|\\n\"\n",
    "    res_str += avg_res_str\n",
    "\n",
    "    return res_str"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "| Model | Overall| NR. | NCL. | Avg (simplicity) | ES. | SC. | DC. | CI. | Avg (soundness) | PS. | DR. | MS. | Avg (sensitivity)  |\n",
      "|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|------- |\n",
      "| [Skywork-PRM-1.5B](https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B) | 31.7 | 31.4 | 35.8 | 33.6 | 32.4 | 25.7 | 26.0 | 30.2 | 28.6 | 33.1 | 32.3 | 81.1 | 48.8 \n",
      "| [Skywork-PRM-7B](https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B) | 36.2 | 35.7 | 41.2 | 38.4 | 36.7 | 29.1 | 30.6 | 34.4 | 32.7 | 36.8 | 37.4 | 88.8 | 54.3 \n",
      "| [Llemma-PRM800k-7B](https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf) | 52.0 | 49.3 | 53.4 | 51.4 | 56.4 | 47.1 | 46.7 | 53.3 | 50.9 | 51.0 | 53.5 | 93.6 | 66.0 \n",
      "| [Llemma-MetaMath-7B](https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf) | 50.5 | 50.2 | 50.5 | 50.3 | 51.9 | 47.6 | 44.4 | 52.1 | 49.0 | 50.5 | 51.3 | 96.0 | 66.0 \n",
      "| [Llemma-oprm-7B](https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf) | 50.3 | 48.7 | 49.3 | 49.0 | 54.2 | 46.8 | 44.5 | 53.5 | 49.8 | 49.2 | 51.3 | 91.8 | 64.1 \n",
      "| [MATHMinos-Mistral-7B](https://github.com/KbsdJames/MATH-Minos) | 54.2 | 48.8 | 54.0 | 51.4 | 57.0 | 52.1 | 50.7 | 57.8 | 54.4 | 52.8 | 55.8 | 91.1 | 66.5 \n",
      "| [MathShepherd-Mistral-7B](https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm) | 47.0 | 44.0 | 50.3 | 47.1 | 49.4 | 44.5 | 41.3 | 47.7 | 45.7 | 47.2 | 48.6 | 86.1 | 60.7 \n",
      "| [ReasonEval-7B](https://huggingface.co/GAIR/ReasonEval-7B) | 60.0 | **61.0** | 50.1 | **55.5** | 62.1 | 65.9 | 61.5 | 66.0 | 63.9 | 55.6 | 58.0 | 99.5 | 71.0 \n",
      "| [RLHFlow-PRM-Mistral-8B](https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data) | 54.4 | 46.1 | 47.3 | 46.7 | 56.6 | 55.1 | 54.4 | 63.8 | 57.5 | 51.5 | 56.2 | 97.9 | 68.5 \n",
      "| [RLHFlow-PRM-Deepseek-8B](https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data) | 54.2 | 46.4 | 48.9 | 47.6 | 55.7 | 55.0 | 53.2 | 66.2 | 57.5 | 49.0 | 55.4 | **99.8** | 68.1 \n",
      "| [ReasonEval-34B](https://huggingface.co/GAIR/ReasonEval-34B) | 60.5 | _54.8_ | 48.1 | 51.5 | 66.4 | 60.3 | 57.8 | 67.5 | 63.0 | _57.7_ | 64.3 | 97.2 | 73.1 \n",
      "| [Qwen2.5-Math-PRM-7B](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B) | _65.5_ | 49.0 | _55.1_ | 52.1 | _71.8_ | _67.3_ | _66.3_ | _78.5_ | _71.0_ | 57.6 | _69.1_ | _99.7_ | _75.5_ \n",
      "| [Qwen2.5-Math-PRM-72B](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B) | **68.2** | 50.4 | **58.8** | _54.6_ | **73.7** | **71.1** | **72.2** | **78.6** | **73.9** | **60.3** | **71.2** | 99.4 | **77.0** \n",
      "| **Avg.** | **52.7** | **47.4** | **49.4** | **48.4** | **55.7** | **51.4** | **50.0** | **57.7** | **53.7** | **50.2** | **54.2** | **94.0** | **66.1** |\n",
      "| Model | Overall| NR. | NCL. | Avg (simplicity) | ES. | SC. | DC. | CI. | Avg (soundness) | PS. | DR. | MS. | Avg (sensitivity)  |\n",
      "|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|------- |\n",
      "| [QwQ-Preview-32B](https://huggingface.co/Qwen/QwQ-32B-Preview) | **63.6** | **57.2** | **55.6** | **56.4** | **67.4** | **72.3** | _66.2_ | **66.9** | **68.2** | **57.8** | **62.7** | **100.0** | **73.5** \n",
      "| [MetaMath-7B](https://huggingface.co/meta-math/MetaMath-7B-V1.0) | 49.7 | 48.9 | 46.9 | 47.9 | 47.3 | 48.9 | 48.4 | 48.8 | 48.3 | 46.5 | 48.3 | 98.0 | 64.2 \n",
      "| [MetaMath-13B](https://huggingface.co/meta-math/MetaMath-13B-V1.0) | 49.4 | 50.3 | 44.4 | 47.3 | 47.8 | 47.4 | 49.4 | 48.1 | 48.2 | 49.0 | 48.1 | 99.5 | 65.5 \n",
      "| [MetaMath-70B](https://huggingface.co/meta-math/MetaMath-70B-V1.0) | 45.9 | 45.5 | 43.0 | 44.2 | 44.6 | 51.4 | 47.5 | 0.0 | 35.9 | 44.0 | 45.3 | **100.0** | 63.1 \n",
      "| [Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 49.2 | 0.0 | 0.0 | 0.0 | 0.0 | 48.6 | 0.0 | 0.0 | 12.2 | 0.0 | 0.0 | 0.0 | 0.0 \n",
      "| [Qwen2.5-Math-72B](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B) | 57.4 | 55.3 | _54.9_ | _55.1_ | 55.5 | _71.6_ | 58.1 | 59.1 | 61.1 | 47.4 | 53.8 | **100.0** | 67.1 \n",
      "| [R1-Distill-Llama3.1-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) | 52.7 | 49.9 | 46.6 | 48.2 | 52.9 | 63.2 | 54.5 | 54.9 | 56.4 | 46.7 | 48.5 | **100.0** | 65.0 \n",
      "| [R1-Distill-Llama3.1-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) | 57.5 | 49.5 | 48.1 | 48.8 | 61.4 | 65.5 | 65.8 | 61.1 | 63.4 | 48.8 | 54.1 | 100.0 | 67.6 \n",
      "| [R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 52.6 | 32.9 | 37.9 | 35.4 | 47.3 | 54.1 | 48.4 | 48.0 | 49.4 | 45.6 | 46.8 | **100.0** | 64.1 \n",
      "| [R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) | _60.2_ | _57.2_ | 51.9 | 54.5 | _66.1_ | 68.4 | **69.3** | _64.8_ | _67.2_ | _53.3_ | _54.6_ | 99.9 | _69.3_ \n",
      "| [WizardMath-7B](https://huggingface.co/WizardLMTeam/WizardMath-7B-V1.0) | 49.2 | 50.8 | 48.5 | 49.6 | 44.9 | 0.0 | 45.8 | 47.0 | 34.4 | 0.0 | 45.5 | **100.0** | 48.5 \n",
      "| **Avg.** | **53.4** | **45.2** | **43.4** | **44.3** | **48.7** | **53.8** | **50.3** | **45.3** | **49.5** | **39.9** | **46.1** | **90.7** | **58.9** |\n",
      "| Model | Overall| NR. | NCL. | Avg (simplicity) | ES. | SC. | DC. | CI. | Avg (soundness) | PS. | DR. | MS. | Avg (sensitivity)  |\n",
      "|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|------- |\n",
      "| [GPT-4o](https://openai.com/index/hello-gpt-4o/) | 66.8 | 57.0 | 62.4 | 59.7 | 72.0 | _69.7_ | 70.7 | 71.1 | 70.9 | **62.5** | 65.7 | 99.2 | **75.8** \n",
      "| [o1-mini](https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/)\\$^\\dagger$ | _68.8_ | 65.6 | _63.7_ | _64.6_ | **74.5** | 67.7 | **73.8** | **72.3** | **72.1** | _61.8_ | 64.8 | **100.0** | _75.5_ \n",
      "| [Gemini-2.0-flash-exp](https://deepmind.google/technologies/gemini/flash/) | 66.0 | _67.2_ | 58.1 | 62.7 | 70.4 | 65.7 | 66.0 | 67.3 | 67.3 | 61.8 | **66.2** | 98.2 | 75.4 \n",
      "| [Gemini-2.0-thinking-exp-1219](https://ai.google.dev/gemini-api/docs/thinking-mode) | **68.8** | **68.5** | **63.8** | **66.2** | _72.9_ | **71.3** | _71.0_ | _71.8_ | _71.8_ | 60.3 | _65.7_ | _99.8_ | 75.3 \n",
      "| **Avg.** | **67.6** | **64.6** | **62.0** | **63.3** | **72.4** | **68.6** | **70.4** | **70.7** | **70.5** | **61.6** | **65.6** | **99.3** | **75.5** |\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Render one Markdown table per model family and print them back to back.\n",
    "\n",
    "## PRMs\n",
    "res_dict = get_res_dict(file_dict, model_lists=[*prm_model_name_dict])\n",
    "prm_str = get_res_str(prm_model_name_dict, classification_parallel_dict, res_dict)\n",
    "\n",
    "## Open models\n",
    "res_dict = get_res_dict(file_dict, model_lists=[*open_model_name_dict])\n",
    "open_str = get_res_str(open_model_name_dict, classification_parallel_dict, res_dict)\n",
    "\n",
    "## Closed models\n",
    "res_dict = get_res_dict(file_dict, model_lists=[*close_model_name_dict])\n",
    "close_str = get_res_str(close_model_name_dict, classification_parallel_dict, res_dict)\n",
    "\n",
    "# Note: once this cell has run, `res_dict` holds only the closed-model results.\n",
    "res_str = \"\".join([prm_str, open_str, close_str])\n",
    "print(res_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the HTML leaderboard table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "prm_model_dict = {\n",
    "    \"skyworkprm_1_5B\": {\"Name\": \"Skywork-PRM-1.5B\", \"Source\": \"https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B\", \"Class\": \"PRM\"},\n",
    "    \"skyworkprm_7B\": {\"Name\": \"Skywork-PRM-7B\", \"Source\": \"https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B\", \"Class\": \"PRM\"},\n",
    "    \"llemma7b_prm_prm800k\": {\"Name\": \"Llemma-PRM800k-7B\", \"Source\": \"https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf\", \"Class\": \"PRM\"},\n",
    "    \"llemma7b_prm_metamath\": {\"Name\": \"Llemma-MetaMath-7B\", \"Source\": \"https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf\", \"Class\": \"PRM\"},\n",
    "    \"llemma7b_oprm_prm800k\": {\"Name\": \"Llemma-oprm-7B\", \"Source\": \"https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf\", \"Class\": \"PRM\"},\n",
    "    \"mathminos_mistral\": {\"Name\": \"MATHMinos-Mistral-7B\", \"Source\": \"https://github.com/KbsdJames/MATH-Minos\", \"Class\": \"PRM\"},\n",
    "    \"mathshepherd\": {\"Name\": \"MathShepherd-Mistral-7B\", \"Source\": \"https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm\", \"Class\": \"PRM\"},\n",
    "    \"reasoneval7b\": {\"Name\": \"ReasonEval-7B\", \"Source\": \"https://huggingface.co/GAIR/ReasonEval-7B\", \"Class\": \"PRM\"},\n",
    "    \"llama3_1_8b_prm_mistral\": {\"Name\": \"RLHFlow-PRM-Mistral-8B\", \"Source\": \"https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data\", \"Class\": \"PRM\"},\n",
    "    \"llama3_1_8b_prm_deepseek\": {\"Name\": \"RLHFlow-PRM-Deepseek-8B\", \"Source\": \"https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data\", \"Class\": \"PRM\"},\n",
    "    \"reasoneval34b\": {\"Name\": \"ReasonEval-34B\", \"Source\": \"https://huggingface.co/GAIR/ReasonEval-34B\", \"Class\": \"PRM\"},\n",
    "    \"gpt4o\": {\"Name\": \"GPT-4o\", \"Source\": \"https://openai.com/index/hello-gpt-4o/\", \"Class\": \"LM-C\"},\n",
    "    \"o1mini\": {\"Name\": \"o1-mini\", \"Source\": \"https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/\", \"Class\": \"LM-C\"},\n",
    "    \"gemini_2_flash\": {\"Name\": \"Gemini-2.0-flash-exp\", \"Source\": \"https://deepmind.google/technologies/gemini/flash/\", \"Class\": \"LM-C\"},\n",
    "    \"gemini_2_thinking\": {\"Name\": \"Gemini-2.0-thinking-exp-1219\", \"Source\": \"https://ai.google.dev/gemini-api/docs/thinking-mode\", \"Class\": \"LM-C\"},\n",
    "    # \"o1preview\": {\"Name\": \"o1-preview\", \"Source\": \"https://openai.com/index/introducing-openai-o1-preview/\", \"Class\": \"LM-C\"},\n",
    "    \"qwen_qwq\": {\"Name\": \"QwQ-Preview-32B\", \"Source\": \"https://huggingface.co/Qwen/QwQ-32B-Preview\", \"Class\": \"LM-O\"},\n",
    "    \"qwen_prm7b\": {\"Name\": \"Qwen2.5-Math-PRM-7B\", \"Source\": \"https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B\", \"Class\": \"PRM\"},\n",
    "    \"qwen_prm72b\": {\"Name\": \"Qwen2.5-Math-PRM-72B\", \"Source\": \"https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B\", \"Class\": \"PRM\"},\n",
    "    \"metamath_7b\": {\"Name\": \"MetaMath-7B\", \"Source\": \"https://huggingface.co/meta-math/MetaMath-7B-V1.0\", \"Class\": \"LM-O\"},\n",
    "    \"metamath_13b\": {\"Name\": \"MetaMath-13B\", \"Source\": \"https://huggingface.co/meta-math/MetaMath-13B-V1.0\", \"Class\": \"LM-O\"},\n",
    "    # \"metamath_70b\": {\"Name\": \"MetaMath-70B\", \"Source\": \"https://huggingface.co/meta-math/MetaMath-70B-V1.0\", \"Class\": \"LM-O\"},\n",
    "    # \"qwen25_7b\": {\"Name\": \"Qwen2.5-Math-7B\", \"Source\": \"https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct\", \"Class\": \"LM-O\"},\n",
    "    \"qwen25_72b\": {\"Name\": \"Qwen2.5-Math-72B\", \"Source\": \"https://huggingface.co/Qwen/Qwen2.5-Math-72B\", \"Class\": \"LM-O\"},\n",
    "    \"r1_distill_llama_8b\": {\"Name\": \"R1-Distill-Llama3.1-8B\", \"Source\": \"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B\", \"Class\": \"LM-O\"},\n",
    "    \"r1_distill_llama_70b\": {\"Name\": \"R1-Distill-Llama3.1-70B\", \"Source\": \"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B\", \"Class\": \"LM-O\"},\n",
    "    \"r1_distill_qwen_7b\": {\"Name\": \"R1-Distill-Qwen-7B\", \"Source\": \"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\", \"Class\": \"LM-O\"},\n",
    "    \"r1_distill_qwen_32b\": {\"Name\": \"R1-Distill-Qwen-32B\", \"Source\": \"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B\", \"Class\": \"LM-O\"},\n",
    "    # \"wizardmath_7b\": {\"Name\": \"WizardMath-7B\", \"Source\": \"https://huggingface.co/WizardLMTeam/WizardMath-7B-V1.0\", \"Class\": \"LM-O\"}\n",
    "    \"pure_prm_7b\": {\"Name\": \"Pure-PRM-7B\", \"Source\": \"https://huggingface.co/jinachris/Qwen2.5-Math-7B-PRM800K\", \"Class\": \"PRM\"}\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_html_table(model_dict, classification_dict, res_dict):\n",
    "    \"\"\"Render the leaderboard as an HTML ``<table>`` string.\n",
    "\n",
    "    Args:\n",
    "        model_dict: Maps a model key to a dict with \"Name\", \"Source\" and\n",
    "            \"Class\" entries.\n",
    "        classification_dict: Maps each top-level category to a dict of\n",
    "            ``{sub-category key: column title}``. Each top-level category\n",
    "            yields its sub-category columns followed by an average column\n",
    "            titled ``S1``, ``S2``, ...\n",
    "        res_dict: Maps a model key to the result dict that is passed to\n",
    "            ``get_prmscore_from_current_res_dict``.\n",
    "\n",
    "    Returns:\n",
    "        str: The header row plus one row per entry of ``model_dict``, ordered\n",
    "        by overall score (highest first) and numbered from 1.\n",
    "    \"\"\"\n",
    "    medals = (\" 🥇\", \" 🥈\", \" 🥉\")\n",
    "\n",
    "    # Header: fixed columns, then the sub-category columns of every top-level\n",
    "    # category followed by that category's average column.\n",
    "    titles = [\"#\", \"Model\", \"Class\", \"Source\", \"Overall\"]\n",
    "    for group_idx, sub_categories in enumerate(classification_dict.values()):\n",
    "        titles.extend(sub_categories.values())\n",
    "        titles.append(f\"S{group_idx + 1}\")\n",
    "    res_str = '<table class=\"js-sort-table\" id=\"results\">\\n'\n",
    "    res_str += '  <tr>\\n'\n",
    "    for title in titles:\n",
    "        res_str += f'    <td class=\"js-sort-number\"><strong>{title}</strong></td>\\n'\n",
    "    res_str += '  </tr>\\n'\n",
    "\n",
    "    # Medal thresholds: the three best overall scores among ALL entries of\n",
    "    # res_dict (which may contain models that are not in model_dict). Computed\n",
    "    # once instead of once per row; slicing keeps this safe when res_dict has\n",
    "    # fewer than three entries.\n",
    "    top_scores = sorted(\n",
    "        (get_prmscore_from_current_res_dict(res) for res in res_dict.values()),\n",
    "        reverse=True,\n",
    "    )[:3]\n",
    "\n",
    "    rows = []  # (overall score, row cells that follow the rank cell)\n",
    "    for model_k, model in model_dict.items():\n",
    "        current_res_dict = res_dict.get(model_k, {})\n",
    "        overall_score = get_prmscore_from_current_res_dict(current_res_dict)\n",
    "\n",
    "        # The first matching threshold wins, so tied scores share the higher medal.\n",
    "        medal = next(\n",
    "            (symbol for threshold, symbol in zip(top_scores, medals) if overall_score == threshold),\n",
    "            \"\",\n",
    "        )\n",
    "        css_class = \"best-score-text\" if medal else \"\"\n",
    "\n",
    "        cells = f'     <td><b class=\"{css_class}\">{model[\"Name\"]}{medal}</b></td>\\n'\n",
    "        cells += f'    <td>{model[\"Class\"]}</td>\\n'\n",
    "        cells += f'    <td><a href=\"{model[\"Source\"]}\" class=\"ext-link\" target=\"_blank\">Link</a></td>\\n'\n",
    "        cells += f'    <td><b class=\"{css_class}\">{overall_score * 100:.1f}</b></td>\\n'\n",
    "\n",
    "        # Per-category scores, each group followed by its mean.\n",
    "        for sub_categories in classification_dict.values():\n",
    "            sub_scores = [\n",
    "                get_prmscore_from_current_res_dict(current_res_dict, sub_category)\n",
    "                for sub_category in sub_categories\n",
    "            ]\n",
    "            for sub_score in sub_scores:\n",
    "                cells += f'    <td>{sub_score * 100:.1f}</td>\\n'\n",
    "            avg_score = sum(sub_scores) / len(sub_scores) if sub_scores else 0\n",
    "            cells += f'    <td><b class=\"\">{avg_score * 100:.1f}</b></td>\\n'\n",
    "\n",
    "        cells += '  </tr>\\n'\n",
    "        rows.append((overall_score, cells))\n",
    "\n",
    "    # Stable sort: models with equal scores keep their model_dict order.\n",
    "    rows.sort(key=lambda row: row[0], reverse=True)\n",
    "\n",
    "    # The rank cell is written only after sorting. Building it directly avoids\n",
    "    # calling str.format on text that already contains model names and URLs,\n",
    "    # where a literal '{' or '}' would raise or be silently altered.\n",
    "    for rank, (_, cells) in enumerate(rows, start=1):\n",
    "        res_str += f'  <tr>\\n    <td>{rank}</td>\\n' + cells\n",
    "    res_str += '</table>\\n'\n",
    "    return res_str"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<table class=\"js-sort-table\" id=\"results\">\n",
      "  <tr>\n",
      "    <td class=\"js-sort-number\"><strong>#</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>Model</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>Class</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>Source</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>Overall</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>NR.</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>NCL.</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>S1</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>ES.</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>SC.</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>DC.</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>CI.</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>S2</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>PS.</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>DR.</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>MS.</strong></td>\n",
      "    <td class=\"js-sort-number\"><strong>S3</strong></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>1</td>\n",
      "     <td><b class=\"best-score-text\">Gemini-2.0-thinking-exp-1219 🥇</b></td>\n",
      "    <td>LM-C</td>\n",
      "    <td><a href=\"https://ai.google.dev/gemini-api/docs/thinking-mode\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"best-score-text\">68.8</b></td>\n",
      "    <td>68.5</td>\n",
      "    <td>63.8</td>\n",
      "    <td><b class=\"\">66.2</b></td>\n",
      "    <td>72.9</td>\n",
      "    <td>71.3</td>\n",
      "    <td>71.0</td>\n",
      "    <td>71.8</td>\n",
      "    <td><b class=\"\">71.8</b></td>\n",
      "    <td>60.3</td>\n",
      "    <td>65.7</td>\n",
      "    <td>99.8</td>\n",
      "    <td><b class=\"\">75.3</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>2</td>\n",
      "     <td><b class=\"best-score-text\">o1-mini 🥈</b></td>\n",
      "    <td>LM-C</td>\n",
      "    <td><a href=\"https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"best-score-text\">68.8</b></td>\n",
      "    <td>65.6</td>\n",
      "    <td>63.7</td>\n",
      "    <td><b class=\"\">64.6</b></td>\n",
      "    <td>74.5</td>\n",
      "    <td>67.7</td>\n",
      "    <td>73.8</td>\n",
      "    <td>72.3</td>\n",
      "    <td><b class=\"\">72.1</b></td>\n",
      "    <td>61.8</td>\n",
      "    <td>64.8</td>\n",
      "    <td>100.0</td>\n",
      "    <td><b class=\"\">75.5</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>3</td>\n",
      "     <td><b class=\"best-score-text\">Qwen2.5-Math-PRM-72B 🥉</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"best-score-text\">68.2</b></td>\n",
      "    <td>50.4</td>\n",
      "    <td>58.8</td>\n",
      "    <td><b class=\"\">54.6</b></td>\n",
      "    <td>73.7</td>\n",
      "    <td>71.1</td>\n",
      "    <td>72.2</td>\n",
      "    <td>78.6</td>\n",
      "    <td><b class=\"\">73.9</b></td>\n",
      "    <td>60.3</td>\n",
      "    <td>71.2</td>\n",
      "    <td>99.4</td>\n",
      "    <td><b class=\"\">77.0</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>4</td>\n",
      "     <td><b class=\"\">GPT-4o</b></td>\n",
      "    <td>LM-C</td>\n",
      "    <td><a href=\"https://openai.com/index/hello-gpt-4o/\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">66.8</b></td>\n",
      "    <td>57.0</td>\n",
      "    <td>62.4</td>\n",
      "    <td><b class=\"\">59.7</b></td>\n",
      "    <td>72.0</td>\n",
      "    <td>69.7</td>\n",
      "    <td>70.7</td>\n",
      "    <td>71.1</td>\n",
      "    <td><b class=\"\">70.9</b></td>\n",
      "    <td>62.5</td>\n",
      "    <td>65.7</td>\n",
      "    <td>99.2</td>\n",
      "    <td><b class=\"\">75.8</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>5</td>\n",
      "     <td><b class=\"\">Gemini-2.0-flash-exp</b></td>\n",
      "    <td>LM-C</td>\n",
      "    <td><a href=\"https://deepmind.google/technologies/gemini/flash/\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">66.0</b></td>\n",
      "    <td>67.2</td>\n",
      "    <td>58.1</td>\n",
      "    <td><b class=\"\">62.7</b></td>\n",
      "    <td>70.4</td>\n",
      "    <td>65.7</td>\n",
      "    <td>66.0</td>\n",
      "    <td>67.3</td>\n",
      "    <td><b class=\"\">67.3</b></td>\n",
      "    <td>61.8</td>\n",
      "    <td>66.2</td>\n",
      "    <td>98.2</td>\n",
      "    <td><b class=\"\">75.4</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>6</td>\n",
      "     <td><b class=\"\">Qwen2.5-Math-PRM-7B</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">65.5</b></td>\n",
      "    <td>49.0</td>\n",
      "    <td>55.1</td>\n",
      "    <td><b class=\"\">52.1</b></td>\n",
      "    <td>71.8</td>\n",
      "    <td>67.3</td>\n",
      "    <td>66.3</td>\n",
      "    <td>78.5</td>\n",
      "    <td><b class=\"\">71.0</b></td>\n",
      "    <td>57.6</td>\n",
      "    <td>69.1</td>\n",
      "    <td>99.7</td>\n",
      "    <td><b class=\"\">75.5</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>7</td>\n",
      "     <td><b class=\"\">Pure-PRM-7B</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://huggingface.co/jinachris/Qwen2.5-Math-7B-PRM800K\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">65.3</b></td>\n",
      "    <td>49.2</td>\n",
      "    <td>55.2</td>\n",
      "    <td><b class=\"\">52.2</b></td>\n",
      "    <td>71.1</td>\n",
      "    <td>68.8</td>\n",
      "    <td>64.0</td>\n",
      "    <td>76.9</td>\n",
      "    <td><b class=\"\">70.2</b></td>\n",
      "    <td>60.3</td>\n",
      "    <td>69.2</td>\n",
      "    <td>98.0</td>\n",
      "    <td><b class=\"\">75.8</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>8</td>\n",
      "     <td><b class=\"\">Skywork-PRM-7B</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">65.1</b></td>\n",
      "    <td>56.4</td>\n",
      "    <td>62.8</td>\n",
      "    <td><b class=\"\">59.6</b></td>\n",
      "    <td>69.4</td>\n",
      "    <td>67.1</td>\n",
      "    <td>67.7</td>\n",
      "    <td>69.9</td>\n",
      "    <td><b class=\"\">68.5</b></td>\n",
      "    <td>60.9</td>\n",
      "    <td>65.8</td>\n",
      "    <td>93.2</td>\n",
      "    <td><b class=\"\">73.3</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>9</td>\n",
      "     <td><b class=\"\">QwQ-Preview-32B</b></td>\n",
      "    <td>LM-O</td>\n",
      "    <td><a href=\"https://huggingface.co/Qwen/QwQ-32B-Preview\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">63.6</b></td>\n",
      "    <td>57.2</td>\n",
      "    <td>55.6</td>\n",
      "    <td><b class=\"\">56.4</b></td>\n",
      "    <td>67.4</td>\n",
      "    <td>72.3</td>\n",
      "    <td>66.2</td>\n",
      "    <td>66.9</td>\n",
      "    <td><b class=\"\">68.2</b></td>\n",
      "    <td>57.8</td>\n",
      "    <td>62.7</td>\n",
      "    <td>100.0</td>\n",
      "    <td><b class=\"\">73.5</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>10</td>\n",
      "     <td><b class=\"\">Skywork-PRM-1.5B</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">61.1</b></td>\n",
      "    <td>52.0</td>\n",
      "    <td>56.4</td>\n",
      "    <td><b class=\"\">54.2</b></td>\n",
      "    <td>64.8</td>\n",
      "    <td>64.9</td>\n",
      "    <td>63.3</td>\n",
      "    <td>66.5</td>\n",
      "    <td><b class=\"\">64.9</b></td>\n",
      "    <td>57.5</td>\n",
      "    <td>63.3</td>\n",
      "    <td>91.1</td>\n",
      "    <td><b class=\"\">70.7</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>11</td>\n",
      "     <td><b class=\"\">ReasonEval-34B</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://huggingface.co/GAIR/ReasonEval-34B\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">60.5</b></td>\n",
      "    <td>54.8</td>\n",
      "    <td>48.1</td>\n",
      "    <td><b class=\"\">51.5</b></td>\n",
      "    <td>66.4</td>\n",
      "    <td>60.3</td>\n",
      "    <td>57.8</td>\n",
      "    <td>67.5</td>\n",
      "    <td><b class=\"\">63.0</b></td>\n",
      "    <td>57.7</td>\n",
      "    <td>64.3</td>\n",
      "    <td>97.2</td>\n",
      "    <td><b class=\"\">73.1</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>12</td>\n",
      "     <td><b class=\"\">R1-Distill-Qwen-32B</b></td>\n",
      "    <td>LM-O</td>\n",
      "    <td><a href=\"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">60.2</b></td>\n",
      "    <td>57.2</td>\n",
      "    <td>51.9</td>\n",
      "    <td><b class=\"\">54.5</b></td>\n",
      "    <td>66.1</td>\n",
      "    <td>68.4</td>\n",
      "    <td>69.3</td>\n",
      "    <td>64.8</td>\n",
      "    <td><b class=\"\">67.2</b></td>\n",
      "    <td>53.3</td>\n",
      "    <td>54.6</td>\n",
      "    <td>99.9</td>\n",
      "    <td><b class=\"\">69.3</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>13</td>\n",
      "     <td><b class=\"\">ReasonEval-7B</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://huggingface.co/GAIR/ReasonEval-7B\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">60.1</b></td>\n",
      "    <td>61.0</td>\n",
      "    <td>50.1</td>\n",
      "    <td><b class=\"\">55.6</b></td>\n",
      "    <td>62.1</td>\n",
      "    <td>65.9</td>\n",
      "    <td>61.5</td>\n",
      "    <td>66.0</td>\n",
      "    <td><b class=\"\">63.9</b></td>\n",
      "    <td>55.7</td>\n",
      "    <td>58.0</td>\n",
      "    <td>99.5</td>\n",
      "    <td><b class=\"\">71.1</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>14</td>\n",
      "     <td><b class=\"\">R1-Distill-Llama3.1-70B</b></td>\n",
      "    <td>LM-O</td>\n",
      "    <td><a href=\"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">57.5</b></td>\n",
      "    <td>49.5</td>\n",
      "    <td>48.1</td>\n",
      "    <td><b class=\"\">48.8</b></td>\n",
      "    <td>61.4</td>\n",
      "    <td>65.5</td>\n",
      "    <td>65.8</td>\n",
      "    <td>61.1</td>\n",
      "    <td><b class=\"\">63.4</b></td>\n",
      "    <td>48.8</td>\n",
      "    <td>54.1</td>\n",
      "    <td>100.0</td>\n",
      "    <td><b class=\"\">67.6</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>15</td>\n",
      "     <td><b class=\"\">Qwen2.5-Math-72B</b></td>\n",
      "    <td>LM-O</td>\n",
      "    <td><a href=\"https://huggingface.co/Qwen/Qwen2.5-Math-72B\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">57.4</b></td>\n",
      "    <td>55.3</td>\n",
      "    <td>54.9</td>\n",
      "    <td><b class=\"\">55.1</b></td>\n",
      "    <td>55.5</td>\n",
      "    <td>71.6</td>\n",
      "    <td>58.1</td>\n",
      "    <td>59.1</td>\n",
      "    <td><b class=\"\">61.1</b></td>\n",
      "    <td>47.4</td>\n",
      "    <td>53.8</td>\n",
      "    <td>100.0</td>\n",
      "    <td><b class=\"\">67.1</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>16</td>\n",
      "     <td><b class=\"\">RLHFlow-PRM-Mistral-8B</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">54.4</b></td>\n",
      "    <td>46.1</td>\n",
      "    <td>47.3</td>\n",
      "    <td><b class=\"\">46.7</b></td>\n",
      "    <td>56.6</td>\n",
      "    <td>55.1</td>\n",
      "    <td>54.4</td>\n",
      "    <td>63.8</td>\n",
      "    <td><b class=\"\">57.5</b></td>\n",
      "    <td>51.5</td>\n",
      "    <td>56.2</td>\n",
      "    <td>97.9</td>\n",
      "    <td><b class=\"\">68.5</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>17</td>\n",
      "     <td><b class=\"\">RLHFlow-PRM-Deepseek-8B</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">54.2</b></td>\n",
      "    <td>46.4</td>\n",
      "    <td>48.9</td>\n",
      "    <td><b class=\"\">47.6</b></td>\n",
      "    <td>55.7</td>\n",
      "    <td>55.0</td>\n",
      "    <td>53.2</td>\n",
      "    <td>66.2</td>\n",
      "    <td><b class=\"\">57.5</b></td>\n",
      "    <td>49.0</td>\n",
      "    <td>55.4</td>\n",
      "    <td>99.8</td>\n",
      "    <td><b class=\"\">68.1</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>18</td>\n",
      "     <td><b class=\"\">MATHMinos-Mistral-7B</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://github.com/KbsdJames/MATH-Minos\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">54.2</b></td>\n",
      "    <td>48.8</td>\n",
      "    <td>54.0</td>\n",
      "    <td><b class=\"\">51.4</b></td>\n",
      "    <td>57.0</td>\n",
      "    <td>52.1</td>\n",
      "    <td>50.7</td>\n",
      "    <td>57.8</td>\n",
      "    <td><b class=\"\">54.4</b></td>\n",
      "    <td>52.8</td>\n",
      "    <td>55.8</td>\n",
      "    <td>91.1</td>\n",
      "    <td><b class=\"\">66.5</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>19</td>\n",
      "     <td><b class=\"\">R1-Distill-Llama3.1-8B</b></td>\n",
      "    <td>LM-O</td>\n",
      "    <td><a href=\"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">52.7</b></td>\n",
      "    <td>49.9</td>\n",
      "    <td>46.6</td>\n",
      "    <td><b class=\"\">48.2</b></td>\n",
      "    <td>52.9</td>\n",
      "    <td>63.2</td>\n",
      "    <td>54.5</td>\n",
      "    <td>54.9</td>\n",
      "    <td><b class=\"\">56.4</b></td>\n",
      "    <td>46.7</td>\n",
      "    <td>48.5</td>\n",
      "    <td>100.0</td>\n",
      "    <td><b class=\"\">65.0</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>20</td>\n",
      "     <td><b class=\"\">R1-Distill-Qwen-7B</b></td>\n",
      "    <td>LM-O</td>\n",
      "    <td><a href=\"https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">52.6</b></td>\n",
      "    <td>32.9</td>\n",
      "    <td>37.9</td>\n",
      "    <td><b class=\"\">35.4</b></td>\n",
      "    <td>47.3</td>\n",
      "    <td>54.1</td>\n",
      "    <td>48.4</td>\n",
      "    <td>48.0</td>\n",
      "    <td><b class=\"\">49.4</b></td>\n",
      "    <td>45.6</td>\n",
      "    <td>46.8</td>\n",
      "    <td>100.0</td>\n",
      "    <td><b class=\"\">64.1</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>21</td>\n",
      "     <td><b class=\"\">Llemma-PRM800k-7B</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">52.0</b></td>\n",
      "    <td>49.3</td>\n",
      "    <td>53.4</td>\n",
      "    <td><b class=\"\">51.4</b></td>\n",
      "    <td>56.4</td>\n",
      "    <td>47.1</td>\n",
      "    <td>46.7</td>\n",
      "    <td>53.3</td>\n",
      "    <td><b class=\"\">50.9</b></td>\n",
      "    <td>51.0</td>\n",
      "    <td>53.5</td>\n",
      "    <td>93.6</td>\n",
      "    <td><b class=\"\">66.0</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>22</td>\n",
      "     <td><b class=\"\">Llemma-MetaMath-7B</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">50.5</b></td>\n",
      "    <td>50.2</td>\n",
      "    <td>50.5</td>\n",
      "    <td><b class=\"\">50.3</b></td>\n",
      "    <td>51.9</td>\n",
      "    <td>47.6</td>\n",
      "    <td>44.4</td>\n",
      "    <td>52.1</td>\n",
      "    <td><b class=\"\">49.0</b></td>\n",
      "    <td>50.5</td>\n",
      "    <td>51.3</td>\n",
      "    <td>96.0</td>\n",
      "    <td><b class=\"\">66.0</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>23</td>\n",
      "     <td><b class=\"\">Llemma-oprm-7B</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">50.3</b></td>\n",
      "    <td>48.7</td>\n",
      "    <td>49.3</td>\n",
      "    <td><b class=\"\">49.0</b></td>\n",
      "    <td>54.2</td>\n",
      "    <td>46.8</td>\n",
      "    <td>44.5</td>\n",
      "    <td>53.5</td>\n",
      "    <td><b class=\"\">49.8</b></td>\n",
      "    <td>49.2</td>\n",
      "    <td>51.3</td>\n",
      "    <td>91.8</td>\n",
      "    <td><b class=\"\">64.1</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>24</td>\n",
      "     <td><b class=\"\">MetaMath-7B</b></td>\n",
      "    <td>LM-O</td>\n",
      "    <td><a href=\"https://huggingface.co/meta-math/MetaMath-7B-V1.0\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">49.7</b></td>\n",
      "    <td>48.9</td>\n",
      "    <td>46.9</td>\n",
      "    <td><b class=\"\">47.9</b></td>\n",
      "    <td>47.3</td>\n",
      "    <td>48.9</td>\n",
      "    <td>48.4</td>\n",
      "    <td>48.8</td>\n",
      "    <td><b class=\"\">48.3</b></td>\n",
      "    <td>46.5</td>\n",
      "    <td>48.3</td>\n",
      "    <td>98.0</td>\n",
      "    <td><b class=\"\">64.2</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>25</td>\n",
      "     <td><b class=\"\">MetaMath-13B</b></td>\n",
      "    <td>LM-O</td>\n",
      "    <td><a href=\"https://huggingface.co/meta-math/MetaMath-13B-V1.0\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">49.4</b></td>\n",
      "    <td>50.3</td>\n",
      "    <td>44.4</td>\n",
      "    <td><b class=\"\">47.3</b></td>\n",
      "    <td>47.8</td>\n",
      "    <td>47.4</td>\n",
      "    <td>49.4</td>\n",
      "    <td>48.1</td>\n",
      "    <td><b class=\"\">48.2</b></td>\n",
      "    <td>49.0</td>\n",
      "    <td>48.1</td>\n",
      "    <td>99.5</td>\n",
      "    <td><b class=\"\">65.5</b></td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <td>26</td>\n",
      "     <td><b class=\"\">MathShepherd-Mistral-7B</b></td>\n",
      "    <td>PRM</td>\n",
      "    <td><a href=\"https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm\" class=\"ext-link\" target=\"_blank\">Link</a></td>\n",
      "    <td><b class=\"\">47.0</b></td>\n",
      "    <td>44.0</td>\n",
      "    <td>50.3</td>\n",
      "    <td><b class=\"\">47.1</b></td>\n",
      "    <td>49.4</td>\n",
      "    <td>44.5</td>\n",
      "    <td>41.3</td>\n",
      "    <td>47.7</td>\n",
      "    <td><b class=\"\">45.7</b></td>\n",
      "    <td>47.2</td>\n",
      "    <td>48.6</td>\n",
      "    <td>86.1</td>\n",
      "    <td><b class=\"\">60.7</b></td>\n",
      "  </tr>\n",
      "</table>\n",
      "\n"
     ]
    }
   ],
   "source": [
    "res_str = \"\"\n",
    "\n",
    "## PRMs\n",
    "res_dict = get_res_dict(file_dict,model_lists=list(prm_model_dict.keys()))\n",
    "prm_str = get_html_table(prm_model_dict, classification_parallel_dict, res_dict,)\n",
    "res_str += prm_str\n",
    "\n",
    "print(res_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "smoe",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
