{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "ename": "ImportError",
     "evalue": "attempted relative import with no known parent package",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmr_eval\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m      4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mlist_jsonl_files\u001b[39m(folder_path):\n",
      "\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package"
     ]
    }
   ],
   "source": [
    "from mr_eval.utils.utils import *\n",
    "import os\n",
    "\n",
    "def list_jsonl_files(folder_path):\n",
    "    \"\"\"\n",
    "    列举文件夹中的所有 .jsonl 文件\n",
    "    Args:\n",
    "        folder_path (str): 文件夹路径\n",
    "    Returns:\n",
    "        List[str]: 所有 .jsonl 文件的路径\n",
    "    \"\"\"\n",
    "    return [f for f in os.listdir(folder_path) if f.endswith(\".jsonl\")]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Model names\n",
    "prm_model_name_dict = dict(\n",
    "    qwen_prm=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B}\",\n",
    "    qwen_prm_7b_05_to_1_4000=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_05_to_1_4000}\", \n",
    "    qwen_prm_7b_05_to_1_2000=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_05_to_1_2000}\", \n",
    "    qwen_prm_7b_CL_0=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL_0}\", \n",
    "    qwen_prm_7b_CL_1=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL_1}\", \n",
    "    qwen_prm_7b_CL_2=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL_2}\", \n",
    "    qwen_prm_7b_CL_3=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL_3}\", \n",
    "    qwen_prm_7b_CL3=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL3}\", \n",
    "    qwen_prm_7b_CL_1_dynamic=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL_1_dynamic}\", \n",
    "    qwen_prm_7b_CL_2_dynamic=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL_2_dynamic}\", \n",
    "    qwen_prm_7b_CL0_wo_0rating=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL_0_wo_0rating}\", \n",
    "    qwen_prm_7b_CL1_dynamic_03_to_1=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL1_dynamic_03_to_1}\", \n",
    "    qwen_prm_7b_CL2_dynamic_01_to_1=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL2_dynamic_01_to_1}\", \n",
    "    qwen_prm_7b_CL1_sameques_aug=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL1_sameques_aug}\", \n",
    "    qwen_prm_7b_CL2_sameques_aug=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL2_sameques_aug}\", \n",
    "    qwen_prm_7b_CL3_sameques_aug=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL3_sameques_aug}\", \n",
    "    qwen_prm_7b_CL4_sameques_aug=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL4_sameques_aug}\", \n",
    "    qwen_prm_7b_CL2_sameques_aug_4000=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL2_sameques_aug_4000}\", \n",
    "    qwen_prm_7b_CL0_samestep_aug=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL1_samestep_aug}\", \n",
    "    qwen_prm_7b_CL1_samestep_aug=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL2_samestep_aug}\", \n",
    "    qwen_prm_7b_CL2_samestep_aug=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B_CL3_samestep_aug}\", \n",
    "    # skyworkprm_1_5B=\"\\\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B}{Skywork-PRM-1.5B}\",\n",
    "    # skyworkprm_7B=\"\\\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B}{Skywork-PRM-7B}\",\n",
    "    # llemma7b_prm_prm800k=\"\\\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf}{Llemma-PRM800k-7B}\",\n",
    "    # llemma7b_prm_metamath=\"\\\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf}{Llemma-MetaMath-7B}\",\n",
    "    # llemma7b_oprm_prm800k=\"\\\\href{https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf}{Llemma-oprm-7B}\",\n",
    "    # mathminos_mistral=\"\\\\href{https://github.com/KbsdJames/MATH-Minos}{MATHMinos-Mistral-7B}\",\n",
    "    # mathshepherd=\"\\\\href{https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm}{MathShepherd-Mistral-7B}\",\n",
    "    # reasoneval7b=\"\\\\href{https://huggingface.co/GAIR/ReasonEval-7B}{ReasonEval-7B}\",\n",
    "    # llama3_1_8b_prm_mistral=\"\\\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data}{RLHFlow-PRM-Mistral-8B}\",\n",
    "    # llama3_1_8b_prm_deepseek=\"\\\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data}{RLHFlow-PRM-Deepseek-8B}\",\n",
    "    # reasoneval34b=\"\\\\href{https://huggingface.co/GAIR/ReasonEval-34B}{ReasonEval-34B}\",\n",
    ")\n",
    "close_model_name_dict = dict(\n",
    "    # gpt4o=\"\\\\href{https://openai.com/index/hello-gpt-4o/}{GPT-4o}\",\n",
    "    # o1mini=\"\\\\href{https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/}{o1-mini}$^\\dagger$\",\n",
    "    # o1preview=\"\\\\href{https://openai.com/index/introducing-openai-o1-preview/}{o1-preview}$^\\dagger$\",\n",
    "    # gemini_2_flash=\"\\\\href{https://deepmind.google/technologies/gemini/flash/}{Gemini-2.0-flash-exp}\",\n",
    "    # gemini_2_thinking=\"\\\\href{https://ai.google.dev/gemini-api/docs/thinking-mode}{Gemini-2.0-thinking-exp-1219}\",\n",
    ")\n",
    "    \n",
    "open_model_name_dict = dict(\n",
    "    qwen_qwq=\"\\\\href{https://huggingface.co/Qwen/QwQ-32B-Preview}{QwQ-Preview-32B}\",\n",
    ")\n",
    "\n",
    "\n",
    "\n",
    "classification_name_dict = dict(\n",
    "    domain_inconsistency=\"DC.\",\n",
    "    redundency=\"NR.\",\n",
    "    multi_solutions=\"MS.\",\n",
    "    deception=\"DR.\",\n",
    "    confidence=\"CI.\",\n",
    "    step_contradiction=\"SC.\",\n",
    "    circular=\"NCL.\",\n",
    "    missing_condition=\"PS.\",\n",
    "    counterfactual=\"ES.\"\n",
    ")\n",
    "classification_parallel_dict = dict(\n",
    "    simplicity=dict(\n",
    "        redundency=\"NR.\",\n",
    "        circular=\"NCL.\",\n",
    "    ),\n",
    "    soundness=dict(\n",
    "        counterfactual=\"ES.\",\n",
    "        step_contradiction=\"SC.\",\n",
    "        domain_inconsistency=\"DC.\",\n",
    "        confidence=\"CI.\",\n",
    "    ),\n",
    "    sensitivity=dict(\n",
    "        missing_condition=\"PS.\",\n",
    "        deception=\"DR.\",\n",
    "        multi_solutions=\"MS.\",\n",
    "    )\n",
    ")\n",
    "classifications = [\"redundency\", \"circular\", \"counterfactual\", \"step_contradiction\", \"domain_inconsistency\",  \"confidence\", \"missing_condition\", \"deception\", \"multi_solutions\", ]\n",
    "metrics = [\"f1\", \"negative_f1\", \"total_step_acc\", \"correct_step_acc\", \"wrong_step_acc\", \"first_error_acc\", \"similarity\",]\n",
    "\n",
    "## File paths\n",
    "res_dir = \"/cmlscratch/agrawal5/Reward_hacking/scripts/PRMBench/results\"\n",
    "res_files = list_jsonl_files(res_dir)\n",
    "res_names = [f.split(\".\")[0] for f in res_files]\n",
    "res_paths = [os.path.join(res_dir, f) for f in res_files]\n",
    "file_dict = dict(zip(res_names, res_paths))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[37], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_res_dict\u001b[39m(file_dict,model_lists\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m      2\u001b[0m     res_dict \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m      3\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m model_lists:\n",
      "Cell \u001b[0;32mIn[37], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_res_dict\u001b[39m(file_dict,model_lists\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m      2\u001b[0m     res_dict \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m      3\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m model_lists:\n",
      "File \u001b[0;32m_pydevd_bundle/pydevd_cython.pyx:1457\u001b[0m, in \u001b[0;36m_pydevd_bundle.pydevd_cython.SafeCallWrapper.__call__\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32m_pydevd_bundle/pydevd_cython.pyx:701\u001b[0m, in \u001b[0;36m_pydevd_bundle.pydevd_cython.PyDBFrame.trace_dispatch\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32m_pydevd_bundle/pydevd_cython.pyx:1152\u001b[0m, in \u001b[0;36m_pydevd_bundle.pydevd_cython.PyDBFrame.trace_dispatch\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32m_pydevd_bundle/pydevd_cython.pyx:1135\u001b[0m, in \u001b[0;36m_pydevd_bundle.pydevd_cython.PyDBFrame.trace_dispatch\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32m_pydevd_bundle/pydevd_cython.pyx:312\u001b[0m, in \u001b[0;36m_pydevd_bundle.pydevd_cython.PyDBFrame.do_wait_suspend\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32m~/anaconda3/envs/smoe/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd.py:2070\u001b[0m, in \u001b[0;36mPyDB.do_wait_suspend\u001b[0;34m(self, thread, frame, event, arg, exception_type)\u001b[0m\n\u001b[1;32m   2067\u001b[0m             from_this_thread\u001b[38;5;241m.\u001b[39mappend(frame_custom_thread_id)\n\u001b[1;32m   2069\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_threads_suspended_single_notification\u001b[38;5;241m.\u001b[39mnotify_thread_suspended(thread_id, thread, stop_reason):\n\u001b[0;32m-> 2070\u001b[0m         keep_suspended \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_do_wait_suspend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mthread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msuspend_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrom_this_thread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframes_tracker\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2072\u001b[0m frames_list \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   2074\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m keep_suspended:\n\u001b[1;32m   2075\u001b[0m     \u001b[38;5;66;03m# This means that we should pause again after a set next statement.\u001b[39;00m\n",
      "File \u001b[0;32m~/anaconda3/envs/smoe/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd.py:2106\u001b[0m, in \u001b[0;36mPyDB._do_wait_suspend\u001b[0;34m(self, thread, frame, event, arg, suspend_type, from_this_thread, frames_tracker)\u001b[0m\n\u001b[1;32m   2103\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_input_hook()\n\u001b[1;32m   2105\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprocess_internal_commands()\n\u001b[0;32m-> 2106\u001b[0m     time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m0.01\u001b[39m)\n\u001b[1;32m   2108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcancel_async_evaluation(get_current_thread_id(thread), \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28mid\u001b[39m(frame)))\n\u001b[1;32m   2110\u001b[0m \u001b[38;5;66;03m# process any stepping instructions\u001b[39;00m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "def get_res_dict(file_dict,model_lists=None):\n",
    "    res_dict = {}\n",
    "    if not model_lists:\n",
    "        for model_name, file_path in file_dict.items():\n",
    "            res_dict[model_name] = process_jsonl(file_path)[-1]\n",
    "    else:\n",
    "        for model_name in model_lists:\n",
    "            file_path = file_dict[model_name]\n",
    "            res_dict[model_name] = process_jsonl(file_path)[-1]\n",
    "    return res_dict\n",
    "\n",
    "\n",
    "def get_prmscore_from_current_res_dict(res_dict,classification=None):\n",
    "    '''\n",
    "    Get PRM score from model level dict\n",
    "    '''\n",
    "    if not classification:\n",
    "        prm_score = res_dict[\"total_hallucination_results\"]['f1'] * 0.5 + res_dict[\"total_hallucination_results\"]['negative_f1'] * 0.5\n",
    "    else:\n",
    "        if classification in [\"multi_solutions\"]:\n",
    "            prm_score = res_dict[\"hallucination_type_results\"]['f1'][classification]\n",
    "        else:\n",
    "            prm_score = res_dict[\"hallucination_type_results\"]['f1'][classification] * 0.5 + res_dict[\"hallucination_type_results\"]['negative_f1'][classification] * 0.5\n",
    "    return prm_score\n",
    "\n",
    "\n",
    "def get_avg_prmscore_from_current_res_dict(res_dict,classifications):\n",
    "    '''\n",
    "    Get AVG PRM score from model level dict\n",
    "    '''\n",
    "    assert classifications\n",
    "    res = [get_prmscore_from_current_res_dict(res_dict,classification) for classification in classifications]\n",
    "    return sum(res) / len(res)\n",
    "    \n",
    "\n",
    "def get_res_str(model_dict,classification_dict,res_dict):\n",
    "    res_str = \"\"\n",
    "    # current_classification_dict = classification_dict[classification_name]\n",
    "    for model_name, model_display_name in model_dict.items():\n",
    "        temp_str = f\"{model_display_name}\"\n",
    "        current_res_dict = res_dict[model_name]\n",
    "        prm_score = get_prmscore_from_current_res_dict(current_res_dict)\n",
    "        all_model_scores = sorted([get_prmscore_from_current_res_dict(res) for res in res_dict.values()],reverse=True)\n",
    "        if prm_score == max(all_model_scores):\n",
    "            temp_str += f\" & \\\\textbf{{{prm_score * 100:.1f}}}\"\n",
    "        elif prm_score == all_model_scores[1]:\n",
    "            temp_str += f\" & \\\\underline{{{prm_score * 100:.1f}}}\"\n",
    "        else:\n",
    "            temp_str += f\" & {prm_score * 100:.1f}\"\n",
    "        \n",
    "        for big_classification, current_classification_dict in classification_dict.items():\n",
    "            all_avt = sorted([get_avg_prmscore_from_current_res_dict(res,list(current_classification_dict.keys())) for res in res_dict.values()], reverse=True)\n",
    "            avg = []\n",
    "            for classification, display_classification_name in current_classification_dict.items():\n",
    "                prm_score = get_prmscore_from_current_res_dict(current_res_dict,classification)\n",
    "                all_prm_scores = sorted([get_prmscore_from_current_res_dict(res,classification) for res in res_dict.values()], reverse=True)\n",
    "                avg.append(prm_score)\n",
    "                if prm_score == max(all_prm_scores):\n",
    "                    temp_str += f\" & \\\\textbf{{{prm_score * 100:.1f}}}\"\n",
    "                elif prm_score == all_prm_scores[1]:\n",
    "                    temp_str += f\" & \\\\underline{{{prm_score * 100:.1f}}}\"\n",
    "                else:\n",
    "                    temp_str += f\" & {prm_score * 100:.1f}\"\n",
    "            avg_score = sum(avg) / len(avg)\n",
    "            if avg_score == max(all_avt):\n",
    "                temp_str += f\" & \\\\textbf{{{avg_score * 100:.1f}}}\"\n",
    "            elif avg_score == all_avt[1]:\n",
    "                temp_str += f\" & \\\\underline{{{avg_score * 100:.1f}}}\"\n",
    "            else:\n",
    "                temp_str += f\" & {avg_score * 100:.1f}\"\n",
    "        temp_str += \"\\\\\\\\\\n\"\n",
    "        res_str += temp_str\n",
    "    return res_str\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\hline \\multicolumn{14}{c}{\\textit{\\textbf{Open-source Process Level Reward Models}}} \\\\   \\hline \n",
      "\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B}{Skywork-PRM-1.5B} & 31.7 & 31.4 & 35.8 & 33.6 & 32.4 & 25.7 & 26.0 & 30.2 & 28.6 & 33.1 & 32.3 & 81.1 & 48.8\\\\\n",
      "\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B}{Skywork-PRM-7B} & 36.2 & 35.7 & 41.2 & 38.4 & 36.7 & 29.1 & 30.6 & 34.4 & 32.7 & 36.8 & 37.4 & 88.8 & 54.3\\\\\n",
      "\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf}{Llemma-PRM800k-7B} & 52.0 & 49.3 & \\underline{53.4} & 51.4 & 56.4 & 47.1 & 46.7 & 53.3 & 50.9 & 51.0 & 53.5 & 93.6 & 66.0\\\\\n",
      "\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf}{Llemma-MetaMath-7B} & 50.5 & 50.2 & 50.5 & 50.3 & 51.9 & 47.6 & 44.4 & 52.1 & 49.0 & 50.5 & 51.3 & 96.0 & 66.0\\\\\n",
      "\\href{https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf}{Llemma-oprm-7B} & 50.3 & 48.7 & 49.3 & 49.0 & 54.2 & 46.8 & 44.5 & 53.5 & 49.8 & 49.2 & 51.3 & 91.8 & 64.1\\\\\n",
      "\\href{https://github.com/KbsdJames/MATH-Minos}{MATHMinos-Mistral-7B} & 54.2 & 48.8 & \\textbf{54.0} & 51.4 & 57.0 & 52.1 & 50.7 & 57.8 & 54.4 & 52.8 & 55.8 & 91.1 & 66.5\\\\\n",
      "\\href{https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm}{MathShepherd-Mistral-7B} & 47.0 & 44.0 & 50.3 & 47.1 & 49.4 & 44.5 & 41.3 & 47.7 & 45.7 & 47.2 & 48.6 & 86.1 & 60.7\\\\\n",
      "\\href{https://huggingface.co/GAIR/ReasonEval-7B}{ReasonEval-7B} & \\underline{60.0} & \\textbf{61.0} & 50.1 & \\textbf{55.5} & \\underline{62.1} & \\textbf{65.9} & \\textbf{61.5} & 66.0 & \\textbf{63.9} & \\underline{55.6} & \\underline{58.0} & \\underline{99.5} & \\underline{71.0}\\\\\n",
      "\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data}{RLHFlow-PRM-Mistral-8B} & 54.4 & 46.1 & 47.3 & 46.7 & 56.6 & 55.1 & 54.4 & 63.8 & 57.5 & 51.5 & 56.2 & 97.9 & 68.5\\\\\n",
      "\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data}{RLHFlow-PRM-Deepseek-8B} & 54.2 & 46.4 & 48.9 & 47.6 & 55.7 & 55.0 & 53.2 & \\underline{66.2} & 57.5 & 49.0 & 55.4 & \\textbf{99.8} & 68.1\\\\\n",
      "\\href{https://huggingface.co/GAIR/ReasonEval-34B}{ReasonEval-34B} & \\textbf{60.5} & \\underline{54.8} & 48.1 & \\underline{51.5} & \\textbf{66.4} & \\underline{60.3} & \\underline{57.8} & \\textbf{67.5} & \\underline{63.0} & \\textbf{57.7} & \\textbf{64.3} & 97.2 & \\textbf{73.1}\\\\\n",
      "\\hline \\multicolumn{14}{c}{\\textit{\\textbf{Proprietary LLMs, Prompted as Critic Models}}} \\\\   \\hline \n",
      "\\href{https://openai.com/index/hello-gpt-4o/}{GPT-4o} & 66.8 & 57.0 & 62.4 & 59.7 & 72.0 & \\underline{69.7} & 70.7 & 71.1 & 70.9 & \\textbf{62.5} & 65.7 & 99.2 & \\textbf{75.8}\\\\\n",
      "\\href{https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/}{o1-mini}$^\\dagger$ & \\underline{68.8} & 65.6 & 63.7 & 64.6 & \\textbf{74.5} & 67.7 & \\textbf{73.8} & \\textbf{72.3} & \\textbf{72.1} & \\underline{61.8} & 64.8 & \\textbf{100.0} & \\underline{75.5}\\\\\n",
      "\\href{https://openai.com/index/introducing-openai-o1-preview/}{o1-preview}$^\\dagger$ & 65.7 & 64.5 & \\textbf{65.3} & \\underline{64.9} & 68.7 & 62.8 & 67.6 & 67.2 & 66.5 & 57.9 & \\underline{66.0} & \\textbf{100.0} & 74.6\\\\\n",
      "\\href{https://deepmind.google/technologies/gemini/flash/}{Gemini-2.0-flash-exp} & 66.0 & \\underline{67.2} & 58.1 & 62.7 & 70.4 & 65.7 & 66.0 & 67.3 & 67.3 & 61.8 & \\textbf{66.2} & 98.2 & 75.4\\\\\n",
      "\\href{https://ai.google.dev/gemini-api/docs/thinking-mode}{Gemini-2.0-thinking-exp-1219} & \\textbf{68.8} & \\textbf{68.5} & \\underline{63.8} & \\textbf{66.2} & \\underline{72.9} & \\textbf{71.3} & \\underline{71.0} & \\underline{71.8} & \\underline{71.8} & 60.3 & 65.7 & 99.8 & 75.3\\\\\n",
      "\n"
     ]
    }
   ],
   "source": [
    "res_str = \"\"\n",
    "\n",
    "## PRMs\n",
    "model_type_panel=\"\\hline \\multicolumn{14}{c}{\\\\textit{\\\\textbf{Open-source Process Level Reward Models}}} \\\\\\\\   \\hline \\n\"\n",
    "res_dict = get_res_dict(file_dict,model_lists=list(prm_model_name_dict.keys()))\n",
    "prm_str = get_res_str(prm_model_name_dict, classification_parallel_dict, res_dict,)\n",
    "res_str += model_type_panel + prm_str\n",
    "\n",
    "## Close Models\n",
    "model_type_panel= \"\\hline \\multicolumn{14}{c}{\\\\textit{\\\\textbf{Proprietary LLMs, Prompted as Critic Models}}} \\\\\\\\   \\hline \\n\"\n",
    "res_dict = get_res_dict(file_dict,model_lists=list(close_model_name_dict.keys()))\n",
    "close_str = get_res_str(close_model_name_dict, classification_parallel_dict, res_dict,)\n",
    "res_str += model_type_panel + close_str\n",
    "\n",
    "\n",
    "print(res_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "smoe",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
