{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mr_eval.utils.utils import *\n",
    "import os\n",
    "\n",
    "def list_jsonl_files(folder_path):\n",
    "    \"\"\"\n",
    "    List the ``.jsonl`` files located directly inside a folder.\n",
    "\n",
    "    Args:\n",
    "        folder_path (str): Directory to scan; sub-directories are not searched.\n",
    "\n",
    "    Returns:\n",
    "        List[str]: Bare file names (not full paths) ending in ``.jsonl``,\n",
    "        sorted so the result does not depend on the order ``os.listdir``\n",
    "        happens to return. Join each name with ``folder_path`` to open it.\n",
    "    \"\"\"\n",
    "    return sorted(f for f in os.listdir(folder_path) if f.endswith(\".jsonl\"))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Model names\n",
    "# Open-source process reward models: result-file stem -> LaTeX hyperlink cell.\n",
    "prm_model_name_dict = {\n",
    "    \"skyworkprm_1_5B\": \"\\\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B}{Skywork-PRM-1.5B}\",\n",
    "    \"skyworkprm_7B\": \"\\\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B}{Skywork-PRM-7B}\",\n",
    "    \"llemma7b_prm_prm800k\": \"\\\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf}{Llemma-PRM800k-7B}\",\n",
    "    \"llemma7b_prm_metamath\": \"\\\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf}{Llemma-MetaMath-7B}\",\n",
    "    \"llemma7b_oprm_prm800k\": \"\\\\href{https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf}{Llemma-oprm-7B}\",\n",
    "    \"mathminos_mistral\": \"\\\\href{https://github.com/KbsdJames/MATH-Minos}{MATHMinos-Mistral-7B}\",\n",
    "    \"mathshepherd\": \"\\\\href{https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm}{MathShepherd-Mistral-7B}\",\n",
    "    \"reasoneval7b\": \"\\\\href{https://huggingface.co/GAIR/ReasonEval-7B}{ReasonEval-7B}\",\n",
    "    \"reasoneval34b\": \"\\\\href{https://huggingface.co/GAIR/ReasonEval-34B}{ReasonEval-34B}\",\n",
    "    \"llama3_1_8b_prm_mistral\": \"\\\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data}{RLHFlow-PRM-Mistral-8B}\",\n",
    "    \"llama3_1_8b_prm_deepseek\": \"\\\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data}{RLHFlow-PRM-Deepseek-8B}\",\n",
    "    \"qwen_prm7b\": \"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B}\",\n",
    "    \"qwen_prm72b\": \"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B}{Qwen2.5-Math-PRM-72B}\",\n",
    "    \"pure_prm_7b\": \"\\\\href{https://huggingface.co/jinachris/Qwen2.5-Math-7B-PRM800K}{Pure-PRM-7B}\",\n",
    "}\n",
    "\n",
    "# Proprietary LLMs prompted as critics: result-file stem -> LaTeX hyperlink cell.\n",
    "close_model_name_dict = dict(\n",
    "    gpt4o=\"\\\\href{https://openai.com/index/hello-gpt-4o/}{GPT-4o}\",\n",
    "    # The dagger macro is written with a doubled backslash: a single backslash followed\n",
    "    # by 'd' is an invalid escape sequence (SyntaxWarning on Python 3.12+). The\n",
    "    # resulting string is the same as before.\n",
    "    o1mini=\"\\\\href{https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/}{o1-mini}$^\\\\dagger$\",\n",
    "    gemini_2_flash=\"\\\\href{https://deepmind.google/technologies/gemini/flash/}{Gemini-2.0-flash-exp}\",\n",
    "    gemini_2_thinking=\"\\\\href{https://ai.google.dev/gemini-api/docs/thinking-mode}{Gemini-2.0-thinking-exp-1219}\",\n",
    "    # ds_v3=\"\\\\href{https://github.com/deepseek-ai/DeepSeek-V3}{DeepSeek-V3}\",\n",
    ")\n",
    "    \n",
    "# Open-weight LLMs prompted as critics: result-file stem -> LaTeX hyperlink cell.\n",
    "open_model_name_dict = dict(\n",
    "    # o1preview=\"\\\\href{https://openai.com/index/introducing-openai-o1-preview/}{o1-preview}$^\\\\dagger$\",\n",
    "    metamath_7b=\"\\\\href{https://huggingface.co/meta-math/MetaMath-7B-V1.0}{MetaMath-7B}\",\n",
    "    metamath_13b=\"\\\\href{https://huggingface.co/meta-math/MetaMath-13B-V1.0}{MetaMath-13B}\",\n",
    "    # metamath_70b=\"\\\\href{https://huggingface.co/meta-math/MetaMath-70B-V1.0}{MetaMath-70B}\",\n",
    "    # qwen25_7b=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct}{Qwen2.5-Math-7B}\",\n",
    "    # Link fixed: this entry used to point at the Qwen2.5-Math-PRM-72B reward model card\n",
    "    # (already linked from the PRM table) although the row is the Qwen2.5-Math-72B LLM.\n",
    "    qwen25_72b=\"\\\\href{https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct}{Qwen2.5-Math-72B}\",\n",
    "    qwen_qwq=\"\\\\href{https://huggingface.co/Qwen/QwQ-32B-Preview}{QwQ-Preview-32B}\",\n",
    "    # r1_distill_llama_8b=\"\\\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B}{R1-Distill-Llama3.1-8B}\",\n",
    "    r1_distill_llama_70b=\"\\\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B}{R1-Distill-Llama3.1-70B}\",\n",
    "    r1_distill_qwen_7b=\"\\\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B}{R1-Distill-Qwen-7B}\",\n",
    "    # The dagger macro uses a doubled backslash: a single backslash followed by 'd' is an\n",
    "    # invalid escape sequence (SyntaxWarning on Python 3.12+); the string is unchanged.\n",
    "    ds_r1=\"\\\\href{https://github.com/deepseek-ai/DeepSeek-R1}{DeepSeek-R1}$^\\\\dagger$\",\n",
    "    # r1_distill_qwen_32b=\"\\\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B}{R1-Distill-Qwen-32B}\",\n",
    "    # wizardmath_7b=\"\\\\href{https://huggingface.co/WizardLMTeam/WizardMath-7B-V1.0}{WizardMath-7B}\",\n",
    ")\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# Hallucination classification key -> short display label.\n",
    "classification_name_dict = {\n",
    "    \"domain_inconsistency\": \"DC.\",\n",
    "    \"redundency\": \"NR.\",\n",
    "    \"multi_solutions\": \"MS.\",\n",
    "    \"deception\": \"DR.\",\n",
    "    \"confidence\": \"CI.\",\n",
    "    \"step_contradiction\": \"SC.\",\n",
    "    \"circular\": \"NCL.\",\n",
    "    \"missing_condition\": \"PS.\",\n",
    "    \"counterfactual\": \"ES.\",\n",
    "}\n",
    "# The same labels arranged into three named groups; the group order and the order\n",
    "# inside each group drive the column order of the main table.\n",
    "classification_parallel_dict = {\n",
    "    \"simplicity\": {\n",
    "        \"redundency\": \"NR.\",\n",
    "        \"circular\": \"NCL.\",\n",
    "    },\n",
    "    \"soundness\": {\n",
    "        \"counterfactual\": \"ES.\",\n",
    "        \"step_contradiction\": \"SC.\",\n",
    "        \"domain_inconsistency\": \"DC.\",\n",
    "        \"confidence\": \"CI.\",\n",
    "    },\n",
    "    \"sensitivity\": {\n",
    "        \"missing_condition\": \"PS.\",\n",
    "        \"deception\": \"DR.\",\n",
    "        \"multi_solutions\": \"MS.\",\n",
    "    },\n",
    "}\n",
    "# Classification keys flattened in the group order of classification_parallel_dict.\n",
    "classifications = [\n",
    "    name\n",
    "    for group in classification_parallel_dict.values()\n",
    "    for name in group\n",
    "]\n",
    "# Metric keys available in the result files.\n",
    "metrics = [\n",
    "    \"f1\",\n",
    "    \"negative_f1\",\n",
    "    \"total_step_acc\",\n",
    "    \"correct_step_acc\",\n",
    "    \"wrong_step_acc\",\n",
    "    \"first_error_acc\",\n",
    "    \"similarity\",\n",
    "]\n",
    "\n",
    "## File paths\n",
    "# Result directories. The default is the original cluster location; set the\n",
    "# PRMTEST_RES_DIR environment variable to read another copy of the logs without\n",
    "# editing this cell. vLLM-produced results live in the \"vllm\" sub-directory.\n",
    "res_dir = os.environ.get(\n",
    "    \"PRMTEST_RES_DIR\",\n",
    "    \"/mnt/petrelfs/songmingyang/code/reasoning/MR_Hallucination/mr_eval/scripts/logs/prmtest_classified\",\n",
    ")\n",
    "vllm_res_dir = os.path.join(res_dir, \"vllm\")\n",
    "\n",
    "res_files = list_jsonl_files(res_dir)\n",
    "vllm_res_files = list_jsonl_files(vllm_res_dir)\n",
    "# Model key = file name without its final extension. splitext is used instead of\n",
    "# split(\".\")[0] so that a key containing a dot is not cut at the first dot.\n",
    "res_names = [os.path.splitext(f)[0] for f in res_files + vllm_res_files]\n",
    "vllm_res_paths = [os.path.join(vllm_res_dir, f) for f in vllm_res_files]\n",
    "res_paths = [os.path.join(res_dir, f) for f in res_files] + vllm_res_paths\n",
    "# Model key -> result file path. If both directories contain the same file name,\n",
    "# the vllm entry wins because it comes later in the zip.\n",
    "file_dict = dict(zip(res_names, res_paths))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_res_dict(file_dict,model_lists=None):\n",
    "    \"\"\"\n",
    "    Load the last record of each selected model's result file.\n",
    "\n",
    "    Args:\n",
    "        file_dict: Mapping of model key -> path of that model's result file.\n",
    "        model_lists: Model keys to load. When it is None or empty, every key of\n",
    "            ``file_dict`` is loaded.\n",
    "\n",
    "    Returns:\n",
    "        dict: model key -> last element of what ``process_jsonl`` returns for the file.\n",
    "        NOTE(review): presumably that last record is the aggregated summary - confirm.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if a requested model key is not present in ``file_dict``.\n",
    "    \"\"\"\n",
    "    selected_models = model_lists if model_lists else file_dict\n",
    "    return {\n",
    "        model_name: process_jsonl(file_dict[model_name])[-1]\n",
    "        for model_name in selected_models\n",
    "    }\n",
    "\n",
    "\n",
    "def get_prmscore_from_current_res_dict(res_dict,classification=None):\n",
    "    '''\n",
    "    Compute the PRM score from one model's result dict.\n",
    "\n",
    "    Without a classification the score is the equal-weight mix of the overall\n",
    "    \"f1\" and \"negative_f1\". With a classification the same mix is taken from the\n",
    "    per-type results, except for \"multi_solutions\", whose score is its \"f1\" alone.\n",
    "    '''\n",
    "    if not classification:\n",
    "        overall = res_dict[\"total_hallucination_results\"]\n",
    "        return overall['f1'] * 0.5 + overall['negative_f1'] * 0.5\n",
    "\n",
    "    per_type = res_dict[\"hallucination_type_results\"]\n",
    "    if classification in (\"multi_solutions\",):\n",
    "        # negative_f1 is not read for this classification.\n",
    "        return per_type['f1'][classification]\n",
    "    return per_type['f1'][classification] * 0.5 + per_type['negative_f1'][classification] * 0.5\n",
    "\n",
    "\n",
    "def get_avg_prmscore_from_current_res_dict(res_dict,classifications):\n",
    "    '''\n",
    "    Average one model's PRM score over several classifications.\n",
    "\n",
    "    ``classifications`` must be non-empty (enforced with an assert); the result is\n",
    "    the arithmetic mean of get_prmscore_from_current_res_dict over its items.\n",
    "    '''\n",
    "    assert classifications\n",
    "    scores = []\n",
    "    for name in classifications:\n",
    "        scores.append(get_prmscore_from_current_res_dict(res_dict, name))\n",
    "    return sum(scores) / len(scores)\n",
    "\n",
    "def get_metric_from_current_res_dict(res_dict,metric,classification=None):\n",
    "    '''\n",
    "    Read one metric from a model's result dict.\n",
    "\n",
    "    The value comes from the overall results when no classification is given and\n",
    "    from the per-type results otherwise. \"similarity\" is returned as 1 - stored value;\n",
    "    every other metric is returned as stored.\n",
    "\n",
    "    NOTE(review): the saved appendix output shows 200.0 in the similarity column for\n",
    "    two models, which means their stored similarity is -1 - presumably a marker for\n",
    "    \"not computed\"; confirm and decide how such rows should be displayed.\n",
    "    '''\n",
    "    if classification:\n",
    "        stored = res_dict[\"hallucination_type_results\"][metric][classification]\n",
    "    else:\n",
    "        stored = res_dict[\"total_hallucination_results\"][metric]\n",
    "    return 1 - stored if metric == \"similarity\" else stored\n",
    "    \n",
    "# def get_avg_metric_from_current_res_dict(res_dict,metric,classifications):\n",
    "#     '''\n",
    "#     Get AVG metric from model level dict\n",
    "#     '''\n",
    "#     assert classifications\n",
    "#     res = [get_metric_from_current_res_dict(res_dict,metric,classification) for classification in classifications]\n",
    "#     return sum(res) / len(res)\n",
    "    \n",
    "\n",
    "def get_res_str(model_dict,classification_dict,res_dict):\n",
    "    \"\"\"\n",
    "    Build the LaTeX body rows of the main results table for one group of models.\n",
    "\n",
    "    Every row holds the model's display name, its overall PRM score and then, for\n",
    "    each group in ``classification_dict``, the PRM score of every classification\n",
    "    followed by the group mean. A shaded ``Avg.`` row with the per-column mean over\n",
    "    all entries of ``res_dict`` closes the block. In each column the highest score\n",
    "    is set in bold and a score equal to the second entry of the descending ranking\n",
    "    is underlined (so a tie for first place bolds both and underlines nothing).\n",
    "\n",
    "    Args:\n",
    "        model_dict: Mapping of model key -> LaTeX display name; one row per entry, in order.\n",
    "        classification_dict: Mapping of group name -> mapping whose keys are the\n",
    "            classification keys of that group (the values are not read here).\n",
    "        res_dict: Mapping of model key -> that model's result dict.\n",
    "\n",
    "    Returns:\n",
    "        str: All rows, each terminated by a LaTeX row break and a newline.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if a key of ``model_dict`` is missing from ``res_dict``.\n",
    "    \"\"\"\n",
    "    row_end = \"\\\\\\\\\\n\"\n",
    "    avg_row = \"\\\\cellcolor{gray!10} \\\\textbf{Avg.} \"\n",
    "    if not model_dict:\n",
    "        # Without model rows no column statistics are produced: only the bare Avg. row.\n",
    "        return avg_row + row_end\n",
    "\n",
    "    def format_cell(score, ranked):\n",
    "        # Exact float comparison is intended: the score was produced by the same\n",
    "        # computation as the ranked values it is compared with.\n",
    "        if score == max(ranked):\n",
    "            return f\" & \\\\textbf{{{score * 100:.1f}}}\"\n",
    "        if len(ranked) > 1 and score == ranked[1]:\n",
    "            return f\" & \\\\underline{{{score * 100:.1f}}}\"\n",
    "        return f\" & {score * 100:.1f}\"\n",
    "\n",
    "    # One scoring function per table column, in display order: overall score, then\n",
    "    # per group every classification followed by the group mean.\n",
    "    scorers = [get_prmscore_from_current_res_dict]\n",
    "    for group in classification_dict.values():\n",
    "        names = list(group.keys())\n",
    "        for name in names:\n",
    "            scorers.append(lambda res, name=name: get_prmscore_from_current_res_dict(res, name))\n",
    "        scorers.append(lambda res, names=names: get_avg_prmscore_from_current_res_dict(res, names))\n",
    "\n",
    "    # The rankings depend only on res_dict, so they are computed once here rather\n",
    "    # than once per model row.\n",
    "    all_results = list(res_dict.values())\n",
    "    rankings = [sorted((score(res) for res in all_results), reverse=True) for score in scorers]\n",
    "\n",
    "    rows = []\n",
    "    for model_name, model_display_name in model_dict.items():\n",
    "        current_res_dict = res_dict[model_name]\n",
    "        cells = [format_cell(score(current_res_dict), ranked) for score, ranked in zip(scorers, rankings)]\n",
    "        rows.append(f\"{model_display_name}\" + \"\".join(cells) + row_end)\n",
    "\n",
    "    for ranked in rankings:\n",
    "        column_mean = sum(ranked) / len(ranked)\n",
    "        avg_row += f\"& \\\\cellcolor{{gray!10}} {column_mean * 100:.1f} \"\n",
    "    return \"\".join(rows) + avg_row + row_end\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\hline \\multicolumn{14}{c}{\\textit{\\textbf{Open-source Process Level Reward Models}}} \\\\   \\hline \n",
      "\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B}{Skywork-PRM-1.5B} & 61.1 & 52.0 & 56.4 & 54.2 & 64.8 & 64.9 & 63.3 & 66.5 & 64.9 & 57.5 & 63.3 & 91.1 & 70.7\\\\\n",
      "\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B}{Skywork-PRM-7B} & 65.1 & \\underline{56.4} & \\textbf{62.8} & \\textbf{59.6} & 69.4 & 67.1 & \\underline{67.7} & 69.9 & 68.5 & \\textbf{60.9} & 65.8 & 93.2 & 73.3\\\\\n",
      "\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf}{Llemma-PRM800k-7B} & 52.0 & 49.3 & 53.4 & 51.4 & 56.4 & 47.1 & 46.7 & 53.3 & 50.9 & 51.0 & 53.5 & 93.6 & 66.0\\\\\n",
      "\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf}{Llemma-MetaMath-7B} & 50.5 & 50.2 & 50.5 & 50.3 & 51.9 & 47.6 & 44.4 & 52.1 & 49.0 & 50.5 & 51.3 & 96.0 & 66.0\\\\\n",
      "\\href{https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf}{Llemma-oprm-7B} & 50.3 & 48.7 & 49.3 & 49.0 & 54.2 & 46.8 & 44.5 & 53.5 & 49.8 & 49.2 & 51.3 & 91.8 & 64.1\\\\\n",
      "\\href{https://github.com/KbsdJames/MATH-Minos}{MATHMinos-Mistral-7B} & 54.2 & 48.8 & 54.0 & 51.4 & 57.0 & 52.1 & 50.7 & 57.8 & 54.4 & 52.8 & 55.8 & 91.1 & 66.5\\\\\n",
      "\\href{https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm}{MathShepherd-Mistral-7B} & 47.0 & 44.0 & 50.3 & 47.1 & 49.4 & 44.5 & 41.3 & 47.7 & 45.7 & 47.2 & 48.6 & 86.1 & 60.7\\\\\n",
      "\\href{https://huggingface.co/GAIR/ReasonEval-7B}{ReasonEval-7B} & 60.1 & \\textbf{61.0} & 50.1 & \\underline{55.6} & 62.1 & 65.9 & 61.5 & 66.0 & 63.9 & 55.7 & 58.0 & 99.5 & 71.1\\\\\n",
      "\\href{https://huggingface.co/GAIR/ReasonEval-34B}{ReasonEval-34B} & 60.5 & 54.8 & 48.1 & 51.5 & 66.4 & 60.3 & 57.8 & 67.5 & 63.0 & 57.7 & 64.3 & 97.2 & 73.1\\\\\n",
      "\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data}{RLHFlow-PRM-Mistral-8B} & 54.4 & 46.1 & 47.3 & 46.7 & 56.6 & 55.1 & 54.4 & 63.8 & 57.5 & 51.5 & 56.2 & 97.9 & 68.5\\\\\n",
      "\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data}{RLHFlow-PRM-Deepseek-8B} & 54.2 & 46.4 & 48.9 & 47.6 & 55.7 & 55.0 & 53.2 & 66.2 & 57.5 & 49.0 & 55.4 & \\textbf{99.8} & 68.1\\\\\n",
      "\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B} & \\underline{65.5} & 49.0 & 55.1 & 52.1 & \\underline{71.8} & 67.3 & 66.3 & \\underline{78.5} & \\underline{71.0} & 57.6 & 69.1 & \\underline{99.7} & 75.5\\\\\n",
      "\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B}{Qwen2.5-Math-PRM-72B} & \\textbf{68.2} & 50.4 & \\underline{58.8} & 54.6 & \\textbf{73.7} & \\textbf{71.1} & \\textbf{72.2} & \\textbf{78.6} & \\textbf{73.9} & \\underline{60.3} & \\textbf{71.2} & 99.4 & \\textbf{77.0}\\\\\n",
      "\\href{https://huggingface.co/jinachris/Qwen2.5-Math-7B-PRM800K}{Pure-PRM-7B} & 65.3 & 49.2 & 55.2 & 52.2 & 71.1 & \\underline{68.8} & 64.0 & 76.9 & 70.2 & 60.3 & \\underline{69.2} & 98.0 & \\underline{75.8}\\\\\n",
      "\\cellcolor{gray!10} \\textbf{Avg.} & \\cellcolor{gray!10} 57.7 & \\cellcolor{gray!10} 50.5 & \\cellcolor{gray!10} 52.9 & \\cellcolor{gray!10} 51.7 & \\cellcolor{gray!10} 61.5 & \\cellcolor{gray!10} 58.1 & \\cellcolor{gray!10} 56.3 & \\cellcolor{gray!10} 64.2 & \\cellcolor{gray!10} 60.0 & \\cellcolor{gray!10} 54.4 & \\cellcolor{gray!10} 59.5 & \\cellcolor{gray!10} 95.3 & \\cellcolor{gray!10} 69.7 \\\\\n",
      "\\hline \\multicolumn{14}{c}{\\textit{\\textbf{Open LLMs, Prompted as Critic Models}}} \\\\   \\hline \n",
      "\\href{https://huggingface.co/meta-math/MetaMath-7B-V1.0}{MetaMath-7B} & 49.7 & 48.9 & 46.9 & 47.9 & 47.3 & 48.9 & 48.4 & 48.8 & 48.3 & 46.5 & 48.3 & 98.0 & 64.2\\\\\n",
      "\\href{https://huggingface.co/meta-math/MetaMath-13B-V1.0}{MetaMath-13B} & 49.4 & 50.3 & 44.4 & 47.3 & 47.8 & 47.4 & 49.4 & 48.1 & 48.2 & 49.0 & 48.1 & 99.5 & 65.5\\\\\n",
      "\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B}{Qwen2.5-Math-72B} & 57.4 & 55.3 & 54.9 & 55.1 & 55.5 & \\underline{71.6} & 58.1 & 59.1 & 61.1 & 47.4 & 53.8 & \\textbf{100.0} & 67.1\\\\\n",
      "\\href{https://huggingface.co/Qwen/QwQ-32B-Preview}{QwQ-Preview-32B} & \\underline{63.6} & \\underline{57.2} & \\underline{55.6} & \\underline{56.4} & \\underline{67.4} & \\textbf{72.3} & \\underline{66.2} & \\underline{66.9} & \\underline{68.2} & \\underline{57.8} & \\underline{62.7} & \\textbf{100.0} & \\underline{73.5}\\\\\n",
      "\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B}{R1-Distill-Llama3.1-70B} & 57.5 & 49.5 & 48.1 & 48.8 & 61.4 & 65.5 & 65.8 & 61.1 & 63.4 & 48.8 & 54.1 & 100.0 & 67.6\\\\\n",
      "\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B}{R1-Distill-Qwen-7B} & 52.6 & 32.9 & 37.9 & 35.4 & 47.3 & 54.1 & 48.4 & 48.0 & 49.4 & 45.6 & 46.8 & \\textbf{100.0} & 64.1\\\\\n",
      "\\href{https://github.com/deepseek-ai/DeepSeek-R1}{DeepSeek-R1}$^\\dagger$ & \\textbf{67.8} & \\textbf{63.0} & \\textbf{62.7} & \\textbf{62.9} & \\textbf{68.2} & 68.5 & \\textbf{73.5} & \\textbf{75.4} & \\textbf{71.4} & \\textbf{63.3} & \\textbf{68.0} & \\textbf{100.0} & \\textbf{77.1}\\\\\n",
      "\\cellcolor{gray!10} \\textbf{Avg.} & \\cellcolor{gray!10} 56.8 & \\cellcolor{gray!10} 51.0 & \\cellcolor{gray!10} 50.1 & \\cellcolor{gray!10} 50.5 & \\cellcolor{gray!10} 56.4 & \\cellcolor{gray!10} 61.2 & \\cellcolor{gray!10} 58.5 & \\cellcolor{gray!10} 58.2 & \\cellcolor{gray!10} 58.6 & \\cellcolor{gray!10} 51.2 & \\cellcolor{gray!10} 54.5 & \\cellcolor{gray!10} 99.6 & \\cellcolor{gray!10} 68.5 \\\\\n",
      "\\hline \\multicolumn{14}{c}{\\textit{\\textbf{Proprietary LLMs, Prompted as Critic Models}}} \\\\   \\hline \n",
      "\\href{https://openai.com/index/hello-gpt-4o/}{GPT-4o} & 66.8 & 57.0 & 62.4 & 59.7 & 72.0 & \\underline{69.7} & 70.7 & 71.1 & 70.9 & \\textbf{62.5} & 65.7 & 99.2 & \\textbf{75.8}\\\\\n",
      "\\href{https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/}{o1-mini}$^\\dagger$ & \\underline{68.8} & 65.6 & \\underline{63.7} & \\underline{64.6} & \\textbf{74.5} & 67.7 & \\textbf{73.8} & \\textbf{72.3} & \\textbf{72.1} & \\underline{61.8} & 64.8 & \\textbf{100.0} & \\underline{75.5}\\\\\n",
      "\\href{https://deepmind.google/technologies/gemini/flash/}{Gemini-2.0-flash-exp} & 66.0 & \\underline{67.2} & 58.1 & 62.7 & 70.4 & 65.7 & 66.0 & 67.3 & 67.3 & 61.8 & \\textbf{66.2} & 98.2 & 75.4\\\\\n",
      "\\href{https://ai.google.dev/gemini-api/docs/thinking-mode}{Gemini-2.0-thinking-exp-1219} & \\textbf{68.8} & \\textbf{68.5} & \\textbf{63.8} & \\textbf{66.2} & \\underline{72.9} & \\textbf{71.3} & \\underline{71.0} & \\underline{71.8} & \\underline{71.8} & 60.3 & \\underline{65.7} & \\underline{99.8} & 75.3\\\\\n",
      "\\cellcolor{gray!10} \\textbf{Avg.} & \\cellcolor{gray!10} 67.6 & \\cellcolor{gray!10} 64.6 & \\cellcolor{gray!10} 62.0 & \\cellcolor{gray!10} 63.3 & \\cellcolor{gray!10} 72.4 & \\cellcolor{gray!10} 68.6 & \\cellcolor{gray!10} 70.4 & \\cellcolor{gray!10} 70.7 & \\cellcolor{gray!10} 70.5 & \\cellcolor{gray!10} 61.6 & \\cellcolor{gray!10} 65.6 & \\cellcolor{gray!10} 99.3 & \\cellcolor{gray!10} 75.5 \\\\\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Panels of the main table, top to bottom: (panel title, model key -> display name).\n",
    "main_table_sections = [\n",
    "    (\"Open-source Process Level Reward Models\", prm_model_name_dict),\n",
    "    (\"Open LLMs, Prompted as Critic Models\", open_model_name_dict),\n",
    "    (\"Proprietary LLMs, Prompted as Critic Models\", close_model_name_dict),\n",
    "]\n",
    "# Column count: model name + overall score, then every classification plus one\n",
    "# mean column per group (14 with the current groups).\n",
    "n_columns = 2 + sum(len(group) + 1 for group in classification_parallel_dict.values())\n",
    "\n",
    "res_str = \"\"\n",
    "for section_title, section_models in main_table_sections:\n",
    "    # Every LaTeX backslash is escaped explicitly; the unescaped forms used before\n",
    "    # (backslash-h, backslash-m) are invalid escape sequences and warn on recent Python.\n",
    "    model_type_panel = (\n",
    "        \"\\\\hline \\\\multicolumn{\" + str(n_columns) + \"}{c}{\\\\textit{\\\\textbf{\"\n",
    "        + section_title\n",
    "        + \"}}} \\\\\\\\   \\\\hline \\n\"\n",
    "    )\n",
    "    res_dict = get_res_dict(file_dict, model_lists=list(section_models.keys()))\n",
    "    res_str += model_type_panel + get_res_str(section_models, classification_parallel_dict, res_dict)\n",
    "\n",
    "print(res_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Appendix: LaTeX table for a single classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Metric columns of the appendix table, in display order.\n",
    "display_metrics = [\"f1\", \"negative_f1\", \"total_step_acc\", \"correct_step_acc\", \"wrong_step_acc\", \"first_error_acc\", \"similarity\",]\n",
    "def get_appendix_res_str(model_dict,res_dict, classification):\n",
    "    \"\"\"\n",
    "    Build the LaTeX body rows of the appendix table for one classification.\n",
    "\n",
    "    Every row holds the model's display name, its PRM score for ``classification``\n",
    "    and one cell per entry of the module-level ``display_metrics``. A shaded ``Avg.``\n",
    "    row with the per-column mean over all entries of ``res_dict`` closes the block.\n",
    "    In each column the highest value is set in bold and a value equal to the second\n",
    "    entry of the descending ranking is underlined; the \"similarity\" column is\n",
    "    printed without any highlighting.\n",
    "\n",
    "    Args:\n",
    "        model_dict: Mapping of model key -> LaTeX display name; one row per entry, in order.\n",
    "        res_dict: Mapping of model key -> that model's result dict.\n",
    "        classification: Classification key whose per-type results are reported.\n",
    "\n",
    "    Returns:\n",
    "        str: All rows, each terminated by a LaTeX row break and a newline.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if a key of ``model_dict`` is missing from ``res_dict``.\n",
    "    \"\"\"\n",
    "    row_end = \"\\\\\\\\\\n\"\n",
    "    avg_row = \"\\\\cellcolor{gray!10} \\\\textbf{Avg.} \"\n",
    "    if not model_dict:\n",
    "        # Without model rows no column statistics are produced: only the bare Avg. row.\n",
    "        return avg_row + row_end\n",
    "\n",
    "    def plain_cell(score):\n",
    "        return f\" & {score * 100:.1f}\"\n",
    "\n",
    "    def ranked_cell(score, ranked):\n",
    "        # Exact float comparison is intended: the score was produced by the same\n",
    "        # computation as the ranked values it is compared with.\n",
    "        if score == max(ranked):\n",
    "            return f\" & \\\\textbf{{{score * 100:.1f}}}\"\n",
    "        if len(ranked) > 1 and score == ranked[1]:\n",
    "            return f\" & \\\\underline{{{score * 100:.1f}}}\"\n",
    "        return plain_cell(score)\n",
    "\n",
    "    # The rankings depend only on res_dict, so they are computed once here rather\n",
    "    # than once per model row.\n",
    "    all_results = list(res_dict.values())\n",
    "    prm_ranking = sorted(\n",
    "        (get_prmscore_from_current_res_dict(res, classification) for res in all_results),\n",
    "        reverse=True,\n",
    "    )\n",
    "    metric_rankings = [\n",
    "        sorted(\n",
    "            (get_metric_from_current_res_dict(res, metric, classification) for res in all_results),\n",
    "            reverse=True,\n",
    "        )\n",
    "        for metric in display_metrics\n",
    "    ]\n",
    "\n",
    "    rows = []\n",
    "    for model_name, model_display_name in model_dict.items():\n",
    "        current_res_dict = res_dict[model_name]\n",
    "        row = f\"{model_display_name}\"\n",
    "        row += ranked_cell(get_prmscore_from_current_res_dict(current_res_dict, classification), prm_ranking)\n",
    "        for metric, ranked in zip(display_metrics, metric_rankings):\n",
    "            metric_score = get_metric_from_current_res_dict(current_res_dict, metric, classification)\n",
    "            if metric == \"similarity\":\n",
    "                row += plain_cell(metric_score)\n",
    "            else:\n",
    "                row += ranked_cell(metric_score, ranked)\n",
    "        rows.append(row + row_end)\n",
    "\n",
    "    for ranked in [prm_ranking] + metric_rankings:\n",
    "        column_mean = sum(ranked) / len(ranked)\n",
    "        avg_row += f\"& \\\\cellcolor{{gray!10}} {column_mean * 100:.1f} \"\n",
    "    return \"\".join(rows) + avg_row + row_end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Group name -> {classification key: short display label}.\n",
    "# NOTE(review): this assigns the same value as the classification_parallel_dict\n",
    "# definition in the model-names cell; consider keeping a single definition.\n",
    "classification_parallel_dict = {\n",
    "    \"simplicity\": {\n",
    "        \"redundency\": \"NR.\",\n",
    "        \"circular\": \"NCL.\",\n",
    "    },\n",
    "    \"soundness\": {\n",
    "        \"counterfactual\": \"ES.\",\n",
    "        \"step_contradiction\": \"SC.\",\n",
    "        \"domain_inconsistency\": \"DC.\",\n",
    "        \"confidence\": \"CI.\",\n",
    "    },\n",
    "    \"sensitivity\": {\n",
    "        \"missing_condition\": \"PS.\",\n",
    "        \"deception\": \"DR.\",\n",
    "        \"multi_solutions\": \"MS.\",\n",
    "    },\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\hline \\multicolumn{9}{c}{\\textit{\\textbf{Open-source Process Level Reward Models}}} \\\\   \\hline \n",
      "\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B}{Skywork-PRM-1.5B} & 63.3 & 88.9 & 37.8 & 81.1 & 89.5 & 36.2 & 50.1 & 91.9\\\\\n",
      "\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B}{Skywork-PRM-7B} & 65.8 & 89.0 & 42.5 & 81.5 & 88.7 & 43.2 & 56.0 & 90.5\\\\\n",
      "\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf}{Llemma-PRM800k-7B} & 53.5 & 77.8 & 29.1 & 66.2 & 70.4 & 43.8 & 23.1 & 85.1\\\\\n",
      "\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf}{Llemma-MetaMath-7B} & 51.3 & 80.6 & 22.1 & 68.9 & 76.6 & 27.9 & 15.1 & 85.6\\\\\n",
      "\\href{https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf}{Llemma-oprm-7B} & 51.3 & 78.4 & 24.1 & 66.4 & 72.5 & 33.8 & 17.2 & 86.6\\\\\n",
      "\\href{https://github.com/KbsdJames/MATH-Minos}{MATHMinos-Mistral-7B} & 55.8 & 79.1 & 32.4 & 68.1 & 72.8 & 45.2 & 41.4 & 83.7\\\\\n",
      "\\href{https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm}{MathShepherd-Mistral-7B} & 48.6 & 65.6 & 31.7 & 54.2 & 52.5 & \\textbf{62.8} & \\textbf{57.1} & 84.0\\\\\n",
      "\\href{https://huggingface.co/GAIR/ReasonEval-7B}{ReasonEval-7B} & 58.0 & 90.6 & 25.4 & 83.2 & \\textbf{96.7} & 16.9 & 24.7 & 93.7\\\\\n",
      "\\href{https://huggingface.co/GAIR/ReasonEval-34B}{ReasonEval-34B} & 64.3 & 84.4 & 44.3 & 75.6 & 79.4 & \\underline{57.3} & \\underline{56.8} & 83.6\\\\\n",
      "\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data}{RLHFlow-PRM-Mistral-8B} & 56.2 & 87.5 & 24.9 & 78.6 & 90.3 & 21.0 & 27.1 & 92.2\\\\\n",
      "\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data}{RLHFlow-PRM-Deepseek-8B} & 55.4 & 89.5 & 21.4 & 81.5 & 95.1 & 14.9 & 19.4 & 95.0\\\\\n",
      "\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B}{Qwen2.5-Math-PRM-7B} & 69.1 & \\textbf{91.7} & 46.6 & \\textbf{85.6} & \\underline{95.4} & 37.3 & 46.9 & 87.0\\\\\n",
      "\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B}{Qwen2.5-Math-PRM-72B} & \\textbf{71.2} & \\underline{91.5} & \\textbf{50.9} & \\underline{85.5} & 93.9 & 44.3 & 55.9 & 85.2\\\\\n",
      "\\href{https://huggingface.co/jinachris/Qwen2.5-Math-7B-PRM800K}{Pure-PRM-7B} & \\underline{69.2} & 90.2 & \\underline{48.3} & 83.6 & 91.3 & 45.3 & 51.7 & 84.5\\\\\n",
      "\\cellcolor{gray!10} \\textbf{Avg.} & \\cellcolor{gray!10} 59.5 & \\cellcolor{gray!10} 84.6 & \\cellcolor{gray!10} 34.4 & \\cellcolor{gray!10} 75.7 & \\cellcolor{gray!10} 83.2 & \\cellcolor{gray!10} 37.9 & \\cellcolor{gray!10} 38.8 & \\cellcolor{gray!10} 87.8 \\\\\n",
      "\\hline \\multicolumn{9}{c}{\\textit{\\textbf{Open LLMs, Prompted as Critic Models}}} \\\\   \\hline \n",
      "\\href{https://huggingface.co/meta-math/MetaMath-7B-V1.0}{MetaMath-7B} & 48.3 & 90.4 & 6.2 & 82.5 & 95.3 & 4.1 & 3.5 & 96.6\\\\\n",
      "\\href{https://huggingface.co/meta-math/MetaMath-13B-V1.0}{MetaMath-13B} & 48.1 & \\underline{91.4} & 4.8 & \\underline{84.3} & 98.7 & 2.6 & 4.0 & 99.6\\\\\n",
      "\\href{https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B}{Qwen2.5-Math-72B} & 53.8 & 90.5 & 17.1 & 82.9 & \\underline{99.3} & 9.6 & 11.5 & 96.3\\\\\n",
      "\\href{https://huggingface.co/Qwen/QwQ-32B-Preview}{QwQ-Preview-32B} & \\underline{62.7} & 89.2 & \\underline{36.1} & 81.6 & 94.9 & \\underline{26.6} & \\underline{29.8} & 88.8\\\\\n",
      "\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B}{R1-Distill-Llama3.1-70B} & 54.1 & 91.0 & 17.2 & 83.7 & 99.0 & 9.9 & 12.0 & 96.1\\\\\n",
      "\\href{https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B}{R1-Distill-Qwen-7B} & 46.8 & \\textbf{91.8} & 1.9 & \\textbf{84.9} & \\textbf{99.5} & 1.0 & 1.2 & 99.3\\\\\n",
      "\\href{https://github.com/deepseek-ai/DeepSeek-R1}{DeepSeek-R1}$^\\dagger$ & \\textbf{68.0} & 88.0 & \\textbf{48.0} & 80.5 & 87.2 & \\textbf{50.0} & \\textbf{56.7} & 200.0\\\\\n",
      "\\cellcolor{gray!10} \\textbf{Avg.} & \\cellcolor{gray!10} 54.5 & \\cellcolor{gray!10} 90.3 & \\cellcolor{gray!10} 18.7 & \\cellcolor{gray!10} 82.9 & \\cellcolor{gray!10} 96.3 & \\cellcolor{gray!10} 14.8 & \\cellcolor{gray!10} 17.0 & \\cellcolor{gray!10} 111.0 \\\\\n",
      "\\hline \\multicolumn{9}{c}{\\textit{\\textbf{Proprietary LLMs, Prompted as Critic Models}}} \\\\   \\hline \n",
      "\\href{https://openai.com/index/hello-gpt-4o/}{GPT-4o} & 65.7 & \\underline{89.2} & 42.2 & \\underline{81.8} & \\underline{90.5} & 39.3 & 41.3 & 84.8\\\\\n",
      "\\href{https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/}{o1-mini}$^\\dagger$ & 64.8 & 86.7 & \\underline{42.9} & 78.4 & 84.5 & \\underline{48.2} & \\underline{43.8} & 200.0\\\\\n",
      "\\href{https://deepmind.google/technologies/gemini/flash/}{Gemini-2.0-flash-exp} & \\textbf{66.2} & 86.3 & \\textbf{46.1} & 78.1 & 82.7 & \\textbf{55.5} & \\textbf{60.2} & 83.4\\\\\n",
      "\\href{https://ai.google.dev/gemini-api/docs/thinking-mode}{Gemini-2.0-thinking-exp-1219} & \\underline{65.7} & \\textbf{89.7} & 41.8 & \\textbf{82.5} & \\textbf{91.8} & 37.0 & 40.2 & 86.4\\\\\n",
      "\\cellcolor{gray!10} \\textbf{Avg.} & \\cellcolor{gray!10} 65.6 & \\cellcolor{gray!10} 88.0 & \\cellcolor{gray!10} 43.2 & \\cellcolor{gray!10} 80.2 & \\cellcolor{gray!10} 87.4 & \\cellcolor{gray!10} 45.0 & \\cellcolor{gray!10} 46.4 & \\cellcolor{gray!10} 113.7 \\\\\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Classification reported in the appendix table built by this cell.\n",
    "classification = \"deception\"\n",
    "\n",
    "# Panels of the appendix table, top to bottom: (panel title, model key -> display name).\n",
    "appendix_table_sections = [\n",
    "    (\"Open-source Process Level Reward Models\", prm_model_name_dict),\n",
    "    (\"Open LLMs, Prompted as Critic Models\", open_model_name_dict),\n",
    "    (\"Proprietary LLMs, Prompted as Critic Models\", close_model_name_dict),\n",
    "]\n",
    "# Column count: model name + PRM score + one column per display metric (9 currently).\n",
    "n_columns = 2 + len(display_metrics)\n",
    "\n",
    "res_str = \"\"\n",
    "for section_title, section_models in appendix_table_sections:\n",
    "    # Every LaTeX backslash is escaped explicitly; the unescaped forms used before\n",
    "    # (backslash-h, backslash-m) are invalid escape sequences and warn on recent Python.\n",
    "    model_type_panel = (\n",
    "        \"\\\\hline \\\\multicolumn{\" + str(n_columns) + \"}{c}{\\\\textit{\\\\textbf{\"\n",
    "        + section_title\n",
    "        + \"}}} \\\\\\\\   \\\\hline \\n\"\n",
    "    )\n",
    "    res_dict = get_res_dict(file_dict, model_lists=list(section_models.keys()))\n",
    "    res_str += model_type_panel + get_appendix_res_str(section_models, res_dict, classification)\n",
    "\n",
    "print(res_str)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "smoe",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
