{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mr_eval.utils.utils import *\n",
    "import os\n",
    "\n",
    "def list_jsonl_files(folder_path):\n",
    "    \"\"\"\n",
    "    List the names of all .jsonl files directly inside a folder.\n",
    "\n",
    "    Args:\n",
    "        folder_path (str): Directory to scan (sub-directories are not searched).\n",
    "\n",
    "    Returns:\n",
    "        List[str]: Bare file names (not full paths) ending in \".jsonl\",\n",
    "        sorted alphabetically so repeated runs yield the same order.\n",
    "\n",
    "    Raises:\n",
    "        OSError: Propagated from os.listdir, e.g. when folder_path does not exist.\n",
    "    \"\"\"\n",
    "    # os.listdir returns entries in arbitrary order; sort for a reproducible result.\n",
    "    return sorted(f for f in os.listdir(folder_path) if f.endswith(\".jsonl\"))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "llemma7b_prm_prm800k\t52.0\t75.7\t28.3\t63.7\t66.4\t48.5\t22.2\t20.5\t49.3\t77.7\t20.9\t65.2\t69.2\t36.9\t20.5\t13.6\t53.4\t65.5\t41.3\t56.6\t54.9\t61.4\t15.7\t34.4\t56.4\t76.8\t36.1\t65.9\t69.2\t51.7\t20.0\t18.8\t47.1\t75.2\t18.9\t62.1\t64.3\t42.5\t24.1\t19.5\t46.7\t71.6\t21.9\t58.3\t59.9\t47.1\t20.5\t25.7\t53.3\t79.9\t26.6\t68.5\t70.9\t50.0\t29.4\t13.8\t51.0\t71.7\t30.3\t59.7\t62.3\t47.9\t24.4\t25.0\t53.5\t77.8\t29.1\t66.2\t70.4\t43.8\t23.1\t14.9\t46.8\t93.6\t0.0\t88.0\t88.0\t-100.0\t-100.0\t10.2\n",
      "gpt4o\t66.8\t86.9\t46.7\t79.0\t82.9\t58.2\t64.4\t23.4\t57.0\t77.8\t36.3\t67.0\t66.5\t70.4\t77.1\t31.1\t62.4\t73.5\t51.3\t65.6\t65.4\t66.2\t80.6\t40.1\t72.0\t88.9\t55.2\t82.2\t88.8\t55.4\t63.2\t21.3\t69.7\t89.9\t49.6\t83.1\t84.3\t74.3\t76.9\t23.7\t70.7\t88.2\t53.3\t81.2\t81.3\t80.4\t81.4\t28.6\t71.1\t92.1\t50.2\t86.3\t90.4\t56.5\t60.4\t14.8\t62.5\t86.8\t38.3\t78.2\t88.8\t34.7\t33.3\t16.2\t65.7\t89.2\t42.2\t81.8\t90.5\t39.3\t41.3\t15.2\t49.6\t99.2\t0.0\t98.4\t98.4\t-100.0\t-100.0\t3.2\n",
      "gemini_2_thinking\t68.8\t89.7\t47.8\t82.8\t89.0\t49.8\t57.0\t18.0\t68.5\t91.4\t45.6\t85.1\t90.9\t47.1\t56.5\t15.3\t63.8\t81.2\t46.4\t72.2\t82.8\t44.0\t54.8\t25.4\t72.9\t89.4\t56.4\t83.0\t89.8\t55.5\t64.8\t20.5\t71.3\t91.4\t51.2\t85.4\t87.6\t68.5\t72.8\t19.0\t71.0\t88.8\t53.2\t81.9\t82.6\t77.2\t79.2\t26.3\t71.8\t92.6\t51.1\t87.1\t91.4\t55.8\t60.5\t13.9\t60.3\t87.1\t33.5\t78.4\t90.5\t28.0\t27.0\t13.0\t65.7\t89.7\t41.8\t82.5\t91.8\t37.0\t40.2\t13.6\t49.9\t99.8\t0.0\t99.7\t99.7\t-100.0\t-100.0\t2.7\n",
      "gemini_2_flash_0shot\t65.3\t86.3\t44.4\t78.0\t82.2\t55.7\t63.7\t19.5\t69.0\t92.9\t45.0\t87.4\t94.8\t38.9\t49.7\t15.0\t57.1\t82.9\t31.2\t72.6\t91.4\t22.7\t31.5\t17.0\t69.9\t85.0\t54.8\t77.4\t79.4\t69.2\t77.1\t23.4\t64.6\t85.5\t43.8\t76.9\t76.4\t81.2\t84.4\t22.3\t64.8\t82.5\t47.0\t73.7\t71.6\t87.4\t87.5\t29.0\t66.4\t87.4\t45.5\t79.5\t80.9\t69.8\t77.1\t16.1\t60.7\t82.7\t38.7\t73.1\t80.1\t43.8\t43.1\t18.4\t64.8\t85.3\t44.4\t76.7\t81.1\t55.1\t59.0\t17.4\t48.6\t97.2\t0.0\t94.6\t94.6\t-100.0\t-100.0\t6.9\n",
      "skyworkprm_1_5B\t31.7\t37.7\t25.7\t32.3\t24.1\t79.4\t74.7\t23.7\t31.4\t43.7\t19.1\t33.6\t29.4\t62.6\t59.1\t7.8\t35.8\t34.7\t36.9\t35.8\t22.7\t75.2\t71.4\t27.1\t32.4\t32.9\t32.0\t32.4\t20.3\t85.4\t76.4\t27.8\t25.7\t32.0\t19.3\t26.2\t19.4\t84.8\t80.4\t26.4\t26.0\t31.1\t20.8\t26.3\t19.0\t78.0\t72.1\t26.9\t30.2\t37.7\t22.7\t31.0\t23.5\t88.2\t85.5\t18.9\t33.1\t36.2\t30.1\t33.3\t23.2\t78.4\t73.9\t29.8\t32.3\t36.3\t28.2\t32.5\t22.8\t84.0\t79.1\t25.5\t40.6\t81.1\t0.0\t68.2\t68.2\t-100.0\t-100.0\t21.4\n",
      "gemini_2_flash_1shot\t64.9\t86.0\t43.9\t77.6\t81.6\t55.7\t63.0\t20.0\t66.8\t91.7\t41.9\t85.5\t92.5\t39.5\t48.7\t17.4\t52.2\t81.1\t23.4\t69.6\t89.5\t17.0\t21.5\t17.4\t69.7\t84.6\t54.8\t77.0\t78.7\t70.2\t76.7\t23.7\t65.1\t85.6\t44.5\t77.2\t76.5\t82.7\t85.9\t23.0\t64.9\t82.5\t47.4\t73.7\t71.4\t88.7\t88.2\t29.3\t66.8\t87.8\t45.8\t80.0\t81.5\t69.7\t77.1\t15.7\t61.4\t82.7\t40.1\t73.1\t79.6\t46.3\t45.5\t18.9\t65.2\t85.4\t45.1\t76.9\t81.1\t56.3\t60.2\t17.2\t49.0\t97.9\t0.0\t95.9\t95.9\t-100.0\t-100.0\t7.4\n",
      "gemini_2_thinking_1shot\t67.8\t89.2\t46.5\t82.1\t88.2\t49.3\t57.0\t18.3\t67.4\t90.8\t43.9\t84.2\t89.9\t46.7\t55.0\t16.0\t61.7\t80.4\t43.0\t70.9\t82.4\t40.2\t51.7\t23.9\t72.0\t88.9\t55.0\t82.2\t89.0\t54.8\t64.5\t20.9\t71.0\t91.2\t50.8\t85.1\t87.0\t69.6\t74.8\t19.5\t70.0\t88.0\t52.0\t80.8\t81.2\t78.2\t80.0\t27.1\t71.5\t92.5\t50.5\t86.9\t91.2\t55.3\t59.8\t14.0\t59.9\t86.3\t33.4\t77.3\t88.8\t29.4\t29.3\t13.7\t65.0\t89.3\t40.6\t81.8\t90.9\t36.9\t40.5\t14.5\t49.9\t99.8\t0.0\t99.5\t99.5\t-100.0\t-100.0\t2.9\n",
      "llemma7b_prm_metamath\t50.5\t80.4\t20.7\t68.5\t75.6\t27.7\t15.1\t15.8\t50.2\t85.3\t15.0\t75.0\t83.2\t17.6\t15.3\t5.6\t50.5\t70.8\t30.3\t58.8\t66.4\t36.0\t15.7\t25.2\t51.9\t78.0\t25.8\t66.0\t73.9\t31.7\t15.4\t16.7\t47.6\t81.3\t13.9\t69.2\t74.5\t23.9\t15.5\t15.0\t44.4\t76.0\t12.9\t62.3\t68.0\t22.5\t12.9\t20.9\t52.1\t83.7\t20.4\t73.0\t78.5\t30.3\t15.6\t11.1\t50.5\t78.7\t22.4\t66.5\t75.5\t26.5\t15.6\t17.2\t51.3\t80.6\t22.1\t68.9\t76.6\t27.9\t15.1\t14.4\t48.0\t96.0\t0.0\t92.3\t92.3\t-100.0\t-100.0\t16.8\n",
      "o1mini\t68.8\t89.2\t48.3\t82.1\t86.9\t55.4\t56.4\t19.6\t65.6\t90.8\t40.4\t84.1\t90.4\t41.5\t45.8\t15.9\t63.7\t80.4\t47.0\t71.4\t80.6\t46.6\t47.9\t-100.0\t74.5\t88.9\t60.0\t82.7\t85.6\t69.8\t75.0\t-100.0\t67.7\t89.7\t45.7\t82.7\t84.4\t68.6\t70.8\t25.8\t73.8\t92.2\t55.5\t86.7\t87.7\t77.8\t74.5\t20.0\t72.3\t91.6\t53.1\t85.7\t87.3\t72.7\t75.0\t19.9\t61.8\t84.9\t38.7\t75.7\t88.6\t33.1\t18.8\t-100.0\t64.8\t86.7\t42.9\t78.4\t84.5\t48.2\t43.8\t-100.0\t0.0\t100.0\t-100.0\t100.0\t100.0\t-100.0\t-100.0\t2.3\n",
      "skyworkprm_7B\t36.2\t45.1\t27.4\t37.5\t30.1\t79.7\t76.6\t25.7\t35.7\t52.6\t18.8\t40.2\t38.0\t55.5\t52.5\t5.9\t41.2\t44.2\t38.1\t41.3\t30.9\t72.6\t71.6\t26.5\t36.7\t39.1\t34.4\t36.8\t24.9\t88.9\t82.5\t32.4\t29.1\t37.0\t21.2\t30.0\t23.0\t90.3\t87.3\t31.2\t30.6\t37.6\t23.7\t31.3\t23.6\t85.5\t80.6\t31.8\t34.4\t44.4\t24.4\t35.9\t28.9\t90.4\t87.7\t22.6\t36.8\t42.9\t30.8\t37.4\t28.8\t76.0\t72.8\t29.8\t37.4\t44.9\t29.9\t38.3\t29.8\t83.3\t77.8\t26.7\t44.4\t88.8\t0.0\t79.8\t79.8\t-100.0\t-100.0\t19.2\n",
      "gemini_2_thinking_0shot\t67.8\t89.1\t46.5\t81.9\t87.9\t49.8\t57.2\t18.3\t69.3\t91.6\t46.9\t85.5\t91.2\t48.3\t58.8\t15.6\t65.7\t81.1\t50.2\t72.6\t81.0\t50.4\t64.7\t27.5\t71.8\t88.5\t55.1\t81.7\t87.8\t56.8\t64.0\t20.8\t68.9\t90.5\t47.3\t83.9\t86.1\t65.6\t71.3\t19.2\t69.7\t88.3\t51.2\t81.1\t82.2\t74.1\t76.1\t25.3\t69.0\t91.6\t46.5\t85.5\t90.2\t51.8\t57.4\t14.0\t58.6\t86.4\t30.8\t77.2\t89.5\t26.1\t24.5\t12.9\t63.9\t88.6\t39.2\t80.8\t89.8\t36.5\t40.6\t13.9\t49.8\t99.6\t0.0\t99.2\t99.2\t-100.0\t-100.0\t3.1\n",
      "reasoneval7b\t60.0\t90.8\t29.2\t83.8\t95.5\t21.2\t30.3\t8.4\t61.0\t91.5\t30.5\t84.8\t94.0\t25.0\t32.8\t10.2\t50.1\t80.8\t19.4\t69.0\t89.8\t13.6\t19.0\t11.9\t62.1\t89.7\t34.6\t82.2\t96.6\t23.7\t37.4\t8.8\t65.9\t94.1\t37.7\t89.2\t96.7\t29.4\t40.1\t7.7\t61.5\t92.1\t30.9\t85.8\t95.3\t23.8\t30.5\t8.8\t66.0\t93.7\t38.2\t88.6\t96.9\t28.9\t40.3\t7.0\t55.6\t88.4\t22.9\t79.8\t95.4\t15.4\t17.3\t7.2\t58.0\t90.6\t25.4\t83.2\t96.7\t16.9\t24.6\t6.2\t49.8\t99.5\t0.0\t99.1\t99.1\t-100.0\t-100.0\t3.9\n",
      "reasoneval34b\t60.5\t83.8\t37.2\t74.2\t79.1\t48.4\t50.8\t17.2\t54.8\t86.7\t22.9\t77.4\t85.3\t25.3\t31.6\t16.2\t48.1\t76.7\t19.5\t63.9\t81.9\t16.0\t19.9\t16.6\t66.4\t83.3\t49.4\t74.9\t78.1\t61.9\t61.1\t18.5\t60.3\t83.7\t36.9\t74.1\t74.8\t68.2\t70.6\t18.5\t57.8\t80.3\t35.3\t69.7\t71.0\t61.9\t57.9\t22.5\t67.5\t87.7\t47.2\t80.1\t81.0\t73.3\t73.3\t13.1\t57.7\t80.5\t35.0\t70.0\t76.8\t41.6\t35.2\t17.4\t64.3\t84.4\t44.3\t75.6\t79.4\t57.3\t56.8\t16.4\t48.6\t97.2\t0.0\t94.6\t94.6\t-100.0\t-100.0\t8.8\n",
      "qwen_qwq\t63.6\t87.6\t39.6\t79.4\t89.2\t36.4\t40.2\t16.9\t57.2\t85.8\t28.7\t76.3\t84.3\t31.4\t33.9\t18.1\t55.6\t76.1\t35.2\t65.0\t77.9\t33.1\t37.2\t24.2\t67.4\t87.6\t47.2\t79.9\t92.8\t38.1\t42.5\t16.5\t72.3\t91.3\t53.3\t85.3\t88.8\t62.4\t65.0\t20.3\t66.2\t87.0\t45.5\t78.9\t82.9\t57.0\t51.6\t25.6\t66.9\t91.7\t42.2\t85.4\t94.8\t34.2\t43.1\t10.9\t57.8\t86.4\t29.1\t77.2\t92.8\t21.5\t18.3\t11.7\t62.7\t89.2\t36.1\t81.6\t94.9\t26.6\t29.8\t11.2\t0.0\t100.0\t-100.0\t100.0\t100.0\t-100.0\t-100.0\t1.5\n",
      "gemini_2_flash\t66.0\t86.5\t45.5\t78.4\t82.3\t57.2\t64.3\t20.0\t67.2\t91.5\t42.9\t85.1\t91.8\t41.7\t49.7\t17.6\t58.1\t81.8\t34.5\t71.5\t88.1\t27.4\t34.7\t20.6\t70.4\t85.3\t55.4\t77.9\t80.0\t69.4\t76.3\t22.9\t65.7\t86.0\t45.4\t77.7\t77.0\t83.1\t85.6\t22.7\t66.0\t83.6\t48.4\t75.1\t73.2\t87.5\t87.3\t29.2\t67.3\t88.3\t46.4\t80.8\t82.4\t68.8\t76.0\t15.1\t61.8\t83.2\t40.4\t73.8\t80.5\t45.7\t44.2\t18.6\t66.2\t86.3\t46.1\t78.1\t82.7\t55.5\t60.2\t16.6\t49.1\t98.2\t0.0\t96.5\t96.5\t-100.0\t-100.0\t6.4\n",
      "o1preview\t65.7\t86.3\t45.1\t78.1\t81.4\t59.6\t63.0\t21.3\t64.5\t90.7\t38.3\t83.9\t90.7\t38.3\t39.6\t15.0\t65.3\t80.7\t50.0\t72.1\t80.0\t51.1\t60.4\t-100.0\t68.7\t84.7\t52.7\t76.8\t78.6\t69.0\t77.1\t-100.0\t62.8\t85.0\t40.5\t76.1\t76.0\t76.7\t81.2\t26.2\t67.6\t87.7\t47.4\t80.1\t79.7\t82.9\t79.2\t24.1\t67.2\t87.5\t46.9\t79.7\t79.8\t78.9\t79.2\t22.6\t57.9\t80.1\t35.6\t69.6\t79.8\t36.2\t27.1\t-100.0\t66.0\t85.2\t46.7\t76.8\t80.1\t60.7\t60.4\t-100.0\t0.0\t100.0\t-100.0\t100.0\t100.0\t-100.0\t-100.0\t6.5\n",
      "mathshepherd\t47.0\t64.9\t29.2\t53.0\t51.5\t61.1\t54.6\t16.5\t44.0\t67.7\t20.2\t54.0\t55.6\t43.7\t39.7\t6.2\t50.3\t60.4\t40.2\t52.3\t49.9\t58.6\t50.3\t21.0\t49.4\t62.4\t36.4\t52.7\t48.9\t68.0\t58.0\t18.4\t44.5\t64.9\t24.2\t52.0\t49.9\t68.6\t63.0\t18.1\t41.3\t59.2\t23.3\t46.7\t44.6\t60.5\t52.2\t20.2\t47.7\t66.6\t28.7\t54.5\t51.7\t75.1\t72.2\t12.7\t47.2\t63.6\t30.9\t52.3\t51.8\t54.5\t45.0\t18.6\t48.6\t65.6\t31.7\t54.2\t52.5\t62.8\t57.1\t16.0\t43.1\t86.1\t0.0\t75.6\t75.6\t-100.0\t-100.0\t21.2\n",
      "mathminos_mistral\t54.2\t79.2\t29.1\t67.9\t72.8\t41.7\t38.0\t15.4\t48.8\t82.7\t14.9\t71.2\t79.2\t19.0\t20.8\t4.3\t54.0\t70.8\t37.2\t60.2\t66.6\t43.2\t40.6\t20.9\t57.0\t77.3\t36.7\t66.6\t71.1\t48.6\t39.7\t18.0\t52.1\t80.7\t23.5\t69.2\t72.6\t42.4\t39.4\t14.5\t50.7\t76.1\t25.3\t63.8\t66.6\t45.8\t39.0\t19.4\t57.8\t82.5\t33.1\t72.3\t74.5\t56.3\t53.8\t12.7\t52.8\t77.5\t28.0\t65.7\t73.3\t34.2\t29.5\t16.9\t55.8\t79.1\t32.4\t68.1\t72.8\t45.2\t41.4\t16.3\t45.5\t91.1\t0.0\t83.6\t83.6\t-100.0\t-100.0\t18.0\n",
      "gpt4o_1shot\t68.2\t88.9\t47.4\t81.7\t87.3\t52.0\t58.8\t20.5\t62.9\t86.1\t39.7\t77.4\t80.8\t55.6\t66.8\t22.4\t66.2\t78.7\t53.7\t70.8\t74.1\t62.1\t78.9\t35.1\t71.9\t89.5\t54.3\t82.9\t91.1\t50.5\t56.5\t19.1\t70.7\t90.9\t50.5\t84.7\t86.6\t69.7\t73.1\t22.3\t71.5\t89.2\t53.8\t82.4\t83.3\t76.7\t77.7\t27.3\t71.2\t92.9\t49.6\t87.5\t92.9\t49.7\t53.9\t13.4\t61.9\t87.6\t36.2\t79.2\t91.0\t30.3\t29.7\t15.3\t63.8\t89.8\t37.8\t82.4\t92.7\t31.7\t33.3\t13.2\t49.8\t99.7\t0.0\t99.4\t99.4\t-100.0\t-100.0\t2.7\n",
      "llama3_1_8b_prm_mistral\t54.4\t87.7\t21.1\t78.8\t90.2\t17.9\t22.1\t7.2\t46.1\t89.1\t3.2\t80.3\t92.3\t2.4\t3.4\t1.5\t47.3\t79.8\t14.9\t67.3\t88.7\t10.5\t18.9\t6.3\t56.6\t85.8\t27.4\t76.2\t89.6\t22.5\t24.5\t9.5\t55.1\t90.0\t20.2\t82.2\t90.0\t20.1\t22.9\t7.7\t54.4\t87.8\t21.0\t78.9\t87.8\t20.9\t18.8\t9.6\t63.8\t91.0\t36.7\t84.2\t90.7\t37.5\t45.0\t9.1\t51.5\t85.3\t17.6\t75.0\t89.9\t13.7\t16.2\t6.9\t56.2\t87.5\t24.9\t78.6\t90.3\t21.0\t27.1\t7.8\t49.0\t97.9\t0.0\t95.9\t95.9\t-100.0\t-100.0\t4.0\n",
      "llama3_1_8b_prm_deepseek\t54.2\t89.9\t18.6\t82.0\t95.0\t13.0\t17.0\t5.0\t46.4\t91.0\t1.9\t83.5\t96.1\t1.2\t1.5\t1.0\t48.9\t82.3\t15.4\t70.7\t93.6\t9.8\t15.6\t5.4\t55.7\t87.8\t23.5\t79.0\t94.6\t16.2\t21.4\t6.5\t55.0\t92.4\t17.7\t86.1\t95.2\t13.4\t16.6\t4.9\t53.2\t90.4\t15.9\t82.8\t93.7\t12.2\t11.6\t5.8\t66.2\t93.0\t39.5\t87.5\t94.9\t33.5\t43.3\t7.9\t49.0\t87.0\t10.9\t77.4\t94.4\t7.1\t6.7\t4.1\t55.4\t89.5\t21.4\t81.5\t95.1\t14.9\t19.4\t5.0\t49.9\t99.8\t0.0\t99.6\t99.6\t-100.0\t-100.0\t1.5\n",
      "gpt4o_0shot\t68.1\t89.3\t46.8\t82.2\t88.3\t49.6\t56.9\t18.9\t64.5\t88.6\t40.4\t80.9\t85.9\t48.5\t62.8\t17.7\t66.3\t80.0\t52.7\t71.9\t77.3\t57.4\t76.0\t29.9\t70.7\t89.2\t52.2\t82.4\t90.8\t48.6\t53.7\t18.4\t70.9\t91.1\t50.6\t84.9\t86.9\t69.1\t72.5\t21.6\t71.2\t89.0\t53.3\t82.2\t83.2\t76.1\t76.6\t26.6\t70.7\t92.7\t48.8\t87.2\t92.4\t49.8\t53.2\t12.9\t60.6\t87.2\t33.9\t78.5\t90.6\t28.4\t26.4\t14.4\t63.7\t89.8\t37.5\t82.5\t92.9\t31.1\t33.6\t12.6\t50.0\t100.0\t0.0\t100.0\t100.0\t-100.0\t-100.0\t3.9\n",
      "llemma7b_oprm_prm800k\t50.3\t77.3\t23.3\t64.9\t69.9\t36.1\t16.6\t16.3\t48.7\t80.3\t17.2\t68.1\t74.1\t26.5\t16.0\t8.0\t49.3\t66.3\t32.2\t55.0\t59.0\t42.9\t10.2\t26.7\t54.2\t77.3\t31.1\t65.9\t71.4\t41.6\t15.6\t16.5\t46.8\t77.6\t16.1\t64.6\t68.3\t32.6\t16.8\t16.6\t44.5\t73.4\t15.6\t59.6\t63.8\t30.2\t13.6\t20.6\t53.5\t81.7\t25.4\t70.6\t74.0\t43.8\t27.2\t11.0\t49.2\t74.4\t24.1\t61.7\t68.0\t33.4\t16.1\t18.8\t51.3\t78.4\t24.1\t66.4\t72.5\t33.8\t17.2\t13.4\t45.9\t91.8\t0.0\t84.8\t84.8\t-100.0\t-100.0\t12.9\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Directory holding the per-model .jsonl result files read below.\n",
    "res_dir = \"/mnt/petrelfs/songmingyang/code/reasoning/MR_Hallucination/mr_eval/scripts/logs/prmtest_classified\"\n",
    "# os.listdir order is arbitrary; sort so the printed rows are reproducible across runs.\n",
    "res_files = sorted(list_jsonl_files(res_dir))\n",
    "# Short display label for each classification key (not referenced again in this cell).\n",
    "classification_name_dict = dict(\n",
    "    domain_inconsistency=\"DC.\",\n",
    "    redundency=\"NR.\",\n",
    "    multi_solutions=\"MS.\",\n",
    "    deception=\"DR.\",\n",
    "    confidence=\"CI.\",\n",
    "    step_contradiction=\"SC.\",\n",
    "    circular=\"NCL.\",\n",
    "    missing_condition=\"PS.\",\n",
    "    counterfactual=\"ES.\"\n",
    ")\n",
    "# Order of the per-classification column groups in each printed row.\n",
    "classifications = [\"redundency\", \"circular\", \"counterfactual\", \"step_contradiction\", \"domain_inconsistency\",  \"confidence\", \"missing_condition\", \"deception\", \"multi_solutions\", ]\n",
    "# Metrics printed, in this order, after the PRM score of every column group.\n",
    "metrics = [\"f1\", \"negative_f1\", \"total_step_acc\", \"correct_step_acc\", \"wrong_step_acc\", \"first_error_acc\", \"similarity\",]\n",
    "\n",
    "# splitext strips only the trailing extension, so a model name containing dots is kept whole.\n",
    "res_names = [os.path.splitext(f)[0] for f in res_files]\n",
    "res_paths = [os.path.join(res_dir, f) for f in res_files]\n",
    "\n",
    "\n",
    "def as_percent(value):\n",
    "    \"\"\"Format a fractional score as a percentage string with one decimal place.\"\"\"\n",
    "    return f\"{value * 100:.1f}\"\n",
    "\n",
    "\n",
    "rows = []\n",
    "for name, path in zip(res_names, res_paths):\n",
    "    # Only the last record returned by process_jsonl is read -- presumably the aggregated summary; TODO confirm.\n",
    "    summary = process_jsonl(path)[-1]\n",
    "    total_results = summary[\"total_hallucination_results\"]\n",
    "    type_results = summary[\"hallucination_type_results\"]\n",
    "\n",
    "    # Overall column group: PRM score (mean of f1 and negative_f1) followed by every metric.\n",
    "    prm_score = total_results[\"f1\"] * 0.5 + total_results[\"negative_f1\"] * 0.5\n",
    "    cells = [name, as_percent(prm_score)]\n",
    "    cells.extend(as_percent(total_results[metric]) for metric in metrics)\n",
    "\n",
    "    # One column group per classification, laid out like the overall group.\n",
    "    for classification in classifications:\n",
    "        prm_score = type_results[\"f1\"][classification] * 0.5 + type_results[\"negative_f1\"][classification] * 0.5\n",
    "        cells.append(as_percent(prm_score))\n",
    "        cells.extend(as_percent(type_results[metric][classification]) for metric in metrics)\n",
    "\n",
    "    rows.append(\"\\t\".join(cells))\n",
    "\n",
    "# Every row ends with a newline, so print() leaves one blank line after the table.\n",
    "res_str = \"\".join(row + \"\\n\" for row in rows)\n",
    "print(res_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "all\tredundency\tcircular\tcounterfactual\tstep_contradiction\tdomain_inconsistency\tconfidence\tmissing_condition\tdeception\tmulti_solutions\n"
     ]
    }
   ],
   "source": [
    "# Header row: the overall \"all\" group followed by every classification key, tab-separated.\n",
    "header_columns = [\"all\", *classifications]\n",
    "classifications_str = \"\\t\".join(header_columns)\n",
    "print(classifications_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "smoe",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
