{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 127,
   "id": "7cc7b9d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import json\n",
    "import sklearn as sk\n",
    "import matplotlib.pyplot as plt\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "id": "1ac9523f",
   "metadata": {},
   "outputs": [],
   "source": [
    "alldata = {}\n",
    "models = ['Qwen1', 'Qwen14', 'Qwen70', 'DSChat', 'DSReason', 'o3Batch', 'GeminiFlash']\n",
    "tests = ['Math', 'Logic']\n",
    "humanbruteforcemath = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 1, 10: 0, 11: 0, 12: 1, 13: 0, 14: 0, 15: 0, 16: 0, 17: 0, 18: 0, 19: 0, 20: 0, 21: 0, 22: 0, 23: 0, 24: 0, 25: 0, 26: 0, 27: 1, 28: 0, 29: 0, 30: 0, 31: 0, 32: 1, 33: 0, 34: 0, 35: 0, 36: 0, 37: 0, 38: 1, 39: 0, 40: 0, 41: 0, 42: 0, 43: 1, 44: 1, 45: 1, 46: 1, 47: 0, 48: 0, 49: 0, 50: 0, 51: 0, 52: 0, 53: 0, 54: 1, 55: 1, 56: 0, 57: 0, 58: 0, 59: 0, 60: 0, 61: 0, 62: 0, 63: 1, 64: 0, 65: 0, 66: 0, 67: 0, 68: 0, 69: 0, 70: 0, 71: 0, 72: 1, 73: 0, 74: 0, 75: 0, 76: 0, 77: 0, 78: 0, 79: 0, 80: 0, 81: 1, 82: 0, 83: 0, 84: 0, 85: 0, 86: 0, 87: 0, 88: 0, 89: 0, 90: 0, 91: 0, 92: 1, 93: 0, 94: 0, 95: 0, 96: 0, 97: 1, 98: 0, 99: 0, 100: 0, 101: 0, 102: 0, 103: 1, 104: 1, 105: 0, 106: 0, 107: 1, 108: 0, 109: 0, 110: 0, 111: 0, 112: 0, 113: 0, 114: 0, 115: 0, 116: 0, 117: 0, 118: 0, 119: 0, 120: 0, 121: 0, 122: 0, 123: 0, 124: 0, 125: 0, 126: 0, 127: 0, 128: 0, 129: 1, 130: 0, 131: 0, 132: 1, 133: 1, 134: 0, 135: 1, 136: 0, 137: 0, 138: 0, 139: 0, 140: 0, 141: 0, 142: 0, 143: 0, 144: 0, 145: 0, 146: 0, 147: 0, 148: 0, 149: 0, 150: 1, 151: 0, 152: 0, 153: 0, 154: 1, 155: 0, 156: 0, 157: 0, 158: 0, 159: 0, 160: 0, 161: 0, 162: 0, 163: 0, 164: 0, 165: 0, 166: 0, 167: 0, 168: 0, 169: 0, 170: 1, 171: 0, 172: 0, 173: 1, 174: 0, 175: 0, 176: 0, 177: 0, 178: 0, 179: 0, 180: 0, 181: 0, 182: 0, 183: 0, 184: 1, 185: 0, 186: 1, 187: 0, 188: 0, 189: 0, 190: 0, 191: 0, 192: 0, 193: 0, 194: 0, 195: 0, 196: 1, 197: 0, 198: 0, 199: 0, 200: 0, 201: 0, 202: 0, 203: 0, 204: 0, 205: 0, 206: 0, 207: 1, 208: 0, 209: 0, 210: 0, 211: 0, 212: 0, 213: 0, 214: 0, 215: 0, 216: 0, 217: 0, 218: 0, 219: 0, 220: 0, 221: 0, 222: 0, 223: 0, 224: 0, 225: 1, 226: 0, 227: 0, 228: 0, 229: 0, 230: 0, 231: 0, 232: 0, 233: 0, 234: 0, 235: 0, 236: 0, 237: 0, 238: 0, 239: 0, 240: 0, 241: 0, 242: 0, 243: 0, 244: 0, 245: 0, 246: 1, 247: 0, 248: 0, 249: 0}\n",
    "humanbruteforcelogic = {0: 0, 1: 0, 2: 0, 3: 1, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 1, 14: 0, 15: 0, 16: 0, 17: 0, 18: 0, 19: 1, 20: 0, 21: 0, 22: 0, 23: 0, 24: 0, 25: 0, 26: 0, 27: 0, 28: 0, 29: 0, 30: 0, 31: 0, 32: 0, 33: 1, 34: 0, 35: 0, 36: 0, 37: 0, 38: 0, 39: 1, 40: 0, 41: 0, 42: 0, 43: 0, 44: 0, 45: 0, 46: 0, 47: 0, 48: 0, 49: 0, 50: 1, 51: 0, 52: 1, 53: 0, 54: 0, 55: 0, 56: 0, 57: 0, 58: 1, 59: 0, 60: 0, 61: 0, 62: 0, 63: 1, 64: 1, 65: 0, 66: 0, 67: 0, 68: 0, 69: 0, 70: 0, 71: 0, 72: 1, 73: 0, 74: 0, 75: 0, 76: 0, 77: 0, 78: 0, 79: 0, 80: 0, 81: 0, 82: 0, 83: 0, 84: 0, 85: 0, 86: 0, 87: 0, 88: 0, 89: 0, 90: 0, 91: 0, 92: 0, 93: 0, 94: 0, 95: 0, 96: 1, 97: 0, 98: 0, 99: 0, 100: 0, 101: 0, 102: 0, 103: 0, 104: 1, 105: 0, 106: 0, 107: 1, 108: 0, 109: 1, 110: 0, 111: 0, 112: 0, 113: 0, 114: 0, 115: 0, 116: 0, 117: 0, 118: 0, 119: 0, 120: 0, 121: 0, 122: 0, 123: 1, 124: 0, 125: 0, 126: 0, 127: 1, 128: 0, 129: 0, 130: 0, 131: 0, 132: 0, 133: 0, 134: 0, 135: 0, 136: 0, 137: 0, 138: 0, 139: 0, 140: 0, 141: 0, 142: 0, 143: 0, 144: 0, 145: 0, 146: 0, 147: 0, 148: 0, 149: 0, 150: 0, 151: 0, 152: 0, 153: 0, 154: 0, 155: 0, 156: 0, 157: 0, 158: 0, 159: 0, 160: 0, 161: 0, 162: 0, 163: 0, 164: 0, 165: 0, 166: 0, 167: 0, 168: 0, 169: 0, 170: 0, 171: 0, 172: 0, 173: 0, 174: 0, 175: 0, 176: 1, 177: 0, 178: 0, 179: 0, 180: 0, 181: 0, 182: 0, 183: 0, 184: 0, 185: 0, 186: 0, 187: 1, 188: 0, 189: 0, 190: 0, 191: 0, 192: 0, 193: 1, 194: 0, 195: 0, 196: 0, 197: 0, 198: 0, 199: 0, 200: 0, 201: 0, 202: 0, 203: 0, 204: 1, 205: 0, 206: 0, 207: 0, 208: 0, 209: 0, 210: 0, 211: 1, 212: 0, 213: 0, 214: 0, 215: 0, 216: 0, 217: 0, 218: 0, 219: 0, 220: 0, 221: 0, 222: 1, 223: 0, 224: 0, 225: 0, 226: 0, 227: 1, 228: 0, 229: 0, 230: 0, 231: 0, 232: 0, 233: 0, 234: 0, 235: 0, 236: 0, 237: 0, 238: 0, 239: 0, 240: 0, 241: 0, 242: 0, 243: 0, 244: 1, 245: 0, 246: 0, 247: 0, 248: 0, 249: 0}\n",
    "mathdifficulty = {}\n",
    "logicdifficulty = {}\n",
    "mathpopularity = {}\n",
    "logicpopularity = {}\n",
    "mathcategory = {}\n",
    "logiccategory = {}\n",
    "\n",
    "logicdata = pd.read_csv(\"data/braingle/braingle_Logic_with_categories.csv\")\n",
    "mathdata = pd.read_csv(\"data/braingle/braingle_Math_with_categories.csv\")\n",
    "for index, row in logicdata.iterrows():\n",
    "    logicdifficulty[index] = row[\"Difficulty\"]\n",
    "    logicpopularity[index] = row[\"Popularity/Fun\"]\n",
    "    logiccategory[index] = row['categories']\n",
    "for index, row in mathdata.iterrows():\n",
    "    mathdifficulty[index] = row[\"Difficulty\"]\n",
    "    mathpopularity[index] = row[\"Popularity/Fun\"]\n",
    "    mathcategory[index] = row['categories']\n",
    "\n",
    "\n",
    "for model in models:\n",
    "    file_path = f\"response_evaluation/Math/MathAll-{model}/resultsEvaluations_evaluatedbyo3-2025-04-16.jsonl\"\n",
    "    try:\n",
    "        with open(file_path, 'r') as file:\n",
    "            alldata[(model, \"Math\")] = [json.loads(line) for line in file]\n",
    "    except:\n",
    "        pass\n",
    "\n",
    "for model in models:\n",
    "    file_path = f\"response_evaluation/Logic/LogicAll-{model}/resultsEvaluations_evaluatedbyo3-2025-04-16.jsonl\"\n",
    "    try:\n",
    "        with open(file_path, 'r') as file:\n",
    "            alldata[(model, \"Logic\")] = [json.loads(line) for line in file]\n",
    "    except:\n",
    "        pass\n",
    "\n",
    "\n",
    "# for i in range(250):\n",
    "#     humanbruteforcemath[i] = round(humanbruteforcemath[i] / humanbruteforcetotalmath[i])\n",
    "#     humanbruteforcelogic[i] = round(humanbruteforcelogic[i] / humanbruteforcetotallogic[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "id": "d03c56aa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: Qwen1, Test: Math, Prompt: basicprompt Correctness: 0.172/0.14 CorrectnessAdj: 0.172/0.14\n",
      "Model: Qwen1, Test: Math, Prompt: mathPrompt Correctness: 0.164/0.1 CorrectnessAdj: 0.164/0.1\n",
      "Model: Qwen1, Test: Math, Prompt: hintPrompt Correctness: 0.152/0.08 CorrectnessAdj: 0.152/0.08\n",
      "Model: Qwen1, Test: Math, Prompt: combinedhintPrompt Correctness: 0.176/0.1 CorrectnessAdj: 0.176/0.1\n",
      "Model: Qwen1, Test: Logic, Prompt: basicprompt Correctness: 0.04/0.04 CorrectnessAdj: 0.04/0.04\n",
      "Model: Qwen1, Test: Logic, Prompt: mathPrompt Correctness: 0.04/0.06 CorrectnessAdj: 0.04/0.06\n",
      "Model: Qwen1, Test: Logic, Prompt: hintPrompt Correctness: 0.068/0.06 CorrectnessAdj: 0.068/0.06\n",
      "Model: Qwen1, Test: Logic, Prompt: combinedhintPrompt Correctness: 0.036/0.041 CorrectnessAdj: 0.036/0.04\n",
      "Model: Qwen14, Test: Math, Prompt: basicprompt Correctness: 0.412/0.22 CorrectnessAdj: 0.412/0.22\n",
      "Model: Qwen14, Test: Math, Prompt: mathPrompt Correctness: 0.44/0.3 CorrectnessAdj: 0.44/0.3\n",
      "Model: Qwen14, Test: Math, Prompt: hintPrompt Correctness: 0.44/0.2 CorrectnessAdj: 0.44/0.2\n",
      "Model: Qwen14, Test: Math, Prompt: combinedhintPrompt Correctness: 0.426/0.26 CorrectnessAdj: 0.424/0.26\n",
      "Model: Qwen14, Test: Logic, Prompt: basicprompt Correctness: 0.22/0.16 CorrectnessAdj: 0.22/0.16\n",
      "Model: Qwen14, Test: Logic, Prompt: mathPrompt Correctness: 0.236/0.16 CorrectnessAdj: 0.236/0.16\n",
      "Model: Qwen14, Test: Logic, Prompt: hintPrompt Correctness: 0.272/0.22 CorrectnessAdj: 0.272/0.22\n",
      "Model: Qwen14, Test: Logic, Prompt: combinedhintPrompt Correctness: 0.26/0.32 CorrectnessAdj: 0.26/0.32\n",
      "Model: Qwen70, Test: Math, Prompt: basicprompt Correctness: 0.424/0.2 CorrectnessAdj: 0.424/0.2\n",
      "Model: Qwen70, Test: Math, Prompt: mathPrompt Correctness: 0.408/0.22 CorrectnessAdj: 0.408/0.22\n",
      "Model: Qwen70, Test: Math, Prompt: hintPrompt Correctness: 0.456/0.24 CorrectnessAdj: 0.456/0.24\n",
      "Model: Qwen70, Test: Math, Prompt: combinedhintPrompt Correctness: 0.442/0.18 CorrectnessAdj: 0.44/0.18\n",
      "Model: Qwen70, Test: Logic, Prompt: basicprompt Correctness: 0.244/0.16 CorrectnessAdj: 0.244/0.16\n",
      "Model: Qwen70, Test: Logic, Prompt: mathPrompt Correctness: 0.244/0.14 CorrectnessAdj: 0.244/0.14\n",
      "Model: Qwen70, Test: Logic, Prompt: hintPrompt Correctness: 0.26/0.2 CorrectnessAdj: 0.26/0.2\n",
      "Model: Qwen70, Test: Logic, Prompt: combinedhintPrompt Correctness: 0.292/0.26 CorrectnessAdj: 0.292/0.26\n",
      "Model: DSChat, Test: Math, Prompt: basicprompt Correctness: 0.58/0.46 CorrectnessAdj: 0.58/0.46\n",
      "Model: DSChat, Test: Math, Prompt: mathPrompt Correctness: 0.556/0.38 CorrectnessAdj: 0.556/0.38\n",
      "Model: DSChat, Test: Math, Prompt: hintPrompt Correctness: 0.56/0.36 CorrectnessAdj: 0.56/0.36\n",
      "Model: DSChat, Test: Math, Prompt: combinedhintPrompt Correctness: 0.588/0.36 CorrectnessAdj: 0.588/0.36\n",
      "Model: DSChat, Test: Logic, Prompt: basicprompt Correctness: 0.378/0.306 CorrectnessAdj: 0.376/0.3\n",
      "Model: DSChat, Test: Logic, Prompt: mathPrompt Correctness: 0.408/0.28 CorrectnessAdj: 0.408/0.28\n",
      "Model: DSChat, Test: Logic, Prompt: hintPrompt Correctness: 0.416/0.22 CorrectnessAdj: 0.416/0.22\n",
      "Model: DSChat, Test: Logic, Prompt: combinedhintPrompt Correctness: 0.414/0.245 CorrectnessAdj: 0.412/0.24\n",
      "Model: DSReason, Test: Math, Prompt: basicprompt Correctness: 0.668/0.48 CorrectnessAdj: 0.668/0.48\n",
      "Model: DSReason, Test: Math, Prompt: mathPrompt Correctness: 0.702/0.54 CorrectnessAdj: 0.696/0.54\n",
      "Model: DSReason, Test: Math, Prompt: hintPrompt Correctness: 0.724/0.48 CorrectnessAdj: 0.724/0.48\n",
      "Model: DSReason, Test: Math, Prompt: combinedhintPrompt Correctness: 0.728/0.56 CorrectnessAdj: 0.728/0.56\n",
      "Model: DSReason, Test: Logic, Prompt: basicprompt Correctness: 0.446/0.26 CorrectnessAdj: 0.444/0.26\n",
      "Model: DSReason, Test: Logic, Prompt: mathPrompt Correctness: 0.454/0.32 CorrectnessAdj: 0.452/0.32\n",
      "Model: DSReason, Test: Logic, Prompt: hintPrompt Correctness: 0.494/0.327 CorrectnessAdj: 0.492/0.32\n",
      "Model: DSReason, Test: Logic, Prompt: combinedhintPrompt Correctness: 0.506/0.4 CorrectnessAdj: 0.504/0.4\n",
      "Model: o3Batch, Test: Math, Prompt: basicprompt Correctness: 0.843/0.75 CorrectnessAdj: 0.796/0.66\n",
      "Model: o3Batch, Test: Math, Prompt: mathPrompt Correctness: 0.877/0.821 CorrectnessAdj: 0.796/0.64\n",
      "Model: o3Batch, Test: Math, Prompt: combinedhintPrompt Correctness: 0.868/0.773 CorrectnessAdj: 0.812/0.68\n",
      "Model: o3Batch, Test: Math, Prompt: hint_prompt Correctness: 0.888/0.805 CorrectnessAdj: 0.828/0.66\n",
      "Model: o3Batch, Test: Logic, Prompt: basicprompt Correctness: 0.851/0.862 CorrectnessAdj: 0.684/0.5\n",
      "Model: o3Batch, Test: Logic, Prompt: mathPrompt Correctness: 0.852/0.871 CorrectnessAdj: 0.712/0.54\n",
      "Model: o3Batch, Test: Logic, Prompt: combinedhintPrompt Correctness: 0.869/0.75 CorrectnessAdj: 0.744/0.54\n",
      "Model: o3Batch, Test: Logic, Prompt: hint_prompt Correctness: 0.858/0.812 CorrectnessAdj: 0.7/0.52\n",
      "Model: GeminiFlash, Test: Math, Prompt: basicprompt Correctness: 0.696/0.395 CorrectnessAdj: 0.66/0.34\n",
      "Model: GeminiFlash, Test: Math, Prompt: mathPrompt Correctness: 0.709/0.475 CorrectnessAdj: 0.652/0.38\n",
      "Model: GeminiFlash, Test: Math, Prompt: hint_prompt Correctness: 0.739/0.537 CorrectnessAdj: 0.692/0.44\n",
      "Model: GeminiFlash, Test: Math, Prompt: combinedhintPrompt Correctness: 0.789/0.69 CorrectnessAdj: 0.72/0.58\n",
      "Model: GeminiFlash, Test: Logic, Prompt: basicprompt Correctness: 0.567/0.474 CorrectnessAdj: 0.492/0.36\n",
      "Model: GeminiFlash, Test: Logic, Prompt: mathPrompt Correctness: 0.587/0.486 CorrectnessAdj: 0.512/0.34\n",
      "Model: GeminiFlash, Test: Logic, Prompt: hint_prompt Correctness: 0.605/0.525 CorrectnessAdj: 0.54/0.42\n",
      "Model: GeminiFlash, Test: Logic, Prompt: combinedhintPrompt Correctness: 0.647/0.594 CorrectnessAdj: 0.536/0.38\n"
     ]
    }
   ],
   "source": [
    "for model in models:\n",
    "    for test in tests:\n",
    "        try:\n",
    "            data = pd.DataFrame(alldata[(model, test)])\n",
    "        except:\n",
    "            continue\n",
    "        prompts = data[\"PromptType\"].unique()\n",
    "\n",
    "        for prompt in prompts:\n",
    "            if (1 == 0):\n",
    "                continue\n",
    "            tempdata = data[data[\"PromptType\"]==prompt]\n",
    "            bfarray = np.zeros((2, 2))\n",
    "            bfdiff = []\n",
    "            nbfdiff = []\n",
    "            bfpop = []\n",
    "            nbfpop = []\n",
    "            correctness = []\n",
    "            correctnessDiff = []\n",
    "            category = {}\n",
    "            count = 0\n",
    "\n",
    "            for index, row in tempdata.iterrows():\n",
    "                # if (type(row[\"Response\"]) != str):\n",
    "                #     print(row[\"Response\"])\n",
    "                if (type(row[\"Response\"]) != str or row[\"Response\"] == None or row[\"Response\"] == \"NaN\" or row[\"Response\"] == \"None\" or row[\"Response\"] == \"\" or row[\"model_bruteforce\"] == \"NULL\" or row[\"Response\"] is str and row[\"Response\"].isspace()):\n",
    "                    # print(\"Filtered!\")\n",
    "                    continue\n",
    "                try:\n",
    "                    if (row[\"model_bruteforce\"] == \"1\"):\n",
    "                        if \"Math\" in test:\n",
    "                            bfdiff.append(mathdifficulty[row[\"ID\"]])\n",
    "                            bfpop.append(mathpopularity[row[\"ID\"]])\n",
    "                        else:\n",
    "                            bfdiff.append(logicdifficulty[row[\"ID\"]])\n",
    "                            bfpop.append(logicpopularity[row[\"ID\"]])\n",
    "                    elif (row[\"model_bruteforce\"] == \"0\"):\n",
    "                        if \"Math\" in test:\n",
    "                            nbfdiff.append(mathdifficulty[row[\"ID\"]])\n",
    "                            nbfpop.append(mathpopularity[row[\"ID\"]])\n",
    "                        else:\n",
    "                            nbfdiff.append(logicdifficulty[row[\"ID\"]])\n",
    "                            nbfpop.append(logicpopularity[row[\"ID\"]])\n",
    "                    \n",
    "                    if \"Math\" in test:\n",
    "                        bfarray[1-int(row[\"model_bruteforce\"])][1-humanbruteforcemath[row['ID']]] += 1\n",
    "                    if \"Logic\" in test:\n",
    "                        bfarray[1-int(row[\"model_bruteforce\"])][1-humanbruteforcelogic[row['ID']]] += 1\n",
    "                    correctness.append(int(row[\"correctness\"]))\n",
    "\n",
    "                    if \"Logic\" in test:\n",
    "                        # print(logiccategory[row[\"ID\"]])\n",
    "                        if logiccategory[row[\"ID\"]] not in category.keys():\n",
    "                            category[logiccategory[row[\"ID\"]]] = [0, 0]\n",
    "                        \n",
    "                        # print(row[\"model_bruteforce\"])\n",
    "\n",
    "                        category[logiccategory[row[\"ID\"]]][1-int(row[\"model_bruteforce\"])] += 1\n",
    "\n",
    "                    if (row['ID'] < 50):\n",
    "                        correctnessDiff.append(int(row[\"correctness\"]))\n",
    "                    # count += 1\n",
    "\n",
    "                    \n",
    "                except Exception as e:\n",
    "                    # print(\"Error:\", e)\n",
    "                    # print(row[\"model_bruteforce\"])\n",
    "                    pass\n",
    "            # print(category)\n",
    "            \n",
    "            print(f\"Model: {model}, Test: {test}, Prompt: {prompt}\", \n",
    "                #   print(category),\n",
    "                #   np.round(100*bfarray.flatten()/np.sum(bfarray), 1), np.sum(bfarray), \n",
    "                  \"Correctness:\", str(np.round(np.mean(correctness), 3)) + \"/\" + str(np.round(np.mean(correctnessDiff), 3)), \n",
    "                  \"CorrectnessAdj:\", str(np.round(np.sum(correctness)/250, 3)) + \"/\" + str(np.round(np.sum(correctnessDiff)/50, 3)), \n",
    "                #   \"Difficulty (BF/NBF):\", np.round(np.mean(bfdiff), 2), np.round(np.mean(nbfdiff), 2), \n",
    "                #   \"Popularity (BF/NBF):\", np.round(np.mean(bfpop), 2), np.round(np.mean(nbfpop), 2),\n",
    "                  \n",
    "            )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "id": "5ed10e30",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Human Solution Math:\n",
      "Difficulty (BF/NBF): 2.83 2.8\n",
      "Popularity (BF/NBF): 2.32 2.33\n",
      "Human Solution Logic:\n",
      "Difficulty (BF/NBF): 2.7 2.65\n",
      "Popularity (BF/NBF): 2.37 2.51\n"
     ]
    }
   ],
   "source": [
    "print(\"Human Solution Math:\")\n",
    "totaldiffbf = []\n",
    "totaldiffnbf = []\n",
    "totalpopbf = []\n",
    "totalpopnbf = []\n",
    "for i in range(250):\n",
    "    if (humanbruteforcemath[i] == 1):\n",
    "        totaldiffbf.append(mathdifficulty[i])\n",
    "        totalpopbf.append(mathpopularity[i])\n",
    "    else:\n",
    "        totaldiffnbf.append(mathdifficulty[i])\n",
    "        totalpopnbf.append(mathpopularity[i])\n",
    "print(\"Difficulty (BF/NBF):\", np.round(np.mean(totaldiffbf), 2), np.round(np.mean(totaldiffnbf), 2))\n",
    "print(\"Popularity (BF/NBF):\", np.round(np.mean(totalpopbf), 2), np.round(np.mean(totalpopnbf), 2))\n",
    "print(\"Human Solution Logic:\")\n",
    "totaldiffbf = []\n",
    "totaldiffnbf = []\n",
    "totalpopbf = []\n",
    "totalpopnbf = []\n",
    "for i in range(250):\n",
    "    if (humanbruteforcelogic[i] == 1):\n",
    "        totaldiffbf.append(logicdifficulty[i])\n",
    "        totalpopbf.append(logicpopularity[i])\n",
    "    else:\n",
    "        totaldiffnbf.append(logicdifficulty[i])\n",
    "        totalpopnbf.append(logicpopularity[i])\n",
    "print(\"Difficulty (BF/NBF):\", np.round(np.mean(totaldiffbf), 2), np.round(np.mean(totaldiffnbf), 2))\n",
    "print(\"Popularity (BF/NBF):\", np.round(np.mean(totalpopbf), 2), np.round(np.mean(totalpopnbf), 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "id": "c912abea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Math Category Analysis:\n",
      "Model: Qwen1, Prompt: basicprompt 19.3&75.0&24.0&48.6&8.3&50.0&30.0&55.2&\n",
      "Model: Qwen1, Prompt: mathPrompt 19.3&80.0&20.0&45.7&8.3&40.0&30.0&62.1&\n",
      "Model: Qwen1, Prompt: hintPrompt 19.3&70.0&32.0&42.9&12.5&50.0&36.7&65.5&\n",
      "Model: Qwen1, Prompt: combinedhintPrompt 28.1&75.0&24.0&40.0&8.3&50.0&16.7&55.2&\n",
      "Model: Qwen14, Prompt: basicprompt 19.3&75.0&24.0&42.9&12.5&60.0&26.7&62.1&\n",
      "Model: Qwen14, Prompt: mathPrompt 19.3&80.0&20.0&37.1&4.2&50.0&33.3&48.3&\n",
      "Model: Qwen14, Prompt: hintPrompt 22.8&70.0&24.0&42.9&12.5&50.0&36.7&62.1&\n",
      "Model: Qwen14, Prompt: combinedhintPrompt 17.5&85.0&24.0&40.0&4.2&53.3&46.7&55.2&\n",
      "Model: Qwen70, Prompt: basicprompt 14.0&80.0&16.0&34.3&4.2&40.0&43.3&62.1&\n",
      "Model: Qwen70, Prompt: mathPrompt 12.3&85.0&16.0&37.1&0.0&33.3&36.7&69.0&\n",
      "Model: Qwen70, Prompt: hintPrompt 8.8&70.0&20.0&34.3&8.3&46.7&16.7&48.3&\n",
      "Model: Qwen70, Prompt: combinedhintPrompt 19.3&70.0&24.0&31.4&8.3&43.3&26.7&55.2&\n",
      "Model: DSChat, Prompt: basicprompt 22.8&85.0&24.0&37.1&20.8&53.3&43.3&65.5&\n",
      "Model: DSChat, Prompt: mathPrompt 21.1&85.0&28.0&45.7&8.3&46.7&30.0&65.5&\n",
      "Model: DSChat, Prompt: hintPrompt 26.3&80.0&20.0&37.1&8.3&56.7&26.7&65.5&\n",
      "Model: DSChat, Prompt: combinedhintPrompt 15.8&80.0&24.0&37.1&12.5&53.3&30.0&58.6&\n",
      "Model: DSReason, Prompt: basicprompt 8.8&35.0&8.0&25.7&4.2&26.7&13.3&48.3&\n",
      "Model: DSReason, Prompt: mathPrompt 10.5&30.0&12.0&22.9&8.3&26.7&3.4&34.5&\n",
      "Model: DSReason, Prompt: hintPrompt 10.5&35.0&20.0&25.7&8.3&33.3&10.0&37.9&\n",
      "Model: DSReason, Prompt: combinedhintPrompt 12.3&5.0&4.0&22.9&4.2&23.3&0.0&37.9&\n",
      "Model: o3Batch, Prompt: basicprompt 3.5&26.3&4.3&11.8&4.3&17.9&4.2&35.7&\n",
      "Model: o3Batch, Prompt: mathPrompt 5.4&5.3&5.0&6.1&0.0&15.4&0.0&22.2&\n",
      "Model: o3Batch, Prompt: combinedhintPrompt 3.6&5.3&4.8&15.2&4.8&11.1&0.0&27.6&\n",
      "Model: o3Batch, Prompt: hint_prompt 3.6&21.1&13.0&18.2&4.8&22.2&10.7&37.0&\n",
      "Model: GeminiFlash, Prompt: basicprompt 7.1&62.5&16.7&34.3&4.5&29.6&20.7&42.9&\n",
      "Model: GeminiFlash, Prompt: mathPrompt 7.3&28.6&17.4&23.5&0.0&46.2&6.7&46.2&\n",
      "Model: GeminiFlash, Prompt: hint_prompt 7.3&38.9&17.4&26.5&4.8&34.6&6.7&44.4&\n",
      "Model: GeminiFlash, Prompt: combinedhintPrompt 5.4&18.8&13.6&28.6&4.5&27.3&3.6&46.4&\n",
      "Model: Qwen1, Prompt: basicprompt 10.3&15.4&27.3&41.2&25.0&41.2&100.0&33.3&7.9&28.1&15.4&40.0&0.0&\n",
      "Model: Qwen1, Prompt: mathPrompt 10.3&23.1&13.6&52.9&0.0&35.3&75.0&44.4&13.2&31.2&19.2&60.0&16.7&\n",
      "Model: Qwen1, Prompt: hintPrompt 10.3&7.7&27.3&29.4&0.0&47.1&75.0&22.2&2.6&28.1&23.1&40.0&0.0&\n",
      "Model: Qwen1, Prompt: combinedhintPrompt 20.7&30.8&18.2&58.8&12.5&52.9&75.0&33.3&13.2&21.9&23.1&53.3&0.0&\n",
      "Model: Qwen14, Prompt: basicprompt 27.6&38.5&68.2&52.9&12.5&52.9&75.0&44.4&18.4&31.2&26.9&80.0&16.7&\n",
      "Model: Qwen14, Prompt: mathPrompt 31.0&23.1&54.5&70.6&37.5&64.7&50.0&55.6&15.8&28.1&26.9&86.7&16.7&\n",
      "Model: Qwen14, Prompt: hintPrompt 24.1&46.2&54.5&47.1&12.5&70.6&75.0&66.7&13.2&37.5&23.1&86.7&16.7&\n",
      "Model: Qwen14, Prompt: combinedhintPrompt 31.0&30.8&50.0&52.9&25.0&41.2&75.0&77.8&21.1&37.5&23.1&86.7&0.0&\n",
      "Model: Qwen70, Prompt: basicprompt 24.1&7.7&31.8&41.2&0.0&41.2&75.0&66.7&10.5&46.9&23.1&80.0&0.0&\n",
      "Model: Qwen70, Prompt: mathPrompt 6.9&23.1&27.3&41.2&12.5&64.7&75.0&44.4&10.5&28.1&38.5&80.0&33.3&\n",
      "Model: Qwen70, Prompt: hintPrompt 20.7&23.1&45.5&41.2&12.5&41.2&75.0&55.6&10.5&40.6&26.9&80.0&16.7&\n",
      "Model: Qwen70, Prompt: combinedhintPrompt 10.3&7.7&18.2&35.3&25.0&52.9&0.0&22.2&15.8&34.4&26.9&73.3&16.7&\n",
      "Model: DSChat, Prompt: basicprompt 20.7&23.1&63.6&58.8&87.5&41.2&75.0&66.7&21.1&37.5&23.1&86.7&33.3&\n",
      "Model: DSChat, Prompt: mathPrompt 41.4&38.5&59.1&64.7&50.0&29.4&75.0&44.4&23.7&34.4&26.9&73.3&66.7&\n",
      "Model: DSChat, Prompt: hintPrompt 31.0&7.7&45.5&52.9&75.0&52.9&75.0&44.4&13.2&50.0&19.2&60.0&16.7&\n",
      "Model: DSChat, Prompt: combinedhintPrompt 37.9&7.7&40.9&41.2&37.5&41.2&50.0&55.6&21.1&34.4&19.2&60.0&33.3&\n",
      "Model: DSReason, Prompt: basicprompt 6.9&7.7&31.8&18.8&0.0&41.2&25.0&22.2&2.6&21.9&7.7&53.3&0.0&\n",
      "Model: DSReason, Prompt: mathPrompt 10.3&0.0&27.3&41.2&12.5&17.6&0.0&0.0&2.6&15.6&0.0&73.3&0.0&\n",
      "Model: DSReason, Prompt: hintPrompt 6.9&7.7&27.3&35.3&25.0&23.5&25.0&22.2&2.7&12.5&3.8&13.3&0.0&\n",
      "Model: DSReason, Prompt: combinedhintPrompt 3.6&7.7&27.3&29.4&12.5&17.6&0.0&22.2&7.9&12.5&3.8&26.7&16.7&\n",
      "Model: o3Batch, Prompt: basicprompt 13.0&0.0&16.7&18.8&33.3&50.0&33.3&12.5&3.1&16.0&13.0&58.3&0.0&\n",
      "Model: o3Batch, Prompt: mathPrompt 4.8&0.0&15.4&17.6&71.4&31.2&0.0&0.0&3.0&11.5&8.7&23.1&25.0&\n",
      "Model: o3Batch, Prompt: combinedhintPrompt 0.0&0.0&23.1&11.8&12.5&5.9&0.0&25.0&0.0&3.6&3.8&7.7&0.0&\n",
      "Model: o3Batch, Prompt: hint_prompt 0.0&0.0&8.3&29.4&0.0&29.4&0.0&25.0&3.1&3.8&4.3&16.7&0.0&\n",
      "Model: GeminiFlash, Prompt: basicprompt 4.0&8.3&27.3&43.8&33.3&29.4&66.7&25.0&3.1&32.3&7.7&71.4&0.0&\n",
      "Model: GeminiFlash, Prompt: mathPrompt 4.5&8.3&41.7&31.2&25.0&17.6&33.3&37.5&11.8&15.6&0.0&78.6&0.0&\n",
      "Model: GeminiFlash, Prompt: hint_prompt 12.5&25.0&0.0&41.2&0.0&23.5&50.0&42.9&2.9&28.1&7.7&60.0&0.0&\n",
      "Model: GeminiFlash, Prompt: combinedhintPrompt 5.3&8.3&18.2&25.0&0.0&31.2&66.7&42.9&8.8&23.3&3.8&53.8&0.0&\n"
     ]
    }
   ],
   "source": [
    "print('Math Category Analysis:')\n",
    "for test in tests:\n",
    "    for model in models:\n",
    "        try:\n",
    "            data = pd.DataFrame(alldata[(model, test)])\n",
    "        except:\n",
    "            continue\n",
    "        prompts = data[\"PromptType\"].unique()\n",
    "\n",
    "        for prompt in prompts:\n",
    "            tempdata = data[data[\"PromptType\"]==prompt]\n",
    "            if \"Math\" in test:\n",
    "                categorybf = {'Algebra':[0,0],\n",
    "                              'Arithmetic':[0,0],\n",
    "                              'Combinatorics':[0,0],\n",
    "                              'Number Theory':[0,0],\n",
    "                              'Geometry':[0,0],\n",
    "                              'Logic':[0,0],\n",
    "                              'Pattern':[0,0],\n",
    "                              'Special Number':[0,0]}\n",
    "                for index, row in tempdata.iterrows():\n",
    "                    if (type(row[\"Response\"]) != str or row[\"Response\"] == None or row[\"Response\"] == \"NaN\" or row[\"Response\"] == \"None\" or row[\"Response\"] == \"\" or row[\"model_bruteforce\"] == \"NULL\" or row[\"Response\"] is str and row[\"Response\"].isspace()):\n",
    "                        continue\n",
    "                    try:\n",
    "                        if (mathcategory[row['ID']] not in categorybf.keys()):\n",
    "                            categorybf[mathcategory[row['ID']]] = [0, 0]\n",
    "                        # print(\"!\")\n",
    "                        categorybf[mathcategory[row['ID']]][int(row['model_bruteforce'])] += 1\n",
    "                        # print(\"@\")\n",
    "                    except Exception as e:\n",
    "                        pass\n",
    "                # categorybftemp = categorybf\n",
    "                # categorybf = {}\n",
    "                # for i in range(10):\n",
    "                #     if categorybftemp[i][0] != 0 or categorybftemp[i][1] != 0:\n",
    "                #         categorybf[str(i)] = round((categorybftemp[i][1])/(categorybftemp[i][0]+categorybftemp[i][1]), 2)\n",
    "            elif \"Logic\" in test:\n",
    "                categorybf = {'0D':[0,0]\n",
    "                            ,'1D':[0,0]\n",
    "                            ,'2D':[0,0]\n",
    "                            ,'Number':[0,0]\n",
    "                            ,'Clusters':[0,0]\n",
    "                            ,'Liars':[0,0]\n",
    "                            ,'Communication':[0,0]\n",
    "                            ,'Compound':[0,0]\n",
    "                            ,'Algorithm':[0,0]\n",
    "                            ,'Math':[0,0]\n",
    "                            ,'Pattern':[0,0]\n",
    "                            ,'Linguistic':[0,0]\n",
    "                            ,'Tree':[0,0]}\n",
    "                for index, row in tempdata.iterrows():\n",
    "                    if (type(row[\"Response\"]) != str or row[\"Response\"] == None or row[\"Response\"] == \"NaN\" or row[\"Response\"] == \"None\" or row[\"Response\"] == \"\" or row[\"model_bruteforce\"] == \"NULL\" or row[\"Response\"] is str and row[\"Response\"].isspace()):\n",
    "                        continue\n",
    "                    try:\n",
    "                        if (row[\"model_bruteforce\"] == \"1\"):\n",
    "                            categorybf[logiccategory[row['ID']]][1] += 1\n",
    "                        elif (row[\"model_bruteforce\"] == \"0\"):\n",
    "                            categorybf[logiccategory[row['ID']]][0] += 1\n",
    "                    except Exception as e:\n",
    "                        pass\n",
    "\n",
    "            stroutput = \"\"\n",
    "            \n",
    "            for v in categorybf.values():\n",
    "                stroutput += str(round(100*v[1]/(v[0]+v[1]), 1)) + \"&\"\n",
    "            print(f\"Model: {model}, Prompt: {prompt}\", stroutput,\n",
    "                #   categorybf.flatten(), np.sum(categorybf), \n",
    "                #   \"Correctness:\", str(np.round(np.mean(correctness), 3)) + \"/\" + str(np.round(np.mean(correctnessDiff), 3)), \n",
    "                #   \"Difficulty (BF/NBF):\", np.round(np.mean(bfdiff), 2), np.round(np.mean(nbfdiff), 2), \n",
    "                #   \"Popularity (BF/NBF):\", np.round(np.mean(bfpop), 2), np.round(np.mean(nbfpop), 2),\n",
    "                \n",
    "            )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4f97e28",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Brainteasers",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
