{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-03-29T22:58:07.490432Z",
     "start_time": "2025-03-29T22:58:07.487104Z"
    }
   },
   "source": [
    "import pandas as pd\n",
    "import ast\n",
    "import numpy as np\n",
    "import os"
   ],
   "outputs": [],
   "execution_count": 15
  },
  {
   "cell_type": "code",
   "id": "3c671b442c3cc15e",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-03-29T22:58:07.504652Z",
     "start_time": "2025-03-29T22:58:07.502714Z"
    }
   },
   "source": [
    "eval_results_path = \"/Users/ANON.ANON/PycharmProjects/benchname-baselines/bug_localization/data/eval\""
   ],
   "outputs": [],
   "execution_count": 16
  },
  {
   "cell_type": "code",
   "id": "5cd0c62630b0223e",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-03-30T18:43:33.195418Z",
     "start_time": "2025-03-30T18:43:33.189898Z"
    }
   },
   "source": "run_id = \"deepseek-v3_filepath\"",
   "outputs": [],
   "execution_count": 25
  },
  {
   "cell_type": "code",
   "id": "b73d398c-d6c2-44b1-9718-e63ea5f23e1d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-03-30T18:43:34.475921Z",
     "start_time": "2025-03-30T18:43:34.458841Z"
    }
   },
   "source": [
    "df = pd.read_json(os.path.join(eval_results_path, run_id, 'results.jsonl'), lines=True)\n",
    "df"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "                                   text_id  All Files Count  \\\n",
       "0             thealgorithms/python/295/289              195   \n",
       "1             keras-team/keras/15943/15942              553   \n",
       "2          serverless/serverless/5500/5499               10   \n",
       "3                textualize/rich/2212/2197              122   \n",
       "4            mitmproxy/mitmproxy/4298/3994              307   \n",
       "..                                     ...              ...   \n",
       "145             jwstegemann/fritz2/430/425              102   \n",
       "146             jwstegemann/fritz2/679/671               94   \n",
       "147  theopenconversationkit/tock/1040/1039             1484   \n",
       "148            google/android-fhir/301/296              122   \n",
       "149                 splendo/kaluga/469/468             1157   \n",
       "\n",
       "     Expected Bug Files Count  Actual Bug Files Count  % of Bug Files  \\\n",
       "0                           1                       1        0.005128   \n",
       "1                           1                       2        0.003617   \n",
       "2                           1                       1        0.100000   \n",
       "3                           1                       1        0.008197   \n",
       "4                           2                       2        0.006515   \n",
       "..                        ...                     ...             ...   \n",
       "145                         1                       4        0.039216   \n",
       "146                         3                       2        0.021277   \n",
       "147                         2                       4        0.002695   \n",
       "148                         5                      16        0.131148   \n",
       "149                         3                       5        0.004322   \n",
       "\n",
       "     Precision    Recall  F1 Score       FPR  All Correct  \\\n",
       "0       1.0000  1.000000  1.000000  0.000000            1   \n",
       "1       0.5000  1.000000  0.666667  0.001812            0   \n",
       "2       1.0000  1.000000  1.000000  0.000000            1   \n",
       "3       1.0000  1.000000  1.000000  0.000000            1   \n",
       "4       0.5000  0.500000  0.500000  0.003279            0   \n",
       "..         ...       ...       ...       ...          ...   \n",
       "145     0.2500  1.000000  0.400000  0.029703            0   \n",
       "146     1.0000  0.666667  0.800000  0.000000            0   \n",
       "147     0.2500  0.500000  0.333333  0.002024            0   \n",
       "148     0.1875  0.600000  0.285714  0.111111            0   \n",
       "149     0.4000  0.666667  0.500000  0.002600            0   \n",
       "\n",
       "     At Least One Correct  All Incorrect  system_prompt_tokens  \\\n",
       "0                       1              0                   116   \n",
       "1                       1              0                   116   \n",
       "2                       1              0                   116   \n",
       "3                       1              0                   116   \n",
       "4                       1              0                   116   \n",
       "..                    ...            ...                   ...   \n",
       "145                     1              0                   116   \n",
       "146                     1              0                   116   \n",
       "147                     1              0                   116   \n",
       "148                     1              0                   116   \n",
       "149                     1              0                   116   \n",
       "\n",
       "     user_prompt_tokens  total_tokens  \n",
       "0                  1921          2039  \n",
       "1                  6370          6488  \n",
       "2                   265           383  \n",
       "3                  1659          1777  \n",
       "4                  3191          3309  \n",
       "..                  ...           ...  \n",
       "145                2212          2330  \n",
       "146                1905          2023  \n",
       "147               15883         16001  \n",
       "148                2711          2829  \n",
       "149               15883         16001  \n",
       "\n",
       "[150 rows x 15 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text_id</th>\n",
       "      <th>All Files Count</th>\n",
       "      <th>Expected Bug Files Count</th>\n",
       "      <th>Actual Bug Files Count</th>\n",
       "      <th>% of Bug Files</th>\n",
       "      <th>Precision</th>\n",
       "      <th>Recall</th>\n",
       "      <th>F1 Score</th>\n",
       "      <th>FPR</th>\n",
       "      <th>All Correct</th>\n",
       "      <th>At Least One Correct</th>\n",
       "      <th>All Incorrect</th>\n",
       "      <th>system_prompt_tokens</th>\n",
       "      <th>user_prompt_tokens</th>\n",
       "      <th>total_tokens</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>thealgorithms/python/295/289</td>\n",
       "      <td>195</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.005128</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>116</td>\n",
       "      <td>1921</td>\n",
       "      <td>2039</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>keras-team/keras/15943/15942</td>\n",
       "      <td>553</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0.003617</td>\n",
       "      <td>0.5000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.001812</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>116</td>\n",
       "      <td>6370</td>\n",
       "      <td>6488</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>serverless/serverless/5500/5499</td>\n",
       "      <td>10</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>116</td>\n",
       "      <td>265</td>\n",
       "      <td>383</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>textualize/rich/2212/2197</td>\n",
       "      <td>122</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.008197</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>116</td>\n",
       "      <td>1659</td>\n",
       "      <td>1777</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>mitmproxy/mitmproxy/4298/3994</td>\n",
       "      <td>307</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>0.006515</td>\n",
       "      <td>0.5000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.003279</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>116</td>\n",
       "      <td>3191</td>\n",
       "      <td>3309</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>145</th>\n",
       "      <td>jwstegemann/fritz2/430/425</td>\n",
       "      <td>102</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>0.039216</td>\n",
       "      <td>0.2500</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.029703</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>116</td>\n",
       "      <td>2212</td>\n",
       "      <td>2330</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>146</th>\n",
       "      <td>jwstegemann/fritz2/679/671</td>\n",
       "      <td>94</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0.021277</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>116</td>\n",
       "      <td>1905</td>\n",
       "      <td>2023</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147</th>\n",
       "      <td>theopenconversationkit/tock/1040/1039</td>\n",
       "      <td>1484</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>0.002695</td>\n",
       "      <td>0.2500</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.002024</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>116</td>\n",
       "      <td>15883</td>\n",
       "      <td>16001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148</th>\n",
       "      <td>google/android-fhir/301/296</td>\n",
       "      <td>122</td>\n",
       "      <td>5</td>\n",
       "      <td>16</td>\n",
       "      <td>0.131148</td>\n",
       "      <td>0.1875</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>116</td>\n",
       "      <td>2711</td>\n",
       "      <td>2829</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149</th>\n",
       "      <td>splendo/kaluga/469/468</td>\n",
       "      <td>1157</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>0.004322</td>\n",
       "      <td>0.4000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.002600</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>116</td>\n",
       "      <td>15883</td>\n",
       "      <td>16001</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>150 rows × 15 columns</p>\n",
       "</div>"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 26
  },
  {
   "cell_type": "code",
   "id": "f1cc7c76-7db6-496e-bd91-2af86c336e24",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-03-30T18:43:35.378178Z",
     "start_time": "2025-03-30T18:43:35.364875Z"
    }
   },
   "source": [
    "df[['Precision', 'Recall', 'F1 Score', 'FPR']].describe().round(3)"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "       Precision   Recall  F1 Score      FPR\n",
       "count    150.000  150.000   150.000  150.000\n",
       "mean       0.489    0.697     0.523    0.025\n",
       "std        0.330    0.359     0.305    0.088\n",
       "min        0.000    0.000     0.000    0.000\n",
       "25%        0.250    0.350     0.286    0.001\n",
       "50%        0.333    1.000     0.500    0.006\n",
       "75%        0.750    1.000     0.667    0.021\n",
       "max        1.000    1.000     1.000    1.000"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Precision</th>\n",
       "      <th>Recall</th>\n",
       "      <th>F1 Score</th>\n",
       "      <th>FPR</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>150.000</td>\n",
       "      <td>150.000</td>\n",
       "      <td>150.000</td>\n",
       "      <td>150.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.489</td>\n",
       "      <td>0.697</td>\n",
       "      <td>0.523</td>\n",
       "      <td>0.025</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.330</td>\n",
       "      <td>0.359</td>\n",
       "      <td>0.305</td>\n",
       "      <td>0.088</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.250</td>\n",
       "      <td>0.350</td>\n",
       "      <td>0.286</td>\n",
       "      <td>0.001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.333</td>\n",
       "      <td>1.000</td>\n",
       "      <td>0.500</td>\n",
       "      <td>0.006</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.750</td>\n",
       "      <td>1.000</td>\n",
       "      <td>0.667</td>\n",
       "      <td>0.021</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1.000</td>\n",
       "      <td>1.000</td>\n",
       "      <td>1.000</td>\n",
       "      <td>1.000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 27
  },
  {
   "cell_type": "code",
   "id": "433a3cbe-b93d-4e5a-91ad-3e8613531d16",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-03-30T18:43:36.382507Z",
     "start_time": "2025-03-30T18:43:36.371343Z"
    }
   },
   "source": [
    "df[['All Correct', 'At Least One Correct', 'All Incorrect', 'Actual Bug Files Count']].describe().round(2)"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "       All Correct  At Least One Correct  All Incorrect  \\\n",
       "count       150.00                150.00         150.00   \n",
       "mean          0.19                  0.92           0.08   \n",
       "std           0.40                  0.27           0.27   \n",
       "min           0.00                  0.00           0.00   \n",
       "25%           0.00                  1.00           0.00   \n",
       "50%           0.00                  1.00           0.00   \n",
       "75%           0.00                  1.00           0.00   \n",
       "max           1.00                  1.00           1.00   \n",
       "\n",
       "       Actual Bug Files Count  \n",
       "count                  150.00  \n",
       "mean                     3.61  \n",
       "std                      2.72  \n",
       "min                      1.00  \n",
       "25%                      2.00  \n",
       "50%                      3.00  \n",
       "75%                      5.00  \n",
       "max                     24.00  "
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>All Correct</th>\n",
       "      <th>At Least One Correct</th>\n",
       "      <th>All Incorrect</th>\n",
       "      <th>Actual Bug Files Count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>150.00</td>\n",
       "      <td>150.00</td>\n",
       "      <td>150.00</td>\n",
       "      <td>150.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.19</td>\n",
       "      <td>0.92</td>\n",
       "      <td>0.08</td>\n",
       "      <td>3.61</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.40</td>\n",
       "      <td>0.27</td>\n",
       "      <td>0.27</td>\n",
       "      <td>2.72</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>2.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>3.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>5.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>24.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 28
  },
  {
   "cell_type": "code",
   "id": "b7e9d5260a57ff3f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-03-29T22:58:07.731896Z",
     "start_time": "2025-03-29T22:58:07.706703Z"
    }
   },
   "source": [
    "df[['All Files Count', 'Expected Bug Files Count', 'Actual Bug Files Count', '% of Bug Files']].describe().round(3)"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "       All Files Count  Expected Bug Files Count  Actual Bug Files Count  \\\n",
       "count          150.000                   150.000                 150.000   \n",
       "mean           540.180                     2.500                   3.040   \n",
       "std            994.143                     2.385                   2.296   \n",
       "min              1.000                     1.000                   1.000   \n",
       "25%             78.250                     1.000                   2.000   \n",
       "50%            209.500                     1.500                   3.000   \n",
       "75%            517.750                     3.000                   4.000   \n",
       "max           6473.000                    14.000                  24.000   \n",
       "\n",
       "       % of Bug Files  \n",
       "count         150.000  \n",
       "mean            0.047  \n",
       "std             0.145  \n",
       "min             0.001  \n",
       "25%             0.004  \n",
       "50%             0.012  \n",
       "75%             0.031  \n",
       "max             1.000  "
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>All Files Count</th>\n",
       "      <th>Expected Bug Files Count</th>\n",
       "      <th>Actual Bug Files Count</th>\n",
       "      <th>% of Bug Files</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>150.000</td>\n",
       "      <td>150.000</td>\n",
       "      <td>150.000</td>\n",
       "      <td>150.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>540.180</td>\n",
       "      <td>2.500</td>\n",
       "      <td>3.040</td>\n",
       "      <td>0.047</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>994.143</td>\n",
       "      <td>2.385</td>\n",
       "      <td>2.296</td>\n",
       "      <td>0.145</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000</td>\n",
       "      <td>1.000</td>\n",
       "      <td>1.000</td>\n",
       "      <td>0.001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>78.250</td>\n",
       "      <td>1.000</td>\n",
       "      <td>2.000</td>\n",
       "      <td>0.004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>209.500</td>\n",
       "      <td>1.500</td>\n",
       "      <td>3.000</td>\n",
       "      <td>0.012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>517.750</td>\n",
       "      <td>3.000</td>\n",
       "      <td>4.000</td>\n",
       "      <td>0.031</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>6473.000</td>\n",
       "      <td>14.000</td>\n",
       "      <td>24.000</td>\n",
       "      <td>1.000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 21
  },
  {
   "cell_type": "code",
   "id": "7e944ce2-0550-470a-b085-aef3ac0ac480",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-03-29T22:58:07.787373Z",
     "start_time": "2025-03-29T22:58:07.780837Z"
    }
   },
   "source": [
    "df[['total_tokens']].describe().round(3)"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "       total_tokens\n",
       "count       150.000\n",
       "mean      11150.080\n",
       "std       19303.041\n",
       "min         251.000\n",
       "25%        1908.750\n",
       "50%        4031.500\n",
       "75%        9831.000\n",
       "max      100001.000"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>total_tokens</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>150.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>11150.080</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>19303.041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>251.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1908.750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>4031.500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>9831.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>100001.000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 22
  },
  {
   "cell_type": "code",
   "id": "1d2ff571-afcd-437c-a0e0-1c6137a99072",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-03-29T22:58:07.852204Z",
     "start_time": "2025-03-29T22:58:07.850419Z"
    }
   },
   "source": [
    "# max 3324 min 149, mean 611"
   ],
   "outputs": [],
   "execution_count": 23
  },
  {
   "cell_type": "code",
   "id": "10466afe-2eb3-493c-b725-8e7dab0ff9ca",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-03-29T22:58:07.883939Z",
     "start_time": "2025-03-29T22:58:07.882344Z"
    }
   },
   "source": [
    "# max >200000 min 251, mean 11150"
   ],
   "outputs": [],
   "execution_count": 24
  },
  {
   "cell_type": "code",
   "id": "bd0e1bca-958a-477c-8096-78a6dea18cd0",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-03-29T22:58:07.924667Z",
     "start_time": "2025-03-29T22:58:07.923282Z"
    }
   },
   "source": [],
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
