{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "5459a414",
   "metadata": {},
   "outputs": [],
   "source": [
    "exp_path = \"~/agentlab_results/miniwob_hints_retrieve2/2025-08-12_12-53-57_tooluse-gpt-4-1-on-miniwob-custom/\"\n",
    "exp_path = \"~/agentlab_results/2025-08-19_14-18-54-retrieve3/2025-08-19_14-18-54_tooluse-gpt-5-on-miniwob-custom/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "a44f6e28",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading CSVs from: /Users/<user>/agentlab_results/2025-08-19_14-18-54-retrieve3/2025-08-19_14-18-54_tooluse-gpt-5-on-miniwob-custom/\n",
      "\n",
      "result_df_trial_1_of_1.csv loaded successfully! Shape: (384, 99)\n",
      "\n",
      "First few rows of result_df:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>env.task_name</th>\n",
       "      <th>agent.agent_name</th>\n",
       "      <th>env.benchmark</th>\n",
       "      <th>agent.config.task_hint.use_task_hint</th>\n",
       "      <th>agent.config.task_hint.hint_db_rel_path</th>\n",
       "      <th>agent.config.task_hint.hint_retrieval_mode</th>\n",
       "      <th>index</th>\n",
       "      <th>exp_dir</th>\n",
       "      <th>agent.model.model_name</th>\n",
       "      <th>agent.model.max_total_tokens</th>\n",
       "      <th>...</th>\n",
       "      <th>stats.max_output_tokens</th>\n",
       "      <th>stats.cum_cost</th>\n",
       "      <th>stats.max_cost</th>\n",
       "      <th>stats.cum_step_elapsed</th>\n",
       "      <th>stats.max_step_elapsed</th>\n",
       "      <th>stats.cum_agent_elapsed</th>\n",
       "      <th>stats.max_agent_elapsed</th>\n",
       "      <th>terminated</th>\n",
       "      <th>truncated</th>\n",
       "      <th>err_key</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>miniwob.bisect-angle</td>\n",
       "      <td>ToolUse-gpt-5</td>\n",
       "      <td>miniwob</td>\n",
       "      <td>False</td>\n",
       "      <td>hint_db.csv</td>\n",
       "      <td>direct</td>\n",
       "      <td>70</td>\n",
       "      <td>/Users/<user>/agentlab_results/2025-08...</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>200000</td>\n",
       "      <td>...</td>\n",
       "      <td>3154</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9.545828</td>\n",
       "      <td>6.365904</td>\n",
       "      <td>239.803122</td>\n",
       "      <td>66.248675</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>miniwob.bisect-angle</td>\n",
       "      <td>ToolUse-gpt-5</td>\n",
       "      <td>miniwob</td>\n",
       "      <td>False</td>\n",
       "      <td>hint_db.csv</td>\n",
       "      <td>direct</td>\n",
       "      <td>231</td>\n",
       "      <td>/Users/<user>/agentlab_results/2025-08...</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>200000</td>\n",
       "      <td>...</td>\n",
       "      <td>1768</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.171929</td>\n",
       "      <td>7.311526</td>\n",
       "      <td>68.064186</td>\n",
       "      <td>34.107962</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>miniwob.bisect-angle</td>\n",
       "      <td>ToolUse-gpt-5</td>\n",
       "      <td>miniwob</td>\n",
       "      <td>False</td>\n",
       "      <td>hint_db.csv</td>\n",
       "      <td>direct</td>\n",
       "      <td>309</td>\n",
       "      <td>/Users/<user>/agentlab_results/2025-08...</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>200000</td>\n",
       "      <td>...</td>\n",
       "      <td>2185</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9.524459</td>\n",
       "      <td>7.098834</td>\n",
       "      <td>144.791286</td>\n",
       "      <td>44.472088</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>miniwob.bisect-angle</td>\n",
       "      <td>ToolUse-gpt-5</td>\n",
       "      <td>miniwob</td>\n",
       "      <td>False</td>\n",
       "      <td>hint_db.csv</td>\n",
       "      <td>direct</td>\n",
       "      <td>313</td>\n",
       "      <td>/Users/<user>/agentlab_results/2025-08...</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>200000</td>\n",
       "      <td>...</td>\n",
       "      <td>2657</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>11.343318</td>\n",
       "      <td>7.305513</td>\n",
       "      <td>225.234672</td>\n",
       "      <td>62.361213</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>miniwob.bisect-angle</td>\n",
       "      <td>ToolUse-gpt-5</td>\n",
       "      <td>miniwob</td>\n",
       "      <td>False</td>\n",
       "      <td>hint_db.csv</td>\n",
       "      <td>direct</td>\n",
       "      <td>317</td>\n",
       "      <td>/Users/<user>/agentlab_results/2025-08...</td>\n",
       "      <td>gpt-5</td>\n",
       "      <td>200000</td>\n",
       "      <td>...</td>\n",
       "      <td>2299</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9.891099</td>\n",
       "      <td>5.860545</td>\n",
       "      <td>175.807518</td>\n",
       "      <td>46.462174</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 99 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          env.task_name agent.agent_name env.benchmark  \\\n",
       "0  miniwob.bisect-angle    ToolUse-gpt-5       miniwob   \n",
       "1  miniwob.bisect-angle    ToolUse-gpt-5       miniwob   \n",
       "2  miniwob.bisect-angle    ToolUse-gpt-5       miniwob   \n",
       "3  miniwob.bisect-angle    ToolUse-gpt-5       miniwob   \n",
       "4  miniwob.bisect-angle    ToolUse-gpt-5       miniwob   \n",
       "\n",
       "   agent.config.task_hint.use_task_hint  \\\n",
       "0                                 False   \n",
       "1                                 False   \n",
       "2                                 False   \n",
       "3                                 False   \n",
       "4                                 False   \n",
       "\n",
       "  agent.config.task_hint.hint_db_rel_path  \\\n",
       "0                             hint_db.csv   \n",
       "1                             hint_db.csv   \n",
       "2                             hint_db.csv   \n",
       "3                             hint_db.csv   \n",
       "4                             hint_db.csv   \n",
       "\n",
       "  agent.config.task_hint.hint_retrieval_mode  index  \\\n",
       "0                                     direct     70   \n",
       "1                                     direct    231   \n",
       "2                                     direct    309   \n",
       "3                                     direct    313   \n",
       "4                                     direct    317   \n",
       "\n",
       "                                             exp_dir agent.model.model_name  \\\n",
       "0  /Users/<user>/agentlab_results/2025-08...                  gpt-5   \n",
       "1  /Users/<user>/agentlab_results/2025-08...                  gpt-5   \n",
       "2  /Users/<user>/agentlab_results/2025-08...                  gpt-5   \n",
       "3  /Users/<user>/agentlab_results/2025-08...                  gpt-5   \n",
       "4  /Users/<user>/agentlab_results/2025-08...                  gpt-5   \n",
       "\n",
       "   agent.model.max_total_tokens  ...  stats.max_output_tokens  stats.cum_cost  \\\n",
       "0                        200000  ...                     3154             0.0   \n",
       "1                        200000  ...                     1768             0.0   \n",
       "2                        200000  ...                     2185             0.0   \n",
       "3                        200000  ...                     2657             0.0   \n",
       "4                        200000  ...                     2299             0.0   \n",
       "\n",
       "   stats.max_cost  stats.cum_step_elapsed  stats.max_step_elapsed  \\\n",
       "0             0.0                9.545828                6.365904   \n",
       "1             0.0                8.171929                7.311526   \n",
       "2             0.0                9.524459                7.098834   \n",
       "3             0.0               11.343318                7.305513   \n",
       "4             0.0                9.891099                5.860545   \n",
       "\n",
       "   stats.cum_agent_elapsed  stats.max_agent_elapsed  terminated  truncated  \\\n",
       "0               239.803122                66.248675        True      False   \n",
       "1                68.064186                34.107962        True      False   \n",
       "2               144.791286                44.472088        True      False   \n",
       "3               225.234672                62.361213        True      False   \n",
       "4               175.807518                46.462174        True      False   \n",
       "\n",
       "   err_key  \n",
       "0      NaN  \n",
       "1      NaN  \n",
       "2      NaN  \n",
       "3      NaN  \n",
       "4      NaN  \n",
       "\n",
       "[5 rows x 99 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "summary_df_trial_1_of_1.csv loaded successfully! Shape: (4, 11)\n",
      "\n",
      "First few rows of summary_df:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>agent.agent_name</th>\n",
       "      <th>env.benchmark</th>\n",
       "      <th>agent.config.task_hint.use_task_hint</th>\n",
       "      <th>agent.config.task_hint.hint_db_rel_path</th>\n",
       "      <th>agent.config.task_hint.hint_retrieval_mode</th>\n",
       "      <th>avg_reward</th>\n",
       "      <th>std_err</th>\n",
       "      <th>avg_steps</th>\n",
       "      <th>n_completed</th>\n",
       "      <th>n_err</th>\n",
       "      <th>cum_effective_cost</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ToolUse-gpt-5</td>\n",
       "      <td>miniwob</td>\n",
       "      <td>False</td>\n",
       "      <td>hint_db.csv</td>\n",
       "      <td>direct</td>\n",
       "      <td>0.323</td>\n",
       "      <td>0.048</td>\n",
       "      <td>5.979</td>\n",
       "      <td>96/96</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ToolUse-gpt-5</td>\n",
       "      <td>miniwob</td>\n",
       "      <td>True</td>\n",
       "      <td>/Users/<user>/agentlab_results/2025-08...</td>\n",
       "      <td>direct</td>\n",
       "      <td>0.385</td>\n",
       "      <td>0.050</td>\n",
       "      <td>5.875</td>\n",
       "      <td>96/96</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ToolUse-gpt-5</td>\n",
       "      <td>miniwob</td>\n",
       "      <td>True</td>\n",
       "      <td>/Users/<user>/agentlab_results/2025-08...</td>\n",
       "      <td>emb</td>\n",
       "      <td>0.406</td>\n",
       "      <td>0.050</td>\n",
       "      <td>5.552</td>\n",
       "      <td>96/96</td>\n",
       "      <td>3</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ToolUse-gpt-5</td>\n",
       "      <td>miniwob</td>\n",
       "      <td>True</td>\n",
       "      <td>/Users/<user>/agentlab_results/2025-08...</td>\n",
       "      <td>llm</td>\n",
       "      <td>0.406</td>\n",
       "      <td>0.050</td>\n",
       "      <td>5.729</td>\n",
       "      <td>96/96</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  agent.agent_name env.benchmark  agent.config.task_hint.use_task_hint  \\\n",
       "0    ToolUse-gpt-5       miniwob                                 False   \n",
       "1    ToolUse-gpt-5       miniwob                                  True   \n",
       "2    ToolUse-gpt-5       miniwob                                  True   \n",
       "3    ToolUse-gpt-5       miniwob                                  True   \n",
       "\n",
       "             agent.config.task_hint.hint_db_rel_path  \\\n",
       "0                                        hint_db.csv   \n",
       "1  /Users/<user>/agentlab_results/2025-08...   \n",
       "2  /Users/<user>/agentlab_results/2025-08...   \n",
       "3  /Users/<user>/agentlab_results/2025-08...   \n",
       "\n",
       "  agent.config.task_hint.hint_retrieval_mode  avg_reward  std_err  avg_steps  \\\n",
       "0                                     direct       0.323    0.048      5.979   \n",
       "1                                     direct       0.385    0.050      5.875   \n",
       "2                                        emb       0.406    0.050      5.552   \n",
       "3                                        llm       0.406    0.050      5.729   \n",
       "\n",
       "  n_completed  n_err  cum_effective_cost  \n",
       "0       96/96      2                 0.0  \n",
       "1       96/96      0                 0.0  \n",
       "2       96/96      3                 0.0  \n",
       "3       96/96      2                 0.0  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "# Resolve the experiment directory and the two CSV exports expected inside it\n",
    "full_path = os.path.expanduser(exp_path)\n",
    "result_csv_path = os.path.join(full_path, \"result_df_trial_1_of_1.csv\")\n",
    "summary_csv_path = os.path.join(full_path, \"summary_df_trial_1_of_1.csv\")\n",
    "\n",
    "print(f\"Loading CSVs from: {full_path}\")\n",
    "\n",
    "\n",
    "def _load_csv(csv_path, label):\n",
    "    \"\"\"Read one CSV export and preview it.\n",
    "\n",
    "    Prints a status line, displays the first rows and returns the DataFrame.\n",
    "    Returns None (after printing a notice) when `csv_path` is not a file.\n",
    "    \"\"\"\n",
    "    if not os.path.isfile(csv_path):\n",
    "        print(f\"{label} file not found at: {csv_path}\")\n",
    "        return None\n",
    "    frame = pd.read_csv(csv_path)\n",
    "    print(f\"\\n{os.path.basename(csv_path)} loaded successfully! Shape: {frame.shape}\")\n",
    "    print(f\"\\nFirst few rows of {label}:\")\n",
    "    display(frame.head())\n",
    "    return frame\n",
    "\n",
    "\n",
    "# Always rebind both names: a missing file now yields None instead of leaving\n",
    "# the name undefined or pointing at a DataFrame from an earlier run in this kernel.\n",
    "result_df = _load_csv(result_csv_path, \"result_df\")\n",
    "summary_df = _load_csv(summary_csv_path, \"summary_df\")\n",
    "\n",
    "# If neither file exists, show what the directory does contain\n",
    "if result_df is None and summary_df is None:\n",
    "    print(\"Available files in the directory:\")\n",
    "    if os.path.isdir(full_path):\n",
    "        for file_name in sorted(os.listdir(full_path)):\n",
    "            print(f\"  - {file_name}\")\n",
    "    else:\n",
    "        print(f\"Directory not found: {full_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "bd58357e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 437 subdirectories:\n",
      "\n",
      "=== GROUPING RESULTS ===\n",
      "Total configurations loaded: 384\n",
      "Unique groups: 4\n",
      "\n",
      "Group: llm\n",
      "Directories (96):\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.click-scroll-list_0_2 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.right-angle_2_4 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.social-media-all_16_2 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.stock-market_2_2 (step0: ✓)\n",
      "  - 2025-08-19_16-16-23_ToolUse-gpt-5_on_miniwob.number-checkboxes_20_1 (step0: ✓)\n",
      "\n",
      "Group: emb\n",
      "Directories (96):\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.grid-coordinate_7_3 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.number-checkboxes_20_3 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.stock-market_17_3 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.use-colorwheel-2_5_3 (step0: ✓)\n",
      "  - 2025-08-19_16-16-23_ToolUse-gpt-5_on_miniwob.number-checkboxes_22_2 (step0: ✓)\n",
      "\n",
      "Group: direct\n",
      "Directories (96):\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.bisect-angle_10_3 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.book-flight_20_1 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.form-sequence-2_22_1 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.hot-cold_24_1 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.right-angle_2_3 (step0: ✓)\n",
      "\n",
      "Group: no\n",
      "Directories (96):\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.click-menu_25 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.click-scroll-list_10 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.click-scroll-list_17 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.count-shape_26 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.hot-cold_20 (step0: ✓)\n",
      "\n",
      "=== GROUPING RESULTS ===\n",
      "Total configurations loaded: 384\n",
      "Unique groups: 4\n",
      "\n",
      "Group: llm\n",
      "Directories (96):\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.click-scroll-list_0_2 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.right-angle_2_4 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.social-media-all_16_2 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.stock-market_2_2 (step0: ✓)\n",
      "  - 2025-08-19_16-16-23_ToolUse-gpt-5_on_miniwob.number-checkboxes_20_1 (step0: ✓)\n",
      "\n",
      "Group: emb\n",
      "Directories (96):\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.grid-coordinate_7_3 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.number-checkboxes_20_3 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.stock-market_17_3 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.use-colorwheel-2_5_3 (step0: ✓)\n",
      "  - 2025-08-19_16-16-23_ToolUse-gpt-5_on_miniwob.number-checkboxes_22_2 (step0: ✓)\n",
      "\n",
      "Group: direct\n",
      "Directories (96):\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.bisect-angle_10_3 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.book-flight_20_1 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.form-sequence-2_22_1 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.hot-cold_24_1 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.right-angle_2_3 (step0: ✓)\n",
      "\n",
      "Group: no\n",
      "Directories (96):\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.click-menu_25 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.click-scroll-list_10 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.click-scroll-list_17 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.count-shape_26 (step0: ✓)\n",
      "  - 2025-08-19_14-18-54_ToolUse-gpt-5_on_miniwob.hot-cold_20 (step0: ✓)\n"
     ]
    }
   ],
   "source": [
    "import pickle\n",
    "import gzip\n",
    "from collections import defaultdict\n",
    "\n",
    "# Names of the run subdirectories directly under the experiment directory.\n",
    "# Sorted because os.listdir returns entries in arbitrary order, and the grouping\n",
    "# below (plus any later \"last run wins\" logic) should not depend on filesystem order.\n",
    "subdirs = []\n",
    "if os.path.isdir(full_path):\n",
    "    subdirs = sorted(\n",
    "        item for item in os.listdir(full_path)\n",
    "        if os.path.isdir(os.path.join(full_path, item))\n",
    "    )\n",
    "else:\n",
    "    print(f\"Directory not found: {full_path}\")\n",
    "\n",
    "print(f\"Found {len(subdirs)} subdirectories:\")\n",
    "\n",
    "# Load exp_args.pkl and step_0.pkl.gz from each subdirectory and group by task hint config.\n",
    "# NOTE(review): pickle.load can execute arbitrary code embedded in the file; only run this\n",
    "# cell against experiment directories you trust.\n",
    "grouped_dirs = defaultdict(list)\n",
    "loaded_configs = {}\n",
    "\n",
    "for subdir in subdirs:\n",
    "    # Skip entries whose name starts with an underscore\n",
    "    if subdir.startswith(\"_\"):\n",
    "        continue\n",
    "\n",
    "    pkl_path = os.path.join(full_path, subdir, \"exp_args.pkl\")\n",
    "    step0_path = os.path.join(full_path, subdir, \"step_0.pkl.gz\")\n",
    "\n",
    "    config_data = {\n",
    "        'subdir_name': subdir,\n",
    "        'use_task_hint': None,\n",
    "        'hint_retrieval_mode': None,\n",
    "        'exp_args_object': None,\n",
    "        'step0_data': None\n",
    "    }\n",
    "\n",
    "    # exp_args.pkl is mandatory: without it the run cannot be assigned to a group\n",
    "    if not os.path.exists(pkl_path):\n",
    "        print(f\"✗ No exp_args.pkl found in {subdir}\")\n",
    "        continue\n",
    "    try:\n",
    "        with open(pkl_path, 'rb') as f:\n",
    "            exp_args = pickle.load(f)\n",
    "\n",
    "        # Extract the key configuration values from the ExpArgs object\n",
    "        task_hint_config = exp_args.agent_args.config.task_hint\n",
    "        use_task_hint = task_hint_config.use_task_hint\n",
    "        hint_retrieval_mode = task_hint_config.hint_retrieval_mode\n",
    "\n",
    "        config_data['use_task_hint'] = use_task_hint\n",
    "        config_data['hint_retrieval_mode'] = hint_retrieval_mode\n",
    "        config_data['exp_args_object'] = exp_args\n",
    "    except Exception as e:\n",
    "        print(f\"✗ Failed to load exp_args.pkl from {subdir}: {e}\")\n",
    "        continue\n",
    "\n",
    "    # step_0.pkl.gz is optional: the run is still grouped when it is missing or unreadable\n",
    "    if os.path.exists(step0_path):\n",
    "        try:\n",
    "            with gzip.open(step0_path, 'rb') as f:\n",
    "                config_data['step0_data'] = pickle.load(f)\n",
    "        except Exception as e:\n",
    "            print(f\"✗ Failed to load step_0.pkl.gz from {subdir}: {e}\")\n",
    "            print(\"  (but exp_args.pkl was loaded successfully)\")\n",
    "    else:\n",
    "        print(f\"✗ No {step0_path} found\")\n",
    "        print(\"  (but exp_args.pkl was loaded successfully)\")\n",
    "\n",
    "    # Runs without task hints all go to the \"no\" group; the rest are keyed by retrieval mode\n",
    "    key = hint_retrieval_mode if use_task_hint else \"no\"\n",
    "    grouped_dirs[key].append(subdir)\n",
    "    loaded_configs[subdir] = config_data\n",
    "\n",
    "print(\"\\n=== GROUPING RESULTS ===\")\n",
    "print(f\"Total configurations loaded: {len(loaded_configs)}\")\n",
    "print(f\"Unique groups: {len(grouped_dirs)}\")\n",
    "\n",
    "for key, dirs in grouped_dirs.items():\n",
    "    print(f\"\\nGroup: {key}\")\n",
    "    print(f\"Directories ({len(dirs)}):\")\n",
    "    # Sort first, then truncate, so the preview is the same five runs on every machine\n",
    "    for dir_name in sorted(dirs)[:5]:\n",
    "        step0_status = \"✓\" if loaded_configs[dir_name]['step0_data'] is not None else \"✗\"\n",
    "        print(f\"  - {dir_name} (step0: {step0_status})\")\n",
    "\n",
    "# Store results for further analysis\n",
    "experiment_groups = dict(grouped_dirs)\n",
    "experiment_configs = loaded_configs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "70506590",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "collected 74 hints\n"
     ]
    }
   ],
   "source": [
    "# Maps goal text -> {hint mode: raw hint text found in the step-0 chat messages}.\n",
    "# When several runs share a goal and mode, the last one processed wins.\n",
    "hints_per_trace = {}\n",
    "for hint_mode, dirnames in grouped_dirs.items():\n",
    "    for dirname in dirnames:\n",
    "        step0 = loaded_configs[dirname][\"step0_data\"]\n",
    "        if not step0:\n",
    "            continue  # no step-0 data was loaded for this run\n",
    "        chat_messages = step0.agent_info.chat_messages\n",
    "        # NOTE(review): assumes message 2 carries the goal text and message 4 the hint block;\n",
    "        # confirm against the agent's prompt layout.\n",
    "        goal = chat_messages[2].content[1][\"text\"]\n",
    "        try:\n",
    "            hint = chat_messages[4].content[0][\"text\"]\n",
    "            # Anything at that position that is not a hint block counts as \"no hint\"\n",
    "            if not hint.startswith(\"# Hints:\"):\n",
    "                hint = \"\"\n",
    "        except (AttributeError, IndexError, KeyError, TypeError):\n",
    "            # Message missing or shaped differently (e.g. runs without hints)\n",
    "            hint = \"\"\n",
    "        hints_per_trace.setdefault(goal, {})[hint_mode] = hint\n",
    "print(f\"collected {len(hints_per_trace)} hints\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "c5a6fb99",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn each raw hint block into a list of integer hint ids.\n",
    "# uniq_hints maps hint text -> id, with ids handed out in first-seen order;\n",
    "# clean_hints maps goal -> {mode: [hint ids]}.\n",
    "prefix = \"# Hints:\\nHere are some hints for the task you are working on:\\n\"\n",
    "hint_line_prefix = \"- \"\n",
    "uniq_hints = {}\n",
    "clean_hints = {}\n",
    "\n",
    "for goal, raw_by_mode in hints_per_trace.items():\n",
    "    ids_by_mode = clean_hints[goal] = {}\n",
    "    for mode, raw_text in raw_by_mode.items():\n",
    "        # Drop the fixed header when present, then keep only the bulleted lines\n",
    "        body = raw_text[len(prefix):] if raw_text.startswith(prefix) else raw_text\n",
    "        bullet_texts = [\n",
    "            line[len(hint_line_prefix):]\n",
    "            for line in body.split(\"\\n\")\n",
    "            if line.startswith(hint_line_prefix)\n",
    "        ]\n",
    "        # setdefault gives an unseen hint the next free id and returns the stored id otherwise\n",
    "        ids_by_mode[mode] = [\n",
    "            uniq_hints.setdefault(text, len(uniq_hints)) for text in bullet_texts\n",
    "        ]\n",
    "\n",
    "# Make sure every goal has an entry (possibly empty) for each of the four modes\n",
    "for ids_by_mode in clean_hints.values():\n",
    "    for mode in [\"no\", \"direct\", \"llm\", \"emb\"]:\n",
    "        ids_by_mode.setdefault(mode, [])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "06e72548",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hints DataFrame shape: (74, 4)\n",
      "Tasks (rows): 74\n",
      "Hint modes (columns): ['llm', 'emb', 'direct', 'no']\n",
      "\n",
      "Hints per Task and Mode:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>llm</th>\n",
       "      <th>emb</th>\n",
       "      <th>direct</th>\n",
       "      <th>no</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Buy FMT stock when the price is less than $43.00.</th>\n",
       "      <td>[0, 1, 2, 3, 4, 5]</td>\n",
       "      <td>[4, 0, 2, 1]</td>\n",
       "      <td>[0, 1, 2]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Select Ertha, Aurel from the scroll list and click Submit.</th>\n",
       "      <td>[6, 7, 8]</td>\n",
       "      <td>[3, 4, 9, 10]</td>\n",
       "      <td>[6, 7, 8]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Click the \"Like\" button on all posts by @ultricies and then click Submit.</th>\n",
       "      <td>[0, 1, 2, 3, 4, 5]</td>\n",
       "      <td>[11, 4, 5, 3]</td>\n",
       "      <td>[3, 4, 5]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Add a third point to create a right angle, then press submit.</th>\n",
       "      <td>[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]</td>\n",
       "      <td>[20, 21, 12, 13]</td>\n",
       "      <td>[12, 13]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Draw the number \"7\" in the checkboxes using the example on the right and press Submit when finished.</th>\n",
       "      <td>[23, 24, 25]</td>\n",
       "      <td>[23, 26, 25, 24]</td>\n",
       "      <td>[23, 24, 25]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Select Sharona&gt;Ekaterina</th>\n",
       "      <td>[28, 29, 30]</td>\n",
       "      <td>[10, 4, 44, 26]</td>\n",
       "      <td>[28, 29, 30]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Select Anguilla, Ireland from the scroll list and click Submit.</th>\n",
       "      <td>[6, 7, 8]</td>\n",
       "      <td>[40, 5, 1, 39]</td>\n",
       "      <td>[6, 7, 8]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Drag Marcelline to the bottom right.</th>\n",
       "      <td>[27, 9]</td>\n",
       "      <td>[44, 26, 27, 9]</td>\n",
       "      <td>[27, 9]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Select Donetta</th>\n",
       "      <td>[0, 1, 2, 3, 4, 5]</td>\n",
       "      <td>[3, 4, 1, 5]</td>\n",
       "      <td>[28, 29, 30]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Check the 2nd radio button and enter the number \"31\" into the 2nd textbox.</th>\n",
       "      <td>[11, 37, 38]</td>\n",
       "      <td>[2, 1, 11, 37]</td>\n",
       "      <td>[11, 37, 38]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>74 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                             llm  \\\n",
       "Buy FMT stock when the price is less than $43.00.                             [0, 1, 2, 3, 4, 5]   \n",
       "Select Ertha, Aurel from the scroll list and cl...                                     [6, 7, 8]   \n",
       "Click the \"Like\" button on all posts by @ultric...                            [0, 1, 2, 3, 4, 5]   \n",
       "Add a third point to create a right angle, then...  [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]   \n",
       "Draw the number \"7\" in the checkboxes using the...                                  [23, 24, 25]   \n",
       "...                                                                                          ...   \n",
       "Select Sharona>Ekaterina                                                            [28, 29, 30]   \n",
       "Select Anguilla, Ireland from the scroll list a...                                     [6, 7, 8]   \n",
       "Drag Marcelline to the bottom right.                                                     [27, 9]   \n",
       "Select Donetta                                                                [0, 1, 2, 3, 4, 5]   \n",
       "Check the 2nd radio button and enter the number...                                  [11, 37, 38]   \n",
       "\n",
       "                                                                 emb  \\\n",
       "Buy FMT stock when the price is less than $43.00.       [4, 0, 2, 1]   \n",
       "Select Ertha, Aurel from the scroll list and cl...     [3, 4, 9, 10]   \n",
       "Click the \"Like\" button on all posts by @ultric...     [11, 4, 5, 3]   \n",
       "Add a third point to create a right angle, then...  [20, 21, 12, 13]   \n",
       "Draw the number \"7\" in the checkboxes using the...  [23, 26, 25, 24]   \n",
       "...                                                              ...   \n",
       "Select Sharona>Ekaterina                             [10, 4, 44, 26]   \n",
       "Select Anguilla, Ireland from the scroll list a...    [40, 5, 1, 39]   \n",
       "Drag Marcelline to the bottom right.                 [44, 26, 27, 9]   \n",
       "Select Donetta                                          [3, 4, 1, 5]   \n",
       "Check the 2nd radio button and enter the number...    [2, 1, 11, 37]   \n",
       "\n",
       "                                                          direct  no  \n",
       "Buy FMT stock when the price is less than $43.00.      [0, 1, 2]  []  \n",
       "Select Ertha, Aurel from the scroll list and cl...     [6, 7, 8]  []  \n",
       "Click the \"Like\" button on all posts by @ultric...     [3, 4, 5]  []  \n",
       "Add a third point to create a right angle, then...      [12, 13]  []  \n",
       "Draw the number \"7\" in the checkboxes using the...  [23, 24, 25]  []  \n",
       "...                                                          ...  ..  \n",
       "Select Sharona>Ekaterina                            [28, 29, 30]  []  \n",
       "Select Anguilla, Ireland from the scroll list a...     [6, 7, 8]  []  \n",
       "Drag Marcelline to the bottom right.                     [27, 9]  []  \n",
       "Select Donetta                                      [28, 29, 30]  []  \n",
       "Check the 2nd radio button and enter the number...  [11, 37, 38]  []  \n",
       "\n",
       "[74 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Basic Statistics:\n",
      "Non-empty hints per mode:\n",
      "  llm: 74/74 tasks have hints\n",
      "  emb: 74/74 tasks have hints\n",
      "  direct: 74/74 tasks have hints\n",
      "  no: 74/74 tasks have hints\n",
      "\n",
      "Sample of first 3 tasks:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>llm</th>\n",
       "      <th>emb</th>\n",
       "      <th>direct</th>\n",
       "      <th>no</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Buy FMT stock when the price is less than $43.00.</th>\n",
       "      <td>[0, 1, 2, 3, 4, 5]</td>\n",
       "      <td>[4, 0, 2, 1]</td>\n",
       "      <td>[0, 1, 2]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Select Ertha, Aurel from the scroll list and click Submit.</th>\n",
       "      <td>[6, 7, 8]</td>\n",
       "      <td>[3, 4, 9, 10]</td>\n",
       "      <td>[6, 7, 8]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Click the \"Like\" button on all posts by @ultricies and then click Submit.</th>\n",
       "      <td>[0, 1, 2, 3, 4, 5]</td>\n",
       "      <td>[11, 4, 5, 3]</td>\n",
       "      <td>[3, 4, 5]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                   llm  \\\n",
       "Buy FMT stock when the price is less than $43.00.   [0, 1, 2, 3, 4, 5]   \n",
       "Select Ertha, Aurel from the scroll list and cl...           [6, 7, 8]   \n",
       "Click the \"Like\" button on all posts by @ultric...  [0, 1, 2, 3, 4, 5]   \n",
       "\n",
       "                                                              emb     direct  \\\n",
       "Buy FMT stock when the price is less than $43.00.    [4, 0, 2, 1]  [0, 1, 2]   \n",
       "Select Ertha, Aurel from the scroll list and cl...  [3, 4, 9, 10]  [6, 7, 8]   \n",
       "Click the \"Like\" button on all posts by @ultric...  [11, 4, 5, 3]  [3, 4, 5]   \n",
       "\n",
       "                                                    no  \n",
       "Buy FMT stock when the price is less than $43.00.   []  \n",
       "Select Ertha, Aurel from the scroll list and cl...  []  \n",
       "Click the \"Like\" button on all posts by @ultric...  []  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Convert clean_hints to a DataFrame: one row per task, one column per hint mode\n",
    "hints_df = pd.DataFrame.from_dict(clean_hints, orient='index')\n",
    "\n",
    "print(f\"Hints DataFrame shape: {hints_df.shape}\")\n",
    "print(f\"Tasks (rows): {len(hints_df)}\")\n",
    "print(f\"Hint modes (columns): {list(hints_df.columns)}\")\n",
    "\n",
    "# Display the DataFrame\n",
    "print(\"\\nHints per Task and Mode:\")\n",
    "display(hints_df)\n",
    "\n",
    "\n",
    "def _has_hints(cell):\n",
    "    \"\"\"Return True when a DataFrame cell holds a non-empty collection of hints.\"\"\"\n",
    "    # A mode that is missing for a task shows up as NaN (a float) after from_dict,\n",
    "    # so only real, non-empty collections count as having hints.\n",
    "    return isinstance(cell, (list, tuple, set, str)) and len(cell) > 0\n",
    "\n",
    "\n",
    "# Show some basic statistics\n",
    "print(\"\\nBasic Statistics:\")\n",
    "print(\"Non-empty hints per mode:\")\n",
    "for col in hints_df.columns:\n",
    "    # Cells are lists, so comparing them against an empty string is always True and\n",
    "    # would report every task as having hints; test the cell's length instead.\n",
    "    non_empty = int(hints_df[col].map(_has_hints).sum())\n",
    "    print(f\"  {col}: {non_empty}/{len(hints_df)} tasks have hints\")\n",
    "\n",
    "# Show a sample of tasks and their hints\n",
    "print(\"\\nSample of first 3 tasks:\")\n",
    "display(hints_df.head(3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "1494d5da",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Available prediction modes: ['llm', 'emb']\n",
      "\n",
      "=== PER-TASK HINT-LEVEL METRICS ===\n",
      "Calculated metrics for 74 tasks\n",
      "\n",
      "=== AVERAGE METRICS ACROSS ALL TASKS ===\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>avg_precision</th>\n",
       "      <th>avg_recall</th>\n",
       "      <th>avg_f1</th>\n",
       "      <th>std_precision</th>\n",
       "      <th>std_recall</th>\n",
       "      <th>std_f1</th>\n",
       "      <th>perfect_f1_count</th>\n",
       "      <th>zero_f1_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>llm</th>\n",
       "      <td>0.744</td>\n",
       "      <td>0.905</td>\n",
       "      <td>0.788</td>\n",
       "      <td>0.362</td>\n",
       "      <td>0.295</td>\n",
       "      <td>0.326</td>\n",
       "      <td>47.0</td>\n",
       "      <td>7.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>emb</th>\n",
       "      <td>0.568</td>\n",
       "      <td>0.800</td>\n",
       "      <td>0.661</td>\n",
       "      <td>0.272</td>\n",
       "      <td>0.362</td>\n",
       "      <td>0.308</td>\n",
       "      <td>0.0</td>\n",
       "      <td>11.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     avg_precision  avg_recall  avg_f1  std_precision  std_recall  std_f1  \\\n",
       "llm          0.744       0.905   0.788          0.362       0.295   0.326   \n",
       "emb          0.568       0.800   0.661          0.272       0.362   0.308   \n",
       "\n",
       "     perfect_f1_count  zero_f1_count  \n",
       "llm              47.0            7.0  \n",
       "emb               0.0           11.0  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "=== F1 SCORE DISTRIBUTION ===\n",
      "\n",
      "LLM:\n",
      "  Perfect F1 (1.0): 47/74 tasks (63.5%)\n",
      "  Zero F1 (0.0): 7/74 tasks (9.5%)\n",
      "  Mean F1: 0.788 ± 0.326\n",
      "  Median F1: 1.000\n",
      "\n",
      "EMB:\n",
      "  Perfect F1 (1.0): 0/74 tasks (0.0%)\n",
      "  Zero F1 (0.0): 11/74 tasks (14.9%)\n",
      "  Mean F1: 0.661 ± 0.308\n",
      "  Median F1: 0.857\n",
      "\n",
      "=== EXAMPLES OF TASK PERFORMANCE ===\n",
      "\n",
      "--- LLM EXAMPLES ---\n",
      "\n",
      "Best LLM Performance:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>llm_precision</th>\n",
       "      <th>llm_recall</th>\n",
       "      <th>llm_f1</th>\n",
       "      <th>true_num_hints</th>\n",
       "      <th>llm_num_hints</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Select Ertha, Aurel from the scroll list and c...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Draw the number \"7\" in the checkboxes using th...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Drag Fayth up by one....</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                task  llm_precision  \\\n",
       "1  Select Ertha, Aurel from the scroll list and c...            1.0   \n",
       "4  Draw the number \"7\" in the checkboxes using th...            1.0   \n",
       "7                           Drag Fayth up by one....            1.0   \n",
       "\n",
       "   llm_recall  llm_f1  true_num_hints  llm_num_hints  \n",
       "1         1.0     1.0               3              3  \n",
       "4         1.0     1.0               3              3  \n",
       "7         1.0     1.0               2              2  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Worst LLM Performance:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>llm_precision</th>\n",
       "      <th>llm_recall</th>\n",
       "      <th>llm_f1</th>\n",
       "      <th>true_num_hints</th>\n",
       "      <th>llm_num_hints</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Use the textbox to enter \"Ashlea\" and press \"S...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Select Noreen...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>Check the 1st radio button and enter the numbe...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 task  llm_precision  \\\n",
       "12  Use the textbox to enter \"Ashlea\" and press \"S...            0.0   \n",
       "13                                   Select Noreen...            0.0   \n",
       "23  Check the 1st radio button and enter the numbe...            0.0   \n",
       "\n",
       "    llm_recall  llm_f1  true_num_hints  llm_num_hints  \n",
       "12         0.0     0.0               2              3  \n",
       "13         0.0     0.0               3              6  \n",
       "23         0.0     0.0               3              3  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "--- EMB EXAMPLES ---\n",
      "\n",
      "Best EMB Performance:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>emb_precision</th>\n",
       "      <th>emb_recall</th>\n",
       "      <th>emb_f1</th>\n",
       "      <th>true_num_hints</th>\n",
       "      <th>emb_num_hints</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Buy FMT stock when the price is less than $43....</td>\n",
       "      <td>0.75</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.857143</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Click the \"Like\" button on all posts by @ultri...</td>\n",
       "      <td>0.75</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.857143</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Draw the number \"7\" in the checkboxes using th...</td>\n",
       "      <td>0.75</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.857143</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                task  emb_precision  \\\n",
       "0  Buy FMT stock when the price is less than $43....           0.75   \n",
       "2  Click the \"Like\" button on all posts by @ultri...           0.75   \n",
       "4  Draw the number \"7\" in the checkboxes using th...           0.75   \n",
       "\n",
       "   emb_recall    emb_f1  true_num_hints  emb_num_hints  \n",
       "0         1.0  0.857143               3              4  \n",
       "2         1.0  0.857143               3              4  \n",
       "4         1.0  0.857143               3              4  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Worst EMB Performance:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task</th>\n",
       "      <th>emb_precision</th>\n",
       "      <th>emb_recall</th>\n",
       "      <th>emb_f1</th>\n",
       "      <th>true_num_hints</th>\n",
       "      <th>emb_num_hints</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Select Ertha, Aurel from the scroll list and c...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Select Stormi&gt;Crystal&gt;Jaynell...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Select Noreen...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 task  emb_precision  \\\n",
       "1   Select Ertha, Aurel from the scroll list and c...            0.0   \n",
       "8                    Select Stormi>Crystal>Jaynell...            0.0   \n",
       "13                                   Select Noreen...            0.0   \n",
       "\n",
       "    emb_recall  emb_f1  true_num_hints  emb_num_hints  \n",
       "1          0.0     0.0               3              4  \n",
       "8          0.0     0.0               3              4  \n",
       "13         0.0     0.0               3              4  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "def calculate_hint_metrics(true_hints, pred_hints):\n",
    "    \"\"\"\n",
    "    Calculate precision, recall, and F1 for hint lists treating each hint as binary classification\n",
    "    \"\"\"\n",
    "    if not true_hints and not pred_hints:\n",
    "        # Both empty - perfect match\n",
    "        return 1.0, 1.0, 1.0\n",
    "    elif not true_hints:\n",
    "        # True is empty but pred is not - precision is 0, recall is undefined (set to 1)\n",
    "        return 0.0, 1.0, 0.0\n",
    "    elif not pred_hints:\n",
    "        # Pred is empty but true is not - precision is undefined (set to 1), recall is 0\n",
    "        return 1.0, 0.0, 0.0\n",
    "    \n",
    "    # Convert to sets for easier comparison\n",
    "    true_set = set(true_hints)\n",
    "    pred_set = set(pred_hints)\n",
    "    \n",
    "    # Calculate metrics\n",
    "    true_positives = len(true_set.intersection(pred_set))\n",
    "    false_positives = len(pred_set - true_set)\n",
    "    false_negatives = len(true_set - pred_set)\n",
    "    \n",
    "    # Calculate precision and recall\n",
    "    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 1.0\n",
    "    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 1.0\n",
    "    \n",
    "    # Calculate F1\n",
    "    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0\n",
    "    \n",
    "    return precision, recall, f1\n",
    "\n",
    "# Score every mode except 'direct' (the ground truth) and 'no', whose hint lists are\n",
    "# empty (presumably the no-hint baseline - confirm). Deriving the list from the columns\n",
    "# avoids a KeyError when a run lacks one of the modes.\n",
    "non_prediction_modes = {'direct', 'no'}\n",
    "available_modes = [col for col in hints_df.columns if col not in non_prediction_modes]\n",
    "print(f\"Available prediction modes: {available_modes}\")\n",
    "\n",
    "max_task_label_len = 50  # task descriptions longer than this are shortened in the tables\n",
    "\n",
    "\n",
    "def _short_task_label(task, max_len=max_task_label_len):\n",
    "    \"\"\"Shorten a task description, appending '...' only when text was actually cut off.\"\"\"\n",
    "    return task if len(task) <= max_len else task[:max_len] + '...'\n",
    "\n",
    "\n",
    "# Calculate metrics for each task and mode\n",
    "task_metrics = []\n",
    "for task_idx, task in enumerate(hints_df.index):\n",
    "    true_hints = hints_df.loc[task, 'direct']  # Ground truth\n",
    "\n",
    "    task_data = {'task': _short_task_label(task), 'task_idx': task_idx}\n",
    "\n",
    "    # One precision/recall/F1 triple per prediction mode, stored as '<mode>_<metric>'\n",
    "    for pred_mode in available_modes:\n",
    "        pred_hints = hints_df.loc[task, pred_mode]\n",
    "        precision, recall, f1 = calculate_hint_metrics(true_hints, pred_hints)\n",
    "\n",
    "        task_data[f'{pred_mode}_precision'] = precision\n",
    "        task_data[f'{pred_mode}_recall'] = recall\n",
    "        task_data[f'{pred_mode}_f1'] = f1\n",
    "        task_data[f'{pred_mode}_num_hints'] = len(pred_hints) if pred_hints else 0\n",
    "\n",
    "    task_data['true_num_hints'] = len(true_hints) if true_hints else 0\n",
    "    task_metrics.append(task_data)\n",
    "\n",
    "# Convert to DataFrame\n",
    "task_metrics_df = pd.DataFrame(task_metrics)\n",
    "\n",
    "print(\"\\n=== PER-TASK HINT-LEVEL METRICS ===\")\n",
    "print(f\"Calculated metrics for {len(task_metrics_df)} tasks\")\n",
    "\n",
    "# Calculate average metrics across all tasks\n",
    "avg_metrics = {}\n",
    "for pred_mode in available_modes:\n",
    "    precision_scores = task_metrics_df[f'{pred_mode}_precision']\n",
    "    recall_scores = task_metrics_df[f'{pred_mode}_recall']\n",
    "    f1_scores = task_metrics_df[f'{pred_mode}_f1']\n",
    "    avg_metrics[pred_mode] = {\n",
    "        'avg_precision': precision_scores.mean(),\n",
    "        'avg_recall': recall_scores.mean(),\n",
    "        'avg_f1': f1_scores.mean(),\n",
    "        'std_precision': precision_scores.std(),\n",
    "        'std_recall': recall_scores.std(),\n",
    "        'std_f1': f1_scores.std(),\n",
    "        'perfect_f1_count': int((f1_scores == 1.0).sum()),\n",
    "        'zero_f1_count': int((f1_scores == 0.0).sum()),\n",
    "    }\n",
    "\n",
    "# Display results\n",
    "# orient='index' gives one row per mode and builds each metric column on its own, so the\n",
    "# two count columns stay integers instead of being upcast to float by a transpose.\n",
    "avg_metrics_df = pd.DataFrame.from_dict(avg_metrics, orient='index')\n",
    "print(\"\\n=== AVERAGE METRICS ACROSS ALL TASKS ===\")\n",
    "display(avg_metrics_df.round(3))\n",
    "\n",
    "# Show distribution of F1 scores\n",
    "print(\"\\n=== F1 SCORE DISTRIBUTION ===\")\n",
    "for pred_mode in available_modes:\n",
    "    f1_scores = task_metrics_df[f'{pred_mode}_f1']\n",
    "    perfect_f1 = (f1_scores == 1.0).sum()\n",
    "    zero_f1 = (f1_scores == 0.0).sum()\n",
    "\n",
    "    print(f\"\\n{pred_mode.upper()}:\")\n",
    "    print(f\"  Perfect F1 (1.0): {perfect_f1}/{len(f1_scores)} tasks ({perfect_f1/len(f1_scores):.1%})\")\n",
    "    print(f\"  Zero F1 (0.0): {zero_f1}/{len(f1_scores)} tasks ({zero_f1/len(f1_scores):.1%})\")\n",
    "    print(f\"  Mean F1: {f1_scores.mean():.3f} ± {f1_scores.std():.3f}\")\n",
    "    print(f\"  Median F1: {f1_scores.median():.3f}\")\n",
    "\n",
    "# Show some examples of best and worst performing tasks\n",
    "print(\"\\n=== EXAMPLES OF TASK PERFORMANCE ===\")\n",
    "\n",
    "for pred_mode in available_modes:\n",
    "    print(f\"\\n--- {pred_mode.upper()} EXAMPLES ---\")\n",
    "\n",
    "    f1_col = f'{pred_mode}_f1'\n",
    "    example_cols = ['task', f'{pred_mode}_precision', f'{pred_mode}_recall', f1_col, 'true_num_hints', f'{pred_mode}_num_hints']\n",
    "\n",
    "    # Best tasks (ties keep their original task order)\n",
    "    best_tasks = task_metrics_df.nlargest(3, f1_col)[example_cols]\n",
    "    print(f\"\\nBest {pred_mode.upper()} Performance:\")\n",
    "    display(best_tasks)\n",
    "\n",
    "    # Worst tasks (ties keep their original task order)\n",
    "    worst_tasks = task_metrics_df.nsmallest(3, f1_col)[example_cols]\n",
    "    print(f\"\\nWorst {pred_mode.upper()} Performance:\")\n",
    "    display(worst_tasks)\n",
    "\n",
    "# Store results for further analysis\n",
    "hint_level_metrics = {\n",
    "    'per_task_metrics': task_metrics_df,\n",
    "    'average_metrics': avg_metrics_df,\n",
    "    'calculate_metrics_function': calculate_hint_metrics\n",
    "}"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "agentlab",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
