{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scoring Tutorial\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1: Import Libraries\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/matthewho/miniconda3/envs/arc_agi/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from notebook_imports import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import combinations\n",
    "\n",
    "from concept_mem.evaluation.score_tree import (\n",
    "    flatten_solution_trees,\n",
    "    official_score,\n",
    "    official_score_per_puzzle,\n",
    "    strict_score,\n",
    "    strict_score_per_step,\n",
    "    strict_score_per_puzzle,\n",
    ")\n",
    "from concept_mem.evaluation.solution_tree import (\n",
    "    create_solution_tree_from_serialized_dict,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2: Load Data and Convert Result Directories to Dataframes\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load target problems\n",
    "val100 = load_arc_data(\"val100\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# specify output directories\n",
    "OUTPUTS_DIR = REPO_ROOT / \"outputs\"\n",
    "res_directories = {\n",
    "    \"baseline0\": OUTPUTS_DIR / \"2025-06-29/16-45-06\",\n",
    "    \"baseline1\": OUTPUTS_DIR / \"2025-07-02/16-17-40\",\n",
    "    \"baseline2\": OUTPUTS_DIR / \"2025-07-02/15-04-34\",\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert results to dataframes\n",
    "res_dataframes = {k: result_dir_to_df(v)[0] for k, v in res_directories.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>puzzle_id</th>\n",
       "      <th>branch_id</th>\n",
       "      <th>thread_id</th>\n",
       "      <th>step_idx</th>\n",
       "      <th>is_train</th>\n",
       "      <th>case_idx</th>\n",
       "      <th>correct</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>97239e3d</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>97239e3d</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>97239e3d</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>97239e3d</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>97239e3d</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>836</th>\n",
       "      <td>9b4c17c4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>837</th>\n",
       "      <td>9b4c17c4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>838</th>\n",
       "      <td>070dd51e</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>839</th>\n",
       "      <td>070dd51e</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>840</th>\n",
       "      <td>070dd51e</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>841 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    puzzle_id branch_id thread_id  step_idx  is_train  case_idx  correct\n",
       "0    97239e3d         0         0         0      True         0     True\n",
       "1    97239e3d         0         0         0      True         1    False\n",
       "2    97239e3d         0         0         0      True         2    False\n",
       "3    97239e3d         0         0         0     False         0    False\n",
       "4    97239e3d         0         0         1      True         0     True\n",
       "..        ...       ...       ...       ...       ...       ...      ...\n",
       "836  9b4c17c4         0         0         0     False         0     True\n",
       "837  9b4c17c4         0         0         0     False         1     True\n",
       "838  070dd51e         0         0         0      True         0     True\n",
       "839  070dd51e         0         0         0      True         1     True\n",
       "840  070dd51e         0         0         0     False         0     True\n",
       "\n",
       "[841 rows x 7 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# each dataframe has a row per test case result\n",
    "# almost all columns are identifying what puzzle, what code gen attempt, and what test case\n",
    "# then there's the \"correct\" column which is True if the test case was solved correctly\n",
    "next(iter(res_dataframes.values()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 3: Aggregate Token Usage\n",
    "\n",
    "(across different runs for the same setting)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _token_usage_from_run_dir(res_dir: Path) -> dict:\n",
    "    # yields {iter_num: tok_usg_dict} per iteration\n",
    "    aggregated_token_usage = {}\n",
    "    for iteration_dir in res_dir.glob(\"iteration_*\"):\n",
    "        tok_usg_path = iteration_dir / \"token_usage.json\"\n",
    "        if not tok_usg_path.exists():\n",
    "            print(f\"Token usage file not found in {iteration_dir}. Skipping.\")\n",
    "            continue\n",
    "        m = re.search(r\"iteration_(\\d)\", iteration_dir.name)\n",
    "        if m is None:\n",
    "            print(\n",
    "                f\"Could not extract iteration number from {tok_usg_path.name}. Skipping.\"\n",
    "            )\n",
    "            continue\n",
    "        iteration_num = int(m.group(1))\n",
    "        tok_usg = read_json(tok_usg_path)\n",
    "        aggregated_token_usage[iteration_num] = tok_usg[\"after\"]\n",
    "    return aggregated_token_usage\n",
    "\n",
    "\n",
    "def _combine_token_usage_dicts(\n",
    "    agg: dict[str, dict[str, int]], new_usg: dict[str, dict[str, int]]\n",
    ") -> None:\n",
    "    for model, usage in new_usg.items():\n",
    "        if model not in agg:\n",
    "            agg[model] = defaultdict(int)\n",
    "        for cat, count in usage.items():\n",
    "            agg[model][cat] += count\n",
    "\n",
    "\n",
    "def aggregate_token_usage(per_dir_usage: list[dict]) -> dict:\n",
    "    max_iter_num = max([max(d.keys()) for d in per_dir_usage])\n",
    "    aggregated_token_usage = defaultdict(dict)\n",
    "    for i in range(1, max_iter_num + 1):\n",
    "        # maps model -> usage\n",
    "        # where usage is str -> int\n",
    "        aggregated_token_usage[i] = defaultdict(lambda: defaultdict(int))\n",
    "        for tok_usg in per_dir_usage:\n",
    "            if i not in tok_usg:\n",
    "                continue\n",
    "            _combine_token_usage_dicts(aggregated_token_usage[i], tok_usg[i])\n",
    "    return aggregated_token_usage\n",
    "\n",
    "\n",
    "def aggregate_token_usage_from_dirs(run_dirs: list[Path]) -> dict:\n",
    "    per_dir_usage = [_token_usage_from_run_dir(rd) for rd in run_dirs]\n",
    "    return aggregate_token_usage(per_dir_usage)\n",
    "\n",
    "\n",
    "def _tok_usg_from_cont_run(run_dir: Path) -> dict:\n",
    "    aggregated_token_usage = {}\n",
    "    for iter_dir in run_dir.glob(\"iter_*\"):\n",
    "        max_batch = -1\n",
    "        max_batch_dir = None\n",
    "        for batch_dir in iter_dir.glob(\"batch_*\"):\n",
    "            m = re.search(r\"batch_(\\d+)\", batch_dir.name)\n",
    "            batch_num = int(m.group(1))\n",
    "            if batch_num > max_batch:\n",
    "                max_batch = batch_num\n",
    "                max_batch_dir = batch_dir\n",
    "        iter_num = int(re.search(r\"iter_(\\d+)\", iter_dir.name).group(1))\n",
    "        tok_usg = read_json(max_batch_dir / \"token_usage.json\")\n",
    "        aggregated_token_usage[iter_num] = tok_usg[\"after\"]\n",
    "    return aggregated_token_usage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "baseline_token_usage = aggregate_token_usage_from_dirs(list(res_directories.values()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 4: Score Results\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for each of the 3 way solves build a df:\n",
    "# - rows: iteration 1, 2, 3\n",
    "# - columns: run0, run1, run2, singe_run_avg, single_run_std, run0and1, run1and2, run0and2, two_run_avg, two_run_std\n",
    "def create_multi_run_summary(\n",
    "    prefix: str,\n",
    "    runs: list[Path] | list[pd.DataFrame],\n",
    "    uid_subset: list[str] | None = None,\n",
    "    official_or_strict: str = \"official\",\n",
    "    **kwargs,\n",
    ") -> pd.DataFrame:\n",
    "    fixed_runs = []\n",
    "    # uid_subset = set(uid_subset) if uid_subset is not None else None\n",
    "    for run in runs:\n",
    "        if isinstance(run, Path):\n",
    "            run_df, _ = result_dir_to_df(run)\n",
    "            run = run_df\n",
    "        if uid_subset is not None:\n",
    "            run = run[run[\"puzzle_id\"].isin(uid_subset)].copy()\n",
    "        fixed_runs.append(run)\n",
    "    runs = fixed_runs\n",
    "\n",
    "    # get official score per iteration for each individual run\n",
    "    cols = {}\n",
    "    for i, run in enumerate(runs):\n",
    "        cols[f\"run{i}\"] = get_agg_score_per_iteration(\n",
    "            run, prefix, official_or_strict, **kwargs\n",
    "        )\n",
    "\n",
    "    # get official score per best of 2 ensemble\n",
    "    for i, j in combinations(range(len(runs)), 2):\n",
    "        run0 = runs[i]\n",
    "        run1 = runs[j]\n",
    "        run0_duplicate = run0.copy()\n",
    "        run0_duplicate[\"thread_id\"] = 0\n",
    "        run1_duplicate = run1.copy()\n",
    "        run1_duplicate[\"thread_id\"] = 1\n",
    "        duplicate_df = pd.concat([run0_duplicate, run1_duplicate], ignore_index=True)\n",
    "        ensemble_scores = get_agg_score_per_iteration(\n",
    "            duplicate_df, prefix, official_or_strict, **kwargs\n",
    "        )\n",
    "        cols[f\"ensemble_{i}_{j}\"] = ensemble_scores\n",
    "\n",
    "    # get official score of all 3 runs combined\n",
    "    for i in range(3):\n",
    "        runs[i][\"thread_id\"] = str(i)\n",
    "    combined_df = pd.concat(runs, ignore_index=True)\n",
    "    combined_scores = get_agg_score_per_iteration(\n",
    "        combined_df, prefix, official_or_strict, **kwargs\n",
    "    )\n",
    "    cols[\"ensemble_all\"] = combined_scores\n",
    "\n",
    "    # combine into a DataFrame\n",
    "    summary_df = pd.DataFrame(cols)\n",
    "    # calculate averages and stds for each run\n",
    "    sr_cols = [c for c in summary_df.columns if \"run\" in c]\n",
    "    en_cols = [c for c in summary_df.columns if \"ensemble\" in c and \"all\" not in c]\n",
    "    sr_df = summary_df[sr_cols]\n",
    "    en_df = summary_df[en_cols]\n",
    "    summary_df[\"single_run_avg\"] = sr_df.mean(axis=1)\n",
    "    summary_df[\"single_run_std\"] = sr_df.std(axis=1)\n",
    "    summary_df[\"two_run_avg\"] = en_df.mean(axis=1)\n",
    "    summary_df[\"two_run_std\"] = en_df.std(axis=1)\n",
    "\n",
    "    return summary_df.T\n",
    "\n",
    "\n",
    "def get_agg_score_per_iteration(\n",
    "    case_df: pd.DataFrame, prefix: str, official_or_strict: str = \"official\", **kwargs\n",
    ") -> dict[str, float]:\n",
    "    # get max iteration\n",
    "    max_iteration = case_df[\"step_idx\"].max()\n",
    "    res = {}\n",
    "    for i in range(max_iteration + 1):\n",
    "        filtered_df = case_df[case_df[\"step_idx\"] <= i]\n",
    "        if official_or_strict == \"official\":\n",
    "            iter_scores = official_score(filtered_df, step_selection=\"last\", **kwargs)\n",
    "        else:\n",
    "            iter_scores = strict_score(filtered_df, step_selection=\"last\", **kwargs)\n",
    "        res[f\"{prefix}_i{i}\"] = iter_scores\n",
    "    return res\n",
    "\n",
    "\n",
    "# reformat sumdft, currently index contains \"{settin_name}_i{iteration}\", want multiindex with first level being setting name and second level being iteration\n",
    "setting_iteration_pattern = re.compile(r\"^(.*?)_i(\\d)$\")\n",
    "\n",
    "\n",
    "def get_setting_name_and_iteration(combined_string: str) -> tuple[str, int]:\n",
    "    match = setting_iteration_pattern.match(combined_string)\n",
    "    if match:\n",
    "        setting_name, iteration = match.groups()\n",
    "        return setting_name, int(iteration)\n",
    "    else:\n",
    "        raise ValueError(\n",
    "            f\"String '{combined_string}' does not match the expected pattern.\"\n",
    "        )\n",
    "\n",
    "\n",
    "def create_combined_summary_df(\n",
    "    run_groups: dict[str, list[pd.DataFrame]],\n",
    "    id_subset: list[str] | None = None,\n",
    "    official_or_strict: str = \"official\",\n",
    "    **kwargs,\n",
    ") -> pd.DataFrame:\n",
    "    group_summaries = []\n",
    "    for prefix, runs in run_groups.items():\n",
    "        group_summary = create_multi_run_summary(\n",
    "            prefix,\n",
    "            runs,\n",
    "            uid_subset=id_subset,\n",
    "            official_or_strict=official_or_strict,\n",
    "            **kwargs,\n",
    "        )\n",
    "        group_summaries.append(group_summary)\n",
    "    combined_summary = pd.concat(group_summaries, axis=1)\n",
    "\n",
    "    # reorder columns\n",
    "    # target order: [run0, ..., ensemble_0, ..., single_run_avg, two_run_avg, ensemble_all, single_run_std, two_run_std]\n",
    "    sumdft = combined_summary.T\n",
    "    col_order = (\n",
    "        [f\"run{i}\" for i in range(3)]\n",
    "        + [f\"ensemble_{i}_{j}\" for i, j in combinations(range(3), 2)]\n",
    "        + [\n",
    "            \"single_run_avg\",\n",
    "            \"two_run_avg\",\n",
    "            \"ensemble_all\",\n",
    "            \"single_run_std\",\n",
    "            \"two_run_std\",\n",
    "        ]\n",
    "    )\n",
    "    sumdft = sumdft[col_order]\n",
    "\n",
    "    sumdft.index = pd.MultiIndex.from_tuples(\n",
    "        [get_setting_name_and_iteration(idx) for idx in sumdft.index],\n",
    "        names=[\"setting\", \"iteration\"],\n",
    "    )\n",
    "\n",
    "    return sumdft"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create run groups\n",
    "run_groups = {\n",
    "    \"baseline\": list(res_dataframes.values()),\n",
    "}\n",
    "\n",
    "# expected input format:\n",
    "# dict[str, list[pd.DataFrame]]\n",
    "# mapping a setting name to a list of dataframes corresponding to different runs of that setting\n",
    "\n",
    "official_score_summary = create_combined_summary_df(\n",
    "    run_groups,\n",
    "    id_subset=None,\n",
    "    official_or_strict=\"official\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>run0</th>\n",
       "      <th>run1</th>\n",
       "      <th>run2</th>\n",
       "      <th>ensemble_0_1</th>\n",
       "      <th>ensemble_0_2</th>\n",
       "      <th>ensemble_1_2</th>\n",
       "      <th>single_run_avg</th>\n",
       "      <th>two_run_avg</th>\n",
       "      <th>ensemble_all</th>\n",
       "      <th>single_run_std</th>\n",
       "      <th>two_run_std</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>setting</th>\n",
       "      <th>iteration</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">baseline</th>\n",
       "      <th>0</th>\n",
       "      <td>46.0</td>\n",
       "      <td>45.5</td>\n",
       "      <td>47.5</td>\n",
       "      <td>51.5</td>\n",
       "      <td>57.0</td>\n",
       "      <td>57.0</td>\n",
       "      <td>46.333333</td>\n",
       "      <td>55.166667</td>\n",
       "      <td>59.5</td>\n",
       "      <td>1.040833</td>\n",
       "      <td>3.175426</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>57.0</td>\n",
       "      <td>55.5</td>\n",
       "      <td>61.0</td>\n",
       "      <td>62.5</td>\n",
       "      <td>70.0</td>\n",
       "      <td>67.5</td>\n",
       "      <td>57.833333</td>\n",
       "      <td>66.666667</td>\n",
       "      <td>71.5</td>\n",
       "      <td>2.843120</td>\n",
       "      <td>3.818813</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>59.5</td>\n",
       "      <td>59.0</td>\n",
       "      <td>65.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>71.0</td>\n",
       "      <td>61.166667</td>\n",
       "      <td>69.000000</td>\n",
       "      <td>73.0</td>\n",
       "      <td>3.329164</td>\n",
       "      <td>2.645751</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    run0  run1  run2  ensemble_0_1  ensemble_0_2  \\\n",
       "setting  iteration                                                 \n",
       "baseline 0          46.0  45.5  47.5          51.5          57.0   \n",
       "         1          57.0  55.5  61.0          62.5          70.0   \n",
       "         2          59.5  59.0  65.0          66.0          70.0   \n",
       "\n",
       "                    ensemble_1_2  single_run_avg  two_run_avg  ensemble_all  \\\n",
       "setting  iteration                                                            \n",
       "baseline 0                  57.0       46.333333    55.166667          59.5   \n",
       "         1                  67.5       57.833333    66.666667          71.5   \n",
       "         2                  71.0       61.166667    69.000000          73.0   \n",
       "\n",
       "                    single_run_std  two_run_std  \n",
       "setting  iteration                               \n",
       "baseline 0                1.040833     3.175426  \n",
       "         1                2.843120     3.818813  \n",
       "         2                3.329164     2.645751  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "official_score_summary"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "arc_agi",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
