{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f6996290-87f4-4a64-834f-6acbbfee10f5",
   "metadata": {},
   "source": [
    "# Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "00fe7ecd-fb6f-4af1-b539-0fb3702cdebb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "import os\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import warnings\n",
    "warnings.simplefilter(action='ignore', category=FutureWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "fbd4cadb-b09b-4a94-b377-5209e2014556",
   "metadata": {},
   "outputs": [],
   "source": [
    "# COLORS = [\"#333333\", \"#999ea2\", \"#6fa2d0\", \"#4d759a\", \"#073763\"][::-1]\n",
    "# COLORS = [\"#1B262C\", \"#0F4C75\", \"#3282B8\", \"#BBE1FA\"]\n",
    "COLORS = [\"#000057\", \"#4d759a\", \"#3282B8\",  \"#999ea2\", \"#4f4f4f\", \"#1B262C\"]\n",
    "\n",
    "MY_PALLETE = sns.color_palette(\"colorblind\")\n",
    "sns.set_palette(MY_PALLETE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4fd16c8b-c9a9-4773-972b-d053abdb0b78",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<svg  width=\"550\" height=\"55\"><rect x=\"0\" y=\"0\" width=\"55\" height=\"55\" style=\"fill:#0173b2;stroke-width:2;stroke:rgb(255,255,255)\"/><rect x=\"55\" y=\"0\" width=\"55\" height=\"55\" style=\"fill:#de8f05;stroke-width:2;stroke:rgb(255,255,255)\"/><rect x=\"110\" y=\"0\" width=\"55\" height=\"55\" style=\"fill:#029e73;stroke-width:2;stroke:rgb(255,255,255)\"/><rect x=\"165\" y=\"0\" width=\"55\" height=\"55\" style=\"fill:#d55e00;stroke-width:2;stroke:rgb(255,255,255)\"/><rect x=\"220\" y=\"0\" width=\"55\" height=\"55\" style=\"fill:#cc78bc;stroke-width:2;stroke:rgb(255,255,255)\"/><rect x=\"275\" y=\"0\" width=\"55\" height=\"55\" style=\"fill:#ca9161;stroke-width:2;stroke:rgb(255,255,255)\"/><rect x=\"330\" y=\"0\" width=\"55\" height=\"55\" style=\"fill:#fbafe4;stroke-width:2;stroke:rgb(255,255,255)\"/><rect x=\"385\" y=\"0\" width=\"55\" height=\"55\" style=\"fill:#949494;stroke-width:2;stroke:rgb(255,255,255)\"/><rect x=\"440\" y=\"0\" width=\"55\" height=\"55\" style=\"fill:#ece133;stroke-width:2;stroke:rgb(255,255,255)\"/><rect x=\"495\" y=\"0\" width=\"55\" height=\"55\" style=\"fill:#56b4e9;stroke-width:2;stroke:rgb(255,255,255)\"/></svg>"
      ],
      "text/plain": [
       "[(0.00392156862745098, 0.45098039215686275, 0.6980392156862745),\n",
       " (0.8705882352941177, 0.5607843137254902, 0.0196078431372549),\n",
       " (0.00784313725490196, 0.6196078431372549, 0.45098039215686275),\n",
       " (0.8352941176470589, 0.3686274509803922, 0.0),\n",
       " (0.8, 0.47058823529411764, 0.7372549019607844),\n",
       " (0.792156862745098, 0.5686274509803921, 0.3803921568627451),\n",
       " (0.984313725490196, 0.6862745098039216, 0.8941176470588236),\n",
       " (0.5803921568627451, 0.5803921568627451, 0.5803921568627451),\n",
       " (0.9254901960784314, 0.8823529411764706, 0.2),\n",
       " (0.33725490196078434, 0.7058823529411765, 0.9137254901960784)]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "MY_PALLETE"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "40597d41-dea5-4f95-b8f4-fd32a35f273c",
   "metadata": {},
   "source": [
    "# Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8eefd75e-c916-4998-a0cd-423c82d786bd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_20659/2620244375.py:1: DtypeWarning: Columns (19,20,60,200,201) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(\"../pipeline_experiment_result.csv\")\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>seed_i</th>\n",
       "      <th>config_id</th>\n",
       "      <th>fold</th>\n",
       "      <th>config_hash</th>\n",
       "      <th>duration</th>\n",
       "      <th>start_time</th>\n",
       "      <th>end_time</th>\n",
       "      <th>status</th>\n",
       "      <th>seed</th>\n",
       "      <th>budget</th>\n",
       "      <th>...</th>\n",
       "      <th>classifier:sgd:power_t</th>\n",
       "      <th>feature_preprocessor:nystroem_sampler:coef0</th>\n",
       "      <th>feature_preprocessor:nystroem_sampler:degree</th>\n",
       "      <th>classifier:multinomial_nb:alpha</th>\n",
       "      <th>classifier:multinomial_nb:fit_prior</th>\n",
       "      <th>classifier:sgd:l1_ratio</th>\n",
       "      <th>dataset</th>\n",
       "      <th>data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__</th>\n",
       "      <th>data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__</th>\n",
       "      <th>data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>1.913694</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>1.911491</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>2.011853</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>2.025221</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>2.001899</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037065</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>6</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.627506</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037066</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>7</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.571584</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037067</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>8</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.475392</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037068</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>9</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.478533</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037069</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>10</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.529982</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1037070 rows × 203 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         seed_i  config_id  fold                       config_hash  duration  \\\n",
       "0             0          0     1  34a76cf2da6c41f8646867818eea56d6  1.913694   \n",
       "1             0          0     2  34a76cf2da6c41f8646867818eea56d6  1.911491   \n",
       "2             0          0     3  34a76cf2da6c41f8646867818eea56d6  2.011853   \n",
       "3             0          0     4  34a76cf2da6c41f8646867818eea56d6  2.025221   \n",
       "4             0          0     5  34a76cf2da6c41f8646867818eea56d6  2.001899   \n",
       "...         ...        ...   ...                               ...       ...   \n",
       "1037065       0        499     6  19a650aed428ed8e5fe1619c61bc7118  6.627506   \n",
       "1037066       0        499     7  19a650aed428ed8e5fe1619c61bc7118  6.571584   \n",
       "1037067       0        499     8  19a650aed428ed8e5fe1619c61bc7118  6.475392   \n",
       "1037068       0        499     9  19a650aed428ed8e5fe1619c61bc7118  6.478533   \n",
       "1037069       0        499    10  19a650aed428ed8e5fe1619c61bc7118  6.529982   \n",
       "\n",
       "           start_time      end_time              status  seed  budget  ...  \\\n",
       "0        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "2        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "3        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "4        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "...               ...           ...                 ...   ...     ...  ...   \n",
       "1037065  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1037066  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1037067  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1037068  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1037069  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "\n",
       "        classifier:sgd:power_t feature_preprocessor:nystroem_sampler:coef0  \\\n",
       "0                          NaN                                         NaN   \n",
       "1                          NaN                                         NaN   \n",
       "2                          NaN                                         NaN   \n",
       "3                          NaN                                         NaN   \n",
       "4                          NaN                                         NaN   \n",
       "...                        ...                                         ...   \n",
       "1037065                    NaN                                         NaN   \n",
       "1037066                    NaN                                         NaN   \n",
       "1037067                    NaN                                         NaN   \n",
       "1037068                    NaN                                         NaN   \n",
       "1037069                    NaN                                         NaN   \n",
       "\n",
       "        feature_preprocessor:nystroem_sampler:degree  \\\n",
       "0                                                NaN   \n",
       "1                                                NaN   \n",
       "2                                                NaN   \n",
       "3                                                NaN   \n",
       "4                                                NaN   \n",
       "...                                              ...   \n",
       "1037065                                          NaN   \n",
       "1037066                                          NaN   \n",
       "1037067                                          NaN   \n",
       "1037068                                          NaN   \n",
       "1037069                                          NaN   \n",
       "\n",
       "        classifier:multinomial_nb:alpha  classifier:multinomial_nb:fit_prior  \\\n",
       "0                                   NaN                                  NaN   \n",
       "1                                   NaN                                  NaN   \n",
       "2                                   NaN                                  NaN   \n",
       "3                                   NaN                                  NaN   \n",
       "4                                   NaN                                  NaN   \n",
       "...                                 ...                                  ...   \n",
       "1037065                             NaN                                  NaN   \n",
       "1037066                             NaN                                  NaN   \n",
       "1037067                             NaN                                  NaN   \n",
       "1037068                             NaN                                  NaN   \n",
       "1037069                             NaN                                  NaN   \n",
       "\n",
       "        classifier:sgd:l1_ratio        dataset  \\\n",
       "0                           NaN  dataset_40985   \n",
       "1                           NaN  dataset_40985   \n",
       "2                           NaN  dataset_40985   \n",
       "3                           NaN  dataset_40985   \n",
       "4                           NaN  dataset_40985   \n",
       "...                         ...            ...   \n",
       "1037065                     NaN   dataset_1161   \n",
       "1037066                     NaN   dataset_1161   \n",
       "1037067                     NaN   dataset_1161   \n",
       "1037068                     NaN   dataset_1161   \n",
       "1037069                     NaN   dataset_1161   \n",
       "\n",
       "        data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__  \\\n",
       "0                                                      NaN                                       \n",
       "1                                                      NaN                                       \n",
       "2                                                      NaN                                       \n",
       "3                                                      NaN                                       \n",
       "4                                                      NaN                                       \n",
       "...                                                    ...                                       \n",
       "1037065                                                NaN                                       \n",
       "1037066                                                NaN                                       \n",
       "1037067                                                NaN                                       \n",
       "1037068                                                NaN                                       \n",
       "1037069                                                NaN                                       \n",
       "\n",
       "         data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__  \\\n",
       "0                                                      NaN                                        \n",
       "1                                                      NaN                                        \n",
       "2                                                      NaN                                        \n",
       "3                                                      NaN                                        \n",
       "4                                                      NaN                                        \n",
       "...                                                    ...                                        \n",
       "1037065                                                NaN                                        \n",
       "1037066                                                NaN                                        \n",
       "1037067                                                NaN                                        \n",
       "1037068                                                NaN                                        \n",
       "1037069                                                NaN                                        \n",
       "\n",
       "        data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction  \n",
       "0                                                      NaN                                                               \n",
       "1                                                      NaN                                                               \n",
       "2                                                      NaN                                                               \n",
       "3                                                      NaN                                                               \n",
       "4                                                      NaN                                                               \n",
       "...                                                    ...                                                               \n",
       "1037065                                                NaN                                                               \n",
       "1037066                                                NaN                                                               \n",
       "1037067                                                NaN                                                               \n",
       "1037068                                                NaN                                                               \n",
       "1037069                                                NaN                                                               \n",
       "\n",
       "[1037070 rows x 203 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(\"../pipeline_experiment_result.csv\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "315ec505-e576-4dc7-b9d6-c537a6ef3685",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1037070, 203)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0f048e4a-14ce-42b3-a6e2-03b0a08b4310",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['seed_i',\n",
       " 'config_id',\n",
       " 'fold',\n",
       " 'config_hash',\n",
       " 'duration',\n",
       " 'start_time',\n",
       " 'end_time',\n",
       " 'status',\n",
       " 'seed',\n",
       " 'budget',\n",
       " 'balancing:strategy',\n",
       " 'classifier:__choice__',\n",
       " 'data_preprocessor:__choice__',\n",
       " 'feature_preprocessor:__choice__',\n",
       " 'classifier:passive_aggressive:C',\n",
       " 'classifier:passive_aggressive:average',\n",
       " 'classifier:passive_aggressive:fit_intercept',\n",
       " 'classifier:passive_aggressive:loss',\n",
       " 'classifier:passive_aggressive:tol',\n",
       " 'data_preprocessor:feature_type:numerical_transformer:imputation:strategy',\n",
       " 'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__',\n",
       " 'feature_preprocessor:select_percentile_classification:percentile',\n",
       " 'feature_preprocessor:select_percentile_classification:score_func',\n",
       " 'accuracy_train',\n",
       " 'balanced_accuracy_train',\n",
       " 'f1_macro_train',\n",
       " 'f1_weighted_train',\n",
       " 'precision_macro_train',\n",
       " 'precision_weighted_train',\n",
       " 'recall_macro_train',\n",
       " 'recall_weighted_train',\n",
       " 'accuracy_val',\n",
       " 'balanced_accuracy_val',\n",
       " 'f1_macro_val',\n",
       " 'f1_weighted_val',\n",
       " 'precision_macro_val',\n",
       " 'precision_weighted_val',\n",
       " 'recall_macro_val',\n",
       " 'recall_weighted_val',\n",
       " 'accuracy_test',\n",
       " 'balanced_accuracy_test',\n",
       " 'f1_macro_test',\n",
       " 'f1_weighted_test',\n",
       " 'precision_macro_test',\n",
       " 'precision_weighted_test',\n",
       " 'recall_macro_test',\n",
       " 'recall_weighted_test',\n",
       " 'feature_preprocessor:select_rates_classification:alpha',\n",
       " 'feature_preprocessor:select_rates_classification:score_func',\n",
       " 'classifier:decision_tree:criterion',\n",
       " 'classifier:decision_tree:max_depth_factor',\n",
       " 'classifier:decision_tree:max_features',\n",
       " 'classifier:decision_tree:max_leaf_nodes',\n",
       " 'classifier:decision_tree:min_impurity_decrease',\n",
       " 'classifier:decision_tree:min_samples_leaf',\n",
       " 'classifier:decision_tree:min_samples_split',\n",
       " 'classifier:decision_tree:min_weight_fraction_leaf',\n",
       " 'feature_preprocessor:pca:keep_variance',\n",
       " 'feature_preprocessor:pca:whiten',\n",
       " 'data_preprocessor:feature_type:numerical_transformer:rescaling:quantile_transformer:n_quantiles',\n",
       " 'data_preprocessor:feature_type:numerical_transformer:rescaling:quantile_transformer:output_distribution',\n",
       " 'classifier:qda:reg_param',\n",
       " 'feature_preprocessor:feature_agglomeration:affinity',\n",
       " 'feature_preprocessor:feature_agglomeration:linkage',\n",
       " 'feature_preprocessor:feature_agglomeration:n_clusters',\n",
       " 'feature_preprocessor:feature_agglomeration:pooling_func',\n",
       " 'classifier:lda:shrinkage',\n",
       " 'classifier:lda:tol',\n",
       " 'feature_preprocessor:kernel_pca:kernel',\n",
       " 'feature_preprocessor:kernel_pca:n_components',\n",
       " 'classifier:lda:shrinkage_factor',\n",
       " 'feature_preprocessor:kernel_pca:gamma',\n",
       " 'classifier:bernoulli_nb:alpha',\n",
       " 'classifier:bernoulli_nb:fit_prior',\n",
       " 'feature_preprocessor:select_rates_classification:mode',\n",
       " 'classifier:mlp:activation',\n",
       " 'classifier:mlp:alpha',\n",
       " 'classifier:mlp:batch_size',\n",
       " 'classifier:mlp:beta_1',\n",
       " 'classifier:mlp:beta_2',\n",
       " 'classifier:mlp:early_stopping',\n",
       " 'classifier:mlp:epsilon',\n",
       " 'classifier:mlp:hidden_layer_depth',\n",
       " 'classifier:mlp:learning_rate_init',\n",
       " 'classifier:mlp:n_iter_no_change',\n",
       " 'classifier:mlp:num_nodes_per_layer',\n",
       " 'classifier:mlp:shuffle',\n",
       " 'classifier:mlp:solver',\n",
       " 'classifier:mlp:tol',\n",
       " 'classifier:gradient_boosting:early_stop',\n",
       " 'classifier:gradient_boosting:l2_regularization',\n",
       " 'classifier:gradient_boosting:learning_rate',\n",
       " 'classifier:gradient_boosting:loss',\n",
       " 'classifier:gradient_boosting:max_bins',\n",
       " 'classifier:gradient_boosting:max_depth',\n",
       " 'classifier:gradient_boosting:max_leaf_nodes',\n",
       " 'classifier:gradient_boosting:min_samples_leaf',\n",
       " 'classifier:gradient_boosting:scoring',\n",
       " 'classifier:gradient_boosting:tol',\n",
       " 'feature_preprocessor:extra_trees_preproc_for_classification:bootstrap',\n",
       " 'feature_preprocessor:extra_trees_preproc_for_classification:criterion',\n",
       " 'feature_preprocessor:extra_trees_preproc_for_classification:max_depth',\n",
       " 'feature_preprocessor:extra_trees_preproc_for_classification:max_features',\n",
       " 'feature_preprocessor:extra_trees_preproc_for_classification:max_leaf_nodes',\n",
       " 'feature_preprocessor:extra_trees_preproc_for_classification:min_impurity_decrease',\n",
       " 'feature_preprocessor:extra_trees_preproc_for_classification:min_samples_leaf',\n",
       " 'feature_preprocessor:extra_trees_preproc_for_classification:min_samples_split',\n",
       " 'feature_preprocessor:extra_trees_preproc_for_classification:min_weight_fraction_leaf',\n",
       " 'feature_preprocessor:extra_trees_preproc_for_classification:n_estimators',\n",
       " 'classifier:gradient_boosting:n_iter_no_change',\n",
       " 'classifier:gradient_boosting:validation_fraction',\n",
       " 'classifier:adaboost:algorithm',\n",
       " 'classifier:adaboost:learning_rate',\n",
       " 'classifier:adaboost:max_depth',\n",
       " 'classifier:adaboost:n_estimators',\n",
       " 'feature_preprocessor:random_trees_embedding:bootstrap',\n",
       " 'feature_preprocessor:random_trees_embedding:max_depth',\n",
       " 'feature_preprocessor:random_trees_embedding:max_leaf_nodes',\n",
       " 'feature_preprocessor:random_trees_embedding:min_samples_leaf',\n",
       " 'feature_preprocessor:random_trees_embedding:min_samples_split',\n",
       " 'feature_preprocessor:random_trees_embedding:min_weight_fraction_leaf',\n",
       " 'feature_preprocessor:random_trees_embedding:n_estimators',\n",
       " 'data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_max',\n",
       " 'data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_min',\n",
       " 'feature_preprocessor:polynomial:degree',\n",
       " 'feature_preprocessor:polynomial:include_bias',\n",
       " 'feature_preprocessor:polynomial:interaction_only',\n",
       " 'classifier:extra_trees:bootstrap',\n",
       " 'classifier:extra_trees:criterion',\n",
       " 'classifier:extra_trees:max_depth',\n",
       " 'classifier:extra_trees:max_features',\n",
       " 'classifier:extra_trees:max_leaf_nodes',\n",
       " 'classifier:extra_trees:min_impurity_decrease',\n",
       " 'classifier:extra_trees:min_samples_leaf',\n",
       " 'classifier:extra_trees:min_samples_split',\n",
       " 'classifier:extra_trees:min_weight_fraction_leaf',\n",
       " 'classifier:mlp:validation_fraction',\n",
       " 'classifier:libsvm_svc:C',\n",
       " 'classifier:libsvm_svc:gamma',\n",
       " 'classifier:libsvm_svc:kernel',\n",
       " 'classifier:libsvm_svc:max_iter',\n",
       " 'classifier:libsvm_svc:shrinking',\n",
       " 'classifier:libsvm_svc:tol',\n",
       " 'classifier:libsvm_svc:coef0',\n",
       " 'feature_preprocessor:kitchen_sinks:gamma',\n",
       " 'feature_preprocessor:kitchen_sinks:n_components',\n",
       " 'classifier:k_nearest_neighbors:n_neighbors',\n",
       " 'classifier:k_nearest_neighbors:p',\n",
       " 'classifier:k_nearest_neighbors:weights',\n",
       " 'classifier:libsvm_svc:degree',\n",
       " 'classifier:liblinear_svc:C',\n",
       " 'classifier:liblinear_svc:dual',\n",
       " 'classifier:liblinear_svc:fit_intercept',\n",
       " 'classifier:liblinear_svc:intercept_scaling',\n",
       " 'classifier:liblinear_svc:loss',\n",
       " 'classifier:liblinear_svc:multi_class',\n",
       " 'classifier:liblinear_svc:penalty',\n",
       " 'classifier:liblinear_svc:tol',\n",
       " 'feature_preprocessor:liblinear_svc_preprocessor:C',\n",
       " 'feature_preprocessor:liblinear_svc_preprocessor:dual',\n",
       " 'feature_preprocessor:liblinear_svc_preprocessor:fit_intercept',\n",
       " 'feature_preprocessor:liblinear_svc_preprocessor:intercept_scaling',\n",
       " 'feature_preprocessor:liblinear_svc_preprocessor:loss',\n",
       " 'feature_preprocessor:liblinear_svc_preprocessor:multi_class',\n",
       " 'feature_preprocessor:liblinear_svc_preprocessor:penalty',\n",
       " 'feature_preprocessor:liblinear_svc_preprocessor:tol',\n",
       " 'classifier:random_forest:bootstrap',\n",
       " 'classifier:random_forest:criterion',\n",
       " 'classifier:random_forest:max_depth',\n",
       " 'classifier:random_forest:max_features',\n",
       " 'classifier:random_forest:max_leaf_nodes',\n",
       " 'classifier:random_forest:min_impurity_decrease',\n",
       " 'classifier:random_forest:min_samples_leaf',\n",
       " 'classifier:random_forest:min_samples_split',\n",
       " 'classifier:random_forest:min_weight_fraction_leaf',\n",
       " 'feature_preprocessor:fast_ica:algorithm',\n",
       " 'feature_preprocessor:fast_ica:fun',\n",
       " 'feature_preprocessor:fast_ica:whiten',\n",
       " 'feature_preprocessor:fast_ica:n_components',\n",
       " 'feature_preprocessor:kernel_pca:coef0',\n",
       " 'feature_preprocessor:kernel_pca:degree',\n",
       " 'classifier:sgd:alpha',\n",
       " 'classifier:sgd:average',\n",
       " 'classifier:sgd:fit_intercept',\n",
       " 'classifier:sgd:learning_rate',\n",
       " 'classifier:sgd:loss',\n",
       " 'classifier:sgd:penalty',\n",
       " 'classifier:sgd:tol',\n",
       " 'feature_preprocessor:nystroem_sampler:kernel',\n",
       " 'feature_preprocessor:nystroem_sampler:n_components',\n",
       " 'feature_preprocessor:nystroem_sampler:gamma',\n",
       " 'classifier:sgd:eta0',\n",
       " 'classifier:sgd:epsilon',\n",
       " 'classifier:sgd:power_t',\n",
       " 'feature_preprocessor:nystroem_sampler:coef0',\n",
       " 'feature_preprocessor:nystroem_sampler:degree',\n",
       " 'classifier:multinomial_nb:alpha',\n",
       " 'classifier:multinomial_nb:fit_prior',\n",
       " 'classifier:sgd:l1_ratio',\n",
       " 'dataset',\n",
       " 'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__',\n",
       " 'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__',\n",
       " 'data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show all column names of `df` as a plain Python list.\n",
    "df.columns.tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0652cca9-48c4-4ace-bdd3-c3a336494a4b",
   "metadata": {},
   "source": [
    "# Statistical Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e11024f6-fa91-45ac-acf2-3ed20b69e0ee",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>seed_i</th>\n",
       "      <th>config_id</th>\n",
       "      <th>fold</th>\n",
       "      <th>config_hash</th>\n",
       "      <th>duration</th>\n",
       "      <th>start_time</th>\n",
       "      <th>end_time</th>\n",
       "      <th>status</th>\n",
       "      <th>seed</th>\n",
       "      <th>budget</th>\n",
       "      <th>...</th>\n",
       "      <th>feature_preprocessor:nystroem_sampler:coef0</th>\n",
       "      <th>feature_preprocessor:nystroem_sampler:degree</th>\n",
       "      <th>classifier:multinomial_nb:alpha</th>\n",
       "      <th>classifier:multinomial_nb:fit_prior</th>\n",
       "      <th>classifier:sgd:l1_ratio</th>\n",
       "      <th>dataset</th>\n",
       "      <th>data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__</th>\n",
       "      <th>data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__</th>\n",
       "      <th>data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction</th>\n",
       "      <th>pipelines</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>1.913694</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>passive_aggressive + select_percentile_classif...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>1.911491</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>passive_aggressive + select_percentile_classif...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>2.011853</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>passive_aggressive + select_percentile_classif...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>2.025221</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>passive_aggressive + select_percentile_classif...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>2.001899</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>passive_aggressive + select_percentile_classif...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037065</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>6</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.627506</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>decision_tree + pca</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037066</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>7</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.571584</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>decision_tree + pca</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037067</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>8</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.475392</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>decision_tree + pca</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037068</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>9</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.478533</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>decision_tree + pca</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037069</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>10</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.529982</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>decision_tree + pca</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1037070 rows × 204 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         seed_i  config_id  fold                       config_hash  duration  \\\n",
       "0             0          0     1  34a76cf2da6c41f8646867818eea56d6  1.913694   \n",
       "1             0          0     2  34a76cf2da6c41f8646867818eea56d6  1.911491   \n",
       "2             0          0     3  34a76cf2da6c41f8646867818eea56d6  2.011853   \n",
       "3             0          0     4  34a76cf2da6c41f8646867818eea56d6  2.025221   \n",
       "4             0          0     5  34a76cf2da6c41f8646867818eea56d6  2.001899   \n",
       "...         ...        ...   ...                               ...       ...   \n",
       "1037065       0        499     6  19a650aed428ed8e5fe1619c61bc7118  6.627506   \n",
       "1037066       0        499     7  19a650aed428ed8e5fe1619c61bc7118  6.571584   \n",
       "1037067       0        499     8  19a650aed428ed8e5fe1619c61bc7118  6.475392   \n",
       "1037068       0        499     9  19a650aed428ed8e5fe1619c61bc7118  6.478533   \n",
       "1037069       0        499    10  19a650aed428ed8e5fe1619c61bc7118  6.529982   \n",
       "\n",
       "           start_time      end_time              status  seed  budget  ...  \\\n",
       "0        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "2        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "3        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "4        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "...               ...           ...                 ...   ...     ...  ...   \n",
       "1037065  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1037066  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1037067  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1037068  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1037069  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "\n",
       "        feature_preprocessor:nystroem_sampler:coef0  \\\n",
       "0                                               NaN   \n",
       "1                                               NaN   \n",
       "2                                               NaN   \n",
       "3                                               NaN   \n",
       "4                                               NaN   \n",
       "...                                             ...   \n",
       "1037065                                         NaN   \n",
       "1037066                                         NaN   \n",
       "1037067                                         NaN   \n",
       "1037068                                         NaN   \n",
       "1037069                                         NaN   \n",
       "\n",
       "        feature_preprocessor:nystroem_sampler:degree  \\\n",
       "0                                                NaN   \n",
       "1                                                NaN   \n",
       "2                                                NaN   \n",
       "3                                                NaN   \n",
       "4                                                NaN   \n",
       "...                                              ...   \n",
       "1037065                                          NaN   \n",
       "1037066                                          NaN   \n",
       "1037067                                          NaN   \n",
       "1037068                                          NaN   \n",
       "1037069                                          NaN   \n",
       "\n",
       "        classifier:multinomial_nb:alpha classifier:multinomial_nb:fit_prior  \\\n",
       "0                                   NaN                                 NaN   \n",
       "1                                   NaN                                 NaN   \n",
       "2                                   NaN                                 NaN   \n",
       "3                                   NaN                                 NaN   \n",
       "4                                   NaN                                 NaN   \n",
       "...                                 ...                                 ...   \n",
       "1037065                             NaN                                 NaN   \n",
       "1037066                             NaN                                 NaN   \n",
       "1037067                             NaN                                 NaN   \n",
       "1037068                             NaN                                 NaN   \n",
       "1037069                             NaN                                 NaN   \n",
       "\n",
       "         classifier:sgd:l1_ratio        dataset  \\\n",
       "0                            NaN  dataset_40985   \n",
       "1                            NaN  dataset_40985   \n",
       "2                            NaN  dataset_40985   \n",
       "3                            NaN  dataset_40985   \n",
       "4                            NaN  dataset_40985   \n",
       "...                          ...            ...   \n",
       "1037065                      NaN   dataset_1161   \n",
       "1037066                      NaN   dataset_1161   \n",
       "1037067                      NaN   dataset_1161   \n",
       "1037068                      NaN   dataset_1161   \n",
       "1037069                      NaN   dataset_1161   \n",
       "\n",
       "        data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__  \\\n",
       "0                                                      NaN                                       \n",
       "1                                                      NaN                                       \n",
       "2                                                      NaN                                       \n",
       "3                                                      NaN                                       \n",
       "4                                                      NaN                                       \n",
       "...                                                    ...                                       \n",
       "1037065                                                NaN                                       \n",
       "1037066                                                NaN                                       \n",
       "1037067                                                NaN                                       \n",
       "1037068                                                NaN                                       \n",
       "1037069                                                NaN                                       \n",
       "\n",
       "        data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__  \\\n",
       "0                                                      NaN                                       \n",
       "1                                                      NaN                                       \n",
       "2                                                      NaN                                       \n",
       "3                                                      NaN                                       \n",
       "4                                                      NaN                                       \n",
       "...                                                    ...                                       \n",
       "1037065                                                NaN                                       \n",
       "1037066                                                NaN                                       \n",
       "1037067                                                NaN                                       \n",
       "1037068                                                NaN                                       \n",
       "1037069                                                NaN                                       \n",
       "\n",
       "         data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction  \\\n",
       "0                                                      NaN                                                                 \n",
       "1                                                      NaN                                                                 \n",
       "2                                                      NaN                                                                 \n",
       "3                                                      NaN                                                                 \n",
       "4                                                      NaN                                                                 \n",
       "...                                                    ...                                                                 \n",
       "1037065                                                NaN                                                                 \n",
       "1037066                                                NaN                                                                 \n",
       "1037067                                                NaN                                                                 \n",
       "1037068                                                NaN                                                                 \n",
       "1037069                                                NaN                                                                 \n",
       "\n",
       "                                                 pipelines  \n",
       "0        passive_aggressive + select_percentile_classif...  \n",
       "1        passive_aggressive + select_percentile_classif...  \n",
       "2        passive_aggressive + select_percentile_classif...  \n",
       "3        passive_aggressive + select_percentile_classif...  \n",
       "4        passive_aggressive + select_percentile_classif...  \n",
       "...                                                    ...  \n",
       "1037065                                decision_tree + pca  \n",
       "1037066                                decision_tree + pca  \n",
       "1037067                                decision_tree + pca  \n",
       "1037068                                decision_tree + pca  \n",
       "1037069                                decision_tree + pca  \n",
       "\n",
       "[1037070 rows x 204 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Tag each run with its \"<classifier> + <preprocessor>\" combination and give the\n",
    "# two component-choice columns shorter names.\n",
    "pipelines = df[\"classifier:__choice__\"] + \" + \" + df[\"feature_preprocessor:__choice__\"]\n",
    "short_names = {\n",
    "    \"classifier:__choice__\": \"classifier\",\n",
    "    \"feature_preprocessor:__choice__\": \"preprocessor\",\n",
    "}\n",
    "dfs = df.assign(pipelines=pipelines).rename(columns=short_names)\n",
    "dfs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "535c1fc1-ee52-40c8-8458-7cd8b6cc2a5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each component kind, export a table holding the maximum `f1_weighted_test`\n",
    "# per (dataset, fold): one row per (dataset, fold) pair, one column per component.\n",
    "kind = [\"preprocessor\", \"classifier\"]\n",
    "directory = \"statistical_tests/\"\n",
    "# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.\n",
    "os.makedirs(directory, exist_ok=True)\n",
    "\n",
    "for k in kind:\n",
    "    aux = (\n",
    "        dfs\n",
    "        .groupby([\"dataset\", k, \"fold\"])\n",
    "        .agg(performance=(\"f1_weighted_test\", \"max\"))\n",
    "        .reset_index()\n",
    "        .pivot(index=[\"dataset\", \"fold\"], columns=k, values=\"performance\")\n",
    "        # Combinations without any recorded score are written as 0.0.\n",
    "        .fillna(0.0)\n",
    "    )\n",
    "    # index=False: the (dataset, fold) index is not written, only the score columns.\n",
    "    aux.round(4).to_csv(directory + k + \"_stest.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "19b0fa37-858d-49c8-aebb-a71d5f5a483f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Maximum `f1_weighted_test` per (dataset, fold) for every classifier + preprocessor\n",
    "# pipeline; only the five pipelines listed in `cols` are written to disk.\n",
    "k = \"pipelines\"\n",
    "\n",
    "cols = [\n",
    "    \"gradient_boosting + polynomial\",\n",
    "    \"gradient_boosting + feature_agglomeration\",\n",
    "    \"adaboost + feature_agglomeration\",\n",
    "    \"extra_trees + no_preprocessing\",\n",
    "    \"adaboost + polynomial\",\n",
    "]\n",
    "\n",
    "aux = (\n",
    "    dfs\n",
    "    .groupby([\"dataset\", k, \"fold\"])\n",
    "    .agg(performance=(\"f1_weighted_test\", \"max\"))\n",
    "    .reset_index()\n",
    "    .pivot(index=[\"dataset\", \"fold\"], columns=k, values=\"performance\")\n",
    "    .fillna(0.0)\n",
    ")\n",
    "\n",
    "aux[cols].round(4).to_csv(f\"{directory}pipelines_stest.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b1cdb8da-b9b8-4d8c-abf0-f8d3948796f4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['gradient_boosting + polynomial',\n",
       " 'gradient_boosting + feature_agglomeration',\n",
       " 'adaboost + feature_agglomeration',\n",
       " 'extra_trees + no_preprocessing',\n",
       " 'adaboost + polynomial']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the pipeline names whose columns were written to pipelines_stest.csv.\n",
    "cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43d47c76-7b18-4a93-806d-d522de1d58e5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
