{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1a453ed3-ac56-4c3d-b349-5b6949ed0614",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import pandas as pd\n",
    "import glob, os\n",
    "\n",
    "from pandas.api.types import is_string_dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "ba575021-a22c-49cb-8646-22f600a58956",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_datasets_path(datasets_path, find_by):\n",
    "    datasets_path_dict = {}\n",
    "    for root, dirs, files in os.walk(datasets_path):\n",
    "        for file in files:\n",
    "            if file.endswith(find_by):\n",
    "                path = os.path.join(root, file)\n",
    "                uuid = path.split(\"/\")[-1].split(\".\")[0]\n",
    "                datasets_path_dict[uuid] = path\n",
    "    return datasets_path_dict\n",
    "\n",
    "def read_dataset(path):\n",
    "    with open(path, 'rb') as picklefile:\n",
    "        dataset = pickle.load(picklefile)\n",
    "    return dataset\n",
    "\n",
    "def get_meta_information(dataset_path):\n",
    "    dataset_id = int(dataset_path.split(\"/\")[-1].split(\"_\")[-1].split(\".\")[0])\n",
    "    print(f\"Metadata for: dataset_{dataset_id} ... \")\n",
    "    df, df_class, check, features = read_dataset(dataset_path)\n",
    "\n",
    "    number_of_features = df.shape[1]\n",
    "    number_of_exemples = df.shape[0]\n",
    "    number_of_class = len(df_class.unique())\n",
    "    \n",
    "    minority = round(100*df_class.value_counts(sort=True, ascending=True).iloc[0]/number_of_exemples, 2)\n",
    "    majority = round(100*df_class.value_counts(sort=True, ascending=False).iloc[0]/number_of_exemples, 2)\n",
    "\n",
    "    # check = [is_string_dtype(df.iloc[i]) or isinstance(df.dtypes[i], pd.CategoricalDtype) for i in range(number_of_features)]\n",
    "    categorical_features = sum(check)\n",
    "    numerical_features = categorical_features - number_of_features\n",
    "    \n",
    "    print(\"\\r [Done]\")\n",
    "\n",
    "    return {\n",
    "        \"OpenML ID\": dataset_id,\n",
    "        \"Number of Examples\": number_of_exemples,\n",
    "        \"Number of Features\": number_of_features,\n",
    "        \"Number of Categorical Features\": categorical_features,\n",
    "        \"Number of class\": number_of_class,\n",
    "        \"Majority Class %\": majority,\n",
    "        \"Minority Class %\": minority\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fc4f9cef-115e-41f9-8a9a-f6f7a1d4e810",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Metadata for:  ../../../datasets/training/dataset_24.pkl  ...\n",
      " [Done]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'Number of Examples': 8124,\n",
       " 'Number of Features': 22,\n",
       " 'Number of Categorical Features': 22,\n",
       " 'Number of class': 2,\n",
       " 'Majority Class %': 51.8,\n",
       " 'Minority Class %': 48.2}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "datasets_parh = get_datasets_path(\"../../../datasets/\", \".pkl\")\n",
    "get_meta_information(datasets_parh[\"dataset_24\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "faab5cbb-0895-4a18-ba68-09f5c402ceb8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_223984/2620244375.py:1: DtypeWarning: Columns (19,20,60,200,201) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(\"../pipeline_experiment_result.csv\")\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>seed_i</th>\n",
       "      <th>config_id</th>\n",
       "      <th>fold</th>\n",
       "      <th>config_hash</th>\n",
       "      <th>duration</th>\n",
       "      <th>start_time</th>\n",
       "      <th>end_time</th>\n",
       "      <th>status</th>\n",
       "      <th>seed</th>\n",
       "      <th>budget</th>\n",
       "      <th>...</th>\n",
       "      <th>classifier:sgd:power_t</th>\n",
       "      <th>feature_preprocessor:nystroem_sampler:coef0</th>\n",
       "      <th>feature_preprocessor:nystroem_sampler:degree</th>\n",
       "      <th>classifier:multinomial_nb:alpha</th>\n",
       "      <th>classifier:multinomial_nb:fit_prior</th>\n",
       "      <th>classifier:sgd:l1_ratio</th>\n",
       "      <th>dataset</th>\n",
       "      <th>data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__</th>\n",
       "      <th>data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__</th>\n",
       "      <th>data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>1.913694</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>1.911491</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>2.011853</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>2.025221</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>34a76cf2da6c41f8646867818eea56d6</td>\n",
       "      <td>2.001899</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>1.680580e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_40985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037065</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>6</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.627506</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037066</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>7</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.571584</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037067</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>8</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.475392</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037068</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>9</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.478533</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037069</th>\n",
       "      <td>0</td>\n",
       "      <td>499</td>\n",
       "      <td>10</td>\n",
       "      <td>19a650aed428ed8e5fe1619c61bc7118</td>\n",
       "      <td>6.529982</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>1.680869e+09</td>\n",
       "      <td>StatusType.SUCCESS</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>dataset_1161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1037070 rows × 203 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         seed_i  config_id  fold                       config_hash  duration  \\\n",
       "0             0          0     1  34a76cf2da6c41f8646867818eea56d6  1.913694   \n",
       "1             0          0     2  34a76cf2da6c41f8646867818eea56d6  1.911491   \n",
       "2             0          0     3  34a76cf2da6c41f8646867818eea56d6  2.011853   \n",
       "3             0          0     4  34a76cf2da6c41f8646867818eea56d6  2.025221   \n",
       "4             0          0     5  34a76cf2da6c41f8646867818eea56d6  2.001899   \n",
       "...         ...        ...   ...                               ...       ...   \n",
       "1037065       0        499     6  19a650aed428ed8e5fe1619c61bc7118  6.627506   \n",
       "1037066       0        499     7  19a650aed428ed8e5fe1619c61bc7118  6.571584   \n",
       "1037067       0        499     8  19a650aed428ed8e5fe1619c61bc7118  6.475392   \n",
       "1037068       0        499     9  19a650aed428ed8e5fe1619c61bc7118  6.478533   \n",
       "1037069       0        499    10  19a650aed428ed8e5fe1619c61bc7118  6.529982   \n",
       "\n",
       "           start_time      end_time              status  seed  budget  ...  \\\n",
       "0        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "2        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "3        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "4        1.680580e+09  1.680580e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "...               ...           ...                 ...   ...     ...  ...   \n",
       "1037065  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1037066  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1037067  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1037068  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "1037069  1.680869e+09  1.680869e+09  StatusType.SUCCESS     0     0.0  ...   \n",
       "\n",
       "        classifier:sgd:power_t feature_preprocessor:nystroem_sampler:coef0  \\\n",
       "0                          NaN                                         NaN   \n",
       "1                          NaN                                         NaN   \n",
       "2                          NaN                                         NaN   \n",
       "3                          NaN                                         NaN   \n",
       "4                          NaN                                         NaN   \n",
       "...                        ...                                         ...   \n",
       "1037065                    NaN                                         NaN   \n",
       "1037066                    NaN                                         NaN   \n",
       "1037067                    NaN                                         NaN   \n",
       "1037068                    NaN                                         NaN   \n",
       "1037069                    NaN                                         NaN   \n",
       "\n",
       "        feature_preprocessor:nystroem_sampler:degree  \\\n",
       "0                                                NaN   \n",
       "1                                                NaN   \n",
       "2                                                NaN   \n",
       "3                                                NaN   \n",
       "4                                                NaN   \n",
       "...                                              ...   \n",
       "1037065                                          NaN   \n",
       "1037066                                          NaN   \n",
       "1037067                                          NaN   \n",
       "1037068                                          NaN   \n",
       "1037069                                          NaN   \n",
       "\n",
       "        classifier:multinomial_nb:alpha  classifier:multinomial_nb:fit_prior  \\\n",
       "0                                   NaN                                  NaN   \n",
       "1                                   NaN                                  NaN   \n",
       "2                                   NaN                                  NaN   \n",
       "3                                   NaN                                  NaN   \n",
       "4                                   NaN                                  NaN   \n",
       "...                                 ...                                  ...   \n",
       "1037065                             NaN                                  NaN   \n",
       "1037066                             NaN                                  NaN   \n",
       "1037067                             NaN                                  NaN   \n",
       "1037068                             NaN                                  NaN   \n",
       "1037069                             NaN                                  NaN   \n",
       "\n",
       "        classifier:sgd:l1_ratio        dataset  \\\n",
       "0                           NaN  dataset_40985   \n",
       "1                           NaN  dataset_40985   \n",
       "2                           NaN  dataset_40985   \n",
       "3                           NaN  dataset_40985   \n",
       "4                           NaN  dataset_40985   \n",
       "...                         ...            ...   \n",
       "1037065                     NaN   dataset_1161   \n",
       "1037066                     NaN   dataset_1161   \n",
       "1037067                     NaN   dataset_1161   \n",
       "1037068                     NaN   dataset_1161   \n",
       "1037069                     NaN   dataset_1161   \n",
       "\n",
       "        data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__  \\\n",
       "0                                                      NaN                                       \n",
       "1                                                      NaN                                       \n",
       "2                                                      NaN                                       \n",
       "3                                                      NaN                                       \n",
       "4                                                      NaN                                       \n",
       "...                                                    ...                                       \n",
       "1037065                                                NaN                                       \n",
       "1037066                                                NaN                                       \n",
       "1037067                                                NaN                                       \n",
       "1037068                                                NaN                                       \n",
       "1037069                                                NaN                                       \n",
       "\n",
       "         data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__  \\\n",
       "0                                                      NaN                                        \n",
       "1                                                      NaN                                        \n",
       "2                                                      NaN                                        \n",
       "3                                                      NaN                                        \n",
       "4                                                      NaN                                        \n",
       "...                                                    ...                                        \n",
       "1037065                                                NaN                                        \n",
       "1037066                                                NaN                                        \n",
       "1037067                                                NaN                                        \n",
       "1037068                                                NaN                                        \n",
       "1037069                                                NaN                                        \n",
       "\n",
       "        data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction  \n",
       "0                                                      NaN                                                               \n",
       "1                                                      NaN                                                               \n",
       "2                                                      NaN                                                               \n",
       "3                                                      NaN                                                               \n",
       "4                                                      NaN                                                               \n",
       "...                                                    ...                                                               \n",
       "1037065                                                NaN                                                               \n",
       "1037066                                                NaN                                                               \n",
       "1037067                                                NaN                                                               \n",
       "1037068                                                NaN                                                               \n",
       "1037069                                                NaN                                                               \n",
       "\n",
       "[1037070 rows x 203 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(\"../pipeline_experiment_result.csv\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "28d6e300-6335-43c2-936d-3b2e3c3ff743",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Metadata for: dataset_40985 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1501 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1479 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1530 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40680 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1480 ... \n",
      " [Done]\n",
      "Metadata for: dataset_151 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1528 ... \n",
      " [Done]\n",
      "Metadata for: dataset_311 ... \n",
      " [Done]\n",
      "Metadata for: dataset_949 ... \n",
      " [Done]\n",
      "Metadata for: dataset_934 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40691 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1532 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1146 ... \n",
      " [Done]\n",
      "Metadata for: dataset_742 ... \n",
      " [Done]\n",
      "Metadata for: dataset_886 ... \n",
      " [Done]\n",
      "Metadata for: dataset_728 ... \n",
      " [Done]\n",
      "Metadata for: dataset_737 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1553 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1053 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40704 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1116 ... \n",
      " [Done]\n",
      "Metadata for: dataset_37 ... \n",
      " [Done]\n",
      "Metadata for: dataset_837 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40705 ... \n",
      " [Done]\n",
      "Metadata for: dataset_6 ... \n",
      " [Done]\n",
      "Metadata for: dataset_937 ... \n",
      " [Done]\n",
      "Metadata for: dataset_841 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1536 ... \n",
      " [Done]\n",
      "Metadata for: dataset_60 ... \n",
      " [Done]\n",
      "Metadata for: dataset_950 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1471 ... \n",
      " [Done]\n",
      "Metadata for: dataset_30 ... \n",
      " [Done]\n",
      "Metadata for: dataset_735 ... \n",
      " [Done]\n",
      "Metadata for: dataset_839 ... \n",
      " [Done]\n",
      "Metadata for: dataset_4134 ... \n",
      " [Done]\n",
      "Metadata for: dataset_182 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40927 ... \n",
      " [Done]\n",
      "Metadata for: dataset_470 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41988 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41146 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41150 ... \n",
      " [Done]\n",
      "Metadata for: dataset_826 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41163 ... \n",
      " [Done]\n",
      "Metadata for: dataset_11 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1538 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41145 ... \n",
      " [Done]\n",
      "Metadata for: dataset_930 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1068 ... \n",
      " [Done]\n",
      "Metadata for: dataset_757 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1466 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1044 ... \n",
      " [Done]\n",
      "Metadata for: dataset_42 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1502 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40648 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1063 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41084 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1462 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1478 ... \n",
      " [Done]\n",
      "Metadata for: dataset_44 ... \n",
      " [Done]\n",
      "Metadata for: dataset_23380 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40693 ... \n",
      " [Done]\n",
      "Metadata for: dataset_871 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1056 ... \n",
      " [Done]\n",
      "Metadata for: dataset_951 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1503 ... \n",
      " [Done]\n",
      "Metadata for: dataset_734 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1120 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40646 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1459 ... \n",
      " [Done]\n",
      "Metadata for: dataset_181 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1481 ... \n",
      " [Done]\n",
      "Metadata for: dataset_185 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1496 ... \n",
      " [Done]\n",
      "Metadata for: dataset_300 ... \n",
      " [Done]\n",
      "Metadata for: dataset_799 ... \n",
      " [Done]\n",
      "Metadata for: dataset_451 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1128 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40900 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40677 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40923 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40706 ... \n",
      " [Done]\n",
      "Metadata for: dataset_188 ... \n",
      " [Done]\n",
      "Metadata for: dataset_469 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1457 ... \n",
      " [Done]\n",
      "Metadata for: dataset_940 ... \n",
      " [Done]\n",
      "Metadata for: dataset_947 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1142 ... \n",
      " [Done]\n",
      "Metadata for: dataset_923 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40496 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40668 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1069 ... \n",
      " [Done]\n",
      "Metadata for: dataset_24 ... \n",
      " [Done]\n",
      "Metadata for: dataset_722 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40649 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1542 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41986 ... \n",
      " [Done]\n",
      "Metadata for: dataset_881 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41144 ... \n",
      " [Done]\n",
      "Metadata for: dataset_981 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41082 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41162 ... \n",
      " [Done]\n",
      "Metadata for: dataset_727 ... \n",
      " [Done]\n",
      "Metadata for: dataset_833 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41982 ... \n",
      " [Done]\n",
      "Metadata for: dataset_920 ... \n",
      " [Done]\n",
      "Metadata for: dataset_725 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1590 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40994 ... \n",
      " [Done]\n",
      "Metadata for: dataset_307 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41972 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1483 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1510 ... \n",
      " [Done]\n",
      "Metadata for: dataset_4534 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1130 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40922 ... \n",
      " [Done]\n",
      "Metadata for: dataset_57 ... \n",
      " [Done]\n",
      "Metadata for: dataset_310 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40672 ... \n",
      " [Done]\n",
      "Metadata for: dataset_184 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1507 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1046 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1494 ... \n",
      " [Done]\n",
      "Metadata for: dataset_715 ... \n",
      " [Done]\n",
      "Metadata for: dataset_46 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41990 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1134 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40701 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40966 ... \n",
      " [Done]\n",
      "Metadata for: dataset_770 ... \n",
      " [Done]\n",
      "Metadata for: dataset_6332 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41989 ... \n",
      " [Done]\n",
      "Metadata for: dataset_2 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1529 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40983 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40536 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40499 ... \n",
      " [Done]\n",
      "Metadata for: dataset_333 ... \n",
      " [Done]\n",
      "Metadata for: dataset_752 ... \n",
      " [Done]\n",
      "Metadata for: dataset_761 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1541 ... \n",
      " [Done]\n",
      "Metadata for: dataset_375 ... \n",
      " [Done]\n",
      "Metadata for: dataset_750 ... \n",
      " [Done]\n",
      "Metadata for: dataset_802 ... \n",
      " [Done]\n",
      "Metadata for: dataset_901 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1487 ... \n",
      " [Done]\n",
      "Metadata for: dataset_825 ... \n",
      " [Done]\n",
      "Metadata for: dataset_15 ... \n",
      " [Done]\n",
      "Metadata for: dataset_23381 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40670 ... \n",
      " [Done]\n",
      "Metadata for: dataset_334 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1491 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1233 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40647 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40982 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40650 ... \n",
      " [Done]\n",
      "Metadata for: dataset_458 ... \n",
      " [Done]\n",
      "Metadata for: dataset_847 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40978 ... \n",
      " [Done]\n",
      "Metadata for: dataset_42206 ... \n",
      " [Done]\n",
      "Metadata for: dataset_32 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40971 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1547 ... \n",
      " [Done]\n",
      "Metadata for: dataset_819 ... \n",
      " [Done]\n",
      "Metadata for: dataset_155 ... \n",
      " [Done]\n",
      "Metadata for: dataset_772 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1535 ... \n",
      " [Done]\n",
      "Metadata for: dataset_554 ... \n",
      " [Done]\n",
      "Metadata for: dataset_42343 ... \n",
      " [Done]\n",
      "Metadata for: dataset_23 ... \n",
      " [Done]\n",
      "Metadata for: dataset_846 ... \n",
      " [Done]\n",
      "Metadata for: dataset_335 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1475 ... \n",
      " [Done]\n",
      "Metadata for: dataset_42345 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41147 ... \n",
      " [Done]\n",
      "Metadata for: dataset_279 ... \n",
      " [Done]\n",
      "Metadata for: dataset_803 ... \n",
      " [Done]\n",
      "Metadata for: dataset_903 ... \n",
      " [Done]\n",
      "Metadata for: dataset_936 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1039 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1049 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40498 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1485 ... \n",
      " [Done]\n",
      "Metadata for: dataset_740 ... \n",
      " [Done]\n",
      "Metadata for: dataset_4538 ... \n",
      " [Done]\n",
      "Metadata for: dataset_717 ... \n",
      " [Done]\n",
      "Metadata for: dataset_821 ... \n",
      " [Done]\n",
      "Metadata for: dataset_4541 ... \n",
      " [Done]\n",
      "Metadata for: dataset_42193 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41671 ... \n",
      " [Done]\n",
      "Metadata for: dataset_897 ... \n",
      " [Done]\n",
      "Metadata for: dataset_884 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1531 ... \n",
      " [Done]\n",
      "Metadata for: dataset_28 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1509 ... \n",
      " [Done]\n",
      "Metadata for: dataset_40645 ... \n",
      " [Done]\n",
      "Metadata for: dataset_50 ... \n",
      " [Done]\n",
      "Metadata for: dataset_377 ... \n",
      " [Done]\n",
      "Metadata for: dataset_807 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1166 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1050 ... \n",
      " [Done]\n",
      "Metadata for: dataset_823 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41160 ... \n",
      " [Done]\n",
      "Metadata for: dataset_26 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1549 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1515 ... \n",
      " [Done]\n",
      "Metadata for: dataset_41991 ... \n",
      " [Done]\n",
      "Metadata for: dataset_816 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1497 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1552 ... \n",
      " [Done]\n",
      "Metadata for: dataset_1161 ... \n",
      " [Done]\n"
     ]
    }
   ],
   "source": [
    "datasets = df[\"dataset\"].unique()\n",
    "meta_information = [get_meta_information(datasets_parh[i]) for i in datasets]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "958724a8-f2c0-4c1d-a8e7-822db20cadea",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>OpenML ID</th>\n",
       "      <th>Number of Examples</th>\n",
       "      <th>Number of Features</th>\n",
       "      <th>Number of Categorical Features</th>\n",
       "      <th>Number of class</th>\n",
       "      <th>Majority Class %</th>\n",
       "      <th>Minority Class %</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>132</th>\n",
       "      <td>2</td>\n",
       "      <td>898</td>\n",
       "      <td>38</td>\n",
       "      <td>32</td>\n",
       "      <td>5</td>\n",
       "      <td>76.17</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>6</td>\n",
       "      <td>20000</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>26</td>\n",
       "      <td>4.07</td>\n",
       "      <td>3.67</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>11</td>\n",
       "      <td>625</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>46.08</td>\n",
       "      <td>7.84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147</th>\n",
       "      <td>15</td>\n",
       "      <td>699</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>65.52</td>\n",
       "      <td>34.48</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>169</th>\n",
       "      <td>23</td>\n",
       "      <td>1473</td>\n",
       "      <td>9</td>\n",
       "      <td>7</td>\n",
       "      <td>3</td>\n",
       "      <td>42.70</td>\n",
       "      <td>22.61</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>206</th>\n",
       "      <td>41991</td>\n",
       "      <td>270912</td>\n",
       "      <td>784</td>\n",
       "      <td>0</td>\n",
       "      <td>49</td>\n",
       "      <td>2.58</td>\n",
       "      <td>0.17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>188</th>\n",
       "      <td>42193</td>\n",
       "      <td>5278</td>\n",
       "      <td>13</td>\n",
       "      <td>6</td>\n",
       "      <td>2</td>\n",
       "      <td>52.96</td>\n",
       "      <td>47.04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>159</th>\n",
       "      <td>42206</td>\n",
       "      <td>595212</td>\n",
       "      <td>37</td>\n",
       "      <td>25</td>\n",
       "      <td>2</td>\n",
       "      <td>96.36</td>\n",
       "      <td>3.64</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>168</th>\n",
       "      <td>42343</td>\n",
       "      <td>82318</td>\n",
       "      <td>477</td>\n",
       "      <td>136</td>\n",
       "      <td>2</td>\n",
       "      <td>88.23</td>\n",
       "      <td>11.77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>173</th>\n",
       "      <td>42345</td>\n",
       "      <td>70340</td>\n",
       "      <td>20</td>\n",
       "      <td>19</td>\n",
       "      <td>3</td>\n",
       "      <td>48.88</td>\n",
       "      <td>4.98</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>211 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     OpenML ID  Number of Examples  Number of Features  \\\n",
       "132          2                 898                  38   \n",
       "25           6               20000                  16   \n",
       "44          11                 625                   4   \n",
       "147         15                 699                   9   \n",
       "169         23                1473                   9   \n",
       "..         ...                 ...                 ...   \n",
       "206      41991              270912                 784   \n",
       "188      42193                5278                  13   \n",
       "159      42206              595212                  37   \n",
       "168      42343               82318                 477   \n",
       "173      42345               70340                  20   \n",
       "\n",
       "     Number of Categorical Features  Number of class  Majority Class %  \\\n",
       "132                              32                5             76.17   \n",
       "25                                0               26              4.07   \n",
       "44                                0                3             46.08   \n",
       "147                               0                2             65.52   \n",
       "169                               7                3             42.70   \n",
       "..                              ...              ...               ...   \n",
       "206                               0               49              2.58   \n",
       "188                               6                2             52.96   \n",
       "159                              25                2             96.36   \n",
       "168                             136                2             88.23   \n",
       "173                              19                3             48.88   \n",
       "\n",
       "     Minority Class %  \n",
       "132              0.00  \n",
       "25               3.67  \n",
       "44               7.84  \n",
       "147             34.48  \n",
       "169             22.61  \n",
       "..                ...  \n",
       "206              0.17  \n",
       "188             47.04  \n",
       "159              3.64  \n",
       "168             11.77  \n",
       "173              4.98  \n",
       "\n",
       "[211 rows x 7 columns]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "meta_information_df = pd.DataFrame(meta_information).sort_values(by=\"OpenML ID\", ascending=True)\n",
    "meta_information_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "7eb9882e-71ae-49f6-ae3c-3cbcae114968",
   "metadata": {},
   "outputs": [],
   "source": [
    "meta_information_df.to_csv(\"datasets_meta_data.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a70a425-b066-4d70-a9c4-05f7200911c3",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
