{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# data science imports\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from ucimlrepo import fetch_ucirepo\n",
    "import openml\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# file system\n",
    "import os\n",
    "from os.path import join as oj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def get_data(data_source, data_id):\n",
    "    \"\"\"\n",
    "    Fetches dataset from either UCI Machine Learning Repository or OpenML.\n",
    "    \n",
    "    Parameters:\n",
    "    data_source (str): The source of the dataset, either 'uci' or 'openml'.\n",
    "    data_id (int): The ID of the dataset.\n",
    "    \n",
    "    Returns:\n",
    "    X (np.ndarray): The feature matrix.\n",
    "    y (np.ndarray): The target vector.\n",
    "    \"\"\"\n",
    "    \n",
    "    # ensure that data source is either 'uci' or 'openml'\n",
    "    if data_source not in [\"uci\", \"openml\"]:\n",
    "        raise ValueError(\"data_source must be either 'uci' or 'openml'\")\n",
    "    \n",
    "    # handle case where data comes from uci\n",
    "    if data_source == \"uci\":\n",
    "        \n",
    "        # get pandas df X and numpy array y\n",
    "        dataset = fetch_ucirepo(id=data_id)\n",
    "        X = dataset.data.features\n",
    "        y = dataset.data.targets.to_numpy().flatten()\n",
    "        \n",
    "        # handle breast cancer dataset\n",
    "        if data_id == 15:\n",
    "            # remove rows with 'nan' entries for 'Bare_nuclei'\n",
    "            X = X.dropna()\n",
    "            # remove same observations from dataframe y\n",
    "            y = y[X.index]\n",
    "            # reset index\n",
    "            X = X.reset_index(drop=True)\n",
    "            # transform y from 2/4 to 0/1\n",
    "            y = (y == 4).astype(int)\n",
    "        \n",
    "        X = X.to_numpy() # convert to numpy\n",
    "\n",
    "    if data_source == \"openml\":\n",
    "        \n",
    "        # get data\n",
    "        task = openml.tasks.get_task(data_id)\n",
    "        dataset = task.get_dataset()\n",
    "        X, y, categorical_mask, col_names = \\\n",
    "            dataset.get_data(target=dataset.default_target_attribute,\n",
    "                            dataset_format=\"array\")\n",
    "        \n",
    "    # center and scale the covariates\n",
    "    scaler = StandardScaler()\n",
    "    X = scaler.fit_transform(X)\n",
    "    \n",
    "    # # sample 2000 rows of X and y if X has more than 2000 rows\n",
    "    # if X.shape[0] > 2000:\n",
    "    #     np.random.seed(42)\n",
    "    #     indices = np.random.choice(X.shape[0], 2000, replace=False)\n",
    "    #     X = X[indices]\n",
    "    #     y = y[indices]\n",
    "\n",
    "    return X, y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_288460/1720729341.py:6: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/tmp/ipykernel_288460/1418112640.py:45: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_288460/1720729341.py:6: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/tmp/ipykernel_288460/1418112640.py:45: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_288460/1720729341.py:6: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/tmp/ipykernel_288460/1418112640.py:45: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_288460/1720729341.py:6: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/tmp/ipykernel_288460/1418112640.py:45: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_288460/1720729341.py:6: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/tmp/ipykernel_288460/1418112640.py:45: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_288460/1720729341.py:6: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/tmp/ipykernel_288460/1418112640.py:45: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_288460/1720729341.py:6: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "WARNING:root:Received uncompressed content from OpenML for https://api.openml.org/data/v1/download/22103248/house_16H.arff.\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/tmp/ipykernel_288460/1418112640.py:45: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_288460/1720729341.py:6: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "WARNING:root:Received uncompressed content from OpenML for https://api.openml.org/data/v1/download/22103254/Higgs.arff.\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/tmp/ipykernel_288460/1418112640.py:45: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_288460/1720729341.py:6: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "WARNING:root:Received uncompressed content from OpenML for https://api.openml.org/data/v1/download/22103247/pol.arff.\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/tmp/ipykernel_288460/1418112640.py:45: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_288460/1720729341.py:6: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/tmp/ipykernel_288460/1418112640.py:45: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_288460/1720729341.py:6: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "WARNING:root:Received uncompressed content from OpenML for https://api.openml.org/data/v1/download/22103256/jannis.arff.\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/tmp/ipykernel_288460/1418112640.py:45: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_288460/1720729341.py:6: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/tmp/ipykernel_288460/1418112640.py:45: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n"
     ]
    }
   ],
   "source": [
    "data_ids = [361260, 361254, 361259, 361253, 361243, 361242,\n",
    "            361063, 361069, 361062, 9978, 361071, 43]\n",
    "for id in data_ids:\n",
    "    X, y = get_data(\"openml\", id)\n",
    "    data_dir = oj(\"data\", str(id))\n",
    "    os.makedirs(data_dir, exist_ok=True)\n",
    "    np.savetxt(oj(data_dir, \"X.csv\"), X, delimiter=\",\")\n",
    "    np.savetxt(oj(data_dir, \"y.csv\"), y, delimiter=\",\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mdi",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
