{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# so kernel doesn't have to be restarted\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "# data science imports\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# helper scripts\n",
    "from knn_helper import get_data\n",
    "\n",
    "# file system\n",
    "import os\n",
    "from os.path import join as oj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1823582/547383113.py:3: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/accounts/grad/zachrewolinski/research/imodels-experiments/feature_importance/counterfactuals/knn_helper.py:75: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_1823582/547383113.py:3: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/accounts/grad/zachrewolinski/research/imodels-experiments/feature_importance/counterfactuals/knn_helper.py:75: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_1823582/547383113.py:3: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "WARNING:root:Received uncompressed content from OpenML for https://api.openml.org/data/v1/download/22103247/pol.arff.\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/accounts/grad/zachrewolinski/research/imodels-experiments/feature_importance/counterfactuals/knn_helper.py:75: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_1823582/547383113.py:3: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "WARNING:root:Received uncompressed content from OpenML for https://api.openml.org/data/v1/download/22103248/house_16H.arff.\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/accounts/grad/zachrewolinski/research/imodels-experiments/feature_importance/counterfactuals/knn_helper.py:75: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_1823582/547383113.py:3: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "WARNING:root:Received uncompressed content from OpenML for https://api.openml.org/data/v1/download/22103254/Higgs.arff.\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/accounts/grad/zachrewolinski/research/imodels-experiments/feature_importance/counterfactuals/knn_helper.py:75: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n",
      "/tmp/ipykernel_1823582/547383113.py:3: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X, y = get_data(\"openml\", id)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "WARNING:root:Received uncompressed content from OpenML for https://api.openml.org/data/v1/download/22103256/jannis.arff.\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/accounts/grad/zachrewolinski/research/imodels-experiments/feature_importance/counterfactuals/knn_helper.py:75: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n"
     ]
    }
   ],
   "source": [
    "data_ids = [43, 361062, 361063, 361069, 361071]\n",
    "for id in data_ids:\n",
    "    X, y = get_data(\"openml\", id)\n",
    "    data_dir = oj(\"data\", str(id))\n",
    "    os.makedirs(data_dir, exist_ok=True)\n",
    "    np.savetxt(oj(data_dir, \"X.csv\"), X, delimiter=\",\")\n",
    "    np.savetxt(oj(data_dir, \"y.csv\"), y, delimiter=\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3207691/674515221.py:1: FutureWarning: Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.\n",
      "  X_9978, y_9978 = get_data(\"openml\", 9978)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/functions.py:442: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)\n",
      "/scratch/users/zachrewolinski/conda/envs/mdi/lib/python3.10/site-packages/openml/tasks/task.py:150: FutureWarning: Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.\n",
      "  return datasets.get_dataset(self.dataset_id)\n",
      "/accounts/grad/zachrewolinski/research/imodels-experiments/feature_importance/counterfactuals/knn_helper.py:75: FutureWarning: Support for `dataset_format='array'` will be removed in 0.15,start using `dataset_format='dataframe' to ensure your code will continue to work. You can use the dataframe's `to_numpy` function to continue using numpy arrays.\n",
      "  dataset.get_data(target=dataset.default_target_attribute,\n"
     ]
    }
   ],
   "source": [
    "X_9978, y_9978 = get_data(\"openml\", 9978)\n",
    "X_9978 = pd.DataFrame(X_9978)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original number of features: 72\n",
      "Reduced number of features: 47\n"
     ]
    }
   ],
   "source": [
    "threshold = 0.95 \n",
    "correlation_matrix = X_9978.corr()\n",
    "# identify features with high correlation\n",
    "upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))\n",
    "high_correlation_features = [column for column in upper_triangle.columns if any(upper_triangle[column] > threshold)]\n",
    "# drop highly correlated features\n",
    "X_9978_reduced = X_9978.drop(columns=high_correlation_features)\n",
    "print(f\"Original number of features: {X_9978.shape[1]}\")\n",
    "print(f\"Reduced number of features: {X_9978_reduced.shape[1]}\")\n",
    "X_9978_reduced = X_9978_reduced.to_numpy()\n",
    "np.savetxt(oj(\"data/9978\", \"X.csv\"), X_9978_reduced, delimiter=\",\")\n",
    "np.savetxt(oj(\"data/9978\", \"y.csv\"), y_9978, delimiter=\",\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mdi",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
