{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fab02d95",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "246eb247",
   "metadata": {},
   "source": [
    "# Adult Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ddfea006",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>workclass</th>\n",
       "      <th>education</th>\n",
       "      <th>marital-status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>gender</th>\n",
       "      <th>capital-gain</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>hours-per-week</th>\n",
       "      <th>native-country</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(20, 30]</td>\n",
       "      <td>Private</td>\n",
       "      <td>11th</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Machine-op-inspct</td>\n",
       "      <td>Own-child</td>\n",
       "      <td>Black</td>\n",
       "      <td>Male</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>(30, 40]</td>\n",
       "      <td>Private</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Farming-fishing</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(40.0, 50.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>(20, 30]</td>\n",
       "      <td>Local-gov</td>\n",
       "      <td>Assoc-acdm</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Protective-serv</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>(40, 50]</td>\n",
       "      <td>Private</td>\n",
       "      <td>Some-college</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Machine-op-inspct</td>\n",
       "      <td>Husband</td>\n",
       "      <td>Black</td>\n",
       "      <td>Male</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>(30, 40]</td>\n",
       "      <td>Private</td>\n",
       "      <td>10th</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Other-service</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(20.0, 30.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48837</th>\n",
       "      <td>(20, 30]</td>\n",
       "      <td>Private</td>\n",
       "      <td>Assoc-acdm</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Tech-support</td>\n",
       "      <td>Wife</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48838</th>\n",
       "      <td>(30, 40]</td>\n",
       "      <td>Private</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Machine-op-inspct</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48839</th>\n",
       "      <td>(50, 60]</td>\n",
       "      <td>Private</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Widowed</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Unmarried</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48840</th>\n",
       "      <td>(20, 30]</td>\n",
       "      <td>Private</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Own-child</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(10.0, 20.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48841</th>\n",
       "      <td>(50, 60]</td>\n",
       "      <td>Self-emp-inc</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Wife</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;50K</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>45222 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            age     workclass     education      marital-status  \\\n",
       "0      (20, 30]       Private          11th       Never-married   \n",
       "1      (30, 40]       Private       HS-grad  Married-civ-spouse   \n",
       "2      (20, 30]     Local-gov    Assoc-acdm  Married-civ-spouse   \n",
       "3      (40, 50]       Private  Some-college  Married-civ-spouse   \n",
       "5      (30, 40]       Private          10th       Never-married   \n",
       "...         ...           ...           ...                 ...   \n",
       "48837  (20, 30]       Private    Assoc-acdm  Married-civ-spouse   \n",
       "48838  (30, 40]       Private       HS-grad  Married-civ-spouse   \n",
       "48839  (50, 60]       Private       HS-grad             Widowed   \n",
       "48840  (20, 30]       Private       HS-grad       Never-married   \n",
       "48841  (50, 60]  Self-emp-inc       HS-grad  Married-civ-spouse   \n",
       "\n",
       "              occupation   relationship   race  gender  capital-gain  \\\n",
       "0      Machine-op-inspct      Own-child  Black    Male         False   \n",
       "1        Farming-fishing        Husband  White    Male         False   \n",
       "2        Protective-serv        Husband  White    Male         False   \n",
       "3      Machine-op-inspct        Husband  Black    Male          True   \n",
       "5          Other-service  Not-in-family  White    Male         False   \n",
       "...                  ...            ...    ...     ...           ...   \n",
       "48837       Tech-support           Wife  White  Female         False   \n",
       "48838  Machine-op-inspct        Husband  White    Male         False   \n",
       "48839       Adm-clerical      Unmarried  White  Female         False   \n",
       "48840       Adm-clerical      Own-child  White    Male         False   \n",
       "48841    Exec-managerial           Wife  White  Female          True   \n",
       "\n",
       "       capital-loss hours-per-week native-country income  \n",
       "0             False   (30.0, 40.0]  United-States  <=50K  \n",
       "1             False   (40.0, 50.0]  United-States  <=50K  \n",
       "2             False   (30.0, 40.0]  United-States   >50K  \n",
       "3             False   (30.0, 40.0]  United-States   >50K  \n",
       "5             False   (20.0, 30.0]  United-States  <=50K  \n",
       "...             ...            ...            ...    ...  \n",
       "48837         False   (30.0, 40.0]  United-States  <=50K  \n",
       "48838         False   (30.0, 40.0]  United-States   >50K  \n",
       "48839         False   (30.0, 40.0]  United-States  <=50K  \n",
       "48840         False   (10.0, 20.0]  United-States  <=50K  \n",
       "48841         False   (30.0, 40.0]  United-States   >50K  \n",
       "\n",
       "[45222 rows x 13 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(\"datasets/adult.csv\", na_values=[\" ?\", \"?\"])\n",
    "df.drop(columns=[\"fnlwgt\", \"educational-num\"], inplace=True)\n",
    "df.dropna(inplace=True)\n",
    "df.age = pd.cut(df.age, bins=[10, 20, 30, 40, 50, 60, 70, 80, 90]).astype(str)\n",
    "df[\"hours-per-week\"] = pd.cut(df[\"hours-per-week\"], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 1000], include_lowest=True).astype(str)\n",
    "df[\"capital-gain\"] = df[\"capital-gain\"] > 0\n",
    "df[\"capital-loss\"] = df[\"capital-loss\"] > 0\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "3a0f35d6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>workclass</th>\n",
       "      <th>education</th>\n",
       "      <th>marital-status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>gender</th>\n",
       "      <th>capital-gain</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>hours-per-week</th>\n",
       "      <th>native-country</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>15</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48837</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>12</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48838</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48839</th>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48840</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48841</th>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>11</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>45222 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       age  workclass  education  marital-status  occupation  relationship  \\\n",
       "0        1          2          1               4           6             3   \n",
       "1        2          2         11               2           4             0   \n",
       "2        1          1          7               2          10             0   \n",
       "3        3          2         15               2           6             0   \n",
       "5        2          2          0               4           7             1   \n",
       "...    ...        ...        ...             ...         ...           ...   \n",
       "48837    1          2          7               2          12             5   \n",
       "48838    2          2         11               2           6             0   \n",
       "48839    4          2         11               6           0             4   \n",
       "48840    1          2         11               4           0             3   \n",
       "48841    4          3         11               2           3             5   \n",
       "\n",
       "       race  gender  capital-gain  capital-loss  hours-per-week  \\\n",
       "0         2       1             0             0               3   \n",
       "1         4       1             0             0               4   \n",
       "2         4       1             0             0               3   \n",
       "3         2       1             1             0               3   \n",
       "5         4       1             0             0               2   \n",
       "...     ...     ...           ...           ...             ...   \n",
       "48837     4       0             0             0               3   \n",
       "48838     4       1             0             0               3   \n",
       "48839     4       0             0             0               3   \n",
       "48840     4       1             0             0               1   \n",
       "48841     4       0             1             0               3   \n",
       "\n",
       "       native-country  income  \n",
       "0                  38       0  \n",
       "1                  38       0  \n",
       "2                  38       1  \n",
       "3                  38       1  \n",
       "5                  38       0  \n",
       "...               ...     ...  \n",
       "48837              38       0  \n",
       "48838              38       1  \n",
       "48839              38       0  \n",
       "48840              38       0  \n",
       "48841              38       1  \n",
       "\n",
       "[45222 rows x 13 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for col in df.columns:\n",
    "    df[col] = df[col].astype(\"category\")\n",
    "    df[col] = df[col].cat.codes\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5512d5a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(\"datasets/adult-preprocessed.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7dace50c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "weakening-dp-bounds",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
