{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fab02d95",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "246eb247",
   "metadata": {},
   "source": [
    "# Adult Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ddfea006",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>workclass</th>\n",
       "      <th>education</th>\n",
       "      <th>marital-status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>gender</th>\n",
       "      <th>capital-gain</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>hours-per-week</th>\n",
       "      <th>native-country</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(20, 30]</td>\n",
       "      <td>Private</td>\n",
       "      <td>11th</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Machine-op-inspct</td>\n",
       "      <td>Own-child</td>\n",
       "      <td>Black</td>\n",
       "      <td>Male</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>(30, 40]</td>\n",
       "      <td>Private</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Farming-fishing</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(40.0, 50.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>(20, 30]</td>\n",
       "      <td>Local-gov</td>\n",
       "      <td>Assoc-acdm</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Protective-serv</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>(40, 50]</td>\n",
       "      <td>Private</td>\n",
       "      <td>Some-college</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Machine-op-inspct</td>\n",
       "      <td>Husband</td>\n",
       "      <td>Black</td>\n",
       "      <td>Male</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>(30, 40]</td>\n",
       "      <td>Private</td>\n",
       "      <td>10th</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Other-service</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(20.0, 30.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48837</th>\n",
       "      <td>(20, 30]</td>\n",
       "      <td>Private</td>\n",
       "      <td>Assoc-acdm</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Tech-support</td>\n",
       "      <td>Wife</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48838</th>\n",
       "      <td>(30, 40]</td>\n",
       "      <td>Private</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Machine-op-inspct</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48839</th>\n",
       "      <td>(50, 60]</td>\n",
       "      <td>Private</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Widowed</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Unmarried</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48840</th>\n",
       "      <td>(20, 30]</td>\n",
       "      <td>Private</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Own-child</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>(10.0, 20.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48841</th>\n",
       "      <td>(50, 60]</td>\n",
       "      <td>Self-emp-inc</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Wife</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>(30.0, 40.0]</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;50K</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>45222 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            age     workclass     education      marital-status  \\\n",
       "0      (20, 30]       Private          11th       Never-married   \n",
       "1      (30, 40]       Private       HS-grad  Married-civ-spouse   \n",
       "2      (20, 30]     Local-gov    Assoc-acdm  Married-civ-spouse   \n",
       "3      (40, 50]       Private  Some-college  Married-civ-spouse   \n",
       "5      (30, 40]       Private          10th       Never-married   \n",
       "...         ...           ...           ...                 ...   \n",
       "48837  (20, 30]       Private    Assoc-acdm  Married-civ-spouse   \n",
       "48838  (30, 40]       Private       HS-grad  Married-civ-spouse   \n",
       "48839  (50, 60]       Private       HS-grad             Widowed   \n",
       "48840  (20, 30]       Private       HS-grad       Never-married   \n",
       "48841  (50, 60]  Self-emp-inc       HS-grad  Married-civ-spouse   \n",
       "\n",
       "              occupation   relationship   race  gender  capital-gain  \\\n",
       "0      Machine-op-inspct      Own-child  Black    Male         False   \n",
       "1        Farming-fishing        Husband  White    Male         False   \n",
       "2        Protective-serv        Husband  White    Male         False   \n",
       "3      Machine-op-inspct        Husband  Black    Male          True   \n",
       "5          Other-service  Not-in-family  White    Male         False   \n",
       "...                  ...            ...    ...     ...           ...   \n",
       "48837       Tech-support           Wife  White  Female         False   \n",
       "48838  Machine-op-inspct        Husband  White    Male         False   \n",
       "48839       Adm-clerical      Unmarried  White  Female         False   \n",
       "48840       Adm-clerical      Own-child  White    Male         False   \n",
       "48841    Exec-managerial           Wife  White  Female          True   \n",
       "\n",
       "       capital-loss hours-per-week native-country income  \n",
       "0             False   (30.0, 40.0]  United-States  <=50K  \n",
       "1             False   (40.0, 50.0]  United-States  <=50K  \n",
       "2             False   (30.0, 40.0]  United-States   >50K  \n",
       "3             False   (30.0, 40.0]  United-States   >50K  \n",
       "5             False   (20.0, 30.0]  United-States  <=50K  \n",
       "...             ...            ...            ...    ...  \n",
       "48837         False   (30.0, 40.0]  United-States  <=50K  \n",
       "48838         False   (30.0, 40.0]  United-States   >50K  \n",
       "48839         False   (30.0, 40.0]  United-States  <=50K  \n",
       "48840         False   (10.0, 20.0]  United-States  <=50K  \n",
       "48841         False   (30.0, 40.0]  United-States   >50K  \n",
       "\n",
       "[45222 rows x 13 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Bin edges used to discretise the two numeric columns below.\n",
    "AGE_BINS = [10, 20, 30, 40, 50, 60, 70, 80, 90]\n",
    "HOURS_BINS = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 1000]\n",
    "\n",
    "# Load the raw Adult table, drop the two unused columns and every row with a\n",
    "# missing value, without mutating any intermediate frame in place.\n",
    "df = (\n",
    "    pd.read_csv(\"datasets/adult.csv\", na_values=[\" ?\", \"?\"])\n",
    "    .drop(columns=[\"fnlwgt\", \"educational-num\"])\n",
    "    .dropna()\n",
    ")\n",
    "\n",
    "age_binned = pd.cut(df[\"age\"], bins=AGE_BINS)\n",
    "hours_binned = pd.cut(df[\"hours-per-week\"], bins=HOURS_BINS, include_lowest=True)\n",
    "\n",
    "# pd.cut yields NaN for values outside the bin edges, and astype(str) would turn\n",
    "# those into a literal \"nan\" label; fail loudly instead of creating that bucket.\n",
    "assert age_binned.notna().all(), \"age values fall outside AGE_BINS\"\n",
    "assert hours_binned.notna().all(), \"hours-per-week values fall outside HOURS_BINS\"\n",
    "\n",
    "# Replace the numeric columns with interval labels and the capital columns with\n",
    "# boolean flags (True when the amount is positive); column order is unchanged.\n",
    "df = df.assign(\n",
    "    **{\n",
    "        \"age\": age_binned.astype(str),\n",
    "        \"hours-per-week\": hours_binned.astype(str),\n",
    "        \"capital-gain\": df[\"capital-gain\"] > 0,\n",
    "        \"capital-loss\": df[\"capital-loss\"] > 0,\n",
    "    }\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "3a0f35d6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>workclass</th>\n",
       "      <th>education</th>\n",
       "      <th>marital-status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>gender</th>\n",
       "      <th>capital-gain</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>hours-per-week</th>\n",
       "      <th>native-country</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>15</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48837</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>12</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48838</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48839</th>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48840</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48841</th>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>11</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>38</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>45222 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       age  workclass  education  marital-status  occupation  relationship  \\\n",
       "0        1          2          1               4           6             3   \n",
       "1        2          2         11               2           4             0   \n",
       "2        1          1          7               2          10             0   \n",
       "3        3          2         15               2           6             0   \n",
       "5        2          2          0               4           7             1   \n",
       "...    ...        ...        ...             ...         ...           ...   \n",
       "48837    1          2          7               2          12             5   \n",
       "48838    2          2         11               2           6             0   \n",
       "48839    4          2         11               6           0             4   \n",
       "48840    1          2         11               4           0             3   \n",
       "48841    4          3         11               2           3             5   \n",
       "\n",
       "       race  gender  capital-gain  capital-loss  hours-per-week  \\\n",
       "0         2       1             0             0               3   \n",
       "1         4       1             0             0               4   \n",
       "2         4       1             0             0               3   \n",
       "3         2       1             1             0               3   \n",
       "5         4       1             0             0               2   \n",
       "...     ...     ...           ...           ...             ...   \n",
       "48837     4       0             0             0               3   \n",
       "48838     4       1             0             0               3   \n",
       "48839     4       0             0             0               3   \n",
       "48840     4       1             0             0               1   \n",
       "48841     4       0             1             0               3   \n",
       "\n",
       "       native-country  income  \n",
       "0                  38       0  \n",
       "1                  38       0  \n",
       "2                  38       1  \n",
       "3                  38       1  \n",
       "5                  38       0  \n",
       "...               ...     ...  \n",
       "48837              38       0  \n",
       "48838              38       1  \n",
       "48839              38       0  \n",
       "48840              38       0  \n",
       "48841              38       1  \n",
       "\n",
       "[45222 rows x 13 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Encode every column as integers: each column is cast to the pandas category\n",
    "# dtype and then replaced by the integer codes of its categories.\n",
    "df = df.apply(lambda column: column.astype(\"category\").cat.codes)\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5512d5a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the encoded frame as CSV; index=False leaves the row index out of the file.\n",
    "preprocessed_path = \"datasets/adult-preprocessed.csv\"\n",
    "df.to_csv(preprocessed_path, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3556b2a3",
   "metadata": {},
   "source": [
    "# Dutch Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a39a0233",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gender</th>\n",
       "      <th>age_group</th>\n",
       "      <th>household_position</th>\n",
       "      <th>household_size</th>\n",
       "      <th>prev_residence_place</th>\n",
       "      <th>citizenship</th>\n",
       "      <th>country_birth</th>\n",
       "      <th>edu_level</th>\n",
       "      <th>economic_status</th>\n",
       "      <th>cur_eco_activity</th>\n",
       "      <th>Marital_status</th>\n",
       "      <th>occupation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>1131</td>\n",
       "      <td>112</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>111</td>\n",
       "      <td>135</td>\n",
       "      <td>1</td>\n",
       "      <td>2_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>10</td>\n",
       "      <td>1122</td>\n",
       "      <td>113</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>111</td>\n",
       "      <td>122</td>\n",
       "      <td>2</td>\n",
       "      <td>5_4_9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>1122</td>\n",
       "      <td>113</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>111</td>\n",
       "      <td>122</td>\n",
       "      <td>2</td>\n",
       "      <td>2_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>1121</td>\n",
       "      <td>112</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>111</td>\n",
       "      <td>137</td>\n",
       "      <td>2</td>\n",
       "      <td>5_4_9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1110</td>\n",
       "      <td>114</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>111</td>\n",
       "      <td>138</td>\n",
       "      <td>1</td>\n",
       "      <td>5_4_9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60415</th>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>1122</td>\n",
       "      <td>125</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>111</td>\n",
       "      <td>131</td>\n",
       "      <td>2</td>\n",
       "      <td>2_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60416</th>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>1110</td>\n",
       "      <td>114</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>111</td>\n",
       "      <td>135</td>\n",
       "      <td>1</td>\n",
       "      <td>5_4_9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60417</th>\n",
       "      <td>1</td>\n",
       "      <td>11</td>\n",
       "      <td>1121</td>\n",
       "      <td>112</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>111</td>\n",
       "      <td>135</td>\n",
       "      <td>2</td>\n",
       "      <td>2_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60418</th>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "      <td>1131</td>\n",
       "      <td>112</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>111</td>\n",
       "      <td>137</td>\n",
       "      <td>1</td>\n",
       "      <td>2_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60419</th>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>1121</td>\n",
       "      <td>112</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>111</td>\n",
       "      <td>137</td>\n",
       "      <td>2</td>\n",
       "      <td>2_1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>60420 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       gender  age_group  household_position  household_size  \\\n",
       "0           1          6                1131             112   \n",
       "1           2         10                1122             113   \n",
       "2           1          8                1122             113   \n",
       "3           1         12                1121             112   \n",
       "4           2          4                1110             114   \n",
       "...       ...        ...                 ...             ...   \n",
       "60415       1         10                1122             125   \n",
       "60416       2          5                1110             114   \n",
       "60417       1         11                1121             112   \n",
       "60418       2          7                1131             112   \n",
       "60419       1         12                1121             112   \n",
       "\n",
       "       prev_residence_place  citizenship  country_birth  edu_level  \\\n",
       "0                         1            1              1          5   \n",
       "1                         1            1              1          2   \n",
       "2                         1            1              1          2   \n",
       "3                         1            1              1          1   \n",
       "4                         1            1              1          2   \n",
       "...                     ...          ...            ...        ...   \n",
       "60415                     1            1              1          3   \n",
       "60416                     1            1              1          5   \n",
       "60417                     1            1              1          5   \n",
       "60418                     1            1              1          5   \n",
       "60419                     1            1              1          5   \n",
       "\n",
       "       economic_status  cur_eco_activity  Marital_status occupation  \n",
       "0                  111               135               1        2_1  \n",
       "1                  111               122               2      5_4_9  \n",
       "2                  111               122               2        2_1  \n",
       "3                  111               137               2      5_4_9  \n",
       "4                  111               138               1      5_4_9  \n",
       "...                ...               ...             ...        ...  \n",
       "60415              111               131               2        2_1  \n",
       "60416              111               135               1      5_4_9  \n",
       "60417              111               135               2        2_1  \n",
       "60418              111               137               1        2_1  \n",
       "60419              111               137               2        2_1  \n",
       "\n",
       "[60420 rows x 12 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the Dutch table; index_col=False stops pandas from using the first\n",
    "# column as the row index.\n",
    "dutch_path = \"datasets/dutch.csv\"\n",
    "df = pd.read_csv(dutch_path, index_col=False)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b059e31b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gender\n",
      "2    30273\n",
      "1    30147\n",
      "Name: count, dtype: int64\n",
      "age_group\n",
      "8     8748\n",
      "9     8478\n",
      "7     8289\n",
      "10    7880\n",
      "11    7021\n",
      "6     6292\n",
      "5     4770\n",
      "12    3881\n",
      "4     3801\n",
      "13    1022\n",
      "14     176\n",
      "15      62\n",
      "Name: count, dtype: int64\n",
      "household_position\n",
      "1122    26225\n",
      "1121     9975\n",
      "1110     7229\n",
      "1210     6529\n",
      "1131     5818\n",
      "1132     2291\n",
      "1140     1824\n",
      "1220      529\n",
      "Name: count, dtype: int64\n",
      "household_size\n",
      "112    17237\n",
      "114    16370\n",
      "113    12238\n",
      "111     6529\n",
      "125     6179\n",
      "126     1867\n",
      "Name: count, dtype: int64\n",
      "prev_residence_place\n",
      "1    58943\n",
      "2     1477\n",
      "Name: count, dtype: int64\n",
      "citizenship\n",
      "1    59225\n",
      "2      843\n",
      "3      352\n",
      "Name: count, dtype: int64\n",
      "country_birth\n",
      "1    56058\n",
      "3     2821\n",
      "2     1541\n",
      "Name: count, dtype: int64\n",
      "edu_level\n",
      "3    22672\n",
      "5    18109\n",
      "2    12326\n",
      "1     4513\n",
      "4     2580\n",
      "0      220\n",
      "Name: count, dtype: int64\n",
      "economic_status\n",
      "111    51340\n",
      "120     4771\n",
      "112     4309\n",
      "Name: count, dtype: int64\n",
      "cur_eco_activity\n",
      "131    11621\n",
      "135    10239\n",
      "138     8168\n",
      "122     6505\n",
      "137     5862\n",
      "136     4294\n",
      "133     3062\n",
      "139     2661\n",
      "132     2616\n",
      "134     1940\n",
      "111     1738\n",
      "124     1714\n",
      "Name: count, dtype: int64\n",
      "Marital_status\n",
      "2    36655\n",
      "1    19656\n",
      "4     3566\n",
      "3      543\n",
      "Name: count, dtype: int64\n",
      "occupation\n",
      "5_4_9    31657\n",
      "2_1      28763\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Print the frequency of each distinct value, one column at a time.\n",
    "for _, column in df.items():\n",
    "    print(column.value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7dace50c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "weakening-dp-bounds",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
