{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import absolute_import\n",
    "from __future__ import division\n",
    "from __future__ import print_function\n",
    "\n",
    "import gzip\n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from six.moves import urllib\n",
    "\n",
    "# import tensorflow as tf\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "ADULT_SOURCE_URL =\\\n",
    "  'https://archive.ics.uci.edu/dataset/2/adult'\n",
    "GERMAN_SOURCE_URL =\\\n",
    "  'https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data'\n",
    "\n",
    "ADULT_RAW_COL_NAMES =\\\n",
    "  [\"age\",\\\n",
    "    \"workclass\",\\\n",
    "    \"fnlwgt\",\\\n",
    "    \"education\",\\\n",
    "    \"education-num\",\\\n",
    "    \"marital-status\",\\\n",
    "    \"occupation\",\\\n",
    "    \"relationship\",\\\n",
    "    \"race\",\\\n",
    "    \"sex\",\\\n",
    "    \"capital-gain\",\\\n",
    "    \"capital-loss\",\\\n",
    "    \"hours-per-week\",\\\n",
    "    \"native-country\",\n",
    "    \"income\"\\\n",
    "  ]\n",
    "\n",
    "ADULT_RAW_COL_FACTOR =\\\n",
    "  [1,3,5,6,7,8,13]\n",
    "\n",
    "ADULT_VALIDATION_SPLIT = 0.5\n",
    "\n",
    "GERMAN_RAW_COL_NAMES =\\\n",
    "  [\"checking_acc\",\\\n",
    "    \"duration\",\\\n",
    "    \"credit_hist\",\\\n",
    "    \"purpose\",\\\n",
    "    \"credit_amount\",\\\n",
    "    \"savings\",\\\n",
    "    \"employment_status\",\\\n",
    "    \"install_rate\",\\\n",
    "    \"relationship_and_sex\",\\\n",
    "    \"debtors\",\\\n",
    "    \"res_interval\",\\\n",
    "    \"property\",\\\n",
    "    \"age\",\\\n",
    "    \"other_plans\",\n",
    "    \"housing\",\\\n",
    "    \"credits_at_bank\",\n",
    "    \"job\",\n",
    "    \"liable_persons\",\n",
    "    \"phone\",\n",
    "    \"foreign\",\n",
    "    \"cr_good_bad\"\n",
    "  ]\n",
    "\n",
    "GERMAN_RAW_COL_FACTOR =\\\n",
    "  [0,2,3,5,6,8,9,11,13,14,16,18,19]\n",
    "\n",
    "GERMAN_VAL_SPLIT = 0.2\n",
    "GERMAN_TEST_SPLIT = 0.2\n",
    "\n",
    "DATA_DIRECTORY = \"data/\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def maybe_download(adult_flag=False, german_flag=False):\n",
    "  if not adult_flag and not german_flag:\n",
    "    raise(\"Neither flag specified, aborting.\")\n",
    "\n",
    "  if adult_flag:\n",
    "    maybe_download_adult(\"adult.data\")\n",
    "    maybe_download_adult(\"adult.test\")\n",
    "\n",
    "  if german_flag:\n",
    "    maybe_download_german(\"german.data\")\n",
    "\n",
    "# Download ADULT data\n",
    "def maybe_download_adult(filename):\n",
    "\n",
    "  # if not tf.gfile.Exists(DATA_DIRECTORY):\n",
    "  #   tf.gfile.MakeDirs(DATA_DIRECTORY + \"raw\")\n",
    "\n",
    "  filepath = os.path.join(DATA_DIRECTORY + \"raw\", filename)\n",
    "\n",
    "  # if not tf.gfile.Exists(filepath):\n",
    "  #   filepath, _ = urllib.request.urlretrieve(ADULT_SOURCE_URL + filename, filepath)\n",
    "  #   with tf.gfile.GFile(filepath) as f:\n",
    "  #     size = f.size()\n",
    "  #   print('Successfully downloaded', filename, size, 'bytes.')\n",
    "  filepath, _ = urllib.request.urlretrieve(ADULT_SOURCE_URL + filename, filepath)\n",
    "\n",
    "  return filepath\n",
    "\n",
    "# Download german data\n",
    "def maybe_download_german(filename):\n",
    "\n",
    "  # if not tf.gfile.Exists(DATA_DIRECTORY):\n",
    "  #   tf.gfile.MakeDirs(DATA_DIRECTORY + \"raw\")\n",
    "\n",
    "  filepath = os.path.join(DATA_DIRECTORY + \"raw\", filename)\n",
    "\n",
    "  # if not tf.gfile.Exists(filepath):\n",
    "  #   filepath, _ = urllib.request.urlretrieve(GERMAN_SOURCE_URL + filename, filepath)\n",
    "  #   with tf.gfile.GFile(filepath) as f:\n",
    "  #     size = f.size()\n",
    "  #   print('Successfully downloaded', filename, size, 'bytes.')\n",
    "  filepath, _ = urllib.request.urlretrieve(GERMAN_SOURCE_URL + filename, filepath)\n",
    "\n",
    "  return filepath\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as \"Census Income\" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Sep 09 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0))\\r\\n\\r\\nPrediction task is to determine whether a person makes over 50K a year.\\r\\n', 'purpose': None, 'funded_by': None, 'instances_represent': None, 'recommended_data_splits': None, 'sensitive_data': None, 'preprocessing_description': None, 'variable_info': 'Listing of attributes:\\r\\n\\r\\n>50K, <=50K.\\r\\n\\r\\nage: continuous.\\r\\nworkclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.\\r\\nfnlwgt: continuous.\\r\\neducation: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.\\r\\neducation-num: continuous.\\r\\nmarital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.\\r\\noccupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.\\r\\nrelationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.\\r\\nrace: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.\\r\\nsex: Female, Male.\\r\\ncapital-gain: continuous.\\r\\ncapital-loss: continuous.\\r\\nhours-per-week: continuous.\\r\\nnative-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.', 'citation': None}}\n",
      "              name     role         type      demographic  \\\n",
      "0              age  Feature      Integer              Age   \n",
      "1        workclass  Feature  Categorical           Income   \n",
      "2           fnlwgt  Feature      Integer             None   \n",
      "3        education  Feature  Categorical  Education Level   \n",
      "4    education-num  Feature      Integer  Education Level   \n",
      "5   marital-status  Feature  Categorical            Other   \n",
      "6       occupation  Feature  Categorical            Other   \n",
      "7     relationship  Feature  Categorical            Other   \n",
      "8             race  Feature  Categorical             Race   \n",
      "9              sex  Feature       Binary              Sex   \n",
      "10    capital-gain  Feature      Integer             None   \n",
      "11    capital-loss  Feature      Integer             None   \n",
      "12  hours-per-week  Feature      Integer             None   \n",
      "13  native-country  Feature  Categorical            Other   \n",
      "14          income   Target       Binary           Income   \n",
      "\n",
      "                                          description units missing_values  \n",
      "0                                                 N/A  None             no  \n",
      "1   Private, Self-emp-not-inc, Self-emp-inc, Feder...  None            yes  \n",
      "2                                                None  None             no  \n",
      "3    Bachelors, Some-college, 11th, HS-grad, Prof-...  None             no  \n",
      "4                                                None  None             no  \n",
      "5   Married-civ-spouse, Divorced, Never-married, S...  None             no  \n",
      "6   Tech-support, Craft-repair, Other-service, Sal...  None            yes  \n",
      "7   Wife, Own-child, Husband, Not-in-family, Other...  None             no  \n",
      "8   White, Asian-Pac-Islander, Amer-Indian-Eskimo,...  None             no  \n",
      "9                                       Female, Male.  None             no  \n",
      "10                                               None  None             no  \n",
      "11                                               None  None             no  \n",
      "12                                               None  None             no  \n",
      "13  United-States, Cambodia, England, Puerto-Rico,...  None            yes  \n",
      "14                                       >50K, <=50K.  None             no  \n"
     ]
    }
   ],
   "source": [
    "from ucimlrepo import fetch_ucirepo \n",
    "  \n",
    "# fetch dataset \n",
    "adult = fetch_ucirepo(id=2) \n",
    "  \n",
    "# data (as pandas dataframes) \n",
    "X = adult.data.features \n",
    "y = adult.data.targets \n",
    "  \n",
    "# metadata \n",
    "print(adult.metadata) \n",
    "  \n",
    "# variable information \n",
    "print(adult.variables) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>workclass</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education</th>\n",
       "      <th>education-num</th>\n",
       "      <th>marital-status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>sex</th>\n",
       "      <th>capital-gain</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>hours-per-week</th>\n",
       "      <th>native-country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>39</td>\n",
       "      <td>State-gov</td>\n",
       "      <td>77516</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>2174</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>50</td>\n",
       "      <td>Self-emp-not-inc</td>\n",
       "      <td>83311</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13</td>\n",
       "      <td>United-States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38</td>\n",
       "      <td>Private</td>\n",
       "      <td>215646</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Handlers-cleaners</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>53</td>\n",
       "      <td>Private</td>\n",
       "      <td>234721</td>\n",
       "      <td>11th</td>\n",
       "      <td>7</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Handlers-cleaners</td>\n",
       "      <td>Husband</td>\n",
       "      <td>Black</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>28</td>\n",
       "      <td>Private</td>\n",
       "      <td>338409</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Wife</td>\n",
       "      <td>Black</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>Cuba</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48837</th>\n",
       "      <td>39</td>\n",
       "      <td>Private</td>\n",
       "      <td>215419</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>36</td>\n",
       "      <td>United-States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48838</th>\n",
       "      <td>64</td>\n",
       "      <td>NaN</td>\n",
       "      <td>321403</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Widowed</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Other-relative</td>\n",
       "      <td>Black</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48839</th>\n",
       "      <td>38</td>\n",
       "      <td>Private</td>\n",
       "      <td>374983</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>50</td>\n",
       "      <td>United-States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48840</th>\n",
       "      <td>44</td>\n",
       "      <td>Private</td>\n",
       "      <td>83891</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Own-child</td>\n",
       "      <td>Asian-Pac-Islander</td>\n",
       "      <td>Male</td>\n",
       "      <td>5455</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48841</th>\n",
       "      <td>35</td>\n",
       "      <td>Self-emp-inc</td>\n",
       "      <td>182148</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>60</td>\n",
       "      <td>United-States</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>48842 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       age         workclass  fnlwgt  education  education-num  \\\n",
       "0       39         State-gov   77516  Bachelors             13   \n",
       "1       50  Self-emp-not-inc   83311  Bachelors             13   \n",
       "2       38           Private  215646    HS-grad              9   \n",
       "3       53           Private  234721       11th              7   \n",
       "4       28           Private  338409  Bachelors             13   \n",
       "...    ...               ...     ...        ...            ...   \n",
       "48837   39           Private  215419  Bachelors             13   \n",
       "48838   64               NaN  321403    HS-grad              9   \n",
       "48839   38           Private  374983  Bachelors             13   \n",
       "48840   44           Private   83891  Bachelors             13   \n",
       "48841   35      Self-emp-inc  182148  Bachelors             13   \n",
       "\n",
       "           marital-status         occupation    relationship  \\\n",
       "0           Never-married       Adm-clerical   Not-in-family   \n",
       "1      Married-civ-spouse    Exec-managerial         Husband   \n",
       "2                Divorced  Handlers-cleaners   Not-in-family   \n",
       "3      Married-civ-spouse  Handlers-cleaners         Husband   \n",
       "4      Married-civ-spouse     Prof-specialty            Wife   \n",
       "...                   ...                ...             ...   \n",
       "48837            Divorced     Prof-specialty   Not-in-family   \n",
       "48838             Widowed                NaN  Other-relative   \n",
       "48839  Married-civ-spouse     Prof-specialty         Husband   \n",
       "48840            Divorced       Adm-clerical       Own-child   \n",
       "48841  Married-civ-spouse    Exec-managerial         Husband   \n",
       "\n",
       "                     race     sex  capital-gain  capital-loss  hours-per-week  \\\n",
       "0                   White    Male          2174             0              40   \n",
       "1                   White    Male             0             0              13   \n",
       "2                   White    Male             0             0              40   \n",
       "3                   Black    Male             0             0              40   \n",
       "4                   Black  Female             0             0              40   \n",
       "...                   ...     ...           ...           ...             ...   \n",
       "48837               White  Female             0             0              36   \n",
       "48838               Black    Male             0             0              40   \n",
       "48839               White    Male             0             0              50   \n",
       "48840  Asian-Pac-Islander    Male          5455             0              40   \n",
       "48841               White    Male             0             0              60   \n",
       "\n",
       "      native-country  \n",
       "0      United-States  \n",
       "1      United-States  \n",
       "2      United-States  \n",
       "3      United-States  \n",
       "4               Cuba  \n",
       "...              ...  \n",
       "48837  United-States  \n",
       "48838  United-States  \n",
       "48839  United-States  \n",
       "48840  United-States  \n",
       "48841  United-States  \n",
       "\n",
       "[48842 rows x 14 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48837</th>\n",
       "      <td>&lt;=50K.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48838</th>\n",
       "      <td>&lt;=50K.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48839</th>\n",
       "      <td>&lt;=50K.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48840</th>\n",
       "      <td>&lt;=50K.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48841</th>\n",
       "      <td>&gt;50K.</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>48842 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       income\n",
       "0       <=50K\n",
       "1       <=50K\n",
       "2       <=50K\n",
       "3       <=50K\n",
       "4       <=50K\n",
       "...       ...\n",
       "48837  <=50K.\n",
       "48838  <=50K.\n",
       "48839  <=50K.\n",
       "48840  <=50K.\n",
       "48841   >50K.\n",
       "\n",
       "[48842 rows x 1 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data = X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data['income'] = y\n",
    "all_data.dropna(inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>workclass</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education</th>\n",
       "      <th>education-num</th>\n",
       "      <th>marital-status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>sex</th>\n",
       "      <th>capital-gain</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>hours-per-week</th>\n",
       "      <th>native-country</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>39</td>\n",
       "      <td>State-gov</td>\n",
       "      <td>77516</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>2174</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>50</td>\n",
       "      <td>Self-emp-not-inc</td>\n",
       "      <td>83311</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38</td>\n",
       "      <td>Private</td>\n",
       "      <td>215646</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Handlers-cleaners</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>53</td>\n",
       "      <td>Private</td>\n",
       "      <td>234721</td>\n",
       "      <td>11th</td>\n",
       "      <td>7</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Handlers-cleaners</td>\n",
       "      <td>Husband</td>\n",
       "      <td>Black</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>28</td>\n",
       "      <td>Private</td>\n",
       "      <td>338409</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Wife</td>\n",
       "      <td>Black</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>Cuba</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48836</th>\n",
       "      <td>33</td>\n",
       "      <td>Private</td>\n",
       "      <td>245211</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Own-child</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48837</th>\n",
       "      <td>39</td>\n",
       "      <td>Private</td>\n",
       "      <td>215419</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>36</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48839</th>\n",
       "      <td>38</td>\n",
       "      <td>Private</td>\n",
       "      <td>374983</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>50</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48840</th>\n",
       "      <td>44</td>\n",
       "      <td>Private</td>\n",
       "      <td>83891</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Own-child</td>\n",
       "      <td>Asian-Pac-Islander</td>\n",
       "      <td>Male</td>\n",
       "      <td>5455</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48841</th>\n",
       "      <td>35</td>\n",
       "      <td>Self-emp-inc</td>\n",
       "      <td>182148</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>60</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;50K.</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>47621 rows × 15 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       age         workclass  fnlwgt  education  education-num  \\\n",
       "0       39         State-gov   77516  Bachelors             13   \n",
       "1       50  Self-emp-not-inc   83311  Bachelors             13   \n",
       "2       38           Private  215646    HS-grad              9   \n",
       "3       53           Private  234721       11th              7   \n",
       "4       28           Private  338409  Bachelors             13   \n",
       "...    ...               ...     ...        ...            ...   \n",
       "48836   33           Private  245211  Bachelors             13   \n",
       "48837   39           Private  215419  Bachelors             13   \n",
       "48839   38           Private  374983  Bachelors             13   \n",
       "48840   44           Private   83891  Bachelors             13   \n",
       "48841   35      Self-emp-inc  182148  Bachelors             13   \n",
       "\n",
       "           marital-status         occupation   relationship  \\\n",
       "0           Never-married       Adm-clerical  Not-in-family   \n",
       "1      Married-civ-spouse    Exec-managerial        Husband   \n",
       "2                Divorced  Handlers-cleaners  Not-in-family   \n",
       "3      Married-civ-spouse  Handlers-cleaners        Husband   \n",
       "4      Married-civ-spouse     Prof-specialty           Wife   \n",
       "...                   ...                ...            ...   \n",
       "48836       Never-married     Prof-specialty      Own-child   \n",
       "48837            Divorced     Prof-specialty  Not-in-family   \n",
       "48839  Married-civ-spouse     Prof-specialty        Husband   \n",
       "48840            Divorced       Adm-clerical      Own-child   \n",
       "48841  Married-civ-spouse    Exec-managerial        Husband   \n",
       "\n",
       "                     race     sex  capital-gain  capital-loss  hours-per-week  \\\n",
       "0                   White    Male          2174             0              40   \n",
       "1                   White    Male             0             0              13   \n",
       "2                   White    Male             0             0              40   \n",
       "3                   Black    Male             0             0              40   \n",
       "4                   Black  Female             0             0              40   \n",
       "...                   ...     ...           ...           ...             ...   \n",
       "48836               White    Male             0             0              40   \n",
       "48837               White  Female             0             0              36   \n",
       "48839               White    Male             0             0              50   \n",
       "48840  Asian-Pac-Islander    Male          5455             0              40   \n",
       "48841               White    Male             0             0              60   \n",
       "\n",
       "      native-country  income  \n",
       "0      United-States   <=50K  \n",
       "1      United-States   <=50K  \n",
       "2      United-States   <=50K  \n",
       "3      United-States   <=50K  \n",
       "4               Cuba   <=50K  \n",
       "...              ...     ...  \n",
       "48836  United-States  <=50K.  \n",
       "48837  United-States  <=50K.  \n",
       "48839  United-States  <=50K.  \n",
       "48840  United-States  <=50K.  \n",
       "48841  United-States   >50K.  \n",
       "\n",
       "[47621 rows x 15 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data = pd.get_dummies(all_data,\\\n",
    "  columns=[ADULT_RAW_COL_NAMES[i] for i in ADULT_RAW_COL_FACTOR])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "native-country\n",
       "United-States                 42958\n",
       "Mexico                          936\n",
       "?                               583\n",
       "Philippines                     293\n",
       "Germany                         202\n",
       "Puerto-Rico                     180\n",
       "Canada                          177\n",
       "El-Salvador                     153\n",
       "India                           147\n",
       "Cuba                            136\n",
       "England                         123\n",
       "China                           120\n",
       "South                           110\n",
       "Italy                           105\n",
       "Jamaica                         104\n",
       "Dominican-Republic              100\n",
       "Japan                            92\n",
       "Guatemala                        87\n",
       "Vietnam                          86\n",
       "Poland                           85\n",
       "Columbia                         85\n",
       "Haiti                            71\n",
       "Portugal                         65\n",
       "Taiwan                           64\n",
       "Iran                             57\n",
       "Greece                           49\n",
       "Nicaragua                        49\n",
       "Peru                             46\n",
       "Ecuador                          44\n",
       "France                           38\n",
       "Ireland                          36\n",
       "Thailand                         30\n",
       "Hong                             29\n",
       "Cambodia                         27\n",
       "Trinadad&Tobago                  27\n",
       "Yugoslavia                       23\n",
       "Laos                             22\n",
       "Outlying-US(Guam-USVI-etc)       22\n",
       "Scotland                         21\n",
       "Honduras                         20\n",
       "Hungary                          18\n",
       "Holand-Netherlands                1\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X['native-country'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education-num</th>\n",
       "      <th>sex</th>\n",
       "      <th>capital-gain</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>hours-per-week</th>\n",
       "      <th>income</th>\n",
       "      <th>workclass_?</th>\n",
       "      <th>workclass_Federal-gov</th>\n",
       "      <th>...</th>\n",
       "      <th>native-country_Portugal</th>\n",
       "      <th>native-country_Puerto-Rico</th>\n",
       "      <th>native-country_Scotland</th>\n",
       "      <th>native-country_South</th>\n",
       "      <th>native-country_Taiwan</th>\n",
       "      <th>native-country_Thailand</th>\n",
       "      <th>native-country_Trinadad&amp;Tobago</th>\n",
       "      <th>native-country_United-States</th>\n",
       "      <th>native-country_Vietnam</th>\n",
       "      <th>native-country_Yugoslavia</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>39</td>\n",
       "      <td>77516</td>\n",
       "      <td>13</td>\n",
       "      <td>Male</td>\n",
       "      <td>2174</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>&lt;=50K</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>50</td>\n",
       "      <td>83311</td>\n",
       "      <td>13</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13</td>\n",
       "      <td>&lt;=50K</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38</td>\n",
       "      <td>215646</td>\n",
       "      <td>9</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>&lt;=50K</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>53</td>\n",
       "      <td>234721</td>\n",
       "      <td>7</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>&lt;=50K</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>28</td>\n",
       "      <td>338409</td>\n",
       "      <td>13</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>&lt;=50K</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48836</th>\n",
       "      <td>33</td>\n",
       "      <td>245211</td>\n",
       "      <td>13</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>&lt;=50K.</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48837</th>\n",
       "      <td>39</td>\n",
       "      <td>215419</td>\n",
       "      <td>13</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>36</td>\n",
       "      <td>&lt;=50K.</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48839</th>\n",
       "      <td>38</td>\n",
       "      <td>374983</td>\n",
       "      <td>13</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>50</td>\n",
       "      <td>&lt;=50K.</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48840</th>\n",
       "      <td>44</td>\n",
       "      <td>83891</td>\n",
       "      <td>13</td>\n",
       "      <td>Male</td>\n",
       "      <td>5455</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>&lt;=50K.</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48841</th>\n",
       "      <td>35</td>\n",
       "      <td>182148</td>\n",
       "      <td>13</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>60</td>\n",
       "      <td>&gt;50K.</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>47621 rows × 108 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       age  fnlwgt  education-num     sex  capital-gain  capital-loss  \\\n",
       "0       39   77516             13    Male          2174             0   \n",
       "1       50   83311             13    Male             0             0   \n",
       "2       38  215646              9    Male             0             0   \n",
       "3       53  234721              7    Male             0             0   \n",
       "4       28  338409             13  Female             0             0   \n",
       "...    ...     ...            ...     ...           ...           ...   \n",
       "48836   33  245211             13    Male             0             0   \n",
       "48837   39  215419             13  Female             0             0   \n",
       "48839   38  374983             13    Male             0             0   \n",
       "48840   44   83891             13    Male          5455             0   \n",
       "48841   35  182148             13    Male             0             0   \n",
       "\n",
       "       hours-per-week  income  workclass_?  workclass_Federal-gov  ...  \\\n",
       "0                  40   <=50K        False                  False  ...   \n",
       "1                  13   <=50K        False                  False  ...   \n",
       "2                  40   <=50K        False                  False  ...   \n",
       "3                  40   <=50K        False                  False  ...   \n",
       "4                  40   <=50K        False                  False  ...   \n",
       "...               ...     ...          ...                    ...  ...   \n",
       "48836              40  <=50K.        False                  False  ...   \n",
       "48837              36  <=50K.        False                  False  ...   \n",
       "48839              50  <=50K.        False                  False  ...   \n",
       "48840              40  <=50K.        False                  False  ...   \n",
       "48841              60   >50K.        False                  False  ...   \n",
       "\n",
       "       native-country_Portugal  native-country_Puerto-Rico  \\\n",
       "0                        False                       False   \n",
       "1                        False                       False   \n",
       "2                        False                       False   \n",
       "3                        False                       False   \n",
       "4                        False                       False   \n",
       "...                        ...                         ...   \n",
       "48836                    False                       False   \n",
       "48837                    False                       False   \n",
       "48839                    False                       False   \n",
       "48840                    False                       False   \n",
       "48841                    False                       False   \n",
       "\n",
       "       native-country_Scotland  native-country_South  native-country_Taiwan  \\\n",
       "0                        False                 False                  False   \n",
       "1                        False                 False                  False   \n",
       "2                        False                 False                  False   \n",
       "3                        False                 False                  False   \n",
       "4                        False                 False                  False   \n",
       "...                        ...                   ...                    ...   \n",
       "48836                    False                 False                  False   \n",
       "48837                    False                 False                  False   \n",
       "48839                    False                 False                  False   \n",
       "48840                    False                 False                  False   \n",
       "48841                    False                 False                  False   \n",
       "\n",
       "       native-country_Thailand  native-country_Trinadad&Tobago  \\\n",
       "0                        False                           False   \n",
       "1                        False                           False   \n",
       "2                        False                           False   \n",
       "3                        False                           False   \n",
       "4                        False                           False   \n",
       "...                        ...                             ...   \n",
       "48836                    False                           False   \n",
       "48837                    False                           False   \n",
       "48839                    False                           False   \n",
       "48840                    False                           False   \n",
       "48841                    False                           False   \n",
       "\n",
       "       native-country_United-States  native-country_Vietnam  \\\n",
       "0                              True                   False   \n",
       "1                              True                   False   \n",
       "2                              True                   False   \n",
       "3                              True                   False   \n",
       "4                             False                   False   \n",
       "...                             ...                     ...   \n",
       "48836                          True                   False   \n",
       "48837                          True                   False   \n",
       "48839                          True                   False   \n",
       "48840                          True                   False   \n",
       "48841                          True                   False   \n",
       "\n",
       "       native-country_Yugoslavia  \n",
       "0                          False  \n",
       "1                          False  \n",
       "2                          False  \n",
       "3                          False  \n",
       "4                          False  \n",
       "...                          ...  \n",
       "48836                      False  \n",
       "48837                      False  \n",
       "48839                      False  \n",
       "48840                      False  \n",
       "48841                      False  \n",
       "\n",
       "[47621 rows x 108 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "all_data.loc[all_data.income == \">50K\",\"income\"] = 1\n",
    "all_data.loc[all_data.income == \">50K.\",\"income\"] = 1\n",
    "all_data.loc[all_data.income == \"<=50K\",\"income\"] = 0\n",
    "all_data.loc[all_data.income == \"<=50K.\",\"income\"] = 0\n",
    "\n",
    "all_data.loc[all_data.sex == \"Female\",\"sex\"] = 1\n",
    "all_data.loc[all_data.sex == \"Male\",\"sex\"] = 0\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#all_data = pd.get_dummies(all_data,columns=[\"income\"],drop_first=True)\n",
    "\n",
    "cutoff = 30000 #train_data.shape[0]\n",
    "train_data = all_data.loc[:cutoff,\\\n",
    "  (all_data.columns != \"income\") & (all_data.columns != \"sex\")]\n",
    "train_c = all_data.loc[:cutoff,all_data.columns == \"sex\"]\n",
    "train_labels = all_data.loc[:cutoff,all_data.columns == \"income\"]\n",
    "\n",
    "test_data = all_data.loc[cutoff:,\\\n",
    "  (all_data.columns != \"income\") & (all_data.columns != \"sex\")]\n",
    "test_c = all_data.loc[cutoff:,all_data.columns == \"sex\"]\n",
    "\n",
    "test_labels = all_data.loc[cutoff:,all_data.columns == \"income\"]\n",
    "\n",
    "col_valid_in_train_data =\\\n",
    "    [len(train_data.loc[:,x].unique()) > 1 for x in train_data.columns]\n",
    "col_valid_in_test_data =\\\n",
    "    [len(test_data.loc[:,x].unique()) > 1 for x in test_data.columns]\n",
    "\n",
    "col_valid = list(map(lambda x,y: x and y, col_valid_in_train_data, col_valid_in_test_data))\n",
    "\n",
    "train_data = train_data.loc[:,col_valid]\n",
    "test_data = test_data.loc[:,col_valid]\n",
    "\n",
    "#split off validation data\n",
    "cutoff = int(ADULT_VALIDATION_SPLIT * train_data.shape[0])\n",
    "val_data = train_data.loc[cutoff:,:]\n",
    "train_data = train_data.loc[:cutoff,:]\n",
    "\n",
    "val_labels = train_labels.loc[cutoff:,:]\n",
    "train_labels = train_labels.loc[:cutoff,:]\n",
    "\n",
    "val_c = train_c.loc[cutoff:,:]\n",
    "train_c = train_c.loc[:cutoff,:]\n",
    "\n",
    "split_numbers = (train_data.shape[1],train_data.shape[1] + 1)\n",
    "train_size = train_data.shape[0]\n",
    "\n",
    "#data normalization\n",
    "maxes = np.maximum(np.maximum(\\\n",
    "  train_data.max(axis=0),\\\n",
    "  val_data.max(axis=0)),\\\n",
    "  test_data.max(axis=0))\n",
    "\n",
    "train_data = train_data / maxes\n",
    "test_data = test_data / maxes\n",
    "val_data = val_data / maxes\n",
    "\n",
    "train_data = np.concatenate(\\\n",
    "  (np.array(train_data), np.array(train_labels), np.array(train_c)),\\\n",
    "  axis=1\\\n",
    ")\n",
    "\n",
    "validation_data = np.array(val_data)\n",
    "validation_labels = np.array(val_labels)\n",
    "validation_c = np.array(val_c)\n",
    "test_data = np.array(test_data)\n",
    "test_labels = np.array(test_labels)\n",
    "test_c = np.array(test_c)\n",
    "\n",
    "np.save('data/adult_train.npy', train_data)\n",
    "np.save('data/adult_test.npy', np.concatenate((test_data, test_labels, test_c), axis=1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15001, 107)"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data.shape"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
