{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "is_executing": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import preprocessing\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression, LinearRegression\n",
    "from sklearn.metrics import accuracy_score, mean_squared_error, r2_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Adult Census"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('datasets/raw/adult.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>workclass</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education</th>\n",
       "      <th>education.num</th>\n",
       "      <th>marital.status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>sex</th>\n",
       "      <th>capital.gain</th>\n",
       "      <th>capital.loss</th>\n",
       "      <th>hours.per.week</th>\n",
       "      <th>native.country</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>90</td>\n",
       "      <td>?</td>\n",
       "      <td>77053</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Widowed</td>\n",
       "      <td>?</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>4356</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>82</td>\n",
       "      <td>Private</td>\n",
       "      <td>132870</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Widowed</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>4356</td>\n",
       "      <td>18</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>66</td>\n",
       "      <td>?</td>\n",
       "      <td>186061</td>\n",
       "      <td>Some-college</td>\n",
       "      <td>10</td>\n",
       "      <td>Widowed</td>\n",
       "      <td>?</td>\n",
       "      <td>Unmarried</td>\n",
       "      <td>Black</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>4356</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>54</td>\n",
       "      <td>Private</td>\n",
       "      <td>140359</td>\n",
       "      <td>7th-8th</td>\n",
       "      <td>4</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Machine-op-inspct</td>\n",
       "      <td>Unmarried</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>3900</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>41</td>\n",
       "      <td>Private</td>\n",
       "      <td>264663</td>\n",
       "      <td>Some-college</td>\n",
       "      <td>10</td>\n",
       "      <td>Separated</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Own-child</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>3900</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>34</td>\n",
       "      <td>Private</td>\n",
       "      <td>216864</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Other-service</td>\n",
       "      <td>Unmarried</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>3770</td>\n",
       "      <td>45</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>38</td>\n",
       "      <td>Private</td>\n",
       "      <td>150601</td>\n",
       "      <td>10th</td>\n",
       "      <td>6</td>\n",
       "      <td>Separated</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Unmarried</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>3770</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>74</td>\n",
       "      <td>State-gov</td>\n",
       "      <td>88638</td>\n",
       "      <td>Doctorate</td>\n",
       "      <td>16</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Other-relative</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>3683</td>\n",
       "      <td>20</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>68</td>\n",
       "      <td>Federal-gov</td>\n",
       "      <td>422013</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>3683</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>41</td>\n",
       "      <td>Private</td>\n",
       "      <td>70037</td>\n",
       "      <td>Some-college</td>\n",
       "      <td>10</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Craft-repair</td>\n",
       "      <td>Unmarried</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>3004</td>\n",
       "      <td>60</td>\n",
       "      <td>?</td>\n",
       "      <td>&gt;50K</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   age    workclass  fnlwgt     education  education.num marital.status  \\\n",
       "0   90            ?   77053       HS-grad              9        Widowed   \n",
       "1   82      Private  132870       HS-grad              9        Widowed   \n",
       "2   66            ?  186061  Some-college             10        Widowed   \n",
       "3   54      Private  140359       7th-8th              4       Divorced   \n",
       "4   41      Private  264663  Some-college             10      Separated   \n",
       "5   34      Private  216864       HS-grad              9       Divorced   \n",
       "6   38      Private  150601          10th              6      Separated   \n",
       "7   74    State-gov   88638     Doctorate             16  Never-married   \n",
       "8   68  Federal-gov  422013       HS-grad              9       Divorced   \n",
       "9   41      Private   70037  Some-college             10  Never-married   \n",
       "\n",
       "          occupation    relationship   race     sex  capital.gain  \\\n",
       "0                  ?   Not-in-family  White  Female             0   \n",
       "1    Exec-managerial   Not-in-family  White  Female             0   \n",
       "2                  ?       Unmarried  Black  Female             0   \n",
       "3  Machine-op-inspct       Unmarried  White  Female             0   \n",
       "4     Prof-specialty       Own-child  White  Female             0   \n",
       "5      Other-service       Unmarried  White  Female             0   \n",
       "6       Adm-clerical       Unmarried  White    Male             0   \n",
       "7     Prof-specialty  Other-relative  White  Female             0   \n",
       "8     Prof-specialty   Not-in-family  White  Female             0   \n",
       "9       Craft-repair       Unmarried  White    Male             0   \n",
       "\n",
       "   capital.loss  hours.per.week native.country income  \n",
       "0          4356              40  United-States  <=50K  \n",
       "1          4356              18  United-States  <=50K  \n",
       "2          4356              40  United-States  <=50K  \n",
       "3          3900              40  United-States  <=50K  \n",
       "4          3900              40  United-States  <=50K  \n",
       "5          3770              45  United-States  <=50K  \n",
       "6          3770              40  United-States  <=50K  \n",
       "7          3683              20  United-States   >50K  \n",
       "8          3683              40  United-States  <=50K  \n",
       "9          3004              60              ?   >50K  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age               0\n",
       "workclass         0\n",
       "fnlwgt            0\n",
       "education         0\n",
       "education.num     0\n",
       "marital.status    0\n",
       "occupation        0\n",
       "relationship      0\n",
       "race              0\n",
       "sex               0\n",
       "capital.gain      0\n",
       "capital.loss      0\n",
       "hours.per.week    0\n",
       "native.country    0\n",
       "income            0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for col in ['workclass', 'occupation', 'native.country']:\n",
    "#     df[col].fillna(df[col].mode()[0], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert target to binary\n",
    "le = preprocessing.LabelEncoder()\n",
    "df['income'] = le.fit_transform(df['income'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### For clean dataset creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']\n",
    "for feature in categorical:\n",
    "    le = preprocessing.LabelEncoder()\n",
    "    df[feature] = le.fit_transform(df[feature])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>workclass</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education</th>\n",
       "      <th>education.num</th>\n",
       "      <th>marital.status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>sex</th>\n",
       "      <th>capital.gain</th>\n",
       "      <th>capital.loss</th>\n",
       "      <th>hours.per.week</th>\n",
       "      <th>native.country</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>90</td>\n",
       "      <td>0</td>\n",
       "      <td>77053</td>\n",
       "      <td>11</td>\n",
       "      <td>9</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4356</td>\n",
       "      <td>40</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>82</td>\n",
       "      <td>4</td>\n",
       "      <td>132870</td>\n",
       "      <td>11</td>\n",
       "      <td>9</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4356</td>\n",
       "      <td>18</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>66</td>\n",
       "      <td>0</td>\n",
       "      <td>186061</td>\n",
       "      <td>15</td>\n",
       "      <td>10</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4356</td>\n",
       "      <td>40</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>54</td>\n",
       "      <td>4</td>\n",
       "      <td>140359</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3900</td>\n",
       "      <td>40</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>41</td>\n",
       "      <td>4</td>\n",
       "      <td>264663</td>\n",
       "      <td>15</td>\n",
       "      <td>10</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3900</td>\n",
       "      <td>40</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   age  workclass  fnlwgt  education  education.num  marital.status  \\\n",
       "0   90          0   77053         11              9               6   \n",
       "1   82          4  132870         11              9               6   \n",
       "2   66          0  186061         15             10               6   \n",
       "3   54          4  140359          5              4               0   \n",
       "4   41          4  264663         15             10               5   \n",
       "\n",
       "   occupation  relationship  race  sex  capital.gain  capital.loss  \\\n",
       "0           0             1     4    0             0          4356   \n",
       "1           4             1     4    0             0          4356   \n",
       "2           0             4     2    0             0          4356   \n",
       "3           7             4     4    0             0          3900   \n",
       "4          10             3     4    0             0          3900   \n",
       "\n",
       "   hours.per.week  native.country  income  \n",
       "0              40              39       0  \n",
       "1              18              39       0  \n",
       "2              40              39       0  \n",
       "3              40              39       0  \n",
       "4              40              39       0  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler = StandardScaler()\n",
    "# cols = pd.Series(['age', 'fnlwgt', 'capital.loss', 'hours.per.week']) # scale only continous\n",
    "cols = pd.Series(df.columns[df.columns != 'income'].tolist()) # scale all but target\n",
    "df[cols] = pd.DataFrame(scaler.fit_transform(df[cols]), columns = cols)\n",
    "# df = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>workclass</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education</th>\n",
       "      <th>education.num</th>\n",
       "      <th>marital.status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>sex</th>\n",
       "      <th>capital.gain</th>\n",
       "      <th>capital.loss</th>\n",
       "      <th>hours.per.week</th>\n",
       "      <th>native.country</th>\n",
       "      <th>income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3.769612</td>\n",
       "      <td>-2.65732</td>\n",
       "      <td>-1.067997</td>\n",
       "      <td>0.181332</td>\n",
       "      <td>-0.420060</td>\n",
       "      <td>2.249480</td>\n",
       "      <td>-1.554283</td>\n",
       "      <td>-0.277805</td>\n",
       "      <td>0.393668</td>\n",
       "      <td>-1.422331</td>\n",
       "      <td>-0.14592</td>\n",
       "      <td>10.593507</td>\n",
       "      <td>-0.035429</td>\n",
       "      <td>0.291569</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.183112</td>\n",
       "      <td>0.09005</td>\n",
       "      <td>-0.539169</td>\n",
       "      <td>0.181332</td>\n",
       "      <td>-0.420060</td>\n",
       "      <td>2.249480</td>\n",
       "      <td>-0.608387</td>\n",
       "      <td>-0.277805</td>\n",
       "      <td>0.393668</td>\n",
       "      <td>-1.422331</td>\n",
       "      <td>-0.14592</td>\n",
       "      <td>10.593507</td>\n",
       "      <td>-1.817204</td>\n",
       "      <td>0.291569</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2.010110</td>\n",
       "      <td>-2.65732</td>\n",
       "      <td>-0.035220</td>\n",
       "      <td>1.214869</td>\n",
       "      <td>-0.031360</td>\n",
       "      <td>2.249480</td>\n",
       "      <td>-1.554283</td>\n",
       "      <td>1.589322</td>\n",
       "      <td>-1.962621</td>\n",
       "      <td>-1.422331</td>\n",
       "      <td>-0.14592</td>\n",
       "      <td>10.593507</td>\n",
       "      <td>-0.035429</td>\n",
       "      <td>0.291569</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.130359</td>\n",
       "      <td>0.09005</td>\n",
       "      <td>-0.468215</td>\n",
       "      <td>-1.368974</td>\n",
       "      <td>-2.363558</td>\n",
       "      <td>-1.734058</td>\n",
       "      <td>0.101036</td>\n",
       "      <td>1.589322</td>\n",
       "      <td>0.393668</td>\n",
       "      <td>-1.422331</td>\n",
       "      <td>-0.14592</td>\n",
       "      <td>9.461864</td>\n",
       "      <td>-0.035429</td>\n",
       "      <td>0.291569</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.177296</td>\n",
       "      <td>0.09005</td>\n",
       "      <td>0.709482</td>\n",
       "      <td>1.214869</td>\n",
       "      <td>-0.031360</td>\n",
       "      <td>1.585557</td>\n",
       "      <td>0.810458</td>\n",
       "      <td>0.966947</td>\n",
       "      <td>0.393668</td>\n",
       "      <td>-1.422331</td>\n",
       "      <td>-0.14592</td>\n",
       "      <td>9.461864</td>\n",
       "      <td>-0.035429</td>\n",
       "      <td>0.291569</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        age  workclass    fnlwgt  education  education.num  marital.status  \\\n",
       "0  3.769612   -2.65732 -1.067997   0.181332      -0.420060        2.249480   \n",
       "1  3.183112    0.09005 -0.539169   0.181332      -0.420060        2.249480   \n",
       "2  2.010110   -2.65732 -0.035220   1.214869      -0.031360        2.249480   \n",
       "3  1.130359    0.09005 -0.468215  -1.368974      -2.363558       -1.734058   \n",
       "4  0.177296    0.09005  0.709482   1.214869      -0.031360        1.585557   \n",
       "\n",
       "   occupation  relationship      race       sex  capital.gain  capital.loss  \\\n",
       "0   -1.554283     -0.277805  0.393668 -1.422331      -0.14592     10.593507   \n",
       "1   -0.608387     -0.277805  0.393668 -1.422331      -0.14592     10.593507   \n",
       "2   -1.554283      1.589322 -1.962621 -1.422331      -0.14592     10.593507   \n",
       "3    0.101036      1.589322  0.393668 -1.422331      -0.14592      9.461864   \n",
       "4    0.810458      0.966947  0.393668 -1.422331      -0.14592      9.461864   \n",
       "\n",
       "   hours.per.week  native.country  income  \n",
       "0       -0.035429        0.291569       0  \n",
       "1       -1.817204        0.291569       0  \n",
       "2       -0.035429        0.291569       0  \n",
       "3       -0.035429        0.291569       0  \n",
       "4       -0.035429        0.291569       0  "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age               0\n",
       "workclass         0\n",
       "fnlwgt            0\n",
       "education         0\n",
       "education.num     0\n",
       "marital.status    0\n",
       "occupation        0\n",
       "relationship      0\n",
       "race              0\n",
       "sex               0\n",
       "capital.gain      0\n",
       "capital.loss      0\n",
       "hours.per.week    0\n",
       "native.country    0\n",
       "income            0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('datasets/clean/adult_scaled.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### For model training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df.drop(['income'], axis=1)\n",
    "y = df['income']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']\n",
    "for feature in categorical:\n",
    "    le = preprocessing.LabelEncoder()\n",
    "    X_train[feature] = le.fit_transform(X_train[feature])\n",
    "    X_test[feature] = le.transform(X_test[feature])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler = StandardScaler()\n",
    "X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)\n",
    "X_test = pd.DataFrame(scaler.transform(X_test), columns = X.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Logistic Regression accuracy score with all the features: 0.8204\n"
     ]
    }
   ],
   "source": [
    "logreg = LogisticRegression()\n",
    "logreg.fit(X_train, y_train)\n",
    "y_pred = logreg.predict(X_test)\n",
    "\n",
    "print('Logistic Regression accuracy score with all the features: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Home Price Prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('datasets/raw/house_price.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>bedrooms</th>\n",
       "      <th>bathrooms</th>\n",
       "      <th>sqft_living</th>\n",
       "      <th>sqft_lot</th>\n",
       "      <th>floors</th>\n",
       "      <th>waterfront</th>\n",
       "      <th>view</th>\n",
       "      <th>condition</th>\n",
       "      <th>sqft_above</th>\n",
       "      <th>sqft_basement</th>\n",
       "      <th>yr_built</th>\n",
       "      <th>yr_renovated</th>\n",
       "      <th>street</th>\n",
       "      <th>city</th>\n",
       "      <th>statezip</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2014-05-02 00:00:00</td>\n",
       "      <td>313000.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.50</td>\n",
       "      <td>1340</td>\n",
       "      <td>7912</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1340</td>\n",
       "      <td>0</td>\n",
       "      <td>1955</td>\n",
       "      <td>2005</td>\n",
       "      <td>18810 Densmore Ave N</td>\n",
       "      <td>Shoreline</td>\n",
       "      <td>WA 98133</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2014-05-02 00:00:00</td>\n",
       "      <td>2384000.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>2.50</td>\n",
       "      <td>3650</td>\n",
       "      <td>9050</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>3370</td>\n",
       "      <td>280</td>\n",
       "      <td>1921</td>\n",
       "      <td>0</td>\n",
       "      <td>709 W Blaine St</td>\n",
       "      <td>Seattle</td>\n",
       "      <td>WA 98119</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2014-05-02 00:00:00</td>\n",
       "      <td>342000.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.00</td>\n",
       "      <td>1930</td>\n",
       "      <td>11947</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1930</td>\n",
       "      <td>0</td>\n",
       "      <td>1966</td>\n",
       "      <td>0</td>\n",
       "      <td>26206-26214 143rd Ave SE</td>\n",
       "      <td>Kent</td>\n",
       "      <td>WA 98042</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2014-05-02 00:00:00</td>\n",
       "      <td>420000.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.25</td>\n",
       "      <td>2000</td>\n",
       "      <td>8030</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1000</td>\n",
       "      <td>1000</td>\n",
       "      <td>1963</td>\n",
       "      <td>0</td>\n",
       "      <td>857 170th Pl NE</td>\n",
       "      <td>Bellevue</td>\n",
       "      <td>WA 98008</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2014-05-02 00:00:00</td>\n",
       "      <td>550000.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2.50</td>\n",
       "      <td>1940</td>\n",
       "      <td>10500</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1140</td>\n",
       "      <td>800</td>\n",
       "      <td>1976</td>\n",
       "      <td>1992</td>\n",
       "      <td>9105 170th Ave NE</td>\n",
       "      <td>Redmond</td>\n",
       "      <td>WA 98052</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2014-05-02 00:00:00</td>\n",
       "      <td>490000.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>880</td>\n",
       "      <td>6380</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>880</td>\n",
       "      <td>0</td>\n",
       "      <td>1938</td>\n",
       "      <td>1994</td>\n",
       "      <td>522 NE 88th St</td>\n",
       "      <td>Seattle</td>\n",
       "      <td>WA 98115</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2014-05-02 00:00:00</td>\n",
       "      <td>335000.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.00</td>\n",
       "      <td>1350</td>\n",
       "      <td>2560</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1350</td>\n",
       "      <td>0</td>\n",
       "      <td>1976</td>\n",
       "      <td>0</td>\n",
       "      <td>2616 174th Ave NE</td>\n",
       "      <td>Redmond</td>\n",
       "      <td>WA 98052</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2014-05-02 00:00:00</td>\n",
       "      <td>482000.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2.50</td>\n",
       "      <td>2710</td>\n",
       "      <td>35868</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2710</td>\n",
       "      <td>0</td>\n",
       "      <td>1989</td>\n",
       "      <td>0</td>\n",
       "      <td>23762 SE 253rd Pl</td>\n",
       "      <td>Maple Valley</td>\n",
       "      <td>WA 98038</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2014-05-02 00:00:00</td>\n",
       "      <td>452500.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.50</td>\n",
       "      <td>2430</td>\n",
       "      <td>88426</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1570</td>\n",
       "      <td>860</td>\n",
       "      <td>1985</td>\n",
       "      <td>0</td>\n",
       "      <td>46611-46625 SE 129th St</td>\n",
       "      <td>North Bend</td>\n",
       "      <td>WA 98045</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2014-05-02 00:00:00</td>\n",
       "      <td>640000.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2.00</td>\n",
       "      <td>1520</td>\n",
       "      <td>6200</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1520</td>\n",
       "      <td>0</td>\n",
       "      <td>1945</td>\n",
       "      <td>2010</td>\n",
       "      <td>6811 55th Ave NE</td>\n",
       "      <td>Seattle</td>\n",
       "      <td>WA 98115</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  date      price  bedrooms  bathrooms  sqft_living  sqft_lot  \\\n",
       "0  2014-05-02 00:00:00   313000.0       3.0       1.50         1340      7912   \n",
       "1  2014-05-02 00:00:00  2384000.0       5.0       2.50         3650      9050   \n",
       "2  2014-05-02 00:00:00   342000.0       3.0       2.00         1930     11947   \n",
       "3  2014-05-02 00:00:00   420000.0       3.0       2.25         2000      8030   \n",
       "4  2014-05-02 00:00:00   550000.0       4.0       2.50         1940     10500   \n",
       "5  2014-05-02 00:00:00   490000.0       2.0       1.00          880      6380   \n",
       "6  2014-05-02 00:00:00   335000.0       2.0       2.00         1350      2560   \n",
       "7  2014-05-02 00:00:00   482000.0       4.0       2.50         2710     35868   \n",
       "8  2014-05-02 00:00:00   452500.0       3.0       2.50         2430     88426   \n",
       "9  2014-05-02 00:00:00   640000.0       4.0       2.00         1520      6200   \n",
       "\n",
       "   floors  waterfront  view  condition  sqft_above  sqft_basement  yr_built  \\\n",
       "0     1.5           0     0          3        1340              0      1955   \n",
       "1     2.0           0     4          5        3370            280      1921   \n",
       "2     1.0           0     0          4        1930              0      1966   \n",
       "3     1.0           0     0          4        1000           1000      1963   \n",
       "4     1.0           0     0          4        1140            800      1976   \n",
       "5     1.0           0     0          3         880              0      1938   \n",
       "6     1.0           0     0          3        1350              0      1976   \n",
       "7     2.0           0     0          3        2710              0      1989   \n",
       "8     1.0           0     0          4        1570            860      1985   \n",
       "9     1.5           0     0          3        1520              0      1945   \n",
       "\n",
       "   yr_renovated                    street          city  statezip country  \n",
       "0          2005      18810 Densmore Ave N     Shoreline  WA 98133     USA  \n",
       "1             0           709 W Blaine St       Seattle  WA 98119     USA  \n",
       "2             0  26206-26214 143rd Ave SE          Kent  WA 98042     USA  \n",
       "3             0           857 170th Pl NE      Bellevue  WA 98008     USA  \n",
       "4          1992         9105 170th Ave NE       Redmond  WA 98052     USA  \n",
       "5          1994            522 NE 88th St       Seattle  WA 98115     USA  \n",
       "6             0         2616 174th Ave NE       Redmond  WA 98052     USA  \n",
       "7             0         23762 SE 253rd Pl  Maple Valley  WA 98038     USA  \n",
       "8             0   46611-46625 SE 129th St    North Bend  WA 98045     USA  \n",
       "9          2010          6811 55th Ave NE       Seattle  WA 98115     USA  "
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "data['price']     = data['price'].astype('int64')\n",
    "data['bedrooms']  = data['bedrooms'].astype('int64')\n",
    "data['bathrooms'] = data['bathrooms'].astype('int64')\n",
    "data['floors']    = data['floors'].astype('int64')\n",
    "data['street']    = data['street'].astype('string')\n",
    "data['city']      = data['city'].astype('string')\n",
    "data['statezip']  = data['statezip'].astype('string')\n",
    "data['country']   = data['country'].astype('string')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data[data['price'] != 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.drop(['date', 'street', 'statezip', 'country','sqft_above'], axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert target to binary\n",
    "le = preprocessing.LabelEncoder()\n",
    "data['city'] = le.fit_transform(data['city'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>price</th>\n",
       "      <th>bedrooms</th>\n",
       "      <th>bathrooms</th>\n",
       "      <th>sqft_living</th>\n",
       "      <th>sqft_lot</th>\n",
       "      <th>floors</th>\n",
       "      <th>waterfront</th>\n",
       "      <th>view</th>\n",
       "      <th>condition</th>\n",
       "      <th>sqft_basement</th>\n",
       "      <th>yr_built</th>\n",
       "      <th>yr_renovated</th>\n",
       "      <th>city</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>313000</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1340</td>\n",
       "      <td>7912</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1955</td>\n",
       "      <td>2005</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2384000</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>3650</td>\n",
       "      <td>9050</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>280</td>\n",
       "      <td>1921</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>342000</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1930</td>\n",
       "      <td>11947</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1966</td>\n",
       "      <td>0</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>420000</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>2000</td>\n",
       "      <td>8030</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1000</td>\n",
       "      <td>1963</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>550000</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>1940</td>\n",
       "      <td>10500</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>800</td>\n",
       "      <td>1976</td>\n",
       "      <td>1992</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>490000</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>880</td>\n",
       "      <td>6380</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1938</td>\n",
       "      <td>1994</td>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>335000</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1350</td>\n",
       "      <td>2560</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1976</td>\n",
       "      <td>0</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>482000</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2710</td>\n",
       "      <td>35868</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1989</td>\n",
       "      <td>0</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>452500</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>2430</td>\n",
       "      <td>88426</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>860</td>\n",
       "      <td>1985</td>\n",
       "      <td>0</td>\n",
       "      <td>27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>640000</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>1520</td>\n",
       "      <td>6200</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1945</td>\n",
       "      <td>2010</td>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     price  bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  \\\n",
       "0   313000         3          1         1340      7912       1           0   \n",
       "1  2384000         5          2         3650      9050       2           0   \n",
       "2   342000         3          2         1930     11947       1           0   \n",
       "3   420000         3          2         2000      8030       1           0   \n",
       "4   550000         4          2         1940     10500       1           0   \n",
       "5   490000         2          1          880      6380       1           0   \n",
       "6   335000         2          2         1350      2560       1           0   \n",
       "7   482000         4          2         2710     35868       2           0   \n",
       "8   452500         3          2         2430     88426       1           0   \n",
       "9   640000         4          2         1520      6200       1           0   \n",
       "\n",
       "   view  condition  sqft_basement  yr_built  yr_renovated  city  \n",
       "0     0          3              0      1955          2005    36  \n",
       "1     4          5            280      1921             0    35  \n",
       "2     0          4              0      1966             0    18  \n",
       "3     0          4           1000      1963             0     3  \n",
       "4     0          4            800      1976          1992    31  \n",
       "5     0          3              0      1938          1994    35  \n",
       "6     0          3              0      1976             0    31  \n",
       "7     0          3              0      1989             0    21  \n",
       "8     0          4            860      1985             0    27  \n",
       "9     0          3              0      1945          2010    35  "
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataset normalization\n",
    "columns = data.columns\n",
    "scaler = preprocessing.MinMaxScaler(feature_range = (0, 1))\n",
    "data = pd.DataFrame(scaler.fit_transform(data), columns = columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### For clean dataset creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>price</th>\n",
       "      <th>bedrooms</th>\n",
       "      <th>bathrooms</th>\n",
       "      <th>sqft_living</th>\n",
       "      <th>sqft_lot</th>\n",
       "      <th>floors</th>\n",
       "      <th>waterfront</th>\n",
       "      <th>view</th>\n",
       "      <th>condition</th>\n",
       "      <th>sqft_basement</th>\n",
       "      <th>yr_built</th>\n",
       "      <th>yr_renovated</th>\n",
       "      <th>city</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.011481</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.073652</td>\n",
       "      <td>0.006775</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.482456</td>\n",
       "      <td>0.995531</td>\n",
       "      <td>0.837209</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.089391</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0.250</td>\n",
       "      <td>0.249051</td>\n",
       "      <td>0.007835</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.058091</td>\n",
       "      <td>0.184211</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.813953</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.012572</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.250</td>\n",
       "      <td>0.118451</td>\n",
       "      <td>0.010534</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.578947</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.418605</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.015507</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.250</td>\n",
       "      <td>0.123766</td>\n",
       "      <td>0.006885</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.207469</td>\n",
       "      <td>0.552632</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.069767</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.020397</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.250</td>\n",
       "      <td>0.119210</td>\n",
       "      <td>0.009186</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.165975</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.989076</td>\n",
       "      <td>0.720930</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      price  bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  \\\n",
       "0  0.011481  0.333333      0.125     0.073652  0.006775     0.0         0.0   \n",
       "1  0.089391  0.555556      0.250     0.249051  0.007835     0.5         0.0   \n",
       "2  0.012572  0.333333      0.250     0.118451  0.010534     0.0         0.0   \n",
       "3  0.015507  0.333333      0.250     0.123766  0.006885     0.0         0.0   \n",
       "4  0.020397  0.444444      0.250     0.119210  0.009186     0.0         0.0   \n",
       "\n",
       "   view  condition  sqft_basement  yr_built  yr_renovated      city  \n",
       "0   0.0       0.50       0.000000  0.482456      0.995531  0.837209  \n",
       "1   1.0       1.00       0.058091  0.184211      0.000000  0.813953  \n",
       "2   0.0       0.75       0.000000  0.578947      0.000000  0.418605  \n",
       "3   0.0       0.75       0.207469  0.552632      0.000000  0.069767  \n",
       "4   0.0       0.75       0.165975  0.666667      0.989076  0.720930  "
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_csv('datasets/clean/house_price.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### For model training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = data.drop(\"price\", axis=1)\n",
    "y = pd.DataFrame(data[\"price\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Root Mean Squared Error:  0.0087142072843099\n",
      "R2 Score:  0.5844905131307716\n"
     ]
    }
   ],
   "source": [
    "linreg = LinearRegression()\n",
    "linreg.fit(X_train, y_train)\n",
    "y_pred = linreg.predict(X_test)\n",
    "\n",
    "print(\"Root Mean Squared Error: \", np.sqrt(mean_squared_error(y_test, y_pred)))\n",
    "print(\"R2 Score: \", r2_score(y_test, y_pred))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
