{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preprocessing steps for the Melbourne Housing dataset\n",
    "\n",
    "> Additional experiment for the NeurIPS rebuttal. Last edit: August 2, 2022.\n",
    "\n",
    "-----\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import numpy as np \n",
    "import pandas as pd \n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from sklearn.tree import DecisionTreeRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the raw Melbourne Housing CSV\n",
    "raw_csv = 'FeatureImportanceGames/Data/Unprocessed/MelbourneHousing.csv'\n",
    "\n",
    "# Object columns to convert to categorical variables\n",
    "obj_cats = ['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname']\n",
    "\n",
    "# Numeric column that is converted to categorical as well\n",
    "num_cats = ['Postcode']\n",
    "\n",
    "# Columns removed from the frame once the casts are done\n",
    "unused_cols = ['Bedroom2', 'Address', 'SellerG', 'Postcode', 'Suburb', 'CouncilArea', 'Date']\n",
    "\n",
    "# Read, cast and drop in one chain; after the drop only Type, Method and\n",
    "# Regionname are left as categorical columns\n",
    "df = (\n",
    "    pd.read_csv(raw_csv)\n",
    "    .astype({colname: 'category' for colname in obj_cats + num_cats})\n",
    "    .drop(columns=unused_cols)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(8887, 14)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Replace each categorical column by its integer category code.\n",
    "# `.cat.codes` marks missing values with -1; those entries are turned back\n",
    "# into NaN so that the `dropna` below still removes the affected rows.\n",
    "for colname in ['Type', 'Method', 'Regionname']:\n",
    "    codes = df[colname].astype('category').cat.codes\n",
    "    df[colname] = codes.where(codes >= 0)\n",
    "\n",
    "# Normalize numerical values between 0 and 1.\n",
    "# NOTE(review): the scaler is fitted before the rows with missing data are\n",
    "# dropped, so each column's min/max is computed before those rows are removed.\n",
    "# 'Price' is scaled together with the other columns. Order kept as-is so the\n",
    "# produced values do not change.\n",
    "scaler = MinMaxScaler()\n",
    "num_cols = ['Rooms', 'Price', 'Distance', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',\n",
    "            'Lattitude', 'YearBuilt', 'Longtitude', 'Propertycount', 'Regionname', 'Type',\n",
    "            'Method'\n",
    "            ]\n",
    "df[num_cols] = scaler.fit_transform(df[num_cols])\n",
    "\n",
    "# Remove rows missing data\n",
    "df = df.dropna()\n",
    "\n",
    "# Confirm that observations missing data were removed\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DecisionTreeRegressor(random_state=1)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Feature matrix: every column of the preprocessed frame except 'Price'\n",
    "df_features = df.drop(columns='Price')\n",
    "\n",
    "# Regression target\n",
    "target = df['Price']\n",
    "\n",
    "# Decision tree with a fixed random_state so each run gives the same results.\n",
    "# `fit` returns the estimator itself, so construction and fitting are chained.\n",
    "melbourne_model = DecisionTreeRegressor(random_state=1).fit(df_features, target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the fitted tree. The context manager closes the file handle as soon\n",
    "# as the pickle has been written, so the handle is not leaked.\n",
    "filename = 'decision_tree_housing13.sav'\n",
    "with open(filename, 'wb') as model_file:\n",
    "    pickle.dump(melbourne_model, model_file)\n",
    "\n",
    "# Write the feature matrix to CSV without the index column\n",
    "df_features.to_csv('df_housing_feats13', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Rooms', 'Type', 'Method', 'Distance', 'Bathroom', 'Car', 'Landsize',\n",
       "       'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude', 'Longtitude',\n",
       "       'Regionname', 'Propertycount'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the feature columns.\n",
    "# NOTE(review): the saved output of this cell lists 'CouncilArea', although the\n",
    "# loading cell drops that column - the output looks stale; re-run to refresh.\n",
    "df_features.columns"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.7 ('rebuttal')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "8b4c61b0c0dfa50a12dd4948992e6e1a04659874706c6edcf781c314bdb1c462"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
