{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following demonstrations correspond to Section 4 of the paper."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Code Demonstration"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Jupyter notebook install and imports are as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: Automunge in /Users/nicholasteague/opt/anaconda3/envs/mlinfill/lib/python3.8/site-packages (6.3)\n",
      "Requirement already satisfied: scipy in /Users/nicholasteague/opt/anaconda3/envs/mlinfill/lib/python3.8/site-packages (from Automunge) (1.5.2)\n",
      "Requirement already satisfied: scikit-learn in /Users/nicholasteague/opt/anaconda3/envs/mlinfill/lib/python3.8/site-packages (from Automunge) (0.23.2)\n",
      "Requirement already satisfied: numpy in /Users/nicholasteague/opt/anaconda3/envs/mlinfill/lib/python3.8/site-packages (from Automunge) (1.19.2)\n",
      "Requirement already satisfied: pandas in /Users/nicholasteague/opt/anaconda3/envs/mlinfill/lib/python3.8/site-packages (from Automunge) (1.1.3)\n",
      "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/nicholasteague/opt/anaconda3/envs/mlinfill/lib/python3.8/site-packages (from scikit-learn->Automunge) (2.1.0)\n",
      "Requirement already satisfied: joblib>=0.11 in /Users/nicholasteague/opt/anaconda3/envs/mlinfill/lib/python3.8/site-packages (from scikit-learn->Automunge) (0.17.0)\n",
      "Requirement already satisfied: pytz>=2017.2 in /Users/nicholasteague/opt/anaconda3/envs/mlinfill/lib/python3.8/site-packages (from pandas->Automunge) (2020.1)\n",
      "Requirement already satisfied: python-dateutil>=2.7.3 in /Users/nicholasteague/opt/anaconda3/envs/mlinfill/lib/python3.8/site-packages (from pandas->Automunge) (2.8.1)\n",
      "Requirement already satisfied: six>=1.5 in /Users/nicholasteague/opt/anaconda3/envs/mlinfill/lib/python3.8/site-packages (from python-dateutil>=2.7.3->pandas->Automunge) (1.15.0)\n"
     ]
    }
   ],
   "source": [
    "# Use the %pip magic so the install targets the running kernel's environment;\n",
    "# pinned to the release this notebook was last executed with\n",
    "%pip install Automunge==6.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import only the class this notebook uses rather than a wildcard import\n",
    "from Automunge import AutoMunge\n",
    "\n",
    "# Instantiate once; its automunge(.) method is called further down\n",
    "am = AutoMunge()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The automunge(.) function accepts as input a Pandas dataframe or tabular Numpy array of training data, and optionally also corresponding test data. If any of the sets includes a label column, that header should be designated; the same applies to any index header or list of headers to exclude from the ML infill basis. For Numpy arrays, headers are the integer column indices, and labels should be positioned as the final column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For simplicity, this notebook is based on the Titanic data set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Read the train and test splits from CSV files in the working directory\n",
    "df_train, df_test = map(pd.read_csv, ('train.csv', 'test.csv'))\n",
    "\n",
    "# Headers of the label column and the row-ID column, passed to automunge(.)\n",
    "# below; to adapt this notebook to another data set, substitute that data\n",
    "# set's own label header and ID header here\n",
    "labels_column, trainID_column = 'Survived', 'PassengerId'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the raw training frame\n",
    "df_train.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "These data sets can be passed to automunge(.) to automatically encode and impute. The function returns 10 sets (9 dataframes and 1 dictionary), some of which may be empty depending on parameter settings; we suggest the following optional naming convention. The final set, the \"postprocess_dict\", is the key for consistently preparing additional data in postmunge(.). Note that if a validation set is desired, it can be partitioned from df_train with valpercent and prepared on the train set basis. Shuffling is on by default for train data and off by default for test data; the associated parameter is shown for reference. Here we demonstrate the assigncat parameter by assigning the root category of a transformation set to a target column, which overrides the default transform applied under automation. We also demonstrate the assigninfill parameter by assigning an alternate infill convention to a column. ML infill and NArw column aggregation are on by default; their associated activation parameters are shown for reference. Note that if the data is already numerically encoded and the user just desires infill, they can pass the parameter powertransform = 'infill'."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Automunge processing\n",
      "\n",
      "evaluating column:  Pclass\n",
      "processing column:  Pclass\n",
      "    root category:  1010\n",
      " returned columns:\n",
      "['Pclass_NArw', 'Pclass_1010_0', 'Pclass_1010_1']\n",
      "\n",
      "evaluating column:  Name\n",
      "processing column:  Name\n",
      "    root category:  hash\n",
      " returned columns:\n",
      "['Name_NArw', 'Name_hash_0', 'Name_hash_1', 'Name_hash_2', 'Name_hash_3', 'Name_hash_4', 'Name_hash_5', 'Name_hash_6', 'Name_hash_7', 'Name_hash_8', 'Name_hash_9', 'Name_hash_10', 'Name_hash_11', 'Name_hash_12', 'Name_hash_13']\n",
      "\n",
      "evaluating column:  Sex\n",
      "processing column:  Sex\n",
      "    root category:  bnry\n",
      " returned columns:\n",
      "['Sex_bnry', 'Sex_NArw']\n",
      "\n",
      "evaluating column:  Age\n",
      "processing column:  Age\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['Age_nmbr', 'Age_NArw']\n",
      "\n",
      "evaluating column:  SibSp\n",
      "processing column:  SibSp\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['SibSp_nmbr', 'SibSp_NArw']\n",
      "\n",
      "evaluating column:  Parch\n",
      "processing column:  Parch\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['Parch_nmbr', 'Parch_NArw']\n",
      "\n",
      "evaluating column:  Ticket\n",
      "processing column:  Ticket\n",
      "    root category:  or23\n",
      " returned columns:\n",
      "['Ticket_UPCS_ord3', 'Ticket_NArw', 'Ticket_UPCS_nmcm', 'Ticket_UPCS_sp19_0', 'Ticket_UPCS_sp19_1', 'Ticket_UPCS_sp19_2', 'Ticket_UPCS_sp19_3', 'Ticket_UPCS_sp19_4', 'Ticket_UPCS_sp19_5', 'Ticket_UPCS_sp19_6']\n",
      "\n",
      "evaluating column:  Fare\n",
      "processing column:  Fare\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['Fare_nmbr', 'Fare_NArw']\n",
      "\n",
      "evaluating column:  Cabin\n",
      "processing column:  Cabin\n",
      "    root category:  1010\n",
      " returned columns:\n",
      "['Cabin_NArw', 'Cabin_1010_0', 'Cabin_1010_1', 'Cabin_1010_2', 'Cabin_1010_3', 'Cabin_1010_4', 'Cabin_1010_5', 'Cabin_1010_6', 'Cabin_1010_7']\n",
      "\n",
      "evaluating column:  Embarked\n",
      "processing column:  Embarked\n",
      "    root category:  1010\n",
      " returned columns:\n",
      "['Embarked_NArw', 'Embarked_1010_0', 'Embarked_1010_1']\n",
      "\n",
      "______\n",
      "\n",
      "evaluating label column:  Survived\n",
      "processing label column:  Survived\n",
      "    root label category:  lbor\n",
      "\n",
      " returned columns:\n",
      "['Survived_ordl']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  Sex_bnry\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Age_nmbr\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  SibSp_nmbr\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Parch_nmbr\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_ord3\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Fare_nmbr\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Pclass_1010_0\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Pclass_1010_1\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_nmcm\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_0\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_1\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_2\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_3\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_4\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_5\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_6\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Cabin_1010_0\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_1\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_2\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_3\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_4\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_5\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_6\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_7\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Embarked_1010_0\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Embarked_1010_1\n",
      "     infill type: MLinfill\n",
      "\n",
      "_______________\n",
      "Begin Validation set processing with Postmunge\n",
      "\n",
      "_______________\n",
      "Begin Postmunge processing\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Pclass\n",
      "    root category:  1010\n",
      "\n",
      " returned columns:\n",
      "['Pclass_NArw', 'Pclass_1010_0', 'Pclass_1010_1']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Name\n",
      "    root category:  hash\n",
      "\n",
      " returned columns:\n",
      "['Name_NArw', 'Name_hash_0', 'Name_hash_1', 'Name_hash_2', 'Name_hash_3', 'Name_hash_4', 'Name_hash_5', 'Name_hash_6', 'Name_hash_7', 'Name_hash_8', 'Name_hash_9', 'Name_hash_10', 'Name_hash_11', 'Name_hash_12', 'Name_hash_13']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Sex\n",
      "    root category:  bnry\n",
      "\n",
      " returned columns:\n",
      "['Sex_bnry', 'Sex_NArw']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Age\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['Age_nmbr', 'Age_NArw']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  SibSp\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['SibSp_nmbr', 'SibSp_NArw']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Parch\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['Parch_nmbr', 'Parch_NArw']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Ticket\n",
      "    root category:  or23\n",
      "\n",
      " returned columns:\n",
      "['Ticket_UPCS_ord3', 'Ticket_NArw', 'Ticket_UPCS_nmcm', 'Ticket_UPCS_sp19_0', 'Ticket_UPCS_sp19_1', 'Ticket_UPCS_sp19_2', 'Ticket_UPCS_sp19_3', 'Ticket_UPCS_sp19_4', 'Ticket_UPCS_sp19_5', 'Ticket_UPCS_sp19_6']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Fare\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['Fare_nmbr', 'Fare_NArw']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Cabin\n",
      "    root category:  1010\n",
      "\n",
      " returned columns:\n",
      "['Cabin_NArw', 'Cabin_1010_0', 'Cabin_1010_1', 'Cabin_1010_2', 'Cabin_1010_3', 'Cabin_1010_4', 'Cabin_1010_5', 'Cabin_1010_6', 'Cabin_1010_7']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Embarked\n",
      "    root category:  1010\n",
      "\n",
      " returned columns:\n",
      "['Embarked_NArw', 'Embarked_1010_0', 'Embarked_1010_1']\n",
      "\n",
      "______\n",
      "\n",
      "processing label column:  Survived\n",
      "    root label category:  lbor\n",
      "\n",
      " returned columns:\n",
      "['Survived_ordl']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  Sex_bnry\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Age_nmbr\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  SibSp_nmbr\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Parch_nmbr\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_ord3\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Fare_nmbr\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Pclass_1010_0\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Pclass_1010_1\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_nmcm\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_0\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_1\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_2\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_3\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_4\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_5\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_6\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Cabin_1010_0\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_1\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_2\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_3\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_4\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_5\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_6\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_7\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Embarked_1010_0\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Embarked_1010_1\n",
      "     infill type: MLinfill\n",
      "\n",
      "_______________\n",
      "Postmunge returned ID column set: \n",
      "['Automunge_index']\n",
      "\n",
      "Postmunge returned test column set: \n",
      "['Sex_bnry', 'Age_nmbr', 'SibSp_nmbr', 'Parch_nmbr', 'Ticket_UPCS_ord3', 'Fare_nmbr', 'Pclass_NArw', 'Pclass_1010_0', 'Pclass_1010_1', 'Name_NArw', 'Name_hash_0', 'Name_hash_1', 'Name_hash_2', 'Name_hash_3', 'Name_hash_4', 'Name_hash_5', 'Name_hash_6', 'Name_hash_7', 'Name_hash_8', 'Name_hash_9', 'Name_hash_10', 'Name_hash_11', 'Name_hash_12', 'Name_hash_13', 'Sex_NArw', 'Age_NArw', 'SibSp_NArw', 'Parch_NArw', 'Ticket_NArw', 'Ticket_UPCS_nmcm', 'Ticket_UPCS_sp19_0', 'Ticket_UPCS_sp19_1', 'Ticket_UPCS_sp19_2', 'Ticket_UPCS_sp19_3', 'Ticket_UPCS_sp19_4', 'Ticket_UPCS_sp19_5', 'Ticket_UPCS_sp19_6', 'Fare_NArw', 'Cabin_NArw', 'Cabin_1010_0', 'Cabin_1010_1', 'Cabin_1010_2', 'Cabin_1010_3', 'Cabin_1010_4', 'Cabin_1010_5', 'Cabin_1010_6', 'Cabin_1010_7', 'Embarked_NArw', 'Embarked_1010_0', 'Embarked_1010_1']\n",
      "\n",
      "Postmunge returned label column set: \n",
      "['Survived_ordl']\n",
      "\n",
      "_______________\n",
      "Postmunge Complete\n",
      "\n",
      "______\n",
      "\n",
      "versioning serial stamp:\n",
      "_6.03_243506360979_2021-05-23T22:13:12.116856\n",
      "\n",
      "Automunge returned ID column set: \n",
      "['PassengerId', 'Automunge_index']\n",
      "\n",
      "Automunge returned train column set: \n",
      "['Sex_bnry', 'Age_nmbr', 'SibSp_nmbr', 'Parch_nmbr', 'Ticket_UPCS_ord3', 'Fare_nmbr', 'Pclass_NArw', 'Pclass_1010_0', 'Pclass_1010_1', 'Name_NArw', 'Name_hash_0', 'Name_hash_1', 'Name_hash_2', 'Name_hash_3', 'Name_hash_4', 'Name_hash_5', 'Name_hash_6', 'Name_hash_7', 'Name_hash_8', 'Name_hash_9', 'Name_hash_10', 'Name_hash_11', 'Name_hash_12', 'Name_hash_13', 'Sex_NArw', 'Age_NArw', 'SibSp_NArw', 'Parch_NArw', 'Ticket_NArw', 'Ticket_UPCS_nmcm', 'Ticket_UPCS_sp19_0', 'Ticket_UPCS_sp19_1', 'Ticket_UPCS_sp19_2', 'Ticket_UPCS_sp19_3', 'Ticket_UPCS_sp19_4', 'Ticket_UPCS_sp19_5', 'Ticket_UPCS_sp19_6', 'Fare_NArw', 'Cabin_NArw', 'Cabin_1010_0', 'Cabin_1010_1', 'Cabin_1010_2', 'Cabin_1010_3', 'Cabin_1010_4', 'Cabin_1010_5', 'Cabin_1010_6', 'Cabin_1010_7', 'Embarked_NArw', 'Embarked_1010_0', 'Embarked_1010_1']\n",
      "\n",
      "Automunge returned label column set: \n",
      "['Survived_ordl']\n",
      "\n",
      "_______________\n",
      "Automunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Columns singled out for non-default treatment in the call below\n",
    "parsed_categoric_target_column = 'Ticket'\n",
    "infill_target_column = 'Cabin'\n",
    "\n",
    "# Unpack the ten returned sets using the suggested naming convention\n",
    "(train, train_ID, labels,\n",
    " val, val_ID, val_labels,\n",
    " test, test_ID, test_labels,\n",
    " postprocess_dict) = am.automunge(\n",
    "    df_train,\n",
    "    df_test=df_test,\n",
    "    labels_column=labels_column,\n",
    "    trainID_column=trainID_column,\n",
    "    valpercent=0.2,      # fraction of df_train partitioned off as validation data\n",
    "    shuffletrain=True,   # passed explicitly for reference\n",
    "    assigncat={'or23': [parsed_categoric_target_column]},\n",
    "    assigninfill={'modeinfill': [infill_target_column]},\n",
    "    MLinfill=True,       # passed explicitly for reference\n",
    "    NArw_marker=True,    # passed explicitly for reference\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A list of columns returned from some particular input feature can be accessed with postprocess_dict['column_map']['\\<input_feature_header\\>']. A report classifying the returned column types (such as continuous, boolean, ordinal, onehot, binary, etc.) and their groupings can be accessed with postprocess_dict['columntype_report']."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Ticket_UPCS_ord3</th>\n",
       "      <th>Ticket_NArw</th>\n",
       "      <th>Ticket_UPCS_nmcm</th>\n",
       "      <th>Ticket_UPCS_sp19_0</th>\n",
       "      <th>Ticket_UPCS_sp19_1</th>\n",
       "      <th>Ticket_UPCS_sp19_2</th>\n",
       "      <th>Ticket_UPCS_sp19_3</th>\n",
       "      <th>Ticket_UPCS_sp19_4</th>\n",
       "      <th>Ticket_UPCS_sp19_5</th>\n",
       "      <th>Ticket_UPCS_sp19_6</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>176</th>\n",
       "      <td>536</td>\n",
       "      <td>0</td>\n",
       "      <td>3101312.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>492</th>\n",
       "      <td>26</td>\n",
       "      <td>0</td>\n",
       "      <td>17757.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>390</th>\n",
       "      <td>362</td>\n",
       "      <td>0</td>\n",
       "      <td>349247.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>613</th>\n",
       "      <td>537</td>\n",
       "      <td>0</td>\n",
       "      <td>392078.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>580</th>\n",
       "      <td>468</td>\n",
       "      <td>0</td>\n",
       "      <td>2466.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     Ticket_UPCS_ord3  Ticket_NArw  Ticket_UPCS_nmcm  Ticket_UPCS_sp19_0  \\\n",
       "176               536            0         3101312.0                   1   \n",
       "492                26            0           17757.0                   1   \n",
       "390               362            0          349247.0                   0   \n",
       "613               537            0          392078.0                   1   \n",
       "580               468            0            2466.0                   0   \n",
       "\n",
       "     Ticket_UPCS_sp19_1  Ticket_UPCS_sp19_2  Ticket_UPCS_sp19_3  \\\n",
       "176                   1                   1                   1   \n",
       "492                   0                   1                   1   \n",
       "390                   1                   1                   0   \n",
       "613                   1                   1                   0   \n",
       "580                   1                   0                   0   \n",
       "\n",
       "     Ticket_UPCS_sp19_4  Ticket_UPCS_sp19_5  Ticket_UPCS_sp19_6  \n",
       "176                   0                   1                   1  \n",
       "492                   1                   0                   0  \n",
       "390                   0                   1                   0  \n",
       "613                   1                   0                   0  \n",
       "580                   1                   1                   1  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Example: preview the returned columns that were\n",
    "# derived from a single input feature\n",
    "input_feature_header = 'Ticket'\n",
    "\n",
    "train.loc[:, postprocess_dict['column_map'][input_feature_header]].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'continuous': ['Age_nmbr',\n",
       "  'SibSp_nmbr',\n",
       "  'Parch_nmbr',\n",
       "  'Fare_nmbr',\n",
       "  'Ticket_UPCS_nmcm'],\n",
       " 'boolean': ['Sex_bnry',\n",
       "  'Pclass_NArw',\n",
       "  'Name_NArw',\n",
       "  'Sex_NArw',\n",
       "  'Age_NArw',\n",
       "  'SibSp_NArw',\n",
       "  'Parch_NArw',\n",
       "  'Ticket_NArw',\n",
       "  'Fare_NArw',\n",
       "  'Cabin_NArw',\n",
       "  'Embarked_NArw'],\n",
       " 'ordinal': ['Ticket_UPCS_ord3',\n",
       "  'Name_hash_0',\n",
       "  'Name_hash_1',\n",
       "  'Name_hash_2',\n",
       "  'Name_hash_3',\n",
       "  'Name_hash_4',\n",
       "  'Name_hash_5',\n",
       "  'Name_hash_6',\n",
       "  'Name_hash_7',\n",
       "  'Name_hash_8',\n",
       "  'Name_hash_9',\n",
       "  'Name_hash_10',\n",
       "  'Name_hash_11',\n",
       "  'Name_hash_12',\n",
       "  'Name_hash_13'],\n",
       " 'onehot': [],\n",
       " 'onehot_sets': [],\n",
       " 'binary': ['Pclass_1010_0',\n",
       "  'Pclass_1010_1',\n",
       "  'Ticket_UPCS_sp19_0',\n",
       "  'Ticket_UPCS_sp19_1',\n",
       "  'Ticket_UPCS_sp19_2',\n",
       "  'Ticket_UPCS_sp19_3',\n",
       "  'Ticket_UPCS_sp19_4',\n",
       "  'Ticket_UPCS_sp19_5',\n",
       "  'Ticket_UPCS_sp19_6',\n",
       "  'Cabin_1010_0',\n",
       "  'Cabin_1010_1',\n",
       "  'Cabin_1010_2',\n",
       "  'Cabin_1010_3',\n",
       "  'Cabin_1010_4',\n",
       "  'Cabin_1010_5',\n",
       "  'Cabin_1010_6',\n",
       "  'Cabin_1010_7',\n",
       "  'Embarked_1010_0',\n",
       "  'Embarked_1010_1'],\n",
       " 'binary_sets': [['Pclass_1010_0', 'Pclass_1010_1'],\n",
       "  ['Ticket_UPCS_sp19_0',\n",
       "   'Ticket_UPCS_sp19_1',\n",
       "   'Ticket_UPCS_sp19_2',\n",
       "   'Ticket_UPCS_sp19_3',\n",
       "   'Ticket_UPCS_sp19_4',\n",
       "   'Ticket_UPCS_sp19_5',\n",
       "   'Ticket_UPCS_sp19_6'],\n",
       "  ['Cabin_1010_0',\n",
       "   'Cabin_1010_1',\n",
       "   'Cabin_1010_2',\n",
       "   'Cabin_1010_3',\n",
       "   'Cabin_1010_4',\n",
       "   'Cabin_1010_5',\n",
       "   'Cabin_1010_6',\n",
       "   'Cabin_1010_7'],\n",
       "  ['Embarked_1010_0', 'Embarked_1010_1']],\n",
       " 'passthrough': []}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# And here is what the column type report looks like\n",
    "\n",
    "postprocess_dict['columntype_report']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If the returned train set will be used to train a model that may go into production, the postprocess_dict should be saved externally, for example with the pickle library.\n",
    "\n",
    "We can then use postmunge(.) to prepare additional data on a basis consistent with the train set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Postmunge processing\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Pclass\n",
      "    root category:  1010\n",
      "\n",
      " returned columns:\n",
      "['Pclass_NArw', 'Pclass_1010_0', 'Pclass_1010_1']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Name\n",
      "    root category:  hash\n",
      "\n",
      " returned columns:\n",
      "['Name_NArw', 'Name_hash_0', 'Name_hash_1', 'Name_hash_2', 'Name_hash_3', 'Name_hash_4', 'Name_hash_5', 'Name_hash_6', 'Name_hash_7', 'Name_hash_8', 'Name_hash_9', 'Name_hash_10', 'Name_hash_11', 'Name_hash_12', 'Name_hash_13']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Sex\n",
      "    root category:  bnry\n",
      "\n",
      " returned columns:\n",
      "['Sex_bnry', 'Sex_NArw']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Age\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['Age_nmbr', 'Age_NArw']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  SibSp\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['SibSp_nmbr', 'SibSp_NArw']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Parch\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['Parch_nmbr', 'Parch_NArw']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Ticket\n",
      "    root category:  or23\n",
      "\n",
      " returned columns:\n",
      "['Ticket_UPCS_ord3', 'Ticket_NArw', 'Ticket_UPCS_nmcm', 'Ticket_UPCS_sp19_0', 'Ticket_UPCS_sp19_1', 'Ticket_UPCS_sp19_2', 'Ticket_UPCS_sp19_3', 'Ticket_UPCS_sp19_4', 'Ticket_UPCS_sp19_5', 'Ticket_UPCS_sp19_6']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Fare\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['Fare_nmbr', 'Fare_NArw']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Cabin\n",
      "    root category:  1010\n",
      "\n",
      " returned columns:\n",
      "['Cabin_NArw', 'Cabin_1010_0', 'Cabin_1010_1', 'Cabin_1010_2', 'Cabin_1010_3', 'Cabin_1010_4', 'Cabin_1010_5', 'Cabin_1010_6', 'Cabin_1010_7']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  Embarked\n",
      "    root category:  1010\n",
      "\n",
      " returned columns:\n",
      "['Embarked_NArw', 'Embarked_1010_0', 'Embarked_1010_1']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  Sex_bnry\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Age_nmbr\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  SibSp_nmbr\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Parch_nmbr\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_ord3\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Fare_nmbr\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Pclass_1010_0\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Pclass_1010_1\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_nmcm\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_0\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_1\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_2\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_3\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_4\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_5\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Ticket_UPCS_sp19_6\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Cabin_1010_0\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_1\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_2\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_3\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_4\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_5\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_6\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Cabin_1010_7\n",
      "     infill type: modeinfill\n",
      "\n",
      "infill to column:  Embarked_1010_0\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  Embarked_1010_1\n",
      "     infill type: MLinfill\n",
      "\n",
      "_______________\n",
      "Postmunge returned ID column set: \n",
      "['PassengerId', 'Automunge_index']\n",
      "\n",
      "Postmunge returned test column set: \n",
      "['Sex_bnry', 'Age_nmbr', 'SibSp_nmbr', 'Parch_nmbr', 'Ticket_UPCS_ord3', 'Fare_nmbr', 'Pclass_NArw', 'Pclass_1010_0', 'Pclass_1010_1', 'Name_NArw', 'Name_hash_0', 'Name_hash_1', 'Name_hash_2', 'Name_hash_3', 'Name_hash_4', 'Name_hash_5', 'Name_hash_6', 'Name_hash_7', 'Name_hash_8', 'Name_hash_9', 'Name_hash_10', 'Name_hash_11', 'Name_hash_12', 'Name_hash_13', 'Sex_NArw', 'Age_NArw', 'SibSp_NArw', 'Parch_NArw', 'Ticket_NArw', 'Ticket_UPCS_nmcm', 'Ticket_UPCS_sp19_0', 'Ticket_UPCS_sp19_1', 'Ticket_UPCS_sp19_2', 'Ticket_UPCS_sp19_3', 'Ticket_UPCS_sp19_4', 'Ticket_UPCS_sp19_5', 'Ticket_UPCS_sp19_6', 'Fare_NArw', 'Cabin_NArw', 'Cabin_1010_0', 'Cabin_1010_1', 'Cabin_1010_2', 'Cabin_1010_3', 'Cabin_1010_4', 'Cabin_1010_5', 'Cabin_1010_6', 'Cabin_1010_7', 'Embarked_NArw', 'Embarked_1010_0', 'Embarked_1010_1']\n",
      "\n",
      "_______________\n",
      "Postmunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Prepare the test data with postmunge, driven by the populated postprocess_dict.\n",
    "(test, test_ID, test_labels, postreports_dict) = am.postmunge(\n",
    "    postprocess_dict, df_test\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
