{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "XD4yvwFfk1Bh"
   },
   "source": [
    "# String Theory - Demonstration Notebook"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "AucWFCq_kz9d"
   },
   "source": [
    "This notebook will demonstrate material related to discussions in the paper \"String Theory: Parsed Categoric Encodings with Automunge\". Let's get right to it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 190
    },
    "colab_type": "code",
    "id": "7mqQsQfAlJpK",
    "outputId": "7afa9280-aa3b-42dd-9f4d-c421e1c23540"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: Automunge in /Users/nicholasteague/miniconda3/lib/python3.7/site-packages (3.98)\r\n",
      "Requirement already satisfied: scipy in /Users/nicholasteague/miniconda3/lib/python3.7/site-packages (from Automunge) (1.4.1)\r\n",
      "Requirement already satisfied: pandas in /Users/nicholasteague/miniconda3/lib/python3.7/site-packages (from Automunge) (1.0.1)\r\n",
      "Requirement already satisfied: scikit-learn in /Users/nicholasteague/miniconda3/lib/python3.7/site-packages (from Automunge) (0.22.1)\r\n",
      "Requirement already satisfied: numpy in /Users/nicholasteague/miniconda3/lib/python3.7/site-packages (from Automunge) (1.18.1)\r\n",
      "Requirement already satisfied: pytz>=2017.2 in /Users/nicholasteague/miniconda3/lib/python3.7/site-packages (from pandas->Automunge) (2019.3)\r\n",
      "Requirement already satisfied: python-dateutil>=2.6.1 in /Users/nicholasteague/miniconda3/lib/python3.7/site-packages (from pandas->Automunge) (2.8.1)\r\n",
      "Requirement already satisfied: joblib>=0.11 in /Users/nicholasteague/miniconda3/lib/python3.7/site-packages (from scikit-learn->Automunge) (0.14.1)\r\n",
      "Requirement already satisfied: six>=1.5 in /Users/nicholasteague/miniconda3/lib/python3.7/site-packages (from python-dateutil>=2.6.1->pandas->Automunge) (1.12.0)\r\n"
     ]
    }
   ],
   "source": [
    "\n",
    "#Automunge is available for pip install:\n",
    "!pip install Automunge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "pUUu-thSlSZe"
   },
   "outputs": [],
   "source": [
    "#or to upgrade (we currently roll out upgrades fairly frequently)\n",
    "#!pip install Automunge --upgrade"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "O8raPvdClZ6y"
   },
   "outputs": [],
   "source": [
    "#Once installed, run this in local session to initialize\n",
    "from Automunge import Automunger\n",
    "am = Automunger.AutoMunge()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "y2BOvuIFldp7"
   },
   "outputs": [],
   "source": [
    "#To demonstrate, we'll populate a simple dataframe consistent with the examples in the paper.\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "df_train = \\\n",
    "pd.DataFrame({'column1':['circle','circle','circle','square','square','triangle',1234,np.nan,np.nan], \\\n",
    "              'column2':['yes','yes','yes','yes','no','no','no',np.nan,np.nan], \\\n",
    "              'address':['1234 North Peterson St Orlando, FL 32714',\n",
    "                         '2345 South Anderson St Altamonte Springs, FL 32715',\n",
    "                         '3456 South Peterson St Maitland, FL 32789',\n",
    "                         '4567 North Peterson St Orlando, FL 32714',\n",
    "                         '5678 Avenue St Orlando, FL 32714',\n",
    "                         '6789 South Peterson St Maitland, FL 32789',\n",
    "                         '5858 North Other St Altamonte Springs, FL 32715',\n",
    "                         None,\n",
    "                         'Orlando, FL']})\n",
    "\n",
    "#for a test set we'll just copy the train set\n",
    "df_test = df_train.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "EjjebOtTnQPN"
   },
   "outputs": [],
   "source": [
    "#The returned postprocess_dict should be saved such as with pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "colab_type": "code",
    "id": "4UzHhetrnY1L",
    "outputId": "a7b6f520-d500-48af-feb5-2347894aa1c3"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Automunge processing\n",
      "\n",
      "evaluating column:  column1\n",
      "processing column:  column1\n",
      "    root category:  bnry\n",
      " returned columns:\n",
      "['column1_bnry']\n",
      "\n",
      "evaluating column:  column2\n",
      "processing column:  column2\n",
      "    root category:  bnry\n",
      " returned columns:\n",
      "['column2_bnry']\n",
      "\n",
      "evaluating column:  address\n",
      "processing column:  address\n",
      "    root category:  1010\n",
      " returned columns:\n",
      "['address_1010_0', 'address_1010_1', 'address_1010_2', 'address_1010_3']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  column1_bnry\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  column2_bnry\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_1\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_2\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_3\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "______\n",
      "\n",
      "versioning serial stamp:\n",
      "_3.98_177452598428_2020-06-03T20:44:03.623976\n",
      "\n",
      "Automunge returned column set: \n",
      "['column1_bnry', 'column2_bnry', 'address_1010_0', 'address_1010_1', 'address_1010_2', 'address_1010_3']\n",
      "\n",
      "_______________\n",
      "Automunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#I find it helps to just copy and paste the full range of parameters for reference\n",
    "\n",
    "#here are the defaults with only deviation of turning PCA off for this simple data set\n",
    "#(there is a heuristic for small data sets based on ratio #features/#rows)\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df_train, df_test = False, \\\n",
    "  labels_column = False, trainID_column = False, testID_column = False, \\\n",
    "  valpercent1=0.0, valpercent2 = 0.0, floatprecision = 32, shuffletrain = True, \\\n",
    "  TrainLabelFreqLevel = False, powertransform = False, binstransform = False, \\\n",
    "  MLinfill = False, infilliterate=1, randomseed = 42, eval_ratio = .5, \\\n",
    "  LabelSmoothing_train = False, LabelSmoothing_test = False, \\\n",
    "  LabelSmoothing_val = False, LSfit = False, \\\n",
    "  numbercategoryheuristic = 63, pandasoutput = False, NArw_marker = False, \\\n",
    "  featureselection = False, featurepct = 1.0, \\\n",
    "  featuremetric = 0.0, featuremethod = 'default', \\\n",
    "  Binary = False, PCAn_components = None, PCAexcl = [], excl_suffix = False, \\\n",
    "  ML_cmnd = {'MLinfill_type':'default', \\\n",
    "             'MLinfill_cmnd':{'RandomForestClassifier':{}, \\\n",
    "                              'RandomForestRegressor':{}}, \\\n",
    "             'PCA_type':'off', \\\n",
    "             'PCA_cmnd':{}}, \\\n",
    "  assigncat = {'nmbr':[], 'retn':[], 'mnmx':[], 'mean':[], 'MAD3':[], \\\n",
    "             'bins':[], 'bsor':[], 'pwr2':[], 'por2':[], 'bxcx':[], \\\n",
    "             'addd':[], 'sbtr':[], 'mltp':[], 'divd':[], \\\n",
    "             'log0':[], 'log1':[], 'sqrt':[], 'rais':[], 'absl':[], \\\n",
    "             'bnwd':[], 'bnwK':[], 'bnwM':[], 'bnwo':[], 'bnKo':[], 'bnMo':[], \\\n",
    "             'bnep':[], 'bne7':[], 'bne9':[], 'bneo':[], 'bn7o':[], 'bn9o':[], \\\n",
    "             'bkt1':[], 'bkt2':[], 'bkt3':[], 'bkt4':[], \\\n",
    "             'nbr2':[], 'nbr3':[], 'MADn':[], 'MAD2':[], 'tlbn':[], \\\n",
    "             'mnm2':[], 'mnm3':[], 'mnm4':[], 'mnm5':[], 'mnm6':[], \\\n",
    "             'mea2':[], 'mea3':[], 'bxc2':[], 'bxc3':[], 'bxc4':[], \\\n",
    "             'dxdt':[], 'd2dt':[], 'd3dt':[], 'dxd2':[], 'd2d2':[], 'd3d2':[], \\\n",
    "             'nmdx':[], 'nmd2':[], 'nmd3':[], 'mmdx':[], 'mmd2':[], 'mmd3':[], \\\n",
    "             'bnry':[], 'text':[], 'txt2':[], 'txt3':[], '1010':[], 'or10':[], \\\n",
    "             'ordl':[], 'ord2':[], 'ord3':[], 'ord4':[], 'om10':[], 'mmor':[], \\\n",
    "             'Utxt':[], 'Utx2':[], 'Utx3':[], 'Uor3':[], 'Uor6':[], 'U101':[], \\\n",
    "             'splt':[], 'spl2':[], 'spl3':[], 'spl4':[], 'spl5':[], 'lngt':[], \\\n",
    "             'spl7':[], 'spl8':[], 'spl9':[], 'sp10':[], 'srch':[], 'src2':[], \\\n",
    "             'nmrc':[], 'nmr2':[], 'nmr3':[], 'nmcm':[], 'nmc2':[], 'nmc3':[], \\\n",
    "             'nmr7':[], 'nmr8':[], 'nmr9':[], 'nmc7':[], 'nmc8':[], 'nmc9':[], \\\n",
    "             'ors2':[], 'ors5':[], 'ors6':[], 'ors7':[], \\\n",
    "             'or11':[], 'or12':[], 'or15':[], 'or17':[], 'or19':[], 'or20':[], \\\n",
    "             'date':[], 'dat2':[], 'dat6':[], 'wkdy':[], 'bshr':[], 'hldy':[], \\\n",
    "             'wkds':[], 'wkdo':[], 'mnts':[], 'mnto':[], \\\n",
    "             'yea2':[], 'mnt2':[], 'mnt6':[], 'day2':[], 'day5':[], \\\n",
    "             'hrs2':[], 'hrs4':[], 'min2':[], 'min4':[], 'scn2':[], \\\n",
    "             'excl':[], 'exc2':[], 'exc3':[], 'exc4':[], 'exc5':[], 'exc6':[], \\\n",
    "             'null':[], 'copy':[], 'shfl':[], 'eval':[], 'ptfm':[]}, \\\n",
    "  assigninfill = {'stdrdinfill':[], 'MLinfill':[], \\\n",
    "                  'zeroinfill':[], 'oneinfill':[], \\\n",
    "                  'adjinfill':[], 'meaninfill':[], 'medianinfill':[], \\\n",
    "                  'modeinfill':[], 'lcinfill':[]}, \\\n",
    "  assignparam = {'default_assignparam' : {'(category)' : {'(parameter)' : 42}}, \\\n",
    "                  '(category)' : {'(column)'   : {'(parameter)' : 42}}}, \\\n",
    "  transformdict = {}, processdict = {}, evalcat = False, \\\n",
    "  printstatus = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "colab_type": "code",
    "id": "Ut0QHZmnneSP",
    "outputId": "1ee03005-e7d7-4287-ee9e-3acb4d9b26db"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Postmunge processing\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  column1\n",
      "    root category:  bnry\n",
      "\n",
      " returned columns:\n",
      "['column1_bnry']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  column2\n",
      "    root category:  bnry\n",
      "\n",
      " returned columns:\n",
      "['column2_bnry']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  address\n",
      "    root category:  1010\n",
      "\n",
      " returned columns:\n",
      "['address_1010_0', 'address_1010_1', 'address_1010_2', 'address_1010_3']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  column1_bnry\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  column2_bnry\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_1\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_2\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_3\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "Postmunge returned column set: \n",
      "['column1_bnry', 'column2_bnry', 'address_1010_0', 'address_1010_1', 'address_1010_2', 'address_1010_3']\n",
      "\n",
      "_______________\n",
      "Postmunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#or for postmunge(.) with full range of parameters:\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, postreports_dict = \\\n",
    "am.postmunge(postprocess_dict, df_test, \\\n",
    "             testID_column = False, labelscolumn = False, \\\n",
    "             pandasoutput = False, printstatus = True, \\\n",
    "             TrainLabelFreqLevel = False, featureeval = False, driftreport = False, \\\n",
    "             LabelSmoothing = False, LSfit = False, \\\n",
    "             returnedsets = True, shuffletrain = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 328
    },
    "colab_type": "code",
    "id": "5vfHUFMu7BO4",
    "outputId": "2044b560-0d55-49e1-9f31-a4bf8f1de586"
   },
   "outputs": [],
   "source": [
    "#Where the function returns numpy arrays of the encoded data\n",
    "#we could alternatively return pandas dataframes by passing pandasoutput=True\n",
    "#which we'll apply for below examples\n",
    "#in addition to turning off the PCA heuristic noted above\n",
    "#and turning off default shuffling and printouts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "bOaVAFYw6mdt"
   },
   "source": [
    "# Demonstrations from Paper"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "XVF_8O7q62y1"
   },
   "source": [
    "## Categoric Encodings - Figure 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 328
    },
    "colab_type": "code",
    "id": "LDv-U2qv6Fle",
    "outputId": "289701e2-ad64-4f1b-bf5f-76bac81c543b"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>column1_1234</th>\n",
       "      <th>column1_circle</th>\n",
       "      <th>column1_square</th>\n",
       "      <th>column1_triangle</th>\n",
       "      <th>source_column</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>circle</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>circle</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>circle</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>square</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>square</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>triangle</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1234</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   column1_1234  column1_circle  column1_square  column1_triangle  \\\n",
       "0             0               1               0                 0   \n",
       "1             0               1               0                 0   \n",
       "2             0               1               0                 0   \n",
       "3             0               0               1                 0   \n",
       "4             0               0               1                 0   \n",
       "5             0               0               0                 1   \n",
       "6             1               0               0                 0   \n",
       "7             0               0               0                 0   \n",
       "8             0               0               0                 0   \n",
       "\n",
       "  source_column  \n",
       "0        circle  \n",
       "1        circle  \n",
       "2        circle  \n",
       "3        square  \n",
       "4        square  \n",
       "5      triangle  \n",
       "6          1234  \n",
       "7           NaN  \n",
       "8           NaN  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'text' (one-hot encoding)\n",
    "\n",
    "#we'll just apply to one column at a time for demonstrations\n",
    "df = pd.DataFrame(df_train['column1'].copy())\n",
    "#and a copy of that column for comparison\n",
    "df['source_column'] = df['column1'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'text':['column1'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 328
    },
    "colab_type": "code",
    "id": "-wJPI9cL_FAk",
    "outputId": "9ff9be44-bebc-45c7-9466-ae29dcaf162d"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>column1_1010_0</th>\n",
       "      <th>column1_1010_1</th>\n",
       "      <th>column1_1010_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>square</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>square</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>triangle</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1234</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  source_column  column1_1010_0  column1_1010_1  column1_1010_2\n",
       "0        circle               0               0               1\n",
       "1        circle               0               0               1\n",
       "2        circle               0               0               1\n",
       "3        square               0               1               0\n",
       "4        square               0               1               0\n",
       "5      triangle               0               1               1\n",
       "6          1234               0               0               0\n",
       "7           NaN               1               0               0\n",
       "8           NaN               1               0               0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'1010' (binary encoding)\n",
    "\n",
    "df = pd.DataFrame(df_train['column1'].copy())\n",
    "df['source_column'] = df['column1'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'1010':['column1'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 328
    },
    "colab_type": "code",
    "id": "1sWSxfnv_Y2e",
    "outputId": "33432117-dcd2-46ac-cd03-fef534611993"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>column1_ordl</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>circle</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>circle</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>circle</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>square</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>square</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>triangle</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1234</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  source_column  column1_ordl\n",
       "0        circle             1\n",
       "1        circle             1\n",
       "2        circle             1\n",
       "3        square             2\n",
       "4        square             2\n",
       "5      triangle             3\n",
       "6          1234             0\n",
       "7           NaN             4\n",
       "8           NaN             4"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'ordl' (ordinal alphabetical)\n",
    "\n",
    "df = pd.DataFrame(df_train['column1'].copy())\n",
    "df['source_column'] = df['column1'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'ordl':['column1'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>column1_ord3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>square</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>square</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>triangle</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1234</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  source_column  column1_ord3\n",
       "0        circle             0\n",
       "1        circle             0\n",
       "2        circle             0\n",
       "3        square             1\n",
       "4        square             1\n",
       "5      triangle             4\n",
       "6          1234             3\n",
       "7           NaN             2\n",
       "8           NaN             2"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'ord3' (ordinal by frequency)\n",
    "\n",
    "df = pd.DataFrame(df_train['column1'].copy())\n",
    "df['source_column'] = df['column1'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'ord3':['column1'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 328
    },
    "colab_type": "code",
    "id": "ZEbGtnAK_q9n",
    "outputId": "46677228-c02f-4975-ebee-809a22f7ab1d"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>column2_bnry</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>yes</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>yes</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>yes</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>yes</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>no</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>no</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>no</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  source_column  column2_bnry\n",
       "0           yes             1\n",
       "1           yes             1\n",
       "2           yes             1\n",
       "3           yes             1\n",
       "4            no             0\n",
       "5            no             0\n",
       "6            no             0\n",
       "7           NaN             1\n",
       "8           NaN             1"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'bnry' (boolean)\n",
    "\n",
    "\n",
    "df = pd.DataFrame(df_train['column2'].copy())\n",
    "df['source_column'] = df['column2'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'bnry':['column2'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "YZrF9ZJu_8xn"
   },
   "source": [
    "## String Parsing - Figure 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 464
    },
    "colab_type": "code",
    "id": "RMz54TBa_3ah",
    "outputId": "fa6264f0-2d11-44c5-d95a-e9b14581f4b1"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_splt_ South Peterson St Maitland, FL 32789</th>\n",
       "      <th>address_splt_ North Peterson St Orlando, FL 32714</th>\n",
       "      <th>address_splt_ St Altamonte Springs, FL 32715</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...   \n",
       "2          3456 South Peterson St Maitland, FL 32789   \n",
       "3           4567 North Peterson St Orlando, FL 32714   \n",
       "4                   5678 Avenue St Orlando, FL 32714   \n",
       "5          6789 South Peterson St Maitland, FL 32789   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715   \n",
       "7                                               None   \n",
       "8                                        Orlando, FL   \n",
       "\n",
       "   address_splt_ South Peterson St Maitland, FL 32789  \\\n",
       "0                                                  0    \n",
       "1                                                  0    \n",
       "2                                                  1    \n",
       "3                                                  0    \n",
       "4                                                  0    \n",
       "5                                                  1    \n",
       "6                                                  0    \n",
       "7                                                  0    \n",
       "8                                                  0    \n",
       "\n",
       "   address_splt_ North Peterson St Orlando, FL 32714  \\\n",
       "0                                                  1   \n",
       "1                                                  0   \n",
       "2                                                  0   \n",
       "3                                                  1   \n",
       "4                                                  0   \n",
       "5                                                  0   \n",
       "6                                                  0   \n",
       "7                                                  0   \n",
       "8                                                  0   \n",
       "\n",
       "   address_splt_ St Altamonte Springs, FL 32715  \n",
       "0                                             0  \n",
       "1                                             1  \n",
       "2                                             0  \n",
       "3                                             0  \n",
       "4                                             0  \n",
       "5                                             0  \n",
       "6                                             1  \n",
       "7                                             0  \n",
       "8                                             0  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'splt' (string overlap identification)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'splt':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_splt_Altamonte</th>\n",
       "      <th>address_splt_Peterson</th>\n",
       "      <th>address_splt_Maitland</th>\n",
       "      <th>address_splt_Orlando</th>\n",
       "      <th>address_splt_Springs</th>\n",
       "      <th>address_splt_North</th>\n",
       "      <th>address_splt_32714</th>\n",
       "      <th>address_splt_South</th>\n",
       "      <th>address_splt_32715</th>\n",
       "      <th>address_splt_32789</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_splt_Altamonte  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714                       0   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                       1   \n",
       "2          3456 South Peterson St Maitland, FL 32789                       0   \n",
       "3           4567 North Peterson St Orlando, FL 32714                       0   \n",
       "4                   5678 Avenue St Orlando, FL 32714                       0   \n",
       "5          6789 South Peterson St Maitland, FL 32789                       0   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715                       1   \n",
       "7                                               None                       0   \n",
       "8                                        Orlando, FL                       0   \n",
       "\n",
       "   address_splt_Peterson  address_splt_Maitland  address_splt_Orlando  \\\n",
       "0                      1                      0                     1   \n",
       "1                      0                      0                     0   \n",
       "2                      1                      1                     0   \n",
       "3                      1                      0                     1   \n",
       "4                      0                      0                     1   \n",
       "5                      1                      1                     0   \n",
       "6                      0                      0                     0   \n",
       "7                      0                      0                     0   \n",
       "8                      0                      0                     1   \n",
       "\n",
       "   address_splt_Springs  address_splt_North  address_splt_32714  \\\n",
       "0                     0                   1                   1   \n",
       "1                     1                   0                   0   \n",
       "2                     0                   0                   0   \n",
       "3                     0                   1                   1   \n",
       "4                     0                   0                   1   \n",
       "5                     0                   0                   0   \n",
       "6                     1                   1                   0   \n",
       "7                     0                   0                   0   \n",
       "8                     0                   0                   0   \n",
       "\n",
       "   address_splt_South  address_splt_32715  address_splt_32789  \n",
       "0                   0                   0                   0  \n",
       "1                   1                   1                   0  \n",
       "2                   1                   0                   1  \n",
       "3                   0                   0                   0  \n",
       "4                   0                   0                   0  \n",
       "5                   1                   0                   1  \n",
       "6                   0                   1                   0  \n",
       "7                   0                   0                   0  \n",
       "8                   0                   0                   0  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'splt' (string overlap identification)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "#the paper noted also that parameters may be passed to string parsing functions\n",
    "#to exclude spaces and special characters\n",
    "#such as to promote single word activations\n",
    "#here is a demonstration (was not shown in Figure 2)\n",
    "\n",
    "#Here we pass 'space_and_punctuation' parameter as False to exclude special characters from overlaps\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'splt':['address'], 'excl':['source_column']}, \\\n",
    "  assignparam = {'splt' : {'address' : {'space_and_punctuation':False}}}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "colab_type": "code",
    "id": "ux1LHMRVAsnd",
    "outputId": "4dfb7216-8ba0-4c53-fb5d-61b812258692"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_spl2_ord3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_spl2_ord3\n",
       "0           1234 North Peterson St Orlando, FL 32714                  0\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                  2\n",
       "2          3456 South Peterson St Maitland, FL 32789                  1\n",
       "3           4567 North Peterson St Orlando, FL 32714                  0\n",
       "4                   5678 Avenue St Orlando, FL 32714                  3\n",
       "5          6789 South Peterson St Maitland, FL 32789                  1\n",
       "6    5858 North Other St Altamonte Springs, FL 32715                  2\n",
       "7                                               None                  4\n",
       "8                                        Orlando, FL                  5"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'spl2' (string overlap ordinal)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'spl2':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "w6f88HyyA8Eo"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_spl5_ord3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_spl5_ord3\n",
       "0           1234 North Peterson St Orlando, FL 32714                  1\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                  3\n",
       "2          3456 South Peterson St Maitland, FL 32789                  2\n",
       "3           4567 North Peterson St Orlando, FL 32714                  1\n",
       "4                   5678 Avenue St Orlando, FL 32714                  0\n",
       "5          6789 South Peterson St Maitland, FL 32789                  2\n",
       "6    5858 North Other St Altamonte Springs, FL 32715                  3\n",
       "7                                               None                  0\n",
       "8                                        Orlando, FL                  0"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'spl5' (spl2 w/ excluded non-overlaps)\n",
    "\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'spl5':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_sp15_ South Peterson St Maitland, FL 32789</th>\n",
       "      <th>address_sp15_ North Peterson St Orlando, FL 32714</th>\n",
       "      <th>address_sp15_ St Altamonte Springs, FL 32715</th>\n",
       "      <th>address_sp15_ St Orlando, FL 32714</th>\n",
       "      <th>address_sp15_Orlando, FL</th>\n",
       "      <th>address_sp15_erson St</th>\n",
       "      <th>address_sp15_ South</th>\n",
       "      <th>address_sp15_ North</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...   \n",
       "2          3456 South Peterson St Maitland, FL 32789   \n",
       "3           4567 North Peterson St Orlando, FL 32714   \n",
       "4                   5678 Avenue St Orlando, FL 32714   \n",
       "5          6789 South Peterson St Maitland, FL 32789   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715   \n",
       "7                                               None   \n",
       "8                                        Orlando, FL   \n",
       "\n",
       "   address_sp15_ South Peterson St Maitland, FL 32789  \\\n",
       "0                                                  0    \n",
       "1                                                  0    \n",
       "2                                                  1    \n",
       "3                                                  0    \n",
       "4                                                  0    \n",
       "5                                                  1    \n",
       "6                                                  0    \n",
       "7                                                  0    \n",
       "8                                                  0    \n",
       "\n",
       "   address_sp15_ North Peterson St Orlando, FL 32714  \\\n",
       "0                                                  1   \n",
       "1                                                  0   \n",
       "2                                                  0   \n",
       "3                                                  1   \n",
       "4                                                  0   \n",
       "5                                                  0   \n",
       "6                                                  0   \n",
       "7                                                  0   \n",
       "8                                                  0   \n",
       "\n",
       "   address_sp15_ St Altamonte Springs, FL 32715  \\\n",
       "0                                             0   \n",
       "1                                             1   \n",
       "2                                             0   \n",
       "3                                             0   \n",
       "4                                             0   \n",
       "5                                             0   \n",
       "6                                             1   \n",
       "7                                             0   \n",
       "8                                             0   \n",
       "\n",
       "   address_sp15_ St Orlando, FL 32714  address_sp15_Orlando, FL  \\\n",
       "0                                   1                         1   \n",
       "1                                   0                         0   \n",
       "2                                   0                         0   \n",
       "3                                   1                         1   \n",
       "4                                   1                         1   \n",
       "5                                   0                         0   \n",
       "6                                   0                         0   \n",
       "7                                   0                         0   \n",
       "8                                   0                         1   \n",
       "\n",
       "   address_sp15_erson St   address_sp15_ South   address_sp15_ North   \n",
       "0                       1                     0                     1  \n",
       "1                       1                     1                     0  \n",
       "2                       1                     1                     0  \n",
       "3                       1                     0                     1  \n",
       "4                       0                     0                     0  \n",
       "5                       1                     1                     0  \n",
       "6                       0                     0                     1  \n",
       "7                       0                     0                     0  \n",
       "8                       0                     0                     0  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'sp15' (string overlap with allowed concurrent activations)\n",
    "\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'sp15':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parsing Unbounded Sets - Figure 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_nmcm</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>32714.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>32715.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>32789.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>32714.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>32714.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>32789.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>32715.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>32735.714844</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>32735.714844</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_nmcm\n",
       "0           1234 North Peterson St Orlando, FL 32714  32714.000000\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...  32715.000000\n",
       "2          3456 South Peterson St Maitland, FL 32789  32789.000000\n",
       "3           4567 North Peterson St Orlando, FL 32714  32714.000000\n",
       "4                   5678 Avenue St Orlando, FL 32714  32714.000000\n",
       "5          6789 South Peterson St Maitland, FL 32789  32789.000000\n",
       "6    5858 North Other St Altamonte Springs, FL 32715  32715.000000\n",
       "7                                               None  32735.714844\n",
       "8                                        Orlando, FL  32735.714844"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'nmcm' (string parse for number, commas ok)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'nmcm':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_nmcm_nmbr</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>-0.657041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.690181</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.690181</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>-0.657041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_nmcm_nmbr\n",
       "0           1234 North Peterson St Orlando, FL 32714          -0.688760\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...          -0.657041\n",
       "2          3456 South Peterson St Maitland, FL 32789           1.690181\n",
       "3           4567 North Peterson St Orlando, FL 32714          -0.688760\n",
       "4                   5678 Avenue St Orlando, FL 32714          -0.688760\n",
       "5          6789 South Peterson St Maitland, FL 32789           1.690181\n",
       "6    5858 North Other St Altamonte Springs, FL 32715          -0.657041\n",
       "7                                               None           0.000000\n",
       "8                                        Orlando, FL           0.000000"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'nmc2' (nmcm with z-score)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'nmc2':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_nmcm_mnmx</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>0.013333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0.013333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0.289524</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0.289524</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_nmcm_mnmx\n",
       "0           1234 North Peterson St Orlando, FL 32714           0.000000\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...           0.013333\n",
       "2          3456 South Peterson St Maitland, FL 32789           1.000000\n",
       "3           4567 North Peterson St Orlando, FL 32714           0.000000\n",
       "4                   5678 Avenue St Orlando, FL 32714           0.000000\n",
       "5          6789 South Peterson St Maitland, FL 32789           1.000000\n",
       "6    5858 North Other St Altamonte Springs, FL 32715           0.013333\n",
       "7                                               None           0.289524\n",
       "8                                        Orlando, FL           0.289524"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'nmc3' (nmcm with min-max)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'nmc3':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_strn_ord3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_strn_ord3\n",
       "0           1234 North Peterson St Orlando, FL 32714                  0\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                  4\n",
       "2          3456 South Peterson St Maitland, FL 32789                  1\n",
       "3           4567 North Peterson St Orlando, FL 32714                  0\n",
       "4                   5678 Avenue St Orlando, FL 32714                  2\n",
       "5          6789 South Peterson St Maitland, FL 32789                  1\n",
       "6    5858 North Other St Altamonte Springs, FL 32715                  3\n",
       "7                                               None                  5\n",
       "8                                        Orlando, FL                  6"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'strn' (string extraction of non-numeric characters)\n",
    "\n",
    "#this was noted in paper and not shown in figure\n",
    "#strn is similar to nmcm but extracts longest length non-numeric character set\n",
    "\n",
    "#by default it is followed with an ord3 ordinal for numeric encoding\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'strn':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_srch_Maitland</th>\n",
       "      <th>address_srch_Orlando</th>\n",
       "      <th>address_srch_Altamonte Springs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_srch_Maitland  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714                      0   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                      0   \n",
       "2          3456 South Peterson St Maitland, FL 32789                      1   \n",
       "3           4567 North Peterson St Orlando, FL 32714                      0   \n",
       "4                   5678 Avenue St Orlando, FL 32714                      0   \n",
       "5          6789 South Peterson St Maitland, FL 32789                      1   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715                      0   \n",
       "7                                               None                      0   \n",
       "8                                        Orlando, FL                      0   \n",
       "\n",
       "   address_srch_Orlando  address_srch_Altamonte Springs  \n",
       "0                     1                               0  \n",
       "1                     0                               1  \n",
       "2                     0                               0  \n",
       "3                     1                               0  \n",
       "4                     1                               0  \n",
       "5                     0                               0  \n",
       "6                     0                               1  \n",
       "7                     0                               0  \n",
       "8                     1                               0  "
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'srch' (categoric string search)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "#note that the srch transform accepts parameter 'search'\n",
    "#as a list of search terms\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'srch':['address'], 'excl':['source_column']}, \\\n",
    "  assignparam = {'srch' : {'address'   : {'search' : ['Maitland', 'Orlando', 'Altamonte Springs']}}}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_srch_Orlando</th>\n",
       "      <th>address_srch_Altamonte Springs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_srch_Orlando  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714                     1   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                     0   \n",
       "2          3456 South Peterson St Maitland, FL 32789                     1   \n",
       "3           4567 North Peterson St Orlando, FL 32714                     1   \n",
       "4                   5678 Avenue St Orlando, FL 32714                     1   \n",
       "5          6789 South Peterson St Maitland, FL 32789                     1   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715                     0   \n",
       "7                                               None                     0   \n",
       "8                                        Orlando, FL                     1   \n",
       "\n",
       "   address_srch_Altamonte Springs  \n",
       "0                               0  \n",
       "1                               1  \n",
       "2                               0  \n",
       "3                               0  \n",
       "4                               0  \n",
       "5                               0  \n",
       "6                               1  \n",
       "7                               0  \n",
       "8                               0  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'srch' (categoric string search)\n",
    "\n",
    "#the paper noted potential to aggregate search terms into a common activation\n",
    "#this is achieved by passing parameter search with embedded lists of terms to be aggregated\n",
    "\n",
    "#Here we'll demonstrate by aggregating ['Maitland', 'Orlando'] into common activation\n",
    "#in context of pass search parameter as [['Maitland', 'Orlando'], 'Altamonte Springs']\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'srch':['address'], 'excl':['source_column']}, \\\n",
    "  assignparam = {'srch' : {'address'   : {'search' : [['Maitland', 'Orlando'], 'Altamonte Springs']}}}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_src4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_src4\n",
       "0           1234 North Peterson St Orlando, FL 32714             2\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...             3\n",
       "2          3456 South Peterson St Maitland, FL 32789             1\n",
       "3           4567 North Peterson St Orlando, FL 32714             2\n",
       "4                   5678 Avenue St Orlando, FL 32714             2\n",
       "5          6789 South Peterson St Maitland, FL 32789             1\n",
       "6    5858 North Other St Altamonte Springs, FL 32715             3\n",
       "7                                               None             0\n",
       "8                                        Orlando, FL             2"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'src4' (categoric string search, ordinal encoding)\n",
    "\n",
    "#the convention is that for cases of multiple activations to same row\n",
    "#entries toward end of search parameter list take precendence\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "#note that the src4 transform accepts parameter 'search'\n",
    "#as a list of search terms\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'src4':['address'], 'excl':['source_column']}, \\\n",
    "  assignparam = {'src4' : {'address'   : {'search' : ['Maitland', 'Orlando', 'Altamonte Springs']}}}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Family Tree Aggregations - Figure 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_UPCS_nmc7_nmbr</th>\n",
       "      <th>address_UPCS_1010_0</th>\n",
       "      <th>address_UPCS_1010_1</th>\n",
       "      <th>address_UPCS_1010_2</th>\n",
       "      <th>address_UPCS_1010_3</th>\n",
       "      <th>address_UPCS_spl9_ord3</th>\n",
       "      <th>address_UPCS_spl9_sp10_ord3</th>\n",
       "      <th>address_NArw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>-0.657041</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.690181</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.690181</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>-0.657041</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_UPCS_nmc7_nmbr  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714               -0.688760   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...               -0.657041   \n",
       "2          3456 South Peterson St Maitland, FL 32789                1.690181   \n",
       "3           4567 North Peterson St Orlando, FL 32714               -0.688760   \n",
       "4                   5678 Avenue St Orlando, FL 32714               -0.688760   \n",
       "5          6789 South Peterson St Maitland, FL 32789                1.690181   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715               -0.657041   \n",
       "7                                               None                0.000000   \n",
       "8                                        Orlando, FL                0.000000   \n",
       "\n",
       "   address_UPCS_1010_0  address_UPCS_1010_1  address_UPCS_1010_2  \\\n",
       "0                    0                    0                    0   \n",
       "1                    0                    0                    0   \n",
       "2                    0                    0                    1   \n",
       "3                    0                    0                    1   \n",
       "4                    0                    1                    0   \n",
       "5                    0                    1                    1   \n",
       "6                    0                    1                    0   \n",
       "7                    0                    1                    1   \n",
       "8                    1                    0                    0   \n",
       "\n",
       "   address_UPCS_1010_3  address_UPCS_spl9_ord3  address_UPCS_spl9_sp10_ord3  \\\n",
       "0                    0                       0                            1   \n",
       "1                    1                       2                            0   \n",
       "2                    0                       1                            2   \n",
       "3                    1                       0                            1   \n",
       "4                    0                       3                            1   \n",
       "5                    0                       1                            2   \n",
       "6                    1                       2                            0   \n",
       "7                    1                       4                            0   \n",
       "8                    0                       5                            0   \n",
       "\n",
       "   address_NArw  \n",
       "0             0  \n",
       "1             0  \n",
       "2             0  \n",
       "3             0  \n",
       "4             0  \n",
       "5             0  \n",
       "6             0  \n",
       "7             1  \n",
       "8             0  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'or19' (a family tree of transformations)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "#here we'll also activate the NArw_marker parameter\n",
    "#to be consistent with the demonstrations in paper\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  NArw_marker = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = {'or19':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>address_UPCS_nmc7_nmbr</th>\n",
       "      <th>address_UPCS_1010_0</th>\n",
       "      <th>address_UPCS_1010_1</th>\n",
       "      <th>address_UPCS_1010_2</th>\n",
       "      <th>address_UPCS_1010_3</th>\n",
       "      <th>address_UPCS_spl9_ord3</th>\n",
       "      <th>address_UPCS_spl9_sp10_ord3</th>\n",
       "      <th>address_NArw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   address_UPCS_nmc7_nmbr  address_UPCS_1010_0  address_UPCS_1010_1  \\\n",
       "0                    True                 True                 True   \n",
       "1                    True                 True                 True   \n",
       "2                    True                 True                 True   \n",
       "3                    True                 True                 True   \n",
       "4                    True                 True                 True   \n",
       "5                    True                 True                 True   \n",
       "6                    True                 True                 True   \n",
       "7                    True                 True                 True   \n",
       "8                    True                 True                 True   \n",
       "\n",
       "   address_UPCS_1010_2  address_UPCS_1010_3  address_UPCS_spl9_ord3  \\\n",
       "0                 True                 True                    True   \n",
       "1                 True                 True                    True   \n",
       "2                 True                 True                    True   \n",
       "3                 True                 True                    True   \n",
       "4                 True                 True                    True   \n",
       "5                 True                 True                    True   \n",
       "6                 True                 True                    True   \n",
       "7                 True                 True                    True   \n",
       "8                 True                 True                    True   \n",
       "\n",
       "   address_UPCS_spl9_sp10_ord3  address_NArw  \n",
       "0                         True          True  \n",
       "1                         True          True  \n",
       "2                         True          True  \n",
       "3                         True          True  \n",
       "4                         True          True  \n",
       "5                         True          True  \n",
       "6                         True          True  \n",
       "7                         True          True  \n",
       "8                         True          True  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#here we'll validate or19 by running same data through postmunge(.)\n",
    "\n",
    "#the comparison between the returned sets demonstrates consistency of transforms\n",
    "#these type of comparisons between functions applied to different kinds of data\n",
    "#are one of the ways we validate\n",
    "\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, postreports_dict = \\\n",
    "am.postmunge(postprocess_dict, df, \\\n",
    "             pandasoutput = True, printstatus = False)\n",
    "\n",
    "test[['address_UPCS_nmc7_nmbr',\n",
    " 'address_UPCS_1010_0',\n",
    " 'address_UPCS_1010_1',\n",
    " 'address_UPCS_1010_2',\n",
    " 'address_UPCS_1010_3',\n",
    " 'address_UPCS_spl9_ord3',\n",
    " 'address_UPCS_spl9_sp10_ord3',\n",
    " 'address_NArw']] \\\n",
    "== train[['address_UPCS_nmc7_nmbr',\n",
    " 'address_UPCS_1010_0',\n",
    " 'address_UPCS_1010_1',\n",
    " 'address_UPCS_1010_2',\n",
    " 'address_UPCS_1010_3',\n",
    " 'address_UPCS_spl9_ord3',\n",
    " 'address_UPCS_spl9_sp10_ord3',\n",
    " 'address_NArw']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0             1234 NORTH PETERSON ST ORLANDO, FL 32714\n",
       "1    2345 SOUTH ANDERSON ST ALTAMONTE SPRINGS, FL 3...\n",
       "2            3456 SOUTH PETERSON ST MAITLAND, FL 32789\n",
       "3             4567 NORTH PETERSON ST ORLANDO, FL 32714\n",
       "4                     5678 AVENUE ST ORLANDO, FL 32714\n",
       "5            6789 SOUTH PETERSON ST MAITLAND, FL 32789\n",
       "6      5858 NORTH OTHER ST ALTAMONTE SPRINGS, FL 32715\n",
       "7                                                 NONE\n",
       "8                                          ORLANDO, FL\n",
       "Name: address, dtype: object"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Note that if we want to recover the original form of data from the returned sets\n",
    "#we can run an inversion operation with the postmunge(.) function\n",
    "\n",
    "#although some of the transformations, such as UPCS, may not have full info recovery \n",
    "#(such as here with case configuration)\n",
    "\n",
    "df_invert, recovered_list, inversion_info_dict = \\\n",
    "am.postmunge(postprocess_dict, test, inversion='test', \\\n",
    "             pandasoutput=True, printstatus=False)\n",
    "\n",
    "df_invert['address']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Thanks!"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "draft String Theory demo.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
