{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "XD4yvwFfk1Bh"
   },
   "source": [
    "# String Theory - Demonstration Notebook"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "AucWFCq_kz9d"
   },
   "source": [
    "This notebook will demonstrate material related to discussions in the paper \"String Theory: Parsed Categoric Encodings with Automunge\". Let's get right to it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 190
    },
    "colab_type": "code",
    "id": "7mqQsQfAlJpK",
    "outputId": "7afa9280-aa3b-42dd-9f4d-c421e1c23540"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: Automunge in /opt/anaconda3/lib/python3.8/site-packages (4.87)\n",
      "Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.8/site-packages (from Automunge) (1.5.0)\n",
      "Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.8/site-packages (from Automunge) (1.18.5)\n",
      "Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.8/site-packages (from Automunge) (0.23.1)\n",
      "Requirement already satisfied: pandas in /opt/anaconda3/lib/python3.8/site-packages (from Automunge) (1.0.5)\n",
      "Requirement already satisfied: joblib>=0.11 in /opt/anaconda3/lib/python3.8/site-packages (from scikit-learn->Automunge) (0.16.0)\n",
      "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from scikit-learn->Automunge) (2.1.0)\n",
      "Requirement already satisfied: pytz>=2017.2 in /opt/anaconda3/lib/python3.8/site-packages (from pandas->Automunge) (2020.1)\n",
      "Requirement already satisfied: python-dateutil>=2.6.1 in /opt/anaconda3/lib/python3.8/site-packages (from pandas->Automunge) (2.8.1)\n",
      "Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.8/site-packages (from python-dateutil>=2.6.1->pandas->Automunge) (1.15.0)\n"
     ]
    }
   ],
   "source": [
    "#Automunge is available for pip install.\n",
    "#The outputs saved in this notebook were generated with Automunge 4.87\n",
    "#(see the versioning serial stamp printed by automunge(.) below), so that\n",
    "#version is pinned here to keep a fresh re-run consistent with them.\n",
    "#%pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install Automunge==4.87"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "pUUu-thSlSZe"
   },
   "outputs": [],
   "source": [
    "#or to upgrade (we currently roll out upgrades fairly frequently)\n",
    "#!pip install Automunge --upgrade"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "O8raPvdClZ6y"
   },
   "outputs": [],
   "source": [
    "#Once installed, import the library and instantiate the AutoMunge class in the local session.\n",
    "#The resulting 'am' object is what the automunge(.) and postmunge(.) calls below are made on.\n",
    "from Automunge import Automunger\n",
    "am = Automunger.AutoMunge()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "y2BOvuIFldp7"
   },
   "outputs": [],
   "source": [
    "#To demonstrate, we'll populate a simple dataframe consistent with the examples in the paper.\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "df_train = \\\n",
    "pd.DataFrame({'column1':['circle','circle','circle','square','square','triangle',1234,np.nan,np.nan], \\\n",
    "              'column2':['yes','yes','yes','yes','no','no','no',np.nan,np.nan], \\\n",
    "              'address':['1234 North Peterson St Orlando, FL 32714',\n",
    "                         '2345 South Anderson St Altamonte Springs, FL 32715',\n",
    "                         '3456 South Peterson St Maitland, FL 32789',\n",
    "                         '4567 North Peterson St Orlando, FL 32714',\n",
    "                         '5678 Avenue St Orlando, FL 32714',\n",
    "                         '6789 South Peterson St Maitland, FL 32789',\n",
    "                         '5858 North Other St Altamonte Springs, FL 32715',\n",
    "                         None,\n",
    "                         'Orlando, FL']})\n",
    "\n",
    "#for a test set we'll just copy the train set\n",
    "df_test = df_train.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "EjjebOtTnQPN"
   },
   "outputs": [],
   "source": [
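    "#The postprocess_dict returned by automunge(.) should be saved, such as with pickle,\n",
    "#since it is the first argument passed to postmunge(.) when encoding additional data"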
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "colab_type": "code",
    "id": "4UzHhetrnY1L",
    "outputId": "a7b6f520-d500-48af-feb5-2347894aa1c3"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Automunge processing\n",
      "\n",
      "evaluating column:  column1\n",
      "processing column:  column1\n",
      "    root category:  1010\n",
      " returned columns:\n",
      "['column1_1010_0', 'column1_1010_1', 'column1_1010_2']\n",
      "\n",
      "evaluating column:  column2\n",
      "processing column:  column2\n",
      "    root category:  bnry\n",
      " returned columns:\n",
      "['column2_bnry']\n",
      "\n",
      "evaluating column:  address\n",
      "processing column:  address\n",
      "    root category:  1010\n",
      " returned columns:\n",
      "['address_1010_0', 'address_1010_1', 'address_1010_2', 'address_1010_3']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  column1_1010_0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  column1_1010_1\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  column1_1010_2\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  column2_bnry\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_1\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_2\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_3\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "______\n",
      "\n",
      "versioning serial stamp:\n",
      "_4.87_198806890858_2020-10-01T09:37:03.761691\n",
      "\n",
      "Automunge returned ID column set: \n",
      "['Automunge_index_198806890858']\n",
      "\n",
      "Automunge returned train column set: \n",
      "['column1_1010_0', 'column1_1010_1', 'column1_1010_2', 'column2_bnry', 'address_1010_0', 'address_1010_1', 'address_1010_2', 'address_1010_3']\n",
      "\n",
      "_______________\n",
      "Automunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#For reference, this call spells out the full range of automunge(.) parameters\n",
    "#(I find it helps to just copy and paste the full list as a starting point).\n",
    "\n",
    "#These are the defaults, with the only deviation being that PCA is turned off\n",
    "#for this simple data set via 'PCA_type':'off' in ML_cmnd\n",
    "#(there is a heuristic for small data sets based on the ratio #features/#rows).\n",
    "\n",
    "#Notes on this call:\n",
    "# - df_test = False, so no test set is passed here; the df_test copy is instead\n",
    "#   encoded with postmunge(.) in the next cell using the returned postprocess_dict\n",
    "# - every assigncat list is left empty, so no column is manually assigned a transform\n",
    "#   (the printout reports the root category applied to each column)\n",
    "# - the '(category)' / '(column)' / '(parameter)' entries in assignparam appear to be\n",
    "#   template placeholders rather than real assignments - TODO confirm against the README\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df_train, df_test = False, \\\n",
    "  labels_column = False, trainID_column = False, testID_column = False, \\\n",
    "  valpercent1=0.0, valpercent2 = 0.0, floatprecision = 32, shuffletrain = True, \\\n",
    "  TrainLabelFreqLevel = False, powertransform = False, binstransform = False, \\\n",
    "  MLinfill = False, infilliterate=1, randomseed = 42, eval_ratio = .5, \\\n",
    "  LabelSmoothing_train = False, LabelSmoothing_test = False, \\\n",
    "  LabelSmoothing_val = False, LSfit = False, \\\n",
    "  numbercategoryheuristic = 63, pandasoutput = False, NArw_marker = False, \\\n",
    "  featureselection = False, featurepct = 1.0, \\\n",
    "  featuremetric = 0.0, featuremethod = 'default', \\\n",
    "  Binary = False, PCAn_components = False, PCAexcl = [], excl_suffix = False, \\\n",
    "  ML_cmnd = {'MLinfill_type':'default', \\\n",
    "             'MLinfill_cmnd':{'RandomForestClassifier':{}, \\\n",
    "                              'RandomForestRegressor':{}}, \\\n",
    "             'PCA_type':'off', \\\n",
    "             'PCA_cmnd':{}}, \\\n",
    "  assigncat = {\n",
    "  'nmbr':[], 'retn':[], 'mnmx':[], 'mean':[], 'MAD3':[], 'lgnm':[],\n",
    "  'bins':[], 'bsor':[], 'pwrs':[], 'pwr2':[], 'por2':[], 'bxcx':[],\n",
    "  'addd':[], 'sbtr':[], 'mltp':[], 'divd':[],\n",
    "  'log0':[], 'log1':[], 'logn':[], 'sqrt':[], 'rais':[], 'absl':[],\n",
    "  'bnwd':[], 'bnwK':[], 'bnwM':[], 'bnwo':[], 'bnKo':[], 'bnMo':[],\n",
    "  'bnep':[], 'bne7':[], 'bne9':[], 'bneo':[], 'bn7o':[], 'bn9o':[],\n",
    "  'bkt1':[], 'bkt2':[], 'bkt3':[], 'bkt4':[],\n",
    "  'nbr2':[], 'nbr3':[], 'MADn':[], 'MAD2':[], 'tlbn':[],\n",
    "  'mnm2':[], 'mnm3':[], 'mnm4':[], 'mnm5':[], 'mnm6':[],\n",
    "  'ntgr':[], 'ntg2':[], 'ntg3':[], 'mea2':[], 'mea3':[], 'bxc2':[],\n",
    "  'dxdt':[], 'd2dt':[], 'd3dt':[], 'dxd2':[], 'd2d2':[], 'd3d2':[],\n",
    "  'nmdx':[], 'nmd2':[], 'nmd3':[], 'mmdx':[], 'mmd2':[], 'mmd3':[],\n",
    "  'shft':[], 'shf2':[], 'shf3':[], 'shf4':[], 'shf7':[], 'shf8':[],\n",
    "  'bnry':[], 'onht':[], 'text':[], 'txt2':[], '1010':[], 'or10':[],\n",
    "  'ordl':[], 'ord2':[], 'ord3':[], 'ord4':[], 'om10':[], 'mmor':[],\n",
    "  'Unht':[], 'Utxt':[], 'Utx2':[], 'Uor3':[], 'Uor6':[], 'U101':[],\n",
    "  'splt':[], 'spl2':[], 'spl5':[], 'sp15':[], 'sp19':[], 'sbst':[],\n",
    "  'spl8':[], 'spl9':[], 'sp10':[], 'sp16':[], 'sp20':[], 'sbs2':[],\n",
    "  'srch':[], 'src2':[], 'src4':[], 'strn':[], 'lngt':[], 'aggt':[],\n",
    "  'nmrc':[], 'nmr2':[], 'nmcm':[], 'nmc2':[], 'nmEU':[], 'nmE2':[],\n",
    "  'nmr7':[], 'nmr8':[], 'nmc7':[], 'nmc8':[], 'nmE7':[], 'nmE8':[],\n",
    "  'ors2':[], 'ors5':[], 'ors6':[], 'ors7':[], 'ucct':[], 'Ucct':[],\n",
    "  'or15':[], 'or17':[], 'or19':[], 'or20':[], 'or21':[], 'or22':[],\n",
    "  'date':[], 'dat2':[], 'dat6':[], 'wkdy':[], 'bshr':[], 'hldy':[],\n",
    "  'wkds':[], 'wkdo':[], 'mnts':[], 'mnto':[],\n",
    "  'yea2':[], 'mnt2':[], 'mnt6':[], 'day2':[], 'day5':[],\n",
    "  'hrs2':[], 'hrs4':[], 'min2':[], 'min4':[], 'scn2':[], 'DPrt':[],\n",
    "  'DPnb':[], 'DPmm':[], 'DPbn':[], 'DPod':[], 'DP10':[], 'DPoh':[],\n",
    "  'excl':[], 'exc2':[], 'exc3':[], 'exc4':[], 'exc5':[], 'exc6':[],\n",
    "  'null':[], 'copy':[], 'shfl':[], 'eval':[], 'ptfm':[]}, \\\n",
    "  assigninfill = {'stdrdinfill':[], 'MLinfill':[], \\\n",
    "                  'zeroinfill':[], 'oneinfill':[], \\\n",
    "                  'adjinfill':[], 'meaninfill':[], 'medianinfill':[], \\\n",
    "                  'modeinfill':[], 'lcinfill':[]}, \\\n",
    "  assignparam = {'default_assignparam' : {'(category)' : {'(parameter)' : 42}}, \\\n",
    "                  '(category)' : {'(column)'   : {'(parameter)' : 42}}}, \\\n",
    "  transformdict = {}, processdict = {}, evalcat = False, \\\n",
    "  printstatus = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "colab_type": "code",
    "id": "Ut0QHZmnneSP",
    "outputId": "1ee03005-e7d7-4287-ee9e-3acb4d9b26db"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Postmunge processing\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  column1\n",
      "    root category:  1010\n",
      "\n",
      " returned columns:\n",
      "['column1_1010_0', 'column1_1010_1', 'column1_1010_2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  column2\n",
      "    root category:  bnry\n",
      "\n",
      " returned columns:\n",
      "['column2_bnry']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  address\n",
      "    root category:  1010\n",
      "\n",
      " returned columns:\n",
      "['address_1010_0', 'address_1010_1', 'address_1010_2', 'address_1010_3']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  column1_1010_0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  column1_1010_1\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  column1_1010_2\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  column2_bnry\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_1\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_2\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  address_1010_3\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "_______________\n",
      "Postmunge returned ID column set: \n",
      "['Automunge_index_198806890858']\n",
      "\n",
      "Postmunge returned test column set: \n",
      "['column1_1010_0', 'column1_1010_1', 'column1_1010_2', 'column2_bnry', 'address_1010_0', 'address_1010_1', 'address_1010_2', 'address_1010_3']\n",
      "\n",
      "_______________\n",
      "Postmunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#The corresponding postmunge(.) call, again spelling out the full range of parameters.\n",
    "#It takes the postprocess_dict returned by automunge(.) together with the data to encode.\n",
    "(test, testID, testlabels,\n",
    " labelsencoding_dict, postreports_dict) = am.postmunge(\n",
    "    postprocess_dict,\n",
    "    df_test,\n",
    "    testID_column=False,\n",
    "    labelscolumn=False,\n",
    "    pandasoutput=False,\n",
    "    printstatus=True,\n",
    "    TrainLabelFreqLevel=False,\n",
    "    featureeval=False,\n",
    "    driftreport=False,\n",
    "    LabelSmoothing=False,\n",
    "    LSfit=False,\n",
    "    returnedsets=True,\n",
    "    shuffletrain=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 328
    },
    "colab_type": "code",
    "id": "5vfHUFMu7BO4",
    "outputId": "2044b560-0d55-49e1-9f31-a4bf8f1de586"
   },
   "outputs": [],
   "source": [
    "#The calls above return the encoded data as numpy arrays (pandasoutput=False).\n",
    "#Passing pandasoutput=True returns pandas dataframes instead,\n",
    "#which is what we'll do in the examples below,\n",
    "#along with turning off the default shuffling and printouts."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "bOaVAFYw6mdt"
   },
   "source": [
    "# Demonstrations from Paper"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "XVF_8O7q62y1"
   },
   "source": [
    "## Categoric Encodings - Figure 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 328
    },
    "colab_type": "code",
    "id": "LDv-U2qv6Fle",
    "outputId": "289701e2-ad64-4f1b-bf5f-76bac81c543b"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>column1_1234</th>\n",
       "      <th>column1_circle</th>\n",
       "      <th>column1_square</th>\n",
       "      <th>column1_triangle</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>square</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>square</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>triangle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1234</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  source_column  column1_1234  column1_circle  column1_square  \\\n",
       "0        circle             0               1               0   \n",
       "1        circle             0               1               0   \n",
       "2        circle             0               1               0   \n",
       "3        square             0               0               1   \n",
       "4        square             0               0               1   \n",
       "5      triangle             0               0               0   \n",
       "6          1234             1               0               0   \n",
       "7           NaN             0               0               0   \n",
       "8           NaN             0               0               0   \n",
       "\n",
       "   column1_triangle  \n",
       "0                 0  \n",
       "1                 0  \n",
       "2                 0  \n",
       "3                 0  \n",
       "4                 0  \n",
       "5                 1  \n",
       "6                 0  \n",
       "7                 0  \n",
       "8                 0  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'text' (one-hot encoding)\n",
    "\n",
    "#for the demonstrations we encode a single column at a time,\n",
    "#carrying a second copy of that column alongside for comparison\n",
    "df = df_train[['column1']].copy()\n",
    "df['source_column'] = df['column1'].copy()\n",
    "\n",
    "#'text' is assigned to column1 and 'excl' to the comparison copy;\n",
    "#shuffling and printouts are turned off and dataframes are returned\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df,\n",
    "    shuffletrain=False,\n",
    "    pandasoutput=True,\n",
    "    assigncat={'text': ['column1'], 'excl': ['source_column']},\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "#display the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 328
    },
    "colab_type": "code",
    "id": "-wJPI9cL_FAk",
    "outputId": "9ff9be44-bebc-45c7-9466-ae29dcaf162d"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>column1_1010_0</th>\n",
       "      <th>column1_1010_1</th>\n",
       "      <th>column1_1010_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>square</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>square</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>triangle</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1234</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  source_column  column1_1010_0  column1_1010_1  column1_1010_2\n",
       "0        circle               0               0               1\n",
       "1        circle               0               0               1\n",
       "2        circle               0               0               1\n",
       "3        square               0               1               0\n",
       "4        square               0               1               0\n",
       "5      triangle               0               1               1\n",
       "6          1234               0               0               0\n",
       "7           NaN               1               0               0\n",
       "8           NaN               1               0               0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'1010' (binary encoding)\n",
    "\n",
    "#single-column frame plus a copy of that column kept for comparison\n",
    "df = df_train[['column1']].assign(source_column=df_train['column1'])\n",
    "\n",
    "#'1010' is assigned to column1 and 'excl' to the comparison copy\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df,\n",
    "    shuffletrain=False,\n",
    "    pandasoutput=True,\n",
    "    assigncat={'1010': ['column1'], 'excl': ['source_column']},\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "#display the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 328
    },
    "colab_type": "code",
    "id": "1sWSxfnv_Y2e",
    "outputId": "33432117-dcd2-46ac-cd03-fef534611993"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>column1_ordl</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>circle</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>circle</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>circle</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>square</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>square</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>triangle</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1234</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  source_column  column1_ordl\n",
       "0        circle             1\n",
       "1        circle             1\n",
       "2        circle             1\n",
       "3        square             2\n",
       "4        square             2\n",
       "5      triangle             3\n",
       "6          1234             0\n",
       "7           NaN             4\n",
       "8           NaN             4"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'ordl' (ordinal alphabetical)\n",
    "\n",
    "#single-column frame plus a copy of that column kept for comparison\n",
    "df = df_train[['column1']].assign(source_column=df_train['column1'])\n",
    "\n",
    "#'ordl' is assigned to column1 and 'excl' to the comparison copy\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df,\n",
    "    shuffletrain=False,\n",
    "    pandasoutput=True,\n",
    "    assigncat={'ordl': ['column1'], 'excl': ['source_column']},\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "#display the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>column1_ord3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>square</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>square</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>triangle</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1234</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  source_column  column1_ord3\n",
       "0        circle             0\n",
       "1        circle             0\n",
       "2        circle             0\n",
       "3        square             1\n",
       "4        square             1\n",
       "5      triangle             4\n",
       "6          1234             3\n",
       "7           NaN             2\n",
       "8           NaN             2"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'ord3' (ordinal by frequency)\n",
    "\n",
    "#single-column frame plus a copy of that column kept for comparison\n",
    "df = df_train[['column1']].assign(source_column=df_train['column1'])\n",
    "\n",
    "#'ord3' is assigned to column1 and 'excl' to the comparison copy\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df,\n",
    "    shuffletrain=False,\n",
    "    pandasoutput=True,\n",
    "    assigncat={'ord3': ['column1'], 'excl': ['source_column']},\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "#display the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 328
    },
    "colab_type": "code",
    "id": "ZEbGtnAK_q9n",
    "outputId": "46677228-c02f-4975-ebee-809a22f7ab1d"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>column2_bnry</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>yes</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>yes</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>yes</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>yes</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>no</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>no</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>no</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  source_column  column2_bnry\n",
       "0           yes             1\n",
       "1           yes             1\n",
       "2           yes             1\n",
       "3           yes             1\n",
       "4            no             0\n",
       "5            no             0\n",
       "6            no             0\n",
       "7           NaN             1\n",
       "8           NaN             1"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'bnry': binary (boolean) encoding of a two-value categoric column\n",
    "\n",
    "# Build a one-column frame and add an untouched duplicate of the raw values;\n",
    "# the duplicate is assigned to 'excl' so it can be shown beside the encoding.\n",
    "df = df_train[['column2']].copy()\n",
    "df['source_column'] = df['column2'].copy()\n",
    "\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df,\n",
    "    shuffletrain=False,\n",
    "    pandasoutput=True,\n",
    "    assigncat={'bnry': ['column2'], 'excl': ['source_column']},\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "# display the processed training data\n",
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "YZrF9ZJu_8xn"
   },
   "source": [
    "## String Parsing - Figure 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 464
    },
    "colab_type": "code",
    "id": "RMz54TBa_3ah",
    "outputId": "fa6264f0-2d11-44c5-d95a-e9b14581f4b1"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_splt_ South Peterson St Maitland, FL 32789</th>\n",
       "      <th>address_splt_ North Peterson St Orlando, FL 32714</th>\n",
       "      <th>address_splt_ St Altamonte Springs, FL 32715</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...   \n",
       "2          3456 South Peterson St Maitland, FL 32789   \n",
       "3           4567 North Peterson St Orlando, FL 32714   \n",
       "4                   5678 Avenue St Orlando, FL 32714   \n",
       "5          6789 South Peterson St Maitland, FL 32789   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715   \n",
       "7                                               None   \n",
       "8                                        Orlando, FL   \n",
       "\n",
       "   address_splt_ South Peterson St Maitland, FL 32789  \\\n",
       "0                                                  0    \n",
       "1                                                  0    \n",
       "2                                                  1    \n",
       "3                                                  0    \n",
       "4                                                  0    \n",
       "5                                                  1    \n",
       "6                                                  0    \n",
       "7                                                  0    \n",
       "8                                                  0    \n",
       "\n",
       "   address_splt_ North Peterson St Orlando, FL 32714  \\\n",
       "0                                                  1   \n",
       "1                                                  0   \n",
       "2                                                  0   \n",
       "3                                                  1   \n",
       "4                                                  0   \n",
       "5                                                  0   \n",
       "6                                                  0   \n",
       "7                                                  0   \n",
       "8                                                  0   \n",
       "\n",
       "   address_splt_ St Altamonte Springs, FL 32715  \n",
       "0                                             0  \n",
       "1                                             1  \n",
       "2                                             0  \n",
       "3                                             0  \n",
       "4                                             0  \n",
       "5                                             0  \n",
       "6                                             1  \n",
       "7                                             0  \n",
       "8                                             0  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'splt': string overlap identification; each overlap shared between entries\n",
    "# is returned as its own boolean column (address_splt_<overlap>)\n",
    "\n",
    "# Build a one-column frame and add an untouched duplicate of the raw values;\n",
    "# the duplicate is assigned to 'excl' so it can be shown beside the encoding.\n",
    "df = df_train[['address']].copy()\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df,\n",
    "    shuffletrain=False,\n",
    "    pandasoutput=True,\n",
    "    assigncat={'splt': ['address'], 'excl': ['source_column']},\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "# display the processed training data\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_splt_Altamonte</th>\n",
       "      <th>address_splt_Peterson</th>\n",
       "      <th>address_splt_Maitland</th>\n",
       "      <th>address_splt_Orlando</th>\n",
       "      <th>address_splt_Springs</th>\n",
       "      <th>address_splt_North</th>\n",
       "      <th>address_splt_32714</th>\n",
       "      <th>address_splt_South</th>\n",
       "      <th>address_splt_32715</th>\n",
       "      <th>address_splt_32789</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_splt_Altamonte  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714                       0   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                       1   \n",
       "2          3456 South Peterson St Maitland, FL 32789                       0   \n",
       "3           4567 North Peterson St Orlando, FL 32714                       0   \n",
       "4                   5678 Avenue St Orlando, FL 32714                       0   \n",
       "5          6789 South Peterson St Maitland, FL 32789                       0   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715                       1   \n",
       "7                                               None                       0   \n",
       "8                                        Orlando, FL                       0   \n",
       "\n",
       "   address_splt_Peterson  address_splt_Maitland  address_splt_Orlando  \\\n",
       "0                      1                      0                     1   \n",
       "1                      0                      0                     0   \n",
       "2                      1                      1                     0   \n",
       "3                      1                      0                     1   \n",
       "4                      0                      0                     1   \n",
       "5                      1                      1                     0   \n",
       "6                      0                      0                     0   \n",
       "7                      0                      0                     0   \n",
       "8                      0                      0                     1   \n",
       "\n",
       "   address_splt_Springs  address_splt_North  address_splt_32714  \\\n",
       "0                     0                   1                   1   \n",
       "1                     1                   0                   0   \n",
       "2                     0                   0                   0   \n",
       "3                     0                   1                   1   \n",
       "4                     0                   0                   1   \n",
       "5                     0                   0                   0   \n",
       "6                     1                   1                   0   \n",
       "7                     0                   0                   0   \n",
       "8                     0                   0                   0   \n",
       "\n",
       "   address_splt_South  address_splt_32715  address_splt_32789  \n",
       "0                   0                   0                   0  \n",
       "1                   1                   1                   0  \n",
       "2                   1                   0                   1  \n",
       "3                   0                   0                   0  \n",
       "4                   0                   0                   0  \n",
       "5                   1                   0                   1  \n",
       "6                   0                   1                   0  \n",
       "7                   0                   0                   0  \n",
       "8                   0                   0                   0  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'splt' (string overlap identification), excluding spaces and punctuation\n",
    "\n",
    "df = df_train[['address']].copy()\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "# The paper also noted that parameters may be passed to the string parsing\n",
    "# functions to exclude spaces and special characters, which promotes\n",
    "# single-word activations. This demonstration was not shown in Figure 2.\n",
    "\n",
    "# Passing 'space_and_punctuation' as False excludes special characters from\n",
    "# the identified overlaps.\n",
    "\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df,\n",
    "    shuffletrain=False,\n",
    "    pandasoutput=True,\n",
    "    assigncat={'splt': ['address'], 'excl': ['source_column']},\n",
    "    assignparam={'splt': {'address': {'space_and_punctuation': False}}},\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "# display the processed training data\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_splt_ South Peterson St Maitland, FL 32789</th>\n",
       "      <th>address_splt_ North Peterson St Orlando, FL 32714</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...   \n",
       "2          3456 South Peterson St Maitland, FL 32789   \n",
       "3           4567 North Peterson St Orlando, FL 32714   \n",
       "4                   5678 Avenue St Orlando, FL 32714   \n",
       "5          6789 South Peterson St Maitland, FL 32789   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715   \n",
       "7                                               None   \n",
       "8                                        Orlando, FL   \n",
       "\n",
       "   address_splt_ South Peterson St Maitland, FL 32789  \\\n",
       "0                                                  0    \n",
       "1                                                  0    \n",
       "2                                                  1    \n",
       "3                                                  0    \n",
       "4                                                  0    \n",
       "5                                                  1    \n",
       "6                                                  0    \n",
       "7                                                  0    \n",
       "8                                                  0    \n",
       "\n",
       "   address_splt_ North Peterson St Orlando, FL 32714  \n",
       "0                                                  1  \n",
       "1                                                  0  \n",
       "2                                                  0  \n",
       "3                                                  1  \n",
       "4                                                  0  \n",
       "5                                                  0  \n",
       "6                                                  0  \n",
       "7                                                  0  \n",
       "8                                                  0  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'splt' (string overlap identification) with a minimum overlap length\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "#the paper also noted that the overlap search steps from the maximum entry length\n",
    "#down to a configurable minimum overlap length (the overlap detection threshold)\n",
    "#here is a demonstration (was not shown in Figure 2)\n",
    "\n",
    "#Here we pass the 'minsplit' parameter as 32 so that overlaps shorter than this\n",
    "#threshold are filtered out\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df,\n",
    "  shuffletrain = False,\n",
    "  pandasoutput = True,\n",
    "  assigncat = {'splt':['address'], 'excl':['source_column']},\n",
    "  assignparam = {'splt' : {'address' : {'minsplit':32}}},\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "colab_type": "code",
    "id": "ux1LHMRVAsnd",
    "outputId": "4dfb7216-8ba0-4c53-fb5d-61b812258692"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_spl2_ord3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_spl2_ord3\n",
       "0           1234 North Peterson St Orlando, FL 32714                  0\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                  2\n",
       "2          3456 South Peterson St Maitland, FL 32789                  1\n",
       "3           4567 North Peterson St Orlando, FL 32714                  0\n",
       "4                   5678 Avenue St Orlando, FL 32714                  3\n",
       "5          6789 South Peterson St Maitland, FL 32789                  1\n",
       "6    5858 North Other St Altamonte Springs, FL 32715                  2\n",
       "7                                               None                  4\n",
       "8                                        Orlando, FL                  5"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'spl2': string overlap parsing returned as a single ordinal column\n",
    "# (address_spl2_ord3 in the output below)\n",
    "\n",
    "# Build a one-column frame and add an untouched duplicate of the raw values;\n",
    "# the duplicate is assigned to 'excl' so it can be shown beside the encoding.\n",
    "df = df_train[['address']].copy()\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df,\n",
    "    shuffletrain=False,\n",
    "    pandasoutput=True,\n",
    "    assigncat={'spl2': ['address'], 'excl': ['source_column']},\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "# display the processed training data\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "w6f88HyyA8Eo"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_spl5_ord3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_spl5_ord3\n",
       "0           1234 North Peterson St Orlando, FL 32714                  1\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                  3\n",
       "2          3456 South Peterson St Maitland, FL 32789                  2\n",
       "3           4567 North Peterson St Orlando, FL 32714                  1\n",
       "4                   5678 Avenue St Orlando, FL 32714                  0\n",
       "5          6789 South Peterson St Maitland, FL 32789                  2\n",
       "6    5858 North Other St Altamonte Springs, FL 32715                  3\n",
       "7                                               None                  0\n",
       "8                                        Orlando, FL                  0"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'spl5' (spl2 w/ excluded non-overlaps)\n",
    "\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  assigncat = {'spl5':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_sp15_ South Peterson St Maitland, FL 32789</th>\n",
       "      <th>address_sp15_ North Peterson St Orlando, FL 32714</th>\n",
       "      <th>address_sp15_ St Altamonte Springs, FL 32715</th>\n",
       "      <th>address_sp15_ St Orlando, FL 32714</th>\n",
       "      <th>address_sp15_Orlando, FL</th>\n",
       "      <th>address_sp15_erson St</th>\n",
       "      <th>address_sp15_ South</th>\n",
       "      <th>address_sp15_ North</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...   \n",
       "2          3456 South Peterson St Maitland, FL 32789   \n",
       "3           4567 North Peterson St Orlando, FL 32714   \n",
       "4                   5678 Avenue St Orlando, FL 32714   \n",
       "5          6789 South Peterson St Maitland, FL 32789   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715   \n",
       "7                                               None   \n",
       "8                                        Orlando, FL   \n",
       "\n",
       "   address_sp15_ South Peterson St Maitland, FL 32789  \\\n",
       "0                                                  0    \n",
       "1                                                  0    \n",
       "2                                                  1    \n",
       "3                                                  0    \n",
       "4                                                  0    \n",
       "5                                                  1    \n",
       "6                                                  0    \n",
       "7                                                  0    \n",
       "8                                                  0    \n",
       "\n",
       "   address_sp15_ North Peterson St Orlando, FL 32714  \\\n",
       "0                                                  1   \n",
       "1                                                  0   \n",
       "2                                                  0   \n",
       "3                                                  1   \n",
       "4                                                  0   \n",
       "5                                                  0   \n",
       "6                                                  0   \n",
       "7                                                  0   \n",
       "8                                                  0   \n",
       "\n",
       "   address_sp15_ St Altamonte Springs, FL 32715  \\\n",
       "0                                             0   \n",
       "1                                             1   \n",
       "2                                             0   \n",
       "3                                             0   \n",
       "4                                             0   \n",
       "5                                             0   \n",
       "6                                             1   \n",
       "7                                             0   \n",
       "8                                             0   \n",
       "\n",
       "   address_sp15_ St Orlando, FL 32714  address_sp15_Orlando, FL  \\\n",
       "0                                   1                         1   \n",
       "1                                   0                         0   \n",
       "2                                   0                         0   \n",
       "3                                   1                         1   \n",
       "4                                   1                         1   \n",
       "5                                   0                         0   \n",
       "6                                   0                         0   \n",
       "7                                   0                         0   \n",
       "8                                   0                         1   \n",
       "\n",
       "   address_sp15_erson St   address_sp15_ South   address_sp15_ North   \n",
       "0                       1                     0                     1  \n",
       "1                       1                     1                     0  \n",
       "2                       1                     1                     0  \n",
       "3                       1                     0                     1  \n",
       "4                       0                     0                     0  \n",
       "5                       1                     1                     0  \n",
       "6                       0                     0                     1  \n",
       "7                       0                     0                     0  \n",
       "8                       0                     0                     0  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'sp15' (string overlap with allowed concurrent activations)\n",
    "\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  assigncat = {'sp15':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A variation on string parsing with concurrent activations is available to reduce the dimensionality of the returned set, by way of a binary encoding consolidation of the set of activations. For example, if sp15 returned 8 columns and two of the rows had the same set of activations, the binary consolidation would assign a single distinct activation set shared by those two rows, represented in a reduced number of columns. Here we see that taking place for rows 0/3 and 2/5."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_sp19_0</th>\n",
       "      <th>address_sp19_1</th>\n",
       "      <th>address_sp19_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_sp19_0  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714               1   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...               1   \n",
       "2          3456 South Peterson St Maitland, FL 32789               1   \n",
       "3           4567 North Peterson St Orlando, FL 32714               1   \n",
       "4                   5678 Avenue St Orlando, FL 32714               0   \n",
       "5          6789 South Peterson St Maitland, FL 32789               1   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715               0   \n",
       "7                                               None               0   \n",
       "8                                        Orlando, FL               0   \n",
       "\n",
       "   address_sp19_1  address_sp19_2  \n",
       "0               0               1  \n",
       "1               0               0  \n",
       "2               1               0  \n",
       "3               0               1  \n",
       "4               1               0  \n",
       "5               1               0  \n",
       "6               1               1  \n",
       "7               0               0  \n",
       "8               0               1  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'sp19' (string overlap with allowed concurrent activations, binary consolidated activations)\n",
    "\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  assigncat = {'sp19':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Another variation is available with the \"sbst\" transforms. sbst is a simpler version of string parsing: instead of comparing string character subsets of entries to string character subsets of other entries, the sbst string parsing only compares string character subsets of entries to the complete character representations of other entries, so as to identify overlaps between complete entries and subsets of other entries. Here is a demonstration, which for this example will identify the entry \"Orlando, FL\" as present as a subset of some of the other entries:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_sbst_Orlando, FL</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_sbst_Orlando, FL\n",
       "0           1234 North Peterson St Orlando, FL 32714                         1\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                         0\n",
       "2          3456 South Peterson St Maitland, FL 32789                         0\n",
       "3           4567 North Peterson St Orlando, FL 32714                         1\n",
       "4                   5678 Avenue St Orlando, FL 32714                         1\n",
       "5          6789 South Peterson St Maitland, FL 32789                         0\n",
       "6    5858 North Other St Altamonte Springs, FL 32715                         0\n",
       "7                                               None                         0\n",
       "8                                        Orlando, FL                         1"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'sbst' (string overlap with complete entries)\n",
    "\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  assigncat = {'sbst':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parsing Unbounded Sets - Figure 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_nmcm</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>32714.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>32715.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>32789.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>32714.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>32714.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>32789.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>32715.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>32735.714844</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>32735.714844</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_nmcm\n",
       "0           1234 North Peterson St Orlando, FL 32714  32714.000000\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...  32715.000000\n",
       "2          3456 South Peterson St Maitland, FL 32789  32789.000000\n",
       "3           4567 North Peterson St Orlando, FL 32714  32714.000000\n",
       "4                   5678 Avenue St Orlando, FL 32714  32714.000000\n",
       "5          6789 South Peterson St Maitland, FL 32789  32789.000000\n",
       "6    5858 North Other St Altamonte Springs, FL 32715  32715.000000\n",
       "7                                               None  32735.714844\n",
       "8                                        Orlando, FL  32735.714844"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'nmcm' (string parse for number, commas ok)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  assigncat = {'nmcm':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_nmcm_nmbr</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>-0.657041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.690181</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.690181</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>-0.657041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_nmcm_nmbr\n",
       "0           1234 North Peterson St Orlando, FL 32714          -0.688760\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...          -0.657041\n",
       "2          3456 South Peterson St Maitland, FL 32789           1.690181\n",
       "3           4567 North Peterson St Orlando, FL 32714          -0.688760\n",
       "4                   5678 Avenue St Orlando, FL 32714          -0.688760\n",
       "5          6789 South Peterson St Maitland, FL 32789           1.690181\n",
       "6    5858 North Other St Altamonte Springs, FL 32715          -0.657041\n",
       "7                                               None           0.000000\n",
       "8                                        Orlando, FL           0.000000"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'nmc2' (nmcm with z-score)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  assigncat = {'nmc2':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_nmcm_mnmx</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>0.013333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0.013333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0.289524</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0.289524</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_nmcm_mnmx\n",
       "0           1234 North Peterson St Orlando, FL 32714           0.000000\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...           0.013333\n",
       "2          3456 South Peterson St Maitland, FL 32789           1.000000\n",
       "3           4567 North Peterson St Orlando, FL 32714           0.000000\n",
       "4                   5678 Avenue St Orlando, FL 32714           0.000000\n",
       "5          6789 South Peterson St Maitland, FL 32789           1.000000\n",
       "6    5858 North Other St Altamonte Springs, FL 32715           0.013333\n",
       "7                                               None           0.289524\n",
       "8                                        Orlando, FL           0.289524"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'nmc3' (nmcm with min-max)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  assigncat = {'nmc3':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_strn_ord3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_strn_ord3\n",
       "0           1234 North Peterson St Orlando, FL 32714                  0\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                  4\n",
       "2          3456 South Peterson St Maitland, FL 32789                  1\n",
       "3           4567 North Peterson St Orlando, FL 32714                  0\n",
       "4                   5678 Avenue St Orlando, FL 32714                  2\n",
       "5          6789 South Peterson St Maitland, FL 32789                  1\n",
       "6    5858 North Other St Altamonte Springs, FL 32715                  3\n",
       "7                                               None                  5\n",
       "8                                        Orlando, FL                  6"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'strn' (string extraction of non-numeric characters)\n",
    "\n",
    "#this was noted in paper and not shown in figure\n",
    "#strn is similar to nmcm but extracts longest length non-numeric character set\n",
    "\n",
    "#by default it is followed with an ord3 ordinal for numeric encoding\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  assigncat = {'strn':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_srch_Maitland</th>\n",
       "      <th>address_srch_Orlando</th>\n",
       "      <th>address_srch_Altamonte Springs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_srch_Maitland  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714                      0   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                      0   \n",
       "2          3456 South Peterson St Maitland, FL 32789                      1   \n",
       "3           4567 North Peterson St Orlando, FL 32714                      0   \n",
       "4                   5678 Avenue St Orlando, FL 32714                      0   \n",
       "5          6789 South Peterson St Maitland, FL 32789                      1   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715                      0   \n",
       "7                                               None                      0   \n",
       "8                                        Orlando, FL                      0   \n",
       "\n",
       "   address_srch_Orlando  address_srch_Altamonte Springs  \n",
       "0                     1                               0  \n",
       "1                     0                               1  \n",
       "2                     0                               0  \n",
       "3                     1                               0  \n",
       "4                     1                               0  \n",
       "5                     0                               0  \n",
       "6                     0                               1  \n",
       "7                     0                               0  \n",
       "8                     1                               0  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'srch' (categoric string search)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "#note that the srch transform accepts parameter 'search'\n",
    "#as a list of search terms\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  assigncat = {'srch':['address'], 'excl':['source_column']}, \\\n",
    "  assignparam = {'srch' : {'address'   : {'search' : ['Maitland', 'Orlando', 'Altamonte Springs']}}}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_srch_Orlando</th>\n",
       "      <th>address_srch_Altamonte Springs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_srch_Orlando  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714                     1   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                     0   \n",
       "2          3456 South Peterson St Maitland, FL 32789                     1   \n",
       "3           4567 North Peterson St Orlando, FL 32714                     1   \n",
       "4                   5678 Avenue St Orlando, FL 32714                     1   \n",
       "5          6789 South Peterson St Maitland, FL 32789                     1   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715                     0   \n",
       "7                                               None                     0   \n",
       "8                                        Orlando, FL                     1   \n",
       "\n",
       "   address_srch_Altamonte Springs  \n",
       "0                               0  \n",
       "1                               1  \n",
       "2                               0  \n",
       "3                               0  \n",
       "4                               0  \n",
       "5                               0  \n",
       "6                               1  \n",
       "7                               0  \n",
       "8                               0  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'srch' (categoric string search)\n",
    "\n",
    "#the paper noted potential to aggregate search terms into a common activation\n",
    "#this is achieved by passing parameter search with embedded lists of terms to be aggregated\n",
    "\n",
    "#Here we'll demonstrate by aggregating ['Maitland', 'Orlando'] into common activation\n",
    "#in context of pass search parameter as [['Maitland', 'Orlando'], 'Altamonte Springs']\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  assigncat = {'srch':['address'], 'excl':['source_column']}, \\\n",
    "  assignparam = {'srch' : {'address'   : {'search' : [['Maitland', 'Orlando'], 'Altamonte Springs']}}}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_src2_Maitland</th>\n",
       "      <th>address_src2_Orlando</th>\n",
       "      <th>address_src2_Altamonte Springs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_src2_Maitland  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714                      0   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...                      0   \n",
       "2          3456 South Peterson St Maitland, FL 32789                      1   \n",
       "3           4567 North Peterson St Orlando, FL 32714                      0   \n",
       "4                   5678 Avenue St Orlando, FL 32714                      0   \n",
       "5          6789 South Peterson St Maitland, FL 32789                      1   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715                      0   \n",
       "7                                               None                      0   \n",
       "8                                        Orlando, FL                      0   \n",
       "\n",
       "   address_src2_Orlando  address_src2_Altamonte Springs  \n",
       "0                     1                               0  \n",
       "1                     0                               1  \n",
       "2                     0                               0  \n",
       "3                     1                               0  \n",
       "4                     1                               0  \n",
       "5                     0                               0  \n",
       "6                     0                               1  \n",
       "7                     0                               0  \n",
       "8                     1                               0  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'src2' (categoric string search)\n",
    "\n",
    "#the paper also noted a variation with potential improved efficiency based on \n",
    "#added assumptions of whether the target set has a narrow range of entries\n",
    "\n",
    "#This is available with the src2 transform, which returns comparable\n",
    "#activations as srch (just a different suffix appender)\n",
    "\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  assigncat = {'src2':['address'], 'excl':['source_column']}, \\\n",
    "  assignparam = {'src2' : {'address'   : {'search' : ['Maitland', 'Orlando', 'Altamonte Springs']}}}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_src4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_src4\n",
       "0           1234 North Peterson St Orlando, FL 32714             2\n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...             3\n",
       "2          3456 South Peterson St Maitland, FL 32789             1\n",
       "3           4567 North Peterson St Orlando, FL 32714             2\n",
       "4                   5678 Avenue St Orlando, FL 32714             2\n",
       "5          6789 South Peterson St Maitland, FL 32789             1\n",
       "6    5858 North Other St Altamonte Springs, FL 32715             3\n",
       "7                                               None             0\n",
       "8                                        Orlando, FL             2"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'src4' (categoric string search, ordinal encoding)\n",
    "\n",
    "#the convention is that for cases of multiple activations to same row\n",
    "#entries toward end of search parameter list take precendence\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "#note that the src4 transform accepts parameter 'search'\n",
    "#as a list of search terms\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  assigncat = {'src4':['address'], 'excl':['source_column']}, \\\n",
    "  assignparam = {'src4' : {'address'   : {'search' : ['Maitland', 'Orlando', 'Altamonte Springs']}}}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Family Tree Aggregations - Figure 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_UPCS_nmc7_nmbr</th>\n",
       "      <th>address_UPCS_1010_0</th>\n",
       "      <th>address_UPCS_1010_1</th>\n",
       "      <th>address_UPCS_1010_2</th>\n",
       "      <th>address_UPCS_1010_3</th>\n",
       "      <th>address_UPCS_spl9_ord3</th>\n",
       "      <th>address_UPCS_spl9_sp10_ord3</th>\n",
       "      <th>address_NArw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>-0.657041</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.690181</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.690181</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>-0.657041</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_UPCS_nmc7_nmbr  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714               -0.688760   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...               -0.657041   \n",
       "2          3456 South Peterson St Maitland, FL 32789                1.690181   \n",
       "3           4567 North Peterson St Orlando, FL 32714               -0.688760   \n",
       "4                   5678 Avenue St Orlando, FL 32714               -0.688760   \n",
       "5          6789 South Peterson St Maitland, FL 32789                1.690181   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715               -0.657041   \n",
       "7                                               None                0.000000   \n",
       "8                                        Orlando, FL                0.000000   \n",
       "\n",
       "   address_UPCS_1010_0  address_UPCS_1010_1  address_UPCS_1010_2  \\\n",
       "0                    0                    0                    0   \n",
       "1                    0                    0                    0   \n",
       "2                    0                    0                    1   \n",
       "3                    0                    0                    1   \n",
       "4                    0                    1                    0   \n",
       "5                    0                    1                    1   \n",
       "6                    0                    1                    0   \n",
       "7                    1                    0                    0   \n",
       "8                    0                    1                    1   \n",
       "\n",
       "   address_UPCS_1010_3  address_UPCS_spl9_ord3  address_UPCS_spl9_sp10_ord3  \\\n",
       "0                    0                       0                            1   \n",
       "1                    1                       2                            0   \n",
       "2                    0                       1                            2   \n",
       "3                    1                       0                            1   \n",
       "4                    0                       3                            1   \n",
       "5                    0                       1                            2   \n",
       "6                    1                       2                            0   \n",
       "7                    0                       4                            0   \n",
       "8                    1                       5                            0   \n",
       "\n",
       "   address_NArw  \n",
       "0             0  \n",
       "1             0  \n",
       "2             0  \n",
       "3             0  \n",
       "4             0  \n",
       "5             0  \n",
       "6             0  \n",
       "7             1  \n",
       "8             0  "
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#'or19' (a family tree of transformations)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "#here we'll also activate the NArw_marker parameter\n",
    "#to be consistent with the demonstrations in paper\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  NArw_marker = True, \\\n",
    "  assigncat = {'or19':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>address_UPCS_nmc7_nmbr</th>\n",
       "      <th>address_UPCS_1010_0</th>\n",
       "      <th>address_UPCS_1010_1</th>\n",
       "      <th>address_UPCS_1010_2</th>\n",
       "      <th>address_UPCS_1010_3</th>\n",
       "      <th>address_UPCS_spl9_ord3</th>\n",
       "      <th>address_UPCS_spl9_sp10_ord3</th>\n",
       "      <th>address_NArw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   address_UPCS_nmc7_nmbr  address_UPCS_1010_0  address_UPCS_1010_1  \\\n",
       "0                    True                 True                 True   \n",
       "1                    True                 True                 True   \n",
       "2                    True                 True                 True   \n",
       "3                    True                 True                 True   \n",
       "4                    True                 True                 True   \n",
       "5                    True                 True                 True   \n",
       "6                    True                 True                 True   \n",
       "7                    True                 True                 True   \n",
       "8                    True                 True                 True   \n",
       "\n",
       "   address_UPCS_1010_2  address_UPCS_1010_3  address_UPCS_spl9_ord3  \\\n",
       "0                 True                 True                    True   \n",
       "1                 True                 True                    True   \n",
       "2                 True                 True                    True   \n",
       "3                 True                 True                    True   \n",
       "4                 True                 True                    True   \n",
       "5                 True                 True                    True   \n",
       "6                 True                 True                    True   \n",
       "7                 True                 True                    True   \n",
       "8                 True                 True                    True   \n",
       "\n",
       "   address_UPCS_spl9_sp10_ord3  address_NArw  \n",
       "0                         True          True  \n",
       "1                         True          True  \n",
       "2                         True          True  \n",
       "3                         True          True  \n",
       "4                         True          True  \n",
       "5                         True          True  \n",
       "6                         True          True  \n",
       "7                         True          True  \n",
       "8                         True          True  "
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#here we'll validate or19 by running same data through postmunge(.)\n",
    "\n",
    "#the comparison between the returned sets demonstrates consistency of transforms\n",
    "#these types of comparisons between functions applied to different kinds of data\n",
    "#are one of the ways we validate\n",
    "\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, postreports_dict = \\\n",
    "am.postmunge(postprocess_dict, df, \\\n",
    "             pandasoutput = True, printstatus = False)\n",
    "\n",
    "test[['address_UPCS_nmc7_nmbr',\n",
    " 'address_UPCS_1010_0',\n",
    " 'address_UPCS_1010_1',\n",
    " 'address_UPCS_1010_2',\n",
    " 'address_UPCS_1010_3',\n",
    " 'address_UPCS_spl9_ord3',\n",
    " 'address_UPCS_spl9_sp10_ord3',\n",
    " 'address_NArw']] \\\n",
    "== train[['address_UPCS_nmc7_nmbr',\n",
    " 'address_UPCS_1010_0',\n",
    " 'address_UPCS_1010_1',\n",
    " 'address_UPCS_1010_2',\n",
    " 'address_UPCS_1010_3',\n",
    " 'address_UPCS_spl9_ord3',\n",
    " 'address_UPCS_spl9_sp10_ord3',\n",
    " 'address_NArw']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0             1234 NORTH PETERSON ST ORLANDO, FL 32714\n",
       "1    2345 SOUTH ANDERSON ST ALTAMONTE SPRINGS, FL 3...\n",
       "2            3456 SOUTH PETERSON ST MAITLAND, FL 32789\n",
       "3             4567 NORTH PETERSON ST ORLANDO, FL 32714\n",
       "4                     5678 AVENUE ST ORLANDO, FL 32714\n",
       "5            6789 SOUTH PETERSON ST MAITLAND, FL 32789\n",
       "6      5858 NORTH OTHER ST ALTAMONTE SPRINGS, FL 32715\n",
       "7                                            zzzinfill\n",
       "8                                          ORLANDO, FL\n",
       "Name: address, dtype: object"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Note that if we want to recover the original form of data from the returned sets\n",
    "#we can run an inversion operation with the postmunge(.) function\n",
    "\n",
    "#although some of the transformations, such as UPCS, may not have full info recovery \n",
    "#(such as here with case configuration)\n",
    "\n",
    "df_invert, recovered_list, inversion_info_dict = \\\n",
    "am.postmunge(postprocess_dict, test, inversion='test', \\\n",
    "             pandasoutput=True, printstatus=False)\n",
    "\n",
    "df_invert['address']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_column</th>\n",
       "      <th>address_UPCS_nmc7_nmbr</th>\n",
       "      <th>address_UPCS_1010_0</th>\n",
       "      <th>address_UPCS_1010_1</th>\n",
       "      <th>address_UPCS_1010_2</th>\n",
       "      <th>address_UPCS_1010_3</th>\n",
       "      <th>address_UPCS_spl9_ord3</th>\n",
       "      <th>address_UPCS_spl9_sp10_ord3</th>\n",
       "      <th>address_NArw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1234 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2345 South Anderson St Altamonte Springs, FL 3...</td>\n",
       "      <td>-0.657041</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3456 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.690181</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4567 North Peterson St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5678 Avenue St Orlando, FL 32714</td>\n",
       "      <td>-0.688760</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6789 South Peterson St Maitland, FL 32789</td>\n",
       "      <td>1.690181</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5858 North Other St Altamonte Springs, FL 32715</td>\n",
       "      <td>-0.657041</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>None</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Orlando, FL</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       source_column  address_UPCS_nmc7_nmbr  \\\n",
       "0           1234 North Peterson St Orlando, FL 32714               -0.688760   \n",
       "1  2345 South Anderson St Altamonte Springs, FL 3...               -0.657041   \n",
       "2          3456 South Peterson St Maitland, FL 32789                1.690181   \n",
       "3           4567 North Peterson St Orlando, FL 32714               -0.688760   \n",
       "4                   5678 Avenue St Orlando, FL 32714               -0.688760   \n",
       "5          6789 South Peterson St Maitland, FL 32789                1.690181   \n",
       "6    5858 North Other St Altamonte Springs, FL 32715               -0.657041   \n",
       "7                                               None                0.000000   \n",
       "8                                        Orlando, FL                0.000000   \n",
       "\n",
       "   address_UPCS_1010_0  address_UPCS_1010_1  address_UPCS_1010_2  \\\n",
       "0                    0                    0                    0   \n",
       "1                    0                    0                    0   \n",
       "2                    0                    0                    1   \n",
       "3                    0                    0                    1   \n",
       "4                    0                    1                    0   \n",
       "5                    0                    1                    1   \n",
       "6                    0                    1                    0   \n",
       "7                    1                    0                    0   \n",
       "8                    0                    1                    1   \n",
       "\n",
       "   address_UPCS_1010_3  address_UPCS_spl9_ord3  address_UPCS_spl9_sp10_ord3  \\\n",
       "0                    0                       0                            1   \n",
       "1                    1                       2                            0   \n",
       "2                    0                       1                            2   \n",
       "3                    1                       0                            1   \n",
       "4                    0                       3                            1   \n",
       "5                    0                       1                            2   \n",
       "6                    1                       2                            0   \n",
       "7                    0                       4                            0   \n",
       "8                    1                       5                            0   \n",
       "\n",
       "   address_NArw  \n",
       "0             0  \n",
       "1             0  \n",
       "2             0  \n",
       "3             0  \n",
       "4             0  \n",
       "5             0  \n",
       "6             0  \n",
       "7             1  \n",
       "8             0  "
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#the paper noted that the UPCS (upper case conversion) could be turned off by parameter\n",
    "#parameters are passed to transformation functions by assignparam using\n",
    "#the associated transformation category populated in the family tree of the root category\n",
    "#that is associated with the transformation function\n",
    "#for the case of the 'or19' root category, the transformation category\n",
    "#associated with the UPCS transformation function is 'or19'\n",
    "#but note this is due to the 'or19' entered as a transformation category in the family\n",
    "#tree, not due to the root category of same name\n",
    "\n",
    "#Here we'll demonstrate turning off UPCS\n",
    "\n",
    "#'or19' (a family tree of transformations)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "#here we'll also activate the NArw_marker parameter\n",
    "#to be consistent with the demonstrations in paper\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  NArw_marker = True, \\\n",
    "  assigncat = {'or19':['address'], 'excl':['source_column']}, \\\n",
    "  assignparam = {'or19' : {'address' : {'activate' : False}}}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "#show the returned train set\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "#As an admittedly low scale demonstration of processing time between train and test sets\n",
    "#Let's run a quick comparison on this small data set\n",
    "#Here we're interested in relative performance\n",
    "\n",
    "import timeit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "time elapsed:\n",
      "0.14940843800000003\n"
     ]
    }
   ],
   "source": [
    "#'or19' (a family tree of transformations)\n",
    "\n",
    "df = pd.DataFrame(df_train['address'].copy())\n",
    "df['source_column'] = df['address'].copy()\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  NArw_marker = True, \\\n",
    "  assigncat = {'or19':['address'], 'excl':['source_column']}, \\\n",
    "  printstatus = False)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "time elapsed:\n",
      "0.03092650799999941\n"
     ]
    }
   ],
   "source": [
    "#corresponding postmunge application\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, postreports_dict \\\n",
    "= am.postmunge(postprocess_dict, df, \\\n",
    "               pandasoutput = True, \\\n",
    "               printstatus = False)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Thanks!"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "draft String Theory demo.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
