{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from Automunge import Automunger\n",
    "am = Automunger.AutoMunge()\n",
    "\n",
    "from sklearn.svm import LinearSVC\n",
    "\n",
    "from sklearn.metrics import accuracy_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#I wasn't sure about distribution rights for the complete data set\n",
    "#so just using a 10,000 row excerpt for this demonstration\n",
    "#note the full data used for training was 8,823,544 samples\n",
    "#from which a 500,000 row validation set was extracted\n",
    "#available online at https://archive.ics.uci.edu/ml/datasets/HIGGS\n",
    "\n",
    "path = \"Higgs_data_partial.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>19</th>\n",
       "      <th>20</th>\n",
       "      <th>21</th>\n",
       "      <th>22</th>\n",
       "      <th>23</th>\n",
       "      <th>24</th>\n",
       "      <th>25</th>\n",
       "      <th>26</th>\n",
       "      <th>27</th>\n",
       "      <th>28</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.869293</td>\n",
       "      <td>-0.635082</td>\n",
       "      <td>0.225690</td>\n",
       "      <td>0.327470</td>\n",
       "      <td>-0.689993</td>\n",
       "      <td>0.754202</td>\n",
       "      <td>-0.248573</td>\n",
       "      <td>-1.092064</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.010455</td>\n",
       "      <td>-0.045767</td>\n",
       "      <td>3.101961</td>\n",
       "      <td>1.353760</td>\n",
       "      <td>0.979563</td>\n",
       "      <td>0.978076</td>\n",
       "      <td>0.920005</td>\n",
       "      <td>0.721657</td>\n",
       "      <td>0.988751</td>\n",
       "      <td>0.876678</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.907542</td>\n",
       "      <td>0.329147</td>\n",
       "      <td>0.359412</td>\n",
       "      <td>1.497970</td>\n",
       "      <td>-0.313010</td>\n",
       "      <td>1.095531</td>\n",
       "      <td>-0.557525</td>\n",
       "      <td>-1.588230</td>\n",
       "      <td>2.173076</td>\n",
       "      <td>...</td>\n",
       "      <td>-1.138930</td>\n",
       "      <td>-0.000819</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.302220</td>\n",
       "      <td>0.833048</td>\n",
       "      <td>0.985700</td>\n",
       "      <td>0.978098</td>\n",
       "      <td>0.779732</td>\n",
       "      <td>0.992356</td>\n",
       "      <td>0.798343</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.798835</td>\n",
       "      <td>1.470639</td>\n",
       "      <td>-1.635975</td>\n",
       "      <td>0.453773</td>\n",
       "      <td>0.425629</td>\n",
       "      <td>1.104875</td>\n",
       "      <td>1.282322</td>\n",
       "      <td>1.381664</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>1.128848</td>\n",
       "      <td>0.900461</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.909753</td>\n",
       "      <td>1.108330</td>\n",
       "      <td>0.985692</td>\n",
       "      <td>0.951331</td>\n",
       "      <td>0.803252</td>\n",
       "      <td>0.865924</td>\n",
       "      <td>0.780118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.344385</td>\n",
       "      <td>-0.876626</td>\n",
       "      <td>0.935913</td>\n",
       "      <td>1.992050</td>\n",
       "      <td>0.882454</td>\n",
       "      <td>1.786066</td>\n",
       "      <td>-1.646778</td>\n",
       "      <td>-0.942383</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.678379</td>\n",
       "      <td>-1.360356</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.946652</td>\n",
       "      <td>1.028704</td>\n",
       "      <td>0.998656</td>\n",
       "      <td>0.728281</td>\n",
       "      <td>0.869200</td>\n",
       "      <td>1.026736</td>\n",
       "      <td>0.957904</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.105009</td>\n",
       "      <td>0.321356</td>\n",
       "      <td>1.522401</td>\n",
       "      <td>0.882808</td>\n",
       "      <td>-1.205349</td>\n",
       "      <td>0.681466</td>\n",
       "      <td>-1.070464</td>\n",
       "      <td>-0.921871</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.373566</td>\n",
       "      <td>0.113041</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.755856</td>\n",
       "      <td>1.361057</td>\n",
       "      <td>0.986610</td>\n",
       "      <td>0.838085</td>\n",
       "      <td>1.133295</td>\n",
       "      <td>0.872245</td>\n",
       "      <td>0.808487</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 29 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     0         1         2         3         4         5         6         7  \\\n",
       "0  1.0  0.869293 -0.635082  0.225690  0.327470 -0.689993  0.754202 -0.248573   \n",
       "1  1.0  0.907542  0.329147  0.359412  1.497970 -0.313010  1.095531 -0.557525   \n",
       "2  1.0  0.798835  1.470639 -1.635975  0.453773  0.425629  1.104875  1.282322   \n",
       "3  0.0  1.344385 -0.876626  0.935913  1.992050  0.882454  1.786066 -1.646778   \n",
       "4  1.0  1.105009  0.321356  1.522401  0.882808 -1.205349  0.681466 -1.070464   \n",
       "\n",
       "          8         9  ...        19        20        21        22        23  \\\n",
       "0 -1.092064  0.000000  ... -0.010455 -0.045767  3.101961  1.353760  0.979563   \n",
       "1 -1.588230  2.173076  ... -1.138930 -0.000819  0.000000  0.302220  0.833048   \n",
       "2  1.381664  0.000000  ...  1.128848  0.900461  0.000000  0.909753  1.108330   \n",
       "3 -0.942383  0.000000  ... -0.678379 -1.360356  0.000000  0.946652  1.028704   \n",
       "4 -0.921871  0.000000  ... -0.373566  0.113041  0.000000  0.755856  1.361057   \n",
       "\n",
       "         24        25        26        27        28  \n",
       "0  0.978076  0.920005  0.721657  0.988751  0.876678  \n",
       "1  0.985700  0.978098  0.779732  0.992356  0.798343  \n",
       "2  0.985692  0.951331  0.803252  0.865924  0.780118  \n",
       "3  0.998656  0.728281  0.869200  1.026736  0.957904  \n",
       "4  0.986610  0.838085  1.133295  0.872245  0.808487  \n",
       "\n",
       "[5 rows x 29 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train = pd.read_csv(path, header='infer')\n",
    "\n",
    "df_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#to preprocess the data with automunge we'll apply a few different assigncat scenarios\n",
    "\n",
    "#first let's collect the feature and label headers\n",
    "\n",
    "features = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]\n",
    "label = 0\n",
    "\n",
    "#now here are the assigncat scenarios for assigning transformations\n",
    "\n",
    "#exc2 is a passthrough transformation which applies a default mode infill\n",
    "#bnry is a label encoding to boolean integer\n",
    "\n",
    "assigncat1 = {'exc2':features, 'bnry':label}\n",
    "\n",
    "#nmbr is a z-score normalization\n",
    "assigncat2 = {'nmbr':features, 'bnry':label}\n",
    "\n",
    "#retn is retain normalization\n",
    "assigncat3 = {'retn':features, 'bnry':label}\n",
    "\n",
    "#rtbs is a family tree we'll populate for retain with bins\n",
    "assigncat4 = {'rtbs':features, 'bnry':label}\n",
    "\n",
    "#this will return columns column_retn and column_retn_bins (set)\n",
    "transformdict = \\\n",
    "{'rtbs' : {'parents'       : ['rtbs'], \\\n",
    "           'siblings'      : [], \\\n",
    "           'auntsuncles'   : [], \\\n",
    "           'cousins'       : [], \\\n",
    "           'children'      : [], \\\n",
    "           'niecesnephews' : [], \\\n",
    "           'coworkers'     : [], \\\n",
    "           'friends'       : ['bins'] }}\n",
    "\n",
    "processdict = \\\n",
    "{'rtbs' : {'dualprocess' : am.process_retn_class, \\\n",
    "          'singleprocess' : None, \\\n",
    "          'postprocess' : am.postprocess_retn_class, \\\n",
    "          'NArowtype' : 'numeric', \\\n",
    "          'MLinfilltype' : 'numeric', \\\n",
    "          'labelctgy' : 'retn'}}\n",
    "\n",
    "#DPrt is retain normalization with noise injection\n",
    "#in first scenario we'll inject to 100% of data\n",
    "assigncat5 = {'DPrt':features, 'bnry':label}\n",
    "\n",
    "assignparam5 = {'default_assignparam' : {'DPrt' : {'flip_prob' : 1.0}}}\n",
    "\n",
    "\n",
    "#in second DPrt scenario we'll inject to 3% of data\n",
    "assigncat6 = {'DPrt':features, 'bnry':label}\n",
    "\n",
    "assignparam6 = {'default_assignparam' : {'DPrt' : {'flip_prob' : 0.03}}}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Automunge processing\n",
      "\n",
      "evaluating column:  1\n",
      "processing column:  1\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['1_exc2']\n",
      "\n",
      "evaluating column:  2\n",
      "processing column:  2\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['2_exc2']\n",
      "\n",
      "evaluating column:  3\n",
      "processing column:  3\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['3_exc2']\n",
      "\n",
      "evaluating column:  4\n",
      "processing column:  4\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['4_exc2']\n",
      "\n",
      "evaluating column:  5\n",
      "processing column:  5\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['5_exc2']\n",
      "\n",
      "evaluating column:  6\n",
      "processing column:  6\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['6_exc2']\n",
      "\n",
      "evaluating column:  7\n",
      "processing column:  7\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['7_exc2']\n",
      "\n",
      "evaluating column:  8\n",
      "processing column:  8\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['8_exc2']\n",
      "\n",
      "evaluating column:  9\n",
      "processing column:  9\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['9_exc2']\n",
      "\n",
      "evaluating column:  10\n",
      "processing column:  10\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['10_exc2']\n",
      "\n",
      "evaluating column:  11\n",
      "processing column:  11\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['11_exc2']\n",
      "\n",
      "evaluating column:  12\n",
      "processing column:  12\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['12_exc2']\n",
      "\n",
      "evaluating column:  13\n",
      "processing column:  13\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['13_exc2']\n",
      "\n",
      "evaluating column:  14\n",
      "processing column:  14\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['14_exc2']\n",
      "\n",
      "evaluating column:  15\n",
      "processing column:  15\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['15_exc2']\n",
      "\n",
      "evaluating column:  16\n",
      "processing column:  16\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['16_exc2']\n",
      "\n",
      "evaluating column:  17\n",
      "processing column:  17\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['17_exc2']\n",
      "\n",
      "evaluating column:  18\n",
      "processing column:  18\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['18_exc2']\n",
      "\n",
      "evaluating column:  19\n",
      "processing column:  19\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['19_exc2']\n",
      "\n",
      "evaluating column:  20\n",
      "processing column:  20\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['20_exc2']\n",
      "\n",
      "evaluating column:  21\n",
      "processing column:  21\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['21_exc2']\n",
      "\n",
      "evaluating column:  22\n",
      "processing column:  22\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['22_exc2']\n",
      "\n",
      "evaluating column:  23\n",
      "processing column:  23\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['23_exc2']\n",
      "\n",
      "evaluating column:  24\n",
      "processing column:  24\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['24_exc2']\n",
      "\n",
      "evaluating column:  25\n",
      "processing column:  25\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['25_exc2']\n",
      "\n",
      "evaluating column:  26\n",
      "processing column:  26\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['26_exc2']\n",
      "\n",
      "evaluating column:  27\n",
      "processing column:  27\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['27_exc2']\n",
      "\n",
      "evaluating column:  28\n",
      "processing column:  28\n",
      "    root category:  exc2\n",
      " returned columns:\n",
      "['28_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "evaluating label column:  0\n",
      "processing label column:  0\n",
      "    root label category:  bnry\n",
      "\n",
      " returned columns:\n",
      "['0_bnry']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  1_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  2_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  3_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  4_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  5_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  6_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  7_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  8_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  9_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  10_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  11_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  12_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  13_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  14_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  15_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  16_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  17_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  18_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  19_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  20_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  21_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  22_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  23_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  24_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  25_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  26_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  27_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  28_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "_______________\n",
      "Begin Validation set processing with Postmunge\n",
      "\n",
      "_______________\n",
      "Begin Postmunge processing\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  1\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['1_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  2\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['2_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  3\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['3_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  4\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['4_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  5\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['5_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  6\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['6_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  7\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['7_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  8\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['8_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  9\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['9_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  10\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['10_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  11\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['11_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  12\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['12_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  13\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['13_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  14\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['14_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  15\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['15_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  16\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['16_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  17\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['17_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  18\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['18_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  19\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['19_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  20\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['20_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  21\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['21_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  22\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['22_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  23\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['23_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  24\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['24_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  25\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['25_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  26\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['26_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  27\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['27_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  28\n",
      "    root category:  exc2\n",
      "\n",
      " returned columns:\n",
      "['28_exc2']\n",
      "\n",
      "______\n",
      "\n",
      "processing label column:  0\n",
      "    root label category:  bnry\n",
      "\n",
      " returned columns:\n",
      "['0_bnry']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  1_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  2_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  3_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  4_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  5_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  6_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  7_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  8_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  9_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  10_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  11_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  12_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  13_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  14_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  15_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  16_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  17_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  18_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  19_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  20_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  21_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  22_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  23_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  24_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  25_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  26_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  27_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "infill to column:  28_exc2\n",
      "     infill type: adjinfill\n",
      "\n",
      "_______________\n",
      "Postmunge returned ID column set: \n",
      "['Automunge_index_892219052865']\n",
      "\n",
      "Postmunge returned test column set: \n",
      "['1_exc2', '2_exc2', '3_exc2', '4_exc2', '5_exc2', '6_exc2', '7_exc2', '8_exc2', '9_exc2', '10_exc2', '11_exc2', '12_exc2', '13_exc2', '14_exc2', '15_exc2', '16_exc2', '17_exc2', '18_exc2', '19_exc2', '20_exc2', '21_exc2', '22_exc2', '23_exc2', '24_exc2', '25_exc2', '26_exc2', '27_exc2', '28_exc2']\n",
      "\n",
      "Postmunge returned label column set: \n",
      "['0_bnry']\n",
      "\n",
      "_______________\n",
      "Postmunge Complete\n",
      "\n",
      "______\n",
      "\n",
      "versioning serial stamp:\n",
      "_4.88_892219052865_2020-10-01T20:44:09.388848\n",
      "\n",
      "Automunge returned ID column set: \n",
      "['Automunge_index_892219052865']\n",
      "\n",
      "Automunge returned train column set: \n",
      "['1_exc2', '2_exc2', '3_exc2', '4_exc2', '5_exc2', '6_exc2', '7_exc2', '8_exc2', '9_exc2', '10_exc2', '11_exc2', '12_exc2', '13_exc2', '14_exc2', '15_exc2', '16_exc2', '17_exc2', '18_exc2', '19_exc2', '20_exc2', '21_exc2', '22_exc2', '23_exc2', '24_exc2', '25_exc2', '26_exc2', '27_exc2', '28_exc2']\n",
      "\n",
      "Automunge returned label column set: \n",
      "['0_bnry']\n",
      "\n",
      "_______________\n",
      "Automunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#now we can preprocess our data for each of these scenarios to run our experiments\n",
    "\n",
    "#we'll return data as numpy arrays\n",
    "#in the full experiment the validation data was a 500k sample split\n",
    "#here valpercent1 = 0.05 holds out 5% of the rows for validation\n",
    "\n",
    "\n",
    "#scenario1, exc2\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(df_train, labels_column = label, \\\n",
    "             valpercent1 = 0.05, \\\n",
    "             assigncat = assigncat1, \\\n",
    "             assigninfill={'adjinfill':features}, \\\n",
    "             pandasoutput=False, printstatus=True)\n",
    "\n",
    "# #scenario2, nmbr\n",
    "\n",
    "# train, trainID, labels, \\\n",
    "# validation1, validationID1, validationlabels1, \\\n",
    "# validation2, validationID2, validationlabels2, \\\n",
    "# test, testID, testlabels, \\\n",
    "# labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "# featureimportance, postprocess_dict = \\\n",
    "# am.automunge(df_train, labels_column = label, \\\n",
    "#              valpercent1 = 0.05, \\\n",
    "#              assigncat = assigncat2, \\\n",
    "#              assigninfill={'adjinfill':features}, \\\n",
    "#              pandasoutput=False, printstatus=True)\n",
    "\n",
    "# #scenario3, retn\n",
    "\n",
    "# train, trainID, labels, \\\n",
    "# validation1, validationID1, validationlabels1, \\\n",
    "# validation2, validationID2, validationlabels2, \\\n",
    "# test, testID, testlabels, \\\n",
    "# labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "# featureimportance, postprocess_dict = \\\n",
    "# am.automunge(df_train, labels_column = label, \\\n",
    "#              valpercent1 = 0.05, \\\n",
    "#              assigncat = assigncat3, \\\n",
    "#              assigninfill={'adjinfill':features}, \\\n",
    "#              pandasoutput=False, printstatus=True)\n",
    "\n",
    "# #scenario4, retn with standard deviation bins\n",
    "\n",
    "# train, trainID, labels, \\\n",
    "# validation1, validationID1, validationlabels1, \\\n",
    "# validation2, validationID2, validationlabels2, \\\n",
    "# test, testID, testlabels, \\\n",
    "# labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "# featureimportance, postprocess_dict = \\\n",
    "# am.automunge(df_train, labels_column = label, \\\n",
    "#              valpercent1 = 0.05, \\\n",
    "#              assigncat = assigncat4, \\\n",
    "#              assigninfill={'adjinfill':features}, \\\n",
    "#              transformdict = transformdict, processdict = processdict, \\\n",
    "#              pandasoutput=False, printstatus=True)\n",
    "\n",
    "\n",
    "# #scenario5, DPrt with full noise injection\n",
    "\n",
    "# train, trainID, labels, \\\n",
    "# validation1, validationID1, validationlabels1, \\\n",
    "# validation2, validationID2, validationlabels2, \\\n",
    "# test, testID, testlabels, \\\n",
    "# labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "# featureimportance, postprocess_dict = \\\n",
    "# am.automunge(df_train, labels_column = label, \\\n",
    "#              valpercent1 = 0.05, \\\n",
    "#              assigncat = assigncat5, \\\n",
    "#              assigninfill={'adjinfill':features}, \\\n",
    "#              assignparam = assignparam5, \\\n",
    "#              pandasoutput=False, printstatus=True)\n",
    "\n",
    "\n",
    "# #scenario6, DPrt with partial noise injection\n",
    "\n",
    "# train, trainID, labels, \\\n",
    "# validation1, validationID1, validationlabels1, \\\n",
    "# validation2, validationID2, validationlabels2, \\\n",
    "# test, testID, testlabels, \\\n",
    "# labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "# featureimportance, postprocess_dict = \\\n",
    "# am.automunge(df_train, labels_column = label, \\\n",
    "#              valpercent1 = 0.05, \\\n",
    "#              assigncat = assigncat6, \\\n",
    "#              assigninfill={'adjinfill':features}, \\\n",
    "#              assignparam = assignparam6, \\\n",
    "#              pandasoutput=False, printstatus=True)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#the postprocess_dict returned should be saved if we want to later consistently process additional data\n",
    "\n",
    "# import pickle\n",
    "\n",
    "# #backup postprocess_dict on disk\n",
    "# with open('filename.pkl', 'wb') as download:\n",
    "#     pickle.dump(postprocess_dict, download)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#initialize the model\n",
    "\n",
    "svc = LinearSVC(random_state=0, tol=1e-5, max_iter = 10000, dual=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearSVC(dual=False, max_iter=10000, random_state=0, tol=1e-05)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#fit the model\n",
    "\n",
    "svc.fit(train, labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#generate predictions\n",
    "predictions = svc.predict(validation1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.612"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#get accuracy score\n",
    "accuracy = accuracy_score(validationlabels1, predictions)\n",
    "\n",
    "accuracy"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
