{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# A Numbers Game - Demonstration Notebook"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook will demonstrate material related to discussions in the paper \"A Numbers Game: Numeric Encoding Options with Automunge\". Let's get right to it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Automunge is available for pip install:\n",
    "# !pip install Automunge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#or to upgrade (we currently roll out upgrades fairly frequently)\n",
    "#!pip install Automunge --upgrade"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Once installed, run this in local session to initialize\n",
    "from Automunge import Automunger\n",
    "am = Automunger.AutoMunge()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#To demonstrate, we'll populate a simple dataframe of numeric data\n",
    "\n",
    "#col1 has mixed positive and negative floats\n",
    "#col2 has all positive floats\n",
    "#col3 has all negative floats\n",
    "#col4 has integers\n",
    "#col5 is a categoric set\n",
    "#col6 is a categoric set\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "df_train = pd.DataFrame({'col1':[-0.5, 12, 0, 3.3, -4.2, 2.1, 101, -33], \\\n",
    "                         'col2':[12, 13.1, 26.2, 111, 2, 1, 2, 5], \\\n",
    "                         'col3':[-.33, -48, -11.2, -0.3, -6, -52, -121, -30.2], \\\n",
    "                         'col4':[5, 1, 5, 3, 1, 5, 11, 12], \\\n",
    "                         'col5':['circle', 'square', 'circle', 'square', 'triangle', 'triangle', 'circle', 'square'], \\\n",
    "                         'col6':['on', 'off', 'on', 'on', 'off', 'off', 'on', 'on']})\n",
    "\n",
    "#for a test set we'll just copy the train set\n",
    "df_test = df_train.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Demonstrations from Paper"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2 - Normalizations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Let's demonstrate each of the normalizations from Table 1\n",
    "\n",
    "df = pd.DataFrame()\n",
    "df[['col1', 'col2', 'col3', 'col4']] = df_train[['col1', 'col2', 'col3', 'col4']].copy()\n",
    "df['col5'] = df_train['col1'].copy()\n",
    "\n",
    "assigncat = {'nmbr':'col1', 'mnmx':'col2', 'mean':'col3', 'MAD3':'col4', 'lgnm':'col5'}\n",
    "\n",
    "#then we can apply\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = assigncat, \\\n",
    "  printstatus = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1</th>\n",
       "      <th>col2</th>\n",
       "      <th>col3</th>\n",
       "      <th>col4</th>\n",
       "      <th>col5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.5</td>\n",
       "      <td>12.0</td>\n",
       "      <td>-0.33</td>\n",
       "      <td>5</td>\n",
       "      <td>-0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12.0</td>\n",
       "      <td>13.1</td>\n",
       "      <td>-48.00</td>\n",
       "      <td>1</td>\n",
       "      <td>12.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>26.2</td>\n",
       "      <td>-11.20</td>\n",
       "      <td>5</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.3</td>\n",
       "      <td>111.0</td>\n",
       "      <td>-0.30</td>\n",
       "      <td>3</td>\n",
       "      <td>3.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-4.2</td>\n",
       "      <td>2.0</td>\n",
       "      <td>-6.00</td>\n",
       "      <td>1</td>\n",
       "      <td>-4.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2.1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-52.00</td>\n",
       "      <td>5</td>\n",
       "      <td>2.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>101.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>-121.00</td>\n",
       "      <td>11</td>\n",
       "      <td>101.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-33.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>-30.20</td>\n",
       "      <td>12</td>\n",
       "      <td>-33.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    col1   col2    col3  col4   col5\n",
       "0   -0.5   12.0   -0.33     5   -0.5\n",
       "1   12.0   13.1  -48.00     1   12.0\n",
       "2    0.0   26.2  -11.20     5    0.0\n",
       "3    3.3  111.0   -0.30     3    3.3\n",
       "4   -4.2    2.0   -6.00     1   -4.2\n",
       "5    2.1    1.0  -52.00     5    2.1\n",
       "6  101.0    2.0 -121.00    11  101.0\n",
       "7  -33.0    5.0  -30.20    12  -33.0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#here is the data before normalizations\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1_nmbr</th>\n",
       "      <th>col2_mnmx</th>\n",
       "      <th>col3_mean</th>\n",
       "      <th>col4_MAD3</th>\n",
       "      <th>col5_logn_nmbr</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.271396</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.275880</td>\n",
       "      <td>-2.285714</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.049024</td>\n",
       "      <td>0.110000</td>\n",
       "      <td>-0.119066</td>\n",
       "      <td>-3.591837</td>\n",
       "      <td>0.198834</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.258579</td>\n",
       "      <td>0.229091</td>\n",
       "      <td>0.185822</td>\n",
       "      <td>-2.285714</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.173988</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.276129</td>\n",
       "      <td>-2.938776</td>\n",
       "      <td>-0.937298</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.366241</td>\n",
       "      <td>0.009091</td>\n",
       "      <td>0.228904</td>\n",
       "      <td>-3.591837</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>-0.204749</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.152206</td>\n",
       "      <td>-2.285714</td>\n",
       "      <td>-1.335068</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2.330419</td>\n",
       "      <td>0.009091</td>\n",
       "      <td>-0.723871</td>\n",
       "      <td>-0.326531</td>\n",
       "      <td>2.073531</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-1.104490</td>\n",
       "      <td>0.036364</td>\n",
       "      <td>0.028407</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   col1_nmbr  col2_mnmx  col3_mean  col4_MAD3  col5_logn_nmbr\n",
       "0  -0.271396   0.100000   0.275880  -2.285714        0.000000\n",
       "1   0.049024   0.110000  -0.119066  -3.591837        0.198834\n",
       "2  -0.258579   0.229091   0.185822  -2.285714        0.000000\n",
       "3  -0.173988   1.000000   0.276129  -2.938776       -0.937298\n",
       "4  -0.366241   0.009091   0.228904  -3.591837        0.000000\n",
       "5  -0.204749   0.000000  -0.152206  -2.285714       -1.335068\n",
       "6   2.330419   0.009091  -0.723871  -0.326531        2.073531\n",
       "7  -1.104490   0.036364   0.028407   0.000000        0.000000"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#and here is after normalizations\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#The paper noted that in some cases a floor or cap may be desired\n",
    "#to maintain consistent range between train and test data\n",
    "\n",
    "#Passing parameters for the mnmx transform for instance could be as follows:\n",
    "\n",
    "assignparam = {'mnmx' : {'col2' : {'floor':True, 'cap':False}}}\n",
    "assigncat = {'mnmx':'col2'}\n",
    "\n",
    "#then we can apply\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = assigncat, \\\n",
    "  assignparam = assignparam, \\\n",
    "  printstatus = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Now let's try retain normalization\n",
    "#note that based on their respective received ranges\n",
    "#col1/col2/col3 will have different formulas applied\n",
    "\n",
    "assigncat = {'retn':['col1', 'col2', 'col3'], 'null':['col4', 'col5']}\n",
    "\n",
    "#then we can apply\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = assigncat, \\\n",
    "  printstatus = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1</th>\n",
       "      <th>col2</th>\n",
       "      <th>col3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.5</td>\n",
       "      <td>12.0</td>\n",
       "      <td>-0.33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12.0</td>\n",
       "      <td>13.1</td>\n",
       "      <td>-48.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>26.2</td>\n",
       "      <td>-11.20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.3</td>\n",
       "      <td>111.0</td>\n",
       "      <td>-0.30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-4.2</td>\n",
       "      <td>2.0</td>\n",
       "      <td>-6.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2.1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-52.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>101.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>-121.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-33.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>-30.20</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    col1   col2    col3\n",
       "0   -0.5   12.0   -0.33\n",
       "1   12.0   13.1  -48.00\n",
       "2    0.0   26.2  -11.20\n",
       "3    3.3  111.0   -0.30\n",
       "4   -4.2    2.0   -6.00\n",
       "5    2.1    1.0  -52.00\n",
       "6  101.0    2.0 -121.00\n",
       "7  -33.0    5.0  -30.20"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#here is the data before normalizations\n",
    "df[['col1', 'col2', 'col3']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1_retn</th>\n",
       "      <th>col2_retn</th>\n",
       "      <th>col3_retn</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.003731</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>-0.000249</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.089552</td>\n",
       "      <td>0.110000</td>\n",
       "      <td>-0.395195</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.229091</td>\n",
       "      <td>-0.090307</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.024627</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.031343</td>\n",
       "      <td>0.009091</td>\n",
       "      <td>-0.047225</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.015672</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.428335</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.753731</td>\n",
       "      <td>0.009091</td>\n",
       "      <td>-1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-0.246269</td>\n",
       "      <td>0.036364</td>\n",
       "      <td>-0.247722</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   col1_retn  col2_retn  col3_retn\n",
       "0  -0.003731   0.100000  -0.000249\n",
       "1   0.089552   0.110000  -0.395195\n",
       "2   0.000000   0.229091  -0.090307\n",
       "3   0.024627   1.000000   0.000000\n",
       "4  -0.031343   0.009091  -0.047225\n",
       "5   0.015672   0.000000  -0.428335\n",
       "6   0.753731   0.009091  -1.000000\n",
       "7  -0.246269   0.036364  -0.247722"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#and here is after normalizations\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "col1_retn    0.753731\n",
       "col2_retn    1.000000\n",
       "col3_retn    0.000000\n",
       "dtype: float32"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "col1_retn   -0.246269\n",
       "col2_retn    0.000000\n",
       "col3_retn   -1.000000\n",
       "dtype: float32"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.min()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3 - Transformations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "#as an example of a populated family tree in a transformdict data structure\n",
    "#Let's say we want to replace our use of min-max scaling with min-max scaling\n",
    "#supplemented by power of ten bins, and a marker for applied infill via NArw\n",
    "\n",
    "#we could populate a transformdict data structure as\n",
    "\n",
    "transformdict =  {'mnmx' : {'parents'       : [],\n",
    "                            'siblings'      : [],\n",
    "                            'auntsuncles'   : ['mnmx', 'pwr2'],\n",
    "                            'cousins'       : ['NArw'],\n",
    "                            'children'      : [],\n",
    "                            'niecesnephews' : [],\n",
    "                            'coworkers'     : [],\n",
    "                            'friends'       : []}}\n",
    "\n",
    "#we can just pass this transformdict to an automunge(.) call"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4 - Bins and Grainings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.5</td>\n",
       "      <td>-0.5</td>\n",
       "      <td>-0.5</td>\n",
       "      <td>-0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>12.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.3</td>\n",
       "      <td>3.3</td>\n",
       "      <td>3.3</td>\n",
       "      <td>3.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-4.2</td>\n",
       "      <td>-4.2</td>\n",
       "      <td>-4.2</td>\n",
       "      <td>-4.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2.1</td>\n",
       "      <td>2.1</td>\n",
       "      <td>2.1</td>\n",
       "      <td>2.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>101.0</td>\n",
       "      <td>101.0</td>\n",
       "      <td>101.0</td>\n",
       "      <td>101.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-33.0</td>\n",
       "      <td>-33.0</td>\n",
       "      <td>-33.0</td>\n",
       "      <td>-33.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    col1      1      2      3\n",
       "0   -0.5   -0.5   -0.5   -0.5\n",
       "1   12.0   12.0   12.0   12.0\n",
       "2    0.0    0.0    0.0    0.0\n",
       "3    3.3    3.3    3.3    3.3\n",
       "4   -4.2   -4.2   -4.2   -4.2\n",
       "5    2.1    2.1    2.1    2.1\n",
       "6  101.0  101.0  101.0  101.0\n",
       "7  -33.0  -33.0  -33.0  -33.0"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Let's demonstrate a few scenarios of populating bins for a numeric set\n",
    "#we'll use the 'col1' column as our basis\n",
    "\n",
    "df = pd.DataFrame()\n",
    "\n",
    "df['col1'] = df_train['col1'].copy()\n",
    "df['1'] = df_train['col1'].copy()\n",
    "df['2'] = df_train['col1'].copy()\n",
    "df['3'] = df_train['col1'].copy()\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1</th>\n",
       "      <th>1_bins_0</th>\n",
       "      <th>1_bins_1</th>\n",
       "      <th>1_bins_2</th>\n",
       "      <th>1_bins_3</th>\n",
       "      <th>1_bins_4</th>\n",
       "      <th>1_bins_5</th>\n",
       "      <th>2_bsor</th>\n",
       "      <th>3_bsor_1010_0</th>\n",
       "      <th>3_bsor_1010_1</th>\n",
       "      <th>3_bsor_1010_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-4.2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2.1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>101.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-33.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    col1  1_bins_0  1_bins_1  1_bins_2  1_bins_3  1_bins_4  1_bins_5  2_bsor  \\\n",
       "0   -0.5         0         0         1         0         0         0       2   \n",
       "1   12.0         0         0         0         1         0         0       3   \n",
       "2    0.0         0         0         1         0         0         0       2   \n",
       "3    3.3         0         0         1         0         0         0       2   \n",
       "4   -4.2         0         0         1         0         0         0       2   \n",
       "5    2.1         0         0         1         0         0         0       2   \n",
       "6  101.0         0         0         0         0         0         1       5   \n",
       "7  -33.0         0         1         0         0         0         0       1   \n",
       "\n",
       "   3_bsor_1010_0  3_bsor_1010_1  3_bsor_1010_2  \n",
       "0              0              0              1  \n",
       "1              0              1              0  \n",
       "2              0              0              1  \n",
       "3              0              0              1  \n",
       "4              0              0              1  \n",
       "5              0              0              1  \n",
       "6              0              1              1  \n",
       "7              0              0              0  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#let's demonstrate the same column with standard deviation bins\n",
    "#aggregated as one-hot, ordinal, or binary\n",
    "\n",
    "assigncat = {'bins':'1', 'bsor':'2', 'bsbn':'3', 'excl':'col1'}\n",
    "\n",
    "#then we can apply\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = assigncat, \\\n",
    "  printstatus = False)\n",
    "\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
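    "#Here the bins aggregation returns columns:\n",
    "# ['1_bins_0', '1_bins_1', '1_bins_2', '1_bins_3', '1_bins_4', '1_bins_5']\n",
    "#where the suffixes 0-5 correspond in order to the standard deviation bins\n",
    "# ['s<-2', 's-21', 's-10', 's+01', 's+12', 's>+2']\n",
    "#(i.e. < -2, -2 to -1, -1 to 0, 0 to +1, +1 to +2, > +2 standard deviations from the mean)\n",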
    "\n",
    "#the bsor aggregation returns columns:\n",
    "# ['2_bsor']\n",
    "\n",
    "#and the bsbn aggregation returns columns:\n",
    "# ['3_bsor_1010_0', '3_bsor_1010_1', '3_bsor_1010_2']\n",
    "\n",
    "#with the source column 'col1' shown for comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1</th>\n",
       "      <th>1_pwor</th>\n",
       "      <th>2_pwor</th>\n",
       "      <th>3_bnwo</th>\n",
       "      <th>4_bneo</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>44</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>32</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.3</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>36</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-4.2</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>28</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2.1</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>35</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>101.0</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>133</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-33.0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    col1  1_pwor  2_pwor  3_bnwo  4_bneo\n",
       "0   -0.5       0       1      32       1\n",
       "1   12.0       2       2      44       4\n",
       "2    0.0       0       0      32       2\n",
       "3    3.3       3       4      36       3\n",
       "4   -4.2       0       5      28       0\n",
       "5    2.1       3       4      35       2\n",
       "6  101.0       4       6     133       4\n",
       "7  -33.0       0       7       0       0"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#The remainder of the bins we'll just demonstrate as ordinal\n",
    "\n",
    "df = pd.DataFrame()\n",
    "df['col1'] = df_train['col1'].copy()\n",
    "df['1'] = df_train['col1'].copy()\n",
    "df['2'] = df_train['col1'].copy()\n",
    "df['3'] = df_train['col1'].copy()\n",
    "df['4'] = df_train['col1'].copy()\n",
    "\n",
    "assigncat = {'pwor':'1', 'por2':'2', 'bnwo':'3', 'bneo':'4', 'excl':'col1'}\n",
    "\n",
    "#then we can apply\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = assigncat, \\\n",
    "  printstatus = False)\n",
    "\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1</th>\n",
       "      <th>1_bkt3</th>\n",
       "      <th>2_bkt4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-4.2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2.1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>101.0</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-33.0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    col1  1_bkt3  2_bkt4\n",
       "0   -0.5       1       0\n",
       "1   12.0       3       2\n",
       "2    0.0       1       0\n",
       "3    3.3       2       1\n",
       "4   -4.2       1       0\n",
       "5    2.1       2       1\n",
       "6  101.0       4       3\n",
       "7  -33.0       0       3"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#if we want to define some custom bin aggregations\n",
    "#we can pass parameters to assignparam\n",
    "\n",
    "df = pd.DataFrame()\n",
    "df['col1'] = df_train['col1'].copy()\n",
    "df['1'] = df_train['col1'].copy()\n",
    "df['2'] = df_train['col1'].copy()\n",
    "\n",
    "assigncat = {'bkt3':'1', 'bkt4':'2', 'excl':'col1'}\n",
    "\n",
    "assignparam = {'default_assignparam' : {'bkt3' : {'buckets':[-5, 0, 10, 100]}, \\\n",
    "                                        'bkt4' : {'buckets':[-5, 0, 10, 100]}}}\n",
    "\n",
    "#then we can apply\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = assigncat, \\\n",
    "  assignparam = assignparam, \\\n",
    "  printstatus = False)\n",
    "\n",
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5 - Noise Injection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1</th>\n",
       "      <th>NoNoise_nmbr</th>\n",
       "      <th>PartialNoise_nmbr_DPnb</th>\n",
       "      <th>FullNoise_nmbr_DPnb</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.5</td>\n",
       "      <td>-0.271396</td>\n",
       "      <td>-0.271396</td>\n",
       "      <td>-0.252906</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12.0</td>\n",
       "      <td>0.049024</td>\n",
       "      <td>0.008603</td>\n",
       "      <td>0.022263</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.258579</td>\n",
       "      <td>-0.206509</td>\n",
       "      <td>-0.239540</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.3</td>\n",
       "      <td>-0.173988</td>\n",
       "      <td>-0.173988</td>\n",
       "      <td>-0.209256</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-4.2</td>\n",
       "      <td>-0.366241</td>\n",
       "      <td>-0.366241</td>\n",
       "      <td>-0.353645</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2.1</td>\n",
       "      <td>-0.204749</td>\n",
       "      <td>-0.204749</td>\n",
       "      <td>-0.169735</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>101.0</td>\n",
       "      <td>2.330419</td>\n",
       "      <td>2.314361</td>\n",
       "      <td>2.369485</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-33.0</td>\n",
       "      <td>-1.104490</td>\n",
       "      <td>-1.104490</td>\n",
       "      <td>-1.122256</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    col1  NoNoise_nmbr  PartialNoise_nmbr_DPnb  FullNoise_nmbr_DPnb\n",
       "0   -0.5     -0.271396               -0.271396            -0.252906\n",
       "1   12.0      0.049024                0.008603             0.022263\n",
       "2    0.0     -0.258579               -0.206509            -0.239540\n",
       "3    3.3     -0.173988               -0.173988            -0.209256\n",
       "4   -4.2     -0.366241               -0.366241            -0.353645\n",
       "5    2.1     -0.204749               -0.204749            -0.169735\n",
       "6  101.0      2.330419                2.314361             2.369485\n",
       "7  -33.0     -1.104490               -1.104490            -1.122256"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#To demonstrate the noise injection, we'll show three returned sets:\n",
    "# the source column, the normalizaiton without noise, and with full noise\n",
    "#and with partial noise\n",
    "#first let's start with z-score:\n",
    "\n",
    "df = pd.DataFrame()\n",
    "df['col1'] = df_train['col1'].copy()\n",
    "df['NoNoise'] = df_train['col1'].copy()\n",
    "df['PartialNoise'] = df_train['col1'].copy()\n",
    "df['FullNoise'] = df_train['col1'].copy()\n",
    "\n",
    "assigncat = {'excl':'col1', 'nmbr':'NoNoise', 'DPnb':['PartialNoise', 'FullNoise']}\n",
    "\n",
    "#since noise injectino accepts parameters, we'll demonsrtate those parameter assignments\n",
    "#here we'll show the default values for mu and sigma\n",
    "#and vary the flip_prob parameter based on what ratio of data we want to receive noise\n",
    "#note that we don't have to set parameters for defaults, just showing as a demosnrtation\n",
    "\n",
    "assignparam = {'DPnb' : {'PartialNoise' : {'mu':0, 'sigma':0.03, 'flip_prob':0.33}, \\\n",
    "                         'FullNoise' : {'mu':0, 'sigma':0.03, 'flip_prob':1.}}}\n",
    "\n",
    "\n",
    "\n",
    "#then we can apply\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = assigncat, \\\n",
    "  assignparam = assignparam, \\\n",
    "  printstatus = False)\n",
    "\n",
    "train\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1</th>\n",
       "      <th>NoNoise_mnmx</th>\n",
       "      <th>PartialNoise_mnmx_DPmm</th>\n",
       "      <th>FullNoise_mnmx_DPmm</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.5</td>\n",
       "      <td>0.242537</td>\n",
       "      <td>0.235059</td>\n",
       "      <td>0.271047</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12.0</td>\n",
       "      <td>0.335821</td>\n",
       "      <td>0.335821</td>\n",
       "      <td>0.416829</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.246269</td>\n",
       "      <td>0.274822</td>\n",
       "      <td>0.237127</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.3</td>\n",
       "      <td>0.270896</td>\n",
       "      <td>0.270896</td>\n",
       "      <td>0.277949</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-4.2</td>\n",
       "      <td>0.214925</td>\n",
       "      <td>0.214925</td>\n",
       "      <td>0.251401</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2.1</td>\n",
       "      <td>0.261940</td>\n",
       "      <td>0.261940</td>\n",
       "      <td>0.292239</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>101.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-33.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000339</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    col1  NoNoise_mnmx  PartialNoise_mnmx_DPmm  FullNoise_mnmx_DPmm\n",
       "0   -0.5      0.242537                0.235059             0.271047\n",
       "1   12.0      0.335821                0.335821             0.416829\n",
       "2    0.0      0.246269                0.274822             0.237127\n",
       "3    3.3      0.270896                0.270896             0.277949\n",
       "4   -4.2      0.214925                0.214925             0.251401\n",
       "5    2.1      0.261940                0.261940             0.292239\n",
       "6  101.0      1.000000                1.000000             1.000000\n",
       "7  -33.0      0.000000                0.000339             0.000000"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#similarly for min-max\n",
    "\n",
    "df = pd.DataFrame()\n",
    "df['col1'] = df_train['col1'].copy()\n",
    "df['NoNoise'] = df_train['col1'].copy()\n",
    "df['PartialNoise'] = df_train['col1'].copy()\n",
    "df['FullNoise'] = df_train['col1'].copy()\n",
    "\n",
    "assigncat = {'excl':'col1', 'mnmx':'NoNoise', 'DPmm':['PartialNoise', 'FullNoise']}\n",
    "\n",
    "#since noise injectino accepts parameters, we'll demonsrtate those parameter assignments\n",
    "#here we'll show the default values for mu and sigma\n",
    "#and vary the flip_prob parameter based on what ratio of data we want to receive noise\n",
    "#note that we don't have to set parameters for defaults, just showing as a demosnrtation\n",
    "\n",
    "assignparam = {'DPmm' : {'PartialNoise' : {'mu':0, 'sigma':0.03, 'flip_prob':0.33}, \\\n",
    "                         'FullNoise' : {'mu':0, 'sigma':0.03, 'flip_prob':1.}}}\n",
    "\n",
    "\n",
    "\n",
    "#then we can apply\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = assigncat, \\\n",
    "  assignparam = assignparam, \\\n",
    "  printstatus = False)\n",
    "\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1</th>\n",
       "      <th>NoNoise_retn</th>\n",
       "      <th>PartialNoise_DPrt</th>\n",
       "      <th>FullNoise_DPrt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.5</td>\n",
       "      <td>-0.003731</td>\n",
       "      <td>-0.023090</td>\n",
       "      <td>0.044819</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12.0</td>\n",
       "      <td>0.089552</td>\n",
       "      <td>0.054759</td>\n",
       "      <td>0.088812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.036180</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.3</td>\n",
       "      <td>0.024627</td>\n",
       "      <td>0.024627</td>\n",
       "      <td>0.052735</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-4.2</td>\n",
       "      <td>-0.031343</td>\n",
       "      <td>-0.041998</td>\n",
       "      <td>-0.045556</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2.1</td>\n",
       "      <td>0.015672</td>\n",
       "      <td>0.015672</td>\n",
       "      <td>0.031292</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>101.0</td>\n",
       "      <td>0.753731</td>\n",
       "      <td>0.716960</td>\n",
       "      <td>0.745769</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-33.0</td>\n",
       "      <td>-0.246269</td>\n",
       "      <td>-0.246269</td>\n",
       "      <td>-0.231533</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    col1  NoNoise_retn  PartialNoise_DPrt  FullNoise_DPrt\n",
       "0   -0.5     -0.003731          -0.023090        0.044819\n",
       "1   12.0      0.089552           0.054759        0.088812\n",
       "2    0.0      0.000000           0.000000        0.036180\n",
       "3    3.3      0.024627           0.024627        0.052735\n",
       "4   -4.2     -0.031343          -0.041998       -0.045556\n",
       "5    2.1      0.015672           0.015672        0.031292\n",
       "6  101.0      0.753731           0.716960        0.745769\n",
       "7  -33.0     -0.246269          -0.246269       -0.231533"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#similarly for retain normalization\n",
    "\n",
    "df = pd.DataFrame()\n",
    "df['col1'] = df_train['col1'].copy()\n",
    "df['NoNoise'] = df_train['col1'].copy()\n",
    "df['PartialNoise'] = df_train['col1'].copy()\n",
    "df['FullNoise'] = df_train['col1'].copy()\n",
    "\n",
    "assigncat = {'excl':'col1', 'retn':'NoNoise', 'DPrt':['PartialNoise', 'FullNoise']}\n",
    "\n",
    "#since noise injectino accepts parameters, we'll demonsrtate those parameter assignments\n",
    "#here we'll show the default values for mu and sigma\n",
    "#and vary the flip_prob parameter based on what ratio of data we want to receive noise\n",
    "#note that we don't have to set parameters for defaults, just showing as a demosnrtation\n",
    "\n",
    "assignparam = {'DPrt' : {'PartialNoise' : {'mu':0, 'sigma':0.03, 'flip_prob':0.33}, \\\n",
    "                         'FullNoise' : {'mu':0, 'sigma':0.03, 'flip_prob':1.}}}\n",
    "\n",
    "\n",
    "\n",
    "#then we can apply\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = assigncat, \\\n",
    "  assignparam = assignparam, \\\n",
    "  printstatus = False)\n",
    "\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#notice how on FullNoise_DPrt the maximum and minimum values\n",
    "#are notperturned outside fo the original range\n",
    "#this is by design"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1</th>\n",
       "      <th>NoNoise_retn</th>\n",
       "      <th>PartialNoise_DPrt</th>\n",
       "      <th>FullNoise_DPrt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.5</td>\n",
       "      <td>-0.003731</td>\n",
       "      <td>-0.003691</td>\n",
       "      <td>-0.033645</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12.0</td>\n",
       "      <td>0.089552</td>\n",
       "      <td>0.089552</td>\n",
       "      <td>0.064420</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.026860</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.3</td>\n",
       "      <td>0.024627</td>\n",
       "      <td>0.024627</td>\n",
       "      <td>0.016791</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-4.2</td>\n",
       "      <td>-0.031343</td>\n",
       "      <td>-0.031343</td>\n",
       "      <td>-0.063345</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2.1</td>\n",
       "      <td>0.015672</td>\n",
       "      <td>0.015672</td>\n",
       "      <td>0.002631</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>101.0</td>\n",
       "      <td>0.753731</td>\n",
       "      <td>0.753731</td>\n",
       "      <td>0.741908</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-33.0</td>\n",
       "      <td>-0.246269</td>\n",
       "      <td>-0.246269</td>\n",
       "      <td>-0.246269</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    col1  NoNoise_retn  PartialNoise_DPrt  FullNoise_DPrt\n",
       "0   -0.5     -0.003731          -0.003691       -0.033645\n",
       "1   12.0      0.089552           0.089552        0.064420\n",
       "2    0.0      0.000000           0.000000       -0.026860\n",
       "3    3.3      0.024627           0.024627        0.016791\n",
       "4   -4.2     -0.031343          -0.031343       -0.063345\n",
       "5    2.1      0.015672           0.015672        0.002631\n",
       "6  101.0      0.753731           0.753731        0.741908\n",
       "7  -33.0     -0.246269          -0.246269       -0.246269"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#similarly for retain normalization\n",
    "\n",
    "df = pd.DataFrame()\n",
    "df['col1'] = df_train['col1'].copy()\n",
    "df['NoNoise'] = df_train['col1'].copy()\n",
    "df['PartialNoise'] = df_train['col1'].copy()\n",
    "df['FullNoise'] = df_train['col1'].copy()\n",
    "\n",
    "assigncat = {'excl':'col1', 'retn':'NoNoise', 'DPrt':['PartialNoise', 'FullNoise']}\n",
    "\n",
    "#since noise injectino accepts parameters, we'll demonsrtate those parameter assignments\n",
    "#here we'll show the default values for mu and sigma\n",
    "#and vary the flip_prob parameter based on what ratio of data we want to receive noise\n",
    "#note that we don't have to set parameters for defaults, just showing as a demosnrtation\n",
    "\n",
    "assignparam = {'DPrt' : {'PartialNoise' : {'mu':0, 'sigma':0.03, 'flip_prob':0.33}, \\\n",
    "                         'FullNoise' : {'mu':0, 'sigma':0.03, 'flip_prob':1.}}}\n",
    "\n",
    "\n",
    "\n",
    "#then we can apply\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = assigncat, \\\n",
    "  assignparam = assignparam, \\\n",
    "  printstatus = False)\n",
    "\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col6</th>\n",
       "      <th>NoNoise_bnry</th>\n",
       "      <th>PartialNoise_bnry_DPbn</th>\n",
       "      <th>FullNoise_bnry_DPbn</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>on</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>off</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>on</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>on</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>off</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>off</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>on</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>on</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  col6  NoNoise_bnry  PartialNoise_bnry_DPbn  FullNoise_bnry_DPbn\n",
       "0   on             1                       1                    0\n",
       "1  off             0                       1                    1\n",
       "2   on             1                       1                    0\n",
       "3   on             1                       1                    0\n",
       "4  off             0                       1                    1\n",
       "5  off             0                       1                    1\n",
       "6   on             1                       0                    0\n",
       "7   on             1                       1                    0"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#now we'll demonstrate the categoric noise injectors:\n",
    "\n",
    "#bnry transform is for two-value categoric sets\n",
    "#the DPbn noise injection flips entries between 0 and 1\n",
    "\n",
    "#col6 is kept as a raw reference, the three copies each receive a different treatment\n",
    "df = pd.DataFrame({\n",
    "    header: df_train['col6'].copy()\n",
    "    for header in ['col6', 'NoNoise', 'PartialNoise', 'FullNoise']\n",
    "})\n",
    "\n",
    "assigncat = {\n",
    "    'excl': 'col6',\n",
    "    'bnry': 'NoNoise',\n",
    "    'DPbn': ['PartialNoise', 'FullNoise'],\n",
    "}\n",
    "\n",
    "#since noise injection accepts parameters, we'll demonstrate those parameter assignments\n",
    "#flip_prob is varied per column based on what ratio of data we want to receive noise\n",
    "assignparam = {\n",
    "    'DPbn': {\n",
    "        'PartialNoise': {'flip_prob': 0.33},\n",
    "        'FullNoise': {'flip_prob': 1.},\n",
    "    },\n",
    "}\n",
    "\n",
    "#then we can apply, unpacking the full set of returned objects\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df,\n",
    "    shuffletrain=False,\n",
    "    pandasoutput=True,\n",
    "    ML_cmnd={'PCA_type': 'off'},\n",
    "    assigncat=assigncat,\n",
    "    assignparam=assignparam,\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col5</th>\n",
       "      <th>NoNoise_ord3</th>\n",
       "      <th>PartialNoise_ord3_DPod</th>\n",
       "      <th>FullNoise_ord3_DPod</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>square</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>square</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>triangle</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>triangle</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>square</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       col5  NoNoise_ord3  PartialNoise_ord3_DPod  FullNoise_ord3_DPod\n",
       "0    circle             0                       0                    2\n",
       "1    square             1                       1                    2\n",
       "2    circle             0                       0                    0\n",
       "3    square             1                       1                    2\n",
       "4  triangle             2                       2                    1\n",
       "5  triangle             2                       2                    2\n",
       "6    circle             0                       0                    2\n",
       "7    square             1                       1                    1"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#ord3 transform is for ordinal encoding of categoric sets\n",
    "#the DPod noise injection triggers a uniform random selection between categories,\n",
    "#including the possibility of retaining the original category\n",
    "\n",
    "#col5 is kept as a raw reference, the three copies each receive a different treatment\n",
    "df = pd.DataFrame({\n",
    "    header: df_train['col5'].copy()\n",
    "    for header in ['col5', 'NoNoise', 'PartialNoise', 'FullNoise']\n",
    "})\n",
    "\n",
    "assigncat = {\n",
    "    'excl': 'col5',\n",
    "    'ord3': 'NoNoise',\n",
    "    'DPod': ['PartialNoise', 'FullNoise'],\n",
    "}\n",
    "\n",
    "#since noise injection accepts parameters, we'll demonstrate those parameter assignments\n",
    "#flip_prob is varied per column based on what ratio of data we want to receive noise\n",
    "assignparam = {\n",
    "    'DPod': {\n",
    "        'PartialNoise': {'flip_prob': 0.33},\n",
    "        'FullNoise': {'flip_prob': 1.},\n",
    "    },\n",
    "}\n",
    "\n",
    "#then we can apply, unpacking the full set of returned objects\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df,\n",
    "    shuffletrain=False,\n",
    "    pandasoutput=True,\n",
    "    ML_cmnd={'PCA_type': 'off'},\n",
    "    assigncat=assigncat,\n",
    "    assignparam=assignparam,\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col5</th>\n",
       "      <th>NoNoise_onht_0</th>\n",
       "      <th>NoNoise_onht_1</th>\n",
       "      <th>NoNoise_onht_2</th>\n",
       "      <th>PartialNoise_ord3_DPod_onht_0</th>\n",
       "      <th>PartialNoise_ord3_DPod_onht_1</th>\n",
       "      <th>PartialNoise_ord3_DPod_onht_2</th>\n",
       "      <th>FullNoise_ord3_DPod_onht_0</th>\n",
       "      <th>FullNoise_ord3_DPod_onht_1</th>\n",
       "      <th>FullNoise_ord3_DPod_onht_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>circle</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>square</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>circle</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>square</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>triangle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>triangle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>circle</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>square</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       col5  NoNoise_onht_0  NoNoise_onht_1  NoNoise_onht_2  \\\n",
       "0    circle               1               0               0   \n",
       "1    square               0               1               0   \n",
       "2    circle               1               0               0   \n",
       "3    square               0               1               0   \n",
       "4  triangle               0               0               1   \n",
       "5  triangle               0               0               1   \n",
       "6    circle               1               0               0   \n",
       "7    square               0               1               0   \n",
       "\n",
       "   PartialNoise_ord3_DPod_onht_0  PartialNoise_ord3_DPod_onht_1  \\\n",
       "0                              1                              0   \n",
       "1                              0                              1   \n",
       "2                              1                              0   \n",
       "3                              0                              1   \n",
       "4                              0                              0   \n",
       "5                              0                              0   \n",
       "6                              1                              0   \n",
       "7                              0                              1   \n",
       "\n",
       "   PartialNoise_ord3_DPod_onht_2  FullNoise_ord3_DPod_onht_0  \\\n",
       "0                              0                           1   \n",
       "1                              0                           0   \n",
       "2                              0                           1   \n",
       "3                              0                           0   \n",
       "4                              1                           0   \n",
       "5                              1                           0   \n",
       "6                              0                           1   \n",
       "7                              0                           0   \n",
       "\n",
       "   FullNoise_ord3_DPod_onht_1  FullNoise_ord3_DPod_onht_2  \n",
       "0                           0                           0  \n",
       "1                           0                           1  \n",
       "2                           0                           0  \n",
       "3                           1                           0  \n",
       "4                           0                           1  \n",
       "5                           0                           1  \n",
       "6                           0                           0  \n",
       "7                           1                           0  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#onht transform is for one hot encoding of categoric sets\n",
    "#noise triggers a uniform random selection between categories\n",
    "#including possibility of retention\n",
    "\n",
    "#note that onht is similar to text transform but integer encodes the column headers\n",
    "\n",
    "#col5 is kept as a raw reference, the three copies each receive a different treatment\n",
    "df = pd.DataFrame()\n",
    "df['col5'] = df_train['col5'].copy()\n",
    "df['NoNoise'] = df_train['col5'].copy()\n",
    "df['PartialNoise'] = df_train['col5'].copy()\n",
    "df['FullNoise'] = df_train['col5'].copy()\n",
    "\n",
    "assigncat = {'excl':'col5', 'onht':'NoNoise', 'DPoh':['PartialNoise', 'FullNoise']}\n",
    "\n",
    "#since noise injection accepts parameters, we'll demonstrate those parameter assignments\n",
    "#here we'll vary the flip_prob parameter based on what ratio of data we want to receive noise\n",
    "\n",
    "#the returned headers (*_ord3_DPod_onht_*) show that the noise is injected by a DPod step\n",
    "#applied upstream of the one hot encoding, so the parameters need to reach the category\n",
    "#applying that step and not only the DPoh root category\n",
    "#the original DPoh entry is retained and the same parameters are also assigned to DPo2\n",
    "#NOTE(review): DPo2 as the intermediate category for DPoh is taken from the Automunge README,\n",
    "#please confirm against the installed version\n",
    "assignparam = {\n",
    "    category: {'PartialNoise' : {'flip_prob':0.33},\n",
    "               'FullNoise' : {'flip_prob':1.}}\n",
    "    for category in ['DPoh', 'DPo2']\n",
    "}\n",
    "\n",
    "\n",
    "\n",
    "#then we can apply\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = assigncat, \\\n",
    "  assignparam = assignparam, \\\n",
    "  printstatus = False)\n",
    "\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col5</th>\n",
       "      <th>NoNoise_1010_0</th>\n",
       "      <th>NoNoise_1010_1</th>\n",
       "      <th>PartialNoise_ord3_DPod_1010_0</th>\n",
       "      <th>PartialNoise_ord3_DPod_1010_1</th>\n",
       "      <th>FullNoise_ord3_DPod_1010_0</th>\n",
       "      <th>FullNoise_ord3_DPod_1010_1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>square</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>square</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>triangle</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>triangle</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>circle</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>square</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       col5  NoNoise_1010_0  NoNoise_1010_1  PartialNoise_ord3_DPod_1010_0  \\\n",
       "0    circle               0               0                              0   \n",
       "1    square               0               1                              0   \n",
       "2    circle               0               0                              0   \n",
       "3    square               0               1                              0   \n",
       "4  triangle               1               0                              1   \n",
       "5  triangle               1               0                              1   \n",
       "6    circle               0               0                              1   \n",
       "7    square               0               1                              0   \n",
       "\n",
       "   PartialNoise_ord3_DPod_1010_1  FullNoise_ord3_DPod_1010_0  \\\n",
       "0                              0                           1   \n",
       "1                              1                           1   \n",
       "2                              0                           1   \n",
       "3                              1                           1   \n",
       "4                              0                           1   \n",
       "5                              0                           0   \n",
       "6                              0                           0   \n",
       "7                              1                           0   \n",
       "\n",
       "   FullNoise_ord3_DPod_1010_1  \n",
       "0                           0  \n",
       "1                           0  \n",
       "2                           0  \n",
       "3                           0  \n",
       "4                           0  \n",
       "5                           0  \n",
       "6                           0  \n",
       "7                           1  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#1010 transform is for binary encoding of categoric sets,\n",
    "#where a category may be represented by a set of activations\n",
    "#noise triggers a uniform random selection between categories,\n",
    "#including the possibility of retaining the original category\n",
    "\n",
    "#col5 is kept as a raw reference, the three copies each receive a different treatment\n",
    "df = pd.DataFrame({\n",
    "    header: df_train['col5'].copy()\n",
    "    for header in ['col5', 'NoNoise', 'PartialNoise', 'FullNoise']\n",
    "})\n",
    "\n",
    "assigncat = {\n",
    "    'excl': 'col5',\n",
    "    '1010': 'NoNoise',\n",
    "    'DP10': ['PartialNoise', 'FullNoise'],\n",
    "}\n",
    "\n",
    "#since noise injection accepts parameters, we'll demonstrate those parameter assignments\n",
    "#flip_prob is varied per column based on what ratio of data we want to receive noise\n",
    "#note that the parameters here are keyed to DPo3 rather than to the DP10 root category\n",
    "assignparam = {\n",
    "    'DPo3': {\n",
    "        'PartialNoise': {'flip_prob': 0.33},\n",
    "        'FullNoise': {'flip_prob': 1.},\n",
    "    },\n",
    "}\n",
    "\n",
    "#then we can apply, unpacking the full set of returned objects\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df,\n",
    "    shuffletrain=False,\n",
    "    pandasoutput=True,\n",
    "    ML_cmnd={'PCA_type': 'off'},\n",
    "    assigncat=assigncat,\n",
    "    assignparam=assignparam,\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6 - Sequential Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col4</th>\n",
       "      <th>1_dxdt_retn</th>\n",
       "      <th>1_dxdt_dxdt_retn</th>\n",
       "      <th>1_retn</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>-0.4</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.363636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>-0.4</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>0.4</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.363636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>-0.2</td>\n",
       "      <td>-0.428571</td>\n",
       "      <td>0.181818</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>-0.2</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>0.4</td>\n",
       "      <td>0.428571</td>\n",
       "      <td>0.363636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>11</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.909091</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>12</td>\n",
       "      <td>0.1</td>\n",
       "      <td>-0.357143</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   col4  1_dxdt_retn  1_dxdt_dxdt_retn    1_retn\n",
       "0     5         -0.4          0.000000  0.363636\n",
       "1     1         -0.4          0.000000  0.000000\n",
       "2     5          0.4          0.571429  0.363636\n",
       "3     3         -0.2         -0.428571  0.181818\n",
       "4     1         -0.2          0.000000  0.000000\n",
       "5     5          0.4          0.428571  0.363636\n",
       "6    11          0.6          0.142857  0.909091\n",
       "7    12          0.1         -0.357143  1.000000"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#dxdt transform is for sequential data, returning the delta between time steps\n",
    "#the returned data includes a retn normalization, as well as a dxdt with retn normalization\n",
    "#here we'll show d2dt, which also includes a set with two tiers of dxdt applied\n",
    "\n",
    "#col4 is kept as a raw reference, the copy in column '1' receives the transforms\n",
    "df = pd.DataFrame({\n",
    "    header: df_train['col4'].copy()\n",
    "    for header in ['col4', '1']\n",
    "})\n",
    "\n",
    "assigncat = {\n",
    "    'excl': 'col4',\n",
    "    'd2dt': '1',\n",
    "}\n",
    "\n",
    "#then we can apply, unpacking the full set of returned objects\n",
    "#(no assignparam is passed for this demonstration)\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df,\n",
    "    shuffletrain=False,\n",
    "    pandasoutput=True,\n",
    "    ML_cmnd={'PCA_type': 'off'},\n",
    "    assigncat=assigncat,\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col4</th>\n",
       "      <th>1_dxd2_retn</th>\n",
       "      <th>1_dxd2_dxd2_retn</th>\n",
       "      <th>1_retn</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>0.105263</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.363636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0.105263</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>0.105263</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.363636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>0.105263</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.181818</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>-0.105263</td>\n",
       "      <td>-0.097561</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>-0.105263</td>\n",
       "      <td>-0.195122</td>\n",
       "      <td>0.363636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>11</td>\n",
       "      <td>0.631579</td>\n",
       "      <td>0.243902</td>\n",
       "      <td>0.909091</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>12</td>\n",
       "      <td>0.894737</td>\n",
       "      <td>0.804878</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   col4  1_dxd2_retn  1_dxd2_dxd2_retn    1_retn\n",
       "0     5     0.105263          0.000000  0.363636\n",
       "1     1     0.105263          0.000000  0.000000\n",
       "2     5     0.105263          0.000000  0.363636\n",
       "3     3     0.105263          0.000000  0.181818\n",
       "4     1    -0.105263         -0.097561  0.000000\n",
       "5     5    -0.105263         -0.195122  0.363636\n",
       "6    11     0.631579          0.243902  0.909091\n",
       "7    12     0.894737          0.804878  1.000000"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#the paper noted a variation on the dxdt transforms that returns averages\n",
    "#between sets of points, such as to smooth or denoise data\n",
    "#this is available with the dxd2 family,\n",
    "#shown here with a similar number of tiers to the last demonstration via the d2d2 root category\n",
    "#note that as with the dxdt family, the dxd2 family accepts a 'periods' parameter for number of time steps\n",
    "#here we'll demonstrate with periods as 2\n",
    "#note that further variations are also available without the downstream normalizations on the output\n",
    "\n",
    "#col4 is kept as a raw reference, the copy in column '1' receives the transforms\n",
    "df = pd.DataFrame({\n",
    "    header: df_train['col4'].copy()\n",
    "    for header in ['col4', '1']\n",
    "})\n",
    "\n",
    "assigncat = {\n",
    "    'excl': 'col4',\n",
    "    'd2d2': '1',\n",
    "}\n",
    "\n",
    "#periods is assigned for column '1' under both the d2d2 and d2dt categories\n",
    "assignparam = {\n",
    "    'd2d2': {'1': {'periods': 2}},\n",
    "    'd2dt': {'1': {'periods': 2}},\n",
    "}\n",
    "\n",
    "#then we can apply, unpacking the full set of returned objects\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df,\n",
    "    shuffletrain=False,\n",
    "    pandasoutput=True,\n",
    "    ML_cmnd={'PCA_type': 'off'},\n",
    "    assigncat=assigncat,\n",
    "    assignparam=assignparam,\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "train\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7 - Integer Sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col4</th>\n",
       "      <th>1_ord3_mnmx</th>\n",
       "      <th>1_retn</th>\n",
       "      <th>1_1010_0</th>\n",
       "      <th>1_1010_1</th>\n",
       "      <th>1_1010_2</th>\n",
       "      <th>1_ordl</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.363636</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.363636</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.181818</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.363636</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>11</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.909091</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>12</td>\n",
       "      <td>0.75</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   col4  1_ord3_mnmx    1_retn  1_1010_0  1_1010_1  1_1010_2  1_ordl\n",
       "0     5         0.00  0.363636         1         0         0       4\n",
       "1     1         0.25  0.000000         0         0         0       0\n",
       "2     5         0.00  0.363636         1         0         0       4\n",
       "3     3         1.00  0.181818         0         1         1       3\n",
       "4     1         0.25  0.000000         0         0         0       0\n",
       "5     5         0.00  0.363636         1         0         0       4\n",
       "6    11         0.50  0.909091         0         0         1       1\n",
       "7    12         0.75  1.000000         0         1         0       2"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#the ntgr family of transforms are intended for received integer sets\n",
    "#and encode the data in multiple configurations\n",
    "#here we'll demonstrate the ntgr root category\n",
    "#which returns a retn normalization, 1010 binary transform, ordl ordinal encoding,\n",
    "#and ord4 (returned as ord3_mnmx), which is a scaled metric for frequency of redundant entries\n",
    "\n",
    "df = pd.DataFrame()\n",
    "df['col4'] = df_train['col4'].copy()\n",
    "df['1'] = df_train['col4'].copy()\n",
    "\n",
    "assigncat = {'excl':'col4', 'ntgr':'1'}\n",
    "\n",
    "\n",
    "#then we can apply\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(\n",
    "  df, \\\n",
    "  shuffletrain = False, \\\n",
    "  pandasoutput = True, \\\n",
    "  ML_cmnd = {'PCA_type':'off'}, \\\n",
    "  assigncat = assigncat, \\\n",
    "  printstatus = False)\n",
    "\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Best regards"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
