{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Augmentation Demonstration"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Automunge is available now for pip install:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install Automunge"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or to upgrade (we currently roll out upgrades pretty frequently):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install Automunge --upgrade"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once installed, run this in a local session to initialize:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import the AutoMunge class by name rather than through a wildcard import,\n",
    "# so it is clear where the name comes from and nothing else enters the namespace.\n",
    "from Automunge import AutoMunge\n",
    "\n",
    "# Instantiate once; later cells call automunge(.) and postmunge(.) on this object.\n",
    "am = AutoMunge()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We'll demonstrate data augmentation on the Titanic set, a common benchmark."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Column headers later passed to automunge(.) as the labels column and the ID column.\n",
    "labels_column = 'Survived'\n",
    "trainID_column = 'PassengerId'\n",
    "\n",
    "# Titanic train / test files, read from the working directory.\n",
    "df_train = pd.read_csv('train.csv')\n",
    "df_test = pd.read_csv('test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(891, 12)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Data augmentation is available for assignment to target columns in assigncat, and may be targeted to numeric or bounded categoric sets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns targeted for augmentation, grouped by the kind of transform they will be assigned.\n",
    "numeric_columns = [\n",
    "    'Age',\n",
    "    'Parch',\n",
    "    'Fare',\n",
    "]\n",
    "categoric_columns = [\n",
    "    'Pclass',\n",
    "    'Cabin',\n",
    "    'Embarked',\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are a few forms of augmentation to choose from, each of which in parallel to noise injection applies a different type of normalization or categoric encoding.\n",
    "\n",
    "Numeric\n",
    "- DPnb: for z-score normalized numeric data\n",
    "- DPmm: for min-max scaled numeric data\n",
    "- DPrt: for retain normalized numeric data\n",
    "\n",
    "Categoric\n",
    "- DPbn: for binary categoric data (i.e. two value sets)\n",
    "- DPod: for ordinal encoded categoric data\n",
    "- DPoh: for one-hot encoded categoric data\n",
    "- DP10: for binarized categoric data\n",
    "\n",
    "Here we'll demonstrate by applying z-score normalized numeric sets and binarized bounded categoric sets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map each transformation category to the columns it should be applied to:\n",
    "#   DPnb -> the numeric columns, DP10 -> the bounded categoric columns.\n",
    "assigncat = dict(\n",
    "    DPnb=numeric_columns,\n",
    "    DP10=categoric_columns,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Columns that are not explicitly assigned to transformation categories in assigncat are defered to automation.\n",
    "\n",
    "Note that the DP family of transforms inject noise into training data but do not inject noise into test data.\n",
    "\n",
    "Here we'll demonstrate processing the same training data set twice, both with and without noise injection, and concatinating the two results (by passing the same training set df_train as both train and test data)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pass df_train as both the train set and the test set, so the same rows come back\n",
    "# once processed as training data and once processed as test data.\n",
    "(train, train_ID, labels,\n",
    " val, val_ID, val_labels,\n",
    " test, test_ID, test_labels,\n",
    " postprocess_dict) = am.automunge(\n",
    "    df_train,\n",
    "    df_test=df_train,\n",
    "    labels_column=labels_column,\n",
    "    trainID_column=trainID_column,\n",
    "    assigncat=assigncat,\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "# Stack each train-side output on top of its test-side counterpart, renumbering the index.\n",
    "train, train_ID, labels = (\n",
    "    pd.concat([train_part, test_part], axis=0, ignore_index=True)\n",
    "    for train_part, test_part in (\n",
    "        (train, test),\n",
    "        (train_ID, test_ID),\n",
    "        (labels, test_labels),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1782, 45)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sex_bnry</th>\n",
       "      <th>SibSp_nmbr</th>\n",
       "      <th>Pclass_NArw</th>\n",
       "      <th>Pclass_ord3_DPod_1010_0</th>\n",
       "      <th>Pclass_ord3_DPod_1010_1</th>\n",
       "      <th>Name_NArw</th>\n",
       "      <th>Name_hash_0</th>\n",
       "      <th>Name_hash_1</th>\n",
       "      <th>Name_hash_2</th>\n",
       "      <th>Name_hash_3</th>\n",
       "      <th>...</th>\n",
       "      <th>Cabin_ord3_DPod_1010_2</th>\n",
       "      <th>Cabin_ord3_DPod_1010_3</th>\n",
       "      <th>Cabin_ord3_DPod_1010_4</th>\n",
       "      <th>Cabin_ord3_DPod_1010_5</th>\n",
       "      <th>Cabin_ord3_DPod_1010_6</th>\n",
       "      <th>Cabin_ord3_DPod_1010_7</th>\n",
       "      <th>Embarked_NArw</th>\n",
       "      <th>Embarked_ord3_DPod_1010_0</th>\n",
       "      <th>Embarked_ord3_DPod_1010_1</th>\n",
       "      <th>Embarked_ord3_DPod_1010_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>-0.474279</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>411</td>\n",
       "      <td>847</td>\n",
       "      <td>656</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>-0.474279</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>580</td>\n",
       "      <td>847</td>\n",
       "      <td>83</td>\n",
       "      <td>993</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>-0.474279</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>589</td>\n",
       "      <td>847</td>\n",
       "      <td>515</td>\n",
       "      <td>583</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>-0.474279</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>884</td>\n",
       "      <td>847</td>\n",
       "      <td>40</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0.432550</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>460</td>\n",
       "      <td>847</td>\n",
       "      <td>145</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 45 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Sex_bnry  SibSp_nmbr  Pclass_NArw  Pclass_ord3_DPod_1010_0  \\\n",
       "0         1   -0.474279            0                        0   \n",
       "1         1   -0.474279            0                        1   \n",
       "2         1   -0.474279            0                        0   \n",
       "3         1   -0.474279            0                        1   \n",
       "4         1    0.432550            0                        0   \n",
       "\n",
       "   Pclass_ord3_DPod_1010_1  Name_NArw  Name_hash_0  Name_hash_1  Name_hash_2  \\\n",
       "0                        1          0          411          847          656   \n",
       "1                        0          0          580          847           83   \n",
       "2                        0          0          589          847          515   \n",
       "3                        0          0          884          847           40   \n",
       "4                        0          0          460          847          145   \n",
       "\n",
       "   Name_hash_3  ...  Cabin_ord3_DPod_1010_2  Cabin_ord3_DPod_1010_3  \\\n",
       "0            0  ...                       1                       1   \n",
       "1          993  ...                       0                       0   \n",
       "2          583  ...                       0                       0   \n",
       "3            0  ...                       0                       0   \n",
       "4            0  ...                       0                       0   \n",
       "\n",
       "   Cabin_ord3_DPod_1010_4  Cabin_ord3_DPod_1010_5  Cabin_ord3_DPod_1010_6  \\\n",
       "0                       1                       1                       0   \n",
       "1                       0                       0                       0   \n",
       "2                       0                       0                       0   \n",
       "3                       0                       0                       0   \n",
       "4                       0                       0                       0   \n",
       "\n",
       "   Cabin_ord3_DPod_1010_7  Embarked_NArw  Embarked_ord3_DPod_1010_0  \\\n",
       "0                       1              0                          0   \n",
       "1                       0              0                          0   \n",
       "2                       0              0                          0   \n",
       "3                       0              0                          0   \n",
       "4                       0              0                          0   \n",
       "\n",
       "   Embarked_ord3_DPod_1010_1  Embarked_ord3_DPod_1010_2  \n",
       "0                          0                          0  \n",
       "1                          0                          0  \n",
       "2                          0                          0  \n",
       "3                          0                          0  \n",
       "4                          0                          1  \n",
       "\n",
       "[5 rows x 45 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When it is time to process any additional data on the train set basis, we can do so with the postmunge(.) function using the postprocess_dict returned from the corresponding automunge(.) call. By default postmunge will treat data as test data and not inject noise. If noise injection is desired on additional data, postmunge accepts the traindata parameter: passing traindata=True signals that the data is to be treated as training data for noise injection."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Process df_test using the postprocess_dict returned by the automunge(.) call above.\n",
    "# traindata=False treats the data as test data.\n",
    "(test, test_ID, test_labels,\n",
    " postreports_dict) = am.postmunge(\n",
    "    postprocess_dict,\n",
    "    df_test,\n",
    "    traindata=False,\n",
    "    printstatus=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sex_bnry</th>\n",
       "      <th>SibSp_nmbr</th>\n",
       "      <th>Pclass_NArw</th>\n",
       "      <th>Pclass_ord3_DPod_1010_0</th>\n",
       "      <th>Pclass_ord3_DPod_1010_1</th>\n",
       "      <th>Name_NArw</th>\n",
       "      <th>Name_hash_0</th>\n",
       "      <th>Name_hash_1</th>\n",
       "      <th>Name_hash_2</th>\n",
       "      <th>Name_hash_3</th>\n",
       "      <th>...</th>\n",
       "      <th>Cabin_ord3_DPod_1010_2</th>\n",
       "      <th>Cabin_ord3_DPod_1010_3</th>\n",
       "      <th>Cabin_ord3_DPod_1010_4</th>\n",
       "      <th>Cabin_ord3_DPod_1010_5</th>\n",
       "      <th>Cabin_ord3_DPod_1010_6</th>\n",
       "      <th>Cabin_ord3_DPod_1010_7</th>\n",
       "      <th>Embarked_NArw</th>\n",
       "      <th>Embarked_ord3_DPod_1010_0</th>\n",
       "      <th>Embarked_ord3_DPod_1010_1</th>\n",
       "      <th>Embarked_ord3_DPod_1010_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>-0.474279</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>910</td>\n",
       "      <td>847</td>\n",
       "      <td>83</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0.432550</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>185</td>\n",
       "      <td>583</td>\n",
       "      <td>83</td>\n",
       "      <td>99</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>-0.474279</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>53</td>\n",
       "      <td>847</td>\n",
       "      <td>26</td>\n",
       "      <td>687</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>-0.474279</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>851</td>\n",
       "      <td>847</td>\n",
       "      <td>662</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0.432550</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>168</td>\n",
       "      <td>583</td>\n",
       "      <td>718</td>\n",
       "      <td>547</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 45 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Sex_bnry  SibSp_nmbr  Pclass_NArw  Pclass_ord3_DPod_1010_0  \\\n",
       "0         1   -0.474279            0                        0   \n",
       "1         0    0.432550            0                        0   \n",
       "2         1   -0.474279            0                        1   \n",
       "3         1   -0.474279            0                        0   \n",
       "4         0    0.432550            0                        0   \n",
       "\n",
       "   Pclass_ord3_DPod_1010_1  Name_NArw  Name_hash_0  Name_hash_1  Name_hash_2  \\\n",
       "0                        0          0          910          847           83   \n",
       "1                        0          0          185          583           83   \n",
       "2                        0          0           53          847           26   \n",
       "3                        0          0          851          847          662   \n",
       "4                        0          0          168          583          718   \n",
       "\n",
       "   Name_hash_3  ...  Cabin_ord3_DPod_1010_2  Cabin_ord3_DPod_1010_3  \\\n",
       "0            0  ...                       0                       0   \n",
       "1           99  ...                       0                       0   \n",
       "2          687  ...                       0                       0   \n",
       "3            0  ...                       0                       0   \n",
       "4          547  ...                       0                       0   \n",
       "\n",
       "   Cabin_ord3_DPod_1010_4  Cabin_ord3_DPod_1010_5  Cabin_ord3_DPod_1010_6  \\\n",
       "0                       0                       0                       0   \n",
       "1                       0                       0                       0   \n",
       "2                       0                       0                       0   \n",
       "3                       0                       0                       0   \n",
       "4                       0                       0                       0   \n",
       "\n",
       "   Cabin_ord3_DPod_1010_7  Embarked_NArw  Embarked_ord3_DPod_1010_0  \\\n",
       "0                       0              0                          0   \n",
       "1                       0              0                          0   \n",
       "2                       0              0                          0   \n",
       "3                       0              0                          0   \n",
       "4                       0              0                          0   \n",
       "\n",
       "   Embarked_ord3_DPod_1010_1  Embarked_ord3_DPod_1010_2  \n",
       "0                          1                          0  \n",
       "1                          0                          0  \n",
       "2                          1                          0  \n",
       "3                          0                          0  \n",
       "4                          0                          0  \n",
       "\n",
       "[5 rows x 45 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One more demonstration: note that the noise distribution profiles injected to any column can be custom configured by parameter. Automunge allows passing parameters to transformations by the assignparam dictionary. Available parameters for each transformation category and their defaults are documented in the library of transformations section of the [READ ME](anonymized).\n",
    "\n",
    "Here we'll demonstrate again applying noise injection to training data, but in this case will configure custom noise profiles."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# assignparam carries parameter overrides at two levels:\n",
    "#   - 'default_assignparam' overwrites parameters for a transformation category globally;\n",
    "#   - an entry keyed by category and then column overwrites them for that column only.\n",
    "# sigma is the noise standard deviation; flip_prob is the ratio of entries receiving injection.\n",
    "assignparam = {\n",
    "    'default_assignparam': {\n",
    "        'DPnb': {'sigma': 0.05, 'flip_prob': 0.5},\n",
    "        'DP10': {'flip_prob': 0.5},\n",
    "    },\n",
    "    # Column-specific override: scaled Laplace distributed noise instead of Gaussian\n",
    "    # for the 'DPnb' transform applied to column 'Fare'.\n",
    "    'DPnb': {'Fare': {'noisedistribution': 'laplace'}},\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# As before, process the train data with and without noise injection and concatenate,\n",
    "# this time passing the custom noise profiles through assignparam.\n",
    "(train, train_ID, labels,\n",
    " val, val_ID, val_labels,\n",
    " test, test_ID, test_labels,\n",
    " postprocess_dict) = am.automunge(\n",
    "    df_train,\n",
    "    df_test=df_train,\n",
    "    labels_column=labels_column,\n",
    "    trainID_column=trainID_column,\n",
    "    assigncat=assigncat,\n",
    "    assignparam=assignparam,\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "# Stack the train-side outputs on top of the test-side outputs.\n",
    "# The IDs must be assigned back to train_ID (not a new name) so that\n",
    "# train, train_ID and labels all keep the same number of rows.\n",
    "train = pd.concat([train, test], axis=0, ignore_index=True)\n",
    "train_ID = pd.concat([train_ID, test_ID], axis=0, ignore_index=True)\n",
    "labels = pd.concat([labels, test_labels], axis=0, ignore_index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "More information is available on tabular data augmentation with Automunge in the paper [A Numbers Game\n",
    "Numeric Encoding Options with Automunge](anonymized), particularly in Section 5 and Appendix D."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}