{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Let's run the code that was demonstrated in the appendix to validate it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#As a sample data set we'll use a small excerpt from the Higgs set\n",
    "#available online at https://archive.ics.uci.edu/ml/datasets/HIGGS\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "df_train = pd.read_csv('Higgs_data_partial.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Note that this set includes an intended label column,\n",
    "#which will be carved out by passing the associated column header\n",
    "\n",
    "labels_column = '0'\n",
    "ID_columnlist = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#To be consistent with the function calls demonstrated in the appendix, \n",
    "#we'll just drop the label column from our set.\n",
    "#Alternatively, this column could be designated in the automunge(.) call.\n",
    "\n",
    "del df_train[labels_column]\n",
    "\n",
    "#we'll create a copy of the train set to use as the test set for these demonstrations\n",
    "df_test = df_train.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# A - Function Call Demonstrations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Automunge is available for pip install:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install Automunge"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or to upgrade (we currently roll out upgrades fairly frequently):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install Automunge --upgrade"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once installed, run this in a local session to initialize:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from Automunge import Automunger\n",
    "am = Automunger.AutoMunge()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then, assuming we want to prepare a train set df\\_train for ML, we can apply default parameters as:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Automunge processing\n",
      "\n",
      "evaluating column:  1\n",
      "processing column:  1\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['1_nmbr']\n",
      "\n",
      "evaluating column:  2\n",
      "processing column:  2\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['2_nmbr']\n",
      "\n",
      "evaluating column:  3\n",
      "processing column:  3\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['3_nmbr']\n",
      "\n",
      "evaluating column:  4\n",
      "processing column:  4\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['4_nmbr']\n",
      "\n",
      "evaluating column:  5\n",
      "processing column:  5\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['5_nmbr']\n",
      "\n",
      "evaluating column:  6\n",
      "processing column:  6\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['6_nmbr']\n",
      "\n",
      "evaluating column:  7\n",
      "processing column:  7\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['7_nmbr']\n",
      "\n",
      "evaluating column:  8\n",
      "processing column:  8\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['8_nmbr']\n",
      "\n",
      "evaluating column:  9\n",
      "processing column:  9\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['9_0.0', '9_1.0865380764007568', '9_2.1730761528015137']\n",
      "\n",
      "evaluating column:  10\n",
      "processing column:  10\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['10_nmbr']\n",
      "\n",
      "evaluating column:  11\n",
      "processing column:  11\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['11_nmbr']\n",
      "\n",
      "evaluating column:  12\n",
      "processing column:  12\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['12_nmbr']\n",
      "\n",
      "evaluating column:  13\n",
      "processing column:  13\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['13_0.0', '13_1.1074360609054563', '13_2.2148721218109126']\n",
      "\n",
      "evaluating column:  14\n",
      "processing column:  14\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['14_nmbr']\n",
      "\n",
      "evaluating column:  15\n",
      "processing column:  15\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['15_nmbr']\n",
      "\n",
      "evaluating column:  16\n",
      "processing column:  16\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['16_nmbr']\n",
      "\n",
      "evaluating column:  17\n",
      "processing column:  17\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['17_0.0', '17_1.2741122245788574', '17_2.548224449157715']\n",
      "\n",
      "evaluating column:  18\n",
      "processing column:  18\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['18_nmbr']\n",
      "\n",
      "evaluating column:  19\n",
      "processing column:  19\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['19_nmbr']\n",
      "\n",
      "evaluating column:  20\n",
      "processing column:  20\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['20_nmbr']\n",
      "\n",
      "evaluating column:  21\n",
      "processing column:  21\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['21_0.0', '21_1.5509806871414182', '21_3.1019613742828365']\n",
      "\n",
      "evaluating column:  22\n",
      "processing column:  22\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['22_nmbr']\n",
      "\n",
      "evaluating column:  23\n",
      "processing column:  23\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['23_nmbr']\n",
      "\n",
      "evaluating column:  24\n",
      "processing column:  24\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['24_nmbr']\n",
      "\n",
      "evaluating column:  25\n",
      "processing column:  25\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['25_nmbr']\n",
      "\n",
      "evaluating column:  26\n",
      "processing column:  26\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['26_nmbr']\n",
      "\n",
      "evaluating column:  27\n",
      "processing column:  27\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['27_nmbr']\n",
      "\n",
      "evaluating column:  28\n",
      "processing column:  28\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['28_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  1_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  2_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  3_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  4_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  5_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  6_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  7_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  8_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  10_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  11_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  12_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  14_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  15_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  16_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  18_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  19_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  20_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  22_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  23_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  24_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  25_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  26_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  27_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  28_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_1.0865380764007568\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_2.1730761528015137\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_1.1074360609054563\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_2.2148721218109126\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_1.2741122245788574\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_2.548224449157715\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_1.5509806871414182\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_3.1019613742828365\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "______\n",
      "\n",
      "versioning serial stamp:\n",
      "_5.22_491887658430_2020-11-15T14:10:11.683813\n",
      "\n",
      "Automunge returned ID column set: \n",
      "['Automunge_index_491887658430']\n",
      "\n",
      "Automunge returned train column set: \n",
      "['1_nmbr', '2_nmbr', '3_nmbr', '4_nmbr', '5_nmbr', '6_nmbr', '7_nmbr', '8_nmbr', '10_nmbr', '11_nmbr', '12_nmbr', '14_nmbr', '15_nmbr', '16_nmbr', '18_nmbr', '19_nmbr', '20_nmbr', '22_nmbr', '23_nmbr', '24_nmbr', '25_nmbr', '26_nmbr', '27_nmbr', '28_nmbr', '9_0.0', '9_1.0865380764007568', '9_2.1730761528015137', '13_0.0', '13_1.1074360609054563', '13_2.2148721218109126', '17_0.0', '17_1.2741122245788574', '17_2.548224449157715', '21_0.0', '21_1.5509806871414182', '21_3.1019613742828365']\n",
      "\n",
      "_______________\n",
      "Automunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(df_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that if our df\\_train set included a labels column, we should designate the column header with the labels\\_column parameter. Or likewise we can designate any ID columns with the trainID\\_column parameter.\n",
    "\n",
    "The returned postprocess\\_dict should be saved, such as with pickle.\n",
    "\n",
    "We can then consistently prepare subsequent test data df\\_test in postmunge(.):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Postmunge processing\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  1\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['1_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  2\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['2_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  3\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['3_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  4\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['4_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  5\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['5_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  6\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['6_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  7\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['7_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  8\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['8_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  9\n",
      "    root category:  text\n",
      "\n",
      " returned columns:\n",
      "['9_0.0', '9_1.0865380764007568', '9_2.1730761528015137']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  10\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['10_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  11\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['11_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  12\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['12_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  13\n",
      "    root category:  text\n",
      "\n",
      " returned columns:\n",
      "['13_0.0', '13_1.1074360609054563', '13_2.2148721218109126']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  14\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['14_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  15\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['15_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  16\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['16_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  17\n",
      "    root category:  text\n",
      "\n",
      " returned columns:\n",
      "['17_0.0', '17_1.2741122245788574', '17_2.548224449157715']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  18\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['18_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  19\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['19_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  20\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['20_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  21\n",
      "    root category:  text\n",
      "\n",
      " returned columns:\n",
      "['21_0.0', '21_1.5509806871414182', '21_3.1019613742828365']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  22\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['22_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  23\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['23_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  24\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['24_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  25\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['25_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  26\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['26_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  27\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['27_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  28\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['28_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  1_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  2_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  3_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  4_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  5_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  6_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  7_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  8_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  10_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  11_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  12_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  14_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  15_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  16_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  18_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  19_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  20_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  22_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  23_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  24_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  25_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  26_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  27_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  28_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_1.0865380764007568\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_2.1730761528015137\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_1.1074360609054563\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_2.2148721218109126\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_1.2741122245788574\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_2.548224449157715\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_1.5509806871414182\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_3.1019613742828365\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "_______________\n",
      "Postmunge returned ID column set: \n",
      "['Automunge_index_491887658430']\n",
      "\n",
      "Postmunge returned test column set: \n",
      "['1_nmbr', '2_nmbr', '3_nmbr', '4_nmbr', '5_nmbr', '6_nmbr', '7_nmbr', '8_nmbr', '10_nmbr', '11_nmbr', '12_nmbr', '14_nmbr', '15_nmbr', '16_nmbr', '18_nmbr', '19_nmbr', '20_nmbr', '22_nmbr', '23_nmbr', '24_nmbr', '25_nmbr', '26_nmbr', '27_nmbr', '28_nmbr', '9_0.0', '9_1.0865380764007568', '9_2.1730761528015137', '13_0.0', '13_1.1074360609054563', '13_2.2148721218109126', '17_0.0', '17_1.2741122245788574', '17_2.548224449157715', '21_0.0', '21_1.5509806871414182', '21_3.1019613742828365']\n",
      "\n",
      "_______________\n",
      "Postmunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, postreports_dict \\\n",
    "= am.postmunge(postprocess_dict, df_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "I find it helps to just copy and paste the full range of parameters for reference:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Automunge processing\n",
      "\n",
      "evaluating column:  1\n",
      "processing column:  1\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['1_nmbr']\n",
      "\n",
      "evaluating column:  2\n",
      "processing column:  2\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['2_nmbr']\n",
      "\n",
      "evaluating column:  3\n",
      "processing column:  3\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['3_nmbr']\n",
      "\n",
      "evaluating column:  4\n",
      "processing column:  4\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['4_nmbr']\n",
      "\n",
      "evaluating column:  5\n",
      "processing column:  5\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['5_nmbr']\n",
      "\n",
      "evaluating column:  6\n",
      "processing column:  6\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['6_nmbr']\n",
      "\n",
      "evaluating column:  7\n",
      "processing column:  7\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['7_nmbr']\n",
      "\n",
      "evaluating column:  8\n",
      "processing column:  8\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['8_nmbr']\n",
      "\n",
      "evaluating column:  9\n",
      "processing column:  9\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['9_0.0', '9_1.0865380764007568', '9_2.1730761528015137']\n",
      "\n",
      "evaluating column:  10\n",
      "processing column:  10\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['10_nmbr']\n",
      "\n",
      "evaluating column:  11\n",
      "processing column:  11\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['11_nmbr']\n",
      "\n",
      "evaluating column:  12\n",
      "processing column:  12\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['12_nmbr']\n",
      "\n",
      "evaluating column:  13\n",
      "processing column:  13\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['13_0.0', '13_1.1074360609054563', '13_2.2148721218109126']\n",
      "\n",
      "evaluating column:  14\n",
      "processing column:  14\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['14_nmbr']\n",
      "\n",
      "evaluating column:  15\n",
      "processing column:  15\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['15_nmbr']\n",
      "\n",
      "evaluating column:  16\n",
      "processing column:  16\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['16_nmbr']\n",
      "\n",
      "evaluating column:  17\n",
      "processing column:  17\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['17_0.0', '17_1.2741122245788574', '17_2.548224449157715']\n",
      "\n",
      "evaluating column:  18\n",
      "processing column:  18\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['18_nmbr']\n",
      "\n",
      "evaluating column:  19\n",
      "processing column:  19\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['19_nmbr']\n",
      "\n",
      "evaluating column:  20\n",
      "processing column:  20\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['20_nmbr']\n",
      "\n",
      "evaluating column:  21\n",
      "processing column:  21\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['21_0.0', '21_1.5509806871414182', '21_3.1019613742828365']\n",
      "\n",
      "evaluating column:  22\n",
      "processing column:  22\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['22_nmbr']\n",
      "\n",
      "evaluating column:  23\n",
      "processing column:  23\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['23_nmbr']\n",
      "\n",
      "evaluating column:  24\n",
      "processing column:  24\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['24_nmbr']\n",
      "\n",
      "evaluating column:  25\n",
      "processing column:  25\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['25_nmbr']\n",
      "\n",
      "evaluating column:  26\n",
      "processing column:  26\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['26_nmbr']\n",
      "\n",
      "evaluating column:  27\n",
      "processing column:  27\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['27_nmbr']\n",
      "\n",
      "evaluating column:  28\n",
      "processing column:  28\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['28_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  1_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  2_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  3_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  4_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  5_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  6_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  7_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  8_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  10_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  11_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  12_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  14_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  15_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  16_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  18_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  19_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  20_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  22_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  23_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  24_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  25_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  26_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  27_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  28_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_1.0865380764007568\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_2.1730761528015137\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_1.1074360609054563\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_2.2148721218109126\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_1.2741122245788574\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_2.548224449157715\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_1.5509806871414182\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_3.1019613742828365\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "______\n",
      "\n",
      "versioning serial stamp:\n",
      "_5.22_887339508720_2020-11-15T14:10:21.725979\n",
      "\n",
      "Automunge returned ID column set: \n",
      "['Automunge_index_887339508720']\n",
      "\n",
      "Automunge returned train column set: \n",
      "['1_nmbr', '2_nmbr', '3_nmbr', '4_nmbr', '5_nmbr', '6_nmbr', '7_nmbr', '8_nmbr', '10_nmbr', '11_nmbr', '12_nmbr', '14_nmbr', '15_nmbr', '16_nmbr', '18_nmbr', '19_nmbr', '20_nmbr', '22_nmbr', '23_nmbr', '24_nmbr', '25_nmbr', '26_nmbr', '27_nmbr', '28_nmbr', '9_0.0', '9_1.0865380764007568', '9_2.1730761528015137', '13_0.0', '13_1.1074360609054563', '13_2.2148721218109126', '17_0.0', '17_1.2741122245788574', '17_2.548224449157715', '21_0.0', '21_1.5509806871414182', '21_3.1019613742828365']\n",
      "\n",
      "_______________\n",
      "Automunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Demonstration of an automunge(.) call with its parameters written out explicitly.\n",
    "#The 17 returned objects are unpacked in order; postprocess_dict is the one later\n",
    "#passed to postmunge(.). Every assigncat / assigninfill list is left empty here, so no\n",
    "#column is manually assigned a root category or infill type in this call.\n",
    "#NOTE(review): the '(category)' / '(column)' / '(parameter)' keys in assignparam look\n",
    "#like illustrative placeholders rather than real names - confirm against the Automunge docs.\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(df_train, df_test=False, labels_column=False, \n",
    "  trainID_column=False, testID_column=False, valpercent1=.0, \n",
    "  valpercent2=.0, floatprecision=32, shuffletrain=True, \n",
    "  TrainLabelFreqLevel=False, powertransform=False, \n",
    "  binstransform=False, MLinfill=False, infilliterate=1, \n",
    "  randomseed=42, eval_ratio=.5, LabelSmoothing_train=False, \n",
    "  LabelSmoothing_test=False, LabelSmoothing_val=False, LSfit=False, \n",
    "  numbercategoryheuristic=63, pandasoutput=False, \n",
    "  NArw_marker=False, featureselection=False, featurepct=1.0,\n",
    "  featuremetric=0.0, featuremethod='default', Binary=False, \n",
    "  PCAn_components=False, PCAexcl=[], excl_suffix=False, \n",
    "  ML_cmnd = {'MLinfill_type':'default', \n",
    "             'MLinfill_cmnd':{'RandomForestClassifier':{}, \n",
    "                                    'RandomForestRegressor':{}}, \n",
    "             'PCA_type':'default', 'PCA_cmnd':{}}, \n",
    "  assigncat = {      \n",
    "  'nmbr':[], 'retn':[], 'mnmx':[], 'mean':[], 'MAD3':[], 'lgnm':[],\n",
    "  'bins':[], 'bsor':[], 'pwrs':[], 'pwr2':[], 'por2':[], 'bxcx':[],\n",
    "  'addd':[], 'sbtr':[], 'mltp':[], 'divd':[],\n",
    "  'log0':[], 'log1':[], 'logn':[], 'sqrt':[], 'rais':[], 'absl':[],\n",
    "  'bnwd':[], 'bnwK':[], 'bnwM':[], 'bnwo':[], 'bnKo':[], 'bnMo':[],\n",
    "  'bnep':[], 'bne7':[], 'bne9':[], 'bneo':[], 'bn7o':[], 'bn9o':[],\n",
    "  'bkt1':[], 'bkt2':[], 'bkt3':[], 'bkt4':[],\n",
    "  'nbr2':[], 'nbr3':[], 'MADn':[], 'MAD2':[], 'tlbn':[],\n",
    "  'mnm2':[], 'mnm3':[], 'mnm4':[], 'mnm5':[], 'mnm6':[],\n",
    "  'ntgr':[], 'ntg2':[], 'ntg3':[], 'mea2':[], 'mea3':[], 'bxc2':[],\n",
    "  'dxdt':[], 'd2dt':[], 'd3dt':[], 'dxd2':[], 'd2d2':[], 'd3d2':[],\n",
    "  'nmdx':[], 'nmd2':[], 'nmd3':[], 'mmdx':[], 'mmd2':[], 'mmd3':[],\n",
    "  'shft':[], 'shf2':[], 'shf3':[], 'shf4':[], 'shf7':[], 'shf8':[],\n",
    "  'bnry':[], 'onht':[], 'text':[], 'txt2':[], '1010':[], 'or10':[],\n",
    "  'ordl':[], 'ord2':[], 'ord3':[], 'ord4':[], 'om10':[], 'mmor':[],\n",
    "  'Unht':[], 'Utxt':[], 'Utx2':[], 'Uor3':[], 'Uor6':[], 'U101':[],\n",
    "  'splt':[], 'spl2':[], 'spl5':[], 'sp15':[], 'sp19':[], 'sbst':[],\n",
    "  'spl8':[], 'spl9':[], 'sp10':[], 'sp16':[], 'sp20':[], 'sbs2':[],\n",
    "  'srch':[], 'src2':[], 'src4':[], 'strn':[], 'lngt':[], 'aggt':[],\n",
    "  'nmrc':[], 'nmr2':[], 'nmcm':[], 'nmc2':[], 'nmEU':[], 'nmE2':[],\n",
    "  'nmr7':[], 'nmr8':[], 'nmc7':[], 'nmc8':[], 'nmE7':[], 'nmE8':[],\n",
    "  'ors2':[], 'ors5':[], 'ors6':[], 'ors7':[], 'ucct':[], 'Ucct':[],\n",
    "  'or15':[], 'or17':[], 'or19':[], 'or20':[], 'or21':[], 'or22':[],\n",
    "  'date':[], 'dat2':[], 'dat6':[], 'wkdy':[], 'bshr':[], 'hldy':[],\n",
    "  'wkds':[], 'wkdo':[], 'mnts':[], 'mnto':[],\n",
    "  'yea2':[], 'mnt2':[], 'mnt6':[], 'day2':[], 'day5':[],\n",
    "  'hrs2':[], 'hrs4':[], 'min2':[], 'min4':[], 'scn2':[], 'DPrt':[],\n",
    "  'DPnb':[], 'DPmm':[], 'DPbn':[], 'DPod':[], 'DP10':[], 'DPoh':[],\n",
    "  'excl':[], 'exc2':[], 'exc3':[], 'exc4':[], 'exc5':[], 'exc6':[],\n",
    "  'null':[], 'copy':[], 'shfl':[], 'eval':[], 'ptfm':[]},\n",
    "  assignparam = {'default_assignparam' : \n",
    "                 {'(category)' : {'(parameter)' : 42}}, \n",
    "                 '(category)' : {'(column)'   : {'(parameter)' : 42}}}, \n",
    "  assigninfill = {'stdrdinfill':[], 'MLinfill':[], \n",
    "                  'zeroinfill':[], 'oneinfill':[], \n",
    "                  'adjinfill':[], 'meaninfill':[], 'medianinfill':[], \n",
    "                  'modeinfill':[], 'lcinfill':[], 'naninfill':[]}, \n",
    "  assignnan = {'categories':{}, 'columns':{}, 'global':[]}, \n",
    "  transformdict={}, processdict={}, evalcat=False, \n",
    "  privacy_encode = False, printstatus=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or for postmunge(.) with full range of parameters:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Postmunge processing\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  1\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['1_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  2\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['2_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  3\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['3_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  4\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['4_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  5\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['5_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  6\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['6_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  7\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['7_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  8\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['8_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  9\n",
      "    root category:  text\n",
      "\n",
      " returned columns:\n",
      "['9_0.0', '9_1.0865380764007568', '9_2.1730761528015137']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  10\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['10_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  11\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['11_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  12\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['12_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  13\n",
      "    root category:  text\n",
      "\n",
      " returned columns:\n",
      "['13_0.0', '13_1.1074360609054563', '13_2.2148721218109126']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  14\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['14_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  15\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['15_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  16\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['16_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  17\n",
      "    root category:  text\n",
      "\n",
      " returned columns:\n",
      "['17_0.0', '17_1.2741122245788574', '17_2.548224449157715']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  18\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['18_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  19\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['19_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  20\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['20_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  21\n",
      "    root category:  text\n",
      "\n",
      " returned columns:\n",
      "['21_0.0', '21_1.5509806871414182', '21_3.1019613742828365']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  22\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['22_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  23\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['23_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  24\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['24_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  25\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['25_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  26\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['26_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  27\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['27_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  28\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['28_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  1_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  2_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  3_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  4_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  5_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  6_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  7_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  8_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  10_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  11_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  12_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  14_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  15_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  16_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  18_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  19_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  20_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  22_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  23_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  24_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  25_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  26_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  27_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  28_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_1.0865380764007568\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_2.1730761528015137\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_1.1074360609054563\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_2.2148721218109126\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_1.2741122245788574\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_2.548224449157715\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_1.5509806871414182\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_3.1019613742828365\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "_______________\n",
      "Postmunge returned ID column set: \n",
      "['Automunge_index_887339508720']\n",
      "\n",
      "Postmunge returned test column set: \n",
      "['1_nmbr', '2_nmbr', '3_nmbr', '4_nmbr', '5_nmbr', '6_nmbr', '7_nmbr', '8_nmbr', '10_nmbr', '11_nmbr', '12_nmbr', '14_nmbr', '15_nmbr', '16_nmbr', '18_nmbr', '19_nmbr', '20_nmbr', '22_nmbr', '23_nmbr', '24_nmbr', '25_nmbr', '26_nmbr', '27_nmbr', '28_nmbr', '9_0.0', '9_1.0865380764007568', '9_2.1730761528015137', '13_0.0', '13_1.1074360609054563', '13_2.2148721218109126', '17_0.0', '17_1.2741122245788574', '17_2.548224449157715', '21_0.0', '21_1.5509806871414182', '21_3.1019613742828365']\n",
      "\n",
      "_______________\n",
      "Postmunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Process df_test with postmunge(.), using the postprocess_dict returned by automunge(.)\n",
    "#and writing out each postmunge(.) parameter explicitly.\n",
    "(test, testID, testlabels,\n",
    " labelsencoding_dict, postreports_dict) = am.postmunge(\n",
    "    postprocess_dict,\n",
    "    df_test,\n",
    "    testID_column=False,\n",
    "    labelscolumn=False,\n",
    "    pandasoutput=False,\n",
    "    printstatus=True,\n",
    "    TrainLabelFreqLevel=False,\n",
    "    featureeval=False,\n",
    "    driftreport=False,\n",
    "    LabelSmoothing=False,\n",
    "    LSfit=False,\n",
    "    inversion=False,\n",
    "    traindata=False,\n",
    "    returnedsets=True,\n",
    "    shuffletrain=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# B - Assigning Transforms and Infill"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Assigning root categories is conducted in the assigncat parameter and assigning infill in the assigninfill parameter - e.g. for a train set df\\_train with column headers 'col1' and 'col2' we could assign retain normalization (retn) to 'col1' and an integer encoding set (ntgr) to 'col2', with infill types zero infill and ML infill respectively.\n",
    "\n",
    "Note that any columns we don't explicitly assign will defer to automation; alternatively, we could turn off automated defaults for pass-through of other columns by passing the automunge parameter powertransform = 'excl'."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "#let's create two new columns in df_train to apply these demonstrations\n",
    "\n",
    "#col1 duplicates the values of the existing column '1'\n",
    "df_train['col1'] = df_train['1'].copy()\n",
    "\n",
    "#col2 holds random integers from 0 through 9. A locally seeded generator keeps the\n",
    "#values reproducible across runs without touching numpy's global random state, and\n",
    "#assigning the array directly (instead of wrapping it in a DataFrame) avoids index\n",
    "#alignment, which would insert NaN if df_train did not have a default RangeIndex.\n",
    "col2_rng = np.random.RandomState(42)\n",
    "df_train['col2'] = col2_rng.randint(0, high=10, size=df_train.shape[0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Automunge processing\n",
      "\n",
      "evaluating column:  1\n",
      "processing column:  1\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['1_nmbr']\n",
      "\n",
      "evaluating column:  2\n",
      "processing column:  2\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['2_nmbr']\n",
      "\n",
      "evaluating column:  3\n",
      "processing column:  3\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['3_nmbr']\n",
      "\n",
      "evaluating column:  4\n",
      "processing column:  4\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['4_nmbr']\n",
      "\n",
      "evaluating column:  5\n",
      "processing column:  5\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['5_nmbr']\n",
      "\n",
      "evaluating column:  6\n",
      "processing column:  6\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['6_nmbr']\n",
      "\n",
      "evaluating column:  7\n",
      "processing column:  7\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['7_nmbr']\n",
      "\n",
      "evaluating column:  8\n",
      "processing column:  8\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['8_nmbr']\n",
      "\n",
      "evaluating column:  9\n",
      "processing column:  9\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['9_0.0', '9_1.0865380764007568', '9_2.1730761528015137']\n",
      "\n",
      "evaluating column:  10\n",
      "processing column:  10\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['10_nmbr']\n",
      "\n",
      "evaluating column:  11\n",
      "processing column:  11\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['11_nmbr']\n",
      "\n",
      "evaluating column:  12\n",
      "processing column:  12\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['12_nmbr']\n",
      "\n",
      "evaluating column:  13\n",
      "processing column:  13\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['13_0.0', '13_1.1074360609054563', '13_2.2148721218109126']\n",
      "\n",
      "evaluating column:  14\n",
      "processing column:  14\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['14_nmbr']\n",
      "\n",
      "evaluating column:  15\n",
      "processing column:  15\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['15_nmbr']\n",
      "\n",
      "evaluating column:  16\n",
      "processing column:  16\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['16_nmbr']\n",
      "\n",
      "evaluating column:  17\n",
      "processing column:  17\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['17_0.0', '17_1.2741122245788574', '17_2.548224449157715']\n",
      "\n",
      "evaluating column:  18\n",
      "processing column:  18\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['18_nmbr']\n",
      "\n",
      "evaluating column:  19\n",
      "processing column:  19\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['19_nmbr']\n",
      "\n",
      "evaluating column:  20\n",
      "processing column:  20\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['20_nmbr']\n",
      "\n",
      "evaluating column:  21\n",
      "processing column:  21\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['21_0.0', '21_1.5509806871414182', '21_3.1019613742828365']\n",
      "\n",
      "evaluating column:  22\n",
      "processing column:  22\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['22_nmbr']\n",
      "\n",
      "evaluating column:  23\n",
      "processing column:  23\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['23_nmbr']\n",
      "\n",
      "evaluating column:  24\n",
      "processing column:  24\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['24_nmbr']\n",
      "\n",
      "evaluating column:  25\n",
      "processing column:  25\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['25_nmbr']\n",
      "\n",
      "evaluating column:  26\n",
      "processing column:  26\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['26_nmbr']\n",
      "\n",
      "evaluating column:  27\n",
      "processing column:  27\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['27_nmbr']\n",
      "\n",
      "evaluating column:  28\n",
      "processing column:  28\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['28_nmbr']\n",
      "\n",
      "evaluating column:  col1\n",
      "processing column:  col1\n",
      "    root category:  retn\n",
      " returned columns:\n",
      "['col1_retn']\n",
      "\n",
      "evaluating column:  col2\n",
      "processing column:  col2\n",
      "    root category:  ntgr\n",
      " returned columns:\n",
      "['col2_ordl', 'col2_ord3_mnmx', 'col2_retn', 'col2_1010_0', 'col2_1010_1', 'col2_1010_2', 'col2_1010_3']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  1_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  2_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  3_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  4_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  5_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  6_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  7_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  8_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  10_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  11_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  12_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  14_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  15_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  16_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  18_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  19_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  20_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  22_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  23_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  24_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  25_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  26_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  27_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  28_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  col1_retn\n",
      "     infill type: zeroinfill\n",
      "\n",
      "infill to column:  col2_ordl\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  9_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_1.0865380764007568\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_2.1730761528015137\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_1.1074360609054563\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_2.2148721218109126\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_1.2741122245788574\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_2.548224449157715\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_1.5509806871414182\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_3.1019613742828365\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  col2_ord3_mnmx\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col2_retn\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col2_1010_0\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col2_1010_1\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col2_1010_2\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col2_1010_3\n",
      "     infill type: MLinfill\n",
      "\n",
      "______\n",
      "\n",
      "versioning serial stamp:\n",
      "_5.22_255806795135_2020-11-15T14:10:31.731113\n",
      "\n",
      "Automunge returned ID column set: \n",
      "['Automunge_index_255806795135']\n",
      "\n",
      "Automunge returned train column set: \n",
      "['1_nmbr', '2_nmbr', '3_nmbr', '4_nmbr', '5_nmbr', '6_nmbr', '7_nmbr', '8_nmbr', '10_nmbr', '11_nmbr', '12_nmbr', '14_nmbr', '15_nmbr', '16_nmbr', '18_nmbr', '19_nmbr', '20_nmbr', '22_nmbr', '23_nmbr', '24_nmbr', '25_nmbr', '26_nmbr', '27_nmbr', '28_nmbr', 'col1_retn', 'col2_ordl', '9_0.0', '9_1.0865380764007568', '9_2.1730761528015137', '13_0.0', '13_1.1074360609054563', '13_2.2148721218109126', '17_0.0', '17_1.2741122245788574', '17_2.548224449157715', '21_0.0', '21_1.5509806871414182', '21_3.1019613742828365', 'col2_ord3_mnmx', 'col2_retn', 'col2_1010_0', 'col2_1010_1', 'col2_1010_2', 'col2_1010_3']\n",
      "\n",
      "_______________\n",
      "Automunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Assign root categories and infill types per column, then run automunge(.)\n",
    "#with pandas output so the returned sets are easier to inspect.\n",
    "\n",
    "assigncat = {\n",
    "    'retn': ['col1'],\n",
    "    'ntgr': ['col2'],\n",
    "}\n",
    "assigninfill = {\n",
    "    'zeroinfill': ['col1'],\n",
    "    'MLinfill': ['col2'],\n",
    "}\n",
    "\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df_train,\n",
    "    assigncat=assigncat,\n",
    "    assigninfill=assigninfill,\n",
    "    pandasoutput=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1_retn</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6252</th>\n",
       "      <td>0.056924</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4684</th>\n",
       "      <td>0.054757</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1731</th>\n",
       "      <td>0.194145</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4742</th>\n",
       "      <td>0.081438</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4521</th>\n",
       "      <td>0.051223</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5734</th>\n",
       "      <td>0.230517</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5191</th>\n",
       "      <td>0.068696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5390</th>\n",
       "      <td>0.208055</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>860</th>\n",
       "      <td>0.083262</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7270</th>\n",
       "      <td>0.230403</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      col1_retn\n",
       "6252   0.056924\n",
       "4684   0.054757\n",
       "1731   0.194145\n",
       "4742   0.081438\n",
       "4521   0.051223\n",
       "...         ...\n",
       "5734   0.230517\n",
       "5191   0.068696\n",
       "5390   0.208055\n",
       "860    0.083262\n",
       "7270   0.230403\n",
       "\n",
       "[10000 rows x 1 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Inspect the column returned for col1 under the retn root category\n",
    "\n",
    "train[['col1_retn']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col2_ord3_mnmx</th>\n",
       "      <th>col2_retn</th>\n",
       "      <th>col2_1010_0</th>\n",
       "      <th>col2_1010_1</th>\n",
       "      <th>col2_1010_2</th>\n",
       "      <th>col2_1010_3</th>\n",
       "      <th>col2_ordl</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6252</th>\n",
       "      <td>0.222222</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4684</th>\n",
       "      <td>0.555556</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1731</th>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4742</th>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4521</th>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5734</th>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5191</th>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5390</th>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>860</th>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7270</th>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      col2_ord3_mnmx  col2_retn  col2_1010_0  col2_1010_1  col2_1010_2  \\\n",
       "6252        0.222222   0.333333            0            0            1   \n",
       "4684        0.555556   0.888889            1            0            0   \n",
       "1731        0.111111   0.111111            0            0            0   \n",
       "4742        0.777778   0.555556            0            1            0   \n",
       "4521        0.888889   0.444444            0            1            0   \n",
       "...              ...        ...          ...          ...          ...   \n",
       "5734        0.333333   0.777778            0            1            1   \n",
       "5191        0.666667   0.000000            0            0            0   \n",
       "5390        0.777778   0.555556            0            1            0   \n",
       "860         0.333333   0.777778            0            1            1   \n",
       "7270        0.111111   0.111111            0            0            0   \n",
       "\n",
       "      col2_1010_3  col2_ordl  \n",
       "6252            1          3  \n",
       "4684            0          8  \n",
       "1731            1          1  \n",
       "4742            1          5  \n",
       "4521            0          4  \n",
       "...           ...        ...  \n",
       "5734            1          7  \n",
       "5191            0          0  \n",
       "5390            1          5  \n",
       "860             1          7  \n",
       "7270            1          1  \n",
       "\n",
       "[10000 rows x 7 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train[['col2_ord3_mnmx', 'col2_retn', 'col2_1010_0', 'col2_1010_1', \\\n",
    "       'col2_1010_2', 'col2_1010_3', 'col2_ordl']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# C - Custom Family Trees"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Custom-defined family trees of transformations can be passed to a function call by way of the transformdict and processdict parameters. The transformdict parameter is used to populate a family tree, and the processdict parameter is used to populate a supporting data structure for new transformation categories.\n",
    "\n",
    "Let's demonstrate a scenario to assemble a transformation set in which a normalization is supplemented by two types of bin aggregation. We'll create a new root category `newt` and populate it with transformations pre-defined in the library. Here we'll apply an upstream retain normalization and power of ten bins, and standard deviation bins downstream of the retain normalization. We'll also include a NArw transformation which designates markers for entries that were subject to infill based on values of the source column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "transformdict =  {'newt' : {'parents'       : ['newt'],\n",
    "                            'siblings'      : [],\n",
    "                            'auntsuncles'   : ['pwr2'],\n",
    "                            'cousins'       : ['NArw'],\n",
    "                            'children'      : [],\n",
    "                            'niecesnephews' : [],\n",
    "                            'coworkers'     : [],\n",
    "                            'friends'       : ['bins']}}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The corresponding processdict will make use of transformation functions defined in the library for the retain normalization. Here NArowtype designates the types of entries from the source column that will be targets for infill, MLinfilltype designates the types of predictive models to be trained for ML infill, and labelctgy is a support entry for feature importance for cases where a label is returned in multiple configurations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "processdict    =  {'newt' : {'functionpointer' : 'retn', \\\n",
    "                                 'NArowtype' : 'numeric',\n",
    "                                 'MLinfilltype' : 'numeric',\n",
    "                                 'labelctgy' : 'newt'}}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can then pass these populated structures to a function call and assign a column with header `col1` to the newly defined root category `newt`. If we want, we can also apply ML infill even on this custom-defined transformation set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Automunge processing\n",
      "\n",
      "evaluating column:  1\n",
      "processing column:  1\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['1_nmbr']\n",
      "\n",
      "evaluating column:  2\n",
      "processing column:  2\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['2_nmbr']\n",
      "\n",
      "evaluating column:  3\n",
      "processing column:  3\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['3_nmbr']\n",
      "\n",
      "evaluating column:  4\n",
      "processing column:  4\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['4_nmbr']\n",
      "\n",
      "evaluating column:  5\n",
      "processing column:  5\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['5_nmbr']\n",
      "\n",
      "evaluating column:  6\n",
      "processing column:  6\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['6_nmbr']\n",
      "\n",
      "evaluating column:  7\n",
      "processing column:  7\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['7_nmbr']\n",
      "\n",
      "evaluating column:  8\n",
      "processing column:  8\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['8_nmbr']\n",
      "\n",
      "evaluating column:  9\n",
      "processing column:  9\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['9_0.0', '9_1.0865380764007568', '9_2.1730761528015137']\n",
      "\n",
      "evaluating column:  10\n",
      "processing column:  10\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['10_nmbr']\n",
      "\n",
      "evaluating column:  11\n",
      "processing column:  11\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['11_nmbr']\n",
      "\n",
      "evaluating column:  12\n",
      "processing column:  12\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['12_nmbr']\n",
      "\n",
      "evaluating column:  13\n",
      "processing column:  13\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['13_0.0', '13_1.1074360609054563', '13_2.2148721218109126']\n",
      "\n",
      "evaluating column:  14\n",
      "processing column:  14\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['14_nmbr']\n",
      "\n",
      "evaluating column:  15\n",
      "processing column:  15\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['15_nmbr']\n",
      "\n",
      "evaluating column:  16\n",
      "processing column:  16\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['16_nmbr']\n",
      "\n",
      "evaluating column:  17\n",
      "processing column:  17\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['17_0.0', '17_1.2741122245788574', '17_2.548224449157715']\n",
      "\n",
      "evaluating column:  18\n",
      "processing column:  18\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['18_nmbr']\n",
      "\n",
      "evaluating column:  19\n",
      "processing column:  19\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['19_nmbr']\n",
      "\n",
      "evaluating column:  20\n",
      "processing column:  20\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['20_nmbr']\n",
      "\n",
      "evaluating column:  21\n",
      "processing column:  21\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['21_0.0', '21_1.5509806871414182', '21_3.1019613742828365']\n",
      "\n",
      "evaluating column:  22\n",
      "processing column:  22\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['22_nmbr']\n",
      "\n",
      "evaluating column:  23\n",
      "processing column:  23\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['23_nmbr']\n",
      "\n",
      "evaluating column:  24\n",
      "processing column:  24\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['24_nmbr']\n",
      "\n",
      "evaluating column:  25\n",
      "processing column:  25\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['25_nmbr']\n",
      "\n",
      "evaluating column:  26\n",
      "processing column:  26\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['26_nmbr']\n",
      "\n",
      "evaluating column:  27\n",
      "processing column:  27\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['27_nmbr']\n",
      "\n",
      "evaluating column:  28\n",
      "processing column:  28\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['28_nmbr']\n",
      "\n",
      "evaluating column:  col1\n",
      "processing column:  col1\n",
      "    root category:  newt\n",
      " returned columns:\n",
      "['col1_NArw', 'col1_retn', 'col1_retn_bins_0', 'col1_retn_bins_1', 'col1_retn_bins_2', 'col1_retn_bins_3', 'col1_retn_bins_4', 'col1_retn_bins_5', 'col1_10^-1', 'col1_10^0']\n",
      "\n",
      "evaluating column:  col2\n",
      "processing column:  col2\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['col2_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  1_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  2_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  3_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  4_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  5_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  6_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  7_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  8_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  10_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  11_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  12_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  14_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  15_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  16_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  18_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  19_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  20_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  22_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  23_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  24_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  25_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  26_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  27_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  28_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  col2_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_1.0865380764007568\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_2.1730761528015137\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_1.1074360609054563\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_2.2148721218109126\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_1.2741122245788574\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_2.548224449157715\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_1.5509806871414182\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_3.1019613742828365\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  col1_retn\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_retn_bins_0\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_retn_bins_1\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_retn_bins_2\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_retn_bins_3\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_retn_bins_4\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_retn_bins_5\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_10^-1\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_10^0\n",
      "     infill type: MLinfill\n",
      "\n",
      "______\n",
      "\n",
      "versioning serial stamp:\n",
      "_5.22_501861149523_2020-11-15T14:11:21.084818\n",
      "\n",
      "Automunge returned ID column set: \n",
      "['Automunge_index_501861149523']\n",
      "\n",
      "Automunge returned train column set: \n",
      "['1_nmbr', '2_nmbr', '3_nmbr', '4_nmbr', '5_nmbr', '6_nmbr', '7_nmbr', '8_nmbr', '10_nmbr', '11_nmbr', '12_nmbr', '14_nmbr', '15_nmbr', '16_nmbr', '18_nmbr', '19_nmbr', '20_nmbr', '22_nmbr', '23_nmbr', '24_nmbr', '25_nmbr', '26_nmbr', '27_nmbr', '28_nmbr', 'col2_nmbr', '9_0.0', '9_1.0865380764007568', '9_2.1730761528015137', '13_0.0', '13_1.1074360609054563', '13_2.2148721218109126', '17_0.0', '17_1.2741122245788574', '17_2.548224449157715', '21_0.0', '21_1.5509806871414182', '21_3.1019613742828365', 'col1_NArw', 'col1_retn', 'col1_retn_bins_0', 'col1_retn_bins_1', 'col1_retn_bins_2', 'col1_retn_bins_3', 'col1_retn_bins_4', 'col1_retn_bins_5', 'col1_10^-1', 'col1_10^0']\n",
      "\n",
      "_______________\n",
      "Automunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#here we'll set the output as pandas to easier view the results\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict \\\n",
    "= am.automunge(df_train, \n",
    "  assigncat = {'newt':['col1']}, \n",
    "  assigninfill = {'MLinfill':['col1']}, \n",
    "  transformdict = transformdict, processdict = processdict,\n",
    "  pandasoutput=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When it comes time to process additional data, all of these customizations will be saved in the returned postprocess\\_dict, which we can then pass to the postmunge(.) function for consistent processing of a test data set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test = df_train.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Postmunge processing\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  1\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['1_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  2\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['2_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  3\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['3_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  4\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['4_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  5\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['5_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  6\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['6_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  7\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['7_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  8\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['8_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  9\n",
      "    root category:  text\n",
      "\n",
      " returned columns:\n",
      "['9_0.0', '9_1.0865380764007568', '9_2.1730761528015137']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  10\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['10_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  11\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['11_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  12\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['12_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  13\n",
      "    root category:  text\n",
      "\n",
      " returned columns:\n",
      "['13_0.0', '13_1.1074360609054563', '13_2.2148721218109126']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  14\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['14_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  15\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['15_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  16\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['16_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  17\n",
      "    root category:  text\n",
      "\n",
      " returned columns:\n",
      "['17_0.0', '17_1.2741122245788574', '17_2.548224449157715']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  18\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['18_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  19\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['19_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  20\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['20_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  21\n",
      "    root category:  text\n",
      "\n",
      " returned columns:\n",
      "['21_0.0', '21_1.5509806871414182', '21_3.1019613742828365']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  22\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['22_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  23\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['23_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  24\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['24_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  25\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['25_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  26\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['26_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  27\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['27_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  28\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['28_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  col1\n",
      "    root category:  newt\n",
      "\n",
      " returned columns:\n",
      "['col1_NArw', 'col1_retn', 'col1_retn_bins_0', 'col1_retn_bins_1', 'col1_retn_bins_2', 'col1_retn_bins_3', 'col1_retn_bins_4', 'col1_retn_bins_5', 'col1_10^-1', 'col1_10^0']\n",
      "\n",
      "______\n",
      "\n",
      "processing column:  col2\n",
      "    root category:  nmbr\n",
      "\n",
      " returned columns:\n",
      "['col2_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  1_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  2_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  3_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  4_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  5_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  6_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  7_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  8_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  10_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  11_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  12_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  14_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  15_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  16_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  18_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  19_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  20_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  22_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  23_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  24_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  25_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  26_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  27_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  28_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  col2_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_1.0865380764007568\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_2.1730761528015137\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_1.1074360609054563\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_2.2148721218109126\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_1.2741122245788574\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_2.548224449157715\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_1.5509806871414182\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_3.1019613742828365\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  col1_retn\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_retn_bins_0\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_retn_bins_1\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_retn_bins_2\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_retn_bins_3\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_retn_bins_4\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_retn_bins_5\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_10^-1\n",
      "     infill type: MLinfill\n",
      "\n",
      "infill to column:  col1_10^0\n",
      "     infill type: MLinfill\n",
      "\n",
      "_______________\n",
      "Postmunge returned ID column set: \n",
      "['Automunge_index_501861149523']\n",
      "\n",
      "Postmunge returned test column set: \n",
      "['1_nmbr', '2_nmbr', '3_nmbr', '4_nmbr', '5_nmbr', '6_nmbr', '7_nmbr', '8_nmbr', '10_nmbr', '11_nmbr', '12_nmbr', '14_nmbr', '15_nmbr', '16_nmbr', '18_nmbr', '19_nmbr', '20_nmbr', '22_nmbr', '23_nmbr', '24_nmbr', '25_nmbr', '26_nmbr', '27_nmbr', '28_nmbr', 'col2_nmbr', '9_0.0', '9_1.0865380764007568', '9_2.1730761528015137', '13_0.0', '13_1.1074360609054563', '13_2.2148721218109126', '17_0.0', '17_1.2741122245788574', '17_2.548224449157715', '21_0.0', '21_1.5509806871414182', '21_3.1019613742828365', 'col1_NArw', 'col1_retn', 'col1_retn_bins_0', 'col1_retn_bins_1', 'col1_retn_bins_2', 'col1_retn_bins_3', 'col1_retn_bins_4', 'col1_retn_bins_5', 'col1_10^-1', 'col1_10^0']\n",
      "\n",
      "_______________\n",
      "Postmunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, postreports_dict = \\\n",
    "am.postmunge(postprocess_dict, df_test, pandasoutput=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1_NArw</th>\n",
       "      <th>col1_retn</th>\n",
       "      <th>col1_retn_bins_0</th>\n",
       "      <th>col1_retn_bins_1</th>\n",
       "      <th>col1_retn_bins_2</th>\n",
       "      <th>col1_retn_bins_3</th>\n",
       "      <th>col1_retn_bins_4</th>\n",
       "      <th>col1_retn_bins_5</th>\n",
       "      <th>col1_10^-1</th>\n",
       "      <th>col1_10^0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6252</th>\n",
       "      <td>0</td>\n",
       "      <td>0.056924</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4684</th>\n",
       "      <td>0</td>\n",
       "      <td>0.054757</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1731</th>\n",
       "      <td>0</td>\n",
       "      <td>0.194145</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4742</th>\n",
       "      <td>0</td>\n",
       "      <td>0.081438</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4521</th>\n",
       "      <td>0</td>\n",
       "      <td>0.051223</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5734</th>\n",
       "      <td>0</td>\n",
       "      <td>0.230517</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5191</th>\n",
       "      <td>0</td>\n",
       "      <td>0.068696</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5390</th>\n",
       "      <td>0</td>\n",
       "      <td>0.208055</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>860</th>\n",
       "      <td>0</td>\n",
       "      <td>0.083262</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7270</th>\n",
       "      <td>0</td>\n",
       "      <td>0.230403</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      col1_NArw  col1_retn  col1_retn_bins_0  col1_retn_bins_1  \\\n",
       "6252          0   0.056924                 0                 0   \n",
       "4684          0   0.054757                 0                 0   \n",
       "1731          0   0.194145                 0                 0   \n",
       "4742          0   0.081438                 0                 0   \n",
       "4521          0   0.051223                 0                 0   \n",
       "...         ...        ...               ...               ...   \n",
       "5734          0   0.230517                 0                 0   \n",
       "5191          0   0.068696                 0                 0   \n",
       "5390          0   0.208055                 0                 0   \n",
       "860           0   0.083262                 0                 0   \n",
       "7270          0   0.230403                 0                 0   \n",
       "\n",
       "      col1_retn_bins_2  col1_retn_bins_3  col1_retn_bins_4  col1_retn_bins_5  \\\n",
       "6252                 1                 0                 0                 0   \n",
       "4684                 1                 0                 0                 0   \n",
       "1731                 0                 1                 0                 0   \n",
       "4742                 1                 0                 0                 0   \n",
       "4521                 1                 0                 0                 0   \n",
       "...                ...               ...               ...               ...   \n",
       "5734                 0                 0                 1                 0   \n",
       "5191                 1                 0                 0                 0   \n",
       "5390                 0                 0                 1                 0   \n",
       "860                  1                 0                 0                 0   \n",
       "7270                 0                 0                 1                 0   \n",
       "\n",
       "      col1_10^-1  col1_10^0  \n",
       "6252           1          0  \n",
       "4684           1          0  \n",
       "1731           0          1  \n",
       "4742           1          0  \n",
       "4521           1          0  \n",
       "...          ...        ...  \n",
       "5734           0          1  \n",
       "5191           1          0  \n",
       "5390           0          1  \n",
       "860            1          0  \n",
       "7270           0          1  \n",
       "\n",
       "[10000 rows x 10 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#here are the returned results of our custom family tree\n",
    "train[postprocess_dict['column_map']['col1']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# D - Data Augmentation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The recommended workflow for applying training data augmentation is to assign any desired target columns to DP transforms in assigncat in an automunge(.) call, passing any parameters to assignparam to deviate on default noise distributions if desired, adn then process the same set again in postmunge(.) without noise injection, concatinating the two results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from Automunge import Automunger\n",
    "am = Automunger.AutoMunge()\n",
    "\n",
    "df_train = pd.DataFrame({'number1':[-4,-3,-2,-1,0,1,2,3,4,5], \n",
    "                         'number2':[5,4,3,2,1,0,-1,-2,-3,-4],\n",
    "                         'categoric1':['a','b','a','b','c','a','b','a','b','c'],\n",
    "                         'categoric2':['x','y','x','y','z','x','y','x','y','z']})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#for the example of a recieved train set df_train \n",
    "#with column headers of numeric sets 'number1', 'number2'\n",
    "#and column headers of categoric sets 'categoric1', 'categoric2'\n",
    "#we can elect to apply noise injections to these \n",
    "#by assigning DP transforms in assigncat\n",
    "\n",
    "assigncat = \\\n",
    "{'DPrt':['number1', 'number2'], \n",
    " 'DP10':['categoric1', 'categoric2']}\n",
    "\n",
    "#the noise injection transfrms have default entries noted in READ ME\n",
    "#for noise ratios and distribution parameters\n",
    "#if we want to overwrite for the transformation category globally\n",
    "\n",
    "assignparam = \\\n",
    "{'default_assignparam' : \n",
    "    {'DPrt' : {'sigma' : 0.05, 'flip_prob' : 0.5}, \n",
    "     'DP10' : {'flip_prob' : 0.5}\n",
    "    }\n",
    "}\n",
    "\n",
    "#or overwrite for a specific column\n",
    "#here we demostrate applying scaled Laplace distributed noise instead of Gaussian\n",
    "assignparam.update(\n",
    "  {'DPrt' : {'number2' : {'noisedistribution' : 'laplace'}}}\n",
    ")\n",
    " \n",
    "#We can then process our train set in automunge(.) with noise injection\n",
    " \n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict \\\n",
    "= am.automunge(df_train, \n",
    "                    assigncat = assigncat, \n",
    "                    assignparam = assignparam, \n",
    "                    pandasoutput = True,\n",
    "                    shuffletrain = False,\n",
    "                    printstatus = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>number1_DPrt</th>\n",
       "      <th>number2_DPrt</th>\n",
       "      <th>categoric1_ord3_DPod_1010_0</th>\n",
       "      <th>categoric1_ord3_DPod_1010_1</th>\n",
       "      <th>categoric2_ord3_DPod_1010_0</th>\n",
       "      <th>categoric2_ord3_DPod_1010_1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.444444</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.342452</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.222222</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.111111</td>\n",
       "      <td>0.222222</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.042429</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.012196</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.272652</td>\n",
       "      <td>-0.111111</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0.338203</td>\n",
       "      <td>-0.222222</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.445698</td>\n",
       "      <td>-0.333333</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.555556</td>\n",
       "      <td>-0.444444</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   number1_DPrt  number2_DPrt  categoric1_ord3_DPod_1010_0  \\\n",
       "0     -0.444444      0.555556                            0   \n",
       "1     -0.342452      0.444444                            0   \n",
       "2     -0.222222      0.333333                            0   \n",
       "3     -0.111111      0.222222                            1   \n",
       "4     -0.042429      0.111111                            1   \n",
       "5      0.111111      0.012196                            0   \n",
       "6      0.272652     -0.111111                            0   \n",
       "7      0.338203     -0.222222                            0   \n",
       "8      0.445698     -0.333333                            0   \n",
       "9      0.555556     -0.444444                            1   \n",
       "\n",
       "   categoric1_ord3_DPod_1010_1  categoric2_ord3_DPod_1010_0  \\\n",
       "0                            0                            0   \n",
       "1                            1                            0   \n",
       "2                            0                            0   \n",
       "3                            0                            0   \n",
       "4                            0                            1   \n",
       "5                            0                            0   \n",
       "6                            1                            0   \n",
       "7                            0                            0   \n",
       "8                            1                            0   \n",
       "9                            0                            1   \n",
       "\n",
       "   categoric2_ord3_DPod_1010_1  \n",
       "0                            0  \n",
       "1                            1  \n",
       "2                            0  \n",
       "3                            1  \n",
       "4                            0  \n",
       "5                            0  \n",
       "6                            1  \n",
       "7                            0  \n",
       "8                            1  \n",
       "9                            0  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The convention for noise injection transforms in the DP family is that noise is injected to train sets by default and not to test sets unless signaled by the traindata parameter in postmunge(.). Test sets passed to automunge(.) do not receive injections.\n",
    "\n",
    "Once the training data set has been processed with noise, the same data set can be processed again in the postmunge(.) function and concatenated, using the postprocess\\_dict dictionary returned from the corresponding automunge(.) call. The postmunge accepts a parameter traindata which signals to the DP whether to inject noise in postmunge(.). traindata is a Boolean defaulting to False for no noise injection. When set as True postmunge(.) will inject noise. Note that each application will inject a unique randomization of noise that is independent of the randomseed parameter although consistent with any distribution parameters. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, postreports_dict = \\\n",
    "am.postmunge(postprocess_dict, df_train, \n",
    "                 pandasoutput = True, \n",
    "                 traindata = False,\n",
    "                 printstatus = False)\n",
    "\n",
    "#we can then concatinate the results for the train data, ID sets, and label sets\n",
    "#to double the number of samples to include a set with and without noise\n",
    "train   = pd.concat([train, test], axis=0, ignore_index=True)\n",
    "trainID = pd.concat([trainID, testID], axis=0, ignore_index=True)\n",
    "labels  = pd.concat([labels,testlabels], axis=0, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>number1_DPrt</th>\n",
       "      <th>number2_DPrt</th>\n",
       "      <th>categoric1_ord3_DPod_1010_0</th>\n",
       "      <th>categoric1_ord3_DPod_1010_1</th>\n",
       "      <th>categoric2_ord3_DPod_1010_0</th>\n",
       "      <th>categoric2_ord3_DPod_1010_1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.444444</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.342452</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.222222</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.111111</td>\n",
       "      <td>0.222222</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.042429</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.012196</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.272652</td>\n",
       "      <td>-0.111111</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0.338203</td>\n",
       "      <td>-0.222222</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.445698</td>\n",
       "      <td>-0.333333</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.555556</td>\n",
       "      <td>-0.444444</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>-0.444444</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>-0.333333</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>-0.222222</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>-0.111111</td>\n",
       "      <td>0.222222</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>0.222222</td>\n",
       "      <td>-0.111111</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>0.333333</td>\n",
       "      <td>-0.222222</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>0.444444</td>\n",
       "      <td>-0.333333</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>0.555556</td>\n",
       "      <td>-0.444444</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    number1_DPrt  number2_DPrt  categoric1_ord3_DPod_1010_0  \\\n",
       "0      -0.444444      0.555556                            0   \n",
       "1      -0.342452      0.444444                            0   \n",
       "2      -0.222222      0.333333                            0   \n",
       "3      -0.111111      0.222222                            1   \n",
       "4      -0.042429      0.111111                            1   \n",
       "5       0.111111      0.012196                            0   \n",
       "6       0.272652     -0.111111                            0   \n",
       "7       0.338203     -0.222222                            0   \n",
       "8       0.445698     -0.333333                            0   \n",
       "9       0.555556     -0.444444                            1   \n",
       "10     -0.444444      0.555556                            0   \n",
       "11     -0.333333      0.444444                            0   \n",
       "12     -0.222222      0.333333                            0   \n",
       "13     -0.111111      0.222222                            0   \n",
       "14      0.000000      0.111111                            1   \n",
       "15      0.111111      0.000000                            0   \n",
       "16      0.222222     -0.111111                            0   \n",
       "17      0.333333     -0.222222                            0   \n",
       "18      0.444444     -0.333333                            0   \n",
       "19      0.555556     -0.444444                            1   \n",
       "\n",
       "    categoric1_ord3_DPod_1010_1  categoric2_ord3_DPod_1010_0  \\\n",
       "0                             0                            0   \n",
       "1                             1                            0   \n",
       "2                             0                            0   \n",
       "3                             0                            0   \n",
       "4                             0                            1   \n",
       "5                             0                            0   \n",
       "6                             1                            0   \n",
       "7                             0                            0   \n",
       "8                             1                            0   \n",
       "9                             0                            1   \n",
       "10                            0                            0   \n",
       "11                            1                            0   \n",
       "12                            0                            0   \n",
       "13                            1                            0   \n",
       "14                            0                            1   \n",
       "15                            0                            0   \n",
       "16                            1                            0   \n",
       "17                            0                            0   \n",
       "18                            1                            0   \n",
       "19                            0                            1   \n",
       "\n",
       "    categoric2_ord3_DPod_1010_1  \n",
       "0                             0  \n",
       "1                             1  \n",
       "2                             0  \n",
       "3                             1  \n",
       "4                             0  \n",
       "5                             0  \n",
       "6                             1  \n",
       "7                             0  \n",
       "8                             1  \n",
       "9                             0  \n",
       "10                            0  \n",
       "11                            1  \n",
       "12                            0  \n",
       "13                            1  \n",
       "14                            0  \n",
       "15                            0  \n",
       "16                            1  \n",
       "17                            0  \n",
       "18                            1  \n",
       "19                            0  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that while our experiments only doubled the number of samples to include a set with and without noise injection, further multiples of noise injected samples may also be applied by processing in postmunge(.) with traindata=True. \n",
    "\n",
    "In an alternate workflow, if only a doubling of the number of samples is desired, the processing can all be performed in a single automunge(.) call by passing the same df\\_train set as both the train and test sets as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict \\\n",
    "= am.automunge(df_train, \n",
    "                    df_test = df_train, \n",
    "                    assigncat = assigncat, \n",
    "                    assignparam = assignparam, \n",
    "                    pandasoutput = True,\n",
    "                    shuffletrain = False,\n",
    "                    printstatus = False)\n",
    "                    \n",
    "train   = pd.concat([train, test], axis=0, ignore_index=True)\n",
    "trainID = pd.concat([trainID, testID], axis=0, ignore_index=True)\n",
    "labels  = pd.concat([labels,testlabels], axis=0, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>number1_DPrt</th>\n",
       "      <th>number2_DPrt</th>\n",
       "      <th>categoric1_ord3_DPod_1010_0</th>\n",
       "      <th>categoric1_ord3_DPod_1010_1</th>\n",
       "      <th>categoric2_ord3_DPod_1010_0</th>\n",
       "      <th>categoric2_ord3_DPod_1010_1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.444444</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.333722</td>\n",
       "      <td>0.440173</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.222222</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.122941</td>\n",
       "      <td>0.276445</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.033201</td>\n",
       "      <td>0.190816</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.054851</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.222222</td>\n",
       "      <td>-0.111111</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0.340873</td>\n",
       "      <td>-0.140415</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.444444</td>\n",
       "      <td>-0.335978</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.495258</td>\n",
       "      <td>-0.444444</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>-0.444444</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>-0.333333</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>-0.222222</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>-0.111111</td>\n",
       "      <td>0.222222</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>0.222222</td>\n",
       "      <td>-0.111111</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>0.333333</td>\n",
       "      <td>-0.222222</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>0.444444</td>\n",
       "      <td>-0.333333</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>0.555556</td>\n",
       "      <td>-0.444444</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    number1_DPrt  number2_DPrt  categoric1_ord3_DPod_1010_0  \\\n",
       "0      -0.444444      0.555556                            0   \n",
       "1      -0.333722      0.440173                            0   \n",
       "2      -0.222222      0.333333                            0   \n",
       "3      -0.122941      0.276445                            0   \n",
       "4       0.033201      0.190816                            1   \n",
       "5       0.054851      0.000000                            0   \n",
       "6       0.222222     -0.111111                            0   \n",
       "7       0.340873     -0.140415                            0   \n",
       "8       0.444444     -0.335978                            0   \n",
       "9       0.495258     -0.444444                            1   \n",
       "10     -0.444444      0.555556                            0   \n",
       "11     -0.333333      0.444444                            0   \n",
       "12     -0.222222      0.333333                            0   \n",
       "13     -0.111111      0.222222                            0   \n",
       "14      0.000000      0.111111                            1   \n",
       "15      0.111111      0.000000                            0   \n",
       "16      0.222222     -0.111111                            0   \n",
       "17      0.333333     -0.222222                            0   \n",
       "18      0.444444     -0.333333                            0   \n",
       "19      0.555556     -0.444444                            1   \n",
       "\n",
       "    categoric1_ord3_DPod_1010_1  categoric2_ord3_DPod_1010_0  \\\n",
       "0                             0                            0   \n",
       "1                             1                            0   \n",
       "2                             0                            0   \n",
       "3                             1                            0   \n",
       "4                             0                            1   \n",
       "5                             0                            0   \n",
       "6                             1                            0   \n",
       "7                             0                            0   \n",
       "8                             1                            0   \n",
       "9                             0                            1   \n",
       "10                            0                            0   \n",
       "11                            1                            0   \n",
       "12                            0                            0   \n",
       "13                            1                            0   \n",
       "14                            0                            1   \n",
       "15                            0                            0   \n",
       "16                            1                            0   \n",
       "17                            0                            0   \n",
       "18                            1                            0   \n",
       "19                            0                            1   \n",
       "\n",
       "    categoric2_ord3_DPod_1010_1  \n",
       "0                             0  \n",
       "1                             1  \n",
       "2                             0  \n",
       "3                             1  \n",
       "4                             0  \n",
       "5                             1  \n",
       "6                             1  \n",
       "7                             0  \n",
       "8                             1  \n",
       "9                             0  \n",
       "10                            0  \n",
       "11                            1  \n",
       "12                            0  \n",
       "13                            1  \n",
       "14                            0  \n",
       "15                            0  \n",
       "16                            1  \n",
       "17                            0  \n",
       "18                            1  \n",
       "19                            0  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# E - Column types of returned data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The data returned from an automunge(.) call is intended to be suitable for direct application of machine learning in the framework of a user's choice. In some cases, downstream machine learning libraries may accept as input designations for the types of data found in a column, such as for instance if a column contains a numeric or categoric set. These types of designations may be used, for instance, to determine whether to apply an entity embedding layer.\n",
    "\n",
    "The automunge(.) function thus returns a report of data types for returned columns, available in the returned dictionary as postprocess\\_dict['columntype\\_report'].\n",
    "\n",
    "The report lists column headers of the returned columns, aggregated by different types of column contents. Specifically, it lists column headers for column content types:\n",
    "\n",
    "- continuous\n",
    "- boolean\n",
    "- ordinal\n",
    "- onehot\n",
    "- onehot\\_sets\n",
    "- binary\n",
    "- binary\\_sets\n",
    "- passthrough"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "continuous\n",
      "['number1_DPrt', 'number2_DPrt']\n",
      "\n",
      "boolean\n",
      "[]\n",
      "\n",
      "ordinal\n",
      "[]\n",
      "\n",
      "onehot\n",
      "[]\n",
      "\n",
      "onehot_sets\n",
      "[]\n",
      "\n",
      "binary\n",
      "['categoric1_ord3_DPod_1010_0', 'categoric1_ord3_DPod_1010_1', 'categoric2_ord3_DPod_1010_0', 'categoric2_ord3_DPod_1010_1']\n",
      "\n",
      "binary_sets\n",
      "[['categoric1_ord3_DPod_1010_0', 'categoric1_ord3_DPod_1010_1'], ['categoric2_ord3_DPod_1010_0', 'categoric2_ord3_DPod_1010_1']]\n",
      "\n",
      "passthrough\n",
      "[]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#walk the column type report, printing each content type\n",
    "#followed by the list of returned column headers of that type\n",
    "for content_type, headers in postprocess_dict['columntype_report'].items():\n",
    "  print(content_type)\n",
    "  print(headers)\n",
    "  print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# F - Inversion"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The Automunge library includes an inversion option to recover the form of data as it was prior to transformations. This type of operation might be useful, for instance, in converting ML predictions back to the original form of labels, or otherwise for recovering the original form of transformed data. The inversion operation is performed in the postmunge(.) function by activating the inversion parameter. The method relies on data properties stored in the postprocess\\_dict which was returned from the corresponding automunge(.) call. For cases where a source column is included in multiple configurations, the inversion operation relies on a heuristic of selecting the shortest path of transformations with full information retention.\n",
    "\n",
    "As an example of an inversion operation, if we want to recover the form of labels after an ML prediction process, we can pass those predictions as a dataframe with column headers consistent with our processed labels as:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Automunge processing\n",
      "\n",
      "evaluating column:  1\n",
      "processing column:  1\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['1_nmbr']\n",
      "\n",
      "evaluating column:  2\n",
      "processing column:  2\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['2_nmbr']\n",
      "\n",
      "evaluating column:  3\n",
      "processing column:  3\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['3_nmbr']\n",
      "\n",
      "evaluating column:  4\n",
      "processing column:  4\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['4_nmbr']\n",
      "\n",
      "evaluating column:  5\n",
      "processing column:  5\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['5_nmbr']\n",
      "\n",
      "evaluating column:  6\n",
      "processing column:  6\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['6_nmbr']\n",
      "\n",
      "evaluating column:  7\n",
      "processing column:  7\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['7_nmbr']\n",
      "\n",
      "evaluating column:  8\n",
      "processing column:  8\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['8_nmbr']\n",
      "\n",
      "evaluating column:  9\n",
      "processing column:  9\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['9_0.0', '9_1.0865380764007568', '9_2.1730761528015137']\n",
      "\n",
      "evaluating column:  10\n",
      "processing column:  10\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['10_nmbr']\n",
      "\n",
      "evaluating column:  11\n",
      "processing column:  11\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['11_nmbr']\n",
      "\n",
      "evaluating column:  12\n",
      "processing column:  12\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['12_nmbr']\n",
      "\n",
      "evaluating column:  13\n",
      "processing column:  13\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['13_0.0', '13_1.1074360609054563', '13_2.2148721218109126']\n",
      "\n",
      "evaluating column:  14\n",
      "processing column:  14\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['14_nmbr']\n",
      "\n",
      "evaluating column:  15\n",
      "processing column:  15\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['15_nmbr']\n",
      "\n",
      "evaluating column:  16\n",
      "processing column:  16\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['16_nmbr']\n",
      "\n",
      "evaluating column:  17\n",
      "processing column:  17\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['17_0.0', '17_1.2741122245788574', '17_2.548224449157715']\n",
      "\n",
      "evaluating column:  18\n",
      "processing column:  18\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['18_nmbr']\n",
      "\n",
      "evaluating column:  19\n",
      "processing column:  19\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['19_nmbr']\n",
      "\n",
      "evaluating column:  20\n",
      "processing column:  20\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['20_nmbr']\n",
      "\n",
      "evaluating column:  21\n",
      "processing column:  21\n",
      "    root category:  text\n",
      " returned columns:\n",
      "['21_0.0', '21_1.5509806871414182', '21_3.1019613742828365']\n",
      "\n",
      "evaluating column:  22\n",
      "processing column:  22\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['22_nmbr']\n",
      "\n",
      "evaluating column:  23\n",
      "processing column:  23\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['23_nmbr']\n",
      "\n",
      "evaluating column:  24\n",
      "processing column:  24\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['24_nmbr']\n",
      "\n",
      "evaluating column:  25\n",
      "processing column:  25\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['25_nmbr']\n",
      "\n",
      "evaluating column:  26\n",
      "processing column:  26\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['26_nmbr']\n",
      "\n",
      "evaluating column:  27\n",
      "processing column:  27\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['27_nmbr']\n",
      "\n",
      "evaluating column:  28\n",
      "processing column:  28\n",
      "    root category:  nmbr\n",
      " returned columns:\n",
      "['28_nmbr']\n",
      "\n",
      "______\n",
      "\n",
      "evaluating label column:  0\n",
      "processing label column:  0\n",
      "    root label category:  lbbn\n",
      "\n",
      " returned columns:\n",
      "['0_0.0', '0_1.0']\n",
      "\n",
      "______\n",
      "\n",
      "infill to column:  1_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  2_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  3_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  4_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  5_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  6_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  7_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  8_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  10_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  11_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  12_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  14_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  15_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  16_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  18_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  19_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  20_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  22_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  23_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  24_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  25_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  26_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  27_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  28_nmbr\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_1.0865380764007568\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  9_2.1730761528015137\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_1.1074360609054563\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  13_2.2148721218109126\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_1.2741122245788574\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  17_2.548224449157715\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_0.0\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_1.5509806871414182\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "infill to column:  21_3.1019613742828365\n",
      "     infill type: stdrdinfill\n",
      "\n",
      "______\n",
      "\n",
      "versioning serial stamp:\n",
      "_5.22_722779570711_2020-11-15T14:11:45.768424\n",
      "\n",
      "Automunge returned ID column set: \n",
      "['Automunge_index_722779570711']\n",
      "\n",
      "Automunge returned train column set: \n",
      "['1_nmbr', '2_nmbr', '3_nmbr', '4_nmbr', '5_nmbr', '6_nmbr', '7_nmbr', '8_nmbr', '10_nmbr', '11_nmbr', '12_nmbr', '14_nmbr', '15_nmbr', '16_nmbr', '18_nmbr', '19_nmbr', '20_nmbr', '22_nmbr', '23_nmbr', '24_nmbr', '25_nmbr', '26_nmbr', '27_nmbr', '28_nmbr', '9_0.0', '9_1.0865380764007568', '9_2.1730761528015137', '13_0.0', '13_1.1074360609054563', '13_2.2148721218109126', '17_0.0', '17_1.2741122245788574', '17_2.548224449157715', '21_0.0', '21_1.5509806871414182', '21_3.1019613742828365']\n",
      "\n",
      "Automunge returned label column set: \n",
      "['0_0.0', '0_1.0']\n",
      "\n",
      "_______________\n",
      "Automunge Complete\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Inverting labels requires a processed label column, so reload the\n",
    "#full data set (label column '0' included) and call automunge(.) again,\n",
    "#this time designating the labels via the labels_column parameter\n",
    "\n",
    "df_train = pd.read_csv('Higgs_data_partial.csv')\n",
    "\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df_train,\n",
    "    shuffletrain=False,\n",
    "    labels_column='0',\n",
    "    pandasoutput=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0_0.0</th>\n",
       "      <th>0_1.0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      0_0.0  0_1.0\n",
       "0         0      1\n",
       "1         0      1\n",
       "2         0      1\n",
       "3         1      0\n",
       "4         0      1\n",
       "...     ...    ...\n",
       "9995      0      1\n",
       "9996      1      0\n",
       "9997      1      0\n",
       "9998      1      0\n",
       "9999      1      0\n",
       "\n",
       "[10000 rows x 2 columns]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#this returned a one-hot encoded set for the two activations\n",
    "\n",
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Postmunge processing\n",
      "\n",
      "Performing inversion recovery of original columns for label set.\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  0\n",
      "Inversion path selected based on returned column  0_0.0\n",
      "With full recovery.\n",
      "Recovered source column:  0\n",
      "\n",
      "Inversion succeeded in recovering original form for columns:\n",
      "['0']\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#to recover the original single column form of the labels we can call\n",
    "#postmunge(.) with the inversion parameter set to 'labels'\n",
    "\n",
    "(df_invert,\n",
    " recovered_list,\n",
    " inversion_info_dict) = am.postmunge(\n",
    "    postprocess_dict,\n",
    "    labels,\n",
    "    inversion='labels',\n",
    "    LabelSmoothing=False,\n",
    "    pandasoutput=True,\n",
    "    printstatus=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        0\n",
       "0     1.0\n",
       "1     1.0\n",
       "2     1.0\n",
       "3     0.0\n",
       "4     1.0\n",
       "...   ...\n",
       "9995  1.0\n",
       "9996  0.0\n",
       "9997  0.0\n",
       "9998  0.0\n",
       "9999  0.0\n",
       "\n",
       "[10000 rows x 1 columns]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#here are the recovered labels\n",
    "\n",
    "df_invert"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_______________\n",
      "Begin Postmunge processing\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  1\n",
      "Inversion path selected based on returned column  1_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  1\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  2\n",
      "Inversion path selected based on returned column  2_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  2\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  3\n",
      "Inversion path selected based on returned column  3_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  3\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  4\n",
      "Inversion path selected based on returned column  4_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  4\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  5\n",
      "Inversion path selected based on returned column  5_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  5\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  6\n",
      "Inversion path selected based on returned column  6_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  6\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  7\n",
      "Inversion path selected based on returned column  7_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  7\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  8\n",
      "Inversion path selected based on returned column  8_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  8\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  10\n",
      "Inversion path selected based on returned column  10_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  10\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  11\n",
      "Inversion path selected based on returned column  11_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  11\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  12\n",
      "Inversion path selected based on returned column  12_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  12\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  14\n",
      "Inversion path selected based on returned column  14_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  14\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  15\n",
      "Inversion path selected based on returned column  15_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  15\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  16\n",
      "Inversion path selected based on returned column  16_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  16\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  18\n",
      "Inversion path selected based on returned column  18_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  18\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  19\n",
      "Inversion path selected based on returned column  19_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  19\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  20\n",
      "Inversion path selected based on returned column  20_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  20\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  22\n",
      "Inversion path selected based on returned column  22_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  22\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  23\n",
      "Inversion path selected based on returned column  23_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  23\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  24\n",
      "Inversion path selected based on returned column  24_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  24\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  25\n",
      "Inversion path selected based on returned column  25_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  25\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  26\n",
      "Inversion path selected based on returned column  26_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  26\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  27\n",
      "Inversion path selected based on returned column  27_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  27\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  28\n",
      "Inversion path selected based on returned column  28_nmbr\n",
      "With full recovery.\n",
      "Recovered source column:  28\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  9\n",
      "Inversion path selected based on returned column  9_0.0\n",
      "With full recovery.\n",
      "Recovered source column:  9\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  13\n",
      "Inversion path selected based on returned column  13_0.0\n",
      "With full recovery.\n",
      "Recovered source column:  13\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  17\n",
      "Inversion path selected based on returned column  17_0.0\n",
      "With full recovery.\n",
      "Recovered source column:  17\n",
      "\n",
      "Evaluating inversion paths for columns derived from:  21\n",
      "Inversion path selected based on returned column  21_0.0\n",
      "With full recovery.\n",
      "Recovered source column:  21\n",
      "\n",
      "Inversion succeeded in recovering original form for columns:\n",
      "['1', '2', '3', '4', '5', '6', '7', '8', '10', '11', '12', '14', '15', '16', '18', '19', '20', '22', '23', '24', '25', '26', '27', '28', '9', '13', '17', '21']\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Similarly, if we want to recover the form of df_train prior to encodings,\n",
    "#we can apply inversion = 'test'\n",
    "\n",
    "df_invert, recovered_list, inversion_info_dict = \\\n",
    "am.postmunge(postprocess_dict, train, inversion='test', \n",
    "          pandasoutput=True, printstatus=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>10</th>\n",
       "      <th>11</th>\n",
       "      <th>...</th>\n",
       "      <th>23</th>\n",
       "      <th>24</th>\n",
       "      <th>25</th>\n",
       "      <th>26</th>\n",
       "      <th>27</th>\n",
       "      <th>28</th>\n",
       "      <th>9</th>\n",
       "      <th>13</th>\n",
       "      <th>17</th>\n",
       "      <th>21</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.869293</td>\n",
       "      <td>-0.635082</td>\n",
       "      <td>0.225690</td>\n",
       "      <td>0.327470</td>\n",
       "      <td>-0.689993</td>\n",
       "      <td>0.754202</td>\n",
       "      <td>-0.248573</td>\n",
       "      <td>-1.092064</td>\n",
       "      <td>1.374992</td>\n",
       "      <td>-0.653674</td>\n",
       "      <td>...</td>\n",
       "      <td>0.979563</td>\n",
       "      <td>0.978076</td>\n",
       "      <td>0.920005</td>\n",
       "      <td>0.721657</td>\n",
       "      <td>0.988751</td>\n",
       "      <td>0.876678</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.1074360609054563</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.1019613742828365</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.907542</td>\n",
       "      <td>0.329147</td>\n",
       "      <td>0.359412</td>\n",
       "      <td>1.497970</td>\n",
       "      <td>-0.313010</td>\n",
       "      <td>1.095531</td>\n",
       "      <td>-0.557525</td>\n",
       "      <td>-1.588230</td>\n",
       "      <td>0.812581</td>\n",
       "      <td>-0.213642</td>\n",
       "      <td>...</td>\n",
       "      <td>0.833048</td>\n",
       "      <td>0.985700</td>\n",
       "      <td>0.978098</td>\n",
       "      <td>0.779732</td>\n",
       "      <td>0.992356</td>\n",
       "      <td>0.798343</td>\n",
       "      <td>2.1730761528015137</td>\n",
       "      <td>2.2148721218109126</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.798835</td>\n",
       "      <td>1.470639</td>\n",
       "      <td>-1.635975</td>\n",
       "      <td>0.453773</td>\n",
       "      <td>0.425629</td>\n",
       "      <td>1.104875</td>\n",
       "      <td>1.282322</td>\n",
       "      <td>1.381664</td>\n",
       "      <td>0.851737</td>\n",
       "      <td>1.540659</td>\n",
       "      <td>...</td>\n",
       "      <td>1.108330</td>\n",
       "      <td>0.985692</td>\n",
       "      <td>0.951331</td>\n",
       "      <td>0.803252</td>\n",
       "      <td>0.865924</td>\n",
       "      <td>0.780118</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.2148721218109126</td>\n",
       "      <td>2.548224449157715</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.344385</td>\n",
       "      <td>-0.876626</td>\n",
       "      <td>0.935913</td>\n",
       "      <td>1.992050</td>\n",
       "      <td>0.882454</td>\n",
       "      <td>1.786066</td>\n",
       "      <td>-1.646778</td>\n",
       "      <td>-0.942383</td>\n",
       "      <td>2.423265</td>\n",
       "      <td>-0.676016</td>\n",
       "      <td>...</td>\n",
       "      <td>1.028704</td>\n",
       "      <td>0.998656</td>\n",
       "      <td>0.728281</td>\n",
       "      <td>0.869200</td>\n",
       "      <td>1.026736</td>\n",
       "      <td>0.957904</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.2148721218109126</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.105009</td>\n",
       "      <td>0.321356</td>\n",
       "      <td>1.522401</td>\n",
       "      <td>0.882808</td>\n",
       "      <td>-1.205349</td>\n",
       "      <td>0.681466</td>\n",
       "      <td>-1.070464</td>\n",
       "      <td>-0.921871</td>\n",
       "      <td>0.800872</td>\n",
       "      <td>1.020974</td>\n",
       "      <td>...</td>\n",
       "      <td>1.361057</td>\n",
       "      <td>0.986610</td>\n",
       "      <td>0.838085</td>\n",
       "      <td>1.133295</td>\n",
       "      <td>0.872245</td>\n",
       "      <td>0.808487</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.2148721218109126</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>0.743017</td>\n",
       "      <td>0.390507</td>\n",
       "      <td>-0.431368</td>\n",
       "      <td>0.734279</td>\n",
       "      <td>0.491310</td>\n",
       "      <td>1.748141</td>\n",
       "      <td>1.165475</td>\n",
       "      <td>-1.033300</td>\n",
       "      <td>1.185004</td>\n",
       "      <td>0.622711</td>\n",
       "      <td>...</td>\n",
       "      <td>1.058611</td>\n",
       "      <td>0.979336</td>\n",
       "      <td>1.028349</td>\n",
       "      <td>0.964556</td>\n",
       "      <td>0.915222</td>\n",
       "      <td>0.816399</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.2148721218109126</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.1019613742828365</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>2.778078</td>\n",
       "      <td>-0.498726</td>\n",
       "      <td>-0.236057</td>\n",
       "      <td>0.625503</td>\n",
       "      <td>0.443076</td>\n",
       "      <td>1.353130</td>\n",
       "      <td>0.014828</td>\n",
       "      <td>1.449298</td>\n",
       "      <td>0.546673</td>\n",
       "      <td>-1.020853</td>\n",
       "      <td>...</td>\n",
       "      <td>0.814733</td>\n",
       "      <td>1.050062</td>\n",
       "      <td>1.759452</td>\n",
       "      <td>1.297594</td>\n",
       "      <td>1.164259</td>\n",
       "      <td>0.962886</td>\n",
       "      <td>2.1730761528015137</td>\n",
       "      <td>2.2148721218109126</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>0.602100</td>\n",
       "      <td>2.008270</td>\n",
       "      <td>-1.646517</td>\n",
       "      <td>0.566484</td>\n",
       "      <td>-1.323110</td>\n",
       "      <td>0.597096</td>\n",
       "      <td>0.511923</td>\n",
       "      <td>0.191420</td>\n",
       "      <td>0.315388</td>\n",
       "      <td>0.884982</td>\n",
       "      <td>...</td>\n",
       "      <td>0.863039</td>\n",
       "      <td>0.986306</td>\n",
       "      <td>0.672815</td>\n",
       "      <td>0.244809</td>\n",
       "      <td>0.607074</td>\n",
       "      <td>0.588277</td>\n",
       "      <td>1.0865380764007568</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.548224449157715</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>0.771017</td>\n",
       "      <td>-1.224333</td>\n",
       "      <td>0.076987</td>\n",
       "      <td>0.819454</td>\n",
       "      <td>1.574369</td>\n",
       "      <td>0.411866</td>\n",
       "      <td>-0.686255</td>\n",
       "      <td>-1.531129</td>\n",
       "      <td>0.463828</td>\n",
       "      <td>-0.374890</td>\n",
       "      <td>...</td>\n",
       "      <td>1.234889</td>\n",
       "      <td>1.074361</td>\n",
       "      <td>0.675443</td>\n",
       "      <td>0.374609</td>\n",
       "      <td>1.027287</td>\n",
       "      <td>1.185665</td>\n",
       "      <td>2.1730761528015137</td>\n",
       "      <td>2.2148721218109126</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>0.545184</td>\n",
       "      <td>-1.011034</td>\n",
       "      <td>0.280067</td>\n",
       "      <td>0.152822</td>\n",
       "      <td>-0.196007</td>\n",
       "      <td>1.035253</td>\n",
       "      <td>-1.189292</td>\n",
       "      <td>-1.137523</td>\n",
       "      <td>0.904491</td>\n",
       "      <td>-0.748869</td>\n",
       "      <td>...</td>\n",
       "      <td>1.059162</td>\n",
       "      <td>0.983805</td>\n",
       "      <td>1.137410</td>\n",
       "      <td>0.705999</td>\n",
       "      <td>1.035395</td>\n",
       "      <td>0.914397</td>\n",
       "      <td>2.1730761528015137</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.5509806871414182</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 28 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             1         2         3         4         5         6         7  \\\n",
       "0     0.869293 -0.635082  0.225690  0.327470 -0.689993  0.754202 -0.248573   \n",
       "1     0.907542  0.329147  0.359412  1.497970 -0.313010  1.095531 -0.557525   \n",
       "2     0.798835  1.470639 -1.635975  0.453773  0.425629  1.104875  1.282322   \n",
       "3     1.344385 -0.876626  0.935913  1.992050  0.882454  1.786066 -1.646778   \n",
       "4     1.105009  0.321356  1.522401  0.882808 -1.205349  0.681466 -1.070464   \n",
       "...        ...       ...       ...       ...       ...       ...       ...   \n",
       "9995  0.743017  0.390507 -0.431368  0.734279  0.491310  1.748141  1.165475   \n",
       "9996  2.778078 -0.498726 -0.236057  0.625503  0.443076  1.353130  0.014828   \n",
       "9997  0.602100  2.008270 -1.646517  0.566484 -1.323110  0.597096  0.511923   \n",
       "9998  0.771017 -1.224333  0.076987  0.819454  1.574369  0.411866 -0.686255   \n",
       "9999  0.545184 -1.011034  0.280067  0.152822 -0.196007  1.035253 -1.189292   \n",
       "\n",
       "             8        10        11  ...        23        24        25  \\\n",
       "0    -1.092064  1.374992 -0.653674  ...  0.979563  0.978076  0.920005   \n",
       "1    -1.588230  0.812581 -0.213642  ...  0.833048  0.985700  0.978098   \n",
       "2     1.381664  0.851737  1.540659  ...  1.108330  0.985692  0.951331   \n",
       "3    -0.942383  2.423265 -0.676016  ...  1.028704  0.998656  0.728281   \n",
       "4    -0.921871  0.800872  1.020974  ...  1.361057  0.986610  0.838085   \n",
       "...        ...       ...       ...  ...       ...       ...       ...   \n",
       "9995 -1.033300  1.185004  0.622711  ...  1.058611  0.979336  1.028349   \n",
       "9996  1.449298  0.546673 -1.020853  ...  0.814733  1.050062  1.759452   \n",
       "9997  0.191420  0.315388  0.884982  ...  0.863039  0.986306  0.672815   \n",
       "9998 -1.531129  0.463828 -0.374890  ...  1.234889  1.074361  0.675443   \n",
       "9999 -1.137523  0.904491 -0.748869  ...  1.059162  0.983805  1.137410   \n",
       "\n",
       "            26        27        28                   9                  13  \\\n",
       "0     0.721657  0.988751  0.876678                 0.0  1.1074360609054563   \n",
       "1     0.779732  0.992356  0.798343  2.1730761528015137  2.2148721218109126   \n",
       "2     0.803252  0.865924  0.780118                 0.0  2.2148721218109126   \n",
       "3     0.869200  1.026736  0.957904                 0.0  2.2148721218109126   \n",
       "4     1.133295  0.872245  0.808487                 0.0  2.2148721218109126   \n",
       "...        ...       ...       ...                 ...                 ...   \n",
       "9995  0.964556  0.915222  0.816399                 0.0  2.2148721218109126   \n",
       "9996  1.297594  1.164259  0.962886  2.1730761528015137  2.2148721218109126   \n",
       "9997  0.244809  0.607074  0.588277  1.0865380764007568                 0.0   \n",
       "9998  0.374609  1.027287  1.185665  2.1730761528015137  2.2148721218109126   \n",
       "9999  0.705999  1.035395  0.914397  2.1730761528015137                 0.0   \n",
       "\n",
       "                     17                  21  \n",
       "0                   0.0  3.1019613742828365  \n",
       "1                   0.0                 0.0  \n",
       "2     2.548224449157715                 0.0  \n",
       "3                   0.0                 0.0  \n",
       "4                   0.0                 0.0  \n",
       "...                 ...                 ...  \n",
       "9995                0.0  3.1019613742828365  \n",
       "9996                0.0                 0.0  \n",
       "9997  2.548224449157715                 0.0  \n",
       "9998                0.0                 0.0  \n",
       "9999                0.0  1.5509806871414182  \n",
       "\n",
       "[10000 rows x 28 columns]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_invert"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Voila"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
