{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Let's run some timed demonstrations to validate discussions around efficency\n",
    "#Noting that we're using operating time as a proxy for efficiency\n",
    "#I expect the actual times may vary based on the hardware where this is run\n",
    "#So we'll primarily be interested in relative instead of absolute difference\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#We'll benchmark three different scenarios here\n",
    "#1) Full automation (automunge vs postmunge)\n",
    "#2) Full specification (automunge vs postmunge)\n",
    "#3) Comparison to manual normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#imports\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from Automunge import Automunger\n",
    "am = Automunger.AutoMunge()\n",
    "\n",
    "import timeit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# #As a sample data set we'll use a common introductory tabular data set\n",
    "# #From the Kaggle competition for \"House Prices Advanced Regression Techniques\"\n",
    "# #Available at: \n",
    "# #https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data\n",
    "\n",
    "#housing set\n",
    "df_train = pd.read_csv('housing_train.csv')\n",
    "df_test = pd.read_csv('housing_test.csv')\n",
    "\n",
    "\n",
    "#As a sample data set we'll use a common introductory tabular data set\n",
    "#From the Kaggle competition for Titanic\n",
    "\n",
    "# #titanic set\n",
    "# df_train = pd.read_csv('train.csv')\n",
    "# df_test = pd.read_csv('test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Note this sets includes an intended ID column and Label column\n",
    "#Which will be carved out by passing the associated column headers\n",
    "\n",
    "labels_column = 'SalePrice'\n",
    "ID_columnlist = 'Id'\n",
    "\n",
    "# labels_column = 'Survived'\n",
    "# ID_columnlist = 'PassengerId'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1) Benchmarks under automation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Under automation, automunge(.) includes a data type evaluation \n",
    "#which has a little overhead\n",
    "#so we expect the relative speed of postmunge to be higher in this secnario"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "13.727263502999996\n",
      "\n",
      "average time elapsed:\n",
      "6.863690869999999\n"
     ]
    }
   ],
   "source": [
    "#Let's run automunge on just the train set\n",
    "\n",
    "#We'll run it a few times to smooth out any randomness\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  train, trainID, labels, \\\n",
    "  validation1, validationID1, validationlabels1, \\\n",
    "  validation2, validationID2, validationlabels2, \\\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "  featureimportance, postprocess_dict \\\n",
    "  = am.automunge(df_train, \\\n",
    "                 labels_column = labels_column, \\\n",
    "                 trainID_column = ID_columnlist, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "  i+=1\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "1.0899743769999972\n",
      "\n",
      "average time elapsed:\n",
      "0.5450332899999992\n"
     ]
    }
   ],
   "source": [
    "#Now that we have populated our postprocess_dict\n",
    "#Which is a dictionary containing all of the steps and parameters of transformations\n",
    "#We can now perform comparable transformations in the postmunge(.) function\n",
    "#Here we'll apply to the same data set for a direct comparison\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, postreports_dict \\\n",
    "  = am.postmunge(postprocess_dict, df_train, \\\n",
    "                 labelscolumn = labels_column, \\\n",
    "                 testID_column = ID_columnlist, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "  i+=1\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2) Benchmarks under specification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#The expectation is that without use of the evaluation function\n",
    "#The automunge application will be more efficient relative to preceding\n",
    "#In order to demonstrate let's first grab the returned root categories\n",
    "#Derived under automation and returned in the postprocess_dict\n",
    "#So as to perfom a like for like comparision\n",
    "\n",
    "assigncat = postprocess_dict['final_assigncat']\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "2.3871823810000024\n",
      "\n",
      "average time elapsed:\n",
      "1.1936413840000029\n"
     ]
    }
   ],
   "source": [
    "#Let's run automunge on just the train set\n",
    "#Specifying the trasnformation categories in assigncat\n",
    "#To reduce the overhead of the evaluation function\n",
    "\n",
    "#We'll run it a few times to smooth out any randomness\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  train, trainID, labels, \\\n",
    "  validation1, validationID1, validationlabels1, \\\n",
    "  validation2, validationID2, validationlabels2, \\\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "  featureimportance, postprocess_dict \\\n",
    "  = am.automunge(df_train, \\\n",
    "                 labels_column = labels_column, \\\n",
    "                 trainID_column = ID_columnlist, \\\n",
    "                 assigncat = assigncat, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "  i+=1\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "1.0498833569999988\n",
      "\n",
      "average time elapsed:\n",
      "0.5250243930000025\n"
     ]
    }
   ],
   "source": [
    "#now the corresponding postmunge application should be comparable to the first demonstration\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, postreports_dict \\\n",
    "  = am.postmunge(postprocess_dict, df_train, \\\n",
    "                 labelscolumn = labels_column, \\\n",
    "                 testID_column = ID_columnlist, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "  i+=1\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3) Comparison to manual normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "#For the third benchmark, let's just focus on the numerical columns for simplicity\n",
    "\n",
    "df_train_numeric = df_train[postprocess_dict['final_assigncat']['nmbr']]\n",
    "\n",
    "assigncat = {'nmbr': postprocess_dict['final_assigncat']['nmbr']}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "#We'll run following scenarios\n",
    "#- automunge under automation\n",
    "#- automunge under specification\n",
    "#- postmunge\n",
    "#- manual normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "4.337960189000114\n",
      "\n",
      "average time elapsed:\n",
      "2.169220313000096\n"
     ]
    }
   ],
   "source": [
    "#Here is automunge under automation\n",
    "#including the overhead of the evaluation function\n",
    "\n",
    "#applied to just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  train, trainID, labels, \\\n",
    "  validation1, validationID1, validationlabels1, \\\n",
    "  validation2, validationID2, validationlabels2, \\\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "  featureimportance, postprocess_dict \\\n",
    "  = am.automunge(df_train_numeric, \\\n",
    "                 labels_column = False, \\\n",
    "                 trainID_column = False, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "0.7458571600000141\n",
      "\n",
      "average time elapsed:\n",
      "0.37297673400007625\n"
     ]
    }
   ],
   "source": [
    "#Here is automunge under specification\n",
    "#without the overhead of the evaluation function\n",
    "\n",
    "#applied to just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  train, trainID, labels, \\\n",
    "  validation1, validationID1, validationlabels1, \\\n",
    "  validation2, validationID2, validationlabels2, \\\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "  featureimportance, postprocess_dict \\\n",
    "  = am.automunge(df_train_numeric, \\\n",
    "                 labels_column = False, \\\n",
    "                 trainID_column = False, \\\n",
    "                 assigncat = assigncat, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "0.26212924199990084\n",
      "\n",
      "average time elapsed:\n",
      "0.1311172054997769\n"
     ]
    }
   ],
   "source": [
    "#now the corresponding postmunge application\n",
    "#again with just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, postreports_dict \\\n",
    "  = am.postmunge(postprocess_dict, df_train_numeric, \\\n",
    "                 labelscolumn = False, \\\n",
    "                 testID_column = False, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "time elapsed:\n",
      "0.23815144500076713\n",
      "\n",
      "average time elapsed:\n",
      "0.021658887909159766\n"
     ]
    }
   ],
   "source": [
    "#now let's perform a manual z-score normalization on these same columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(11):\n",
    "  \n",
    "  df_train_normalized = df_train_numeric.copy()\n",
    "\n",
    "  for column in list(df_train_numeric):\n",
    "    \n",
    "    mean = df_train_numeric[column].mean()\n",
    "    std = df_train_numeric[column].std()\n",
    "    \n",
    "    df_train_normalized[column] = \\\n",
    "    (df_train_normalized[column] - mean) / std\n",
    "    \n",
    "    df_train_normalized[column].fillna(mean)\n",
    "    \n",
    "  print(\"iteration: \", i)\n",
    "    \n",
    "  \n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "#This discrepency appears pretty large in favor of the manual normalization\n",
    "#But you have to consider the scaling of postmunge overhead is below that of transformations\n",
    "#with increasing data set size\n",
    "\n",
    "#Let's try again after duplicating the number of rows for a more substantial size data set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train_numeric2 = pd.concat([df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric], \\\n",
    "                               axis=0)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "df_train_numeric.shape = \n",
      "(1460, 32)\n",
      "\n",
      "df_train_numeric2.shape = \n",
      "(189800, 32)\n"
     ]
    }
   ],
   "source": [
    "print(\"df_train_numeric.shape = \")\n",
    "print(df_train_numeric.shape)\n",
    "print()\n",
    "print(\"df_train_numeric2.shape = \")\n",
    "print(df_train_numeric2.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "443.78997745700053\n",
      "\n",
      "average time elapsed:\n",
      "221.89506205650014\n"
     ]
    }
   ],
   "source": [
    "#Here is automunge under automation\n",
    "#including the overhead of the evaluation function\n",
    "\n",
    "#applied to just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  train, trainID, labels, \\\n",
    "  validation1, validationID1, validationlabels1, \\\n",
    "  validation2, validationID2, validationlabels2, \\\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "  featureimportance, postprocess_dict \\\n",
    "  = am.automunge(df_train_numeric2, \\\n",
    "                 labels_column = False, \\\n",
    "                 trainID_column = False, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "3.2403669180002908\n",
      "\n",
      "average time elapsed:\n",
      "1.6202577730000485\n"
     ]
    }
   ],
   "source": [
    "#Here is automunge under specification\n",
    "#without the overhead of the evaluation function\n",
    "\n",
    "#applied to just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  train, trainID, labels, \\\n",
    "  validation1, validationID1, validationlabels1, \\\n",
    "  validation2, validationID2, validationlabels2, \\\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "  featureimportance, postprocess_dict \\\n",
    "  = am.automunge(df_train_numeric2, \\\n",
    "                 labels_column = False, \\\n",
    "                 trainID_column = False, \\\n",
    "                 assigncat = assigncat, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "#of course these last two cells demonstrate \n",
    "#the considerable overhead of the automunge(.) evaluation functions\n",
    "#in relation to the transformations themselves at increasing data scale \n",
    "\n",
    "#As an aside note that we can decrease the ratio of rows that are sampled for evaluation\n",
    "#With the automunge(.) eval_ratio parameter\n",
    "#To speed things up without losing functionality\n",
    "#eval_ratio parameter currently defaults to 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "iteration:  2\n",
      "iteration:  3\n",
      "iteration:  4\n",
      "iteration:  5\n",
      "iteration:  6\n",
      "iteration:  7\n",
      "iteration:  8\n",
      "iteration:  9\n",
      "iteration:  10\n",
      "\n",
      "time elapsed:\n",
      "6.720049497999753\n",
      "\n",
      "average time elapsed:\n",
      "0.6109256326363638\n"
     ]
    }
   ],
   "source": [
    "#now the corresponding postmunge application\n",
    "#again with just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(11):\n",
    "\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, postreports_dict \\\n",
    "  = am.postmunge(postprocess_dict, df_train_numeric2, \\\n",
    "                 labelscolumn = False, \\\n",
    "                 testID_column = False, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "iteration:  2\n",
      "iteration:  3\n",
      "iteration:  4\n",
      "iteration:  5\n",
      "iteration:  6\n",
      "iteration:  7\n",
      "iteration:  8\n",
      "iteration:  9\n",
      "iteration:  10\n",
      "\n",
      "time elapsed:\n",
      "2.984008198000083\n",
      "\n",
      "average time elapsed:\n",
      "0.27128671245455654\n"
     ]
    }
   ],
   "source": [
    "#now let's perform a manual z-score normalization on these same columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(11):\n",
    "  \n",
    "  df_train_normalized2 = df_train_numeric2.copy()\n",
    "\n",
    "  for column in list(df_train_numeric):\n",
    "    \n",
    "    #extract normalization parameters\n",
    "    mean = df_train_numeric2[column].mean()\n",
    "    std = df_train_numeric2[column].std()\n",
    "    \n",
    "    #apply normalization\n",
    "    df_train_normalized2[column] = \\\n",
    "    (df_train_normalized2[column] - mean) / std\n",
    "    \n",
    "    #mean infill\n",
    "    df_train_normalized2[column].fillna(mean)\n",
    "  \n",
    "  #convert results to numpy array\n",
    "  df_train_normalized2 = df_train_normalized2.values\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "  \n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "#When I ran this notebook on my computer\n",
    "\n",
    "#The relative difference between manual normalizations and postmunge normalizations\n",
    "#Were postmunge at about 6 times longer than manual on the small data set (with 1460 rows)\n",
    "#(e.g. 0.0217s vs 0.1311s)\n",
    "\n",
    "#And then when scaled up the data set to 189800 rows\n",
    "#The relative difference between manual normalizations and postmunge normalizations\n",
    "#Were at postmunge about 2.3 times longer than manual on the scaled up data set\n",
    "#(e.g. 0.271s vs 0.611s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Naturally this begged the question of where the asymptote might be\n",
    "#And so tried one more run scaling up the data set even further:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "df_train_numeric.shape = \n",
      "(1460, 32)\n",
      "\n",
      "df_train_numeric2.shape = \n",
      "(16133000, 32)\n"
     ]
    }
   ],
   "source": [
    "#let's try with an even bigger data set\n",
    "\n",
    "df_train_numeric3 = pd.concat([df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2], \\\n",
    "                               axis=0)\n",
    "\n",
    "print(\"df_train_numeric.shape = \")\n",
    "print(df_train_numeric.shape)\n",
    "print()\n",
    "print(\"df_train_numeric2.shape = \")\n",
    "print(df_train_numeric3.shape)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "iteration:  2\n",
      "\n",
      "time elapsed:\n",
      "190.922307412\n",
      "\n",
      "average time elapsed:\n",
      "63.64081944700016\n"
     ]
    }
   ],
   "source": [
    "#now the corresponding postmunge application\n",
    "#again with just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(3):\n",
    "\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, postreports_dict \\\n",
    "  = am.postmunge(postprocess_dict, df_train_numeric3, \\\n",
    "                 labelscolumn = False, \\\n",
    "                 testID_column = False, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "iteration:  2\n",
      "\n",
      "time elapsed:\n",
      "118.90164633999939\n",
      "\n",
      "average time elapsed:\n",
      "39.63392772333312\n"
     ]
    }
   ],
   "source": [
    "#now let's perform a manual z-score normalization on these same columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(3):\n",
    "  \n",
    "  df_train_normalized3 = df_train_numeric3.copy()\n",
    "\n",
    "  for column in list(df_train_numeric3):\n",
    "    \n",
    "    #extract normalization parameters\n",
    "    mean = df_train_numeric3[column].mean()\n",
    "    std = df_train_numeric3[column].std()\n",
    "    \n",
    "    #apply normalization\n",
    "    df_train_normalized3[column] = \\\n",
    "    (df_train_normalized3[column] - mean) / std\n",
    "    \n",
    "    #mean infill\n",
    "    df_train_normalized3[column].fillna(mean)\n",
    "  \n",
    "  #convert results to numpy array\n",
    "  df_train_normalized3 = df_train_normalized3.values\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "  \n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "#In this larger configuration of 16,133,000 rows\n",
    "#(compared to the prior configuration of 189,800 rows)\n",
    "#The postmunge function came much closer to matching the manual specification\n",
    "\n",
    "#Where postmunge only took 1.6 times longer to perform a comparable operation\n",
    "#(63.6s vs 39.6s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "#not shown, ran another test after taking data another order of magnitude higher\n",
    "#and postmunge tool only 1.4 times longer than manual\n",
    "#So that kind of gives an idea of asymptote for this particular configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
