{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Let's run some timed demonstrations to validate discussions around efficiency\n",
    "#Noting that we're using operating time as a proxy for efficiency\n",
    "#I expect the actual times may vary based on the hardware where this is run\n",
    "#So we'll primarily be interested in relative instead of absolute difference\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#We'll benchmark three different scenarios here\n",
    "#1) Full automation (automunge vs postmunge)\n",
    "#2) Full specification (automunge vs postmunge)\n",
    "#3) Comparison to manual normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#imports\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from Automunge import Automunger\n",
    "am = Automunger.AutoMunge()\n",
    "\n",
    "import timeit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#As a sample data set we'll use a common introductory tabular data set\n",
    "#From the Kaggle competition for \"House Prices Advanced Regression Techniques\"\n",
    "#Available at: \n",
    "#https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data\n",
    "\n",
    "#housing set\n",
    "df_train = pd.read_csv('housing_train.csv')\n",
    "df_test = pd.read_csv('housing_test.csv')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Note this set includes an intended ID column and Label column\n",
    "#Which will be carved out by passing the associated column headers\n",
    "\n",
    "labels_column = 'SalePrice'\n",
    "ID_columnlist = 'Id'\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1) Benchmarks under automation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Under automation, automunge(.) includes a data type evaluation \n",
    "#which has a little overhead\n",
    "#so we expect the relative speed of postmunge to be higher in this scenario"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "21.274660863\n",
      "\n",
      "average time elapsed:\n",
      "10.6374064335\n"
     ]
    }
   ],
   "source": [
    "#Let's run automunge on just the train set\n",
    "\n",
    "#We'll run it a few times to smooth out any randomness\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  train, trainID, labels, \\\n",
    "  validation1, validationID1, validationlabels1, \\\n",
    "  validation2, validationID2, validationlabels2, \\\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "  featureimportance, postprocess_dict \\\n",
    "  = am.automunge(df_train, \\\n",
    "                 labels_column = labels_column, \\\n",
    "                 trainID_column = ID_columnlist, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "  i+=1\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "0.9826989020000028\n",
      "\n",
      "average time elapsed:\n",
      "0.49144166600000005\n"
     ]
    }
   ],
   "source": [
    "#Now that we have populated our postprocess_dict\n",
    "#Which is a dictionary containing all of the steps and parameters of transformations\n",
    "#We can now perform comparable transformations in the postmunge(.) function\n",
    "#Here we'll apply to the same data set for a direct comparison\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, postreports_dict \\\n",
    "  = am.postmunge(postprocess_dict, df_train, \\\n",
    "                 labelscolumn = labels_column, \\\n",
    "                 testID_column = ID_columnlist, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "  i+=1\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2) Benchmarks under specification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#The expectation is that without use of the evaluation function\n",
    "#The automunge application will be more efficient relative to preceding\n",
    "#In order to demonstrate let's first grab the returned root categories\n",
    "#Derived under automation and returned in the postprocess_dict\n",
    "#So as to perform a like for like comparison\n",
    "\n",
    "assigncat = postprocess_dict['final_assigncat']\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "2.194483778000002\n",
      "\n",
      "average time elapsed:\n",
      "1.0973271880000013\n"
     ]
    }
   ],
   "source": [
    "#Let's run automunge(.) on just the train set\n",
    "#Specifying the transformation categories in assigncat\n",
    "#To reduce the overhead of the evaluation function\n",
    "\n",
    "#We'll run it a few times to smooth out any randomness\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  train, trainID, labels, \\\n",
    "  validation1, validationID1, validationlabels1, \\\n",
    "  validation2, validationID2, validationlabels2, \\\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "  featureimportance, postprocess_dict \\\n",
    "  = am.automunge(df_train, \\\n",
    "                 labels_column = labels_column, \\\n",
    "                 trainID_column = ID_columnlist, \\\n",
    "                 assigncat = assigncat, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "  i+=1\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "0.9748246479999985\n",
      "\n",
      "average time elapsed:\n",
      "0.4874972730000007\n"
     ]
    }
   ],
   "source": [
    "#now the corresponding postmunge(.) application should be comparable to the first demonstration\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, postreports_dict \\\n",
    "  = am.postmunge(postprocess_dict, df_train, \\\n",
    "                 labelscolumn = labels_column, \\\n",
    "                 testID_column = ID_columnlist, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "  i+=1\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3) Comparison to manual normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "#For the third benchmark, let's just focus on the numerical columns for simplicity\n",
    "\n",
    "df_train_numeric = df_train[postprocess_dict['final_assigncat']['nmbr']]\n",
    "\n",
    "assigncat = {'nmbr': postprocess_dict['final_assigncat']['nmbr']}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "#We'll run following scenarios\n",
    "#- automunge under automation\n",
    "#- automunge under specification\n",
    "#- postmunge\n",
    "#- manual normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "7.059701028999996\n",
      "\n",
      "average time elapsed:\n",
      "3.529896920999999\n"
     ]
    }
   ],
   "source": [
    "#Here is automunge(.) under automation\n",
    "#including the overhead of the evaluation function\n",
    "\n",
    "#applied to just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  train, trainID, labels, \\\n",
    "  validation1, validationID1, validationlabels1, \\\n",
    "  validation2, validationID2, validationlabels2, \\\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "  featureimportance, postprocess_dict \\\n",
    "  = am.automunge(df_train_numeric, \\\n",
    "                 labels_column = False, \\\n",
    "                 trainID_column = False, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "0.729570865999996\n",
      "\n",
      "average time elapsed:\n",
      "0.3648284265000008\n"
     ]
    }
   ],
   "source": [
    "#Here is automunge(.) under specification\n",
    "#without the overhead of the evaluation function\n",
    "\n",
    "#applied to just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  train, trainID, labels, \\\n",
    "  validation1, validationID1, validationlabels1, \\\n",
    "  validation2, validationID2, validationlabels2, \\\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "  featureimportance, postprocess_dict \\\n",
    "  = am.automunge(df_train_numeric, \\\n",
    "                 labels_column = False, \\\n",
    "                 trainID_column = False, \\\n",
    "                 assigncat = assigncat, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "0.26339404299999813\n",
      "\n",
      "average time elapsed:\n",
      "0.13174473699999822\n"
     ]
    }
   ],
   "source": [
    "#now the corresponding postmunge application\n",
    "#again with just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, postreports_dict \\\n",
    "  = am.postmunge(postprocess_dict, df_train_numeric, \\\n",
    "                 labelscolumn = False, \\\n",
    "                 testID_column = False, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "iteration:  2\n",
      "iteration:  3\n",
      "iteration:  4\n",
      "iteration:  5\n",
      "iteration:  6\n",
      "iteration:  7\n",
      "iteration:  8\n",
      "iteration:  9\n",
      "iteration:  10\n",
      "\n",
      "time elapsed:\n",
      "0.22616958200000425\n",
      "\n",
      "average time elapsed:\n",
      "0.02056912163636391\n"
     ]
    }
   ],
   "source": [
    "#now let's perform a manual z-score normalization on these same columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(11):\n",
    "  \n",
    "  df_train_normalized = df_train_numeric.copy()\n",
    "\n",
    "  for column in list(df_train_numeric):\n",
    "    \n",
    "    mean = df_train_numeric[column].mean()\n",
    "    std = df_train_numeric[column].std()\n",
    "    \n",
    "    df_train_normalized[column] = \\\n",
    "    (df_train_normalized[column] - mean) / std\n",
    "    \n",
    "    df_train_normalized[column].fillna(mean)\n",
    "    \n",
    "  print(\"iteration: \", i)\n",
    "    \n",
    "  \n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "#This discrepancy appears pretty large in favor of the manual normalization\n",
    "#But consider that the postmunge overhead scales more slowly than the cost of the transformations\n",
    "#as the data set size increases\n",
    "\n",
    "#Let's try again after duplicating the number of rows for a more substantial size data set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train_numeric2 = pd.concat([df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, \\\n",
    "                               df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric, df_train_numeric], \\\n",
    "                               axis=0)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "df_train_numeric.shape = \n",
      "(1460, 34)\n",
      "\n",
      "df_train_numeric2.shape = \n",
      "(189800, 34)\n"
     ]
    }
   ],
   "source": [
    "print(\"df_train_numeric.shape = \")\n",
    "print(df_train_numeric.shape)\n",
    "print()\n",
    "print(\"df_train_numeric2.shape = \")\n",
    "print(df_train_numeric2.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "411.193009176\n",
      "\n",
      "average time elapsed:\n",
      "205.59656994099998\n"
     ]
    }
   ],
   "source": [
    "#Here is automunge(.) under automation\n",
    "#including the overhead of the evaluation function\n",
    "\n",
    "#applied to just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  train, trainID, labels, \\\n",
    "  validation1, validationID1, validationlabels1, \\\n",
    "  validation2, validationID2, validationlabels2, \\\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "  featureimportance, postprocess_dict \\\n",
    "  = am.automunge(df_train_numeric2, \\\n",
    "                 labels_column = False, \\\n",
    "                 trainID_column = False, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "\n",
      "time elapsed:\n",
      "3.3234976179999762\n",
      "\n",
      "average time elapsed:\n",
      "1.6618192045000058\n"
     ]
    }
   ],
   "source": [
    "#Here is automunge(.) under specification\n",
    "#without the overhead of the evaluation function\n",
    "\n",
    "#applied to just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(2):\n",
    "\n",
    "  train, trainID, labels, \\\n",
    "  validation1, validationID1, validationlabels1, \\\n",
    "  validation2, validationID2, validationlabels2, \\\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "  featureimportance, postprocess_dict \\\n",
    "  = am.automunge(df_train_numeric2, \\\n",
    "                 labels_column = False, \\\n",
    "                 trainID_column = False, \\\n",
    "                 assigncat = assigncat, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#of course these last two cells demonstrate \n",
    "#the considerable overhead of the automunge(.) evaluation functions\n",
    "#in relation to the transformations themselves at increasing data scale \n",
    "\n",
    "#As an aside note that we can decrease the ratio of rows that are sampled for evaluation\n",
    "#With the automunge(.) eval_ratio parameter\n",
    "#To speed things up without losing functionality\n",
    "#eval_ratio parameter currently defaults to 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "iteration:  2\n",
      "iteration:  3\n",
      "iteration:  4\n",
      "iteration:  5\n",
      "iteration:  6\n",
      "iteration:  7\n",
      "iteration:  8\n",
      "iteration:  9\n",
      "iteration:  10\n",
      "\n",
      "time elapsed:\n",
      "6.847629239000014\n",
      "\n",
      "average time elapsed:\n",
      "0.6225408791818215\n"
     ]
    }
   ],
   "source": [
    "#now the corresponding postmunge(.) application\n",
    "#again with just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(11):\n",
    "\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, postreports_dict \\\n",
    "  = am.postmunge(postprocess_dict, df_train_numeric2, \\\n",
    "                 labelscolumn = False, \\\n",
    "                 testID_column = False, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "iteration:  2\n",
      "iteration:  3\n",
      "iteration:  4\n",
      "iteration:  5\n",
      "iteration:  6\n",
      "iteration:  7\n",
      "iteration:  8\n",
      "iteration:  9\n",
      "iteration:  10\n",
      "\n",
      "time elapsed:\n",
      "3.1023630149999804\n",
      "\n",
      "average time elapsed:\n",
      "0.2820456821818208\n"
     ]
    }
   ],
   "source": [
    "#now let's perform a manual z-score normalization on these same columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(11):\n",
    "  \n",
    "  df_train_normalized2 = df_train_numeric2.copy()\n",
    "\n",
    "  for column in list(df_train_numeric):\n",
    "    \n",
    "    #extract normalization parameters\n",
    "    mean = df_train_numeric2[column].mean()\n",
    "    std = df_train_numeric2[column].std()\n",
    "    \n",
    "    #apply normalization\n",
    "    df_train_normalized2[column] = \\\n",
    "    (df_train_normalized2[column] - mean) / std\n",
    "    \n",
    "    #mean infill\n",
    "    df_train_normalized2[column].fillna(mean)\n",
    "  \n",
    "  #convert results to numpy array\n",
    "  df_train_normalized2 = df_train_normalized2.values\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "  \n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "#When I ran this notebook on my computer\n",
    "\n",
    "#The relative difference between manual normalizations and postmunge normalizations\n",
    "#Was postmunge taking about 6 times longer than manual on the small data set (with 1460 rows)\n",
    "#(e.g. 0.0217s for manual vs 0.1311s for postmunge)\n",
    "\n",
    "#And then when I scaled up the data set to 189800 rows\n",
    "#The relative difference between manual normalizations and postmunge normalizations\n",
    "#Was postmunge taking about 2.3 times longer than manual on the scaled up data set\n",
    "#(e.g. 0.271s for manual vs 0.611s for postmunge)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Naturally this raised the question of where the asymptote might be\n",
    "#And so I tried one more run scaling up the data set even further:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "df_train_numeric.shape = \n",
      "(1460, 34)\n",
      "\n",
      "df_train_numeric2.shape = \n",
      "(16133000, 34)\n"
     ]
    }
   ],
   "source": [
    "#let's try with an even bigger data set\n",
    "\n",
    "df_train_numeric3 = pd.concat([df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, \\\n",
    "                               df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2, df_train_numeric2], \\\n",
    "                               axis=0)\n",
    "\n",
    "print(\"df_train_numeric.shape = \")\n",
    "print(df_train_numeric.shape)\n",
    "print()\n",
    "print(\"df_train_numeric2.shape = \")\n",
    "print(df_train_numeric3.shape)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "iteration:  2\n",
      "\n",
      "time elapsed:\n",
      "253.99919574599994\n",
      "\n",
      "average time elapsed:\n",
      "84.666451818\n"
     ]
    }
   ],
   "source": [
    "#now the corresponding postmunge application\n",
    "#again with just the numeric columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(3):\n",
    "\n",
    "  test, testID, testlabels, \\\n",
    "  labelsencoding_dict, postreports_dict \\\n",
    "  = am.postmunge(postprocess_dict, df_train_numeric3, \\\n",
    "                 labelscolumn = False, \\\n",
    "                 testID_column = False, \\\n",
    "                 printstatus = False)\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "\n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration:  0\n",
      "iteration:  1\n",
      "iteration:  2\n",
      "\n",
      "time elapsed:\n",
      "131.32367403400008\n",
      "\n",
      "average time elapsed:\n",
      "43.77459584233335\n"
     ]
    }
   ],
   "source": [
    "#now let's perform a manual z-score normalization on these same columns\n",
    "\n",
    "start_time = timeit.default_timer()\n",
    "\n",
    "for i in range(3):\n",
    "  \n",
    "  df_train_normalized3 = df_train_numeric3.copy()\n",
    "\n",
    "  for column in list(df_train_numeric3):\n",
    "    \n",
    "    #extract normalization parameters\n",
    "    mean = df_train_numeric3[column].mean()\n",
    "    std = df_train_numeric3[column].std()\n",
    "    \n",
    "    #apply normalization\n",
    "    df_train_normalized3[column] = \\\n",
    "    (df_train_normalized3[column] - mean) / std\n",
    "    \n",
    "    #mean infill\n",
    "    df_train_normalized3[column].fillna(mean)\n",
    "  \n",
    "  #convert results to numpy array\n",
    "  df_train_normalized3 = df_train_normalized3.values\n",
    "  \n",
    "  print(\"iteration: \", i)\n",
    "  \n",
    "print()\n",
    "print(\"time elapsed:\")\n",
    "print(timeit.default_timer() - start_time)\n",
    "print()\n",
    "print(\"average time elapsed:\")\n",
    "print((timeit.default_timer() - start_time)/(i+1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "#In this larger configuration of 16,133,000 rows\n",
    "#(compared to the prior configuration of 189,800 rows)\n",
    "#The postmunge(.) function came much closer to matching the manual specification\n",
    "\n",
    "#Where postmunge(.) only took 1.6 times longer to perform a comparable operation\n",
    "#(63.6s for postmunge vs 39.6s for manual)\n",
    "#Note that timings vary between runs, so the outputs saved with the cells above\n",
    "#may not match these figures exactly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "#not shown, ran another test after taking data another order of magnitude higher\n",
    "#and postmunge(.) took only 1.4 times longer than manual\n",
    "#So that kind of gives an idea of asymptote for this particular configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
