{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastai.tabular.all import *\n",
    "\n",
    "#we'll handle missing data infill with Automunge\n",
    "#note exc2 is a pass-through transform that defaults to mode infill\n",
    "\n",
    "from Automunge import Automunger\n",
    "am = Automunger.AutoMunge()\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# number_of_epochs = 1\n",
    "# number_of_epochs = 3\n",
    "number_of_epochs = 14\n",
    "# number_of_epochs = 28\n",
    "# number_of_epochs = 35\n",
    "# number_of_epochs = 42\n",
    "# number_of_epochs = 200\n",
    "\n",
    "#training did not converge when I tried this at 5\n",
    "#small data converged when I tried at 1\n",
    "pm_noise_sets = 0\n",
    "\n",
    "#the validation ratios shown are selected to achieve 500k validation set size\n",
    "#to be consistent with Higgs paper\n",
    "#but turns out my derivation was slightly off\n",
    "#so validation set size is 623,332 to be consistent with other experiments\n",
    "#0.25% and 5% percentages represent portion of data remaininbg after validation split\n",
    "\n",
    "# #full DATA\n",
    "# sample_ratio = 1.0\n",
    "# validation_ratio = 0.0567\n",
    "\n",
    "#5% DATA\n",
    "sample_ratio = 0.103833244329036\n",
    "validation_ratio = 0.545745953948647\n",
    "\n",
    "#0.25% DATA\n",
    "# sample_ratio = 0.05902490655\n",
    "# validation_ratio = 0.9600451114\n",
    "\n",
    "# #tiny data\n",
    "# sample_ratio = 0.0001\n",
    "# validation_ratio = 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(11000000, 29)\n"
     ]
    }
   ],
   "source": [
    "path = \"/data/Benchmark_datasets/Higgs/HIGGS.csv\"\n",
    "\n",
    "df_train1 = pd.read_csv(path, header=None)\n",
    "\n",
    "print(df_train1.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Augmentation Experiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "randomstate = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "cont_names = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28']\n",
    "label_names='0'\n",
    "\n",
    "assigncat={'DPrt':cont_names, 'lbos':label_names}\n",
    "#turning off inplace as expect the order of columns consistency with DPrt will be impacted by inplace\n",
    "assignparam={'default_assignparam' : {'DPrt' : {'flip_prob' : 0.03}}, \\\n",
    "             'global_assignparam' : {'inplace' : False}}\n",
    "\n",
    "\n",
    "df_train = df_train1.sample(frac = sample_ratio, random_state = randomstate)\n",
    "\n",
    "#have to use a different train_test_split approach\n",
    "#for noise injection\n",
    "#so we'll sp;lit out validation sets prior to passing to automunge\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "df_train, df_validation = \\\n",
    "train_test_split(df_train, test_size=validation_ratio, random_state=randomstate)\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(df_train, df_test = df_validation, \n",
    "             labels_column = label_names, MLinfill = False, \n",
    "             shuffletrain = 'traintest', \n",
    "             randomseed = randomstate,\n",
    "             assigncat=assigncat, \\\n",
    "             assigninfill={'adjinfill':cont_names}, \\\n",
    "             assignparam = assignparam, \\\n",
    "             pandasoutput=True, printstatus=False)\n",
    "\n",
    "\n",
    "#this gives the validation set at the top\n",
    "#at 500k rows based on the validation_ratio selected for that value\n",
    "#and a noise set after\n",
    "train   = pd.concat([test, train], axis=0, ignore_index=True)\n",
    "trainID = pd.concat([testID, trainID], axis=0, ignore_index=True)\n",
    "labels  = pd.concat([testlabels, labels], axis=0, ignore_index=True)\n",
    "\n",
    "\n",
    "cont_names_final = finalcolumns_train\n",
    "y_names = list(labels)[0]\n",
    "\n",
    "#now do one train set without noise injection\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, postreports_dict = \\\n",
    "am.postmunge(postprocess_dict, df_train, \\\n",
    "             pandasoutput = True, printstatus = False, \\\n",
    "             traindata = False, \\\n",
    "             shuffletrain = True)\n",
    "\n",
    "train   = pd.concat([train, test], axis=0, ignore_index=True)\n",
    "trainID = pd.concat([trainID, testID], axis=0, ignore_index=True)\n",
    "labels  = pd.concat([labels,testlabels], axis=0, ignore_index=True)\n",
    "\n",
    "\n",
    "#now range(i) adds on i more noise sets to give i+1 noise and one without\n",
    "for i in range(pm_noise_sets):\n",
    "\n",
    "    test, testID, testlabels, \\\n",
    "    labelsencoding_dict, postreports_dict = \\\n",
    "    am.postmunge(postprocess_dict, df_train, \\\n",
    "                 pandasoutput = True, printstatus = False, \\\n",
    "                 traindata = True, \\\n",
    "                 shuffletrain = True)\n",
    "\n",
    "    train   = pd.concat([train, test], axis=0, ignore_index=True)\n",
    "    trainID = pd.concat([trainID, testID], axis=0, ignore_index=True)\n",
    "    labels  = pd.concat([labels,testlabels], axis=0, ignore_index=True)\n",
    "\n",
    "\n",
    "train = pd.concat([train, labels], axis = 1)\n",
    "\n",
    "\n",
    "#so this returns the first 500k rows a validation set\n",
    "#which is only in set once\n",
    "#and then the rest of rows are duplicated once without noise\n",
    "#and pm_noise_sets +1 times with noise\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>epoch</th>\n",
       "      <th>train_loss</th>\n",
       "      <th>valid_loss</th>\n",
       "      <th>roc_auc_score</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.538359</td>\n",
       "      <td>0.538218</td>\n",
       "      <td>0.802176</td>\n",
       "      <td>02:31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.528374</td>\n",
       "      <td>0.517456</td>\n",
       "      <td>0.819088</td>\n",
       "      <td>02:31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.518970</td>\n",
       "      <td>0.513677</td>\n",
       "      <td>0.823637</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.511722</td>\n",
       "      <td>0.505184</td>\n",
       "      <td>0.830424</td>\n",
       "      <td>02:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.525652</td>\n",
       "      <td>0.500067</td>\n",
       "      <td>0.834001</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>0.505731</td>\n",
       "      <td>0.497722</td>\n",
       "      <td>0.835019</td>\n",
       "      <td>02:31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>0.485177</td>\n",
       "      <td>0.499475</td>\n",
       "      <td>0.836127</td>\n",
       "      <td>02:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>0.485626</td>\n",
       "      <td>0.493058</td>\n",
       "      <td>0.838968</td>\n",
       "      <td>02:31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>0.490913</td>\n",
       "      <td>0.489319</td>\n",
       "      <td>0.841027</td>\n",
       "      <td>02:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>0.494510</td>\n",
       "      <td>0.488487</td>\n",
       "      <td>0.841633</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>0.477124</td>\n",
       "      <td>0.488740</td>\n",
       "      <td>0.841855</td>\n",
       "      <td>02:31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11</td>\n",
       "      <td>0.462822</td>\n",
       "      <td>0.487299</td>\n",
       "      <td>0.843393</td>\n",
       "      <td>02:28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>12</td>\n",
       "      <td>0.458245</td>\n",
       "      <td>0.488846</td>\n",
       "      <td>0.842274</td>\n",
       "      <td>02:27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>13</td>\n",
       "      <td>0.451049</td>\n",
       "      <td>0.486028</td>\n",
       "      <td>0.844104</td>\n",
       "      <td>02:30</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#rev1\n",
    "\n",
    "# splits = RandomSplitter(valid_pct=validation_ratio)(range_of(train))\n",
    "\n",
    "#using a different validation split approach for noise than other experiments \n",
    "#to ensure noise injected data\n",
    "#isn't included in validation set\n",
    "#automunge call shuffles data based on randomseed\n",
    "#and the set without noise will be the top of the train set with index 0-etc\n",
    "splitter = IndexSplitter(list(range(623332)))\n",
    "splits = splitter(range_of(train))\n",
    "\n",
    "\n",
    "to = TabularPandas(train, procs=[],\n",
    "                   cont_names = cont_names_final,\n",
    "                   y_names = y_names,\n",
    "                   splits = splits)\n",
    "\n",
    "dls = to.dataloaders(bs=64)\n",
    "\n",
    "auc_metric = RocAucBinary()\n",
    "\n",
    "learn = tabular_learner(dls, layers= [300, 300, 300, 300, 300], metrics=auc_metric)\n",
    "\n",
    "#a batch size convention has already been established for these experiments \n",
    "#so not getting benefit out of second\n",
    "# learn.model = torch.nn.DataParallel(learn.model, device_ids=[0, 1])\n",
    "\n",
    "learn.fit_one_cycle(number_of_epochs)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# data aug rev2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "randomstate = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "cont_names = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28']\n",
    "label_names='0'\n",
    "\n",
    "assigncat={'DPrt':cont_names, 'lbos':label_names}\n",
    "#turning off inplace as expect the order of columns consistency with DPrt will be impacted by inplace\n",
    "assignparam={'default_assignparam' : {'DPrt' : {'flip_prob' : 0.03}}, \\\n",
    "             'global_assignparam' : {'inplace' : False}}\n",
    "\n",
    "\n",
    "df_train = df_train1.sample(frac = sample_ratio, random_state = randomstate)\n",
    "\n",
    "#have to use a different train_test_split approach\n",
    "#for noise injection\n",
    "#so we'll sp;lit out validation sets prior to passing to automunge\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "df_train, df_validation = \\\n",
    "train_test_split(df_train, test_size=validation_ratio, random_state=randomstate)\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(df_train, df_test = df_validation, \n",
    "             labels_column = label_names, MLinfill = False, \n",
    "             shuffletrain = 'traintest', \n",
    "             randomseed = randomstate,\n",
    "             assigncat=assigncat, \\\n",
    "             assigninfill={'adjinfill':cont_names}, \\\n",
    "             assignparam = assignparam, \\\n",
    "             pandasoutput=True, printstatus=False)\n",
    "\n",
    "\n",
    "#this gives the validation set at the top\n",
    "#at 500k rows based on the validation_ratio selected for that value\n",
    "#and a noise set after\n",
    "train   = pd.concat([test, train], axis=0, ignore_index=True)\n",
    "trainID = pd.concat([testID, trainID], axis=0, ignore_index=True)\n",
    "labels  = pd.concat([testlabels, labels], axis=0, ignore_index=True)\n",
    "\n",
    "\n",
    "cont_names_final = finalcolumns_train\n",
    "y_names = list(labels)[0]\n",
    "\n",
    "#now do one train set without noise injection\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, postreports_dict = \\\n",
    "am.postmunge(postprocess_dict, df_train, \\\n",
    "             pandasoutput = True, printstatus = False, \\\n",
    "             traindata = False, \\\n",
    "             shuffletrain = True)\n",
    "\n",
    "train   = pd.concat([train, test], axis=0, ignore_index=True)\n",
    "trainID = pd.concat([trainID, testID], axis=0, ignore_index=True)\n",
    "labels  = pd.concat([labels,testlabels], axis=0, ignore_index=True)\n",
    "\n",
    "\n",
    "#now range(i) adds on i more noise sets to give i+1 noise and one without\n",
    "for i in range(pm_noise_sets):\n",
    "\n",
    "    test, testID, testlabels, \\\n",
    "    labelsencoding_dict, postreports_dict = \\\n",
    "    am.postmunge(postprocess_dict, df_train, \\\n",
    "                 pandasoutput = True, printstatus = False, \\\n",
    "                 traindata = True, \\\n",
    "                 shuffletrain = True)\n",
    "\n",
    "    train   = pd.concat([train, test], axis=0, ignore_index=True)\n",
    "    trainID = pd.concat([trainID, testID], axis=0, ignore_index=True)\n",
    "    labels  = pd.concat([labels,testlabels], axis=0, ignore_index=True)\n",
    "\n",
    "\n",
    "train = pd.concat([train, labels], axis = 1)\n",
    "\n",
    "\n",
    "#so this returns the first 500k rows a validation set\n",
    "#which is only in set once\n",
    "#and then the rest of rows are duplicated once without noise\n",
    "#and pm_noise_sets +1 times with noise\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>epoch</th>\n",
       "      <th>train_loss</th>\n",
       "      <th>valid_loss</th>\n",
       "      <th>roc_auc_score</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.545033</td>\n",
       "      <td>0.534529</td>\n",
       "      <td>0.806424</td>\n",
       "      <td>02:31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.539256</td>\n",
       "      <td>0.520077</td>\n",
       "      <td>0.817952</td>\n",
       "      <td>02:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.532003</td>\n",
       "      <td>0.511596</td>\n",
       "      <td>0.825538</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.514585</td>\n",
       "      <td>0.511314</td>\n",
       "      <td>0.829803</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.511734</td>\n",
       "      <td>0.499568</td>\n",
       "      <td>0.833291</td>\n",
       "      <td>02:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>0.496388</td>\n",
       "      <td>0.497327</td>\n",
       "      <td>0.835767</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>0.501063</td>\n",
       "      <td>0.493253</td>\n",
       "      <td>0.838157</td>\n",
       "      <td>02:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>0.488516</td>\n",
       "      <td>0.488767</td>\n",
       "      <td>0.842364</td>\n",
       "      <td>02:31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>0.499865</td>\n",
       "      <td>0.488598</td>\n",
       "      <td>0.841532</td>\n",
       "      <td>02:28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>0.478558</td>\n",
       "      <td>0.489666</td>\n",
       "      <td>0.841009</td>\n",
       "      <td>02:31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>0.462922</td>\n",
       "      <td>0.485601</td>\n",
       "      <td>0.844137</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11</td>\n",
       "      <td>0.473410</td>\n",
       "      <td>0.485555</td>\n",
       "      <td>0.844148</td>\n",
       "      <td>02:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>12</td>\n",
       "      <td>0.469831</td>\n",
       "      <td>0.484329</td>\n",
       "      <td>0.845035</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>13</td>\n",
       "      <td>0.469921</td>\n",
       "      <td>0.484253</td>\n",
       "      <td>0.845445</td>\n",
       "      <td>02:31</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#rev2\n",
    "\n",
    "# splits = RandomSplitter(valid_pct=validation_ratio)(range_of(train))\n",
    "\n",
    "#using a different validation split approach for noise than other experiments \n",
    "#to ensure noise injected data\n",
    "#isn't included in validation set\n",
    "#automunge call shuffles data based on randomseed\n",
    "#and the set without noise will be the top of the train set with index 0-etc\n",
    "splitter = IndexSplitter(list(range(623332)))\n",
    "splits = splitter(range_of(train))\n",
    "\n",
    "\n",
    "to = TabularPandas(train, procs=[],\n",
    "                   cont_names = cont_names_final,\n",
    "                   y_names = y_names,\n",
    "                   splits = splits)\n",
    "\n",
    "dls = to.dataloaders(bs=64)\n",
    "\n",
    "auc_metric = RocAucBinary()\n",
    "\n",
    "learn = tabular_learner(dls, layers= [300, 300, 300, 300, 300], metrics=auc_metric)\n",
    "\n",
    "#a batch size convention has already been established for these experiments \n",
    "#so not getting benefit out of second\n",
    "# learn.model = torch.nn.DataParallel(learn.model, device_ids=[0, 1])\n",
    "\n",
    "learn.fit_one_cycle(number_of_epochs)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Aug rev3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "randomstate = 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "cont_names = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28']\n",
    "label_names='0'\n",
    "\n",
    "assigncat={'DPrt':cont_names, 'lbos':label_names}\n",
    "#turning off inplace as expect the order of columns consistency with DPrt will be impacted by inplace\n",
    "assignparam={'default_assignparam' : {'DPrt' : {'flip_prob' : 0.03}}, \\\n",
    "             'global_assignparam' : {'inplace' : False}}\n",
    "\n",
    "\n",
    "df_train = df_train1.sample(frac = sample_ratio, random_state = randomstate)\n",
    "\n",
    "#have to use a different train_test_split approach\n",
    "#for noise injection\n",
    "#so we'll sp;lit out validation sets prior to passing to automunge\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "df_train, df_validation = \\\n",
    "train_test_split(df_train, test_size=validation_ratio, random_state=randomstate)\n",
    "\n",
    "\n",
    "train, trainID, labels, \\\n",
    "validation1, validationID1, validationlabels1, \\\n",
    "validation2, validationID2, validationlabels2, \\\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, finalcolumns_train, finalcolumns_test, \\\n",
    "featureimportance, postprocess_dict = \\\n",
    "am.automunge(df_train, df_test = df_validation, \n",
    "             labels_column = label_names, MLinfill = False, \n",
    "             shuffletrain = 'traintest', \n",
    "             randomseed = randomstate,\n",
    "             assigncat=assigncat, \\\n",
    "             assigninfill={'adjinfill':cont_names}, \\\n",
    "             assignparam = assignparam, \\\n",
    "             pandasoutput=True, printstatus=False)\n",
    "\n",
    "\n",
    "#this gives the validation set at the top\n",
    "#at 500k rows based on the validation_ratio selected for that value\n",
    "#and a noise set after\n",
    "train   = pd.concat([test, train], axis=0, ignore_index=True)\n",
    "trainID = pd.concat([testID, trainID], axis=0, ignore_index=True)\n",
    "labels  = pd.concat([testlabels, labels], axis=0, ignore_index=True)\n",
    "\n",
    "\n",
    "cont_names_final = finalcolumns_train\n",
    "y_names = list(labels)[0]\n",
    "\n",
    "#now do one train set without noise injection\n",
    "test, testID, testlabels, \\\n",
    "labelsencoding_dict, postreports_dict = \\\n",
    "am.postmunge(postprocess_dict, df_train, \\\n",
    "             pandasoutput = True, printstatus = False, \\\n",
    "             traindata = False, \\\n",
    "             shuffletrain = True)\n",
    "\n",
    "train   = pd.concat([train, test], axis=0, ignore_index=True)\n",
    "trainID = pd.concat([trainID, testID], axis=0, ignore_index=True)\n",
    "labels  = pd.concat([labels,testlabels], axis=0, ignore_index=True)\n",
    "\n",
    "\n",
    "#now range(i) adds on i more noise sets to give i+1 noise and one without\n",
    "for i in range(pm_noise_sets):\n",
    "\n",
    "    test, testID, testlabels, \\\n",
    "    labelsencoding_dict, postreports_dict = \\\n",
    "    am.postmunge(postprocess_dict, df_train, \\\n",
    "                 pandasoutput = True, printstatus = False, \\\n",
    "                 traindata = True, \\\n",
    "                 shuffletrain = True)\n",
    "\n",
    "    train   = pd.concat([train, test], axis=0, ignore_index=True)\n",
    "    trainID = pd.concat([trainID, testID], axis=0, ignore_index=True)\n",
    "    labels  = pd.concat([labels,testlabels], axis=0, ignore_index=True)\n",
    "\n",
    "\n",
    "train = pd.concat([train, labels], axis = 1)\n",
    "\n",
    "\n",
    "#so this returns the first 500k rows a validation set\n",
    "#which is only in set once\n",
    "#and then the rest of rows are duplicated once without noise\n",
    "#and pm_noise_sets +1 times with noise\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>epoch</th>\n",
       "      <th>train_loss</th>\n",
       "      <th>valid_loss</th>\n",
       "      <th>roc_auc_score</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.547295</td>\n",
       "      <td>0.540100</td>\n",
       "      <td>0.800405</td>\n",
       "      <td>02:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.545029</td>\n",
       "      <td>0.519142</td>\n",
       "      <td>0.818979</td>\n",
       "      <td>02:27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.530280</td>\n",
       "      <td>0.512104</td>\n",
       "      <td>0.825093</td>\n",
       "      <td>02:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.514123</td>\n",
       "      <td>0.505388</td>\n",
       "      <td>0.829559</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.504718</td>\n",
       "      <td>0.502608</td>\n",
       "      <td>0.831381</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>0.487939</td>\n",
       "      <td>0.505746</td>\n",
       "      <td>0.833069</td>\n",
       "      <td>02:26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>0.497974</td>\n",
       "      <td>0.494180</td>\n",
       "      <td>0.837937</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>0.498632</td>\n",
       "      <td>0.491298</td>\n",
       "      <td>0.839726</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>0.492909</td>\n",
       "      <td>0.488246</td>\n",
       "      <td>0.841725</td>\n",
       "      <td>02:31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>0.485205</td>\n",
       "      <td>0.488923</td>\n",
       "      <td>0.841672</td>\n",
       "      <td>02:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>0.473862</td>\n",
       "      <td>0.489880</td>\n",
       "      <td>0.841058</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11</td>\n",
       "      <td>0.473536</td>\n",
       "      <td>0.485343</td>\n",
       "      <td>0.844139</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>12</td>\n",
       "      <td>0.470401</td>\n",
       "      <td>0.484935</td>\n",
       "      <td>0.844704</td>\n",
       "      <td>02:31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>13</td>\n",
       "      <td>0.464088</td>\n",
       "      <td>0.485384</td>\n",
       "      <td>0.844888</td>\n",
       "      <td>02:29</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#rev3\n",
    "\n",
    "# splits = RandomSplitter(valid_pct=validation_ratio)(range_of(train))\n",
    "\n",
    "#using a different validation split approach for noise than other experiments \n",
    "#to ensure noise injected data\n",
    "#isn't included in validation set\n",
    "#automunge call shuffles data based on randomseed\n",
    "#and the set without noise will be the top of the train set with index 0-etc\n",
    "splitter = IndexSplitter(list(range(623332)))\n",
    "splits = splitter(range_of(train))\n",
    "\n",
    "\n",
    "to = TabularPandas(train, procs=[],\n",
    "                   cont_names = cont_names_final,\n",
    "                   y_names = y_names,\n",
    "                   splits = splits)\n",
    "\n",
    "dls = to.dataloaders(bs=64)\n",
    "\n",
    "auc_metric = RocAucBinary()\n",
    "\n",
    "learn = tabular_learner(dls, layers= [300, 300, 300, 300, 300], metrics=auc_metric)\n",
    "\n",
    "#a batch size convention has already been established for these experiments \n",
    "#so not getting benefit out of second\n",
    "# learn.model = torch.nn.DataParallel(learn.model, device_ids=[0, 1])\n",
    "\n",
    "learn.fit_one_cycle(number_of_epochs)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
