{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastai.tabular.all import *\n",
    "\n",
    "#missing data infill is handled by Automunge (adjinfill is assigned in the automunge calls of this notebook)\n",
    "#note exc2 is a pass-through transform that defaults to mode infill\n",
    "#(exc2 is not assigned in this notebook's cells; the features are assigned DPrt and the label lbos)\n",
    "\n",
    "from Automunge import Automunger\n",
    "#am is the Automunge instance whose automunge(...) method is called once per revision\n",
    "am = Automunger.AutoMunge()\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#number of epochs passed to fit_one_cycle; other values tried: 14, 28, 35, 42, 200\n",
    "number_of_epochs = 3\n",
    "\n",
    "#named (sample_ratio, validation_ratio) scenarios, selected by key instead of by commenting lines in and out\n",
    "#sample_ratio     -> frac passed to df_train1.sample(...)\n",
    "#validation_ratio -> valid_pct passed to RandomSplitter(...)\n",
    "data_presets = {\n",
    "    'full DATA'  : (1.0, 0.0567),\n",
    "    '5% DATA'    : (0.103833244329036, 0.545745953948647),\n",
    "    '0.25% DATA' : (0.05902490655, 0.9600451114),\n",
    "    'tiny data'  : (0.0001, 0.5),\n",
    "}\n",
    "\n",
    "#change this key to switch scenario\n",
    "data_preset = '0.25% DATA'\n",
    "\n",
    "sample_ratio, validation_ratio = data_presets[data_preset]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#absolute path to the HIGGS csv; adjust for the local environment\n",
    "path = \"/data/Benchmark_datasets/Higgs/HIGGS.csv\"\n",
    "\n",
    "#header=None: the file is read as having no header row, so pandas assigns integer column labels\n",
    "#df_train1 is the full table that each revision draws its sample from\n",
    "df_train1 = pd.read_csv(path, header=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Augmentation Experiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#seed passed as random_state to df_train1.sample(...) when the rev1 subsample is drawn\n",
    "#the RandomSplitter call for rev1 is not given a seed\n",
    "randomstate = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#column ids passed to automunge: '0' as the label, '1' through '28' as the numeric features\n",
    "cont_names = [str(col) for col in range(1, 29)]\n",
    "label_names = '0'\n",
    "\n",
    "#DPrt is assigned to every feature column, lbos to the label column\n",
    "assigncat = {'DPrt': cont_names, 'lbos': label_names}\n",
    "\n",
    "#turning off inplace as expect the order of columns consistency with DPrt will be impacted by inplace\n",
    "assignparam = {\n",
    "    'default_assignparam': {'DPrt': {'flip_prob': 0.03}},\n",
    "    'global_assignparam': {'inplace': False},\n",
    "}\n",
    "\n",
    "#draw this revision's subsample of the full table\n",
    "df_train = df_train1.sample(frac=sample_ratio, random_state=randomstate)\n",
    "\n",
    "#the same sample is supplied as both the train set and the test set\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df_train,\n",
    "    df_test=df_train,\n",
    "    labels_column=label_names,\n",
    "    MLinfill=False,\n",
    "    assigncat=assigncat,\n",
    "    assigninfill={'adjinfill': cont_names},\n",
    "    assignparam=assignparam,\n",
    "    pandasoutput=True,\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "#stack the processed train and test outputs (both derived from the same df_train sample)\n",
    "#NOTE(review): the two copies of a row can land on opposite sides of the later random split - confirm intended\n",
    "train = pd.concat([train, test], axis=0, ignore_index=True)\n",
    "trainID = pd.concat([trainID, testID], axis=0, ignore_index=True)\n",
    "labels = pd.concat([labels, testlabels], axis=0, ignore_index=True)\n",
    "\n",
    "#names consumed by the TabularPandas cell\n",
    "cont_names_final = finalcolumns_train\n",
    "y_names = list(labels)[0]\n",
    "\n",
    "#place the label column(s) next to the features\n",
    "train = pd.concat([train, labels], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>epoch</th>\n",
       "      <th>train_loss</th>\n",
       "      <th>valid_loss</th>\n",
       "      <th>roc_auc_score</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.607060</td>\n",
       "      <td>0.597545</td>\n",
       "      <td>0.740626</td>\n",
       "      <td>01:07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.579432</td>\n",
       "      <td>0.567804</td>\n",
       "      <td>0.773863</td>\n",
       "      <td>01:06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.549451</td>\n",
       "      <td>0.552961</td>\n",
       "      <td>0.791058</td>\n",
       "      <td>01:08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#rev1\n",
    "\n",
    "#random train/validation partition over the row positions of train\n",
    "row_splitter = RandomSplitter(valid_pct=validation_ratio)\n",
    "splits = row_splitter(range_of(train))\n",
    "\n",
    "#procs is empty: no fastai preprocessing is applied on top of the automunge output\n",
    "to = TabularPandas(\n",
    "    train,\n",
    "    procs=[],\n",
    "    cont_names=cont_names_final,\n",
    "    y_names=y_names,\n",
    "    splits=splits,\n",
    ")\n",
    "\n",
    "dls = to.dataloaders(bs=64)\n",
    "\n",
    "#layers of width 300, five of them; ROC AUC is reported each epoch\n",
    "hidden_layers = [300] * 5\n",
    "auc_metric = RocAucBinary()\n",
    "learn = tabular_learner(dls, layers=hidden_layers, metrics=auc_metric)\n",
    "\n",
    "learn.fit_one_cycle(number_of_epochs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Augmentation Experiment: rev2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#seed passed as random_state to df_train1.sample(...) when the rev2 subsample is drawn\n",
    "#the RandomSplitter call for rev2 is not given a seed\n",
    "randomstate = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#column ids passed to automunge: '0' as the label, '1' through '28' as the numeric features\n",
    "cont_names = [str(col) for col in range(1, 29)]\n",
    "label_names = '0'\n",
    "\n",
    "#DPrt is assigned to every feature column, lbos to the label column\n",
    "assigncat = {'DPrt': cont_names, 'lbos': label_names}\n",
    "\n",
    "#turning off inplace as expect the order of columns consistency with DPrt will be impacted by inplace\n",
    "assignparam = {\n",
    "    'default_assignparam': {'DPrt': {'flip_prob': 0.03}},\n",
    "    'global_assignparam': {'inplace': False},\n",
    "}\n",
    "\n",
    "#draw this revision's subsample of the full table\n",
    "df_train = df_train1.sample(frac=sample_ratio, random_state=randomstate)\n",
    "\n",
    "#the same sample is supplied as both the train set and the test set\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df_train,\n",
    "    df_test=df_train,\n",
    "    labels_column=label_names,\n",
    "    MLinfill=False,\n",
    "    assigncat=assigncat,\n",
    "    assigninfill={'adjinfill': cont_names},\n",
    "    assignparam=assignparam,\n",
    "    pandasoutput=True,\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "#stack the processed train and test outputs (both derived from the same df_train sample)\n",
    "#NOTE(review): the two copies of a row can land on opposite sides of the later random split - confirm intended\n",
    "train = pd.concat([train, test], axis=0, ignore_index=True)\n",
    "trainID = pd.concat([trainID, testID], axis=0, ignore_index=True)\n",
    "labels = pd.concat([labels, testlabels], axis=0, ignore_index=True)\n",
    "\n",
    "#names consumed by the TabularPandas cell\n",
    "cont_names_final = finalcolumns_train\n",
    "y_names = list(labels)[0]\n",
    "\n",
    "#place the label column(s) next to the features\n",
    "train = pd.concat([train, labels], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>epoch</th>\n",
       "      <th>train_loss</th>\n",
       "      <th>valid_loss</th>\n",
       "      <th>roc_auc_score</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.610025</td>\n",
       "      <td>0.599825</td>\n",
       "      <td>0.742867</td>\n",
       "      <td>01:06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.578264</td>\n",
       "      <td>0.564275</td>\n",
       "      <td>0.779037</td>\n",
       "      <td>01:08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.546902</td>\n",
       "      <td>0.551217</td>\n",
       "      <td>0.790496</td>\n",
       "      <td>01:06</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#rev2\n",
    "\n",
    "#random train/validation partition over the row positions of train\n",
    "row_splitter = RandomSplitter(valid_pct=validation_ratio)\n",
    "splits = row_splitter(range_of(train))\n",
    "\n",
    "#procs is empty: no fastai preprocessing is applied on top of the automunge output\n",
    "to = TabularPandas(\n",
    "    train,\n",
    "    procs=[],\n",
    "    cont_names=cont_names_final,\n",
    "    y_names=y_names,\n",
    "    splits=splits,\n",
    ")\n",
    "\n",
    "dls = to.dataloaders(bs=64)\n",
    "\n",
    "#layers of width 300, five of them; ROC AUC is reported each epoch\n",
    "hidden_layers = [300] * 5\n",
    "auc_metric = RocAucBinary()\n",
    "learn = tabular_learner(dls, layers=hidden_layers, metrics=auc_metric)\n",
    "\n",
    "learn.fit_one_cycle(number_of_epochs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Augmentation Experiment: rev3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#seed passed as random_state to df_train1.sample(...) when the rev3 subsample is drawn\n",
    "#the RandomSplitter call for rev3 is not given a seed\n",
    "randomstate = 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "#column ids passed to automunge: '0' as the label, '1' through '28' as the numeric features\n",
    "cont_names = [str(col) for col in range(1, 29)]\n",
    "label_names = '0'\n",
    "\n",
    "#DPrt is assigned to every feature column, lbos to the label column\n",
    "assigncat = {'DPrt': cont_names, 'lbos': label_names}\n",
    "\n",
    "#turning off inplace as expect the order of columns consistency with DPrt will be impacted by inplace\n",
    "assignparam = {\n",
    "    'default_assignparam': {'DPrt': {'flip_prob': 0.03}},\n",
    "    'global_assignparam': {'inplace': False},\n",
    "}\n",
    "\n",
    "#draw this revision's subsample of the full table\n",
    "df_train = df_train1.sample(frac=sample_ratio, random_state=randomstate)\n",
    "\n",
    "#the same sample is supplied as both the train set and the test set\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df_train,\n",
    "    df_test=df_train,\n",
    "    labels_column=label_names,\n",
    "    MLinfill=False,\n",
    "    assigncat=assigncat,\n",
    "    assigninfill={'adjinfill': cont_names},\n",
    "    assignparam=assignparam,\n",
    "    pandasoutput=True,\n",
    "    printstatus=False,\n",
    ")\n",
    "\n",
    "#stack the processed train and test outputs (both derived from the same df_train sample)\n",
    "#NOTE(review): the two copies of a row can land on opposite sides of the later random split - confirm intended\n",
    "train = pd.concat([train, test], axis=0, ignore_index=True)\n",
    "trainID = pd.concat([trainID, testID], axis=0, ignore_index=True)\n",
    "labels = pd.concat([labels, testlabels], axis=0, ignore_index=True)\n",
    "\n",
    "#names consumed by the TabularPandas cell\n",
    "cont_names_final = finalcolumns_train\n",
    "y_names = list(labels)[0]\n",
    "\n",
    "#place the label column(s) next to the features\n",
    "train = pd.concat([train, labels], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>epoch</th>\n",
       "      <th>train_loss</th>\n",
       "      <th>valid_loss</th>\n",
       "      <th>roc_auc_score</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.612105</td>\n",
       "      <td>0.593214</td>\n",
       "      <td>0.750773</td>\n",
       "      <td>01:06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.564844</td>\n",
       "      <td>0.571336</td>\n",
       "      <td>0.772769</td>\n",
       "      <td>01:06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.542032</td>\n",
       "      <td>0.551158</td>\n",
       "      <td>0.789646</td>\n",
       "      <td>01:08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#rev3\n",
    "\n",
    "#random train/validation partition over the row positions of train\n",
    "splits = RandomSplitter(valid_pct=validation_ratio)(range_of(train))\n",
    "\n",
    "#procs is empty: no fastai preprocessing is applied on top of the automunge output\n",
    "to = TabularPandas(train, procs=[],\n",
    "                   cont_names = cont_names_final,\n",
    "                   y_names = y_names,\n",
    "                   splits = splits)\n",
    "\n",
    "dls = to.dataloaders(bs=64)\n",
    "\n",
    "#ROC AUC is reported alongside the losses each epoch\n",
    "auc_metric = RocAucBinary()\n",
    "\n",
    "learn = tabular_learner(dls, layers= [300, 300, 300, 300, 300], metrics=auc_metric)\n",
    "\n",
    "#train is left unmodified by this cell, so the cell can be re-run as is\n",
    "learn.fit_one_cycle(number_of_epochs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
