{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from Automunge import Automunger\n",
    "am = Automunger.AutoMunge()\n",
    "\n",
    "from fastai.tabular.all import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#I wasn't sure about distribution rights for the complete data set\n",
    "#so just using a 10,000 row excerpt for this demonstration\n",
    "#note the full data used for training was 8,823,544 samples\n",
    "#from which a 500,000 row validation set was extracted\n",
    "#available online at https://archive.ics.uci.edu/ml/datasets/HIGGS\n",
    "\n",
    "path = \"Higgs_data_partial.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv(path, header='infer')\n",
    "\n",
    "df_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#automunge preprocessing is configured below through several assigncat scenarios\n",
    "\n",
    "#column headers: column 0 holds the label, columns 1 through 28 hold the features\n",
    "\n",
    "label = 0\n",
    "features = list(range(1, 29))\n",
    "\n",
    "#each assigncat scenario assigns a transformation category to the feature and label columns\n",
    "\n",
    "#scenario 1: exc2 is a passthrough transformation which applies a default mode infill\n",
    "#lbos is a label ordinal encoding followed by conversion to string,\n",
    "#which helps fastai recognize that this is a target for classification instead of regression\n",
    "assigncat1 = {'exc2': features, 'lbos': label}\n",
    "\n",
    "#scenario 2: nmbr is a z-score normalization\n",
    "assigncat2 = {'nmbr': features, 'lbos': label}\n",
    "\n",
    "#scenario 3: retn is retain normalization\n",
    "assigncat3 = {'retn': features, 'lbos': label}\n",
    "\n",
    "#scenario 4: rtbs is a family tree we'll populate for retain with bins\n",
    "assigncat4 = {'rtbs': features, 'lbos': label}\n",
    "\n",
    "#family tree for the rtbs category\n",
    "#this will return columns column_retn and column_retn_bins (set)\n",
    "transformdict = {\n",
    "    'rtbs': {\n",
    "        'parents': ['rtbs'],\n",
    "        'siblings': [],\n",
    "        'auntsuncles': [],\n",
    "        'cousins': [],\n",
    "        'children': [],\n",
    "        'niecesnephews': [],\n",
    "        'coworkers': [],\n",
    "        'friends': ['bins'],\n",
    "    },\n",
    "}\n",
    "\n",
    "#processing functions and data type properties registered for the rtbs category\n",
    "processdict = {\n",
    "    'rtbs': {\n",
    "        'dualprocess': am.process_retn_class,\n",
    "        'singleprocess': None,\n",
    "        'postprocess': am.postprocess_retn_class,\n",
    "        'NArowtype': 'numeric',\n",
    "        'MLinfilltype': 'numeric',\n",
    "        'labelctgy': 'retn',\n",
    "    },\n",
    "}\n",
    "\n",
    "#DPrt is retain normalization with noise injection\n",
    "#the two DPrt scenarios differ only in the flip_prob parameter\n",
    "\n",
    "#first DPrt scenario: inject to 100% of data\n",
    "assigncat5 = {'DPrt': features, 'lbos': label}\n",
    "assignparam5 = {\n",
    "    'default_assignparam': {'DPrt': {'flip_prob': 1.0}},\n",
    "}\n",
    "\n",
    "#second DPrt scenario: inject to 3% of data\n",
    "assigncat6 = {'DPrt': features, 'lbos': label}\n",
    "assignparam6 = {\n",
    "    'default_assignparam': {'DPrt': {'flip_prob': 0.03}},\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#now we can preprocess our data for one of these scenarios to run our experiments\n",
    "\n",
    "#set SCENARIO to choose which experiment to run:\n",
    "#  1: exc2\n",
    "#  2: nmbr\n",
    "#  3: retn\n",
    "#  4: retn with standard deviation bins\n",
    "#  5: DPrt with full noise injection\n",
    "#  6: DPrt with partial noise injection\n",
    "SCENARIO = 1\n",
    "\n",
    "#keyword arguments to am.automunge that differ between scenarios\n",
    "#(arguments shared by every scenario are passed directly in the call below)\n",
    "SCENARIO_KWARGS = {\n",
    "    1: {'assigncat': assigncat1},\n",
    "    2: {'assigncat': assigncat2},\n",
    "    3: {'assigncat': assigncat3},\n",
    "    4: {'assigncat': assigncat4,\n",
    "        'transformdict': transformdict,\n",
    "        'processdict': processdict},\n",
    "    5: {'assigncat': assigncat5, 'assignparam': assignparam5},\n",
    "    6: {'assigncat': assigncat6, 'assignparam': assignparam6},\n",
    "}\n",
    "\n",
    "#a single call serves every scenario, so only the selected scenario's arguments vary\n",
    "(train, trainID, labels,\n",
    " validation1, validationID1, validationlabels1,\n",
    " validation2, validationID2, validationlabels2,\n",
    " test, testID, testlabels,\n",
    " labelsencoding_dict, finalcolumns_train, finalcolumns_test,\n",
    " featureimportance, postprocess_dict) = am.automunge(\n",
    "    df_train, labels_column = label,\n",
    "    assigninfill={'adjinfill':features},\n",
    "    pandasoutput=True, printstatus=True,\n",
    "    **SCENARIO_KWARGS[SCENARIO])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#the postprocess_dict returned should be saved if we want to later consistently process additional data\n",
    "\n",
    "# import pickle\n",
    "\n",
    "# #backup postprocess_dict on disk\n",
    "# with open('filename.pkl', 'wb') as download:\n",
    "#     pickle.dump(postprocess_dict, download)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#two tweaks on the returned data to support fastai conventions\n",
    "\n",
    "#first we'll need the columns for continuous and categoric\n",
    "\n",
    "cont_names = postprocess_dict['columntype_report']['continuous']\n",
    "cat_names = postprocess_dict['columntype_report']['ordinal']\n",
    "y_names = list(labels)[0]\n",
    "\n",
    "#and second we'll want labels included as part of the train set dataframe\n",
    "#(any label columns already present in train are dropped first,\n",
    "#so re-running this cell does not append duplicate label columns)\n",
    "\n",
    "train = pd.concat([train.drop(columns=list(labels), errors='ignore'), labels], axis = 1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#fastai uses a splits function to assign validation rows\n",
    "#(in full trial this was set based on selecting 500k samples)\n",
    "#a fixed seed keeps the validation rows the same between runs and between scenarios\n",
    "\n",
    "splits = RandomSplitter(valid_pct=0.05, seed=42)(range_of(train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#then load the dataframe\n",
    "#in our experiments we did not use categorify for scenario 4 with the bins\n",
    "\n",
    "to = TabularPandas(train, procs=[],\n",
    "                   cont_names = cont_names,\n",
    "                   y_names = y_names,\n",
    "                   splits = splits)\n",
    "\n",
    "dls = to.dataloaders(bs=64)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#initialize the auc metric:\n",
    "\n",
    "auc_metric = RocAucBinary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#then initialize the model\n",
    "\n",
    "learn = tabular_learner(dls, layers= [300, 300, 300, 300, 300], metrics=auc_metric)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#fit the model\n",
    "#here 1 is the number of epochs\n",
    "learn.fit_one_cycle(1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
