{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Claim 1: HS Improves Predictive Performance of TBM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/bro/Documents/FRI/MLDS/repro/MLDS/notebooks\n"
     ]
    }
   ],
   "source": [
    "# move to notebooks directory\n",
    "%cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "###IMPORTS\n",
    "\n",
    "# system path manipulations\n",
    "import os\n",
    "import sys\n",
    "\n",
    "# standard data science toolbox\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# train test splitting\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# standard DT and RF\n",
    "from sklearn.tree import export_text, DecisionTreeClassifier, DecisionTreeRegressor\n",
    "\n",
    "# authors implementations of HS\n",
    "import imodels\n",
    "\n",
    "# making deep copies of trees for improvement comparison\n",
    "from copy import deepcopy\n",
    "\n",
    "# cross-validation of models\n",
    "from sklearn.model_selection import StratifiedKFold, KFold\n",
    "\n",
    "# scoring\n",
    "from sklearn.metrics import roc_auc_score, r2_score\n",
    "\n",
    "# hyperparameter search\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "# timing algorithm execution\n",
    "import time\n",
    "\n",
    "# import datasets\n",
    "from utils.experiment_functions import get_datasets\n",
    "\n",
    "# count number of leaves\n",
    "from utils.experiment_functions import leaf_count\n",
    "\n",
    "# calculate best alpha\n",
    "from utils.experiment_functions import pick_alpha, pick_alpha_best\n",
    "\n",
    "###CONSTANTS\n",
    "\n",
    "# Datasets used for the experiments (location: paper_autors_repo/config/shrinkage/models.py)\n",
    "CLASSIFICATION_DATASET_NAMES = [\"heart\", \"breast-cancer\", \"haberman\", \"ionosphere\", \"diabetes\", \"german-credit\", \"juvenile\", \"recidivism\"]\n",
    "REGRESSION_DATASET_NAMES = ['friedman1', 'friedman3', \"diabetes-regr\", 'abalone', \"red-wine\", \"satellite-image\", \"california-housing\", \"music\"]\n",
    "\n",
    "# maximum number of leaves allowed for each DT (passed as max_leaf_nodes)\n",
    "num_of_leaves = [2, 4, 8, 12, 15, 20, 24, 28, 30, 32]\n",
    "\n",
    "# candidate regularization strengths for HS (the best one is picked by cross-validation)\n",
    "reg_hs = [0.1, 1.0, 10.0, 25.0, 50.0, 100.0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load classification tasks\n",
    "tasks_classification = get_datasets(CLASSIFICATION_DATASET_NAMES)\n",
    "\n",
    "# load regression tasks\n",
    "tasks_regression = get_datasets(REGRESSION_DATASET_NAMES)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2 A) Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset: recidivism, Sample: 99, Fold 0d 0\r"
     ]
    }
   ],
   "source": [
    "# Claim 1 (classification): AUC of CART vs. CART with hierarchical shrinkage (HS).\n",
    "#\n",
    "# Each repetition draws a fresh shuffled, stratified 3-fold split and evaluates only its first fold\n",
    "# (see the `break` at the end), i.e. one random ~2/3 train / ~1/3 test split per repetition.\n",
    "# The HS regularization strength is tuned with an inner 3-fold CV on the training part only.\n",
    "# NOTE(review): no random_state is set anywhere, so the numbers change from run to run.\n",
    "NUM_OF_BOOTSTRAP_SAMPS = 100\n",
    "\n",
    "# the \"tunning_*\" spelling is kept on purpose: these are the column names written to the CSV\n",
    "RESULT_COLUMNS = [\"task\", \"dataset\", \"boot_iter\", \"algorithm\", \"scoring\", \"n_leaves\", \"max_leaves\", \"regularization\", \"train_score\", \"test_score\",\n",
    "                  \"train_wall_time\", \"test_wall_time\", \"train_cpu_time\", \"test_cpu_time\", \"tunning_wall_time\", \"tunning_cpu_time\"]\n",
    "\n",
    "# rows are gathered in a list and converted at every checkpoint;\n",
    "# growing a DataFrame with pd.concat copies all previous rows on every append\n",
    "classification_rows = []\n",
    "classification_results = pd.DataFrame(columns = RESULT_COLUMNS)\n",
    "\n",
    "# assumes get_datasets returned a mapping keyed by the names in CLASSIFICATION_DATASET_NAMES - TODO confirm\n",
    "for dataset_name in CLASSIFICATION_DATASET_NAMES:\n",
    "    dataset = tasks_classification[dataset_name]\n",
    "    # the arrays do not depend on the repetition, so build them once per dataset\n",
    "    X, y = np.array(dataset.drop(\"label\", axis = 1)), np.array(dataset[\"label\"])\n",
    "\n",
    "    for samp in range(NUM_OF_BOOTSTRAP_SAMPS):\n",
    "        skf = StratifiedKFold(n_splits=3, shuffle=True)\n",
    "        for i, (train_index, test_index) in enumerate(skf.split(X, y)):\n",
    "            print(f\"Dataset: {dataset_name}, Sample: {samp}, Fold {i}\", end = \"\\r\")\n",
    "\n",
    "            X_train, y_train = X[train_index, :], y[train_index]\n",
    "            X_test, y_test = X[test_index, :], y[test_index]\n",
    "\n",
    "            for m in num_of_leaves:\n",
    "                # fields shared by the DT row and the HS row of this configuration\n",
    "                row_base = {\"task\": \"classification\", \"dataset\": dataset_name, \"boot_iter\": samp, \"scoring\": \"AUC\", \"max_leaves\": m}\n",
    "\n",
    "                ### CART ###\n",
    "\n",
    "                # measure train time\n",
    "                start_wall_time_train = time.time()\n",
    "                start_cpu_time_train = time.process_time()\n",
    "\n",
    "                m1 = DecisionTreeClassifier(max_leaf_nodes=m).fit(X_train, y_train)\n",
    "\n",
    "                end_wall_time_train = time.time()\n",
    "                end_cpu_time_train = time.process_time()\n",
    "\n",
    "                # measure test time\n",
    "                start_wall_time_test = time.time()\n",
    "                start_cpu_time_test = time.process_time()\n",
    "\n",
    "                # column 1 of predict_proba is used as the score passed to roc_auc_score\n",
    "                y_train_pred_dt = m1.predict_proba(X_train)[:, 1]\n",
    "                y_test_pred_dt = m1.predict_proba(X_test)[:, 1]\n",
    "\n",
    "                end_wall_time_test = time.time()\n",
    "                end_cpu_time_test = time.process_time()\n",
    "\n",
    "                classification_rows.append({\n",
    "                    **row_base,\n",
    "                    \"algorithm\": \"DT\",\n",
    "                    \"n_leaves\": leaf_count(m1),\n",
    "                    \"regularization\": \"None\",\n",
    "                    \"train_score\": roc_auc_score(y_train, y_train_pred_dt),\n",
    "                    \"test_score\": roc_auc_score(y_test, y_test_pred_dt),\n",
    "                    \"train_wall_time\": end_wall_time_train - start_wall_time_train,\n",
    "                    \"test_wall_time\": end_wall_time_test - start_wall_time_test,\n",
    "                    \"train_cpu_time\": end_cpu_time_train - start_cpu_time_train,\n",
    "                    \"test_cpu_time\": end_cpu_time_test - start_cpu_time_test,\n",
    "                    \"tunning_wall_time\": None,\n",
    "                    \"tunning_cpu_time\": None,\n",
    "                })\n",
    "\n",
    "                ### Hierarchical shrinkage ###\n",
    "\n",
    "                # measure tuning time\n",
    "                start_wall_time_tunning = time.time()\n",
    "                start_cpu_time_tunning = time.process_time()\n",
    "\n",
    "                cv_scores = {}\n",
    "                for reg_param in reg_hs:\n",
    "                    hs_skf = StratifiedKFold(n_splits=3, shuffle = True)\n",
    "                    fold_scores = []\n",
    "                    for cv_train_index, cv_val_index in hs_skf.split(X_train, y_train):\n",
    "                        # the split indices are positions within X_train / y_train, so they have to be\n",
    "                        # applied to the training arrays; indexing the full X / y would select unrelated\n",
    "                        # rows and let test samples leak into the tuning\n",
    "                        X_cv_train, y_cv_train = X_train[cv_train_index, :], y_train[cv_train_index]\n",
    "                        X_cv_val, y_cv_val = X_train[cv_val_index, :], y_train[cv_val_index]\n",
    "                        hs_cv_dt = DecisionTreeClassifier(max_leaf_nodes=m)\n",
    "                        hs_cv_dt.fit(X_cv_train, y_cv_train)\n",
    "                        hs_cv_dt = imodels.HSTreeClassifier(hs_cv_dt, reg_param=reg_param)\n",
    "                        y_val_pred = hs_cv_dt.predict_proba(X_cv_val)[:, 1]\n",
    "                        fold_scores.append(roc_auc_score(y_cv_val, y_val_pred))\n",
    "                    cv_scores[reg_param] = np.mean(fold_scores)\n",
    "                # highest mean validation AUC wins; ties go to the earliest entry of reg_hs\n",
    "                hs_reg_param = max(cv_scores, key=cv_scores.get)\n",
    "\n",
    "                end_wall_time_tunning = time.time()\n",
    "                end_cpu_time_tunning = time.process_time()\n",
    "\n",
    "                # evaluation of improvements offered by hierarchical shrinkage model\n",
    "                # measure train time: only wrapping a copy of the already fitted CART is timed here\n",
    "                start_wall_time_train = time.time()\n",
    "                start_cpu_time_train = time.process_time()\n",
    "\n",
    "                # NOTE(review): .fit() is never called on the wrapper - this presumably relies on imodels\n",
    "                # shrinking an already fitted estimator at construction time; confirm for the installed version\n",
    "                mshrunk = imodels.HSTreeClassifier(deepcopy(m1), reg_param=hs_reg_param)\n",
    "\n",
    "                end_wall_time_train = time.time()\n",
    "                end_cpu_time_train = time.process_time()\n",
    "\n",
    "                # measure test time\n",
    "                start_wall_time_test = time.time()\n",
    "                start_cpu_time_test = time.process_time()\n",
    "\n",
    "                y_train_pred_shrunk = mshrunk.predict_proba(X_train)[:, 1]\n",
    "                y_test_pred_shrunk = mshrunk.predict_proba(X_test)[:, 1]\n",
    "\n",
    "                end_wall_time_test = time.time()\n",
    "                end_cpu_time_test = time.process_time()\n",
    "\n",
    "                classification_rows.append({\n",
    "                    **row_base,\n",
    "                    \"algorithm\": \"HS (CART)\",\n",
    "                    \"n_leaves\": leaf_count(mshrunk.estimator_),\n",
    "                    \"regularization\": hs_reg_param,\n",
    "                    \"train_score\": roc_auc_score(y_train, y_train_pred_shrunk),\n",
    "                    \"test_score\": roc_auc_score(y_test, y_test_pred_shrunk),\n",
    "                    \"train_wall_time\": end_wall_time_train - start_wall_time_train,\n",
    "                    \"test_wall_time\": end_wall_time_test - start_wall_time_test,\n",
    "                    \"train_cpu_time\": end_cpu_time_train - start_cpu_time_train,\n",
    "                    \"test_cpu_time\": end_cpu_time_test - start_cpu_time_test,\n",
    "                    \"tunning_wall_time\": end_wall_time_tunning - start_wall_time_tunning,\n",
    "                    \"tunning_cpu_time\": end_cpu_time_tunning - start_cpu_time_tunning,\n",
    "                })\n",
    "\n",
    "            # checkpoint once per repetition so an interrupted run keeps everything finished so far\n",
    "            classification_results = pd.DataFrame(classification_rows, columns = RESULT_COLUMNS)\n",
    "            classification_results.to_csv(\"results/claim_1_1_dt_comparison_classification.csv\", index = False)\n",
    "\n",
    "            # only the first fold of every split is evaluated\n",
    "            break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2 B) Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset: red-wine, Sample: 99, Fold 099, Fold 0\r"
     ]
    }
   ],
   "source": [
    "# Claim 1 (regression): R2 of CART vs. CART with hierarchical shrinkage (HS).\n",
    "#\n",
    "# Each repetition draws a fresh shuffled 3-fold split and evaluates only its first fold\n",
    "# (see the `break` at the end), i.e. one random ~2/3 train / ~1/3 test split per repetition.\n",
    "# The HS regularization strength is tuned with an inner 3-fold CV on the training part only.\n",
    "# num_of_leaves and reg_hs are the lists defined in the constants cell.\n",
    "# NOTE(review): no random_state is set anywhere, so the numbers change from run to run.\n",
    "NUM_OF_BOOTSTRAP_SAMPS = 100\n",
    "\n",
    "# the \"tunning_*\" spelling is kept on purpose: these are the column names written to the CSV\n",
    "RESULT_COLUMNS = [\"task\", \"dataset\", \"boot_iter\", \"algorithm\", \"scoring\", \"n_leaves\", \"max_leaves\", \"regularization\", \"train_score\", \"test_score\",\n",
    "                  \"train_wall_time\", \"test_wall_time\", \"train_cpu_time\", \"test_cpu_time\", \"tunning_wall_time\", \"tunning_cpu_time\"]\n",
    "\n",
    "# rows are gathered in a list and converted at every checkpoint;\n",
    "# growing a DataFrame with pd.concat copies all previous rows on every append\n",
    "regression_rows = []\n",
    "regression_results = pd.DataFrame(columns = RESULT_COLUMNS)\n",
    "\n",
    "# assumes get_datasets returned a mapping keyed by the names in REGRESSION_DATASET_NAMES - TODO confirm\n",
    "# NOTE(review): the stored output of this cell ends at 'red-wine', so the earlier run may have covered\n",
    "# fewer datasets than REGRESSION_DATASET_NAMES - confirm the intended list\n",
    "for dataset_name in REGRESSION_DATASET_NAMES:\n",
    "    dataset = tasks_regression[dataset_name]\n",
    "    # the arrays do not depend on the repetition, so build them once per dataset\n",
    "    if dataset_name != \"music\":\n",
    "        X, y = np.array(dataset.drop(\"label\", axis = 1)), np.array(dataset[\"label\"])\n",
    "    else:\n",
    "        # the music dataset has two target columns, so y is two-dimensional here\n",
    "        X, y = np.array(dataset.drop([\"label1\", \"label2\"], axis = 1)), np.array(dataset[[\"label1\", \"label2\"]])\n",
    "\n",
    "    for samp in range(NUM_OF_BOOTSTRAP_SAMPS):\n",
    "        skf = KFold(n_splits=3, shuffle = True)\n",
    "        for i, (train_index, test_index) in enumerate(skf.split(X)):\n",
    "            print(f\"Dataset: {dataset_name}, Sample: {samp}, Fold {i}\", end = \"\\r\")\n",
    "\n",
    "            # indexing with an index array selects rows for both the 1-D and the two-column target\n",
    "            X_train, y_train = X[train_index, :], y[train_index]\n",
    "            X_test, y_test = X[test_index, :], y[test_index]\n",
    "\n",
    "            for m in num_of_leaves:\n",
    "                # fields shared by the DT row and the HS row of this configuration\n",
    "                row_base = {\"task\": \"regression\", \"dataset\": dataset_name, \"boot_iter\": samp, \"scoring\": \"R2\", \"max_leaves\": m}\n",
    "\n",
    "                ### CART ###\n",
    "\n",
    "                # measure train time\n",
    "                start_wall_time_train = time.time()\n",
    "                start_cpu_time_train = time.process_time()\n",
    "\n",
    "                m1 = DecisionTreeRegressor(max_leaf_nodes=m).fit(X_train, y_train)\n",
    "\n",
    "                end_wall_time_train = time.time()\n",
    "                end_cpu_time_train = time.process_time()\n",
    "\n",
    "                # measure test time\n",
    "                start_wall_time_test = time.time()\n",
    "                start_cpu_time_test = time.process_time()\n",
    "\n",
    "                y_train_pred_dt = m1.predict(X_train)\n",
    "                y_test_pred_dt = m1.predict(X_test)\n",
    "\n",
    "                end_wall_time_test = time.time()\n",
    "                end_cpu_time_test = time.process_time()\n",
    "\n",
    "                regression_rows.append({\n",
    "                    **row_base,\n",
    "                    \"algorithm\": \"DT\",\n",
    "                    \"n_leaves\": leaf_count(m1),\n",
    "                    \"regularization\": \"None\",\n",
    "                    \"train_score\": r2_score(y_train, y_train_pred_dt),\n",
    "                    \"test_score\": r2_score(y_test, y_test_pred_dt),\n",
    "                    \"train_wall_time\": end_wall_time_train - start_wall_time_train,\n",
    "                    \"test_wall_time\": end_wall_time_test - start_wall_time_test,\n",
    "                    \"train_cpu_time\": end_cpu_time_train - start_cpu_time_train,\n",
    "                    \"test_cpu_time\": end_cpu_time_test - start_cpu_time_test,\n",
    "                    \"tunning_wall_time\": None,\n",
    "                    \"tunning_cpu_time\": None,\n",
    "                })\n",
    "\n",
    "                ### Hierarchical shrinkage ###\n",
    "\n",
    "                # measure tuning time\n",
    "                start_wall_time_tunning = time.time()\n",
    "                start_cpu_time_tunning = time.process_time()\n",
    "\n",
    "                cv_scores = {}\n",
    "                for reg_param in reg_hs:\n",
    "                    hs_skf = KFold(n_splits=3, shuffle = True)\n",
    "                    fold_scores = []\n",
    "                    for cv_train_index, cv_val_index in hs_skf.split(X_train, y_train):\n",
    "                        # the split indices are positions within X_train / y_train, so they have to be\n",
    "                        # applied to the training arrays; indexing the full X / y would select unrelated\n",
    "                        # rows and let test samples leak into the tuning\n",
    "                        X_cv_train, y_cv_train = X_train[cv_train_index, :], y_train[cv_train_index]\n",
    "                        X_cv_val, y_cv_val = X_train[cv_val_index, :], y_train[cv_val_index]\n",
    "                        hs_cv_dt = DecisionTreeRegressor(max_leaf_nodes=m)\n",
    "                        hs_cv_dt.fit(X_cv_train, y_cv_train)\n",
    "                        hs_cv_dt = imodels.HSTreeRegressor(hs_cv_dt, reg_param=reg_param)\n",
    "                        y_val_pred = hs_cv_dt.predict(X_cv_val)\n",
    "                        fold_scores.append(r2_score(y_cv_val, y_val_pred))\n",
    "                    cv_scores[reg_param] = np.mean(fold_scores)\n",
    "                # highest mean validation R2 wins; ties go to the earliest entry of reg_hs\n",
    "                hs_reg_param = max(cv_scores, key=cv_scores.get)\n",
    "\n",
    "                end_wall_time_tunning = time.time()\n",
    "                end_cpu_time_tunning = time.process_time()\n",
    "\n",
    "                # evaluation of improvements offered by hierarchical shrinkage model\n",
    "                # measure train time: only wrapping a copy of the already fitted CART is timed here\n",
    "                start_wall_time_train = time.time()\n",
    "                start_cpu_time_train = time.process_time()\n",
    "\n",
    "                # NOTE(review): .fit() is never called on the wrapper - this presumably relies on imodels\n",
    "                # shrinking an already fitted estimator at construction time; confirm for the installed version\n",
    "                mshrunk = imodels.HSTreeRegressor(deepcopy(m1), reg_param=hs_reg_param)\n",
    "\n",
    "                end_wall_time_train = time.time()\n",
    "                end_cpu_time_train = time.process_time()\n",
    "\n",
    "                # measure test time\n",
    "                start_wall_time_test = time.time()\n",
    "                start_cpu_time_test = time.process_time()\n",
    "\n",
    "                y_train_pred_shrunk = mshrunk.predict(X_train)\n",
    "                y_test_pred_shrunk = mshrunk.predict(X_test)\n",
    "\n",
    "                end_wall_time_test = time.time()\n",
    "                end_cpu_time_test = time.process_time()\n",
    "\n",
    "                regression_rows.append({\n",
    "                    **row_base,\n",
    "                    \"algorithm\": \"HS (CART)\",\n",
    "                    \"n_leaves\": leaf_count(mshrunk.estimator_),\n",
    "                    \"regularization\": hs_reg_param,\n",
    "                    \"train_score\": r2_score(y_train, y_train_pred_shrunk),\n",
    "                    \"test_score\": r2_score(y_test, y_test_pred_shrunk),\n",
    "                    \"train_wall_time\": end_wall_time_train - start_wall_time_train,\n",
    "                    \"test_wall_time\": end_wall_time_test - start_wall_time_test,\n",
    "                    \"train_cpu_time\": end_cpu_time_train - start_cpu_time_train,\n",
    "                    \"test_cpu_time\": end_cpu_time_test - start_cpu_time_test,\n",
    "                    \"tunning_wall_time\": end_wall_time_tunning - start_wall_time_tunning,\n",
    "                    \"tunning_cpu_time\": end_cpu_time_tunning - start_cpu_time_tunning,\n",
    "                })\n",
    "\n",
    "            # checkpoint once per repetition so an interrupted run keeps everything finished so far\n",
    "            regression_results = pd.DataFrame(regression_rows, columns = RESULT_COLUMNS)\n",
    "            regression_results.to_csv(\"results/claim_1_1_dt_comparison_regression.csv\", index = False)\n",
    "\n",
    "            # only the first fold of every split is evaluated\n",
    "            break"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mlds",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:16:33) [MSC v.1929 64 bit (AMD64)]"
  },
  "vscode": {
   "interpreter": {
    "hash": "aaea2abca523dde69918745ab0433be939f84f870f3a8b0d9770b38003b437c2"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
