{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ec04a714-1e4c-4976-99be-74bb3a0a6619",
   "metadata": {},
   "source": [
    "# Claim 2: HS is better than other regularization methods for tree-based models (TBM)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5f274741-0b18-44e5-8ee6-793cc39335ff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/bro/Documents/FRI/MLDS/repro/MLDS/notebooks\n"
     ]
    }
   ],
   "source": [
    "# move to notebooks directory\n",
    "%cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "078ff059-3051-4d0a-aa37-cd3d0bcdb7f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "###IMPORTS\n",
    "\n",
    "# system path manipulations\n",
    "import os\n",
    "import sys\n",
    "\n",
    "# standard data science toolbox\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# train test splitting\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# standard DT and RF\n",
    "from sklearn.tree import export_text, DecisionTreeClassifier, DecisionTreeRegressor\n",
    "\n",
    "# sklearn baseline random forest \n",
    "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n",
    "\n",
    "# authors implementations of HS\n",
    "import imodels\n",
    "\n",
    "# this was used to get the datasets\n",
    "from imodels.util.data_util import get_clean_dataset \n",
    "\n",
    "# making deep copies of trees for improvement comparison\n",
    "from copy import deepcopy\n",
    "\n",
    "# cross-validation of models\n",
    "from sklearn.model_selection import StratifiedKFold, KFold\n",
    "\n",
    "# scoring\n",
    "from sklearn.metrics import roc_auc_score, r2_score, make_scorer\n",
    "\n",
    "# hyperparameter search\n",
    "from sklearn.model_selection import GridSearchCV, cross_val_score\n",
    "\n",
    "# timing algorithm execution\n",
    "import time\n",
    "\n",
    "# import datasets\n",
    "from utils.experiment_functions import get_datasets\n",
    "\n",
    "# count number of leaves\n",
    "from utils.experiment_functions import leaf_count\n",
    "\n",
    "# calculate best alpha\n",
    "from utils.experiment_functions import pick_alpha, pick_alpha_best\n",
    "\n",
    "# hyperparameter tuning\n",
    "from skopt import gp_minimize\n",
    "\n",
    "# hierarchical shrinkage\n",
    "from imodels import HSTreeClassifier, HSTreeClassifierCV \n",
    "\n",
    "# bayesian-additive regression models (BART)\n",
    "from bartpy.sklearnmodel import SklearnModel\n",
    "\n",
    "###CONSTANTS\n",
    "\n",
    "# Datasets used for experiment (location in author repo: github.com/Yu-Group/imodels-experiments/config/shrinkage/models.py)\n",
    "CLASSIFICATION_DATASET_NAMES = [\"heart\", \"breast-cancer\", \"haberman\", \"ionosphere\", \"diabetes\", \"german-credit\", \"juvenile\", \"recidivism\"]\n",
    "REGRESSION_DATASET_NAMES = [\"red-wine\", \"california-housing\"]\n",
    "\n",
    "# numbers of leaves used for DT\n",
    "num_of_leaves = [2, 4, 8, 12, 15, 20, 24, 28, 30, 32]\n",
    "\n",
    "# regularization parameter used for HS\n",
    "reg_hs = [0.1, 1.0, 10.0, 25.0, 50.0, 100.0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0ccc8087-2884-45c7-af90-e2989e5fd514",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load classification tasks\n",
    "tasks_classification = get_datasets(CLASSIFICATION_DATASET_NAMES)\n",
    "\n",
    "# load regression tasks\n",
    "tasks_regression = get_datasets(REGRESSION_DATASET_NAMES)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d09a5d0b-d421-4d67-a769-e4cb62cece07",
   "metadata": {},
   "source": [
    "# A) Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4fa18f7f-d26e-48a9-b274-ad73ff376c6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# number of samples per each (dataset, num.leaves, algorithm)\n",
    "NUM_OF_BOOTSTRAP_SAMPS = 30\n",
    "\n",
    "def dt_regularization_comparison(dataset_names, task_type, save_to):\n",
    "    \"\"\"Compare CCP-pruned CART classifiers with hierarchical shrinkage (HS) applied on top of them.\n",
    "\n",
    "    For each dataset, NUM_OF_BOOTSTRAP_SAMPS random train/test splits are drawn; each one is\n",
    "    the first fold of a freshly shuffled stratified 3-fold split. For every leaf budget in\n",
    "    `num_of_leaves` the function\n",
    "      1. picks `ccp_alpha` with `pick_alpha` and fits a DecisionTreeClassifier with it,\n",
    "      2. selects the HS `reg_param` from `reg_hs` by 3-fold CV inside the training split,\n",
    "      3. wraps a copy of the CCP tree in `imodels.HSTreeClassifier` with that `reg_param`,\n",
    "      4. records train/test AUC and wall/CPU timings of both models.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    dataset_names : list of str\n",
    "        Names handed to `get_datasets`; every loaded frame needs a \"label\" column.\n",
    "    task_type : str\n",
    "        Not used at the moment: the \"task\" column is always \"classification\".\n",
    "    save_to : str\n",
    "        CSV path; the file is rewritten after every leaf budget as a checkpoint.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per (dataset, sample, leaf budget, algorithm).\n",
    "    \"\"\"\n",
    "    columns = [\"task\", \"dataset\", \"boot_iter\", \"algorithm\", \"scoring\", \"n_leaves\", \"max_leaves\", \"regularization\",\n",
    "               \"train_score\", \"test_score\", \"train_wall_time\", \"test_wall_time\", \"train_cpu_time\", \"test_cpu_time\",\n",
    "               \"tunning_wall_time\", \"tunning_cpu_time\"]\n",
    "\n",
    "    tasks = get_datasets(dataset_names)\n",
    "\n",
    "    # rows are collected in a list; growing a DataFrame with pd.concat inside the loop is quadratic\n",
    "    records = []\n",
    "\n",
    "    for task in dataset_names:\n",
    "        # features / labels do not change between samples\n",
    "        X, y = np.array(tasks[task].drop(\"label\", axis=1)), np.array(tasks[task][\"label\"])\n",
    "\n",
    "        for samp in range(NUM_OF_BOOTSTRAP_SAMPS):\n",
    "            skf = StratifiedKFold(n_splits=3, shuffle=True)\n",
    "            # only the first fold of the shuffled split is evaluated\n",
    "            train_index, test_index = next(skf.split(X, y))\n",
    "            print(f\"Dataset: {task}, Sample: {samp}, Fold 0\", end=\"\\r\")\n",
    "\n",
    "            X_train, y_train = X[train_index, :], y[train_index]\n",
    "            X_test, y_test = X[test_index, :], y[test_index]\n",
    "\n",
    "            for m in num_of_leaves:\n",
    "\n",
    "                ### CART with CCP ###\n",
    "\n",
    "                # measure tuning time\n",
    "                wall_start, cpu_start = time.time(), time.process_time()\n",
    "                best_alpha = pick_alpha(X_train, y_train, m, DecisionTreeClassifier)\n",
    "                tuning_wall, tuning_cpu = time.time() - wall_start, time.process_time() - cpu_start\n",
    "\n",
    "                # measure train time\n",
    "                wall_start, cpu_start = time.time(), time.process_time()\n",
    "                mccp = DecisionTreeClassifier(ccp_alpha=best_alpha).fit(X_train, y_train)\n",
    "                train_wall, train_cpu = time.time() - wall_start, time.process_time() - cpu_start\n",
    "\n",
    "                # measure test time\n",
    "                wall_start, cpu_start = time.time(), time.process_time()\n",
    "                y_train_pred_ccp = mccp.predict_proba(X_train)[:, 1]\n",
    "                y_test_pred_ccp = mccp.predict_proba(X_test)[:, 1]\n",
    "                test_wall, test_cpu = time.time() - wall_start, time.process_time() - cpu_start\n",
    "\n",
    "                records.append({\"task\": \"classification\",\n",
    "                                \"dataset\": task,\n",
    "                                \"boot_iter\": samp,\n",
    "                                \"algorithm\": \"CCP\",\n",
    "                                \"scoring\": \"AUC\",\n",
    "                                \"n_leaves\": leaf_count(mccp),\n",
    "                                \"max_leaves\": m,\n",
    "                                \"regularization\": best_alpha,\n",
    "                                \"train_score\": roc_auc_score(y_train, y_train_pred_ccp),\n",
    "                                \"test_score\": roc_auc_score(y_test, y_test_pred_ccp),\n",
    "                                \"train_wall_time\": train_wall,\n",
    "                                \"test_wall_time\": test_wall,\n",
    "                                \"train_cpu_time\": train_cpu,\n",
    "                                \"test_cpu_time\": test_cpu,\n",
    "                                \"tunning_wall_time\": tuning_wall,\n",
    "                                \"tunning_cpu_time\": tuning_cpu})\n",
    "\n",
    "                ### Hierarchical shrinkage (CCP) ###\n",
    "\n",
    "                # measure tuning time\n",
    "                wall_start, cpu_start = time.time(), time.process_time()\n",
    "\n",
    "                cv_scores = {}\n",
    "                for reg_param in reg_hs:\n",
    "                    hs_skf = StratifiedKFold(n_splits=3, shuffle=True)\n",
    "                    fold_scores = []\n",
    "                    for cv_train_index, cv_val_index in hs_skf.split(X_train, y_train):\n",
    "                        # the CV indices are relative to the training split, so they must index\n",
    "                        # X_train / y_train; indexing X / y would pick unrelated (possibly test) rows\n",
    "                        X_cv_train, y_cv_train = X_train[cv_train_index, :], y_train[cv_train_index]\n",
    "                        X_cv_val, y_cv_val = X_train[cv_val_index, :], y_train[cv_val_index]\n",
    "                        cv_alpha = pick_alpha(X_cv_train, y_cv_train, m, DecisionTreeClassifier)\n",
    "                        cv_tree = DecisionTreeClassifier(max_leaf_nodes=m, ccp_alpha=cv_alpha)\n",
    "                        cv_tree.fit(X_cv_train, y_cv_train)\n",
    "                        cv_hs = imodels.HSTreeClassifier(cv_tree, reg_param=reg_param)\n",
    "                        y_val_pred = cv_hs.predict_proba(X_cv_val)[:, 1]\n",
    "                        fold_scores.append(roc_auc_score(y_cv_val, y_val_pred))\n",
    "                    cv_scores[reg_param] = np.mean(fold_scores)\n",
    "\n",
    "                # first reg_param (in reg_hs order) that reaches the best mean CV score\n",
    "                hs_reg_param = max(cv_scores, key=cv_scores.get)\n",
    "\n",
    "                tuning_wall, tuning_cpu = time.time() - wall_start, time.process_time() - cpu_start\n",
    "\n",
    "                # evaluation of improvements offered by hierarchical shrinkage model\n",
    "\n",
    "                # measure train time\n",
    "                wall_start, cpu_start = time.time(), time.process_time()\n",
    "                # no explicit fit: the wrapper receives an already fitted tree (presumably shrinkage is\n",
    "                # applied on construction - TODO confirm for the installed imodels version)\n",
    "                mshrunk = imodels.HSTreeClassifier(deepcopy(mccp), reg_param=hs_reg_param)\n",
    "                train_wall, train_cpu = time.time() - wall_start, time.process_time() - cpu_start\n",
    "\n",
    "                # measure test time\n",
    "                wall_start, cpu_start = time.time(), time.process_time()\n",
    "                y_train_pred_shrunk = mshrunk.predict_proba(X_train)[:, 1]\n",
    "                y_test_pred_shrunk = mshrunk.predict_proba(X_test)[:, 1]\n",
    "                test_wall, test_cpu = time.time() - wall_start, time.process_time() - cpu_start\n",
    "\n",
    "                records.append({\"task\": \"classification\",\n",
    "                                \"dataset\": task,\n",
    "                                \"boot_iter\": samp,\n",
    "                                \"algorithm\": \"HS (CART-CCP)\",\n",
    "                                \"scoring\": \"AUC\",\n",
    "                                \"n_leaves\": leaf_count(mshrunk.estimator_),\n",
    "                                \"max_leaves\": m,\n",
    "                                \"regularization\": hs_reg_param,\n",
    "                                \"train_score\": roc_auc_score(y_train, y_train_pred_shrunk),\n",
    "                                \"test_score\": roc_auc_score(y_test, y_test_pred_shrunk),\n",
    "                                \"train_wall_time\": train_wall,\n",
    "                                \"test_wall_time\": test_wall,\n",
    "                                \"train_cpu_time\": train_cpu,\n",
    "                                \"test_cpu_time\": test_cpu,\n",
    "                                \"tunning_wall_time\": tuning_wall,\n",
    "                                \"tunning_cpu_time\": tuning_cpu})\n",
    "\n",
    "                # checkpoint so that partial results survive an interrupted run\n",
    "                pd.DataFrame(records, columns=columns).to_csv(save_to, index=False)\n",
    "\n",
    "    return pd.DataFrame(records, columns=columns)\n",
    "                \n",
    "dt_classification = dt_regularization_comparison(CLASSIFICATION_DATASET_NAMES, \"classification\", \"results/claim_1_1_ccp_comparison_classification.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8fb7ce6b-802b-4fc6-9532-6fdb70532bff",
   "metadata": {},
   "source": [
    "# B) Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ed1728f-2841-4684-98bb-6e2c29bbfa68",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset: california-housing, Sample: 0, Fold 0\r"
     ]
    }
   ],
   "source": [
    "# numbers of leaves used in paper\n",
    "num_of_leaves = [2, 4, 8, 12, 15, 20, 24, 28, 30, 32]\n",
    "# regularization parameters tried for HS\n",
    "reg_hs = [0.1, 1.0, 10.0, 25.0, 50.0, 100.0]\n",
    "\n",
    "# Potential problem: the HS parameter appears to be chosen via CV (hopefully they split the dataset beforehand)\n",
    "NUM_OF_BOOTSTRAP_SAMPS = 10\n",
    "\n",
    "result_columns = [\"task\", \"dataset\", \"boot_iter\", \"algorithm\", \"scoring\", \"n_leaves\", \"max_leaves\", \"regularization\",\n",
    "                  \"train_score\", \"test_score\", \"train_wall_time\", \"test_wall_time\", \"train_cpu_time\", \"test_cpu_time\",\n",
    "                  \"tunning_wall_time\", \"tunning_cpu_time\"]\n",
    "\n",
    "# rows are collected in a list; growing a DataFrame with pd.concat inside the loop is quadratic\n",
    "regression_records = []\n",
    "regression_results = pd.DataFrame(columns=result_columns)\n",
    "\n",
    "for task in REGRESSION_DATASET_NAMES:\n",
    "    # \"music\" is skipped by this experiment\n",
    "    if task == \"music\":\n",
    "        continue\n",
    "\n",
    "    # features / labels do not change between samples\n",
    "    X, y = np.array(tasks_regression[task].drop(\"label\", axis=1)), np.array(tasks_regression[task][\"label\"])\n",
    "\n",
    "    for samp in range(NUM_OF_BOOTSTRAP_SAMPS):\n",
    "        skf = KFold(n_splits=3, shuffle=True)\n",
    "        # only the first fold of the shuffled split is evaluated\n",
    "        train_index, test_index = next(skf.split(X))\n",
    "        print(f\"Dataset: {task}, Sample: {samp}, Fold 0\", end=\"\\r\")\n",
    "\n",
    "        X_train, y_train = X[train_index, :], y[train_index]\n",
    "        X_test, y_test = X[test_index, :], y[test_index]\n",
    "\n",
    "        for m in num_of_leaves:\n",
    "            ### CART with CCP ###\n",
    "\n",
    "            # measure tuning time\n",
    "            wall_start, cpu_start = time.time(), time.process_time()\n",
    "            best_alpha = pick_alpha(X_train, y_train, m, DecisionTreeRegressor)\n",
    "            tuning_wall, tuning_cpu = time.time() - wall_start, time.process_time() - cpu_start\n",
    "\n",
    "            # measure train time\n",
    "            wall_start, cpu_start = time.time(), time.process_time()\n",
    "            mccp = DecisionTreeRegressor(ccp_alpha=best_alpha).fit(X_train, y_train)\n",
    "            train_wall, train_cpu = time.time() - wall_start, time.process_time() - cpu_start\n",
    "\n",
    "            # measure test time\n",
    "            wall_start, cpu_start = time.time(), time.process_time()\n",
    "            y_train_pred_ccp = mccp.predict(X_train)\n",
    "            y_test_pred_ccp = mccp.predict(X_test)\n",
    "            test_wall, test_cpu = time.time() - wall_start, time.process_time() - cpu_start\n",
    "\n",
    "            regression_records.append({\"task\": \"regression\",\n",
    "                                       \"dataset\": task,\n",
    "                                       \"boot_iter\": samp,\n",
    "                                       \"algorithm\": \"CCP\",\n",
    "                                       \"scoring\": \"R2\",\n",
    "                                       \"n_leaves\": leaf_count(mccp),\n",
    "                                       \"max_leaves\": m,\n",
    "                                       \"regularization\": best_alpha,\n",
    "                                       \"train_score\": r2_score(y_train, y_train_pred_ccp),\n",
    "                                       \"test_score\": r2_score(y_test, y_test_pred_ccp),\n",
    "                                       \"train_wall_time\": train_wall,\n",
    "                                       \"test_wall_time\": test_wall,\n",
    "                                       \"train_cpu_time\": train_cpu,\n",
    "                                       \"test_cpu_time\": test_cpu,\n",
    "                                       \"tunning_wall_time\": tuning_wall,\n",
    "                                       \"tunning_cpu_time\": tuning_cpu})\n",
    "\n",
    "            # TODO: GOSDT ###\n",
    "\n",
    "            ### Hierarchical shrinkage (CCP) ###\n",
    "\n",
    "            # measure tuning time\n",
    "            wall_start, cpu_start = time.time(), time.process_time()\n",
    "\n",
    "            cv_scores = {}\n",
    "            for reg_param in reg_hs:\n",
    "                hs_skf = KFold(n_splits=3, shuffle=True)\n",
    "                fold_scores = []\n",
    "                for cv_train_index, cv_val_index in hs_skf.split(X_train):\n",
    "                    # the CV indices are relative to the training split, so they must index\n",
    "                    # X_train / y_train; indexing X / y would pick unrelated (possibly test) rows\n",
    "                    X_cv_train, y_cv_train = X_train[cv_train_index, :], y_train[cv_train_index]\n",
    "                    X_cv_val, y_cv_val = X_train[cv_val_index, :], y_train[cv_val_index]\n",
    "                    cv_alpha = pick_alpha(X_cv_train, y_cv_train, m, DecisionTreeRegressor)\n",
    "                    cv_tree = DecisionTreeRegressor(max_leaf_nodes=m, ccp_alpha=cv_alpha)\n",
    "                    cv_tree.fit(X_cv_train, y_cv_train)\n",
    "                    cv_hs = imodels.HSTreeRegressor(cv_tree, reg_param=reg_param)\n",
    "                    y_val_pred = cv_hs.predict(X_cv_val)\n",
    "                    fold_scores.append(r2_score(y_cv_val, y_val_pred))\n",
    "                cv_scores[reg_param] = np.mean(fold_scores)\n",
    "\n",
    "            # first reg_param (in reg_hs order) that reaches the best mean CV score\n",
    "            hs_reg_param = max(cv_scores, key=cv_scores.get)\n",
    "\n",
    "            tuning_wall, tuning_cpu = time.time() - wall_start, time.process_time() - cpu_start\n",
    "\n",
    "            # evaluation of improvements offered by hierarchical shrinkage model\n",
    "\n",
    "            # measure train time\n",
    "            wall_start, cpu_start = time.time(), time.process_time()\n",
    "            # no explicit fit: the wrapper receives an already fitted tree (presumably shrinkage is\n",
    "            # applied on construction - TODO confirm for the installed imodels version)\n",
    "            mshrunk = imodels.HSTreeRegressor(deepcopy(mccp), reg_param=hs_reg_param)\n",
    "            train_wall, train_cpu = time.time() - wall_start, time.process_time() - cpu_start\n",
    "\n",
    "            # measure test time\n",
    "            wall_start, cpu_start = time.time(), time.process_time()\n",
    "            y_train_pred_shrunk = mshrunk.predict(X_train)\n",
    "            y_test_pred_shrunk = mshrunk.predict(X_test)\n",
    "            test_wall, test_cpu = time.time() - wall_start, time.process_time() - cpu_start\n",
    "\n",
    "            regression_records.append({\"task\": \"regression\",\n",
    "                                       \"dataset\": task,\n",
    "                                       \"boot_iter\": samp,\n",
    "                                       \"algorithm\": \"HS (CART-CCP)\",\n",
    "                                       \"scoring\": \"R2\",\n",
    "                                       \"n_leaves\": leaf_count(mshrunk.estimator_),\n",
    "                                       \"max_leaves\": m,\n",
    "                                       \"regularization\": hs_reg_param,\n",
    "                                       \"train_score\": r2_score(y_train, y_train_pred_shrunk),\n",
    "                                       \"test_score\": r2_score(y_test, y_test_pred_shrunk),\n",
    "                                       \"train_wall_time\": train_wall,\n",
    "                                       \"test_wall_time\": test_wall,\n",
    "                                       \"train_cpu_time\": train_cpu,\n",
    "                                       \"test_cpu_time\": test_cpu,\n",
    "                                       \"tunning_wall_time\": tuning_wall,\n",
    "                                       \"tunning_cpu_time\": tuning_cpu})\n",
    "\n",
    "            # checkpoint: keep the DataFrame and the CSV in sync so partial results survive an interrupted run\n",
    "            regression_results = pd.DataFrame(regression_records, columns=result_columns)\n",
    "            regression_results.to_csv(\"results/claim_1_1_ccp_comparison_regression.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0255f096-d157-467a-b04a-a42dd5e01cdf",
   "metadata": {},
   "source": [
    "# Classification for RF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "211fb87a-5f6a-43bf-a1cc-b31183e7a533",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Datasets used in paper (location in author repo: github.com/Yu-Group/imodels-experiments/config/shrinkage/models.py)\n",
    "# Each entry: (name used as key in `tasks`, dataset id passed to get_clean_dataset, data source)\n",
    "\n",
    "DATASETS_CLASSIFICATION = [\n",
    "    # classification datasets from original random forests paper\n",
    "    # page 9: https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf\n",
    "    (\"heart\", \"heart\", 'imodels'),\n",
    "    (\"breast-cancer\", \"breast_cancer\", 'imodels'),\n",
    "    (\"haberman\", \"haberman\", 'imodels'),\n",
    "    (\"ionosphere\", \"ionosphere\", 'pmlb'),\n",
    "    (\"diabetes\", \"diabetes\", \"pmlb\"),\n",
    "    (\"german-credit\", \"german\", \"pmlb\"),\n",
    "    (\"juvenile\", \"juvenile_clean\", 'imodels'),\n",
    "    (\"recidivism\", \"compas_two_year_clean\", 'imodels')\n",
    "]\n",
    "\n",
    "# the authors' datasets seem to be preprocessed already, so they are loaded as they are;\n",
    "# every dataset becomes one DataFrame holding the features plus a \"label\" column\n",
    "tasks = {}\n",
    "\n",
    "for task in DATASETS_CLASSIFICATION:\n",
    "    task_name, dataset_id, data_source = task\n",
    "    X, y, feature_names = get_clean_dataset(dataset_id, data_source=data_source)\n",
    "    df = pd.DataFrame(X, columns=feature_names).assign(label=y)\n",
    "    tasks[task_name] = df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d125095-9621-4477-87bc-706619469106",
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataframe to save performance of models\n",
    "classification_results = pd.DataFrame(columns = [\"task\", \"dataset\", \"boot_iter\", \"algorithm\", \"scoring\", \"n_trees\", \"regularization\", \"train_score\", \"test_score\", \\\n",
    "                                                 \"train_wall_time\", \"test_wall_time\", \"train_cpu_time\", \"test_cpu_time\", \"tunning_wall_time\", \"tunning_cpu_time\"])\n",
    "\n",
    "# numbers of trees used in paper\n",
    "num_of_trees = [10, 25, 50, 75, 100, 300, 500]\n",
    "\n",
    "# regularization parameter\n",
    "reg_hs = [0.1, 1.0, 10.0, 25.0, 50.0, 100.0]\n",
    "\n",
    "# number of times to repeat evaluations with random splits (5 repeats with 3-fold cross-validation = 15 repeats)\n",
    "NUM_OF_BOOTSTRAP_SAMPS = 10\n",
    "\n",
    "# for each dataset that was used in paper\n",
    "for task in DATASETS_CLASSIFICATION:\n",
    "    # repeat NUM_OF_BOOTSTRAP_SAMPS\n",
    "    for samp in range(NUM_OF_BOOTSTRAP_SAMPS):\n",
    "        # use stratified splitting (we tried both stratified and un-stratified => no significant differences)\n",
    "        skf = StratifiedKFold(n_splits=3, shuffle=True)\n",
    "         \n",
    "        X, y = np.array(tasks[task[0]].drop(\"label\", axis = 1)), np.array(tasks[task[0]][\"label\"])\n",
    "        \n",
    "        # cross-validation loop\n",
    "        for i, (train_index, test_index) in enumerate(skf.split(tasks[task[0]], tasks[task[0]][\"label\"])):\n",
    "            print(f\"Dataset: {task[0]}, Sample: {samp}, Fold {i}\", end = \"\\r\")\n",
    "\n",
    "            X_train, y_train = X[train_index, :], y[train_index]\n",
    "            X_test, y_test = X[test_index, :], y[test_index]\n",
    "\n",
    "            # for each tree (as deduced from fig. 4D)\n",
    "            for m in num_of_trees:\n",
    "                \n",
    "                ### Random Forest (RF) ###\n",
    "                \n",
    "                # measure train time\n",
    "                start_wall_time_train = time.time()\n",
    "                start_cpu_time_train = time.process_time()\n",
    "                \n",
    "                rf = RandomForestClassifier(n_estimators=m, max_features = \"sqrt\").fit(X_train, y_train)\n",
    "                \n",
    "                end_wall_time_train = time.time()\n",
    "                end_cpu_time_train = time.process_time()\n",
    "                \n",
    "                # measure test time\n",
    "                start_wall_time_test = time.time()\n",
    "                start_cpu_time_test = time.process_time()\n",
    "                \n",
    "                y_train_pred_rf = rf.predict_proba(X_train)[:, 1]\n",
    "                y_test_pred_rf = rf.predict_proba(X_test)[:, 1]\n",
    "                \n",
    "                end_wall_time_test = time.time()\n",
    "                end_cpu_time_test = time.process_time()\n",
    "\n",
    "                classification_results = pd.concat([classification_results, pd.DataFrame({\"task\": [\"classification\"], \n",
    "                                                                            \"dataset\": [task[0]],\n",
    "                                                                            \"boot_iter\": [samp],\n",
    "                                                                            \"algorithm\": [\"RF\"],\n",
    "                                                                            \"scoring\": [\"AUC\"],\n",
    "                                                                            \"n_trees\": [m],\n",
    "                                                                            \"regularization\": [\"None\"],\n",
    "                                                                            \"train_score\": [roc_auc_score(y_train, y_train_pred_rf)],\n",
    "                                                                            \"test_score\": [roc_auc_score(y_test, y_test_pred_rf)],\n",
    "                                                                            \"train_wall_time\": [end_wall_time_train - start_wall_time_train],\n",
    "                                                                            \"test_wall_time\": [end_wall_time_test - start_wall_time_test],\n",
    "                                                                            \"train_cpu_time\": [end_cpu_time_train - start_cpu_time_train],\n",
    "                                                                            \"test_cpu_time\": [end_cpu_time_test - start_cpu_time_test],\n",
    "                                                                            \"tunning_wall_time\": [None], \n",
    "                                                                            \"tunning_cpu_time\": [None]})])\n",
    "                \n",
    "                ### RF-CV (max_features (mtry)) ###                \n",
    "                \n",
    "                # tunning function to use in gp_minimize\n",
    "                def rf_mtry(mtry):\n",
    "                    rf_mtry = RandomForestClassifier(n_estimators=m, max_features = mtry[0])\n",
    "                    roc_spec = make_scorer(roc_auc_score, needs_proba=True)\n",
    "                    scores = cross_val_score(rf_mtry, X_train, y_train, cv=3, scoring = roc_spec)\n",
    "                    return -np.mean(scores)\n",
    "                \n",
    "                # measure tunning time\n",
    "                start_wall_time_tunning = time.time()\n",
    "                start_cpu_time_tunning = time.process_time()\n",
    "                \n",
    "                mtry_best = gp_minimize(rf_mtry,\n",
    "                            [(0.1, 1.0)],\n",
    "                            acq_func=\"EI\",\n",
    "                            n_calls = 15,\n",
    "                            n_initial_points = 5,\n",
    "                            noise = 0.1**2).x[0]\n",
    "                \n",
    "                end_wall_time_tunning = time.time()\n",
    "                end_cpu_time_tunning = time.process_time()\n",
    "                \n",
    "                # measure train time\n",
    "                start_wall_time_train = time.time()\n",
    "                start_cpu_time_train = time.process_time()\n",
    "                \n",
    "                rf_mtry = RandomForestClassifier(n_estimators=m, max_features = mtry_best).fit(X_train, y_train)\n",
    "                \n",
    "                end_wall_time_train = time.time()\n",
    "                end_cpu_time_train = time.process_time()\n",
    "                \n",
    "                # measure test time\n",
    "                start_wall_time_test = time.time()\n",
    "                start_cpu_time_test = time.process_time()\n",
    "                \n",
    "                y_train_pred_rf_mtry = rf_mtry.predict_proba(X_train)[:, 1]\n",
    "                y_test_pred_rf_mtry = rf_mtry.predict_proba(X_test)[:, 1]\n",
    "                \n",
    "                end_wall_time_test = time.time()\n",
    "                end_cpu_time_test = time.process_time()\n",
    "\n",
    "                classification_results = pd.concat([classification_results, pd.DataFrame({\"task\": [\"classification\"], \n",
    "                                                                            \"dataset\": [task[0]],\n",
    "                                                                            \"boot_iter\": [samp],\n",
    "                                                                            \"algorithm\": [\"RF-MTRY\"],\n",
    "                                                                            \"scoring\": [\"AUC\"],\n",
    "                                                                            \"n_trees\": [m],\n",
    "                                                                            \"regularization\": [rf_mtry], # we store best mtry parameter in regularization\n",
    "                                                                            \"train_score\": [roc_auc_score(y_train, y_train_pred_rf_mtry)],\n",
    "                                                                            \"test_score\": [roc_auc_score(y_test, y_test_pred_rf_mtry)],\n",
    "                                                                            \"train_wall_time\": [end_wall_time_train - start_wall_time_train],\n",
    "                                                                            \"test_wall_time\": [end_wall_time_test - start_wall_time_test],\n",
    "                                                                            \"train_cpu_time\": [end_cpu_time_train - start_cpu_time_train],\n",
    "                                                                            \"test_cpu_time\": [end_cpu_time_test - start_cpu_time_test],\n",
    "                                                                            \"tunning_wall_time\": [end_wall_time_tunning - start_wall_time_tunning], \n",
    "                                                                            \"tunning_cpu_time\": [end_cpu_time_tunning - start_cpu_time_tunning]})])\n",
    "                \n",
    "                ### RF-CV (max_depth (depth)) ###\n",
    "                def rf_depth(depth):\n",
    "                    rf_depth = RandomForestClassifier(n_estimators=m, max_depth = int(np.round(depth[0])))\n",
    "                    roc_spec = make_scorer(roc_auc_score, needs_proba=True)\n",
    "                    scores = cross_val_score(rf_depth, X_train, y_train, cv=3, scoring = roc_spec)\n",
    "                    return -np.mean(scores)\n",
    "                \n",
    "                # measure tunning time\n",
    "                start_wall_time_tunning = time.time()\n",
    "                start_cpu_time_tunning = time.process_time()\n",
    "                \n",
    "                depth_best = int(np.round(gp_minimize(rf_depth,\n",
    "                            [(1.0, 30.0)],\n",
    "                            acq_func=\"EI\",\n",
    "                            n_calls = 15,\n",
    "                            n_initial_points = 5,\n",
    "                            noise = 0.1**2).x[0]))\n",
    "                \n",
    "                end_wall_time_tunning = time.time()\n",
    "                end_cpu_time_tunning = time.process_time()\n",
    "                \n",
    "                # measure train time\n",
    "                start_wall_time_train = time.time()\n",
    "                start_cpu_time_train = time.process_time()\n",
    "                \n",
    "                rf_depth = RandomForestClassifier(n_estimators=m, max_depth = depth_best).fit(X_train, y_train)\n",
    "                \n",
    "                end_wall_time_train = time.time()\n",
    "                end_cpu_time_train = time.process_time()\n",
    "                \n",
    "                # measure test time\n",
    "                start_wall_time_test = time.time()\n",
    "                start_cpu_time_test = time.process_time()\n",
    "                \n",
    "                y_train_pred_rf_depth = rf_depth.predict_proba(X_train)[:, 1]                \n",
    "                y_test_pred_rf_depth = rf_depth.predict_proba(X_test)[:, 1]\n",
    "                \n",
    "                end_wall_time_test = time.time()\n",
    "                end_cpu_time_test = time.process_time()\n",
    "\n",
    "                classification_results = pd.concat([classification_results, pd.DataFrame({\"task\": [\"classification\"], \n",
    "                                                                            \"dataset\": [task[0]],\n",
    "                                                                            \"boot_iter\": [samp],\n",
    "                                                                            \"algorithm\": [\"RF-DEPTH\"],\n",
    "                                                                            \"scoring\": [\"AUC\"],\n",
    "                                                                            \"n_trees\": [m],\n",
    "                                                                            \"regularization\": [rf_depth], # we store best depth parameter in regularization\n",
    "                                                                            \"train_score\": [roc_auc_score(y_train, y_train_pred_rf_depth)],\n",
    "                                                                            \"test_score\": [roc_auc_score(y_test, y_test_pred_rf_depth)],\n",
    "                                                                            \"train_wall_time\": [end_wall_time_train - start_wall_time_train],\n",
    "                                                                            \"test_wall_time\": [end_wall_time_test - start_wall_time_test],\n",
    "                                                                            \"train_cpu_time\": [end_cpu_time_train - start_cpu_time_train],\n",
    "                                                                            \"test_cpu_time\": [end_cpu_time_test - start_cpu_time_test],\n",
    "                                                                            \"tunning_wall_time\": [end_wall_time_tunning - start_wall_time_tunning], \n",
    "                                                                            \"tunning_cpu_time\": [end_cpu_time_tunning - start_cpu_time_tunning]})])\n",
    "                \n",
    "                ### HS-RF (hierarchical shrinkage) ###\n",
    "                \n",
    "                # measure tunning time\n",
    "                start_wall_time_tunning = time.time()\n",
    "                start_cpu_time_tunning = time.process_time()\n",
    "                \n",
    "                roc_spec = make_scorer(roc_auc_score)\n",
    "                rf = RandomForestClassifier(n_estimators=m, max_features = \"sqrt\")\n",
    "                hs_rf_cv = HSTreeClassifierCV(estimator_=rf, reg_param_list = reg_hs, cv = 3, scoring = roc_spec)\n",
    "                hs_rf_cv.fit(X_train, y_train)\n",
    "                \n",
    "                best_hs_reg = hs_rf_cv.reg_param\n",
    "                \n",
    "                end_wall_time_tunning = time.time()\n",
    "                end_cpu_time_tunning = time.process_time()\n",
    "                \n",
    "                # measure train time\n",
    "                start_wall_time_train = time.time()\n",
    "                start_cpu_time_train = time.process_time()\n",
    "                \n",
    "                rf = RandomForestClassifier(n_estimators=m, max_features = \"sqrt\")\n",
    "                hs_rf = HSTreeClassifier(estimator_= rf, reg_param = best_hs_reg) \n",
    "                hs_rf.fit(X_train, y_train)\n",
    "                \n",
    "                end_wall_time_train = time.time()\n",
    "                end_cpu_time_train = time.process_time()\n",
    "                \n",
    "                # measure test time\n",
    "                start_wall_time_test = time.time()\n",
    "                start_cpu_time_test = time.process_time()\n",
    "                \n",
    "                y_train_pred_hs_rf = hs_rf.predict_proba(X_train)[:, 1]\n",
    "                y_test_pred_hs_rf = hs_rf.predict_proba(X_test)[:, 1]\n",
    "                \n",
    "                end_wall_time_test = time.time()\n",
    "                end_cpu_time_test = time.process_time()\n",
    "\n",
    "                classification_results = pd.concat([classification_results, pd.DataFrame({\"task\": [\"classification\"], \n",
    "                                                                            \"dataset\": [task[0]],\n",
    "                                                                            \"boot_iter\": [samp],\n",
    "                                                                            \"algorithm\": [\"HS-RF\"],\n",
    "                                                                            \"scoring\": [\"AUC\"],\n",
    "                                                                            \"n_trees\": [m],\n",
    "                                                                            \"regularization\": [best_hs_reg], # HS regression parameter (lambda)\n",
    "                                                                            \"train_score\": [roc_auc_score(y_train, y_train_pred_hs_rf)],\n",
    "                                                                            \"test_score\": [roc_auc_score(y_test, y_test_pred_hs_rf)],\n",
    "                                                                            \"train_wall_time\": [end_wall_time_train - start_wall_time_train],\n",
    "                                                                            \"test_wall_time\": [end_wall_time_test - start_wall_time_test],\n",
    "                                                                            \"train_cpu_time\": [end_cpu_time_train - start_cpu_time_train],\n",
    "                                                                            \"test_cpu_time\": [end_cpu_time_test - start_cpu_time_test],\n",
    "                                                                            \"tunning_wall_time\": [end_wall_time_tunning - start_wall_time_tunning], \n",
    "                                                                            \"tunning_cpu_time\": [end_cpu_time_tunning - start_cpu_time_tunning]})])\n",
    "\n",
    "                classification_results.to_csv(\"results/rf_comparison_classification.csv\", index = False)\n",
    "\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6ccb317-0ff7-467c-bd3b-72f25e66bd8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataframe to save performance of models\n",
    "classification_results = pd.DataFrame(columns = [\"task\", \"dataset\", \"boot_iter\", \"algorithm\", \"scoring\", \"n_trees\", \"regularization\", \"train_score\", \"test_score\", \\\n",
    "                                                 \"train_wall_time\", \"test_wall_time\", \"train_cpu_time\", \"test_cpu_time\", \"tunning_wall_time\", \"tunning_cpu_time\"])\n",
    "\n",
    "# number of leafs used in paper\n",
    "num_of_trees = [10, 25, 50, 75, 100, 300, 500]\n",
    "\n",
    "# number of times to repeat evaluations with random splits (5 repeats with 3-fold cross-validation = 15 repeats)\n",
    "NUM_OF_BOOTSTRAP_SAMPS = 10\n",
    "\n",
    "# for each dataset that was used in paper\n",
    "for task in DATASETS_CLASSIFICATION:\n",
    "    # repeat NUM_OF_BOOTSTRAP_SAMPS\n",
    "    for samp in range(NUM_OF_BOOTSTRAP_SAMPS):\n",
    "        # use statified splitting (we tried both stratified and un-stratified => no significant differences)\n",
    "        skf = StratifiedKFold(n_splits=3, shuffle=True)\n",
    "         \n",
    "        X, y = np.array(tasks[task[0]].drop(\"label\", axis = 1)), np.array(tasks[task[0]][\"label\"])\n",
    "        \n",
    "        # cross-validation loop\n",
    "        for i, (train_index, test_index) in enumerate(skf.split(tasks[task[0]], tasks[task[0]][\"label\"])):\n",
    "            print(f\"Dataset: {task[0]}, Sample: {samp}, Fold {i}\", end = \"\\r\")\n",
    "\n",
    "            X_train, y_train = X[train_index, :], y[train_index]\n",
    "            X_test, y_test = X[test_index, :], y[test_index]\n",
    "\n",
    "            # for each tree (as deduced from fig. 4D)\n",
    "            for m in num_of_trees:\n",
    "                                \n",
    "                ### BART (doesn't use tunning - takes to long/performs the best anyway) ###\n",
    "                \n",
    "                # measure train time\n",
    "                start_wall_time_train = time.time()\n",
    "                start_cpu_time_train = time.process_time()\n",
    "\n",
    "                bart = SklearnModel(n_trees = m);\n",
    "                bart.fit(X_train, y_train);\n",
    "\n",
    "                end_wall_time_train = time.time()\n",
    "                end_cpu_time_train = time.process_time()\n",
    "                \n",
    "                # measure test time\n",
    "                start_wall_time_test = time.time()\n",
    "                start_cpu_time_test = time.process_time()\n",
    "\n",
    "                bart_train_pred = np.round(bart.predict(X_train));\n",
    "                bart_test_pred = np.round(bart.predict(X_test));\n",
    "\n",
    "                end_wall_time_test = time.time()\n",
    "                end_cpu_time_test = time.process_time()\n",
    "\n",
    "                print(roc_auc_score(y_test, bart_test_pred))\n",
    "\n",
    "                classification_results = pd.concat([classification_results, pd.DataFrame({\"task\": [\"classification\"], \n",
    "                                                                                \"dataset\": [task[0]],\n",
    "                                                                                \"boot_iter\": [samp],\n",
    "                                                                                \"algorithm\": [\"BART\"],\n",
    "                                                                                \"scoring\": [\"AUC\"],\n",
    "                                                                                \"n_trees\": [m],\n",
    "                                                                                \"regularization\": [\"None\"],\n",
    "                                                                                \"train_score\": [roc_auc_score(y_train, bart_train_pred)],\n",
    "                                                                                \"test_score\": [roc_auc_score(y_test, bart_test_pred)],\n",
    "                                                                                \"train_wall_time\": [end_wall_time_train - start_wall_time_train],\n",
    "                                                                                \"test_wall_time\": [end_wall_time_test - start_wall_time_test],\n",
    "                                                                                \"train_cpu_time\": [end_cpu_time_train - start_cpu_time_train],\n",
    "                                                                                \"test_cpu_time\": [end_cpu_time_test - start_cpu_time_test],\n",
    "                                                                            \"tunning_wall_time\": [None], \n",
    "                                                                            \"tunning_cpu_time\": [None]})])\n",
    "\n",
    "                classification_results.to_csv(\"results/rf_bart_classification.csv\")\n",
    "\n",
    "            break"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mlds",
   "language": "python",
   "name": "mlds"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
