{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "cfba4cba-031f-4e17-a759-e842c1692c26",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.spatial.distance import cdist\n",
    "import random\n",
    "from sklearn.cluster import KMeans\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "from icfesl import *\n",
    "from utility_functions import *\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from sklearn.metrics import roc_auc_score, accuracy_score\n",
    "from xgboost import XGBClassifier\n",
    "from pytorch_tabnet.tab_model import TabNetClassifier\n",
    "import time    \n",
    "from sklearn.model_selection import train_test_split\n",
    "from catboost import CatBoostClassifier, Pool\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "506ea2ca-8d50-4f17-9eee-5dfcb4c4044f",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"../../../writing/UCI datasets/mushroom/agaricus-lepiota.data\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e23be722-daf7-4b34-bee3-024fa990fe71",
   "metadata": {},
   "source": [
    "## Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ac7bac62-f13f-4ff2-afd1-f3ec2fbca50d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#convert target to binary\n",
    "y = data['p'].apply(lambda x: 1 if x == 'p' else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b6c44cf9-ae25-4a92-bc93-dc127b354046",
   "metadata": {},
   "outputs": [],
   "source": [
    "#drop this because it causes complete separation\n",
    "data = data.drop('p.1', axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8399b1fe-2ff3-4712-8ecb-8eb047ae6fe4",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_vars = data.columns.tolist()\n",
    "cat_vars.remove('p')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7b58fba0-c9e9-4359-a9ee-6b4c413c84a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "encoder = LabelEncoder()\n",
    "for var in cat_vars:\n",
    "    data[var] = encoder.fit_transform(data[var])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "2acf1e79-bedf-48de-bcd5-743ac9dc8202",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_vars = data.columns.tolist()\n",
    "cat_vars.remove('p')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "291672fa-dabd-4273-af1f-fbe5951c8778",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = data[cat_vars]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7d98142d-b77c-44f7-9bc5-437444581271",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ba11f53f-8a1a-43d6-bf7e-2e98161d789b",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = X_train.reset_index(drop=True)\n",
    "X_test = X_test.reset_index(drop=True)\n",
    "y_train = y_train.reset_index(drop=True)\n",
    "y_test = y_test.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "9cd1b154-2c5c-436e-b9ac-270fa08230aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "X2, encoder = icfesl.f_get_dummies(X_train, cat_vars)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "49c5a38e-ebc1-417e-a46e-0bad6c004271",
   "metadata": {},
   "outputs": [],
   "source": [
    "X2_test = icfesl.f_get_dummies(X_test, cat_vars, encoder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "ddb7ecc3-4148-4cd4-a13a-4523c8411be1",
   "metadata": {},
   "outputs": [],
   "source": [
    "selector = VarianceThreshold(threshold=np.mean(y_train)*0.1)\n",
    "\n",
    "selector.fit(X2)\n",
    "\n",
    "selected_features_mask = selector.get_support()\n",
    "\n",
    "selected_column_names = X2.columns[selected_features_mask]\n",
    "\n",
    "X2 = X2[selected_column_names]\n",
    "\n",
    "X2_test = X2_test[selected_column_names]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "b581d02a-5aa5-4d4a-8dec-33523662accb",
   "metadata": {},
   "outputs": [],
   "source": [
    "for c in X2.columns.tolist():\n",
    "    X2[c] = X2[c].astype('int')\n",
    "    X2_test[c] = X2_test[c].astype('int')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "148fa316-a054-4e1c-b7c7-1ed143cb0ca6",
   "metadata": {},
   "source": [
    "### CatBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "0012472d-d357-4546-be63-6eb731983590",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostClassifier(\n",
    "    iterations=100,  \n",
    "    loss_function='Logloss', custom_metric=['AUC'],\n",
    "    random_seed=42,  \n",
    "    verbose=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "50d6a715-a584-4556-b07f-e2ddb772b577",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.3072\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X_train, y_train, cat_features=cat_vars)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "75abb71c-cb18-4a44-960d-476ba7cb2daa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5686, 21)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "d111bfd0-6fba-4c89-8cbb-64532f31b36b",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_train)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "e9fa54dc-55f3-4fe9-9315-6d0032e61179",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC: 1.0\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "4f826dd2-182d-4b59-9ede-d895c91a981d",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "cc0a49ce-a852-4055-aba6-fbf9bd7996fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC: 1.0\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "82fb0a5a-7377-49fa-ac1a-8e88f74c8bca",
   "metadata": {},
   "source": [
    "#### TabNet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "9309d696-4d0f-4f2a-91b0-73216f27f70b",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_idxs = [X_train.columns.get_loc(col) for col in cat_vars]\n",
    "cat_dims = [X_train[col].nunique() for col in cat_vars]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "644618b1-aad7-4627-8720-97ee289dbb02",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = TabNetClassifier(verbose=0, seed=200, cat_idxs=cat_idxs, cat_dims=cat_dims)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "7fe0f9d3-793e-43dc-a1c8-2b486c371c94",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Early stopping occurred at epoch 37 with best_epoch = 27 and best_train_auc = 1.0\n",
      "3.6352\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.13/site-packages/pytorch_tabnet/callbacks.py:172: UserWarning: Best weights from best epoch are automatically used!\n",
      "  warnings.warn(wrn_msg)\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X_train=X_train.values, y_train=y_train, eval_set=[(X_train.values, y_train)], eval_name=['train'], max_epochs=50)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "36f7d83b-786b-4db7-8b4a-4fd0a4162e25",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_train.to_numpy())[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "1ab4984f-9258-4549-84a4-017ddfbe92e5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC:1.0\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training AUROC:{auc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "f555c895-1fcf-437b-a700-c461360324c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_test.to_numpy())[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "92c82fe9-de20-4471-96cf-73bd66951807",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC:0.9999979752467162\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing AUROC:{auc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9f929208-f612-4202-977d-b6a88baed894",
   "metadata": {},
   "source": [
    "## One hot encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17f79be3-a6cb-4d41-b2f0-172fe692c3a7",
   "metadata": {},
   "source": [
    "#### 1.logit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "2b99da68-82a5-4931-a75f-53898aae1c62",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.3269\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model = sm.GLM(y_train, sm.add_constant(X2, has_constant='skip'), family=sm.families.Binomial()).fit(disp=False)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "da4d0889-cf6a-4003-a99f-4a63e88fccc4",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:49; R2: 0.7413685080509469\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.pseudo_rsquared()}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "5c9fc0df-c85a-4d14-88c3-aecc79822598",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X2, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "9809291b-f3d3-47f3-a7ab-04c5eb7da152",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc:0.9998804860021242\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f'training auroc:{auroc}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "69aec418-9c93-4429-a961-1e55d3e02873",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc:0.9996146219583145\n"
     ]
    }
   ],
   "source": [
    "y_pred = model.predict(sm.add_constant(X2_test, has_constant='skip'))\n",
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f'testing auroc:{auroc}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aa859d1b-699f-4077-bebf-b8d34b38c41a",
   "metadata": {},
   "source": [
    "### 2.xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "f76a3651-11d4-4d72-8911-faa35318250d",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBClassifier(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "896c8c9d-e91f-4c36-acc8-b5e9dfa0238d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.1655\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X2, y_train)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "e4c61003-3992-421c-881f-6073fd4c252e",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X2)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "de5209cc-de1e-4c6d-a82c-5b06e19b2a82",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 1.0\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "13cfb8af-8423-4d3f-a656-ddc37c7d9344",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X2_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "ca92dfbb-8904-4898-b1fc-75069cc23a1d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 1.0\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a4705c61-e163-4ed1-a806-acd6976f3513",
   "metadata": {},
   "source": [
    "## Target encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "82f52507-8a63-4727-be31-b7cd1d4bccb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import TargetEncoder\n",
    "enc_auto = TargetEncoder(smooth='auto')\n",
    "X_t = enc_auto.fit_transform(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "9697af0f-a2ce-4425-b896-f08a281d2659",
   "metadata": {},
   "outputs": [],
   "source": [
    "output_feature_names = enc_auto.get_feature_names_out()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "9bb5297e-be8b-44b3-b444-00d07870a4e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_tdf = pd.DataFrame(X_t, columns=list(output_feature_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "e8c0da87-b096-40dd-888d-3fe9634fd3c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_t_test = enc_auto.transform(X_test)\n",
    "X_tdf_test = pd.DataFrame(X_t_test, columns=list(output_feature_names))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ce9a5e01-e409-4e77-8892-704c47ceb6a9",
   "metadata": {},
   "source": [
    "#### 1. logit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "e32f45b9-33ec-48b7-90c4-bb4e552ce76b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0354\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model = sm.GLM(y_train, X_tdf, family=sm.families.Binomial()).fit(disp=False)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "2adc63b8-6516-4239-ad4c-bab762acc4d5",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:20; R2: 0.6905462711203648\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.pseudo_rsquared()}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "174a8184-63bd-4d22-8e5e-6d35e0b04fea",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "8ec91532-0c7e-491b-bad2-a47482589e0d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training auroc: 0.9922229207289041\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f'Training auroc: {auroc}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "8ebf2f4b-0c9a-4ff4-8e8f-8d0a49edebf3",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_t_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "0f332da2-e349-4763-86f6-d636cb574242",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Testing auroc: 0.993771183981232\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f'Testing auroc: {auroc}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b8ed15c7-8e18-4d79-b981-06fabdb5086f",
   "metadata": {},
   "source": [
    "#### 2.xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "8efc32cc-9e4b-49c4-a408-3039200574d2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0001\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model = XGBClassifier(n_estimators=100, random_state=200)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "adc7969f-f09a-445e-a359-3f4034240557",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-1 {\n",
       "  /* Definition of color scheme common for light and dark mode */\n",
       "  --sklearn-color-text: #000;\n",
       "  --sklearn-color-text-muted: #666;\n",
       "  --sklearn-color-line: gray;\n",
       "  /* Definition of color scheme for unfitted estimators */\n",
       "  --sklearn-color-unfitted-level-0: #fff5e6;\n",
       "  --sklearn-color-unfitted-level-1: #f6e4d2;\n",
       "  --sklearn-color-unfitted-level-2: #ffe0b3;\n",
       "  --sklearn-color-unfitted-level-3: chocolate;\n",
       "  /* Definition of color scheme for fitted estimators */\n",
       "  --sklearn-color-fitted-level-0: #f0f8ff;\n",
       "  --sklearn-color-fitted-level-1: #d4ebff;\n",
       "  --sklearn-color-fitted-level-2: #b3dbfd;\n",
       "  --sklearn-color-fitted-level-3: cornflowerblue;\n",
       "\n",
       "  /* Specific color for light theme */\n",
       "  --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
       "  --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
       "  --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
       "  --sklearn-color-icon: #696969;\n",
       "\n",
       "  @media (prefers-color-scheme: dark) {\n",
       "    /* Redefinition of color scheme for dark theme */\n",
       "    --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
       "    --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
       "    --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
       "    --sklearn-color-icon: #878787;\n",
       "  }\n",
       "}\n",
       "\n",
       "#sk-container-id-1 {\n",
       "  color: var(--sklearn-color-text);\n",
       "}\n",
       "\n",
       "#sk-container-id-1 pre {\n",
       "  padding: 0;\n",
       "}\n",
       "\n",
       "#sk-container-id-1 input.sk-hidden--visually {\n",
       "  border: 0;\n",
       "  clip: rect(1px 1px 1px 1px);\n",
       "  clip: rect(1px, 1px, 1px, 1px);\n",
       "  height: 1px;\n",
       "  margin: -1px;\n",
       "  overflow: hidden;\n",
       "  padding: 0;\n",
       "  position: absolute;\n",
       "  width: 1px;\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-dashed-wrapped {\n",
       "  border: 1px dashed var(--sklearn-color-line);\n",
       "  margin: 0 0.4em 0.5em 0.4em;\n",
       "  box-sizing: border-box;\n",
       "  padding-bottom: 0.4em;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-container {\n",
       "  /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
       "     but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
       "     so we also need the `!important` here to be able to override the\n",
       "     default hidden behavior on the sphinx rendered scikit-learn.org.\n",
       "     See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
       "  display: inline-block !important;\n",
       "  position: relative;\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-text-repr-fallback {\n",
       "  display: none;\n",
       "}\n",
       "\n",
       "div.sk-parallel-item,\n",
       "div.sk-serial,\n",
       "div.sk-item {\n",
       "  /* draw centered vertical line to link estimators */\n",
       "  background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
       "  background-size: 2px 100%;\n",
       "  background-repeat: no-repeat;\n",
       "  background-position: center center;\n",
       "}\n",
       "\n",
       "/* Parallel-specific style estimator block */\n",
       "\n",
       "#sk-container-id-1 div.sk-parallel-item::after {\n",
       "  content: \"\";\n",
       "  width: 100%;\n",
       "  border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
       "  flex-grow: 1;\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-parallel {\n",
       "  display: flex;\n",
       "  align-items: stretch;\n",
       "  justify-content: center;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "  position: relative;\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-parallel-item {\n",
       "  display: flex;\n",
       "  flex-direction: column;\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
       "  align-self: flex-end;\n",
       "  width: 50%;\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
       "  align-self: flex-start;\n",
       "  width: 50%;\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
       "  width: 0;\n",
       "}\n",
       "\n",
       "/* Serial-specific style estimator block */\n",
       "\n",
       "#sk-container-id-1 div.sk-serial {\n",
       "  display: flex;\n",
       "  flex-direction: column;\n",
       "  align-items: center;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "  padding-right: 1em;\n",
       "  padding-left: 1em;\n",
       "}\n",
       "\n",
       "\n",
       "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
       "clickable and can be expanded/collapsed.\n",
       "- Pipeline and ColumnTransformer use this feature and define the default style\n",
       "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
       "*/\n",
       "\n",
       "/* Pipeline and ColumnTransformer style (default) */\n",
       "\n",
       "#sk-container-id-1 div.sk-toggleable {\n",
       "  /* Default theme specific background. It is overwritten whether we have a\n",
       "  specific estimator or a Pipeline/ColumnTransformer */\n",
       "  background-color: var(--sklearn-color-background);\n",
       "}\n",
       "\n",
       "/* Toggleable label */\n",
       "#sk-container-id-1 label.sk-toggleable__label {\n",
       "  cursor: pointer;\n",
       "  display: flex;\n",
       "  width: 100%;\n",
       "  margin-bottom: 0;\n",
       "  padding: 0.5em;\n",
       "  box-sizing: border-box;\n",
       "  text-align: center;\n",
       "  align-items: start;\n",
       "  justify-content: space-between;\n",
       "  gap: 0.5em;\n",
       "}\n",
       "\n",
       "#sk-container-id-1 label.sk-toggleable__label .caption {\n",
       "  font-size: 0.6rem;\n",
       "  font-weight: lighter;\n",
       "  color: var(--sklearn-color-text-muted);\n",
       "}\n",
       "\n",
       "#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
       "  /* Arrow on the left of the label */\n",
       "  content: \"▸\";\n",
       "  float: left;\n",
       "  margin-right: 0.25em;\n",
       "  color: var(--sklearn-color-icon);\n",
       "}\n",
       "\n",
       "#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
       "  color: var(--sklearn-color-text);\n",
       "}\n",
       "\n",
       "/* Toggleable content - dropdown */\n",
       "\n",
       "#sk-container-id-1 div.sk-toggleable__content {\n",
       "  max-height: 0;\n",
       "  max-width: 0;\n",
       "  overflow: hidden;\n",
       "  text-align: left;\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-toggleable__content pre {\n",
       "  margin: 0.2em;\n",
       "  border-radius: 0.25em;\n",
       "  color: var(--sklearn-color-text);\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
       "  /* Expand drop-down */\n",
       "  max-height: 200px;\n",
       "  max-width: 100%;\n",
       "  overflow: auto;\n",
       "}\n",
       "\n",
       "#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
       "  content: \"▾\";\n",
       "}\n",
       "\n",
       "/* Pipeline/ColumnTransformer-specific style */\n",
       "\n",
       "#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  color: var(--sklearn-color-text);\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "/* Estimator-specific style */\n",
       "\n",
       "/* Colorize estimator box */\n",
       "#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
       "#sk-container-id-1 div.sk-label label {\n",
       "  /* The background is the default theme color */\n",
       "  color: var(--sklearn-color-text-on-default-background);\n",
       "}\n",
       "\n",
       "/* On hover, darken the color of the background */\n",
       "#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
       "  color: var(--sklearn-color-text);\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
       "/* Label box, darken color on hover, fitted */\n",
       "#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
       "  color: var(--sklearn-color-text);\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "/* Estimator label */\n",
       "\n",
       "#sk-container-id-1 div.sk-label label {\n",
       "  font-family: monospace;\n",
       "  font-weight: bold;\n",
       "  display: inline-block;\n",
       "  line-height: 1.2em;\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-label-container {\n",
       "  text-align: center;\n",
       "}\n",
       "\n",
       "/* Estimator-specific */\n",
       "#sk-container-id-1 div.sk-estimator {\n",
       "  font-family: monospace;\n",
       "  border: 1px dotted var(--sklearn-color-border-box);\n",
       "  border-radius: 0.25em;\n",
       "  box-sizing: border-box;\n",
       "  margin-bottom: 0.5em;\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-estimator.fitted {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-0);\n",
       "}\n",
       "\n",
       "/* on hover */\n",
       "#sk-container-id-1 div.sk-estimator:hover {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
       "#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
       "\n",
       "/* Common style for \"i\" and \"?\" */\n",
       "\n",
       ".sk-estimator-doc-link,\n",
       "a:link.sk-estimator-doc-link,\n",
       "a:visited.sk-estimator-doc-link {\n",
       "  float: right;\n",
       "  font-size: smaller;\n",
       "  line-height: 1em;\n",
       "  font-family: monospace;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "  border-radius: 1em;\n",
       "  height: 1em;\n",
       "  width: 1em;\n",
       "  text-decoration: none !important;\n",
       "  margin-left: 0.5em;\n",
       "  text-align: center;\n",
       "  /* unfitted */\n",
       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
       "  color: var(--sklearn-color-unfitted-level-1);\n",
       "}\n",
       "\n",
       ".sk-estimator-doc-link.fitted,\n",
       "a:link.sk-estimator-doc-link.fitted,\n",
       "a:visited.sk-estimator-doc-link.fitted {\n",
       "  /* fitted */\n",
       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
       "  color: var(--sklearn-color-fitted-level-1);\n",
       "}\n",
       "\n",
       "/* On hover */\n",
       "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
       ".sk-estimator-doc-link:hover,\n",
       "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
       ".sk-estimator-doc-link:hover {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
       "  color: var(--sklearn-color-background);\n",
       "  text-decoration: none;\n",
       "}\n",
       "\n",
       "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
       ".sk-estimator-doc-link.fitted:hover,\n",
       "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
       ".sk-estimator-doc-link.fitted:hover {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-3);\n",
       "  color: var(--sklearn-color-background);\n",
       "  text-decoration: none;\n",
       "}\n",
       "\n",
       "/* Span, style for the box shown on hovering the info icon */\n",
       ".sk-estimator-doc-link span {\n",
       "  display: none;\n",
       "  z-index: 9999;\n",
       "  position: relative;\n",
       "  font-weight: normal;\n",
       "  right: .2ex;\n",
       "  padding: .5ex;\n",
       "  margin: .5ex;\n",
       "  width: min-content;\n",
       "  min-width: 20ex;\n",
       "  max-width: 50ex;\n",
       "  color: var(--sklearn-color-text);\n",
       "  box-shadow: 2pt 2pt 4pt #999;\n",
       "  /* unfitted */\n",
       "  background: var(--sklearn-color-unfitted-level-0);\n",
       "  border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
       "}\n",
       "\n",
       ".sk-estimator-doc-link.fitted span {\n",
       "  /* fitted */\n",
       "  background: var(--sklearn-color-fitted-level-0);\n",
       "  border: var(--sklearn-color-fitted-level-3);\n",
       "}\n",
       "\n",
       ".sk-estimator-doc-link:hover span {\n",
       "  display: block;\n",
       "}\n",
       "\n",
       "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
       "\n",
       "#sk-container-id-1 a.estimator_doc_link {\n",
       "  float: right;\n",
       "  font-size: 1rem;\n",
       "  line-height: 1em;\n",
       "  font-family: monospace;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "  border-radius: 1rem;\n",
       "  height: 1rem;\n",
       "  width: 1rem;\n",
       "  text-decoration: none;\n",
       "  /* unfitted */\n",
       "  color: var(--sklearn-color-unfitted-level-1);\n",
       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
       "}\n",
       "\n",
       "#sk-container-id-1 a.estimator_doc_link.fitted {\n",
       "  /* fitted */\n",
       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
       "  color: var(--sklearn-color-fitted-level-1);\n",
       "}\n",
       "\n",
       "/* On hover */\n",
       "#sk-container-id-1 a.estimator_doc_link:hover {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
       "  color: var(--sklearn-color-background);\n",
       "  text-decoration: none;\n",
       "}\n",
       "\n",
       "#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-3);\n",
       "}\n",
       "</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
       "              colsample_bylevel=None, colsample_bynode=None,\n",
       "              colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
       "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
       "              feature_weights=None, gamma=None, grow_policy=None,\n",
       "              importance_type=None, interaction_constraints=None,\n",
       "              learning_rate=None, max_bin=None, max_cat_threshold=None,\n",
       "              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n",
       "              max_leaves=None, min_child_weight=None, missing=nan,\n",
       "              monotone_constraints=None, multi_strategy=None, n_estimators=100,\n",
       "              n_jobs=None, num_parallel_tree=None, ...)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow\"><div><div>XGBClassifier</div></div><div><a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.0.0/python/python_api.html#xgboost.XGBClassifier\">?<span>Documentation for XGBClassifier</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></div></label><div class=\"sk-toggleable__content fitted\"><pre>XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
       "              colsample_bylevel=None, colsample_bynode=None,\n",
       "              colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
       "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
       "              feature_weights=None, gamma=None, grow_policy=None,\n",
       "              importance_type=None, interaction_constraints=None,\n",
       "              learning_rate=None, max_bin=None, max_cat_threshold=None,\n",
       "              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n",
       "              max_leaves=None, min_child_weight=None, missing=nan,\n",
       "              monotone_constraints=None, multi_strategy=None, n_estimators=100,\n",
       "              n_jobs=None, num_parallel_tree=None, ...)</pre></div> </div></div></div></div>"
      ],
      "text/plain": [
       "XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
       "              colsample_bylevel=None, colsample_bynode=None,\n",
       "              colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
       "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
       "              feature_weights=None, gamma=None, grow_policy=None,\n",
       "              importance_type=None, interaction_constraints=None,\n",
       "              learning_rate=None, max_bin=None, max_cat_threshold=None,\n",
       "              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n",
       "              max_leaves=None, min_child_weight=None, missing=nan,\n",
       "              monotone_constraints=None, multi_strategy=None, n_estimators=100,\n",
       "              n_jobs=None, num_parallel_tree=None, ...)"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(X_t, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "bc7be755-33d1-4be5-880e-af3a1215a2ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positive-class probability (column 1 of predict_proba) for each training row.\n",
    "y_pred = model.predict_proba(X_t)[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "d8f4cccf-de7d-4275-98c5-ac4be92d4c5f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 1.0\n"
     ]
    }
   ],
   "source": [
    "# Score the training split directly from the fitted model so this cell does not\n",
    "# depend on whichever split `y_pred` was last assigned from in another cell.\n",
    "auroc = roc_auc_score(y_train, model.predict_proba(X_t)[:, 1])\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "5fbdf77c-40c0-4c5b-b050-9cfb191bda8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positive-class probability for each test row; note this rebinds `y_pred` from train to test.\n",
    "y_pred = model.predict_proba(X_t_test)[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "dc9ca2fa-0fcc-40bb-a119-6e440e4ab35a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 1.0\n"
     ]
    }
   ],
   "source": [
    "# Score the test split straight from the fitted model so the result does not hinge\n",
    "# on `y_pred` having been refreshed by the preceding cell.\n",
    "auroc = roc_auc_score(y_test, model.predict_proba(X_t_test)[:, 1])\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "95e9d69d-e4ae-4bbb-9345-343a6cd1855c",
   "metadata": {},
   "source": [
    "## ICFESL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "53e1c2a9-2a7f-47e3-86cd-5168ead7edc2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-12-02 06:12:39.803\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.01 ------>\u001b[0m\n",
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n",
      "\u001b[32m2025-12-02 06:12:41.007\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m420\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:41.179\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m438\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:41.449\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.01 ------>\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:41.451\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.05 ------>\u001b[0m\n",
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n",
      "\u001b[32m2025-12-02 06:12:42.513\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m420\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:42.670\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m438\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:42.943\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.05 ------>\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:42.944\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.1 ------>\u001b[0m\n",
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n",
      "\u001b[32m2025-12-02 06:12:43.911\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m420\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:44.085\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m438\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:44.358\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.1 ------>\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:44.358\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.2 ------>\u001b[0m\n",
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n",
      "\u001b[32m2025-12-02 06:12:45.305\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m420\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:45.481\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m438\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:45.754\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.2 ------>\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:45.754\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.5 ------>\u001b[0m\n",
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n",
      "\u001b[32m2025-12-02 06:12:46.735\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m420\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:46.865\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m438\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:47.141\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.5 ------>\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:47.142\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 1 ------>\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:48.076\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m420\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:48.213\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m438\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:48.407\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 1 ------>\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:48.407\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 5 ------>\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:49.303\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m420\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:49.434\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m438\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:49.647\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 5 ------>\u001b[0m\n",
      "\u001b[32m2025-12-02 06:12:49.649\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m532\u001b[0m - \u001b[1msearch stopped: model fit scores are decreasing...\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# Run the ICFESL search over a grid of L2 regularization factors (`alphas`).\n",
    "# NOTE(review): the test split is passed into the search -- confirm `best_index`\n",
    "# is not being selected on test-set metrics.\n",
    "(\n",
    "    fit_info_panel,\n",
    "    best_index,\n",
    "    fit_figs,\n",
    "    cluster_groups,\n",
    "    criterions,\n",
    "    inertias,\n",
    "    gap_statss,\n",
    ") = icfesl.regularized_search_algorun(\n",
    "    X2,\n",
    "    pd.Series(y_train),\n",
    "    X2_test,\n",
    "    pd.Series(y_test),\n",
    "    cat_vars,\n",
    "    'classification',\n",
    "    alphas=[0.01, 0.05, 0.1, 0.2, 0.5, 1, 5],\n",
    "    cbine_column=False,\n",
    "    distance_threshold=0.002,\n",
    "    figure=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "f81f16b3-a97b-41cb-ba53-ee2604d4dc35",
   "metadata": {},
   "outputs": [],
   "source": [
    "# fit_figs is unpacked as exactly two plot objects: decision plot first, summary plot second.\n",
    "(decision_plot, summary_plot) = fit_figs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "c5b1463e-4840-4fcb-8c56-77ef594c561e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the per-experiment fit summary to an Excel workbook (path is relative to the kernel's cwd).\n",
    "fit_info_panel.to_excel(\"mushroom_fit_info.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "5c294ae1-b7f6-4174-8baf-09c98380428d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the decision plot returned by the search to a PNG next to the notebook's working directory.\n",
    "decision_plot.savefig('decision_plot_mushroom.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "8bfaae83-43dd-40d7-b52b-58820fde955f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the summary plot returned by the search to a PNG next to the notebook's working directory.\n",
    "summary_plot.savefig('summary_plot_mushroom.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "2f27aeac-8890-4402-aee9-ee56b7cfed90",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Experiment</th>\n",
       "      <th>dof</th>\n",
       "      <th>reg_fit_time</th>\n",
       "      <th>reg_training_auroc</th>\n",
       "      <th>reg_testing_auroc</th>\n",
       "      <th>xgb_fit_time</th>\n",
       "      <th>xgb_training_auroc</th>\n",
       "      <th>xgb_testing_auroc</th>\n",
       "      <th>var_inf</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>34</td>\n",
       "      <td>0.1604</td>\n",
       "      <td>0.999897</td>\n",
       "      <td>0.999592</td>\n",
       "      <td>0.2642</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000075</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>34</td>\n",
       "      <td>0.1481</td>\n",
       "      <td>0.999897</td>\n",
       "      <td>0.999592</td>\n",
       "      <td>0.2625</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000068</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>34</td>\n",
       "      <td>0.1646</td>\n",
       "      <td>0.999897</td>\n",
       "      <td>0.999592</td>\n",
       "      <td>0.2664</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000063</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>33</td>\n",
       "      <td>0.1659</td>\n",
       "      <td>0.999856</td>\n",
       "      <td>0.999451</td>\n",
       "      <td>0.2662</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000065</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>34</td>\n",
       "      <td>0.1236</td>\n",
       "      <td>0.999620</td>\n",
       "      <td>0.999086</td>\n",
       "      <td>0.2709</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000071</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>34</td>\n",
       "      <td>0.1269</td>\n",
       "      <td>0.999660</td>\n",
       "      <td>0.999229</td>\n",
       "      <td>0.1777</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000054</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>6</td>\n",
       "      <td>34</td>\n",
       "      <td>0.1256</td>\n",
       "      <td>0.999366</td>\n",
       "      <td>0.998926</td>\n",
       "      <td>0.1969</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000046</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Experiment  dof  reg_fit_time  reg_training_auroc  reg_testing_auroc  \\\n",
       "0           0   34        0.1604            0.999897           0.999592   \n",
       "1           1   34        0.1481            0.999897           0.999592   \n",
       "2           2   34        0.1646            0.999897           0.999592   \n",
       "3           3   33        0.1659            0.999856           0.999451   \n",
       "4           4   34        0.1236            0.999620           0.999086   \n",
       "5           5   34        0.1269            0.999660           0.999229   \n",
       "6           6   34        0.1256            0.999366           0.998926   \n",
       "\n",
       "   xgb_fit_time  xgb_training_auroc  xgb_testing_auroc   var_inf  \n",
       "0        0.2642                 1.0                1.0  0.000075  \n",
       "1        0.2625                 1.0                1.0  0.000068  \n",
       "2        0.2664                 1.0                1.0  0.000063  \n",
       "3        0.2662                 1.0                1.0  0.000065  \n",
       "4        0.2709                 1.0                1.0  0.000071  \n",
       "5        0.1777                 1.0                1.0  0.000054  \n",
       "6        0.1969                 1.0                1.0  0.000046  "
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fit_info_panel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "09c5ad8a-555c-418f-9594-dfa75504ccf0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the cluster groups stored at `best_index` to both splits, train first, so the\n",
    "# two frames are built from the same grouping.\n",
    "X3 = icfesl.combine_features(X2, cluster_groups[best_index])\n",
    "X3_test = icfesl.combine_features(X2_test, cluster_groups[best_index])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "ca8394c1-e89e-4aaf-9c3c-5b58fdda63dd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "best_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "d74064bb-1ff6-46a6-ae21-bd1330787075",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-2 {\n",
       "  /* Definition of color scheme common for light and dark mode */\n",
       "  --sklearn-color-text: #000;\n",
       "  --sklearn-color-text-muted: #666;\n",
       "  --sklearn-color-line: gray;\n",
       "  /* Definition of color scheme for unfitted estimators */\n",
       "  --sklearn-color-unfitted-level-0: #fff5e6;\n",
       "  --sklearn-color-unfitted-level-1: #f6e4d2;\n",
       "  --sklearn-color-unfitted-level-2: #ffe0b3;\n",
       "  --sklearn-color-unfitted-level-3: chocolate;\n",
       "  /* Definition of color scheme for fitted estimators */\n",
       "  --sklearn-color-fitted-level-0: #f0f8ff;\n",
       "  --sklearn-color-fitted-level-1: #d4ebff;\n",
       "  --sklearn-color-fitted-level-2: #b3dbfd;\n",
       "  --sklearn-color-fitted-level-3: cornflowerblue;\n",
       "\n",
       "  /* Specific color for light theme */\n",
       "  --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
       "  --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
       "  --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
       "  --sklearn-color-icon: #696969;\n",
       "\n",
       "  @media (prefers-color-scheme: dark) {\n",
       "    /* Redefinition of color scheme for dark theme */\n",
       "    --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
       "    --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
       "    --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
       "    --sklearn-color-icon: #878787;\n",
       "  }\n",
       "}\n",
       "\n",
       "#sk-container-id-2 {\n",
       "  color: var(--sklearn-color-text);\n",
       "}\n",
       "\n",
       "#sk-container-id-2 pre {\n",
       "  padding: 0;\n",
       "}\n",
       "\n",
       "#sk-container-id-2 input.sk-hidden--visually {\n",
       "  border: 0;\n",
       "  clip: rect(1px 1px 1px 1px);\n",
       "  clip: rect(1px, 1px, 1px, 1px);\n",
       "  height: 1px;\n",
       "  margin: -1px;\n",
       "  overflow: hidden;\n",
       "  padding: 0;\n",
       "  position: absolute;\n",
       "  width: 1px;\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-dashed-wrapped {\n",
       "  border: 1px dashed var(--sklearn-color-line);\n",
       "  margin: 0 0.4em 0.5em 0.4em;\n",
       "  box-sizing: border-box;\n",
       "  padding-bottom: 0.4em;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-container {\n",
       "  /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
       "     but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
       "     so we also need the `!important` here to be able to override the\n",
       "     default hidden behavior on the sphinx rendered scikit-learn.org.\n",
       "     See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
       "  display: inline-block !important;\n",
       "  position: relative;\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-text-repr-fallback {\n",
       "  display: none;\n",
       "}\n",
       "\n",
       "div.sk-parallel-item,\n",
       "div.sk-serial,\n",
       "div.sk-item {\n",
       "  /* draw centered vertical line to link estimators */\n",
       "  background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
       "  background-size: 2px 100%;\n",
       "  background-repeat: no-repeat;\n",
       "  background-position: center center;\n",
       "}\n",
       "\n",
       "/* Parallel-specific style estimator block */\n",
       "\n",
       "#sk-container-id-2 div.sk-parallel-item::after {\n",
       "  content: \"\";\n",
       "  width: 100%;\n",
       "  border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
       "  flex-grow: 1;\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-parallel {\n",
       "  display: flex;\n",
       "  align-items: stretch;\n",
       "  justify-content: center;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "  position: relative;\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-parallel-item {\n",
       "  display: flex;\n",
       "  flex-direction: column;\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-parallel-item:first-child::after {\n",
       "  align-self: flex-end;\n",
       "  width: 50%;\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-parallel-item:last-child::after {\n",
       "  align-self: flex-start;\n",
       "  width: 50%;\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-parallel-item:only-child::after {\n",
       "  width: 0;\n",
       "}\n",
       "\n",
       "/* Serial-specific style estimator block */\n",
       "\n",
       "#sk-container-id-2 div.sk-serial {\n",
       "  display: flex;\n",
       "  flex-direction: column;\n",
       "  align-items: center;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "  padding-right: 1em;\n",
       "  padding-left: 1em;\n",
       "}\n",
       "\n",
       "\n",
       "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
       "clickable and can be expanded/collapsed.\n",
       "- Pipeline and ColumnTransformer use this feature and define the default style\n",
       "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
       "*/\n",
       "\n",
       "/* Pipeline and ColumnTransformer style (default) */\n",
       "\n",
       "#sk-container-id-2 div.sk-toggleable {\n",
       "  /* Default theme specific background. It is overwritten whether we have a\n",
       "  specific estimator or a Pipeline/ColumnTransformer */\n",
       "  background-color: var(--sklearn-color-background);\n",
       "}\n",
       "\n",
       "/* Toggleable label */\n",
       "#sk-container-id-2 label.sk-toggleable__label {\n",
       "  cursor: pointer;\n",
       "  display: flex;\n",
       "  width: 100%;\n",
       "  margin-bottom: 0;\n",
       "  padding: 0.5em;\n",
       "  box-sizing: border-box;\n",
       "  text-align: center;\n",
       "  align-items: start;\n",
       "  justify-content: space-between;\n",
       "  gap: 0.5em;\n",
       "}\n",
       "\n",
       "#sk-container-id-2 label.sk-toggleable__label .caption {\n",
       "  font-size: 0.6rem;\n",
       "  font-weight: lighter;\n",
       "  color: var(--sklearn-color-text-muted);\n",
       "}\n",
       "\n",
       "#sk-container-id-2 label.sk-toggleable__label-arrow:before {\n",
       "  /* Arrow on the left of the label */\n",
       "  content: \"▸\";\n",
       "  float: left;\n",
       "  margin-right: 0.25em;\n",
       "  color: var(--sklearn-color-icon);\n",
       "}\n",
       "\n",
       "#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {\n",
       "  color: var(--sklearn-color-text);\n",
       "}\n",
       "\n",
       "/* Toggleable content - dropdown */\n",
       "\n",
       "#sk-container-id-2 div.sk-toggleable__content {\n",
       "  max-height: 0;\n",
       "  max-width: 0;\n",
       "  overflow: hidden;\n",
       "  text-align: left;\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-toggleable__content.fitted {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-toggleable__content pre {\n",
       "  margin: 0.2em;\n",
       "  border-radius: 0.25em;\n",
       "  color: var(--sklearn-color-text);\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-toggleable__content.fitted pre {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
       "  /* Expand drop-down */\n",
       "  max-height: 200px;\n",
       "  max-width: 100%;\n",
       "  overflow: auto;\n",
       "}\n",
       "\n",
       "#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
       "  content: \"▾\";\n",
       "}\n",
       "\n",
       "/* Pipeline/ColumnTransformer-specific style */\n",
       "\n",
       "#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  color: var(--sklearn-color-text);\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "/* Estimator-specific style */\n",
       "\n",
       "/* Colorize estimator box */\n",
       "#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-label label.sk-toggleable__label,\n",
       "#sk-container-id-2 div.sk-label label {\n",
       "  /* The background is the default theme color */\n",
       "  color: var(--sklearn-color-text-on-default-background);\n",
       "}\n",
       "\n",
       "/* On hover, darken the color of the background */\n",
       "#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {\n",
       "  color: var(--sklearn-color-text);\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
       "/* Label box, darken color on hover, fitted */\n",
       "#sk-container-id-2 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
       "  color: var(--sklearn-color-text);\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "/* Estimator label */\n",
       "\n",
       "#sk-container-id-2 div.sk-label label {\n",
       "  font-family: monospace;\n",
       "  font-weight: bold;\n",
       "  display: inline-block;\n",
       "  line-height: 1.2em;\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-label-container {\n",
       "  text-align: center;\n",
       "}\n",
       "\n",
       "/* Estimator-specific */\n",
       "#sk-container-id-2 div.sk-estimator {\n",
       "  font-family: monospace;\n",
       "  border: 1px dotted var(--sklearn-color-border-box);\n",
       "  border-radius: 0.25em;\n",
       "  box-sizing: border-box;\n",
       "  margin-bottom: 0.5em;\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-estimator.fitted {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-0);\n",
       "}\n",
       "\n",
       "/* on hover */\n",
       "#sk-container-id-2 div.sk-estimator:hover {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
       "#sk-container-id-2 div.sk-estimator.fitted:hover {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
       "\n",
       "/* Common style for \"i\" and \"?\" */\n",
       "\n",
       ".sk-estimator-doc-link,\n",
       "a:link.sk-estimator-doc-link,\n",
       "a:visited.sk-estimator-doc-link {\n",
       "  float: right;\n",
       "  font-size: smaller;\n",
       "  line-height: 1em;\n",
       "  font-family: monospace;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "  border-radius: 1em;\n",
       "  height: 1em;\n",
       "  width: 1em;\n",
       "  text-decoration: none !important;\n",
       "  margin-left: 0.5em;\n",
       "  text-align: center;\n",
       "  /* unfitted */\n",
       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
       "  color: var(--sklearn-color-unfitted-level-1);\n",
       "}\n",
       "\n",
       ".sk-estimator-doc-link.fitted,\n",
       "a:link.sk-estimator-doc-link.fitted,\n",
       "a:visited.sk-estimator-doc-link.fitted {\n",
       "  /* fitted */\n",
       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
       "  color: var(--sklearn-color-fitted-level-1);\n",
       "}\n",
       "\n",
       "/* On hover */\n",
       "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
       ".sk-estimator-doc-link:hover,\n",
       "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
       ".sk-estimator-doc-link:hover {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
       "  color: var(--sklearn-color-background);\n",
       "  text-decoration: none;\n",
       "}\n",
       "\n",
       "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
       ".sk-estimator-doc-link.fitted:hover,\n",
       "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
       ".sk-estimator-doc-link.fitted:hover {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-3);\n",
       "  color: var(--sklearn-color-background);\n",
       "  text-decoration: none;\n",
       "}\n",
       "\n",
       "/* Span, style for the box shown on hovering the info icon */\n",
       ".sk-estimator-doc-link span {\n",
       "  display: none;\n",
       "  z-index: 9999;\n",
       "  position: relative;\n",
       "  font-weight: normal;\n",
       "  right: .2ex;\n",
       "  padding: .5ex;\n",
       "  margin: .5ex;\n",
       "  width: min-content;\n",
       "  min-width: 20ex;\n",
       "  max-width: 50ex;\n",
       "  color: var(--sklearn-color-text);\n",
       "  box-shadow: 2pt 2pt 4pt #999;\n",
       "  /* unfitted */\n",
       "  background: var(--sklearn-color-unfitted-level-0);\n",
       "  border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
       "}\n",
       "\n",
       ".sk-estimator-doc-link.fitted span {\n",
       "  /* fitted */\n",
       "  background: var(--sklearn-color-fitted-level-0);\n",
       "  border: var(--sklearn-color-fitted-level-3);\n",
       "}\n",
       "\n",
       ".sk-estimator-doc-link:hover span {\n",
       "  display: block;\n",
       "}\n",
       "\n",
       "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
       "\n",
       "#sk-container-id-2 a.estimator_doc_link {\n",
       "  float: right;\n",
       "  font-size: 1rem;\n",
       "  line-height: 1em;\n",
       "  font-family: monospace;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "  border-radius: 1rem;\n",
       "  height: 1rem;\n",
       "  width: 1rem;\n",
       "  text-decoration: none;\n",
       "  /* unfitted */\n",
       "  color: var(--sklearn-color-unfitted-level-1);\n",
       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
       "}\n",
       "\n",
       "#sk-container-id-2 a.estimator_doc_link.fitted {\n",
       "  /* fitted */\n",
       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
       "  color: var(--sklearn-color-fitted-level-1);\n",
       "}\n",
       "\n",
       "/* On hover */\n",
       "#sk-container-id-2 a.estimator_doc_link:hover {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
       "  color: var(--sklearn-color-background);\n",
       "  text-decoration: none;\n",
       "}\n",
       "\n",
       "#sk-container-id-2 a.estimator_doc_link.fitted:hover {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-3);\n",
       "}\n",
       "</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
       "              colsample_bylevel=None, colsample_bynode=None,\n",
       "              colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
       "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
       "              feature_weights=None, gamma=None, grow_policy=None,\n",
       "              importance_type=None, interaction_constraints=None,\n",
       "              learning_rate=None, max_bin=None, max_cat_threshold=None,\n",
       "              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n",
       "              max_leaves=None, min_child_weight=None, missing=nan,\n",
       "              monotone_constraints=None, multi_strategy=None, n_estimators=100,\n",
       "              n_jobs=None, num_parallel_tree=None, ...)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" checked><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow\"><div><div>XGBClassifier</div></div><div><a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://xgboost.readthedocs.io/en/release_3.0.0/python/python_api.html#xgboost.XGBClassifier\">?<span>Documentation for XGBClassifier</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></div></label><div class=\"sk-toggleable__content fitted\"><pre>XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
       "              colsample_bylevel=None, colsample_bynode=None,\n",
       "              colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
       "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
       "              feature_weights=None, gamma=None, grow_policy=None,\n",
       "              importance_type=None, interaction_constraints=None,\n",
       "              learning_rate=None, max_bin=None, max_cat_threshold=None,\n",
       "              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n",
       "              max_leaves=None, min_child_weight=None, missing=nan,\n",
       "              monotone_constraints=None, multi_strategy=None, n_estimators=100,\n",
       "              n_jobs=None, num_parallel_tree=None, ...)</pre></div> </div></div></div></div>"
      ],
      "text/plain": [
       "XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
       "              colsample_bylevel=None, colsample_bynode=None,\n",
       "              colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
       "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
       "              feature_weights=None, gamma=None, grow_policy=None,\n",
       "              importance_type=None, interaction_constraints=None,\n",
       "              learning_rate=None, max_bin=None, max_cat_threshold=None,\n",
       "              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n",
       "              max_leaves=None, min_child_weight=None, missing=nan,\n",
       "              monotone_constraints=None, multi_strategy=None, n_estimators=100,\n",
       "              n_jobs=None, num_parallel_tree=None, ...)"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = XGBClassifier(n_estimators=100, random_state=200)\n",
    "model.fit(X3, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "9b684532-ad46-496d-a1c3-7df6ff27abbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "9be7edbc-fe83-4f47-80fb-3f5087e18ebf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy_score(y_train, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "0f8a4273-c052-4aa8-a3ec-6b3ffbb1eb53",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred = model.predict(X3_test)\n",
    "accuracy_score(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "7ad23cdd-c393-4084-960a-b2997fc5a023",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'x+0': ['x::3'],\n",
       "  'x+1': ['x::2', 'x::5'],\n",
       "  's+0': ['s::2', 's::3'],\n",
       "  'n+0': ['n::2', 'n::3', 'n::4'],\n",
       "  'n+1': ['n::9'],\n",
       "  'n+2': ['n::8'],\n",
       "  't+0': ['t::1'],\n",
       "  'c+0': ['c::1'],\n",
       "  'n.1+0': ['n.1::1'],\n",
       "  'k+0': ['k::2', 'k::3', 'k::5', 'k::7', 'k::9', 'k::10'],\n",
       "  'e+0': ['e::1'],\n",
       "  'e.1+0': ['e.1::2', 'e.1::3'],\n",
       "  'e.1+1': ['e.1::1'],\n",
       "  's.1+0': ['s.1::2'],\n",
       "  's.1+1': ['s.1::1'],\n",
       "  's.2+0': ['s.2::2'],\n",
       "  's.2+1': ['s.2::1'],\n",
       "  'w+0': ['w::3', 'w::6', 'w::7'],\n",
       "  'w+1': ['w::4'],\n",
       "  'w.1+0': ['w.1::3', 'w.1::6', 'w.1::7'],\n",
       "  'w.1+1': ['w.1::4'],\n",
       "  'o+0': ['o::2'],\n",
       "  'o+1': ['o::1'],\n",
       "  'p.3+0': ['p.3::4'],\n",
       "  'p.3+1': ['p.3::2'],\n",
       "  'k.1+0': ['k.1::2', 'k.1::3'],\n",
       "  'k.1+1': ['k.1::7'],\n",
       "  'k.1+2': ['k.1::1'],\n",
       "  's.3+0': ['s.3::4'],\n",
       "  's.3+1': ['s.3::5'],\n",
       "  's.3+2': ['s.3::3'],\n",
       "  'u+0': ['u::2'],\n",
       "  'u+1': ['u::1'],\n",
       "  'u+2': ['u::4']},\n",
       " {'x+0': ['x::3'],\n",
       "  'x+1': ['x::2', 'x::5'],\n",
       "  's+0': ['s::2', 's::3'],\n",
       "  'n+0': ['n::2', 'n::3', 'n::4'],\n",
       "  'n+1': ['n::9'],\n",
       "  'n+2': ['n::8'],\n",
       "  't+0': ['t::1'],\n",
       "  'c+0': ['c::1'],\n",
       "  'n.1+0': ['n.1::1'],\n",
       "  'k+0': ['k::2', 'k::3', 'k::5', 'k::7', 'k::9', 'k::10'],\n",
       "  'e+0': ['e::1'],\n",
       "  'e.1+0': ['e.1::2', 'e.1::3'],\n",
       "  'e.1+1': ['e.1::1'],\n",
       "  's.1+0': ['s.1::2'],\n",
       "  's.1+1': ['s.1::1'],\n",
       "  's.2+0': ['s.2::2'],\n",
       "  's.2+1': ['s.2::1'],\n",
       "  'w+0': ['w::3', 'w::6', 'w::7'],\n",
       "  'w+1': ['w::4'],\n",
       "  'w.1+0': ['w.1::3', 'w.1::6', 'w.1::7'],\n",
       "  'w.1+1': ['w.1::4'],\n",
       "  'o+0': ['o::2'],\n",
       "  'o+1': ['o::1'],\n",
       "  'p.3+0': ['p.3::4'],\n",
       "  'p.3+1': ['p.3::2'],\n",
       "  'k.1+0': ['k.1::2', 'k.1::3'],\n",
       "  'k.1+1': ['k.1::7'],\n",
       "  'k.1+2': ['k.1::1'],\n",
       "  's.3+0': ['s.3::4'],\n",
       "  's.3+1': ['s.3::5'],\n",
       "  's.3+2': ['s.3::3'],\n",
       "  'u+0': ['u::2'],\n",
       "  'u+1': ['u::1'],\n",
       "  'u+2': ['u::4']},\n",
       " {'x+0': ['x::3'],\n",
       "  'x+1': ['x::2', 'x::5'],\n",
       "  's+0': ['s::2', 's::3'],\n",
       "  'n+0': ['n::2', 'n::3', 'n::4'],\n",
       "  'n+1': ['n::8'],\n",
       "  'n+2': ['n::9'],\n",
       "  't+0': ['t::1'],\n",
       "  'c+0': ['c::1'],\n",
       "  'n.1+0': ['n.1::1'],\n",
       "  'k+0': ['k::2', 'k::3', 'k::5', 'k::7', 'k::9', 'k::10'],\n",
       "  'e+0': ['e::1'],\n",
       "  'e.1+0': ['e.1::2', 'e.1::3'],\n",
       "  'e.1+1': ['e.1::1'],\n",
       "  's.1+0': ['s.1::2'],\n",
       "  's.1+1': ['s.1::1'],\n",
       "  's.2+0': ['s.2::2'],\n",
       "  's.2+1': ['s.2::1'],\n",
       "  'w+0': ['w::3', 'w::6', 'w::7'],\n",
       "  'w+1': ['w::4'],\n",
       "  'w.1+0': ['w.1::3', 'w.1::6', 'w.1::7'],\n",
       "  'w.1+1': ['w.1::4'],\n",
       "  'o+0': ['o::2'],\n",
       "  'o+1': ['o::1'],\n",
       "  'p.3+0': ['p.3::4'],\n",
       "  'p.3+1': ['p.3::2'],\n",
       "  'k.1+0': ['k.1::2', 'k.1::3'],\n",
       "  'k.1+1': ['k.1::7'],\n",
       "  'k.1+2': ['k.1::1'],\n",
       "  's.3+0': ['s.3::4'],\n",
       "  's.3+1': ['s.3::5'],\n",
       "  's.3+2': ['s.3::3'],\n",
       "  'u+0': ['u::2'],\n",
       "  'u+1': ['u::1'],\n",
       "  'u+2': ['u::4']},\n",
       " {'x+0': ['x::3'],\n",
       "  'x+1': ['x::2', 'x::5'],\n",
       "  's+0': ['s::2', 's::3'],\n",
       "  'n+0': ['n::2', 'n::3', 'n::4'],\n",
       "  'n+1': ['n::8'],\n",
       "  'n+2': ['n::9'],\n",
       "  't+0': ['t::1'],\n",
       "  'c+0': ['c::1'],\n",
       "  'n.1+0': ['n.1::1'],\n",
       "  'k+0': ['k::2', 'k::3', 'k::5', 'k::7', 'k::9', 'k::10'],\n",
       "  'e+0': ['e::1'],\n",
       "  'e.1+0': ['e.1::2', 'e.1::3'],\n",
       "  'e.1+1': ['e.1::1'],\n",
       "  's.1+0': ['s.1::2'],\n",
       "  's.1+1': ['s.1::1'],\n",
       "  's.2+0': ['s.2::2'],\n",
       "  's.2+1': ['s.2::1'],\n",
       "  'w+0': ['w::3', 'w::6', 'w::7'],\n",
       "  'w+1': ['w::4'],\n",
       "  'w.1+0': ['w.1::3', 'w.1::6', 'w.1::7'],\n",
       "  'w.1+1': ['w.1::4'],\n",
       "  'o+0': ['o::2'],\n",
       "  'o+1': ['o::1'],\n",
       "  'p.3+0': ['p.3::4'],\n",
       "  'p.3+1': ['p.3::2'],\n",
       "  'k.1+0': ['k.1::2', 'k.1::3'],\n",
       "  'k.1+1': ['k.1::7'],\n",
       "  'k.1+2': ['k.1::1'],\n",
       "  's.3+0': ['s.3::4'],\n",
       "  's.3+1': ['s.3::5'],\n",
       "  's.3+2': ['s.3::3'],\n",
       "  'u+0': ['u::2', 'u::4'],\n",
       "  'u+1': ['u::1']},\n",
       " {'x+0': ['x::3'],\n",
       "  'x+1': ['x::2'],\n",
       "  'x+2': ['x::5'],\n",
       "  's+0': ['s::2', 's::3'],\n",
       "  'n+0': ['n::2', 'n::4'],\n",
       "  'n+1': ['n::8'],\n",
       "  'n+2': ['n::3', 'n::9'],\n",
       "  't+0': ['t::1'],\n",
       "  'c+0': ['c::1'],\n",
       "  'n.1+0': ['n.1::1'],\n",
       "  'k+0': ['k::2', 'k::3', 'k::5', 'k::7', 'k::9', 'k::10'],\n",
       "  'e+0': ['e::1'],\n",
       "  'e.1+0': ['e.1::2', 'e.1::3'],\n",
       "  'e.1+1': ['e.1::1'],\n",
       "  's.1+0': ['s.1::2'],\n",
       "  's.1+1': ['s.1::1'],\n",
       "  's.2+0': ['s.2::2'],\n",
       "  's.2+1': ['s.2::1'],\n",
       "  'w+0': ['w::6', 'w::7'],\n",
       "  'w+1': ['w::3'],\n",
       "  'w+2': ['w::4'],\n",
       "  'w.1+0': ['w.1::3', 'w.1::6', 'w.1::7'],\n",
       "  'w.1+1': ['w.1::4'],\n",
       "  'o+0': ['o::2'],\n",
       "  'o+1': ['o::1'],\n",
       "  'p.3+0': ['p.3::4'],\n",
       "  'p.3+1': ['p.3::2'],\n",
       "  'k.1+0': ['k.1::2', 'k.1::3'],\n",
       "  'k.1+1': ['k.1::1', 'k.1::7'],\n",
       "  's.3+0': ['s.3::4'],\n",
       "  's.3+1': ['s.3::5'],\n",
       "  's.3+2': ['s.3::3'],\n",
       "  'u+0': ['u::2', 'u::4'],\n",
       "  'u+1': ['u::1']},\n",
       " {'x+0': ['x::3', 'x::5'],\n",
       "  'x+1': ['x::2'],\n",
       "  's+0': ['s::2', 's::3'],\n",
       "  'n+0': ['n::3', 'n::4', 'n::9'],\n",
       "  'n+1': ['n::8'],\n",
       "  'n+2': ['n::2'],\n",
       "  't+0': ['t::1'],\n",
       "  'c+0': ['c::1'],\n",
       "  'n.1+0': ['n.1::1'],\n",
       "  'k+0': ['k::2', 'k::3', 'k::5', 'k::7', 'k::9', 'k::10'],\n",
       "  'e+0': ['e::1'],\n",
       "  'e.1+0': ['e.1::2', 'e.1::3'],\n",
       "  'e.1+1': ['e.1::1'],\n",
       "  's.1+0': ['s.1::2'],\n",
       "  's.1+1': ['s.1::1'],\n",
       "  's.2+0': ['s.2::2'],\n",
       "  's.2+1': ['s.2::1'],\n",
       "  'w+0': ['w::6', 'w::7'],\n",
       "  'w+1': ['w::3'],\n",
       "  'w+2': ['w::4'],\n",
       "  'w.1+0': ['w.1::6', 'w.1::7'],\n",
       "  'w.1+1': ['w.1::4'],\n",
       "  'w.1+2': ['w.1::3'],\n",
       "  'o+0': ['o::2'],\n",
       "  'o+1': ['o::1'],\n",
       "  'p.3+0': ['p.3::4'],\n",
       "  'p.3+1': ['p.3::2'],\n",
       "  'k.1+0': ['k.1::2', 'k.1::3'],\n",
       "  'k.1+1': ['k.1::1', 'k.1::7'],\n",
       "  's.3+0': ['s.3::4'],\n",
       "  's.3+1': ['s.3::5'],\n",
       "  's.3+2': ['s.3::3'],\n",
       "  'u+0': ['u::2', 'u::4'],\n",
       "  'u+1': ['u::1']},\n",
       " {'x+0': ['x::2', 'x::3', 'x::5'],\n",
       "  's+0': ['s::3'],\n",
       "  's+1': ['s::2'],\n",
       "  'n+0': ['n::3', 'n::4', 'n::9'],\n",
       "  'n+1': ['n::8'],\n",
       "  'n+2': ['n::2'],\n",
       "  't+0': ['t::1'],\n",
       "  'c+0': ['c::1'],\n",
       "  'n.1+0': ['n.1::1'],\n",
       "  'k+0': ['k::5', 'k::7', 'k::9', 'k::10'],\n",
       "  'k+1': ['k::2', 'k::3'],\n",
       "  'e+0': ['e::1'],\n",
       "  'e.1+0': ['e.1::2', 'e.1::3'],\n",
       "  'e.1+1': ['e.1::1'],\n",
       "  's.1+0': ['s.1::2'],\n",
       "  's.1+1': ['s.1::1'],\n",
       "  's.2+0': ['s.2::2'],\n",
       "  's.2+1': ['s.2::1'],\n",
       "  'w+0': ['w::4', 'w::6'],\n",
       "  'w+1': ['w::3'],\n",
       "  'w+2': ['w::7'],\n",
       "  'w.1+0': ['w.1::6', 'w.1::7'],\n",
       "  'w.1+1': ['w.1::4'],\n",
       "  'w.1+2': ['w.1::3'],\n",
       "  'o+0': ['o::2'],\n",
       "  'o+1': ['o::1'],\n",
       "  'p.3+0': ['p.3::4'],\n",
       "  'p.3+1': ['p.3::2'],\n",
       "  'k.1+0': ['k.1::2', 'k.1::3'],\n",
       "  'k.1+1': ['k.1::1', 'k.1::7'],\n",
       "  's.3+0': ['s.3::3', 's.3::4'],\n",
       "  's.3+1': ['s.3::5'],\n",
       "  'u+0': ['u::2'],\n",
       "  'u+1': ['u::1', 'u::4']}]"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cluster_groups"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20c86cbd-9fa1-415c-be03-824ea0ebaeea",
   "metadata": {},
   "source": [
    "### CBind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "a92e3869-cf90-4efb-8312-7627dfc728d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "cgrouping = icfesl.group_categorical_features(X2, selected_column_names, distance_threshold=np.mean(y)*0.1)\n",
    "X4 = icfesl.combine_features(X2, cgrouping)\n",
    "X4_test = icfesl.combine_features(X2_test, cgrouping)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09ae5881-df34-4e61-8a90-833fc81eb95a",
   "metadata": {},
   "source": [
    "#### 1. logit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "038c76a3-77cc-41cb-818b-ba6dc446226a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.2003\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model = sm.GLM(y_train, sm.add_constant(X4, has_constant='skip'), family=sm.families.Binomial()).fit(disp=False)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "994cb781-37a5-40c8-ba4d-46506bed5e16",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:48; R2: 0.7413685080511012\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.pseudo_rsquared()}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "73942996-167c-462d-b8d9-efc705caddde",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X4, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "ea1c02e4-3a74-4dfd-b328-639290801bdb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 0.9998804860021242\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "debe27cb-b036-47ad-93c0-2d1d432bf371",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X4_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "457f1030-063e-498d-aacb-1685593d0886",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 0.9996146219583144\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "edcefdd0-76ae-4aa1-a22f-208878e2bc68",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "97fe435d-ec99-46d6-84ab-7f38253af4cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBClassifier(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "af406fdb-7283-4d41-8443-b5c04e9b4bc5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.1438\n"
     ]
    }
   ],
   "source": [
    "# Time only the classifier fit on X4 and print the elapsed seconds (4 decimals).\n",
    "# perf_counter() is a monotonic, high-resolution clock intended for measuring short\n",
    "# intervals; time.time() is wall-clock time and can be coarse or jump if the system\n",
    "# clock is adjusted, which matters for sub-second fits like this one.\n",
    "start = time.perf_counter()\n",
    "model.fit(X4, y_train)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "660f84e8-e9ce-4b4a-9a0e-55d0928cbacd",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X4)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "cbd39fa5-2afa-49b3-8358-8a7db22f8597",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 1.0\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "3701c0c3-2be0-4a2a-b1e1-7b916666118e",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X4_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "9fa705da-ed57-48bb-9a45-a47f5331b74e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 1.0\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd487b13-b989-442b-9d57-4d2dde847b4a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d686a721-91c0-433b-8834-b575613019ec",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f597b250-7f14-4fa2-889b-014c17def955",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52622bc8-e6c6-4d92-93b0-1ad0326afb35",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
