{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d77e3e2d-0706-4fde-a71e-1ac3b74e8346",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.spatial.distance import cdist\n",
    "import random\n",
    "from sklearn.cluster import KMeans\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "from icfesl import *\n",
    "from utility_functions import *\n",
    "from xgboost import XGBClassifier\n",
    "from pytorch_tabnet.tab_model import TabNetClassifier\n",
    "import time\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy.stats import norm\n",
    "from catboost import CatBoostClassifier, Pool"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dd181542-b52a-4555-afdf-ba920573f9b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(420)\n",
    "n_samples = 50000\n",
    "sigma = 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b6513970-d79a-4beb-9517-dea61573012e",
   "metadata": {},
   "outputs": [],
   "source": [
    "n_variables = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "59c4ca71-57f4-4828-b297-248dc0e07b66",
   "metadata": {},
   "outputs": [],
   "source": [
    "n_levels = 1000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "afd1b4a4-2de9-4f25-b159-b61ccd671289",
   "metadata": {},
   "outputs": [],
   "source": [
    "noise_percentage = 0.03"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "664151bb-fc8a-4545-9345-0ee0f1ba41c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_vars = ['var1','var2']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "524eb3b6-1960-4635-b0d9-a90eed58524b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.stats import norm\n",
    "\n",
    "def generate_correlated_categorical_variables(num_samples, num_variables, sigma, categories_per_variable):\n",
    "\n",
    "    correlation_matrix = np.array([[1.0, sigma],[sigma, 1.0]])\n",
    "    L = np.linalg.cholesky(correlation_matrix)\n",
    "    uncorrelated_normals = np.random.normal(size=(num_samples, num_variables))\n",
    "    correlated_normals = uncorrelated_normals @ L.T\n",
    "\n",
    "    categorical_data = np.zeros_like(correlated_normals, dtype=int)\n",
    "\n",
    "    for i in range(num_variables):\n",
    "        num_categories = categories_per_variable[i]\n",
    "        quantiles = np.linspace(0, 1, num_categories + 1)[1:-1]\n",
    "        thresholds = norm.ppf(quantiles)\n",
    "\n",
    "        for j in range(num_categories):\n",
    "            if j == 0:\n",
    "                categorical_data[:, i][correlated_normals[:, i] <= thresholds[j]] = j\n",
    "            elif j == num_categories - 1:\n",
    "                categorical_data[:, i][correlated_normals[:, i] > thresholds[j-1]] = j\n",
    "            else:\n",
    "                categorical_data[:, i][(correlated_normals[:, i] > thresholds[j-1]) & (correlated_normals[:, i] <= thresholds[j])] = j\n",
    "\n",
    "    res = pd.DataFrame(categorical_data, columns=[f\"var{i+1}\" for i in range(num_variables)])\n",
    "                       \n",
    "    return res\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "040ee5e5-73ea-47b1-9e60-e3ca9bdaf127",
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_label_noise(labels, noise_level, num_classes=2):\n",
    "\n",
    "    noisy_labels = np.copy(labels)\n",
    "    num_samples = len(labels)\n",
    "    num_noisy_samples = int(num_samples * noise_level)\n",
    "\n",
    "    noisy_indices = np.random.choice(num_samples, num_noisy_samples, replace=False)\n",
    "\n",
    "    for idx in noisy_indices:\n",
    "        original_label = noisy_labels[idx]\n",
    "        possible_new_labels = [i for i in range(num_classes) if i != original_label]\n",
    "        noisy_labels[idx] = np.random.choice(possible_new_labels)\n",
    "        \n",
    "    return noisy_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e82673dc-d14a-4b93-9c8f-613f27855b1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = generate_correlated_categorical_variables(n_samples, n_variables, sigma, [n_levels, n_levels])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3049f72f-fc0b-47cd-9341-269490a4a249",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of simulated data: 0.7630958557128906 MB\n"
     ]
    }
   ],
   "source": [
    "simulated_data_size = sys.getsizeof(X)\n",
    "print(f\"Size of simulated data: {simulated_data_size/1024**2} MB\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3fb395f5-06e5-4b29-886a-d3d22ea99556",
   "metadata": {},
   "source": [
    "## Simulated study for classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "493bc876-7031-4fd8-a2f6-0d4636372984",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: overflow encountered in exp\n",
      "  result = getattr(ufunc, method)(*inputs, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "beta_var1 = 1\n",
    "beta_var2 = -1\n",
    "intercept = 1\n",
    "\n",
    "y = (1/(1+np.exp(-intercept - beta_var1 * X['var1'] - beta_var2 * X['var2'])))>0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "0050d2e5-15f2-4150-a773-00d786435f24",
   "metadata": {},
   "outputs": [],
   "source": [
    "y = add_label_noise(np.array(y), noise_percentage)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "46f3d45d-b176-4784-8935-c732c37fd356",
   "metadata": {},
   "outputs": [],
   "source": [
    "y = [int(i) for i in y.tolist()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "462dc5a8-4863-4e54-9448-5fa42125b828",
   "metadata": {},
   "outputs": [],
   "source": [
    "X['y'] = y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "5f333929-695c-4ec2-832d-8051963eb198",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "3848fd5d-177a-4d03-a12b-f4301b1bd3ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.sort_values('var1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "8d8483bd-5b4d-4087-8d13-92cdfc662ed3",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_unobserved = data.loc[data['var1']==n_levels-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "2ef433a9-a3f7-4f79-94ce-a99265858fdf",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_observed = data.loc[data['var1']<n_levels]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "38ca1c00-b163-462c-9fa2-b91354b44429",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(data_observed[cat_vars], data_observed['y'], test_size=0.3, random_state=35)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "21495b2e-2742-4490-9986-a47f4265d8d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test = pd.concat([X_test, data_unobserved[cat_vars]], axis=0)\n",
    "y_test = pd.concat([y_test, data_unobserved['y']], axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "e3528929-b2f8-462d-bb96-636052fcd93d",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = X_train.reset_index(drop=True)\n",
    "X_test = X_test.reset_index(drop=True)\n",
    "y_train = y_train.reset_index(drop=True)\n",
    "y_test = y_test.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "0360b9a9-ee20-4089-87c9-fab640ff5e21",
   "metadata": {},
   "outputs": [],
   "source": [
    "X2, encoder = icfesl.f_get_dummies(X_train, ['var1','var2'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "924c7ed4-5b53-440b-b103-77ad85033a86",
   "metadata": {},
   "outputs": [],
   "source": [
    "X2_test = icfesl.f_get_dummies(X_test, ['var1','var2'], encoder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "5fcaa6b4-8007-4e01-a0e9-4dcb8728a4a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "min_child_weight = np.floor(X2.shape[0]/X2.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "9d51c73a-70d3-475e-9376-6c8c5b6cf498",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "np.float64(17.0)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "min_child_weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "7c6871b4-c5c6-4b76-856a-cfcb3c4a7873",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of X2 data: 533.5237159729004 MB\n"
     ]
    }
   ],
   "source": [
    "X2_data_size = sys.getsizeof(X2)\n",
    "print(f\"Size of X2 data: {X2_data_size/1024**2} MB\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "59924c30-72c8-41e4-ac5c-9f63911b271c",
   "metadata": {},
   "source": [
    "### CatBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "a64d20c9-b58a-4edc-b9c2-44c6f4721c3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostClassifier(\n",
    "    iterations=100,  \n",
    "    loss_function='Logloss', custom_metric=['AUC'],\n",
    "    random_seed=42,  \n",
    "    verbose=False,    \n",
    "    min_data_in_leaf=min_child_weight\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "2cd0335b-e6d5-409f-b32d-9086c345f403",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.2596\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X_train, y_train, cat_features=cat_vars)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "e940226e-b8a7-4285-a743-0684b8fd6abd",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_train)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "c26341be-4a74-483b-9810-c7f36596adb9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC: 0.9298996427766095\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "599550aa-a1d9-4164-ae75-4785ed139d29",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "ffd15889-01a3-4d87-8d81-1892ef32eb98",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC: 0.8887539326419464\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a14325db-bbee-43d1-976c-176f0018bddb",
   "metadata": {},
   "source": [
    "### TabNet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "377e0174-52ab-44b0-8fbc-92597f657406",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_idxs = [X_train.columns.get_loc(col) for col in cat_vars]\n",
    "cat_dims = [X_train[col].nunique() for col in cat_vars]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "bf5c1e81-c149-454d-a677-7048b07eecd7",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = TabNetClassifier(verbose=0, seed=200, cat_idxs=cat_idxs, cat_dims=cat_dims)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "ff2493dd-9bbe-42ac-b8e0-c8799070b26b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stop training because you reached max_epochs = 50 with best_epoch = 45 and best_train_auc = 0.95747\n",
      "21.5613\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.13/site-packages/pytorch_tabnet/callbacks.py:172: UserWarning: Best weights from best epoch are automatically used!\n",
      "  warnings.warn(wrn_msg)\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X_train=X_train.to_numpy(), y_train=y_train.to_numpy(), eval_set=[(X_train.to_numpy(), y_train.to_numpy())], eval_name=['train'], max_epochs=50)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "994cbfd4-36d0-4879-b0be-5ab2eef8cf06",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_train.to_numpy())[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "2441371f-d15c-40d4-ad9d-590b03df2a0a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC:0.9574745808103796\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training AUROC:{auc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "c35b05c9-a34b-4fda-ae6e-f5bf8c8e9de2",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_test.to_numpy())[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "72114801-717a-4ba8-83a0-8cd697671770",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC:0.9405312524128264\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing AUROC:{auc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "04f66670-54f6-4c90-96e9-d65a096663b0",
   "metadata": {},
   "source": [
    "### One hot encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9407c138-1151-49a5-b15f-067c12f1fee9",
   "metadata": {},
   "source": [
    "#### 1. GLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "5876a562-9ad5-4796-a0b3-e6a250d0ad2d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "164.6428\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model = sm.GLM(y_train, sm.add_constant(X2, has_constant='skip'), family=sm.families.Binomial()).fit(disp=False)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "4cad784d-cd85-42a1-b674-2281c411ccd1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:1998; R2: 0.6105506315267526\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.pseudo_rsquared()}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "9d98efc3-7dd7-4684-8bb1-e6b18d630451",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X2, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "d522aafa-4a4f-49af-b35c-252568e8752a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC: 0.9732996515225705\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "dd3b6924-3608-4e29-9017-d332b8acc6dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X2_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "77159d20-1353-4815-9365-506dd9985242",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC: 0.9603902634516577\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c888aa95-d9f6-4694-80ac-05b2bbe8b61c",
   "metadata": {},
   "source": [
    "#### 2.xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "ed98a66f-01db-4ea3-a8b3-a78e955be1f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBClassifier(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "df42399f-ca2b-4525-8198-fe1fa9e3c0d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.9082\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X2, y_train)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "ae1430b7-aceb-4742-b934-a1eed4e907a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X2)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "40224e28-b679-452a-9534-678ae93c1867",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC: 0.7653734279083088\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "16ffb892-5e5e-4601-a5a5-653e1b86997a",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X2_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "040f7a6c-00dd-45e3-9c25-60e92992708a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC: 0.7240343992334388\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27a2a623-5281-47c0-acbe-0247e59a4334",
   "metadata": {},
   "source": [
    "### Target Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "586c16d4-4a09-4b8d-8dbc-76a35d822258",
   "metadata": {},
   "outputs": [],
   "source": [
    "from category_encoders import TargetEncoder\n",
    "enc_auto = TargetEncoder(cols=cat_vars, min_samples_leaf = min_child_weight).fit(X_train, y_train)\n",
    "X_t = enc_auto.transform(X_train)\n",
    "X_t_test = enc_auto.transform(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e0a8ae9-bd27-4cfa-9c5d-dfd479d121e3",
   "metadata": {},
   "source": [
    "#### 1. GLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "d2131938-e931-4f91-b99b-82c8f3d74226",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0457\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model = sm.GLM(y_train, sm.add_constant(X_t, has_constant='skip'), family=sm.families.Binomial()).fit(disp=False)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "a3053246-108a-46e7-98ae-89ee1ac35bbc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:2; R2: 0.5037383165150269\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.pseudo_rsquared()}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "5ccab110-ab74-4e63-9d09-0174472a644b",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X_t, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "34ff4e5f-8b9a-41d3-94b7-79d64c7bf241",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC: 0.9292302423963431\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "2ac968e0-4bfd-423d-944b-73c5710f7f69",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X_t_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "82ae9a2a-4a84-4ae3-8abd-5520a9b0f785",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC: 0.8894793272566167\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fb66c13-ae7e-4dbd-b1ab-6564363315e4",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "cc923abd-b206-440d-a64e-b1177cb8642a",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBClassifier(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "647e7142-4e25-47d3-a749-b8b17a11da96",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.2026\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X_t, y_train)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "bd191fae-0925-4ace-8690-42b41085b2f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_t)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "a75ecae2-9d64-47e0-9206-044b02010fc8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC: 0.9509339431655698\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "af3cddbb-aa89-4942-9fce-f791b0270580",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_t_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "49e89411-0ebb-4dce-b2c9-1e8d6b27dc2e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC: 0.8990613592545487\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f9efe23-76f4-49e5-9130-053c9680dd64",
   "metadata": {},
   "source": [
    "### ICFESL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "95eed9eb-f300-40a6-b9d4-dd749b7e3769",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-11-27 20:07:46.356\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.01 ------>\u001b[0m\n",
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n",
      "\u001b[32m2025-11-27 20:08:25.943\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:08:26.097\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:08:26.398\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.01 ------>\u001b[0m\n",
      "\u001b[32m2025-11-27 20:08:26.399\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.05 ------>\u001b[0m\n",
      "\u001b[32m2025-11-27 20:09:01.896\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:09:02.023\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:09:02.328\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.05 ------>\u001b[0m\n",
      "\u001b[32m2025-11-27 20:09:02.329\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-27 20:09:27.212\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:09:27.330\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:09:27.638\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-27 20:09:27.638\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.2 ------>\u001b[0m\n",
      "\u001b[32m2025-11-27 20:09:48.019\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:09:48.180\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:09:48.489\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.2 ------>\u001b[0m\n",
      "\u001b[32m2025-11-27 20:09:48.489\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-27 20:10:07.118\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:10:07.231\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:10:07.523\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-27 20:10:07.523\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-27 20:10:26.616\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:10:26.715\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:10:27.025\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-27 20:10:27.025\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-27 20:10:45.938\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:10:46.060\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-27 20:10:46.360\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-27 20:10:46.361\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m527\u001b[0m - \u001b[1msearch stopped: model fit scores are decreasing...\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "fit_info_panel, best_index, fit_figs, cluster_groups, criterions, inertias, gap_statss = icfesl.regularized_search_algorun(\n",
    "    X2, pd.Series(y_train), X2_test, pd.Series(y_test), cat_vars, 'classification', alphas = [0.01, 0.05, 0.1, 0.2, 0.5, 1, 5], cbine_column=False,\n",
    "    distance_threshold=0.002, figure=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "9139ee6e-1f9f-4834-961e-fa45bf6fe245",
   "metadata": {},
   "outputs": [],
   "source": [
    "# fit_figs is unpacked into exactly two figure handles; each one is saved separately below.\n",
    "(decision_plot, summary_plot) = fit_figs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "79c0959a-b17f-4c3e-a1aa-477505a69d25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the fit summary table returned by the regularized search.\n",
    "fit_info_panel.to_excel(\"simulation_classification_fit_info_case2.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "8bb5be76-c3f2-48d7-88fa-1c5f8e07566c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the decision plot to a PNG (path is relative to the working directory).\n",
    "decision_plot.savefig('decision_plot_simulation_classification_case2.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "ffdb9c09-7e59-49c6-9d4d-b8eda8992216",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the summary plot to a PNG (path is relative to the working directory).\n",
    "summary_plot.savefig('summary_plot_simulation_classification_case2.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "0c583112-9148-4c78-8d6c-2b581c772dd9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Experiment</th>\n",
       "      <th>dof</th>\n",
       "      <th>reg_fit_time</th>\n",
       "      <th>reg_training_auroc</th>\n",
       "      <th>reg_testing_auroc</th>\n",
       "      <th>xgb_fit_time</th>\n",
       "      <th>xgb_training_auroc</th>\n",
       "      <th>xgb_testing_auroc</th>\n",
       "      <th>var_inf</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.1354</td>\n",
       "      <td>0.933346</td>\n",
       "      <td>0.920606</td>\n",
       "      <td>0.2745</td>\n",
       "      <td>0.933898</td>\n",
       "      <td>0.921465</td>\n",
       "      <td>0.000028</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>14</td>\n",
       "      <td>0.1057</td>\n",
       "      <td>0.933452</td>\n",
       "      <td>0.920696</td>\n",
       "      <td>0.2784</td>\n",
       "      <td>0.934117</td>\n",
       "      <td>0.921751</td>\n",
       "      <td>0.000027</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>14</td>\n",
       "      <td>0.1013</td>\n",
       "      <td>0.933561</td>\n",
       "      <td>0.919919</td>\n",
       "      <td>0.2813</td>\n",
       "      <td>0.934198</td>\n",
       "      <td>0.920869</td>\n",
       "      <td>0.000026</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>14</td>\n",
       "      <td>0.1437</td>\n",
       "      <td>0.930470</td>\n",
       "      <td>0.916405</td>\n",
       "      <td>0.2757</td>\n",
       "      <td>0.930762</td>\n",
       "      <td>0.916990</td>\n",
       "      <td>0.000025</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>14</td>\n",
       "      <td>0.0895</td>\n",
       "      <td>0.930752</td>\n",
       "      <td>0.912114</td>\n",
       "      <td>0.2655</td>\n",
       "      <td>0.931283</td>\n",
       "      <td>0.913487</td>\n",
       "      <td>0.000023</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>14</td>\n",
       "      <td>0.0841</td>\n",
       "      <td>0.930574</td>\n",
       "      <td>0.912483</td>\n",
       "      <td>0.2846</td>\n",
       "      <td>0.930847</td>\n",
       "      <td>0.912458</td>\n",
       "      <td>0.000022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>6</td>\n",
       "      <td>14</td>\n",
       "      <td>0.1012</td>\n",
       "      <td>0.929647</td>\n",
       "      <td>0.903508</td>\n",
       "      <td>0.2736</td>\n",
       "      <td>0.929789</td>\n",
       "      <td>0.903925</td>\n",
       "      <td>0.000016</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Experiment  dof  reg_fit_time  reg_training_auroc  reg_testing_auroc  \\\n",
       "0           0   14        0.1354            0.933346           0.920606   \n",
       "1           1   14        0.1057            0.933452           0.920696   \n",
       "2           2   14        0.1013            0.933561           0.919919   \n",
       "3           3   14        0.1437            0.930470           0.916405   \n",
       "4           4   14        0.0895            0.930752           0.912114   \n",
       "5           5   14        0.0841            0.930574           0.912483   \n",
       "6           6   14        0.1012            0.929647           0.903508   \n",
       "\n",
       "   xgb_fit_time  xgb_training_auroc  xgb_testing_auroc   var_inf  \n",
       "0        0.2745            0.933898           0.921465  0.000028  \n",
       "1        0.2784            0.934117           0.921751  0.000027  \n",
       "2        0.2813            0.934198           0.920869  0.000026  \n",
       "3        0.2757            0.930762           0.916990  0.000025  \n",
       "4        0.2655            0.931283           0.913487  0.000023  \n",
       "5        0.2846            0.930847           0.912458  0.000022  \n",
       "6        0.2736            0.929789           0.903925  0.000016  "
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fit_info_panel"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ba26263-8b9d-4e35-880e-7b077e7e9159",
   "metadata": {},
   "source": [
    "## CBind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "ca2a9cd3-2da2-4fd4-b673-7d62c346e657",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compute the column grouping once from X2, then apply that same grouping to both\n",
    "# X2 (fitted against y_train below) and X2_test (scored against y_test below).\n",
    "cgrouping = icfesl.group_categorical_features(\n",
    "    X2,\n",
    "    X2.columns.tolist(),\n",
    "    distance_threshold=0.002,\n",
    ")\n",
    "X4 = icfesl.combine_features(X2, cgrouping)\n",
    "X4_test = icfesl.combine_features(X2_test, cgrouping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "b055c561-7dce-4048-8ad6-0ca5c2c4ca68",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15043, 869)"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Dimensions of the combined test matrix.\n",
    "X4_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7db5b54-7a14-401e-be2f-2670f66a5bf6",
   "metadata": {},
   "source": [
    "#### 1. GLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "11ace45f-93b7-4611-b2ba-4f1c186f2ecc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10.9574\n"
     ]
    }
   ],
   "source": [
    "# Time the GLM fit with perf_counter(): it is a monotonic, high-resolution clock, so the\n",
    "# reported duration cannot be distorted by wall-clock adjustments the way time.time() can.\n",
    "start = time.perf_counter()\n",
    "model = sm.GLM(\n",
    "    y_train,\n",
    "    sm.add_constant(X4, has_constant='skip'),\n",
    "    family=sm.families.Binomial(),\n",
    ").fit(disp=False)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "6cd36506-6cb2-4002-bde7-75be46c788df",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:869; R2: 0.19728866112778232\n"
     ]
    }
   ],
   "source": [
    "# The value labelled R2 is the fitted GLM's pseudo R-squared (model.pseudo_rsquared()).\n",
    "print(f'DF:{model.df_model}; R2: {model.pseudo_rsquared()}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "889e61bb-0c16-4dcf-ba9f-4117720d6bd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train and test predictions get separate names so the AUROC cells below can be re-run\n",
    "# in any order without scoring against a stale, overwritten prediction vector.\n",
    "y_pred_train = model.predict(sm.add_constant(X4, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "b0f172ff-c995-4354-a9bd-c8490faafcba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC: 0.7559357805226721\n"
     ]
    }
   ],
   "source": [
    "train_auroc = roc_auc_score(y_train, y_pred_train)\n",
    "print(f\"training AUROC: {train_auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "5a226b37-6efc-42f2-bdf5-473a1844b96e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Held-out predictions from the same fitted GLM, using the combined test matrix.\n",
    "y_pred_test = model.predict(sm.add_constant(X4_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "189d19cb-6c55-4e55-9cdf-7c3652999a78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC: 0.7004954478361417\n"
     ]
    }
   ],
   "source": [
    "test_auroc = roc_auc_score(y_test, y_pred_test)\n",
    "print(f\"testing AUROC: {test_auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f83b17f-caa2-48e7-a00c-fb108a9c74bb",
   "metadata": {},
   "source": [
    "#### 2. XGBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "d5752ddb-6128-4743-a7b9-f6617ac74036",
   "metadata": {},
   "outputs": [],
   "source": [
    "# random_state is pinned so repeated runs start from the same seed.\n",
    "model = XGBClassifier(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "d9f35ec7-a09b-497a-b2a5-a6ff2b8a9479",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7752\n"
     ]
    }
   ],
   "source": [
    "# Time only the fit call. perf_counter() is monotonic, so the elapsed value is immune\n",
    "# to system clock adjustments that can skew a time.time() difference.\n",
    "start = time.perf_counter()\n",
    "model.fit(X4, y_train)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "859f3057-694d-4d9b-b4c7-afd9900985fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column 1 of predict_proba is used as the score. Train and test scores get separate\n",
    "# names so the AUROC cells below never read a stale, overwritten prediction vector.\n",
    "y_pred_train = model.predict_proba(X4)[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "642b22c0-8f68-4ebf-8b8d-37e9096a591b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC: 0.7019735203234396\n"
     ]
    }
   ],
   "source": [
    "train_auroc = roc_auc_score(y_train, y_pred_train)\n",
    "print(f\"training AUROC: {train_auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "49110501-218a-44b5-901e-35765607adaf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Held-out scores from the same fitted classifier, using the combined test matrix.\n",
    "y_pred_test = model.predict_proba(X4_test)[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "d581ff33-8ae3-4b70-b709-6570723daa99",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC: 0.6473923373864576\n"
     ]
    }
   ],
   "source": [
    "test_auroc = roc_auc_score(y_test, y_pred_test)\n",
    "print(f\"testing AUROC: {test_auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c29881f4-a3d9-4ef4-84a7-ceb656bd7acf",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab84783b-284f-4d36-9806-2f619cb9412c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c7f16a8-e38e-4ee5-8daa-f660b9a76a8e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
