{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "82beccaa-e829-4712-99e5-014dfb189c59",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.spatial.distance import cdist\n",
    "import random\n",
    "from sklearn.cluster import KMeans\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "from icfesl import *\n",
    "from utility_functions import *\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from xgboost import XGBClassifier\n",
    "from pytorch_tabnet.tab_model import TabNetClassifier\n",
    "from sklearn.mixture import GaussianMixture\n",
    "import time\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "from catboost import CatBoostClassifier, Pool\n",
    "# Explicit imports for names that were previously only reachable through the star imports above\n",
    "# (roc_auc_score) or imported mid-notebook (TargetEncoder).\n",
    "# NOTE(review): `sm` is presumably statsmodels.api re-exported by the star imports - confirm and import it explicitly.\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from sklearn.preprocessing import LabelEncoder, TargetEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9706d06e-64c7-49e9-b15f-4f49265fcc85",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data = pd.read_csv(\"../../../writing/NIPS dataset/Base.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3353047e-c300-46e3-86e8-43acf66be2ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "gm = GaussianMixture(n_components=3, random_state=0, init_params='k-means++').fit(raw_data[['credit_risk_score','name_email_similarity']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f8eb94f6-bddd-40ce-8670-50c4eb81954b",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data['segment'] = gm.predict(raw_data[['credit_risk_score','name_email_similarity']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d96cfe98-5033-4b5d-9888-d85ca13c78ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_vars = ['prev_address_months_count','current_address_months_count','customer_age', 'payment_type', 'date_of_birth_distinct_emails_4w', 'employment_status', 'email_is_free', 'housing_status',\n",
    "           'phone_home_valid', 'phone_mobile_valid', 'bank_months_count','has_other_cards', 'foreign_request', 'source', \n",
    "           'device_os', 'keep_alive_session', 'device_distinct_emails_8w']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "07fac057-9711-4f20-992d-61551e3d6f94",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = raw_data.loc[raw_data['segment']==0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "81597536-6ba2-4ce2-9ba4-60e8ee82010e",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_1 = data.loc[data['fraud_bool']==1]\n",
    "data_0 = data.loc[data['fraud_bool']==0].sample(frac=1/20,random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "393af376-eb00-4ef2-9ef6-3956da37a6cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_ = pd.concat([data_1,data_0], axis=0).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6064656e-11c4-4380-a6d6-1618f4b3ec9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = data_[cat_vars].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "328f5f49-38cf-4c86-949e-06494e256b0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "y = data_['fraud_bool'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8458e42a-0c5d-44a6-9d89-cdd48c28a926",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "prev_address_months_count\n",
       "-1                           16864\n",
       " 11                            227\n",
       " 29                            225\n",
       " 27                            223\n",
       " 10                            215\n",
       "                             ...  \n",
       " 281                             1\n",
       " 287                             1\n",
       " 294                             1\n",
       " 300                             1\n",
       " 364                             1\n",
       "Name: count, Length: 318, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_[['prev_address_months_count']].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7058e78c-9b7a-4368-b4d2-1e3d68d58c6e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "np.float64(0.11598991392052865)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "6b5e6aa7-4a60-4387-9f37-a422218857df",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "0de93675-5e5b-4035-b355-e6d1e55e9630",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the shuffled row labels left by train_test_split with a fresh 0..n-1 index on all four objects.\n",
    "X_train, X_test, y_train, y_test = (\n",
    "    part.reset_index(drop=True) for part in (X_train, X_test, y_train, y_test)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "97f8335a-581d-4444-be09-9c4a384e8fef",
   "metadata": {},
   "outputs": [],
   "source": [
    "X2, encoder = icfesl.f_get_dummies(X_train, cat_vars)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "8db04298-4609-4666-bf42-70effe7dde00",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [0, 1, 4, 10] during transform. These unknown categories will be encoded as all zeros\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "X2_test = icfesl.f_get_dummies(X_test, cat_vars, encoder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "f14516c9-cb0a-4a1a-aeef-da52fb19ccf3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast the dummy columns with one vectorized call per frame instead of two\n",
    "# single-column assignments per dummy column.\n",
    "# X2_test is cast only on the columns X2 defines, mirroring the original loop.\n",
    "X2 = X2.astype('int')\n",
    "X2_test = X2_test.astype({c: 'int' for c in X2.columns})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d0b79a35-ac5a-4cdd-8511-6a356542f179",
   "metadata": {},
   "source": [
    "### CatBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "83aed7dd-eb00-4fe7-bf62-90936b7db059",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostClassifier(\n",
    "    iterations=100,  \n",
    "    loss_function='Logloss', custom_metric=['AUC'],\n",
    "    random_seed=42,  \n",
    "    verbose=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "73684525-55f4-497e-8674-c040d035b153",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.3302\n"
     ]
    }
   ],
   "source": [
    "# Time the CatBoost fit with perf_counter(): a monotonic, high-resolution clock that,\n",
    "# unlike time.time(), cannot be skewed by wall-clock adjustments during the run.\n",
    "start = time.perf_counter()\n",
    "model.fit(X_train, y_train, cat_features=cat_vars)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "298f3db0-5b94-4f9f-a766-bc415fcd8d47",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_train)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "2c6a07d5-4164-4b9c-900e-e2860410f2f2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC: 0.8686965226548797\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "949dcd1c-8cc3-4bd2-81b5-a5e88c3e1a0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "9e7fbe5c-53af-4d1c-9984-79d28546adb4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC: 0.8511391561630264\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0cb3b8f9-1e8a-4e82-aee2-4f113df397cc",
   "metadata": {},
   "source": [
    "### TabNet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "d22141be-b31a-46c2-b46b-709b5ab46220",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(23002, 33)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "6132737c-bcd6-4e32-bff9-4ceb84087050",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop every row whose level of a categorical variable occurs fewer than 10 times.\n",
    "# Counts are recomputed per variable because each filter changes the frequencies seen by the next one.\n",
    "# Filtering on the value_counts() Series itself avoids relying on the 'count' column name,\n",
    "# which reset_index() only produces on pandas >= 2.0.\n",
    "for v in cat_vars:\n",
    "    level_counts = data_[v].value_counts()\n",
    "    rare_levels = level_counts.index[level_counts < 10]\n",
    "    data_ = data_[~data_[v].isin(rare_levels)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "47300ac6-a958-41cf-aa9a-500ed09765df",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(21673, 33)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "364c7272-3db3-418f-9443-0983dbff2ce9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace each categorical column with integer codes; a fresh encoder is fitted per column.\n",
    "for var in cat_vars:\n",
    "    le = LabelEncoder().fit(data_[var])\n",
    "    data_[var] = le.transform(data_[var])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "0073fb67-2a37-4c5c-9d23-30016b5e0d83",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_, X_test_, y_train_, y_test_ = train_test_split(data_[cat_vars], data_['fraud_bool'], test_size=0.3, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "14ee0e8c-3c5d-453b-8bda-5c5986297668",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Give the TabNet train/test features and targets a fresh 0..n-1 index after the shuffle split.\n",
    "X_train_, X_test_, y_train_, y_test_ = (\n",
    "    part.reset_index(drop=True) for part in (X_train_, X_test_, y_train_, y_test_)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "4aebb647-62f8-4207-bfdf-245820f2455f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positions of the categorical columns in the feature matrix passed to TabNet.\n",
    "cat_idxs = [X_train_.columns.get_loc(col) for col in cat_vars]\n",
    "# Each embedding table must cover the largest integer code present in either split.\n",
    "# Sizing from X_train_[col].nunique() under-allocates whenever a level is absent from the\n",
    "# training split, which makes the embedding lookup fail with an out-of-range index.\n",
    "cat_dims = [int(max(X_train_[col].max(), X_test_[col].max())) + 1 for col in cat_vars]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "285a4cd1-7be2-48be-a9a9-37fea808d750",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = TabNetClassifier(verbose=0, seed=200, cat_idxs=cat_idxs, cat_dims=cat_dims)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "61167c66-1b58-4518-a4aa-e3ffffc9b975",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stop training because you reached max_epochs = 50 with best_epoch = 46 and best_train_auc = 0.8428\n",
      "14.7903\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.13/site-packages/pytorch_tabnet/callbacks.py:172: UserWarning: Best weights from best epoch are automatically used!\n",
      "  warnings.warn(wrn_msg)\n"
     ]
    }
   ],
   "source": [
    "# Time the TabNet fit with perf_counter(): a monotonic, high-resolution clock that,\n",
    "# unlike time.time(), cannot be skewed by wall-clock adjustments during the run.\n",
    "start = time.perf_counter()\n",
    "model.fit(X_train=X_train_.values, y_train=np.ravel(y_train_), eval_set=[(X_train_.values, np.ravel(y_train_))], eval_name=['train'], max_epochs=50)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "fa2f0de0-489b-41b5-97ad-dde3d0f5b520",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_train_.values)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "00edccd0-61cc-4261-b937-507d762e1c98",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC:0.8428013046960705\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_train_, y_pred)\n",
    "print(f\"training AUROC:{auc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "672e4677-f3c3-4a75-91a8-687f36b08800",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_test_.values)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "343923c1-1354-4b70-a387-3971057c81ce",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC:0.8219897254817125\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_test_, y_pred)\n",
    "print(f\"testing AUROC:{auc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "522b86ca-642f-465b-a431-a8b7860fb5ed",
   "metadata": {},
   "source": [
    "### One-hot encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9407c138-1151-49a5-b15f-067c12f1fee9",
   "metadata": {},
   "source": [
    "#### 1. logit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "5876a562-9ad5-4796-a0b3-e6a250d0ad2d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "21.2087\n"
     ]
    }
   ],
   "source": [
    "# Time the one-hot logit fit with perf_counter(): a monotonic, high-resolution clock that,\n",
    "# unlike time.time(), cannot be skewed by wall-clock adjustments during the run.\n",
    "start = time.perf_counter()\n",
    "model = sm.GLM(y_train, sm.add_constant(X2, has_constant='skip'), family=sm.families.Binomial()).fit(disp=False)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "4cad784d-cd85-42a1-b674-2281c411ccd1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:797; R2: 0.2181681791365202\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.pseudo_rsquared()}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "9d98efc3-7dd7-4684-8bb1-e6b18d630451",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X2, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "d522aafa-4a4f-49af-b35c-252568e8752a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 0.8812512518725714\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "4de1fda6-64ec-4439-a59e-270b408b4bf7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(16101, 798)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sm.add_constant(X2, has_constant='skip').shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "dd3b6924-3608-4e29-9017-d332b8acc6dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X2_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "77159d20-1353-4815-9365-506dd9985242",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 0.7899735298932797\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c888aa95-d9f6-4694-80ac-05b2bbe8b61c",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "ed98a66f-01db-4ea3-a8b3-a78e955be1f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBClassifier(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "df42399f-ca2b-4525-8198-fe1fa9e3c0d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5295\n"
     ]
    }
   ],
   "source": [
    "# Time the one-hot XGBoost fit with perf_counter(): a monotonic, high-resolution clock that,\n",
    "# unlike time.time(), cannot be skewed by wall-clock adjustments during the run.\n",
    "start = time.perf_counter()\n",
    "model.fit(X2, y_train)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "ae1430b7-aceb-4742-b934-a1eed4e907a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X2)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "40224e28-b679-452a-9534-678ae93c1867",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 0.9184164332000796\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "16ffb892-5e5e-4601-a5a5-653e1b86997a",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X2_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "040f7a6c-00dd-45e3-9c25-60e92992708a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 0.8343924981147312\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27a2a623-5281-47c0-acbe-0247e59a4334",
   "metadata": {},
   "source": [
    "### Target Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "586c16d4-4a09-4b8d-8dbc-76a35d822258",
   "metadata": {},
   "outputs": [],
   "source": [
    "# TargetEncoder.fit_transform cross-fits on shuffled folds; pin random_state so the encoded\n",
    "# training features (and every metric computed from them) are reproducible across runs.\n",
    "# TargetEncoder is imported in the notebook's top import cell.\n",
    "enc_auto = TargetEncoder(smooth=\"auto\", random_state=42)\n",
    "X_t = enc_auto.fit_transform(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "d25f81ac-ab4a-4de5-8a2a-ac245b2cab66",
   "metadata": {},
   "outputs": [],
   "source": [
    "output_feature_names = enc_auto.get_feature_names_out()\n",
    "X_tdf = pd.DataFrame(X_t, columns=list(output_feature_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "9e5720eb-6a72-4307-a7bc-0e4842b703e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_t_test = enc_auto.transform(X_test)\n",
    "X_tdf_test = pd.DataFrame(X_t_test, columns=list(output_feature_names))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e0a8ae9-bd27-4cfa-9c5d-dfd479d121e3",
   "metadata": {},
   "source": [
    "#### 1. logit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "d2131938-e931-4f91-b99b-82c8f3d74226",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.058\n"
     ]
    }
   ],
   "source": [
    "# Time the target-encoded logit fit with perf_counter(): a monotonic, high-resolution clock that,\n",
    "# unlike time.time(), cannot be skewed by wall-clock adjustments during the run.\n",
    "start = time.perf_counter()\n",
    "model = sm.GLM(y_train, sm.add_constant(X_tdf, has_constant='skip'), family=sm.families.Binomial()).fit(disp=False)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "a3053246-108a-46e7-98ae-89ee1ac35bbc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:17; R2: 0.16813644471273181\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.pseudo_rsquared()}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "5ccab110-ab74-4e63-9d09-0174472a644b",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X_t, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "34ff4e5f-8b9a-41d3-94b7-79d64c7bf241",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 0.8385766447093501\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "2ac968e0-4bfd-423d-944b-73c5710f7f69",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X_t_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "82ae9a2a-4a84-4ae3-8abd-5520a9b0f785",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 0.8515244805013584\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fb66c13-ae7e-4dbd-b1ab-6564363315e4",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "cc923abd-b206-440d-a64e-b1177cb8642a",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBClassifier(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "647e7142-4e25-47d3-a749-b8b17a11da96",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.226\n"
     ]
    }
   ],
   "source": [
    "# Time the target-encoded XGBoost fit with perf_counter(): a monotonic, high-resolution clock that,\n",
    "# unlike time.time(), cannot be skewed by wall-clock adjustments during the run.\n",
    "start = time.perf_counter()\n",
    "model.fit(X_t, y_train)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "bd191fae-0925-4ace-8690-42b41085b2f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_t)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "a75ecae2-9d64-47e0-9206-044b02010fc8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 0.9804092927154495\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "af3cddbb-aa89-4942-9fce-f791b0270580",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_t_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "49e89411-0ebb-4dce-b2c9-1e8d6b27dc2e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 0.827755673407828\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f9efe23-76f4-49e5-9130-053c9680dd64",
   "metadata": {},
   "source": [
    "### ICFESL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "95eed9eb-f300-40a6-b9d4-dd749b7e3769",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-11-28 07:55:16.913\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.01 ------>\u001b[0m\n",
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n",
      "\u001b[32m2025-11-28 07:55:29.035\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:55:43.904\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:55:44.413\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.01 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 07:55:44.413\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.05 ------>\u001b[0m\n",
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n",
      "\u001b[32m2025-11-28 07:55:54.887\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:56:06.122\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:56:06.625\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.05 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 07:56:06.625\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.1 ------>\u001b[0m\n",
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n",
      "\u001b[32m2025-11-28 07:56:17.002\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:56:27.772\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:56:28.300\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 07:56:28.300\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.2 ------>\u001b[0m\n",
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n",
      "\u001b[32m2025-11-28 07:56:38.380\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:56:47.783\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:56:48.288\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.2 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 07:56:48.288\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.5 ------>\u001b[0m\n",
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n",
      "\u001b[32m2025-11-28 07:56:58.604\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:57:08.242\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:57:08.762\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 07:57:08.762\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 07:57:17.146\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:57:25.586\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:57:26.112\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 07:57:26.113\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 07:57:32.686\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:57:41.815\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 07:57:42.538\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 5 ------>\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# Run the ICFESL regularized search: one pass per L2 regularization factor in `alphas`,\n",
    "# each pass fitting a logit and an XGBClassifier on the ICFESL encoding (see the log above).\n",
    "(fit_info_panel, best_index, fit_figs, cluster_groups,\n",
    " criterions, inertias, gap_statss) = icfesl.regularized_search_algorun(\n",
    "    X2, pd.Series(y_train),\n",
    "    X2_test, pd.Series(y_test),\n",
    "    cat_vars, 'classification',\n",
    "    alphas=[0.01, 0.05, 0.1, 0.2, 0.5, 1, 5],\n",
    "    cbine_column=False,\n",
    "    distance_threshold=0.002,\n",
    "    figure=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "9139ee6e-1f9f-4834-961e-fa45bf6fe245",
   "metadata": {},
   "outputs": [],
   "source": [
    "decision_plot, summary_plot = fit_figs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "79c0959a-b17f-4c3e-a1aa-477505a69d25",
   "metadata": {},
   "outputs": [],
   "source": [
    "fit_info_panel.to_excel(\"fraud_uw_fit_info.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "8bb5be76-c3f2-48d7-88fa-1c5f8e07566c",
   "metadata": {},
   "outputs": [],
   "source": [
    "decision_plot.savefig('decision_plot_fraud_uw.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "ffdb9c09-7e59-49c6-9d4d-b8eda8992216",
   "metadata": {},
   "outputs": [],
   "source": [
    "summary_plot.savefig('summary_plot_fraud_uw.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "5887bff2-f6b0-4ebe-befd-34351a02548b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Experiment</th>\n",
       "      <th>dof</th>\n",
       "      <th>reg_fit_time</th>\n",
       "      <th>reg_training_auroc</th>\n",
       "      <th>reg_testing_auroc</th>\n",
       "      <th>xgb_fit_time</th>\n",
       "      <th>xgb_training_auroc</th>\n",
       "      <th>xgb_testing_auroc</th>\n",
       "      <th>var_inf</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>514</td>\n",
       "      <td>14.7558</td>\n",
       "      <td>0.870963</td>\n",
       "      <td>0.783563</td>\n",
       "      <td>0.4571</td>\n",
       "      <td>0.895310</td>\n",
       "      <td>0.815548</td>\n",
       "      <td>0.001005</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>514</td>\n",
       "      <td>11.1395</td>\n",
       "      <td>0.870892</td>\n",
       "      <td>0.783512</td>\n",
       "      <td>0.4503</td>\n",
       "      <td>0.895558</td>\n",
       "      <td>0.815295</td>\n",
       "      <td>0.001016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>514</td>\n",
       "      <td>10.6611</td>\n",
       "      <td>0.871363</td>\n",
       "      <td>0.784693</td>\n",
       "      <td>0.4758</td>\n",
       "      <td>0.901076</td>\n",
       "      <td>0.815964</td>\n",
       "      <td>0.000997</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>514</td>\n",
       "      <td>9.2986</td>\n",
       "      <td>0.871173</td>\n",
       "      <td>0.786672</td>\n",
       "      <td>0.4541</td>\n",
       "      <td>0.897163</td>\n",
       "      <td>0.819813</td>\n",
       "      <td>0.000992</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>515</td>\n",
       "      <td>9.5440</td>\n",
       "      <td>0.872079</td>\n",
       "      <td>0.787844</td>\n",
       "      <td>0.4659</td>\n",
       "      <td>0.903094</td>\n",
       "      <td>0.817200</td>\n",
       "      <td>0.000980</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>516</td>\n",
       "      <td>8.3395</td>\n",
       "      <td>0.871378</td>\n",
       "      <td>0.789450</td>\n",
       "      <td>0.4752</td>\n",
       "      <td>0.905282</td>\n",
       "      <td>0.818211</td>\n",
       "      <td>0.000898</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>6</td>\n",
       "      <td>516</td>\n",
       "      <td>9.0168</td>\n",
       "      <td>0.870738</td>\n",
       "      <td>0.795141</td>\n",
       "      <td>0.5715</td>\n",
       "      <td>0.906851</td>\n",
       "      <td>0.826991</td>\n",
       "      <td>0.000732</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Experiment  dof  reg_fit_time  reg_training_auroc  reg_testing_auroc  \\\n",
       "0           0  514       14.7558            0.870963           0.783563   \n",
       "1           1  514       11.1395            0.870892           0.783512   \n",
       "2           2  514       10.6611            0.871363           0.784693   \n",
       "3           3  514        9.2986            0.871173           0.786672   \n",
       "4           4  515        9.5440            0.872079           0.787844   \n",
       "5           5  516        8.3395            0.871378           0.789450   \n",
       "6           6  516        9.0168            0.870738           0.795141   \n",
       "\n",
       "   xgb_fit_time  xgb_training_auroc  xgb_testing_auroc   var_inf  \n",
       "0        0.4571            0.895310           0.815548  0.001005  \n",
       "1        0.4503            0.895558           0.815295  0.001016  \n",
       "2        0.4758            0.901076           0.815964  0.000997  \n",
       "3        0.4541            0.897163           0.819813  0.000992  \n",
       "4        0.4659            0.903094           0.817200  0.000980  \n",
       "5        0.4752            0.905282           0.818211  0.000898  \n",
       "6        0.5715            0.906851           0.826991  0.000732  "
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fit_info_panel"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ba26263-8b9d-4e35-880e-7b077e7e9159",
   "metadata": {},
   "source": [
    "## CBind: grouped and combined categorical features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "ca2a9cd3-2da2-4fd4-b673-7d62c346e657",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derive the feature grouping from the training frame only, then apply that same\n",
    "# grouping to both the training and the test frame.\n",
    "cgrouping = icfesl.group_categorical_features(\n",
    "    X2,\n",
    "    X2.columns.tolist(),\n",
    "    distance_threshold=0.002,\n",
    ")\n",
    "X4 = icfesl.combine_features(X2, cgrouping)\n",
    "X4_test = icfesl.combine_features(X2_test, cgrouping)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7db5b54-7a14-401e-be2f-2670f66a5bf6",
   "metadata": {},
   "source": [
    "#### 1. logit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "11ace45f-93b7-4611-b2ba-4f1c186f2ecc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5.4875\n"
     ]
    }
   ],
   "source": [
    "# Time the design-matrix construction plus the binomial GLM fit.\n",
    "# perf_counter() is the monotonic, high-resolution clock meant for measuring elapsed\n",
    "# durations; time.time() follows the wall clock and can jump if the system clock is adjusted.\n",
    "start = time.perf_counter()\n",
    "model = sm.GLM(\n",
    "    y_train,\n",
    "    sm.add_constant(X4, has_constant='skip'),\n",
    "    family=sm.families.Binomial(),\n",
    ").fit(disp=False)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "6cd36506-6cb2-4002-bde7-75be46c788df",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:345; R2: 0.19967250826431926\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): 'R2' is the pseudo R-squared of the GLM fitted in the previous cell\n",
    "# (statsmodels documents Cox-Snell as the default kind; confirm before quoting it).\n",
    "print(f'DF:{model.df_model}; R2: {model.pseudo_rsquared()}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "889e61bb-0c16-4dcf-ba9f-4117720d6bd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X4, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "b0f172ff-c995-4354-a9bd-c8490faafcba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 0.8661564588393198\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "5a226b37-6efc-42f2-bdf5-473a1844b96e",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X4_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "189d19cb-6c55-4e55-9cdf-7c3652999a78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 0.8331103456934014\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f83b17f-caa2-48e7-a00c-fb108a9c74bb",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "d5752ddb-6128-4743-a7b9-f6617ac74036",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBClassifier(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "d9f35ec7-a09b-497a-b2a5-a6ff2b8a9479",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.3377\n"
     ]
    }
   ],
   "source": [
    "# Time the XGBoost fit with perf_counter(), the monotonic high-resolution clock meant\n",
    "# for measuring elapsed durations (time.time() follows the adjustable wall clock).\n",
    "start = time.perf_counter()\n",
    "model.fit(X4, y_train)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "859f3057-694d-4d9b-b4c7-afd9900985fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X4)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "642b22c0-8f68-4ebf-8b8d-37e9096a591b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 0.9224297114106038\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "49110501-218a-44b5-901e-35765607adaf",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X4_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "d581ff33-8ae3-4b70-b709-6570723daa99",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 0.8375129466432886\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c29881f4-a3d9-4ef4-84a7-ceb656bd7acf",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
