{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "82beccaa-e829-4712-99e5-014dfb189c59",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.spatial.distance import cdist\n",
    "import random\n",
    "from sklearn.cluster import KMeans\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "from icfesl import *\n",
    "from utility_functions import *\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from sklearn.metrics import roc_auc_score, accuracy_score\n",
    "from xgboost import XGBClassifier\n",
    "from pytorch_tabnet.tab_model import TabNetClassifier\n",
    "from sklearn.mixture import GaussianMixture\n",
    "import time\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "from feature_engine.datetime import DatetimeFeatures\n",
    "from catboost import CatBoostClassifier, Pool\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c62136aa-d4c2-40c2-89ac-fcedb548d32f",
   "metadata": {},
   "source": [
    "## Data Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "89139b9a-4912-4770-a4cb-7040c3db48dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"../../../writing/kaggle dataset/financial_transactions\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9706d06e-64c7-49e9-b15f-4f49265fcc85",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_user_data = pd.read_csv(f\"{path}/users_data.csv\").sort_values('id').reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ed49e5e4-4a38-4978-8be1-434e167acbe0",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_card_data = pd.read_csv(f\"{path}/cards_data.csv\").sort_values(['id','client_id']).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d8a65eee-3b4c-4b4e-93d8-9acc0c08f684",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_user_data = raw_user_data.rename(columns={'id':'user_id'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1de07502-4f98-4370-9ff3-6fecf8ba6f19",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_transaction_data = pd.read_csv(f\"{path}/transactions_data.csv\").sort_values(['id','client_id']).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "70d1511f-12c5-4916-82cf-1a8100f28043",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_transaction_data = raw_transaction_data.rename(columns={'id':'transaction_id','client_id':'user_id'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d1f77573-910b-497b-bade-f705255f1a02",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_card_data = raw_card_data.rename(columns={'id':'card_id','client_id':'user_id'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "12d4a559-15f0-451d-ae69-1e35b9e3a96c",
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = pd.read_json(f\"{path}/train_fraud_labels.json\").reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "d93af05a-842c-42dd-8894-38d5ba39e3d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = labels.rename(columns={'index':'transaction_id'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d742813b-d5cb-479e-8c22-cda9701a11d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "mcc = pd.read_json(f\"{path}/mcc_codes.json\",typ='series').reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "6798c9a0-62b4-42dc-9867-e14f1006e46c",
   "metadata": {},
   "outputs": [],
   "source": [
    "mcc = mcc.rename(columns={'index':'mcc', 0:'merchant_type'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "3a812238-d494-4b9b-95d2-d9f320b5504f",
   "metadata": {},
   "outputs": [],
   "source": [
    "transactions = raw_transaction_data.merge(mcc,on='mcc', how='left').merge(labels,on='transaction_id',how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "84b32741-1cd4-4d1f-88ae-30f61a46728b",
   "metadata": {},
   "outputs": [],
   "source": [
    "transactions = transactions.sort_values(by=['mcc','user_id','transaction_id']).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "2ac9becb-fadd-4ca9-ac16-ccf6720b0644",
   "metadata": {},
   "outputs": [],
   "source": [
    "cards = raw_card_data.merge(raw_user_data, on=['user_id'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "3faebd72-ce05-4e79-9aca-81a4da68bc7a",
   "metadata": {},
   "outputs": [],
   "source": [
    "cards = cards.sort_values(by=['user_id','card_id']).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "4495b7f4-c502-4c6b-9101-5aa3ed5c678c",
   "metadata": {},
   "outputs": [],
   "source": [
    "transactions.target = transactions['target'].apply(lambda x: 1 if x == 'Yes' else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "480d4a50-78f5-43b5-b1a6-5819dbf255e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "combine_data = transactions.merge(cards, on=['user_id', 'card_id'], how='left').sort_values(by=['user_id', 'card_id']).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "fa4dc3f4-8b18-4208-b651-999ffc562161",
   "metadata": {},
   "outputs": [],
   "source": [
    "dtf = DatetimeFeatures(features_to_extract=[\"month\", 'hour'])\n",
    "\n",
    "date_features = dtf.fit_transform(combine_data[['date']])\n",
    "\n",
    "combine_data = pd.concat([combine_data, date_features], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "52daba1f-0b62-423a-b6f9-6126f3277442",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_vars = ['zip','errors', 'merchant_type', 'card_brand', 'card_type', 'merchant_city', 'merchant_state', 'date_month', 'date_hour', 'num_credit_cards']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "d197fc7c-de07-4639-b90e-8d01a4cbd1f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "combine_data = combine_data[['target']+cat_vars]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "5717c61e-d212-4f67-888b-384d7a61312c",
   "metadata": {},
   "outputs": [],
   "source": [
    "combine_data_1 = combine_data.loc[combine_data['target']==1]\n",
    "combine_data_0 = combine_data.loc[combine_data['target']==0].sample(frac = 1/200, random_state = 200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "eba34700-9ece-45b4-876b-2d9481435b8d",
   "metadata": {},
   "outputs": [],
   "source": [
    "combine_data_ = pd.concat([combine_data_1,combine_data_0],axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "82f07d03-21a6-452f-9e38-9d44afa7eecd",
   "metadata": {},
   "outputs": [],
   "source": [
    "combine_data_['zip'] = combine_data_['zip'].fillna(0)\n",
    "combine_data_['errors'] = combine_data_['errors'].fillna('missing')\n",
    "combine_data_['merchant_state'] =  combine_data_['merchant_state'].fillna('missing')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "c4b52c52-2289-4513-b8d7-c27c2b48463c",
   "metadata": {},
   "outputs": [],
   "source": [
    "combine_data_['zip'] = combine_data_['zip'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "f9a57bf4-5efe-450b-99a1-3ac6e75e4f9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = combine_data_[cat_vars]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "8663725b-05dc-42ab-b059-b12bf29ad3f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "y = combine_data_['target']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "b1ad824f-1373-47e5-9bb5-13eb310507f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "ee6057b8-0bd9-4182-8cd5-81c4ab92361c",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = X_train.reset_index(drop=True)\n",
    "X_test = X_test.reset_index(drop=True)\n",
    "y_train = y_train.reset_index(drop=True)\n",
    "y_test = y_test.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "ab6a93a6-b743-4a6e-b84c-ac398b3c4198",
   "metadata": {},
   "outputs": [],
   "source": [
    "X2, encoder = icfesl.f_get_dummies(X_train, cat_vars)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "0e2c494e-30cc-4358-a8c7-ea54556b84f6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [0, 1, 5, 6] during transform. These unknown categories will be encoded as all zeros\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "X2_test = icfesl.f_get_dummies(X_test, cat_vars, encoder=encoder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "b6272d3d-a61f-45d0-92fe-501f7aedd509",
   "metadata": {},
   "outputs": [],
   "source": [
    "selector = VarianceThreshold(threshold=0.0005)\n",
    "\n",
    "selector.fit(X2)\n",
    "\n",
    "selected_features_mask = selector.get_support()\n",
    "\n",
    "selected_column_names = X2.columns[selected_features_mask]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "9f7f4304-78de-4b84-9ece-b8dd793c2176",
   "metadata": {},
   "outputs": [],
   "source": [
    "selector = VarianceThreshold(threshold=0.001)\n",
    "\n",
    "selector.fit(X2_test)\n",
    "\n",
    "selected_features_mask_test = selector.get_support()\n",
    "\n",
    "selected_column_names_test = X2_test.columns[selected_features_mask_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "e64fa6dc-0cda-432d-8a07-a824ad208864",
   "metadata": {},
   "outputs": [],
   "source": [
    "X2 = X2[list(set(selected_column_names.tolist() + selected_column_names_test.tolist()))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "764437a1-507d-4c15-abe6-0dc7765ebbbc",
   "metadata": {},
   "outputs": [],
   "source": [
    "X2_test = X2_test[list(set(selected_column_names.tolist() + selected_column_names_test.tolist()))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "52c2fcb3-9065-424b-91a4-e0d7a3a53197",
   "metadata": {},
   "outputs": [],
   "source": [
    "for c in X2.columns.tolist():\n",
    "    X2[c] = X2[c].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "aae54736-0e38-4ff3-b353-ea7fc46f9ec7",
   "metadata": {},
   "outputs": [],
   "source": [
    "for c in X2_test.columns.tolist():\n",
    "    X2_test[c] = X2_test[c].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "75a2a7cc-848a-4659-b9d6-0e2c4d91897e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(55856, 935)"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "666b06a7-b7db-4bfa-8074-84afbcfc18c5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(23939, 935)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X2_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "730d9fbd-e09a-4080-bfc6-faa5af443021",
   "metadata": {},
   "source": [
    "### CatBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "e92a6e97-1d87-48dd-9764-809023ef3029",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostClassifier(\n",
    "    iterations=100,  \n",
    "    loss_function='Logloss', custom_metric=['AUC'],\n",
    "    random_seed=42,  \n",
    "    verbose=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "ef3286d3-6a0f-4b8e-af35-0e185e738a35",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['zip',\n",
       " 'errors',\n",
       " 'merchant_type',\n",
       " 'card_brand',\n",
       " 'card_type',\n",
       " 'merchant_city',\n",
       " 'merchant_state',\n",
       " 'date_month',\n",
       " 'date_hour',\n",
       " 'num_credit_cards']"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_vars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "472298e3-0788-4395-b2a5-512928759108",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7935\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X_train, y_train, cat_features=cat_vars)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "eaf437da-711d-448f-b9e7-ebc0bc4f38f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_train)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "a2a8a854-9749-4aea-b717-52da730a7584",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC: 0.9928978157807359\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "91c922d6-2b20-4524-8e26-5614db226412",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "bc1c2efc-5fa8-4d51-a7e5-a2725d73e287",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC: 0.9891771649861647\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing AUROC: {auc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "acfeacd1-d307-44e8-abf5-692e0de18666",
   "metadata": {},
   "source": [
    "### TabNet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "c91a0472-f6b3-4bc3-b8c5-06c0d3dd1250",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(79795, 11)"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "combine_data_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "6c612bda-4f51-4269-b87b-99921ad5a0ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "for v in cat_vars:\n",
    "    tmp = combine_data_[v].value_counts().reset_index()\n",
    "    combine_data_ = combine_data_[~combine_data_[v].isin(tmp.loc[tmp['count']<10,v].tolist())]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "afa2c821-c08d-4156-9261-c1a1ecca7a84",
   "metadata": {},
   "outputs": [],
   "source": [
    "for var in cat_vars:\n",
    "    le = LabelEncoder()\n",
    "    combine_data_[var] = le.fit_transform(combine_data_[var])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "353b46e8-9685-4ecc-8a5a-37c5b3f72b41",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(64279, 11)"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "combine_data_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "eb0dce8b-c30c-41d8-89d9-3d39c5bbb40d",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_, X_test_, y_train_, y_test_ = train_test_split(combine_data_[cat_vars], combine_data_['target'], test_size=0.3, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "f18473f8-39a2-414f-a27f-935d52263e7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_ = X_train_.reset_index(drop=True)\n",
    "X_test_ = X_test_.reset_index(drop=True)\n",
    "y_train_ = y_train_.reset_index(drop=True)\n",
    "y_test_ = y_test_.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "e5d60fce-195e-41ba-be10-0ce58864d1a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_idxs = [X_train_.columns.get_loc(col) for col in cat_vars]\n",
    "cat_dims = [X_train_[col].nunique() for col in cat_vars]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "5bdfc454-48b8-49f5-af43-4401ed532b63",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = TabNetClassifier(verbose=0, seed=200, cat_idxs=cat_idxs, cat_dims=cat_dims)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "5705177e-3216-4b29-b83c-dbcda032dd7e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stop training because you reached max_epochs = 50 with best_epoch = 47 and best_train_auc = 0.99593\n",
      "29.8033\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.13/site-packages/pytorch_tabnet/callbacks.py:172: UserWarning: Best weights from best epoch are automatically used!\n",
      "  warnings.warn(wrn_msg)\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X_train=X_train_.values, y_train=np.ravel(y_train_), eval_set=[(X_train_.values, np.ravel(y_train_))], eval_name=['train'], max_epochs=50)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "4d1f0dfc-0664-4273-9c22-471c23cd682e",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_train_.values)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "bd21fa4b-7bd3-4bcc-bdaf-c6f8041d9218",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training AUROC:0.9959325355148703\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_train_, y_pred)\n",
    "print(f\"training AUROC:{auc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "a356acff-5504-4d5a-8d47-681abbcc7939",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_test_.values)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "a9692bd9-1996-44e3-8861-01e597040b68",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing AUROC:0.9907212415024914\n"
     ]
    }
   ],
   "source": [
    "auc = roc_auc_score(y_test_, y_pred)\n",
    "print(f\"testing AUROC:{auc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "522b86ca-642f-465b-a431-a8b7860fb5ed",
   "metadata": {},
   "source": [
    "### One hot encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9407c138-1151-49a5-b15f-067c12f1fee9",
   "metadata": {},
   "source": [
    "#### 1. logit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "5876a562-9ad5-4796-a0b3-e6a250d0ad2d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20.3548\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model = LogisticRegression(penalty='l2', C=1/0.02, solver='saga')\n",
    "model.fit(X2, y_train)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "4cad784d-cd85-42a1-b674-2281c411ccd1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:935\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{X2.shape[1]}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "9d98efc3-7dd7-4684-8bb1-e6b18d630451",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "d522aafa-4a4f-49af-b35c-252568e8752a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 0.9376875779489494\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "76603f6f-5dac-4e71-bdb7-43a55c592c99",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X2_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "4785b5ce-2a1b-4cde-8bc1-9c6ae32bc0f5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 0.9379224571023195\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c888aa95-d9f6-4694-80ac-05b2bbe8b61c",
   "metadata": {},
   "source": [
    "#### 2.xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "ed98a66f-01db-4ea3-a8b3-a78e955be1f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBClassifier(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "df42399f-ca2b-4525-8198-fe1fa9e3c0d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.3983\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X2, y_train)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "ae1430b7-aceb-4742-b934-a1eed4e907a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X2)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "40224e28-b679-452a-9534-678ae93c1867",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 0.9908065535804299\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "d16cd905-dfbf-48f0-9741-4345c1228b35",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X2_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "b0c434f0-d915-4781-a68e-9b547167ebe4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 0.9877932870980086\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27a2a623-5281-47c0-acbe-0247e59a4334",
   "metadata": {},
   "source": [
    "### Target Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "586c16d4-4a09-4b8d-8dbc-76a35d822258",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import TargetEncoder\n",
    "enc_auto = TargetEncoder(smooth=\"auto\")\n",
    "X_t = enc_auto.fit_transform(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "ccddaa36-8f12-4639-9de7-436e1aa30efb",
   "metadata": {},
   "outputs": [],
   "source": [
    "output_feature_names = enc_auto.get_feature_names_out()\n",
    "X_tdf = pd.DataFrame(X_t, columns=list(output_feature_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "816e4575-c396-4d53-bcd4-c4d15f8db4b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_t_test = enc_auto.transform(X_test)\n",
    "X_tdf_test = pd.DataFrame(X_t_test, columns=list(output_feature_names))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e0a8ae9-bd27-4cfa-9c5d-dfd479d121e3",
   "metadata": {},
   "source": [
    "#### 1. logit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "d2131938-e931-4f91-b99b-82c8f3d74226",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.1054\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model = sm.GLM(y_train, sm.add_constant(X_tdf, has_constant='skip'), family=sm.families.Binomial()).fit(disp=False)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "a3053246-108a-46e7-98ae-89ee1ac35bbc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:10; R2: 0.46049507536246825\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.pseudo_rsquared()}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "5ccab110-ab74-4e63-9d09-0174472a644b",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X_t, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "34ff4e5f-8b9a-41d3-94b7-79d64c7bf241",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 0.9752125974861747\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "1e74c6fe-5c41-4824-aaf5-51c8be0a8a85",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X_t_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "6afec548-36a8-48fc-94ff-22707b3b1654",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 0.975897119742374\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fb66c13-ae7e-4dbd-b1ab-6564363315e4",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "cc923abd-b206-440d-a64e-b1177cb8642a",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBClassifier(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "647e7142-4e25-47d3-a749-b8b17a11da96",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.2343\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X_t, y_train)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "bd191fae-0925-4ace-8690-42b41085b2f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_t)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "a75ecae2-9d64-47e0-9206-044b02010fc8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 0.9970132645396323\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "258270ee-c901-4715-b627-4f8ec05f7c1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X_t_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "e1fb3037-c4af-4db7-8b95-b589c917df5b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 0.9834936419348079\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f9efe23-76f4-49e5-9130-053c9680dd64",
   "metadata": {},
   "source": [
    "### ICFESL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "a438c513-69a1-4e61-a962-376a033b1239",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-11-28 19:15:44.485\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.1 ------>\u001b[0m\n",
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n",
      "\u001b[32m2025-11-28 19:16:21.771\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 19:16:22.157\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 19:16:22.523\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 19:16:22.523\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.2 ------>\u001b[0m\n",
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/linear_model/_sag.py:348: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  warnings.warn(\n",
      "\u001b[32m2025-11-28 19:18:15.531\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 19:18:16.338\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 19:18:16.679\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.2 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 19:18:16.679\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 19:18:44.978\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 19:18:45.785\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 19:18:46.144\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 19:18:46.145\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 19:19:11.870\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 19:19:12.785\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 19:19:13.147\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 19:19:13.147\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m394\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 19:19:33.059\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mRunning logit with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 19:19:33.726\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mRunning xgbClassifier with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-28 19:19:34.087\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m480\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-28 19:19:34.088\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m527\u001b[0m - \u001b[1msearch stopped: model fit scores are decreasing...\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "fit_info_panel, best_index, fit_figs, cluster_groups, criterions, inertias, gap_statss = icfesl.regularized_search_algorun(\n",
    "    X2, pd.Series(y_train), X2_test, pd.Series(y_test), cat_vars, 'classification', alphas = [0.1, 0.2, 0.5, 1, 5], cbine_column=False,\n",
    "    distance_threshold=0.002, figure=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "2fdd89c8-6cb2-40eb-9bf6-f3494737b0f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "decision_plot, summary_plot = fit_figs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "0d4b5b73-afd0-4bb5-bbba-dee35487ebe2",
   "metadata": {},
   "outputs": [],
   "source": [
    "fit_info_panel.to_excel(\"fraud_trans_fit_info.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "7f1deae4-a4d7-4108-bdd6-27929be8d2f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "decision_plot.savefig('decision_plot_fraud_trans.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "7d588595-221c-47a7-b160-cb1c22d4ea5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "summary_plot.savefig('summary_plot_fraud_trans.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "d725650e-0e36-42d9-8359-5b3bb3233f6d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Experiment</th>\n",
       "      <th>dof</th>\n",
       "      <th>reg_fit_time</th>\n",
       "      <th>reg_training_auroc</th>\n",
       "      <th>reg_testing_auroc</th>\n",
       "      <th>xgb_fit_time</th>\n",
       "      <th>xgb_training_auroc</th>\n",
       "      <th>xgb_testing_auroc</th>\n",
       "      <th>var_inf</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>0.3509</td>\n",
       "      <td>0.984434</td>\n",
       "      <td>0.981237</td>\n",
       "      <td>0.3266</td>\n",
       "      <td>0.988046</td>\n",
       "      <td>0.980286</td>\n",
       "      <td>0.000070</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>35</td>\n",
       "      <td>0.7665</td>\n",
       "      <td>0.986042</td>\n",
       "      <td>0.983061</td>\n",
       "      <td>0.3013</td>\n",
       "      <td>0.989596</td>\n",
       "      <td>0.982185</td>\n",
       "      <td>0.000036</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>36</td>\n",
       "      <td>0.7707</td>\n",
       "      <td>0.986056</td>\n",
       "      <td>0.983177</td>\n",
       "      <td>0.3196</td>\n",
       "      <td>0.990027</td>\n",
       "      <td>0.983277</td>\n",
       "      <td>0.000058</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>35</td>\n",
       "      <td>0.8843</td>\n",
       "      <td>0.984186</td>\n",
       "      <td>0.981973</td>\n",
       "      <td>0.3237</td>\n",
       "      <td>0.987900</td>\n",
       "      <td>0.981890</td>\n",
       "      <td>0.000055</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>32</td>\n",
       "      <td>0.6335</td>\n",
       "      <td>0.983774</td>\n",
       "      <td>0.981585</td>\n",
       "      <td>0.3224</td>\n",
       "      <td>0.988191</td>\n",
       "      <td>0.982183</td>\n",
       "      <td>0.000047</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Experiment  dof  reg_fit_time  reg_training_auroc  reg_testing_auroc  \\\n",
       "0           0   35        0.3509            0.984434           0.981237   \n",
       "1           1   35        0.7665            0.986042           0.983061   \n",
       "2           2   36        0.7707            0.986056           0.983177   \n",
       "3           3   35        0.8843            0.984186           0.981973   \n",
       "4           4   32        0.6335            0.983774           0.981585   \n",
       "\n",
       "   xgb_fit_time  xgb_training_auroc  xgb_testing_auroc   var_inf  \n",
       "0        0.3266            0.988046           0.980286  0.000070  \n",
       "1        0.3013            0.989596           0.982185  0.000036  \n",
       "2        0.3196            0.990027           0.983277  0.000058  \n",
       "3        0.3237            0.987900           0.981890  0.000055  \n",
       "4        0.3224            0.988191           0.982183  0.000047  "
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fit_info_panel"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ba26263-8b9d-4e35-880e-7b077e7e9159",
   "metadata": {},
   "source": [
    "## CBind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "ca2a9cd3-2da2-4fd4-b673-7d62c346e657",
   "metadata": {},
   "outputs": [],
   "source": [
    "cgrouping = icfesl.group_categorical_features(X2, selected_column_names, distance_threshold=0.003)\n",
    "X4 = icfesl.combine_features(X2, cgrouping)\n",
    "X4_test = icfesl.combine_features(X2_test, cgrouping)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7db5b54-7a14-401e-be2f-2670f66a5bf6",
   "metadata": {},
   "source": [
    "#### 1.logit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "11ace45f-93b7-4611-b2ba-4f1c186f2ecc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "13.0665\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model = sm.GLM(y_train, sm.add_constant(X4, has_constant='skip'), family=sm.families.Binomial()).fit(disp=False)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "6cd36506-6cb2-4002-bde7-75be46c788df",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:234; R2: 0.5064665645050063\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.pseudo_rsquared()}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "889e61bb-0c16-4dcf-ba9f-4117720d6bd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X4, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "b0f172ff-c995-4354-a9bd-c8490faafcba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 0.9853208730234985\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "id": "f0d673fd-762b-4f6e-a943-128a05074e12",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X4_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "id": "4c793bde-4c18-49f1-b63c-837ebdd72bda",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 0.983414782729432\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f83b17f-caa2-48e7-a00c-fb108a9c74bb",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "d5752ddb-6128-4743-a7b9-f6617ac74036",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBClassifier(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "id": "d9f35ec7-a09b-497a-b2a5-a6ff2b8a9479",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.4872\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X4, y_train)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "id": "859f3057-694d-4d9b-b4c7-afd9900985fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X4)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "id": "642b22c0-8f68-4ebf-8b8d-37e9096a591b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training auroc: 0.9906993977782099\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_train, y_pred)\n",
    "print(f\"training auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "bf3bde12-0c79-4078-9dcb-72ac71e2fce6",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_proba(X4_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "cdfca3e5-0ebd-4eb6-a949-0c65db159b24",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing auroc: 0.987446409966664\n"
     ]
    }
   ],
   "source": [
    "auroc = roc_auc_score(y_test, y_pred)\n",
    "print(f\"testing auroc: {auroc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "029476ac-338c-4119-b91a-8cd7134d5a24",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4d96638-eede-428c-af4b-58ef307edeeb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
