{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "82beccaa-e829-4712-99e5-014dfb189c59",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.spatial.distance import cdist\n",
    "import random\n",
    "from sklearn.cluster import KMeans\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "from icfesl import *\n",
    "from utility_functions import *\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from xgboost import XGBRegressor\n",
    "from pytorch_tabnet.tab_model import TabNetRegressor\n",
    "from sklearn.mixture import GaussianMixture\n",
    "import time\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "from feature_engine.datetime import DatetimeFeatures\n",
    "from catboost import CatBoostRegressor, Pool\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9706d06e-64c7-49e9-b15f-4f49265fcc85",
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"../../../writing/kaggle dataset/US_Air_quality/aqi_data_17_21.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0df55b74-b7c9-4855-91c3-4f6d09285c5b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/j7/w07zs41n4s5df3svj9ygmh9h0000gn/T/ipykernel_5841/2200478113.py:1: DtypeWarning: Columns (11) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  raw_data = pd.read_csv(path)\n"
     ]
    }
   ],
   "source": [
    "raw_data = pd.read_csv(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ac4d8aec-62dc-4f9e-9e94-78d8df16478f",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data.units_of_measure = raw_data.units_of_measure.astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c7f13a08-04a4-4a54-be0f-1be94827f21b",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data = raw_data.loc[raw_data['sample_duration']=='24 HOUR']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b69f4e10-0d2b-4de4-9a23-e91aafcadefb",
   "metadata": {},
   "outputs": [],
   "source": [
    "dtf = DatetimeFeatures(features_to_extract=[\"month\", 'hour'])\n",
    "\n",
    "date_features = dtf.fit_transform(raw_data[['date_local']])\n",
    "\n",
    "raw_data = pd.concat([raw_data, date_features], axis=1).sample(n=100000, random_state = 200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "0667d9b1-a73c-4285-a3e8-fc6db996386f",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_vars = ['site_number','method','first_max_hour', 'state','county','city','date_local_month','date_local_hour']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0baca774-cc85-40e9-a600-1f9021ac5b56",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = raw_data[cat_vars]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9614c1d2-1460-43c1-ab28-218643753fe4",
   "metadata": {},
   "outputs": [],
   "source": [
    "y = raw_data['aqi']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6b5e6aa7-4a60-4387-9f37-a422218857df",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0de93675-5e5b-4035-b355-e6d1e55e9630",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = X_train.reset_index(drop=True)\n",
    "X_test = X_test.reset_index(drop=True)\n",
    "y_train = y_train.reset_index(drop=True)\n",
    "y_test = y_test.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "97f8335a-581d-4444-be09-9c4a384e8fef",
   "metadata": {},
   "outputs": [],
   "source": [
    "X2, encoder = icfesl.f_get_dummies(X_train, cat_vars)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "8db04298-4609-4666-bf42-70effe7dde00",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.13/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [2, 5] during transform. These unknown categories will be encoded as all zeros\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "X2_test = icfesl.f_get_dummies(X_test, cat_vars, encoder=encoder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "34a046e4-07d5-45ce-9f09-5ef030e0383c",
   "metadata": {},
   "outputs": [],
   "source": [
    "selector = VarianceThreshold()\n",
    "\n",
    "selector.fit(X2)\n",
    "\n",
    "selected_features_mask = selector.get_support()\n",
    "\n",
    "selected_column_names = X2.columns[selected_features_mask]\n",
    "\n",
    "X2 = X2[selected_column_names]\n",
    "\n",
    "X2_test = X2_test[selected_column_names]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f14516c9-cb0a-4a1a-aeef-da52fb19ccf3",
   "metadata": {},
   "outputs": [],
   "source": [
    "for c in X2.columns.tolist():\n",
    "    X2[c] = X2[c].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "c92afd8c-6758-4df9-9bd0-a323d34f9957",
   "metadata": {},
   "outputs": [],
   "source": [
    "for c in X2_test.columns.tolist():\n",
    "    X2_test[c] = X2_test[c].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "c7d8a241-85c1-451d-86e9-4faa3e807d54",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(70000, 1191)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X2.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab9cea1e-a058-4985-b496-89c6c94419f9",
   "metadata": {},
   "source": [
    "#### CatBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "1a8231e4-e35e-482b-a871-b0830ae042cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostRegressor(\n",
    "    iterations=100,  \n",
    "    loss_function='RMSE', \n",
    "    random_seed=42,  \n",
    "    verbose=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "7631df2f-ccce-4077-ab3e-7483e2a692de",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5885\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X_train, y_train, cat_features=cat_vars)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "126a1e3b-0190-4cb7-b8a4-d807bc61adb8",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "dd5b8c27-ae22-448f-a792-bc8a2fffc30e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 17.10540739947989\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "1b036872-4ce3-4fe1-a482-c076a63f839d",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "1ff48134-8406-4eb7-bdd8-ae45cbed703e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 17.61604678367541\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ad0acfb-f93e-40cc-8e6b-e50258bf4db8",
   "metadata": {},
   "source": [
    "#### TabNet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "195f3733-3198-4d0a-a8ca-49ba87bcc3b5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100000, 29)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "7d33193f-393c-48c0-8dbe-aa131ae2a974",
   "metadata": {},
   "outputs": [],
   "source": [
    "for v in cat_vars:\n",
    "    tmp = raw_data[v].value_counts().reset_index()\n",
    "    raw_data = raw_data[~raw_data[v].isin(tmp.loc[tmp['count']<10,v].tolist())]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "874244e3-3df3-45c4-aa6c-b721dfc02f34",
   "metadata": {},
   "outputs": [],
   "source": [
    "le_list = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "97cd23f7-9cdf-4c38-83a8-2fd02d466c76",
   "metadata": {},
   "outputs": [],
   "source": [
    "for var in cat_vars:\n",
    "    le = LabelEncoder()\n",
    "    raw_data[var] = le.fit_transform(raw_data[var])\n",
    "    le_list[var] = le"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "146bd1f2-3c4d-4184-816a-34a99d2e3ba2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(99913, 29)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "da8d9715-b7ae-45cc-b7d6-c146a237db87",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_, X_test_, y_train_, y_test_ = train_test_split(raw_data[cat_vars], raw_data['aqi'], test_size=0.3, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "f7b5ce99-02d1-4e57-b134-b6333190dec2",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_idxs = [X_train_.columns.get_loc(col) for col in cat_vars]\n",
    "cat_dims = [X_train_[col].nunique() for col in cat_vars]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "07b7e430-ef71-4d6e-80ea-ee8ae1edb787",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = TabNetRegressor(verbose=0, seed=200, cat_idxs=cat_idxs, cat_dims=cat_dims)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "c88f14ad-b988-4903-8f4c-2293678337d2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Early stopping occurred at epoch 25 with best_epoch = 15 and best_train_mse = 301.13227\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.13/site-packages/pytorch_tabnet/callbacks.py:172: UserWarning: Best weights from best epoch are automatically used!\n",
      "  warnings.warn(wrn_msg)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "26.2951\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X_train=X_train_.to_numpy(), y_train=y_train_.to_numpy().reshape(-1,1), eval_set=[(X_train_.to_numpy(), y_train_.to_numpy().reshape(-1,1))], eval_name=['train'], max_epochs=50)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "c07ad8c7-f339-4a8e-a1f1-c6f5fba0fb38",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_train_.to_numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "3f884efb-5690-4232-ab90-95cf799262c1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE:17.353163118303655\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train_, y_pred)\n",
    "print(f\"training RMSE:{rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "9d5e5537-edfc-41a2-b1ec-32cee0a3c892",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_test_.to_numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "65449876-131a-48f8-b32a-64e519d2bc1b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE:17.96229064958908\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test_, y_pred)\n",
    "print(f\"testing RMSE:{rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "522b86ca-642f-465b-a431-a8b7860fb5ed",
   "metadata": {},
   "source": [
    "### One hot encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9407c138-1151-49a5-b15f-067c12f1fee9",
   "metadata": {},
   "source": [
    "#### 1. OLS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "5876a562-9ad5-4796-a0b3-e6a250d0ad2d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7.6302\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model = sm.OLS(y_train, sm.add_constant(X2, has_constant='skip'))\n",
    "results = model.fit_regularized(alpha=0.05, L1_wt=0)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "4cad784d-cd85-42a1-b674-2281c411ccd1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:736.0\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "2e111f7c-e10a-41d8-bce0-aec3d408ed24",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(results.params,exog=sm.add_constant(X2, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "d522aafa-4a4f-49af-b35c-252568e8752a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 18.199926166299406\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "dd3b6924-3608-4e29-9017-d332b8acc6dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(results.params,sm.add_constant(X2_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "77159d20-1353-4815-9365-506dd9985242",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 18.59746840399437\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c888aa95-d9f6-4694-80ac-05b2bbe8b61c",
   "metadata": {},
   "source": [
    "#### 2.xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "ed98a66f-01db-4ea3-a8b3-a78e955be1f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBRegressor(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "df42399f-ca2b-4525-8198-fe1fa9e3c0d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.3959\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X2, y_train)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "ae1430b7-aceb-4742-b934-a1eed4e907a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "40224e28-b679-452a-9534-678ae93c1867",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 16.594946371558333\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "16ffb892-5e5e-4601-a5a5-653e1b86997a",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X2_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "040f7a6c-00dd-45e3-9c25-60e92992708a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 17.3842739876328\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27a2a623-5281-47c0-acbe-0247e59a4334",
   "metadata": {},
   "source": [
    "### Target Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "586c16d4-4a09-4b8d-8dbc-76a35d822258",
   "metadata": {},
   "outputs": [],
   "source": [
    "from category_encoders import TargetEncoder\n",
    "enc_auto = TargetEncoder(cols=cat_vars, min_samples_leaf=20, smoothing=10).fit(X_train, y_train)\n",
    "X_t = enc_auto.transform(X_train)\n",
    "X_t_test = enc_auto.transform(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e0a8ae9-bd27-4cfa-9c5d-dfd479d121e3",
   "metadata": {},
   "source": [
    "#### 1. OLS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "d2131938-e931-4f91-b99b-82c8f3d74226",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0102\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model = sm.OLS(y_train, sm.add_constant(X_t, has_constant='skip')).fit(disp=False)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "a3053246-108a-46e7-98ae-89ee1ac35bbc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:7.0; R2: 0.14924810401291422\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.rsquared}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "5ccab110-ab74-4e63-9d09-0174472a644b",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X_t, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "34ff4e5f-8b9a-41d3-94b7-79d64c7bf241",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 17.59951916220541\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "2ac968e0-4bfd-423d-944b-73c5710f7f69",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X_t_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "82ae9a2a-4a84-4ae3-8abd-5520a9b0f785",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 18.0703586100017\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fb66c13-ae7e-4dbd-b1ab-6564363315e4",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "cc923abd-b206-440d-a64e-b1177cb8642a",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBRegressor(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "647e7142-4e25-47d3-a749-b8b17a11da96",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.26\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X_t, y_train)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "bd191fae-0925-4ace-8690-42b41085b2f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "a75ecae2-9d64-47e0-9206-044b02010fc8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 16.211237687199407\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "af3cddbb-aa89-4942-9fce-f791b0270580",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_t_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "49e89411-0ebb-4dce-b2c9-1e8d6b27dc2e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 17.19047512086509\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f9efe23-76f4-49e5-9130-053c9680dd64",
   "metadata": {},
   "source": [
    "### ICFESL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "95eed9eb-f300-40a6-b9d4-dd749b7e3769",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-11-29 14:39:07.113\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.01 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 14:40:31.810\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:40:32.522\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:40:33.077\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.01 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 14:40:33.077\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.05 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 14:42:06.320\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:42:07.000\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:42:07.557\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.05 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 14:42:07.558\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 14:43:32.285\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:43:32.953\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:43:33.479\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 14:43:33.479\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.2 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 14:45:08.700\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:45:09.395\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:45:09.928\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.2 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 14:45:09.929\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 14:47:23.893\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:47:24.801\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:47:25.383\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 14:47:25.383\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 14:49:54.833\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:49:55.722\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:49:56.344\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 14:49:56.344\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 14:51:22.241\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:51:22.936\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 14:51:23.512\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 5 ------>\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "fit_info_panel, index_of_best, figs, cluster_groups, criterions, inertias, gap_statss = icfesl.regularized_search_algorun(\n",
    "        X2, y_train, X2_test, y_test, cat_vars, 'regression', alphas = [0.01, 0.05, 0.1, 0.2, 0.5, 1, 5],\n",
    "        cbine_column=False, distance_threshold=0.005, figure=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "1e839141-0850-4a86-bacc-4a64c4d83d46",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.14.5\n"
     ]
    }
   ],
   "source": [
    "import statsmodels\n",
    "print(statsmodels.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "9139ee6e-1f9f-4834-961e-fa45bf6fe245",
   "metadata": {},
   "outputs": [],
   "source": [
    "decision_plot, summary_plot = figs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "79c0959a-b17f-4c3e-a1aa-477505a69d25",
   "metadata": {},
   "outputs": [],
   "source": [
    "fit_info_panel.to_excel(\"us_aqi_fit_info.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "8bb5be76-c3f2-48d7-88fa-1c5f8e07566c",
   "metadata": {},
   "outputs": [],
   "source": [
    "decision_plot.savefig('decision_plot_us_aqi.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "ffdb9c09-7e59-49c6-9d4d-b8eda8992216",
   "metadata": {},
   "outputs": [],
   "source": [
    "summary_plot.savefig('summary_plot_us_aqi.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "249dd729-9c39-47e6-9a7d-9fa33799607f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Experiment</th>\n",
       "      <th>dof</th>\n",
       "      <th>reg_fit_time</th>\n",
       "      <th>reg_training_rmse</th>\n",
       "      <th>reg_testing_rmse</th>\n",
       "      <th>xgb_fit_time</th>\n",
       "      <th>xgb_training_rmse</th>\n",
       "      <th>xgb_testing_rmse</th>\n",
       "      <th>var_inf</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>153.0</td>\n",
       "      <td>0.6041</td>\n",
       "      <td>17.475215</td>\n",
       "      <td>18.020773</td>\n",
       "      <td>0.4994</td>\n",
       "      <td>16.563807</td>\n",
       "      <td>17.403605</td>\n",
       "      <td>2.867630e-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>147.0</td>\n",
       "      <td>0.5693</td>\n",
       "      <td>17.481257</td>\n",
       "      <td>18.021645</td>\n",
       "      <td>0.5041</td>\n",
       "      <td>16.586515</td>\n",
       "      <td>17.391447</td>\n",
       "      <td>4.647043e-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>151.0</td>\n",
       "      <td>0.5744</td>\n",
       "      <td>17.474114</td>\n",
       "      <td>18.013101</td>\n",
       "      <td>0.4725</td>\n",
       "      <td>16.571512</td>\n",
       "      <td>17.409186</td>\n",
       "      <td>2.992259e-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>152.0</td>\n",
       "      <td>0.5866</td>\n",
       "      <td>17.483796</td>\n",
       "      <td>18.015862</td>\n",
       "      <td>0.4785</td>\n",
       "      <td>16.595690</td>\n",
       "      <td>17.441367</td>\n",
       "      <td>4.418721e-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>178.0</td>\n",
       "      <td>0.8005</td>\n",
       "      <td>17.471263</td>\n",
       "      <td>18.011565</td>\n",
       "      <td>0.5209</td>\n",
       "      <td>16.534453</td>\n",
       "      <td>17.343387</td>\n",
       "      <td>2.075008e-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>188.0</td>\n",
       "      <td>0.7774</td>\n",
       "      <td>17.471603</td>\n",
       "      <td>18.010422</td>\n",
       "      <td>0.5615</td>\n",
       "      <td>16.520317</td>\n",
       "      <td>17.383030</td>\n",
       "      <td>1.706524e-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>6</td>\n",
       "      <td>154.0</td>\n",
       "      <td>0.5931</td>\n",
       "      <td>17.477125</td>\n",
       "      <td>18.013964</td>\n",
       "      <td>0.5215</td>\n",
       "      <td>16.558334</td>\n",
       "      <td>17.437770</td>\n",
       "      <td>2.883611e-10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Experiment    dof  reg_fit_time  reg_training_rmse  reg_testing_rmse  \\\n",
       "0           0  153.0        0.6041          17.475215         18.020773   \n",
       "1           1  147.0        0.5693          17.481257         18.021645   \n",
       "2           2  151.0        0.5744          17.474114         18.013101   \n",
       "3           3  152.0        0.5866          17.483796         18.015862   \n",
       "4           4  178.0        0.8005          17.471263         18.011565   \n",
       "5           5  188.0        0.7774          17.471603         18.010422   \n",
       "6           6  154.0        0.5931          17.477125         18.013964   \n",
       "\n",
       "   xgb_fit_time  xgb_training_rmse  xgb_testing_rmse       var_inf  \n",
       "0        0.4994          16.563807         17.403605  2.867630e-10  \n",
       "1        0.5041          16.586515         17.391447  4.647043e-10  \n",
       "2        0.4725          16.571512         17.409186  2.992259e-10  \n",
       "3        0.4785          16.595690         17.441367  4.418721e-10  \n",
       "4        0.5209          16.534453         17.343387  2.075008e-10  \n",
       "5        0.5615          16.520317         17.383030  1.706524e-10  \n",
       "6        0.5215          16.558334         17.437770  2.883611e-10  "
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fit_info_panel"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ba26263-8b9d-4e35-880e-7b077e7e9159",
   "metadata": {},
   "source": [
    "## CBind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "ca2a9cd3-2da2-4fd4-b673-7d62c346e657",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Learn the categorical grouping from X2 only (the frame that is later fitted\n",
    "# against y_train), then apply that same grouping to both X2 and X2_test.\n",
    "cgrouping = icfesl.group_categorical_features(\n",
    "    X2,\n",
    "    X2.columns.tolist(),\n",
    "    distance_threshold=0.002,\n",
    ")\n",
    "\n",
    "# Combined design frames used by the OLS and XGBoost fits below.\n",
    "X4 = icfesl.combine_features(X2, cgrouping)\n",
    "X4_test = icfesl.combine_features(X2_test, cgrouping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "b055c561-7dce-4048-8ad6-0ca5c2c4ca68",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(30000, 585)"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X4_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7db5b54-7a14-401e-be2f-2670f66a5bf6",
   "metadata": {},
   "source": [
    "#### 1. OLS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "11ace45f-93b7-4611-b2ba-4f1c186f2ecc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.7732\n"
     ]
    }
   ],
   "source": [
    "# Time the OLS fit on the combined (CBind) training design, including the\n",
    "# cost of building the design matrix with add_constant.\n",
    "# OLS.fit() has no `disp` option (that flag belongs to the iterative\n",
    "# likelihood-based fitters), so it is not passed here.\n",
    "# NOTE(review): the next cell reports df_model=563 while X4_test has 585\n",
    "# columns, and the held-out OLS RMSE further down is ~5.8e10 against ~17.5\n",
    "# in-sample. Presumably the combined design is rank-deficient, so the\n",
    "# default pinv solution extrapolates wildly on test rows -- TODO confirm\n",
    "# before reporting the test score.\n",
    "start = time.time()\n",
    "model = sm.OLS(y_train, sm.add_constant(X4, has_constant='skip')).fit()\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "6cd36506-6cb2-4002-bde7-75be46c788df",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:563.0; R2: 0.1567628508056894\n"
     ]
    }
   ],
   "source": [
    "# Report the fitted model's degrees of freedom and in-sample R-squared.\n",
    "print(f\"DF:{model.df_model}; R2: {model.rsquared}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "889e61bb-0c16-4dcf-ba9f-4117720d6bd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X4, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "b0f172ff-c995-4354-a9bd-c8490faafcba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 17.52161790559223\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "5a226b37-6efc-42f2-bdf5-473a1844b96e",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X4_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "189d19cb-6c55-4e55-9cdf-7c3652999a78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 57804885230.338745\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f83b17f-caa2-48e7-a00c-fb108a9c74bb",
   "metadata": {},
   "source": [
    "#### 2. XGBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "d5752ddb-6128-4743-a7b9-f6617ac74036",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBRegressor(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "d9f35ec7-a09b-497a-b2a5-a6ff2b8a9479",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.1163\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X4, y_train)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "859f3057-694d-4d9b-b4c7-afd9900985fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "642b22c0-8f68-4ebf-8b8d-37e9096a591b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training rmse: 16.601589634539714\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training rmse: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "49110501-218a-44b5-901e-35765607adaf",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X4_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "d581ff33-8ae3-4b70-b709-6570723daa99",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing rmse: 17.353362698789187\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing rmse: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c29881f4-a3d9-4ef4-84a7-ceb656bd7acf",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab84783b-284f-4d36-9806-2f619cb9412c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c7f16a8-e38e-4ee5-8daa-f660b9a76a8e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
