{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "82beccaa-e829-4712-99e5-014dfb189c59",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.spatial.distance import cdist\n",
    "import random\n",
    "from sklearn.cluster import KMeans\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "from icfesl import *\n",
    "from utility_functions import *\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from xgboost import XGBRegressor\n",
    "from pytorch_tabnet.tab_model import TabNetRegressor\n",
    "from sklearn.mixture import GaussianMixture\n",
    "import time\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "from catboost import CatBoostRegressor, Pool\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9706d06e-64c7-49e9-b15f-4f49265fcc85",
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"../../../writing/kaggle dataset/flght_price_prediction/Clean_Dataset.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0df55b74-b7c9-4855-91c3-4f6d09285c5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data = pd.read_csv(path).sample(n=70000, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "93d765c9-84a6-472e-8ae8-613291bf541f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(70000, 12)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "918fa2ec-e255-480d-93ef-f2226c96b6bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_vars = ['airline','source_city','departure_time','stops','arrival_time','destination_city','class','days_left']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "bed2bc0d-10c7-4435-9b34-b0df1c6bb387",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = raw_data[cat_vars]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "91c2b64a-f72e-43c5-9f2a-1aaac219a096",
   "metadata": {},
   "outputs": [],
   "source": [
    "y = raw_data['price']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6b5e6aa7-4a60-4387-9f37-a422218857df",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=35)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "0de93675-5e5b-4035-b355-e6d1e55e9630",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = X_train.reset_index(drop=True)\n",
    "X_test = X_test.reset_index(drop=True)\n",
    "y_train = y_train.reset_index(drop=True)\n",
    "y_test = y_test.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "97f8335a-581d-4444-be09-9c4a384e8fef",
   "metadata": {},
   "outputs": [],
   "source": [
    "X2, encoder = icfesl.f_get_dummies(X_train, cat_vars)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8db04298-4609-4666-bf42-70effe7dde00",
   "metadata": {},
   "outputs": [],
   "source": [
    "X2_test = icfesl.f_get_dummies(X_test, cat_vars, encoder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7e5184d1-c8cc-4c1b-8b4a-9527028b131d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# selector = VarianceThreshold()\n",
    "\n",
    "# selector.fit(X2)\n",
    "\n",
    "# selected_features_mask = selector.get_support()\n",
    "\n",
    "# selected_column_names = X2.columns[selected_features_mask]\n",
    "\n",
    "# X2 = X2[selected_column_names]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f14516c9-cb0a-4a1a-aeef-da52fb19ccf3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast every column of X2 to integer dtype, in both the train and test design matrices.\n",
    "# The column list comes from the training frame, so X2_test must contain each of them.\n",
    "dummy_cols = X2.columns.tolist()\n",
    "X2[dummy_cols] = X2[dummy_cols].astype('int')\n",
    "X2_test[dummy_cols] = X2_test[dummy_cols].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "32963ec8-ac04-45db-87ea-e5ae76137802",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast all columns of X2_test to integer dtype; this also covers any column that exists only in the test frame.\n",
    "test_cols = X2_test.columns.tolist()\n",
    "X2_test[test_cols] = X2_test[test_cols].astype('int')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "58e1982b-2a32-43fb-8925-5a2f51ce3d6f",
   "metadata": {},
   "source": [
    "### CatBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "ba5d8f6d-1092-4914-a190-04c9463e310b",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostRegressor(\n",
    "    iterations=100,  \n",
    "    loss_function='RMSE', \n",
    "    random_seed=42,  \n",
    "    verbose=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "254500c2-ea04-44db-957a-d308c834df26",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.4785\n"
     ]
    }
   ],
   "source": [
    "# Fit CatBoost on the raw categorical columns and report the elapsed fit time in seconds.\n",
    "# perf_counter is monotonic and high-resolution, unlike the adjustable wall clock behind time.time().\n",
    "start = time.perf_counter()\n",
    "model.fit(X_train, y_train, cat_features=cat_vars)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "71a63fd6-fc0c-4165-969c-54943f57f68b",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "96b7beb4-3d41-40fa-81a8-161a0ce6c792",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 4514.990988153726\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "c6950059-8e71-4fc9-b272-22eb43ae1d6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "76684d43-8beb-4212-aadb-78ab6c50724e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 4598.237120061354\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c965a19-820b-4c12-aa7e-e1deb324f02a",
   "metadata": {},
   "source": [
    "### TabNet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "efd5e7a8-aa91-4678-bb3f-7895e67913ca",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(70000, 12)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d59c3273-e4dd-4e13-86be-9784ba1a2c9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop rows whose level occurs fewer than 10 times, one categorical column at a time.\n",
    "for v in cat_vars:\n",
    "    # Index the value_counts Series directly so the code does not depend on the\n",
    "    # column name that reset_index() produces, which differs between pandas versions.\n",
    "    level_counts = raw_data[v].value_counts()\n",
    "    rare_levels = level_counts[level_counts < 10].index\n",
    "    # .copy() detaches the filtered frame so later in-place column assignment\n",
    "    # on raw_data does not trigger SettingWithCopyWarning.\n",
    "    raw_data = raw_data[~raw_data[v].isin(rare_levels)].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "18ae33f6-c14c-4bcf-8238-f9d8cfea953d",
   "metadata": {},
   "outputs": [],
   "source": [
    "le_list = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "70a6cbba-1b1e-4fec-9043-bf880642cf86",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace each categorical column of raw_data with integer codes and keep the fitted encoder per column.\n",
    "for var in cat_vars:\n",
    "    col_encoder = LabelEncoder().fit(raw_data[var])\n",
    "    raw_data[var] = col_encoder.transform(raw_data[var])\n",
    "    le_list[var] = col_encoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "5a963d71-5b6f-42cf-bb76-68a10a84e666",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(70000, 12)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "36b103ca-b27e-4777-90de-7762b5b2e5be",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the label-encoded frame for TabNet with the same seed (35) and test fraction as the main split,\n",
    "# so TabNet is scored on the same rows as the other models as long as the rare-level filter removed no rows.\n",
    "X_train_, X_test_, y_train_, y_test_ = train_test_split(raw_data[cat_vars], raw_data['price'], test_size=0.3, random_state=35)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "75905434-b8ea-4d90-a85d-662c679784ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column positions of the categorical features inside the TabNet input matrix.\n",
    "cat_idxs = [X_train_.columns.get_loc(col) for col in cat_vars]\n",
    "# Embedding sizes must cover every code the fitted encoders can emit, not only the codes present\n",
    "# in the training split; otherwise a level missing from X_train_ gives an out-of-range embedding index.\n",
    "cat_dims = [len(le_list[col].classes_) for col in cat_vars]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "c529276c-4fd7-4826-8463-0a99a569d7bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = TabNetRegressor(verbose=0, seed=200, cat_idxs=cat_idxs, cat_dims=cat_dims)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "d893f28e-38d9-47f1-987d-68b2d116d52e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stop training because you reached max_epochs = 50 with best_epoch = 48 and best_train_mse = 19903096.0\n",
      "34.4625\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.13/site-packages/pytorch_tabnet/callbacks.py:172: UserWarning: Best weights from best epoch are automatically used!\n",
      "  warnings.warn(wrn_msg)\n"
     ]
    }
   ],
   "source": [
    "# Build the NumPy inputs once, outside the timed region, so only the fit itself is measured.\n",
    "# The target is reshaped to a single column before being passed to TabNetRegressor.fit.\n",
    "X_train_arr = X_train_.to_numpy()\n",
    "y_train_arr = y_train_.to_numpy().reshape(-1, 1)\n",
    "# perf_counter is monotonic and high-resolution, unlike the adjustable wall clock behind time.time().\n",
    "start = time.perf_counter()\n",
    "model.fit(\n",
    "    X_train=X_train_arr,\n",
    "    y_train=y_train_arr,\n",
    "    # The training data doubles as the evaluation set, so the reported metric is a training metric.\n",
    "    eval_set=[(X_train_arr, y_train_arr)],\n",
    "    eval_name=['train'],\n",
    "    max_epochs=50,\n",
    ")\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "f1edb517-cba7-4d47-831a-85bcdcd7351c",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_train_.to_numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "34ed5484-616f-4e9c-a148-1703f6fe9c4b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE:4461.28857421875\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train_, y_pred)\n",
    "print(f\"training RMSE:{rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "fc9b07dd-3953-4c1a-996a-e8060275439f",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_test_.to_numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "831ebb74-9927-4825-bd5c-4cc6b8eb745a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE:4479.27392578125\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test_, y_pred)\n",
    "print(f\"testing RMSE:{rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "522b86ca-642f-465b-a431-a8b7860fb5ed",
   "metadata": {},
   "source": [
    "### One hot encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9407c138-1151-49a5-b15f-067c12f1fee9",
   "metadata": {},
   "source": [
    "#### 1. OLS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "5876a562-9ad5-4796-a0b3-e6a250d0ad2d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.2439\n"
     ]
    }
   ],
   "source": [
    "# Fit OLS on the X2 design matrix and report the elapsed fit time in seconds.\n",
    "# perf_counter is monotonic and high-resolution, unlike the adjustable wall clock behind time.time().\n",
    "start = time.perf_counter()\n",
    "model = sm.OLS(y_train, sm.add_constant(X2, has_constant='skip')).fit(disp=False)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "4cad784d-cd85-42a1-b674-2281c411ccd1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:76.0; R2: 0.9131404760573076\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.rsquared}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "9d98efc3-7dd7-4684-8bb1-e6b18d630451",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X2, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "d522aafa-4a4f-49af-b35c-252568e8752a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 6669.345051431331\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "dd3b6924-3608-4e29-9017-d332b8acc6dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X2_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "77159d20-1353-4815-9365-506dd9985242",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 6624.163221063852\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c888aa95-d9f6-4694-80ac-05b2bbe8b61c",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "ed98a66f-01db-4ea3-a8b3-a78e955be1f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBRegressor(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "df42399f-ca2b-4525-8198-fe1fa9e3c0d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.3459\n"
     ]
    }
   ],
   "source": [
    "# Fit XGBoost on the X2 design matrix and report the elapsed fit time in seconds.\n",
    "# perf_counter is monotonic and high-resolution, unlike the adjustable wall clock behind time.time().\n",
    "start = time.perf_counter()\n",
    "model.fit(X2, y_train)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "ae1430b7-aceb-4742-b934-a1eed4e907a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "40224e28-b679-452a-9534-678ae93c1867",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 4082.174560546875\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "16ffb892-5e5e-4601-a5a5-653e1b86997a",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X2_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "040f7a6c-00dd-45e3-9c25-60e92992708a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 4431.78466796875\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27a2a623-5281-47c0-acbe-0247e59a4334",
   "metadata": {},
   "source": [
    "### Target Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "586c16d4-4a09-4b8d-8dbc-76a35d822258",
   "metadata": {},
   "outputs": [],
   "source": [
    "from category_encoders import TargetEncoder\n",
    "# Fit the target encoder on the training split only, then apply it to both splits.\n",
    "enc_auto = TargetEncoder(cols=cat_vars, min_samples_leaf=20, smoothing=10)\n",
    "enc_auto.fit(X_train, y_train)\n",
    "X_t, X_t_test = (enc_auto.transform(split) for split in (X_train, X_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e0a8ae9-bd27-4cfa-9c5d-dfd479d121e3",
   "metadata": {},
   "source": [
    "#### 1. OLS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "d2131938-e931-4f91-b99b-82c8f3d74226",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0076\n"
     ]
    }
   ],
   "source": [
    "# Fit OLS on the target-encoded features and report the elapsed fit time in seconds.\n",
    "# perf_counter is monotonic and high-resolution, which matters for a fit this short.\n",
    "start = time.perf_counter()\n",
    "model = sm.OLS(y_train, sm.add_constant(X_t, has_constant='skip')).fit(disp=False)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "a3053246-108a-46e7-98ae-89ee1ac35bbc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:8.0; R2: 0.9017429604925657\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.rsquared}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "5ccab110-ab74-4e63-9d09-0174472a644b",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X_t, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "34ff4e5f-8b9a-41d3-94b7-79d64c7bf241",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 7093.430239528867\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "2ac968e0-4bfd-423d-944b-73c5710f7f69",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X_t_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "82ae9a2a-4a84-4ae3-8abd-5520a9b0f785",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 7036.825477577443\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fb66c13-ae7e-4dbd-b1ab-6564363315e4",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "cc923abd-b206-440d-a64e-b1177cb8642a",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBRegressor(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "647e7142-4e25-47d3-a749-b8b17a11da96",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.2498\n"
     ]
    }
   ],
   "source": [
    "# Fit XGBoost on the target-encoded features and report the elapsed fit time in seconds.\n",
    "# perf_counter is monotonic and high-resolution, unlike the adjustable wall clock behind time.time().\n",
    "start = time.perf_counter()\n",
    "model.fit(X_t, y_train)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "bd191fae-0925-4ace-8690-42b41085b2f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "a75ecae2-9d64-47e0-9206-044b02010fc8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 3884.300048828125\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "af3cddbb-aa89-4942-9fce-f791b0270580",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_t_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "49e89411-0ebb-4dce-b2c9-1e8d6b27dc2e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 4211.3876953125\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f9efe23-76f4-49e5-9130-053c9680dd64",
   "metadata": {},
   "source": [
    "### ICFESL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "95eed9eb-f300-40a6-b9d4-dd749b7e3769",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-11-29 15:05:08.391\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.01 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:14.051\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:14.126\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:14.462\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.01 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:14.463\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.05 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:20.020\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:20.086\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:20.414\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.05 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:20.414\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:26.123\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:26.197\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:26.532\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:26.532\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.2 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:32.147\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:32.204\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:32.538\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.2 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:32.538\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 0.5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:38.211\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:38.279\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:38.614\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 0.5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:38.614\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:44.157\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:44.226\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:44.560\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 1 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:44.560\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m397\u001b[0m - \u001b[1mrunning algorithm with L2 regularization factor = 5 ------>\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:50.452\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m453\u001b[0m - \u001b[1mRunning OLS with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:50.523\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m471\u001b[0m - \u001b[1mRunning xgbRegressor with ICFESL encoding\u001b[0m\n",
      "\u001b[32m2025-11-29 15:05:50.858\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36micfesl\u001b[0m:\u001b[36mregularized_search_algorun\u001b[0m:\u001b[36m485\u001b[0m - \u001b[1mCompleted: running algorithm with L2 regularization factor = 5 ------>\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "fit_info_panel, index_of_best, figs, cluster_groups, criterions, inertias, gap_statss = icfesl.regularized_search_algorun(\n",
    "        X2, y_train, X2_test, y_test, cat_vars, 'regression', alphas = [0.01, 0.05, 0.1, 0.2, 0.5, 1, 5],\n",
    "        cbine_column=False, distance_threshold=0.005, figure=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "9139ee6e-1f9f-4834-961e-fa45bf6fe245",
   "metadata": {},
   "outputs": [],
   "source": [
    "decision_plot, summary_plot = figs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "79c0959a-b17f-4c3e-a1aa-477505a69d25",
   "metadata": {},
   "outputs": [],
   "source": [
    "fit_info_panel.to_excel(\"flight_price_fit_info.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "8bb5be76-c3f2-48d7-88fa-1c5f8e07566c",
   "metadata": {},
   "outputs": [],
   "source": [
    "decision_plot.savefig('decision_plot_flight_price.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "ffdb9c09-7e59-49c6-9d4d-b8eda8992216",
   "metadata": {},
   "outputs": [],
   "source": [
    "summary_plot.savefig('summary_plot_flight_price.png')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ba26263-8b9d-4e35-880e-7b077e7e9159",
   "metadata": {},
   "source": [
    "### CBind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "ca2a9cd3-2da2-4fd4-b673-7d62c346e657",
   "metadata": {},
   "outputs": [],
   "source": [
    "cgrouping = icfesl.group_categorical_features(X2, X2.columns.tolist(), distance_threshold=0.002)\n",
    "X4 = icfesl.combine_features(X2, cgrouping)\n",
    "X4_test = icfesl.combine_features(X2_test, cgrouping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "b055c561-7dce-4048-8ad6-0ca5c2c4ca68",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(21000, 76)"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X4_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7db5b54-7a14-401e-be2f-2670f66a5bf6",
   "metadata": {},
   "source": [
    "#### 1. OLS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "11ace45f-93b7-4611-b2ba-4f1c186f2ecc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.1609\n"
     ]
    }
   ],
   "source": [
    "# Fit OLS on the combined-feature matrix X4 and report the elapsed fit time in seconds.\n",
    "# perf_counter is monotonic and high-resolution, unlike the adjustable wall clock behind time.time().\n",
    "start = time.perf_counter()\n",
    "model = sm.OLS(y_train, sm.add_constant(X4, has_constant='skip')).fit(disp=False)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "6cd36506-6cb2-4002-bde7-75be46c788df",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:76.0; R2: 0.9131404760573076\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.rsquared}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "889e61bb-0c16-4dcf-ba9f-4117720d6bd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X4, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "b0f172ff-c995-4354-a9bd-c8490faafcba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 6669.345051431331\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "5a226b37-6efc-42f2-bdf5-473a1844b96e",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X4_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "189d19cb-6c55-4e55-9cdf-7c3652999a78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 6624.163221063852\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f83b17f-caa2-48e7-a00c-fb108a9c74bb",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "d5752ddb-6128-4743-a7b9-f6617ac74036",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBRegressor(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "d9f35ec7-a09b-497a-b2a5-a6ff2b8a9479",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.2975\n"
     ]
    }
   ],
   "source": [
    "# Fit XGBoost on the combined-feature matrix X4 and report the elapsed fit time in seconds.\n",
    "# perf_counter is monotonic and high-resolution, unlike the adjustable wall clock behind time.time().\n",
    "start = time.perf_counter()\n",
    "model.fit(X4, y_train)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "859f3057-694d-4d9b-b4c7-afd9900985fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "642b22c0-8f68-4ebf-8b8d-37e9096a591b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training rmse: 4082.174560546875\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training rmse: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "49110501-218a-44b5-901e-35765607adaf",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X4_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "d581ff33-8ae3-4b70-b709-6570723daa99",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing rmse: 4431.78466796875\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing rmse: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c29881f4-a3d9-4ef4-84a7-ceb656bd7acf",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab84783b-284f-4d36-9806-2f619cb9412c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c7f16a8-e38e-4ee5-8daa-f660b9a76a8e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
