{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "82beccaa-e829-4712-99e5-014dfb189c59",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import random\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "from icfesl import *\n",
    "from utility_functions import *\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from sklearn.metrics import root_mean_squared_error\n",
    "from xgboost import XGBRegressor\n",
    "from pytorch_tabnet.tab_model import TabNetRegressor\n",
    "import time\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "import pdb"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c62136aa-d4c2-40c2-89ac-fcedb548d32f",
   "metadata": {},
   "source": [
    "## Data Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "89139b9a-4912-4770-a4cb-7040c3db48dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "file1 = \"../../../writing/kaggle dataset/us_hospital_satisfaction/archive-7/cms_hospital_patient_satisfaction_2016.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "ca3dee7e-8b45-47cf-896d-a01cf166bc9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "file2 = \"../../../writing/kaggle dataset/us_hospital_satisfaction/archive-7/cms_hospital_patient_satisfaction_2017.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "c5e82c69-2664-4699-8f42-97e12464609c",
   "metadata": {},
   "outputs": [],
   "source": [
    "file3 = \"../../../writing/kaggle dataset/us_hospital_satisfaction/archive-7/cms_hospital_patient_satisfaction_2018.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "d8825787-853c-41bd-a825-ebb22555c6fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "file4 = \"../../../writing/kaggle dataset/us_hospital_satisfaction/archive-7/cms_hospital_patient_satisfaction_2019.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "ce6a5deb-8207-459a-ab3f-811defa0637f",
   "metadata": {},
   "outputs": [],
   "source": [
    "file5 = \"../../../writing/kaggle dataset/us_hospital_satisfaction/archive-7/cms_hospital_patient_satisfaction_2020.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "70664d88-7079-42de-8e45-cd5f383010d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "data1 = pd.read_csv(file1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d72b3ad-54a6-4e5a-90a0-6ca301944919",
   "metadata": {},
   "outputs": [],
   "source": [
    "data2 = pd.read_csv(file2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0e145c6-a44b-4035-b25b-76793b0152f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "data3 = pd.read_csv(file3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41ac3495-75a6-4307-b551-cb0f519e2e6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "data4 = pd.read_csv(file4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "184bd854-f5bd-40a7-b426-5786bb9236b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "data5 = pd.read_csv(file5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "307ee53f-3e80-4972-bfd5-10e6fb91ecc5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98f97b4a-43e1-4895-8a13-d1134020bb45",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a71d849-798b-4dcc-9346-d0d09546ea55",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cabfb901-9f4a-4d30-9642-363906a5d5bf",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbc9917f-c2e3-4964-816d-d5443fccfa40",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9706d06e-64c7-49e9-b15f-4f49265fcc85",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the raw table that every later cell filters, encodes and models.\n",
    "# NOTE(review): `path` is not assigned in any visible cell of this notebook (only file1..file5 are);\n",
    "# TODO confirm where it comes from, otherwise Restart & Run All fails here.\n",
    "raw_data = pd.read_csv(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ed49e5e4-4a38-4978-8be1-434e167acbe0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the digit characters of each price string, then store the column as float.\n",
    "price_digits = raw_data['price'].str.replace(r'[^0-9]', '', regex=True)\n",
    "raw_data['price'] = price_digits.astype('float')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5099e572-42dc-42ce-bad0-0966ee6b08aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the digit characters of each milage string, then store the column as float.\n",
    "milage_digits = raw_data['milage'].str.replace(r'[^0-9]', '', regex=True)\n",
    "raw_data['milage'] = milage_digits.astype('float')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "697472b5-cbb8-4cf1-b666-edd96d27cece",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4009, 12)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "859efd36-5e93-45f0-b5bb-75a065711ff3",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_vars = ['brand','model','model_year','fuel_type','engine','transmission','ext_col','int_col','accident','clean_title']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0aa5d48f-7412-4f40-a1a1-710f0b64dab6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each categorical variable in turn, keep only the rows whose level occurs more than 5 times.\n",
    "# Counts are recomputed on the already-filtered frame, so the order of cat_vars matters.\n",
    "for col in cat_vars:\n",
    "    level_counts = raw_data[col].value_counts()\n",
    "    frequent_levels = level_counts[level_counts > 5].index.tolist()\n",
    "    raw_data = raw_data.loc[raw_data[col].isin(frequent_levels)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "26c73c4b-95d1-4648-b266-787e97b5e53e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(382, 12)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c982d83a-8b83-4c2e-b5be-c3d26e8c751a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebdfaa87-b9ed-4e6b-918f-b27d5e5fa6cb",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f030bb9-daca-4258-b6d1-b5350dc528e8",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b5462ab-3095-4468-b417-f5fac759885f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1020b04b-d53a-4588-87f1-6fda7216cc6b",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "00109f00-0230-4305-aa9e-4f5649f043ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "temp = X.model_year.value_counts().reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "979be9d1-26a1-40ff-b730-cbe8395361df",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>model_year</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2022</td>\n",
       "      <td>354</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2021</td>\n",
       "      <td>350</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020</td>\n",
       "      <td>322</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2018</td>\n",
       "      <td>315</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2019</td>\n",
       "      <td>297</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2016</td>\n",
       "      <td>268</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2017</td>\n",
       "      <td>259</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2015</td>\n",
       "      <td>228</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2023</td>\n",
       "      <td>226</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2014</td>\n",
       "      <td>181</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>2013</td>\n",
       "      <td>158</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>2012</td>\n",
       "      <td>141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>2011</td>\n",
       "      <td>124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>2008</td>\n",
       "      <td>113</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>2010</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>2007</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>2005</td>\n",
       "      <td>72</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>2009</td>\n",
       "      <td>72</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>2006</td>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>2004</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>2003</td>\n",
       "      <td>49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>2001</td>\n",
       "      <td>34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>2002</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>2000</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>1999</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>1998</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>1997</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>1993</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>1996</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>1994</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>2024</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>1995</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>1974</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>1992</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    model_year  count\n",
       "0         2022    354\n",
       "1         2021    350\n",
       "2         2020    322\n",
       "3         2018    315\n",
       "4         2019    297\n",
       "5         2016    268\n",
       "6         2017    259\n",
       "7         2015    228\n",
       "8         2023    226\n",
       "9         2014    181\n",
       "10        2013    158\n",
       "11        2012    141\n",
       "12        2011    124\n",
       "13        2008    113\n",
       "14        2010    100\n",
       "15        2007     98\n",
       "16        2005     72\n",
       "17        2009     72\n",
       "18        2006     66\n",
       "19        2004     60\n",
       "20        2003     49\n",
       "21        2001     34\n",
       "22        2002     32\n",
       "23        2000     17\n",
       "24        1999     15\n",
       "25        1998     11\n",
       "26        1997      9\n",
       "27        1993      9\n",
       "28        1996      8\n",
       "29        1994      7\n",
       "30        2024      6\n",
       "31        1995      6\n",
       "32        1974      1\n",
       "33        1992      1"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "0b3a0c74-2aee-4345-8056-184b0620a68d",
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'Series' object has no attribute 'columns'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m/var/folders/j7/w07zs41n4s5df3svj9ygmh9h0000gn/T/ipykernel_1504/2897268474.py\u001b[0m in \u001b[0;36m?\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/opt/anaconda3/lib/python3.13/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m   6295\u001b[0m             \u001b[0;32mand\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_accessors\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   6296\u001b[0m             \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   6297\u001b[0m         \u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   6298\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6299\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 'columns'"
     ]
    }
   ],
   "source": [
    "temp.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5a285cc-cd46-4351-96db-07fdfd22d5e5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24ae37c6-4e65-4a06-abb4-7b84ebb0559c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "88bf56fd-163c-4739-8cd5-e0759ba66a47",
   "metadata": {},
   "outputs": [],
   "source": [
    "y = raw_data['price']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "5d94b54e-a0ef-4df7-bb09-65b2c5712e99",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expand the categorical variables listed in cat_vars into dummy columns via the project helper.\n",
    "# NOTE(review): `X` is not assigned in any visible cell of this notebook; presumably it is a frame of\n",
    "# raw_data's predictor columns - TODO confirm and define it explicitly before this cell.\n",
    "dummy_data = icfesl.f_get_dummies(X, cat_vars, drop_first=True, dummy_na = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1bc9fd9d-20c2-4000-98ab-572efc1f8b1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "dummy_data = pd.concat([dummy_data, raw_data['milage']],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a78ac996-a981-4768-9505-870df26b9b94",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_vars2 = [f for f in dummy_data.columns if (f not in X.columns)]\n",
    "dummy_data = dummy_data[cat_vars2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "4e6a2579-0fb5-4ca2-bf82-a0a2709fef08",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(dummy_data, y, test_size=0.3, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "846358c8-50f6-4b87-8eea-cf47fe751061",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = X_train.reset_index(drop=True)\n",
    "X_test = X_test.reset_index(drop=True)\n",
    "y_train = y_train.reset_index(drop=True)\n",
    "y_test = y_test.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "72e4cf41-3282-4d54-a677-c24abf90415a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((2806, 3673), (1203, 3673))"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "52c2fcb3-9065-424b-91a4-e0d7a3a53197",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast every column named in cat_vars2 to integer in a single pass over the training frame.\n",
    "X_train = X_train.astype({col: 'int' for col in cat_vars2})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "5ce2e2d9-8bdd-49b5-b4a1-180fe67ad8c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast every column named in cat_vars2 to integer in a single pass over the test frame.\n",
    "X_test = X_test.astype({col: 'int' for col in cat_vars2})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "3f234060-f911-419a-a1a9-df99163713dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "X2 = X_train\n",
    "X2_test = X_test"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "522b86ca-642f-465b-a431-a8b7860fb5ed",
   "metadata": {},
   "source": [
    "### One hot encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9407c138-1151-49a5-b15f-067c12f1fee9",
   "metadata": {},
   "source": [
    "#### 1. OLS (linear regression)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "5876a562-9ad5-4796-a0b3-e6a250d0ad2d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10.4667\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model = sm.OLS(y_train, sm.add_constant(X2, has_constant='skip')).fit(disp=False)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "4cad784d-cd85-42a1-b674-2281c411ccd1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DF:2124.0; R2: 0.9966340630259244\n"
     ]
    }
   ],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.rsquared}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9d98efc3-7dd7-4684-8bb1-e6b18d630451",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X2, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "d522aafa-4a4f-49af-b35c-252568e8752a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 3045.0336021443522\n"
     ]
    }
   ],
   "source": [
    "# root_mean_squared_error returns the RMSE, so the result is named rmse like in the other model sections.\n",
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "76603f6f-5dac-4e71-bdb7-43a55c592c99",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X2_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "4785b5ce-2a1b-4cde-8bc1-9c6ae32bc0f5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 113525.91595694415\n"
     ]
    }
   ],
   "source": [
    "# root_mean_squared_error returns the RMSE, so the result is named rmse like in the other model sections.\n",
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c888aa95-d9f6-4694-80ac-05b2bbe8b61c",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "ed98a66f-01db-4ea3-a8b3-a78e955be1f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBRegressor(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "df42399f-ca2b-4525-8198-fe1fa9e3c0d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9295\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X2, y_train)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "ae1430b7-aceb-4742-b934-a1eed4e907a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "40224e28-b679-452a-9534-678ae93c1867",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE: 9567.824218610094\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "d16cd905-dfbf-48f0-9741-4345c1228b35",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X2_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "b0c434f0-d915-4781-a68e-9b547167ebe4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE: 110463.02739713003\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "244d68aa-b7af-40a3-a2ea-6e1567381ccb",
   "metadata": {},
   "source": [
    "#### 3. Tabnet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "e029b062-8c8e-4a5c-b82c-ccd9678e3c44",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = TabNetRegressor(verbose=0, seed=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "ce63c09e-4b6b-4591-8a56-4188edcb512e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Early stopping occurred at epoch 46 with best_epoch = 36 and best_train_mse = 4628740926.42649\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.13/site-packages/pytorch_tabnet/callbacks.py:172: UserWarning: Best weights from best epoch are automatically used!\n",
      "  warnings.warn(wrn_msg)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "75.2704\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "model.fit(X_train=X2.to_numpy(), y_train=y_train.to_numpy().reshape(-1,1), \n",
    "          eval_set=[(X2.to_numpy(), y_train.to_numpy().reshape(-1,1))], eval_name=['train'], max_epochs=100)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "7a8ccf6e-214e-491d-af86-5169e21c0cdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X2.to_numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "9505d1b8-e91c-4240-9563-a531ca48a19f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training RMSE:68034.85082240198\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE:{rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "36acb930-6482-46e8-8436-abd22c106607",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X2_test.to_numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "49310019-ce4b-4b5a-ac8d-2bf411b4e098",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "testing RMSE:128074.62328408667\n"
     ]
    }
   ],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE:{rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27a2a623-5281-47c0-acbe-0247e59a4334",
   "metadata": {},
   "source": [
    "### Target Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "586c16d4-4a09-4b8d-8dbc-76a35d822258",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import TargetEncoder\n",
    "\n",
    "# The target (price) holds integer-valued floats, so target_type='auto' infers a multiclass problem:\n",
    "# the encoder then stratifies its internal folds by price value and emits one output per class.\n",
    "# Declare the regression target explicitly so each feature is encoded by its mean price instead.\n",
    "# random_state fixes the shuffled cross-fitting folds used by fit_transform, making reruns reproducible.\n",
    "enc_auto = TargetEncoder(smooth=\"auto\", target_type=\"continuous\", random_state=42)\n",
    "X_t = enc_auto.fit_transform(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ccddaa36-8f12-4639-9de7-436e1aa30efb",
   "metadata": {},
   "outputs": [],
   "source": [
    "output_feature_names = enc_auto.get_feature_names_out()\n",
    "X_tdf = pd.DataFrame(X_t, columns=list(output_feature_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "816e4575-c396-4d53-bcd4-c4d15f8db4b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_t_test = enc_auto.transform(X_test)\n",
    "X_tdf_test = pd.DataFrame(X_t_test, columns=list(output_feature_names))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e0a8ae9-bd27-4cfa-9c5d-dfd479d121e3",
   "metadata": {},
   "source": [
    "#### 1. OLS (linear regression)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2131938-e931-4f91-b99b-82c8f3d74226",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Time an OLS fit on the target-encoded training features.\n",
    "# add_constant(...) must be closed before .fit() so that .fit() is called on the OLS model,\n",
    "# not on the design matrix; has_constant='skip' leaves the frame unchanged if it already has a constant column.\n",
    "start = time.time()\n",
    "model = sm.OLS(y_train, sm.add_constant(X_tdf, has_constant='skip')).fit(disp=False)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3053246-108a-46e7-98ae-89ee1ac35bbc",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'DF:{model.df_model}; R2: {model.rsquared}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ccab110-ab74-4e63-9d09-0174472a644b",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X_t, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34ff4e5f-8b9a-41d3-94b7-79d64c7bf241",
   "metadata": {},
   "outputs": [],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e74c6fe-5c41-4824-aaf5-51c8be0a8a85",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(sm.add_constant(X_t_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6afec548-36a8-48fc-94ff-22707b3b1654",
   "metadata": {},
   "outputs": [],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fb66c13-ae7e-4dbd-b1ab-6564363315e4",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc923abd-b206-440d-a64e-b1177cb8642a",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBRegressor(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "647e7142-4e25-47d3-a749-b8b17a11da96",
   "metadata": {},
   "outputs": [],
   "source": [
    "start = time.time()\n",
    "model.fit(X_t, y_train)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd191fae-0925-4ace-8690-42b41085b2f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a75ecae2-9d64-47e0-9206-044b02010fc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "258270ee-c901-4715-b627-4f8ec05f7c1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_t_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1fb3037-c4af-4db7-8b95-b589c917df5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c83f64af-9ac1-4df6-8141-e1b6addac795",
   "metadata": {},
   "source": [
    "#### 3. Tabnet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c877c1b-a1f1-4e3a-8ef6-88c30a18d659",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = TabNetRegressor(verbose=0, seed=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0a6fb17-5915-4fea-9223-e4c1511c5e62",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit TabNet for at most 50 epochs and print the wall-clock fit time in\n",
    "# seconds (4 decimals). The target is reshaped to a single column (-1, 1).\n",
    "# The only evaluation set (named 'train') is the training data itself, so\n",
    "# no held-out data is monitored during fitting.\n",
    "# X_t is passed as-is; presumably it is already a NumPy array - TODO confirm.\n",
    "start = time.time()\n",
    "model.fit(X_train=X_t, y_train=y_train.to_numpy().reshape(-1,1), eval_set=[(X_t, y_train.to_numpy().reshape(-1,1))], eval_name=['train'], max_epochs=50)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "974d9a8a-ab96-45e4-b571-6ea71306180f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# In-sample predictions from the fitted TabNet model on X_t.\n",
    "y_pred = model.predict(X_t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ecee08b-6179-452d-815c-9083d150bbd3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# RMSE of the current y_pred against the training targets.\n",
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b87f3d27-61bb-4145-9f10-a8bf55c8b214",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Out-of-sample predictions on X_t_test; overwrites the in-sample y_pred.\n",
    "y_pred = model.predict(X_t_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a9ae616-b91e-4c13-825a-e45f99c14744",
   "metadata": {},
   "outputs": [],
   "source": [
    "# RMSE of the current y_pred against the test targets.\n",
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f9efe23-76f4-49e5-9130-053c9680dd64",
   "metadata": {},
   "source": [
    "### ICFESL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a438c513-69a1-4e61-a962-376a033b1239",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the ICFESL p-value threshold search in 'regression' mode on X2 / X2_test,\n",
    "# with tabnet=True and figure=True. Twenty thresholds are swept: 0.05 * i for\n",
    "# i = 1..20, i.e. nominally 0.05, 0.10, ..., 1.00.\n",
    "# NOTE(review): 0.05 * i is not exact in floating point (0.05 * 3 evaluates to\n",
    "# 0.15000000000000002); confirm that is acceptable if thresholds are used as labels.\n",
    "# Six results are unpacked; their exact contents are defined by icfesl and are\n",
    "# not visible from this cell.\n",
    "fit_info_panel, fit_figs, cluster_groups, criterions, inertias, gap_statss = icfesl.pthreshold_search_algorun(\n",
    "    X2, y_train, X2_test, y_test, cat_vars, 'regression', pvalue_thresholds = [0.05*i for i in range(1,21)], tabnet=True, figure=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2fdd89c8-6cb2-40eb-9bf6-f3494737b0f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# fit_figs must hold exactly two items; the unpacking raises ValueError otherwise.\n",
    "decision_plot, summary_plot = fit_figs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d4b5b73-afd0-4bb5-bbba-dee35487ebe2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the threshold-search results table to the working directory.\n",
    "# The file is named after this notebook's dataset (CMS hospital patient\n",
    "# satisfaction) so it cannot overwrite the output of other experiments.\n",
    "fit_info_panel.to_excel(\"hospital_satisfaction_fit_info.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f1deae4-a4d7-4108-bdd6-27929be8d2f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the decision plot to the working directory. The file is named after\n",
    "# this notebook's dataset (CMS hospital patient satisfaction) so it cannot\n",
    "# overwrite figures from other experiments.\n",
    "decision_plot.savefig('decision_plot_hospital_satisfaction.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d588595-221c-47a7-b160-cb1c22d4ea5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the summary plot to the working directory. The file is named after\n",
    "# this notebook's dataset (CMS hospital patient satisfaction) so it cannot\n",
    "# overwrite figures from other experiments.\n",
    "summary_plot.savefig('summary_plot_hospital_satisfaction.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a814f0c-ca6b-4e3b-8403-2b652a55f2cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the fifth-from-last entry of criterions.\n",
    "# NOTE(review): if criterions holds one entry per searched threshold (20 of\n",
    "# them), index -5 is the 16th threshold, nominally 0.80 - TODO confirm and\n",
    "# consider naming the index instead of hard-coding it.\n",
    "criterions[-5]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ba26263-8b9d-4e35-880e-7b077e7e9159",
   "metadata": {},
   "source": [
    "## CBind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca2a9cd3-2da2-4fd4-b673-7d62c346e657",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derive a grouping of the columns in selected_column_names from the training\n",
    "# frame X2 only (distance_threshold=0.05), then apply that same grouping to both\n",
    "# the training and the test frame so X4 and X4_test are built consistently.\n",
    "# selected_column_names is defined in an earlier cell; the exact grouping and\n",
    "# combination rules live in icfesl and are not visible here.\n",
    "cgrouping = icfesl.group_categorical_features(X2, selected_column_names, distance_threshold=0.05)\n",
    "X4 = icfesl.combine_features(X2, cgrouping)\n",
    "X4_test = icfesl.combine_features(X2_test, cgrouping)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7db5b54-7a14-401e-be2f-2670f66a5bf6",
   "metadata": {},
   "source": [
    "#### 1. OLS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11ace45f-93b7-4611-b2ba-4f1c186f2ecc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit an OLS regression of y_train on the combined features X4 and print the\n",
    "# wall-clock fit time in seconds (4 decimals).\n",
    "# add_constant(..., has_constant='skip') adds the intercept column unless X4\n",
    "# already contains a constant column.\n",
    "# The parentheses are balanced so that .fit() is called on the OLS model,\n",
    "# not on the design matrix returned by add_constant.\n",
    "# `sm` is assumed to be statsmodels.api, brought into scope by an earlier\n",
    "# cell or a star import - TODO confirm.\n",
    "start = time.time()\n",
    "model = sm.OLS(y_train, sm.add_constant(X4, has_constant='skip')).fit(disp=False)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6cd36506-6cb2-4002-bde7-75be46c788df",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report the fitted model's df_model and rsquared attributes.\n",
    "print(f'DF:{model.df_model}; R2: {model.rsquared}') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "889e61bb-0c16-4dcf-ba9f-4117720d6bd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# In-sample predictions; the design matrix is rebuilt with the same\n",
    "# add_constant call that was used for fitting.\n",
    "y_pred = model.predict(sm.add_constant(X4, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0f172ff-c995-4354-a9bd-c8490faafcba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# RMSE of the current y_pred against the training targets.\n",
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0d673fd-762b-4f6e-a943-128a05074e12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Out-of-sample predictions on X4_test; overwrites the in-sample y_pred.\n",
    "# NOTE(review): has_constant='skip' adds no intercept column when the frame\n",
    "# already contains a constant column; verify X4_test ends up with the same\n",
    "# columns as the training design matrix.\n",
    "y_pred = model.predict(sm.add_constant(X4_test, has_constant='skip'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c793bde-4c18-49f1-b63c-837ebdd72bda",
   "metadata": {},
   "outputs": [],
   "source": [
    "# RMSE of the current y_pred against the test targets.\n",
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f83b17f-caa2-48e7-a00c-fb108a9c74bb",
   "metadata": {},
   "source": [
    "#### 2. xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5752ddb-6128-4743-a7b9-f6617ac74036",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebind `model` to a fresh XGBoost regressor with 100 estimators and a\n",
    "# fixed random_state of 200.\n",
    "model = XGBRegressor(n_estimators=100, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9f35ec7-a09b-497a-b2a5-a6ff2b8a9479",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the XGBoost regressor on the combined features X4 and print the\n",
    "# wall-clock fit time in seconds, rounded to 4 decimals.\n",
    "start = time.time()\n",
    "model.fit(X4, y_train)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "859f3057-694d-4d9b-b4c7-afd9900985fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# In-sample predictions: score the same X4 the model was fitted on.\n",
    "y_pred = model.predict(X4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "642b22c0-8f68-4ebf-8b8d-37e9096a591b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# RMSE of the current y_pred against the training targets.\n",
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training rmse: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf3bde12-0c79-4078-9dcb-72ac71e2fce6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Out-of-sample predictions on X4_test; overwrites the in-sample y_pred.\n",
    "y_pred = model.predict(X4_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdfca3e5-0ebd-4eb6-a949-0c65db159b24",
   "metadata": {},
   "outputs": [],
   "source": [
    "# RMSE of the current y_pred against the test targets.\n",
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing rmse: {rmse}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5e509e0-c104-476a-a674-fcf0b6d53578",
   "metadata": {},
   "source": [
    "#### 3. Tabnet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a8a3707-e135-4134-9c05-1509030885f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebind `model` to a fresh TabNet regressor (verbose=0, fixed seed=200).\n",
    "model = TabNetRegressor(verbose=0, seed=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3cf3dd2d-2859-4c8c-84a0-2687287903bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit TabNet on the combined features for at most 50 epochs and print the\n",
    "# wall-clock fit time in seconds (4 decimals). X4 is converted with\n",
    "# .to_numpy() and the target is reshaped to a single column (-1, 1).\n",
    "# The only evaluation set (named 'train') is the training data itself, so\n",
    "# no held-out data is monitored during fitting.\n",
    "start = time.time()\n",
    "model.fit(X_train=X4.to_numpy(), y_train=y_train.to_numpy().reshape(-1,1), eval_set=[(X4.to_numpy(), y_train.to_numpy().reshape(-1,1))], eval_name=['train'],max_epochs=50)\n",
    "end = time.time()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac82f7f2-048b-4132-9eda-06f5f5d15cb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# In-sample predictions; X4 is converted with .to_numpy(), matching the\n",
    "# array form used when fitting.\n",
    "y_pred = model.predict(X4.to_numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61677d7c-9a63-4c41-8e4a-84e5a47c0fc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# RMSE of the current y_pred against the training targets.\n",
    "rmse = root_mean_squared_error(y_train, y_pred)\n",
    "print(f\"training RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de37b00f-6e30-4bb0-8a10-51f4a329c3ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Out-of-sample predictions on X4_test (converted with .to_numpy());\n",
    "# overwrites the in-sample y_pred.\n",
    "y_pred = model.predict(X4_test.to_numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac2c94e6-8ac1-4298-8652-a5a94de1c69c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# RMSE of the current y_pred against the test targets.\n",
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "print(f\"testing RMSE: {rmse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "029476ac-338c-4119-b91a-8cd7134d5a24",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4d96638-eede-428c-af4b-58ef307edeeb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
