{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import math\n",
    "\n",
    "str(math.nan) == \"Nan\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TeaRetail\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"TeaRetail\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(30801, 32)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['retained', 'esent', 'eopenrate', 'eclickrate', 'avgorder', 'ordfreq',\n",
       "       'paperless', 'refill', 'doorstep', 'created_year', 'created_month',\n",
       "       'created_day', 'created_dayofweek', 'firstorder_year',\n",
       "       'firstorder_month', 'firstorder_day', 'firstorder_dayofweek',\n",
       "       'lastorder_year', 'lastorder_month', 'lastorder_day',\n",
       "       'lastorder_dayofweek', 'favday_Friday', 'favday_Monday',\n",
       "       'favday_Saturday', 'favday_Sunday', 'favday_Thursday', 'favday_Tuesday',\n",
       "       'favday_Wednesday', 'city_BLR', 'city_BOM', 'city_DEL', 'city_MAA'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/TeaRetail/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Adult\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ucimlrepo import fetch_ucirepo\n",
    "\n",
    "# fetch dataset\n",
    "adult = fetch_ucirepo(id=2)\n",
    "df = adult.data.original\n",
    "df.to_csv(\"./../../artifacts/data/Adult/raw.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education-num</th>\n",
       "      <th>capital-gain</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>hours-per-week</th>\n",
       "      <th>workclass_Federal-gov</th>\n",
       "      <th>workclass_Local-gov</th>\n",
       "      <th>workclass_Never-worked</th>\n",
       "      <th>workclass_Private</th>\n",
       "      <th>...</th>\n",
       "      <th>native-country_Scotland</th>\n",
       "      <th>native-country_South</th>\n",
       "      <th>native-country_Taiwan</th>\n",
       "      <th>native-country_Thailand</th>\n",
       "      <th>native-country_Trinadad&amp;Tobago</th>\n",
       "      <th>native-country_United-States</th>\n",
       "      <th>native-country_Vietnam</th>\n",
       "      <th>native-country_Yugoslavia</th>\n",
       "      <th>income_&lt;=50K</th>\n",
       "      <th>income_&gt;50K</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>39</td>\n",
       "      <td>77516</td>\n",
       "      <td>13</td>\n",
       "      <td>2174</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>50</td>\n",
       "      <td>83311</td>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38</td>\n",
       "      <td>215646</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>53</td>\n",
       "      <td>234721</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>28</td>\n",
       "      <td>338409</td>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 107 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \\\n",
       "0   39   77516             13          2174             0              40   \n",
       "1   50   83311             13             0             0              13   \n",
       "2   38  215646              9             0             0              40   \n",
       "3   53  234721              7             0             0              40   \n",
       "4   28  338409             13             0             0              40   \n",
       "\n",
       "   workclass_Federal-gov  workclass_Local-gov  workclass_Never-worked  \\\n",
       "0                      0                    0                       0   \n",
       "1                      0                    0                       0   \n",
       "2                      0                    0                       0   \n",
       "3                      0                    0                       0   \n",
       "4                      0                    0                       0   \n",
       "\n",
       "   workclass_Private  ...  native-country_Scotland  native-country_South  \\\n",
       "0                  0  ...                        0                     0   \n",
       "1                  0  ...                        0                     0   \n",
       "2                  1  ...                        0                     0   \n",
       "3                  1  ...                        0                     0   \n",
       "4                  1  ...                        0                     0   \n",
       "\n",
       "   native-country_Taiwan  native-country_Thailand  \\\n",
       "0                      0                        0   \n",
       "1                      0                        0   \n",
       "2                      0                        0   \n",
       "3                      0                        0   \n",
       "4                      0                        0   \n",
       "\n",
       "   native-country_Trinadad&Tobago  native-country_United-States  \\\n",
       "0                               0                             1   \n",
       "1                               0                             1   \n",
       "2                               0                             1   \n",
       "3                               0                             1   \n",
       "4                               0                             0   \n",
       "\n",
       "   native-country_Vietnam  native-country_Yugoslavia  income_<=50K  \\\n",
       "0                       0                          0             1   \n",
       "1                       0                          0             1   \n",
       "2                       0                          0             1   \n",
       "3                       0                          0             1   \n",
       "4                       0                          0             1   \n",
       "\n",
       "   income_>50K  \n",
       "0            0  \n",
       "1            0  \n",
       "2            0  \n",
       "3            0  \n",
       "4            0  \n",
       "\n",
       "[5 rows x 107 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"Adult\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(48842, 107)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',\n",
       "       'hours-per-week', 'workclass_Federal-gov', 'workclass_Local-gov',\n",
       "       'workclass_Never-worked', 'workclass_Private',\n",
       "       ...\n",
       "       'native-country_Scotland', 'native-country_South',\n",
       "       'native-country_Taiwan', 'native-country_Thailand',\n",
       "       'native-country_Trinadad&Tobago', 'native-country_United-States',\n",
       "       'native-country_Vietnam', 'native-country_Yugoslavia', 'income_<=50K',\n",
       "       'income_>50K'],\n",
       "      dtype='object', length=107)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/Adult/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## California Housing\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>longitude</th>\n",
       "      <th>latitude</th>\n",
       "      <th>housing_median_age</th>\n",
       "      <th>total_rooms</th>\n",
       "      <th>total_bedrooms</th>\n",
       "      <th>population</th>\n",
       "      <th>households</th>\n",
       "      <th>median_income</th>\n",
       "      <th>median_house_value</th>\n",
       "      <th>ocean_proximity_&lt;1H OCEAN</th>\n",
       "      <th>ocean_proximity_INLAND</th>\n",
       "      <th>ocean_proximity_ISLAND</th>\n",
       "      <th>ocean_proximity_NEAR BAY</th>\n",
       "      <th>ocean_proximity_NEAR OCEAN</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-122.23</td>\n",
       "      <td>37.88</td>\n",
       "      <td>41.0</td>\n",
       "      <td>880.0</td>\n",
       "      <td>129.0</td>\n",
       "      <td>322.0</td>\n",
       "      <td>126.0</td>\n",
       "      <td>8.3252</td>\n",
       "      <td>452600.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-122.22</td>\n",
       "      <td>37.86</td>\n",
       "      <td>21.0</td>\n",
       "      <td>7099.0</td>\n",
       "      <td>1106.0</td>\n",
       "      <td>2401.0</td>\n",
       "      <td>1138.0</td>\n",
       "      <td>8.3014</td>\n",
       "      <td>358500.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-122.24</td>\n",
       "      <td>37.85</td>\n",
       "      <td>52.0</td>\n",
       "      <td>1467.0</td>\n",
       "      <td>190.0</td>\n",
       "      <td>496.0</td>\n",
       "      <td>177.0</td>\n",
       "      <td>7.2574</td>\n",
       "      <td>352100.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-122.25</td>\n",
       "      <td>37.85</td>\n",
       "      <td>52.0</td>\n",
       "      <td>1274.0</td>\n",
       "      <td>235.0</td>\n",
       "      <td>558.0</td>\n",
       "      <td>219.0</td>\n",
       "      <td>5.6431</td>\n",
       "      <td>341300.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-122.25</td>\n",
       "      <td>37.85</td>\n",
       "      <td>52.0</td>\n",
       "      <td>1627.0</td>\n",
       "      <td>280.0</td>\n",
       "      <td>565.0</td>\n",
       "      <td>259.0</td>\n",
       "      <td>3.8462</td>\n",
       "      <td>342200.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
       "0    -122.23     37.88                41.0        880.0           129.0   \n",
       "1    -122.22     37.86                21.0       7099.0          1106.0   \n",
       "2    -122.24     37.85                52.0       1467.0           190.0   \n",
       "3    -122.25     37.85                52.0       1274.0           235.0   \n",
       "4    -122.25     37.85                52.0       1627.0           280.0   \n",
       "\n",
       "   population  households  median_income  median_house_value  \\\n",
       "0       322.0       126.0         8.3252            452600.0   \n",
       "1      2401.0      1138.0         8.3014            358500.0   \n",
       "2       496.0       177.0         7.2574            352100.0   \n",
       "3       558.0       219.0         5.6431            341300.0   \n",
       "4       565.0       259.0         3.8462            342200.0   \n",
       "\n",
       "   ocean_proximity_<1H OCEAN  ocean_proximity_INLAND  ocean_proximity_ISLAND  \\\n",
       "0                          0                       0                       0   \n",
       "1                          0                       0                       0   \n",
       "2                          0                       0                       0   \n",
       "3                          0                       0                       0   \n",
       "4                          0                       0                       0   \n",
       "\n",
       "   ocean_proximity_NEAR BAY  ocean_proximity_NEAR OCEAN  \n",
       "0                         1                           0  \n",
       "1                         1                           0  \n",
       "2                         1                           0  \n",
       "3                         1                           0  \n",
       "4                         1                           0  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Download csv from https://www.kaggle.com/datasets/camnugent/california-housing-prices?resource=download\n",
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"CaliforniaHousing\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(20640, 14)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',\n",
       "       'total_bedrooms', 'population', 'households', 'median_income',\n",
       "       'median_house_value', 'ocean_proximity_<1H OCEAN',\n",
       "       'ocean_proximity_INLAND', 'ocean_proximity_ISLAND',\n",
       "       'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/CaliforniaHousing/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bank Marketing\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ucimlrepo import fetch_ucirepo\n",
    "\n",
    "# fetch dataset\n",
    "bank_marketing = fetch_ucirepo(id=222)\n",
    "df = bank_marketing.data.original\n",
    "df.to_csv(\"./../../artifacts/data/BankMarketing/raw.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>balance</th>\n",
       "      <th>day_of_week</th>\n",
       "      <th>duration</th>\n",
       "      <th>campaign</th>\n",
       "      <th>pdays</th>\n",
       "      <th>previous</th>\n",
       "      <th>job_admin.</th>\n",
       "      <th>job_blue-collar</th>\n",
       "      <th>job_entrepreneur</th>\n",
       "      <th>...</th>\n",
       "      <th>month_jan</th>\n",
       "      <th>month_jul</th>\n",
       "      <th>month_jun</th>\n",
       "      <th>month_mar</th>\n",
       "      <th>month_may</th>\n",
       "      <th>month_nov</th>\n",
       "      <th>month_oct</th>\n",
       "      <th>month_sep</th>\n",
       "      <th>y_no</th>\n",
       "      <th>y_yes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>58</td>\n",
       "      <td>2143</td>\n",
       "      <td>5</td>\n",
       "      <td>261</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>44</td>\n",
       "      <td>29</td>\n",
       "      <td>5</td>\n",
       "      <td>151</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>33</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>76</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>47</td>\n",
       "      <td>1506</td>\n",
       "      <td>5</td>\n",
       "      <td>92</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>198</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 46 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   age  balance  day_of_week  duration  campaign  pdays  previous  job_admin.  \\\n",
       "0   58     2143            5       261         1     -1         0           0   \n",
       "1   44       29            5       151         1     -1         0           0   \n",
       "2   33        2            5        76         1     -1         0           0   \n",
       "3   47     1506            5        92         1     -1         0           0   \n",
       "4   33        1            5       198         1     -1         0           0   \n",
       "\n",
       "   job_blue-collar  job_entrepreneur  ...  month_jan  month_jul  month_jun  \\\n",
       "0                0                 0  ...          0          0          0   \n",
       "1                0                 0  ...          0          0          0   \n",
       "2                0                 1  ...          0          0          0   \n",
       "3                1                 0  ...          0          0          0   \n",
       "4                1                 0  ...          0          0          0   \n",
       "\n",
       "   month_mar  month_may  month_nov  month_oct  month_sep  y_no  y_yes  \n",
       "0          0          1          0          0          0     1      0  \n",
       "1          0          1          0          0          0     1      0  \n",
       "2          0          1          0          0          0     1      0  \n",
       "3          0          1          0          0          0     1      0  \n",
       "4          0          1          0          0          0     1      0  \n",
       "\n",
       "[5 rows x 46 columns]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"BankMarketing\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(45211, 46)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['age', 'balance', 'day_of_week', 'duration', 'campaign', 'pdays',\n",
       "       'previous', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',\n",
       "       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',\n",
       "       'job_services', 'job_student', 'job_technician', 'job_unemployed',\n",
       "       'marital_divorced', 'marital_married', 'marital_single',\n",
       "       'education_primary', 'education_secondary', 'education_tertiary',\n",
       "       'default_no', 'default_yes', 'housing_no', 'housing_yes', 'loan_no',\n",
       "       'loan_yes', 'contact_cellular', 'contact_telephone', 'month_apr',\n",
       "       'month_aug', 'month_dec', 'month_feb', 'month_jan', 'month_jul',\n",
       "       'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct',\n",
       "       'month_sep', 'y_no', 'y_yes'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/BankMarketing/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Blastchar\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SeniorCitizen</th>\n",
       "      <th>tenure</th>\n",
       "      <th>MonthlyCharges</th>\n",
       "      <th>TotalCharges</th>\n",
       "      <th>gender_Female</th>\n",
       "      <th>gender_Male</th>\n",
       "      <th>Partner_No</th>\n",
       "      <th>Partner_Yes</th>\n",
       "      <th>Dependents_No</th>\n",
       "      <th>Dependents_Yes</th>\n",
       "      <th>...</th>\n",
       "      <th>Contract_One year</th>\n",
       "      <th>Contract_Two year</th>\n",
       "      <th>PaperlessBilling_No</th>\n",
       "      <th>PaperlessBilling_Yes</th>\n",
       "      <th>PaymentMethod_Bank transfer (automatic)</th>\n",
       "      <th>PaymentMethod_Credit card (automatic)</th>\n",
       "      <th>PaymentMethod_Electronic check</th>\n",
       "      <th>PaymentMethod_Mailed check</th>\n",
       "      <th>Churn_No</th>\n",
       "      <th>Churn_Yes</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>customerID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7590-VHVEG</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>29.85</td>\n",
       "      <td>29.85</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5575-GNVDE</th>\n",
       "      <td>0</td>\n",
       "      <td>34</td>\n",
       "      <td>56.95</td>\n",
       "      <td>1889.50</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3668-QPYBK</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>53.85</td>\n",
       "      <td>108.15</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7795-CFOCW</th>\n",
       "      <td>0</td>\n",
       "      <td>45</td>\n",
       "      <td>42.30</td>\n",
       "      <td>1840.75</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9237-HQITU</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>70.70</td>\n",
       "      <td>151.65</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 47 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            SeniorCitizen  tenure  MonthlyCharges  TotalCharges  \\\n",
       "customerID                                                        \n",
       "7590-VHVEG              0       1           29.85         29.85   \n",
       "5575-GNVDE              0      34           56.95       1889.50   \n",
       "3668-QPYBK              0       2           53.85        108.15   \n",
       "7795-CFOCW              0      45           42.30       1840.75   \n",
       "9237-HQITU              0       2           70.70        151.65   \n",
       "\n",
       "            gender_Female  gender_Male  Partner_No  Partner_Yes  \\\n",
       "customerID                                                        \n",
       "7590-VHVEG              1            0           0            1   \n",
       "5575-GNVDE              0            1           1            0   \n",
       "3668-QPYBK              0            1           1            0   \n",
       "7795-CFOCW              0            1           1            0   \n",
       "9237-HQITU              1            0           1            0   \n",
       "\n",
       "            Dependents_No  Dependents_Yes  ...  Contract_One year  \\\n",
       "customerID                                 ...                      \n",
       "7590-VHVEG              1               0  ...                  0   \n",
       "5575-GNVDE              1               0  ...                  1   \n",
       "3668-QPYBK              1               0  ...                  0   \n",
       "7795-CFOCW              1               0  ...                  1   \n",
       "9237-HQITU              1               0  ...                  0   \n",
       "\n",
       "            Contract_Two year  PaperlessBilling_No  PaperlessBilling_Yes  \\\n",
       "customerID                                                                 \n",
       "7590-VHVEG                  0                    0                     1   \n",
       "5575-GNVDE                  0                    1                     0   \n",
       "3668-QPYBK                  0                    0                     1   \n",
       "7795-CFOCW                  0                    1                     0   \n",
       "9237-HQITU                  0                    0                     1   \n",
       "\n",
       "            PaymentMethod_Bank transfer (automatic)  \\\n",
       "customerID                                            \n",
       "7590-VHVEG                                        0   \n",
       "5575-GNVDE                                        0   \n",
       "3668-QPYBK                                        0   \n",
       "7795-CFOCW                                        1   \n",
       "9237-HQITU                                        0   \n",
       "\n",
       "            PaymentMethod_Credit card (automatic)  \\\n",
       "customerID                                          \n",
       "7590-VHVEG                                      0   \n",
       "5575-GNVDE                                      0   \n",
       "3668-QPYBK                                      0   \n",
       "7795-CFOCW                                      0   \n",
       "9237-HQITU                                      0   \n",
       "\n",
       "            PaymentMethod_Electronic check  PaymentMethod_Mailed check  \\\n",
       "customerID                                                               \n",
       "7590-VHVEG                               1                           0   \n",
       "5575-GNVDE                               0                           1   \n",
       "3668-QPYBK                               0                           1   \n",
       "7795-CFOCW                               0                           0   \n",
       "9237-HQITU                               1                           0   \n",
       "\n",
       "            Churn_No  Churn_Yes  \n",
       "customerID                       \n",
       "7590-VHVEG         1          0  \n",
       "5575-GNVDE         1          0  \n",
       "3668-QPYBK         0          1  \n",
       "7795-CFOCW         1          0  \n",
       "9237-HQITU         0          1  \n",
       "\n",
       "[5 rows x 47 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Downloaded from https://www.kaggle.com/datasets/blastchar/telco-customer-churn\n",
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"BlastChar\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(7043, 47)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',\n",
       "       'gender_Female', 'gender_Male', 'Partner_No', 'Partner_Yes',\n",
       "       'Dependents_No', 'Dependents_Yes', 'PhoneService_No',\n",
       "       'PhoneService_Yes', 'MultipleLines_No',\n",
       "       'MultipleLines_No phone service', 'MultipleLines_Yes',\n",
       "       'InternetService_DSL', 'InternetService_Fiber optic',\n",
       "       'InternetService_No', 'OnlineSecurity_No',\n",
       "       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',\n",
       "       'OnlineBackup_No', 'OnlineBackup_No internet service',\n",
       "       'OnlineBackup_Yes', 'DeviceProtection_No',\n",
       "       'DeviceProtection_No internet service', 'DeviceProtection_Yes',\n",
       "       'TechSupport_No', 'TechSupport_No internet service', 'TechSupport_Yes',\n",
       "       'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes',\n",
       "       'StreamingMovies_No', 'StreamingMovies_No internet service',\n",
       "       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',\n",
       "       'Contract_Two year', 'PaperlessBilling_No', 'PaperlessBilling_Yes',\n",
       "       'PaymentMethod_Bank transfer (automatic)',\n",
       "       'PaymentMethod_Credit card (automatic)',\n",
       "       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',\n",
       "       'Churn_No', 'Churn_Yes'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/BlastChar/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Shoppers\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ucimlrepo import fetch_ucirepo\n",
    "\n",
    "# fetch dataset\n",
    "online_shoppers = fetch_ucirepo(id=468)\n",
    "df = online_shoppers.data.original\n",
    "df.to_csv(\"./../../artifacts/data/Shoppers/raw.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Administrative</th>\n",
       "      <th>Administrative_Duration</th>\n",
       "      <th>Informational</th>\n",
       "      <th>Informational_Duration</th>\n",
       "      <th>ProductRelated</th>\n",
       "      <th>ProductRelated_Duration</th>\n",
       "      <th>BounceRates</th>\n",
       "      <th>ExitRates</th>\n",
       "      <th>PageValues</th>\n",
       "      <th>SpecialDay</th>\n",
       "      <th>Month</th>\n",
       "      <th>OperatingSystems</th>\n",
       "      <th>Browser</th>\n",
       "      <th>Region</th>\n",
       "      <th>TrafficType</th>\n",
       "      <th>Weekend</th>\n",
       "      <th>Revenue</th>\n",
       "      <th>VisitorType_New_Visitor</th>\n",
       "      <th>VisitorType_Other</th>\n",
       "      <th>VisitorType_Returning_Visitor</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.20</td>\n",
       "      <td>0.20</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>64.000000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.10</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.20</td>\n",
       "      <td>0.20</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>9</td>\n",
       "      <td>3</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2.666667</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.14</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10</td>\n",
       "      <td>627.500000</td>\n",
       "      <td>0.02</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Administrative  Administrative_Duration  Informational  \\\n",
       "0               0                      0.0              0   \n",
       "1               0                      0.0              0   \n",
       "2               0                      0.0              0   \n",
       "3               0                      0.0              0   \n",
       "4               0                      0.0              0   \n",
       "\n",
       "   Informational_Duration  ProductRelated  ProductRelated_Duration  \\\n",
       "0                     0.0               1                 0.000000   \n",
       "1                     0.0               2                64.000000   \n",
       "2                     0.0               1                 0.000000   \n",
       "3                     0.0               2                 2.666667   \n",
       "4                     0.0              10               627.500000   \n",
       "\n",
       "   BounceRates  ExitRates  PageValues  SpecialDay  Month  OperatingSystems  \\\n",
       "0         0.20       0.20         0.0         0.0      2                 1   \n",
       "1         0.00       0.10         0.0         0.0      2                 2   \n",
       "2         0.20       0.20         0.0         0.0      2                 4   \n",
       "3         0.05       0.14         0.0         0.0      2                 3   \n",
       "4         0.02       0.05         0.0         0.0      2                 3   \n",
       "\n",
       "   Browser  Region  TrafficType  Weekend  Revenue  VisitorType_New_Visitor  \\\n",
       "0        1       1            1    False    False                        0   \n",
       "1        2       1            2    False    False                        0   \n",
       "2        1       9            3    False    False                        0   \n",
       "3        2       2            4    False    False                        0   \n",
       "4        3       1            4     True    False                        0   \n",
       "\n",
       "   VisitorType_Other  VisitorType_Returning_Visitor  \n",
       "0                  0                              1  \n",
       "1                  0                              1  \n",
       "2                  0                              1  \n",
       "3                  0                              1  \n",
       "4                  0                              1  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"Shoppers\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(12330, 20)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['Administrative', 'Administrative_Duration', 'Informational',\n",
       "       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',\n",
       "       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',\n",
       "       'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'Weekend',\n",
       "       'Revenue', 'VisitorType_New_Visitor', 'VisitorType_Other',\n",
       "       'VisitorType_Returning_Visitor'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/Shoppers/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ChurnModelling\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CustomerId</th>\n",
       "      <th>CreditScore</th>\n",
       "      <th>Age</th>\n",
       "      <th>Tenure</th>\n",
       "      <th>Balance</th>\n",
       "      <th>NumOfProducts</th>\n",
       "      <th>HasCrCard</th>\n",
       "      <th>IsActiveMember</th>\n",
       "      <th>EstimatedSalary</th>\n",
       "      <th>Exited</th>\n",
       "      <th>...</th>\n",
       "      <th>Surname_Zubarev</th>\n",
       "      <th>Surname_Zubareva</th>\n",
       "      <th>Surname_Zuev</th>\n",
       "      <th>Surname_Zuyev</th>\n",
       "      <th>Surname_Zuyeva</th>\n",
       "      <th>Geography_France</th>\n",
       "      <th>Geography_Germany</th>\n",
       "      <th>Geography_Spain</th>\n",
       "      <th>Gender_Female</th>\n",
       "      <th>Gender_Male</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RowNumber</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>15634602</td>\n",
       "      <td>619</td>\n",
       "      <td>42</td>\n",
       "      <td>2</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>101348.88</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>15647311</td>\n",
       "      <td>608</td>\n",
       "      <td>41</td>\n",
       "      <td>1</td>\n",
       "      <td>83807.86</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>112542.58</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>15619304</td>\n",
       "      <td>502</td>\n",
       "      <td>42</td>\n",
       "      <td>8</td>\n",
       "      <td>159660.80</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113931.57</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>15701354</td>\n",
       "      <td>699</td>\n",
       "      <td>39</td>\n",
       "      <td>1</td>\n",
       "      <td>0.00</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>93826.63</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>15737888</td>\n",
       "      <td>850</td>\n",
       "      <td>43</td>\n",
       "      <td>2</td>\n",
       "      <td>125510.82</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>79084.10</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 2947 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           CustomerId  CreditScore  Age  Tenure    Balance  NumOfProducts  \\\n",
       "RowNumber                                                                   \n",
       "1            15634602          619   42       2       0.00              1   \n",
       "2            15647311          608   41       1   83807.86              1   \n",
       "3            15619304          502   42       8  159660.80              3   \n",
       "4            15701354          699   39       1       0.00              2   \n",
       "5            15737888          850   43       2  125510.82              1   \n",
       "\n",
       "           HasCrCard  IsActiveMember  EstimatedSalary  Exited  ...  \\\n",
       "RowNumber                                                      ...   \n",
       "1                  1               1        101348.88       1  ...   \n",
       "2                  0               1        112542.58       0  ...   \n",
       "3                  1               0        113931.57       1  ...   \n",
       "4                  0               0         93826.63       0  ...   \n",
       "5                  1               1         79084.10       0  ...   \n",
       "\n",
       "           Surname_Zubarev  Surname_Zubareva  Surname_Zuev  Surname_Zuyev  \\\n",
       "RowNumber                                                                   \n",
       "1                        0                 0             0              0   \n",
       "2                        0                 0             0              0   \n",
       "3                        0                 0             0              0   \n",
       "4                        0                 0             0              0   \n",
       "5                        0                 0             0              0   \n",
       "\n",
       "           Surname_Zuyeva  Geography_France  Geography_Germany  \\\n",
       "RowNumber                                                        \n",
       "1                       0                 1                  0   \n",
       "2                       0                 0                  0   \n",
       "3                       0                 1                  0   \n",
       "4                       0                 1                  0   \n",
       "5                       0                 0                  0   \n",
       "\n",
       "           Geography_Spain  Gender_Female  Gender_Male  \n",
       "RowNumber                                               \n",
       "1                        0              1            0  \n",
       "2                        1              1            0  \n",
       "3                        0              1            0  \n",
       "4                        0              1            0  \n",
       "5                        1              1            0  \n",
       "\n",
       "[5 rows x 2947 columns]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Downloaded from https://www.kaggle.com/datasets/shrutimechlearn/churn-modelling\n",
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"ChurnModelling\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10000, 2947)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance',\n",
       "       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',\n",
       "       'Exited',\n",
       "       ...\n",
       "       'Surname_Zubarev', 'Surname_Zubareva', 'Surname_Zuev', 'Surname_Zuyev',\n",
       "       'Surname_Zuyeva', 'Geography_France', 'Geography_Germany',\n",
       "       'Geography_Spain', 'Gender_Female', 'Gender_Male'],\n",
       "      dtype='object', length=2947)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/ChurnModelling/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Students\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ucimlrepo import fetch_ucirepo\n",
    "\n",
    "# fetch dataset\n",
    "predict_students = fetch_ucirepo(id=697)\n",
    "df = predict_students.data.original\n",
    "df.to_csv(\"./../../artifacts/data/Students/raw.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Marital Status</th>\n",
       "      <th>Application mode</th>\n",
       "      <th>Application order</th>\n",
       "      <th>Course</th>\n",
       "      <th>Daytime/evening attendance</th>\n",
       "      <th>Previous qualification</th>\n",
       "      <th>Previous qualification (grade)</th>\n",
       "      <th>Nacionality</th>\n",
       "      <th>Mother's qualification</th>\n",
       "      <th>...</th>\n",
       "      <th>Curricular units 2nd sem (evaluations)</th>\n",
       "      <th>Curricular units 2nd sem (approved)</th>\n",
       "      <th>Curricular units 2nd sem (grade)</th>\n",
       "      <th>Curricular units 2nd sem (without evaluations)</th>\n",
       "      <th>Unemployment rate</th>\n",
       "      <th>Inflation rate</th>\n",
       "      <th>GDP</th>\n",
       "      <th>Target_Dropout</th>\n",
       "      <th>Target_Enrolled</th>\n",
       "      <th>Target_Graduate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>5</td>\n",
       "      <td>171</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>122.0</td>\n",
       "      <td>1</td>\n",
       "      <td>19</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>10.8</td>\n",
       "      <td>1.4</td>\n",
       "      <td>1.74</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>15</td>\n",
       "      <td>1</td>\n",
       "      <td>9254</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>160.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>13.666667</td>\n",
       "      <td>0</td>\n",
       "      <td>13.9</td>\n",
       "      <td>-0.3</td>\n",
       "      <td>0.79</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>9070</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>122.0</td>\n",
       "      <td>1</td>\n",
       "      <td>37</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>10.8</td>\n",
       "      <td>1.4</td>\n",
       "      <td>1.74</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>2</td>\n",
       "      <td>9773</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>122.0</td>\n",
       "      <td>1</td>\n",
       "      <td>38</td>\n",
       "      <td>...</td>\n",
       "      <td>10</td>\n",
       "      <td>5</td>\n",
       "      <td>12.400000</td>\n",
       "      <td>0</td>\n",
       "      <td>9.4</td>\n",
       "      <td>-0.8</td>\n",
       "      <td>-3.12</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>39</td>\n",
       "      <td>1</td>\n",
       "      <td>8014</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>100.0</td>\n",
       "      <td>1</td>\n",
       "      <td>37</td>\n",
       "      <td>...</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>13.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>13.9</td>\n",
       "      <td>-0.3</td>\n",
       "      <td>0.79</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 40 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0  Marital Status  Application mode  Application order  Course  \\\n",
       "0           0               1                17                  5     171   \n",
       "1           1               1                15                  1    9254   \n",
       "2           2               1                 1                  5    9070   \n",
       "3           3               1                17                  2    9773   \n",
       "4           4               2                39                  1    8014   \n",
       "\n",
       "   Daytime/evening attendance  Previous qualification  \\\n",
       "0                           1                       1   \n",
       "1                           1                       1   \n",
       "2                           1                       1   \n",
       "3                           1                       1   \n",
       "4                           0                       1   \n",
       "\n",
       "   Previous qualification (grade)  Nacionality  Mother's qualification  ...  \\\n",
       "0                           122.0            1                      19  ...   \n",
       "1                           160.0            1                       1  ...   \n",
       "2                           122.0            1                      37  ...   \n",
       "3                           122.0            1                      38  ...   \n",
       "4                           100.0            1                      37  ...   \n",
       "\n",
       "   Curricular units 2nd sem (evaluations)  \\\n",
       "0                                       0   \n",
       "1                                       6   \n",
       "2                                       0   \n",
       "3                                      10   \n",
       "4                                       6   \n",
       "\n",
       "   Curricular units 2nd sem (approved)  Curricular units 2nd sem (grade)  \\\n",
       "0                                    0                          0.000000   \n",
       "1                                    6                         13.666667   \n",
       "2                                    0                          0.000000   \n",
       "3                                    5                         12.400000   \n",
       "4                                    6                         13.000000   \n",
       "\n",
       "   Curricular units 2nd sem (without evaluations)  Unemployment rate  \\\n",
       "0                                               0               10.8   \n",
       "1                                               0               13.9   \n",
       "2                                               0               10.8   \n",
       "3                                               0                9.4   \n",
       "4                                               0               13.9   \n",
       "\n",
       "   Inflation rate   GDP  Target_Dropout  Target_Enrolled  Target_Graduate  \n",
       "0             1.4  1.74               1                0                0  \n",
       "1            -0.3  0.79               0                0                1  \n",
       "2             1.4  1.74               1                0                0  \n",
       "3            -0.8 -3.12               0                0                1  \n",
       "4            -0.3  0.79               0                0                1  \n",
       "\n",
       "[5 rows x 40 columns]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"Students\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(4424, 40)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['Unnamed: 0', 'Marital Status', 'Application mode', 'Application order',\n",
       "       'Course', 'Daytime/evening attendance', 'Previous qualification',\n",
       "       'Previous qualification (grade)', 'Nacionality',\n",
       "       'Mother's qualification', 'Father's qualification',\n",
       "       'Mother's occupation', 'Father's occupation', 'Admission grade',\n",
       "       'Displaced', 'Educational special needs', 'Debtor',\n",
       "       'Tuition fees up to date', 'Gender', 'Scholarship holder',\n",
       "       'Age at enrollment', 'International',\n",
       "       'Curricular units 1st sem (credited)',\n",
       "       'Curricular units 1st sem (enrolled)',\n",
       "       'Curricular units 1st sem (evaluations)',\n",
       "       'Curricular units 1st sem (approved)',\n",
       "       'Curricular units 1st sem (grade)',\n",
       "       'Curricular units 1st sem (without evaluations)',\n",
       "       'Curricular units 2nd sem (credited)',\n",
       "       'Curricular units 2nd sem (enrolled)',\n",
       "       'Curricular units 2nd sem (evaluations)',\n",
       "       'Curricular units 2nd sem (approved)',\n",
       "       'Curricular units 2nd sem (grade)',\n",
       "       'Curricular units 2nd sem (without evaluations)', 'Unemployment rate',\n",
       "       'Inflation rate', 'GDP', 'Target_Dropout', 'Target_Enrolled',\n",
       "       'Target_Graduate'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/Students/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Support2\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ucimlrepo import fetch_ucirepo\n",
    "\n",
    "# fetch dataset\n",
    "support2 = fetch_ucirepo(id=880)\n",
    "df = support2.data.original\n",
    "df.to_csv(\"./../../artifacts/data/Support2/raw.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>age</th>\n",
       "      <th>death</th>\n",
       "      <th>hospdead</th>\n",
       "      <th>slos</th>\n",
       "      <th>d.time</th>\n",
       "      <th>num.co</th>\n",
       "      <th>edu</th>\n",
       "      <th>scoma</th>\n",
       "      <th>charges</th>\n",
       "      <th>...</th>\n",
       "      <th>ca_no</th>\n",
       "      <th>ca_yes</th>\n",
       "      <th>dnr_dnr after sadm</th>\n",
       "      <th>dnr_dnr before sadm</th>\n",
       "      <th>dnr_no dnr</th>\n",
       "      <th>sfdm2_&lt;2 mo. follow-up</th>\n",
       "      <th>sfdm2_Coma or Intub</th>\n",
       "      <th>sfdm2_SIP&gt;=30</th>\n",
       "      <th>sfdm2_adl&gt;=4 (&gt;=5 if sur)</th>\n",
       "      <th>sfdm2_no(M2 and SIP pres)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>62.84998</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>2029</td>\n",
       "      <td>0</td>\n",
       "      <td>11.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9715.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>60.33899</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>44.0</td>\n",
       "      <td>34496.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>52.74698</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>17</td>\n",
       "      <td>47</td>\n",
       "      <td>2</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>41094.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>42.38498</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>133</td>\n",
       "      <td>2</td>\n",
       "      <td>11.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3075.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>79.88495</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>2029</td>\n",
       "      <td>1</td>\n",
       "      <td>11.747691</td>\n",
       "      <td>26.0</td>\n",
       "      <td>50127.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 72 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id       age  death  hospdead  slos  d.time  num.co        edu  scoma  \\\n",
       "0   1  62.84998      0         0     5    2029       0  11.000000    0.0   \n",
       "1   2  60.33899      1         1     4       4       2  12.000000   44.0   \n",
       "2   3  52.74698      1         0    17      47       2  12.000000    0.0   \n",
       "3   4  42.38498      1         0     3     133       2  11.000000    0.0   \n",
       "4   5  79.88495      0         0    16    2029       1  11.747691   26.0   \n",
       "\n",
       "   charges  ...  ca_no  ca_yes  dnr_dnr after sadm  dnr_dnr before sadm  \\\n",
       "0   9715.0  ...      0       0                   0                    0   \n",
       "1  34496.0  ...      1       0                   0                    0   \n",
       "2  41094.0  ...      1       0                   0                    0   \n",
       "3   3075.0  ...      0       0                   0                    0   \n",
       "4  50127.0  ...      1       0                   0                    0   \n",
       "\n",
       "   dnr_no dnr  sfdm2_<2 mo. follow-up  sfdm2_Coma or Intub  sfdm2_SIP>=30  \\\n",
       "0           1                       1                    0              0   \n",
       "1           1                       1                    0              0   \n",
       "2           1                       1                    0              0   \n",
       "3           1                       0                    0              0   \n",
       "4           1                       0                    0              0   \n",
       "\n",
       "   sfdm2_adl>=4 (>=5 if sur)  sfdm2_no(M2 and SIP pres)  \n",
       "0                          0                          0  \n",
       "1                          0                          0  \n",
       "2                          0                          0  \n",
       "3                          0                          1  \n",
       "4                          0                          1  \n",
       "\n",
       "[5 rows x 72 columns]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"Support2\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(9105, 72)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['id', 'age', 'death', 'hospdead', 'slos', 'd.time', 'num.co', 'edu',\n",
       "       'scoma', 'charges', 'totcst', 'totmcst', 'avtisst', 'sps', 'aps',\n",
       "       'surv2m', 'surv6m', 'hday', 'diabetes', 'dementia', 'prg2m', 'prg6m',\n",
       "       'dnrday', 'meanbp', 'wblc', 'hrt', 'resp', 'temp', 'pafi', 'alb',\n",
       "       'bili', 'crea', 'sod', 'ph', 'glucose', 'bun', 'adls', 'adlsc',\n",
       "       'sex_female', 'sex_male', 'dzgroup_ARF/MOSF w/Sepsis', 'dzgroup_CHF',\n",
       "       'dzgroup_COPD', 'dzgroup_Cirrhosis', 'dzgroup_Colon Cancer',\n",
       "       'dzgroup_Coma', 'dzgroup_Lung Cancer', 'dzgroup_MOSF w/Malig',\n",
       "       'dzclass_ARF/MOSF', 'dzclass_COPD/CHF/Cirrhosis', 'dzclass_Cancer',\n",
       "       'dzclass_Coma', 'income_$11-$25k', 'income_$25-$50k', 'income_>$50k',\n",
       "       'income_under $11k', 'race_asian', 'race_black', 'race_hispanic',\n",
       "       'race_other', 'race_white', 'ca_metastatic', 'ca_no', 'ca_yes',\n",
       "       'dnr_dnr after sadm', 'dnr_dnr before sadm', 'dnr_no dnr',\n",
       "       'sfdm2_<2 mo. follow-up', 'sfdm2_Coma or Intub', 'sfdm2_SIP>=30',\n",
       "       'sfdm2_adl>=4 (>=5 if sur)', 'sfdm2_no(M2 and SIP pres)'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/Support2/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Abalone\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ucimlrepo import fetch_ucirepo\n",
    "\n",
    "# fetch dataset\n",
    "support2 = fetch_ucirepo(id=1)\n",
    "df = support2.data.original\n",
    "df.to_csv(\"./../../artifacts/data/Abalone/raw.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Length</th>\n",
       "      <th>Diameter</th>\n",
       "      <th>Height</th>\n",
       "      <th>Whole_weight</th>\n",
       "      <th>Shucked_weight</th>\n",
       "      <th>Viscera_weight</th>\n",
       "      <th>Shell_weight</th>\n",
       "      <th>Rings</th>\n",
       "      <th>Sex_F</th>\n",
       "      <th>Sex_I</th>\n",
       "      <th>Sex_M</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.455</td>\n",
       "      <td>0.365</td>\n",
       "      <td>0.095</td>\n",
       "      <td>0.5140</td>\n",
       "      <td>0.2245</td>\n",
       "      <td>0.1010</td>\n",
       "      <td>0.150</td>\n",
       "      <td>15</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.350</td>\n",
       "      <td>0.265</td>\n",
       "      <td>0.090</td>\n",
       "      <td>0.2255</td>\n",
       "      <td>0.0995</td>\n",
       "      <td>0.0485</td>\n",
       "      <td>0.070</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.530</td>\n",
       "      <td>0.420</td>\n",
       "      <td>0.135</td>\n",
       "      <td>0.6770</td>\n",
       "      <td>0.2565</td>\n",
       "      <td>0.1415</td>\n",
       "      <td>0.210</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.440</td>\n",
       "      <td>0.365</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.5160</td>\n",
       "      <td>0.2155</td>\n",
       "      <td>0.1140</td>\n",
       "      <td>0.155</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.330</td>\n",
       "      <td>0.255</td>\n",
       "      <td>0.080</td>\n",
       "      <td>0.2050</td>\n",
       "      <td>0.0895</td>\n",
       "      <td>0.0395</td>\n",
       "      <td>0.055</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Length  Diameter  Height  Whole_weight  Shucked_weight  Viscera_weight  \\\n",
       "0   0.455     0.365   0.095        0.5140          0.2245          0.1010   \n",
       "1   0.350     0.265   0.090        0.2255          0.0995          0.0485   \n",
       "2   0.530     0.420   0.135        0.6770          0.2565          0.1415   \n",
       "3   0.440     0.365   0.125        0.5160          0.2155          0.1140   \n",
       "4   0.330     0.255   0.080        0.2050          0.0895          0.0395   \n",
       "\n",
       "   Shell_weight  Rings  Sex_F  Sex_I  Sex_M  \n",
       "0         0.150     15      0      0      1  \n",
       "1         0.070      7      0      0      1  \n",
       "2         0.210      9      1      0      0  \n",
       "3         0.155     10      0      0      1  \n",
       "4         0.055      7      0      1      0  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"Abalone\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(4177, 11)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight',\n",
       "       'Viscera_weight', 'Shell_weight', 'Rings', 'Sex_F', 'Sex_I', 'Sex_M'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/Abalone/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parkinsons\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ucimlrepo import fetch_ucirepo\n",
    "\n",
    "# fetch dataset\n",
    "support2 = fetch_ucirepo(id=189)\n",
    "df = support2.data.original\n",
    "df.to_csv(\"./../../artifacts/data/Parkinsons/raw.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>subject#</th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>test_time</th>\n",
       "      <th>Jitter(%)</th>\n",
       "      <th>Jitter(Abs)</th>\n",
       "      <th>Jitter:RAP</th>\n",
       "      <th>Jitter:PPQ5</th>\n",
       "      <th>Jitter:DDP</th>\n",
       "      <th>Shimmer</th>\n",
       "      <th>...</th>\n",
       "      <th>Shimmer:APQ5</th>\n",
       "      <th>Shimmer:APQ11</th>\n",
       "      <th>Shimmer:DDA</th>\n",
       "      <th>NHR</th>\n",
       "      <th>HNR</th>\n",
       "      <th>RPDE</th>\n",
       "      <th>DFA</th>\n",
       "      <th>PPE</th>\n",
       "      <th>motor_UPDRS</th>\n",
       "      <th>total_UPDRS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>72</td>\n",
       "      <td>0</td>\n",
       "      <td>5.6431</td>\n",
       "      <td>0.00662</td>\n",
       "      <td>0.000034</td>\n",
       "      <td>0.00401</td>\n",
       "      <td>0.00317</td>\n",
       "      <td>0.01204</td>\n",
       "      <td>0.02565</td>\n",
       "      <td>...</td>\n",
       "      <td>0.01309</td>\n",
       "      <td>0.01662</td>\n",
       "      <td>0.04314</td>\n",
       "      <td>0.014290</td>\n",
       "      <td>21.640</td>\n",
       "      <td>0.41888</td>\n",
       "      <td>0.54842</td>\n",
       "      <td>0.16006</td>\n",
       "      <td>28.199</td>\n",
       "      <td>34.398</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>72</td>\n",
       "      <td>0</td>\n",
       "      <td>12.6660</td>\n",
       "      <td>0.00300</td>\n",
       "      <td>0.000017</td>\n",
       "      <td>0.00132</td>\n",
       "      <td>0.00150</td>\n",
       "      <td>0.00395</td>\n",
       "      <td>0.02024</td>\n",
       "      <td>...</td>\n",
       "      <td>0.01072</td>\n",
       "      <td>0.01689</td>\n",
       "      <td>0.02982</td>\n",
       "      <td>0.011112</td>\n",
       "      <td>27.183</td>\n",
       "      <td>0.43493</td>\n",
       "      <td>0.56477</td>\n",
       "      <td>0.10810</td>\n",
       "      <td>28.447</td>\n",
       "      <td>34.894</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>72</td>\n",
       "      <td>0</td>\n",
       "      <td>19.6810</td>\n",
       "      <td>0.00481</td>\n",
       "      <td>0.000025</td>\n",
       "      <td>0.00205</td>\n",
       "      <td>0.00208</td>\n",
       "      <td>0.00616</td>\n",
       "      <td>0.01675</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00844</td>\n",
       "      <td>0.01458</td>\n",
       "      <td>0.02202</td>\n",
       "      <td>0.020220</td>\n",
       "      <td>23.047</td>\n",
       "      <td>0.46222</td>\n",
       "      <td>0.54405</td>\n",
       "      <td>0.21014</td>\n",
       "      <td>28.695</td>\n",
       "      <td>35.389</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>72</td>\n",
       "      <td>0</td>\n",
       "      <td>25.6470</td>\n",
       "      <td>0.00528</td>\n",
       "      <td>0.000027</td>\n",
       "      <td>0.00191</td>\n",
       "      <td>0.00264</td>\n",
       "      <td>0.00573</td>\n",
       "      <td>0.02309</td>\n",
       "      <td>...</td>\n",
       "      <td>0.01265</td>\n",
       "      <td>0.01963</td>\n",
       "      <td>0.03317</td>\n",
       "      <td>0.027837</td>\n",
       "      <td>24.445</td>\n",
       "      <td>0.48730</td>\n",
       "      <td>0.57794</td>\n",
       "      <td>0.33277</td>\n",
       "      <td>28.905</td>\n",
       "      <td>35.810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>72</td>\n",
       "      <td>0</td>\n",
       "      <td>33.6420</td>\n",
       "      <td>0.00335</td>\n",
       "      <td>0.000020</td>\n",
       "      <td>0.00093</td>\n",
       "      <td>0.00130</td>\n",
       "      <td>0.00278</td>\n",
       "      <td>0.01703</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00929</td>\n",
       "      <td>0.01819</td>\n",
       "      <td>0.02036</td>\n",
       "      <td>0.011625</td>\n",
       "      <td>26.126</td>\n",
       "      <td>0.47188</td>\n",
       "      <td>0.56122</td>\n",
       "      <td>0.19361</td>\n",
       "      <td>29.187</td>\n",
       "      <td>36.375</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   subject#  age  sex  test_time  Jitter(%)  Jitter(Abs)  Jitter:RAP  \\\n",
       "0         1   72    0     5.6431    0.00662     0.000034     0.00401   \n",
       "1         1   72    0    12.6660    0.00300     0.000017     0.00132   \n",
       "2         1   72    0    19.6810    0.00481     0.000025     0.00205   \n",
       "3         1   72    0    25.6470    0.00528     0.000027     0.00191   \n",
       "4         1   72    0    33.6420    0.00335     0.000020     0.00093   \n",
       "\n",
       "   Jitter:PPQ5  Jitter:DDP  Shimmer  ...  Shimmer:APQ5  Shimmer:APQ11  \\\n",
       "0      0.00317     0.01204  0.02565  ...       0.01309        0.01662   \n",
       "1      0.00150     0.00395  0.02024  ...       0.01072        0.01689   \n",
       "2      0.00208     0.00616  0.01675  ...       0.00844        0.01458   \n",
       "3      0.00264     0.00573  0.02309  ...       0.01265        0.01963   \n",
       "4      0.00130     0.00278  0.01703  ...       0.00929        0.01819   \n",
       "\n",
       "   Shimmer:DDA       NHR     HNR     RPDE      DFA      PPE  motor_UPDRS  \\\n",
       "0      0.04314  0.014290  21.640  0.41888  0.54842  0.16006       28.199   \n",
       "1      0.02982  0.011112  27.183  0.43493  0.56477  0.10810       28.447   \n",
       "2      0.02202  0.020220  23.047  0.46222  0.54405  0.21014       28.695   \n",
       "3      0.03317  0.027837  24.445  0.48730  0.57794  0.33277       28.905   \n",
       "4      0.02036  0.011625  26.126  0.47188  0.56122  0.19361       29.187   \n",
       "\n",
       "   total_UPDRS  \n",
       "0       34.398  \n",
       "1       34.894  \n",
       "2       35.389  \n",
       "3       35.810  \n",
       "4       36.375  \n",
       "\n",
       "[5 rows x 22 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"Parkinsons\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(5875, 22)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['subject#', 'age', 'sex', 'test_time', 'Jitter(%)', 'Jitter(Abs)',\n",
       "       'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP', 'Shimmer', 'Shimmer(dB)',\n",
       "       'Shimmer:APQ3', 'Shimmer:APQ5', 'Shimmer:APQ11', 'Shimmer:DDA', 'NHR',\n",
       "       'HNR', 'RPDE', 'DFA', 'PPE', 'motor_UPDRS', 'total_UPDRS'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/Parkinsons/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Thermography\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ucimlrepo import fetch_ucirepo\n",
    "\n",
    "# fetch dataset\n",
    "support2 = fetch_ucirepo(id=925)\n",
    "df = support2.data.original\n",
    "df.to_csv(\"./../../artifacts/data/Thermography/raw.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>aveOralF</th>\n",
       "      <th>aveOralM</th>\n",
       "      <th>T_atm</th>\n",
       "      <th>Humidity</th>\n",
       "      <th>Distance</th>\n",
       "      <th>T_offset1</th>\n",
       "      <th>Max1R13_1</th>\n",
       "      <th>Max1L13_1</th>\n",
       "      <th>aveAllR13_1</th>\n",
       "      <th>aveAllL13_1</th>\n",
       "      <th>...</th>\n",
       "      <th>Age_31-40</th>\n",
       "      <th>Age_41-50</th>\n",
       "      <th>Age_51-60</th>\n",
       "      <th>Age_&gt;60</th>\n",
       "      <th>Ethnicity_American Indian or Alaskan Native</th>\n",
       "      <th>Ethnicity_Asian</th>\n",
       "      <th>Ethnicity_Black or African-American</th>\n",
       "      <th>Ethnicity_Hispanic/Latino</th>\n",
       "      <th>Ethnicity_Multiracial</th>\n",
       "      <th>Ethnicity_White</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>36.85</td>\n",
       "      <td>36.59</td>\n",
       "      <td>24.0</td>\n",
       "      <td>28.0</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.7025</td>\n",
       "      <td>35.0300</td>\n",
       "      <td>35.3775</td>\n",
       "      <td>34.4000</td>\n",
       "      <td>34.9175</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>37.00</td>\n",
       "      <td>37.19</td>\n",
       "      <td>24.0</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.7800</td>\n",
       "      <td>34.5500</td>\n",
       "      <td>34.5200</td>\n",
       "      <td>33.9300</td>\n",
       "      <td>34.2250</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>37.20</td>\n",
       "      <td>37.34</td>\n",
       "      <td>24.0</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.8625</td>\n",
       "      <td>35.6525</td>\n",
       "      <td>35.5175</td>\n",
       "      <td>34.2775</td>\n",
       "      <td>34.8000</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>36.85</td>\n",
       "      <td>37.09</td>\n",
       "      <td>24.0</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.9300</td>\n",
       "      <td>35.2225</td>\n",
       "      <td>35.6125</td>\n",
       "      <td>34.3850</td>\n",
       "      <td>35.2475</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>36.80</td>\n",
       "      <td>37.04</td>\n",
       "      <td>24.0</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.8950</td>\n",
       "      <td>35.5450</td>\n",
       "      <td>35.6650</td>\n",
       "      <td>34.9100</td>\n",
       "      <td>35.3675</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 1068 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   aveOralF  aveOralM  T_atm  Humidity  Distance  T_offset1  Max1R13_1  \\\n",
       "0     36.85     36.59   24.0      28.0       0.8     0.7025    35.0300   \n",
       "1     37.00     37.19   24.0      26.0       0.8     0.7800    34.5500   \n",
       "2     37.20     37.34   24.0      26.0       0.8     0.8625    35.6525   \n",
       "3     36.85     37.09   24.0      27.0       0.8     0.9300    35.2225   \n",
       "4     36.80     37.04   24.0      27.0       0.8     0.8950    35.5450   \n",
       "\n",
       "   Max1L13_1  aveAllR13_1  aveAllL13_1  ...  Age_31-40  Age_41-50  Age_51-60  \\\n",
       "0    35.3775      34.4000      34.9175  ...          0          1          0   \n",
       "1    34.5200      33.9300      34.2250  ...          1          0          0   \n",
       "2    35.5175      34.2775      34.8000  ...          0          0          0   \n",
       "3    35.6125      34.3850      35.2475  ...          0          0          0   \n",
       "4    35.6650      34.9100      35.3675  ...          0          0          0   \n",
       "\n",
       "   Age_>60  Ethnicity_American Indian or Alaskan Native  Ethnicity_Asian  \\\n",
       "0        0                                            0                0   \n",
       "1        0                                            0                0   \n",
       "2        0                                            0                0   \n",
       "3        0                                            0                0   \n",
       "4        0                                            0                0   \n",
       "\n",
       "   Ethnicity_Black or African-American  Ethnicity_Hispanic/Latino  \\\n",
       "0                                    0                          0   \n",
       "1                                    1                          0   \n",
       "2                                    0                          0   \n",
       "3                                    1                          0   \n",
       "4                                    0                          0   \n",
       "\n",
       "   Ethnicity_Multiracial  Ethnicity_White  \n",
       "0                      0                1  \n",
       "1                      0                0  \n",
       "2                      0                1  \n",
       "3                      0                0  \n",
       "4                      0                1  \n",
       "\n",
       "[5 rows x 1068 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"Thermography\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1020, 1068)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['aveOralF', 'aveOralM', 'T_atm', 'Humidity', 'Distance', 'T_offset1',\n",
       "       'Max1R13_1', 'Max1L13_1', 'aveAllR13_1', 'aveAllL13_1',\n",
       "       ...\n",
       "       'Age_31-40', 'Age_41-50', 'Age_51-60', 'Age_>60',\n",
       "       'Ethnicity_American Indian or Alaskan Native', 'Ethnicity_Asian',\n",
       "       'Ethnicity_Black or African-American', 'Ethnicity_Hispanic/Latino',\n",
       "       'Ethnicity_Multiracial', 'Ethnicity_White'],\n",
       "      dtype='object', length=1068)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/Thermography/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Walmart\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Weekly_Sales</th>\n",
       "      <th>Holiday_Flag</th>\n",
       "      <th>Temperature</th>\n",
       "      <th>Fuel_Price</th>\n",
       "      <th>CPI</th>\n",
       "      <th>Unemployment</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Store</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1643690.90</td>\n",
       "      <td>0</td>\n",
       "      <td>42.31</td>\n",
       "      <td>2.572</td>\n",
       "      <td>211.096358</td>\n",
       "      <td>8.106</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1641957.44</td>\n",
       "      <td>1</td>\n",
       "      <td>38.51</td>\n",
       "      <td>2.548</td>\n",
       "      <td>211.242170</td>\n",
       "      <td>8.106</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1611968.17</td>\n",
       "      <td>0</td>\n",
       "      <td>39.93</td>\n",
       "      <td>2.514</td>\n",
       "      <td>211.289143</td>\n",
       "      <td>8.106</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1409727.59</td>\n",
       "      <td>0</td>\n",
       "      <td>46.63</td>\n",
       "      <td>2.561</td>\n",
       "      <td>211.319643</td>\n",
       "      <td>8.106</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1554806.68</td>\n",
       "      <td>0</td>\n",
       "      <td>46.50</td>\n",
       "      <td>2.625</td>\n",
       "      <td>211.350143</td>\n",
       "      <td>8.106</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Weekly_Sales  Holiday_Flag  Temperature  Fuel_Price         CPI  \\\n",
       "Store                                                                    \n",
       "1        1643690.90             0        42.31       2.572  211.096358   \n",
       "1        1641957.44             1        38.51       2.548  211.242170   \n",
       "1        1611968.17             0        39.93       2.514  211.289143   \n",
       "1        1409727.59             0        46.63       2.561  211.319643   \n",
       "1        1554806.68             0        46.50       2.625  211.350143   \n",
       "\n",
       "       Unemployment  \n",
       "Store                \n",
       "1             8.106  \n",
       "1             8.106  \n",
       "1             8.106  \n",
       "1             8.106  \n",
       "1             8.106  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Downloaded from https://www.kaggle.com/datasets/yasserh/walmart-dataset\n",
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"Walmart\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(6435, 6)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['Weekly_Sales', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI',\n",
       "       'Unemployment'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/Walmart/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# AirQuality\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>year</th>\n",
       "      <th>month</th>\n",
       "      <th>day</th>\n",
       "      <th>hour</th>\n",
       "      <th>pm2.5</th>\n",
       "      <th>DEWP</th>\n",
       "      <th>TEMP</th>\n",
       "      <th>PRES</th>\n",
       "      <th>Iws</th>\n",
       "      <th>Is</th>\n",
       "      <th>Ir</th>\n",
       "      <th>cbwd_NE</th>\n",
       "      <th>cbwd_NW</th>\n",
       "      <th>cbwd_SE</th>\n",
       "      <th>cbwd_cv</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>No</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>98.613215</td>\n",
       "      <td>-21</td>\n",
       "      <td>-11.0</td>\n",
       "      <td>1021.0</td>\n",
       "      <td>1.79</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>98.613215</td>\n",
       "      <td>-21</td>\n",
       "      <td>-12.0</td>\n",
       "      <td>1020.0</td>\n",
       "      <td>4.92</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>98.613215</td>\n",
       "      <td>-21</td>\n",
       "      <td>-11.0</td>\n",
       "      <td>1019.0</td>\n",
       "      <td>6.71</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>98.613215</td>\n",
       "      <td>-21</td>\n",
       "      <td>-14.0</td>\n",
       "      <td>1019.0</td>\n",
       "      <td>9.84</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2010</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>98.613215</td>\n",
       "      <td>-20</td>\n",
       "      <td>-12.0</td>\n",
       "      <td>1018.0</td>\n",
       "      <td>12.97</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    year  month  day  hour      pm2.5  DEWP  TEMP    PRES    Iws  Is  Ir  \\\n",
       "No                                                                         \n",
       "1   2010      1    1     0  98.613215   -21 -11.0  1021.0   1.79   0   0   \n",
       "2   2010      1    1     1  98.613215   -21 -12.0  1020.0   4.92   0   0   \n",
       "3   2010      1    1     2  98.613215   -21 -11.0  1019.0   6.71   0   0   \n",
       "4   2010      1    1     3  98.613215   -21 -14.0  1019.0   9.84   0   0   \n",
       "5   2010      1    1     4  98.613215   -20 -12.0  1018.0  12.97   0   0   \n",
       "\n",
       "    cbwd_NE  cbwd_NW  cbwd_SE  cbwd_cv  \n",
       "No                                      \n",
       "1         0        1        0        0  \n",
       "2         0        1        0        0  \n",
       "3         0        1        0        0  \n",
       "4         0        1        0        0  \n",
       "5         0        1        0        0  "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Downloaded from https://www.kaggle.com/datasets/stealthtechnologies/preeeeeeee\n",
    "from preprocessing import preprocessing\n",
    "\n",
    "df = preprocessing(\"AirQuality\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(43824, 15)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['year', 'month', 'day', 'hour', 'pm2.5', 'DEWP', 'TEMP', 'PRES', 'Iws',\n",
       "       'Is', 'Ir', 'cbwd_NE', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./../../artifacts/data/AirQuality/processed.csv\")\n",
    "print(df.shape)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
