{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3df18609",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T12:48:01.485629Z",
     "iopub.status.busy": "2024-10-18T12:48:01.485095Z",
     "iopub.status.idle": "2024-10-18T12:48:02.283332Z",
     "shell.execute_reply": "2024-10-18T12:48:02.282411Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Basic Information about the Dataset:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 43789 entries, 0 to 43788\n",
      "Data columns (total 33 columns):\n",
      " #   Column                             Non-Null Count  Dtype  \n",
      "---  ------                             --------------  -----  \n",
      " 0   IsBadBuy                           43789 non-null  int64  \n",
      " 1   PurchDate                          43789 non-null  float64\n",
      " 2   Auction                            43789 non-null  object \n",
      " 3   VehYear                            43789 non-null  float64\n",
      " 4   VehicleAge                         43789 non-null  int64  \n",
      " 5   Make                               43789 non-null  object \n",
      " 6   Model                              43789 non-null  object \n",
      " 7   Trim                               42362 non-null  object \n",
      " 8   SubModel                           43785 non-null  object \n",
      " 9   Color                              43785 non-null  object \n",
      " 10  Transmission                       43784 non-null  object \n",
      " 11  WheelTypeID                        41909 non-null  float64\n",
      " 12  WheelType                          41907 non-null  object \n",
      " 13  VehOdo                             43789 non-null  float64\n",
      " 14  Nationality                        43786 non-null  object \n",
      " 15  Size                               43786 non-null  object \n",
      " 16  TopThreeAmericanName               43786 non-null  object \n",
      " 17  MMRAcquisitionAuctionAveragePrice  43781 non-null  float64\n",
      " 18  MMRAcquisitionAuctionCleanPrice    43781 non-null  float64\n",
      " 19  MMRAcquisitionRetailAveragePrice   43781 non-null  float64\n",
      " 20  MMRAcquisitonRetailCleanPrice      43781 non-null  float64\n",
      " 21  MMRCurrentAuctionAveragePrice      43609 non-null  float64\n",
      " 22  MMRCurrentAuctionCleanPrice        43609 non-null  float64\n",
      " 23  MMRCurrentRetailAveragePrice       43609 non-null  float64\n",
      " 24  MMRCurrentRetailCleanPrice         43609 non-null  float64\n",
      " 25  PRIMEUNIT                          2052 non-null   object \n",
      " 26  AUCGUART                           2052 non-null   object \n",
      " 27  BYRNO                              43789 non-null  int64  \n",
      " 28  VNZIP1                             43789 non-null  int64  \n",
      " 29  VNST                               43789 non-null  object \n",
      " 30  VehBCost                           43744 non-null  float64\n",
      " 31  IsOnlineSale                       43789 non-null  int64  \n",
      " 32  WarrantyCost                       43789 non-null  float64\n",
      "dtypes: float64(14), int64(5), object(14)\n",
      "memory usage: 11.0+ MB\n",
      "None\n",
      "\n",
      "First few rows of the Dataset:\n",
      "   IsBadBuy     PurchDate  Auction  VehYear  VehicleAge       Make  \\\n",
      "0         0  1.265155e+09  MANHEIM   2006.0           4      MAZDA   \n",
      "1         0  1.262650e+09  MANHEIM   2005.0           5       FORD   \n",
      "2         1  1.271030e+09  MANHEIM   2006.0           4      DODGE   \n",
      "3         0  1.254787e+09  MANHEIM   2003.0           6  CHEVROLET   \n",
      "4         0  1.257466e+09  MANHEIM   2003.0           6  CHEVROLET   \n",
      "\n",
      "                  Model Trim            SubModel   Color  ...  \\\n",
      "0                MAZDA5  NaN            4D WAGON    BLUE  ...   \n",
      "1                TAURUS   SE         4D SEDAN SE   WHITE  ...   \n",
      "2        DURANGO 4WD V8  Adv         4D SUV 4.7L  SILVER  ...   \n",
      "3  TRAILBLAZER 2WD 6C 4   LS      4D SUV 4.2L LS   WHITE  ...   \n",
      "4        VENTURE FWD V6  Bas  PASSENGER EXT 3.4L  SILVER  ...   \n",
      "\n",
      "  MMRCurrentRetailAveragePrice  MMRCurrentRetailCleanPrice PRIMEUNIT  \\\n",
      "0                      11748.0                     12248.0       NaN   \n",
      "1                       6651.0                      7676.0       NaN   \n",
      "2                      14709.0                     16655.0       NaN   \n",
      "3                       8294.0                     10324.0       NaN   \n",
      "4                       6128.0                      9924.0       NaN   \n",
      "\n",
      "   AUCGUART  BYRNO VNZIP1 VNST  VehBCost  IsOnlineSale  WarrantyCost  \n",
      "0       NaN  23359  92807   CA    7235.0             0        1038.0  \n",
      "1       NaN   3453  80022   CO    3900.0             0         983.0  \n",
      "2       NaN  52117  27542   NC    9800.0             0        1918.0  \n",
      "3       NaN    835  85040   AZ    7640.0             0        1543.0  \n",
      "4       NaN  17675  27542   NC    4300.0             0        5392.0  \n",
      "\n",
      "[5 rows x 33 columns]\n",
      "\n",
      "Summary Statistics for Numerical Columns:\n",
      "           IsBadBuy     PurchDate       VehYear    VehicleAge   WheelTypeID  \\\n",
      "count  43789.000000  4.378900e+04  43789.000000  43789.000000  41909.000000   \n",
      "mean       0.123273  1.263042e+09   2005.343260      4.174838      1.495741   \n",
      "std        0.328754  1.818059e+07      1.733392      1.714821      0.521434   \n",
      "min        0.000000  1.231114e+09   2001.000000      0.000000      0.000000   \n",
      "25%        0.000000  1.248048e+09   2004.000000      3.000000      1.000000   \n",
      "50%        0.000000  1.263946e+09   2005.000000      4.000000      1.000000   \n",
      "75%        0.000000  1.278979e+09   2007.000000      5.000000      2.000000   \n",
      "max        1.000000  1.293667e+09   2010.000000      9.000000      3.000000   \n",
      "\n",
      "              VehOdo  MMRAcquisitionAuctionAveragePrice  \\\n",
      "count   43789.000000                       43781.000000   \n",
      "mean    71488.040284                        6132.529910   \n",
      "std     14615.507183                        2474.101136   \n",
      "min      8706.000000                           0.000000   \n",
      "25%     61844.000000                        4271.000000   \n",
      "50%     73344.000000                        6099.000000   \n",
      "75%     82414.000000                        7764.000000   \n",
      "max    115717.000000                       35722.000000   \n",
      "\n",
      "       MMRAcquisitionAuctionCleanPrice  MMRAcquisitionRetailAveragePrice  \\\n",
      "count                     43781.000000                      43781.000000   \n",
      "mean                       7377.863639                       8498.416322   \n",
      "std                        2736.172776                       3169.673809   \n",
      "min                           0.000000                          0.000000   \n",
      "25%                        5394.000000                       6249.000000   \n",
      "50%                        7310.000000                       8443.000000   \n",
      "75%                        9028.000000                      10659.000000   \n",
      "max                       36859.000000                      39080.000000   \n",
      "\n",
      "       MMRAcquisitonRetailCleanPrice  MMRCurrentAuctionAveragePrice  \\\n",
      "count                   43781.000000                   43609.000000   \n",
      "mean                     9853.071058                    6135.565136   \n",
      "std                      3401.523460                    2448.535126   \n",
      "min                         0.000000                       0.000000   \n",
      "25%                      7478.000000                    4267.000000   \n",
      "50%                      9785.000000                    6062.000000   \n",
      "75%                     12098.000000                    7737.000000   \n",
      "max                     41482.000000                   35722.000000   \n",
      "\n",
      "       MMRCurrentAuctionCleanPrice  MMRCurrentRetailAveragePrice  \\\n",
      "count                 43609.000000                  43609.000000   \n",
      "mean                   7394.431837                   8776.359307   \n",
      "std                    2701.642976                   3106.611549   \n",
      "min                       0.000000                      0.000000   \n",
      "25%                    5396.000000                   6520.000000   \n",
      "50%                    7324.000000                   8734.000000   \n",
      "75%                    9014.000000                  10914.000000   \n",
      "max                   36859.000000                  39080.000000   \n",
      "\n",
      "       MMRCurrentRetailCleanPrice         BYRNO        VNZIP1      VehBCost  \\\n",
      "count                43609.000000  43789.000000  43789.000000  43744.000000   \n",
      "mean                 10146.624848  26385.758410  57934.964649   6726.404604   \n",
      "std                   3327.390654  25703.204478  26180.150542   1773.227017   \n",
      "min                      0.000000    835.000000   2764.000000      1.000000   \n",
      "25%                   7765.000000  17212.000000  30331.000000   5425.000000   \n",
      "50%                  10103.000000  19662.000000  73108.000000   6700.000000   \n",
      "75%                  12321.000000  22808.000000  80022.000000   7900.000000   \n",
      "max                  41062.000000  99761.000000  99224.000000  45469.000000   \n",
      "\n",
      "       IsOnlineSale  WarrantyCost  \n",
      "count  43789.000000  43789.000000  \n",
      "mean       0.025006   1275.402179  \n",
      "std        0.156146    598.513353  \n",
      "min        0.000000    462.000000  \n",
      "25%        0.000000    837.000000  \n",
      "50%        0.000000   1155.000000  \n",
      "75%        0.000000   1623.000000  \n",
      "max        1.000000   7498.000000  \n",
      "\n",
      "Summary Statistics for Categorical Columns:\n",
      "        Auction       Make       Model   Trim  SubModel   Color Transmission  \\\n",
      "count     43789      43789       43789  42362     43785   43785        43784   \n",
      "unique        3         33         982    133       787      16            3   \n",
      "top     MANHEIM  CHEVROLET  PT CRUISER    Bas  4D SEDAN  SILVER         AUTO   \n",
      "freq      24538      10364        1395   8421      9139    8892        42202   \n",
      "\n",
      "       WheelType Nationality    Size TopThreeAmericanName PRIMEUNIT AUCGUART  \\\n",
      "count      41907       43786   43786                43786      2052     2052   \n",
      "unique         3           4      12                    4         2        2   \n",
      "top        Alloy    AMERICAN  MEDIUM                   GM        NO    GREEN   \n",
      "freq       21586       36579   18442                15211      2015     2010   \n",
      "\n",
      "         VNST  \n",
      "count   43789  \n",
      "unique     37  \n",
      "top        TX  \n",
      "freq     8116  \n",
      "\n",
      "Missing Values in the Dataset:\n",
      "IsBadBuy                                 0\n",
      "PurchDate                                0\n",
      "Auction                                  0\n",
      "VehYear                                  0\n",
      "VehicleAge                               0\n",
      "Make                                     0\n",
      "Model                                    0\n",
      "Trim                                  1427\n",
      "SubModel                                 4\n",
      "Color                                    4\n",
      "Transmission                             5\n",
      "WheelTypeID                           1880\n",
      "WheelType                             1882\n",
      "VehOdo                                   0\n",
      "Nationality                              3\n",
      "Size                                     3\n",
      "TopThreeAmericanName                     3\n",
      "MMRAcquisitionAuctionAveragePrice        8\n",
      "MMRAcquisitionAuctionCleanPrice          8\n",
      "MMRAcquisitionRetailAveragePrice         8\n",
      "MMRAcquisitonRetailCleanPrice            8\n",
      "MMRCurrentAuctionAveragePrice          180\n",
      "MMRCurrentAuctionCleanPrice            180\n",
      "MMRCurrentRetailAveragePrice           180\n",
      "MMRCurrentRetailCleanPrice             180\n",
      "PRIMEUNIT                            41737\n",
      "AUCGUART                             41737\n",
      "BYRNO                                    0\n",
      "VNZIP1                                   0\n",
      "VNST                                     0\n",
      "VehBCost                                45\n",
      "IsOnlineSale                             0\n",
      "WarrantyCost                             0\n",
      "dtype: int64\n",
      "\n",
      "Distribution of the Target Column 'IsBadBuy':\n",
      "IsBadBuy\n",
      "0    0.876727\n",
      "1    0.123273\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "Categorical Columns:\n",
      "Index(['Auction', 'Make', 'Model', 'Trim', 'SubModel', 'Color', 'Transmission',\n",
      "       'WheelType', 'Nationality', 'Size', 'TopThreeAmericanName', 'PRIMEUNIT',\n",
      "       'AUCGUART', 'VNST'],\n",
      "      dtype='object')\n",
      "\n",
      "Numerical Columns:\n",
      "Index(['IsBadBuy', 'PurchDate', 'VehYear', 'VehicleAge', 'WheelTypeID',\n",
      "       'VehOdo', 'MMRAcquisitionAuctionAveragePrice',\n",
      "       'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice',\n",
      "       'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice',\n",
      "       'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice',\n",
      "       'MMRCurrentRetailCleanPrice', 'BYRNO', 'VNZIP1', 'VehBCost',\n",
      "       'IsOnlineSale', 'WarrantyCost'],\n",
      "      dtype='object')\n",
      "\n",
      "Correlation Matrix for Numerical Columns:\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                   IsBadBuy  PurchDate   VehYear  VehicleAge  \\\n",
      "IsBadBuy                           1.000000   0.013166 -0.154487    0.162613   \n",
      "PurchDate                          0.013166   1.000000  0.208610    0.042673   \n",
      "VehYear                           -0.154487   0.208610  1.000000   -0.958059   \n",
      "VehicleAge                         0.162613   0.042673 -0.958059    1.000000   \n",
      "WheelTypeID                       -0.042615   0.039308  0.258113   -0.250807   \n",
      "VehOdo                             0.081803   0.137100 -0.279907    0.316966   \n",
      "MMRAcquisitionAuctionAveragePrice -0.107319   0.083618  0.582300   -0.565918   \n",
      "MMRAcquisitionAuctionCleanPrice   -0.100992   0.082994  0.534557   -0.519387   \n",
      "MMRAcquisitionRetailAveragePrice  -0.085481   0.412976  0.581727   -0.461259   \n",
      "MMRAcquisitonRetailCleanPrice     -0.081688   0.390918  0.540135   -0.427028   \n",
      "MMRCurrentAuctionAveragePrice     -0.107182   0.066836  0.592414   -0.576387   \n",
      "MMRCurrentAuctionCleanPrice       -0.102326   0.070540  0.550240   -0.533496   \n",
      "MMRCurrentRetailAveragePrice      -0.102807   0.365682  0.599428   -0.504721   \n",
      "MMRCurrentRetailCleanPrice        -0.099052   0.348838  0.559956   -0.470314   \n",
      "BYRNO                             -0.059045   0.091170  0.278102   -0.264761   \n",
      "VNZIP1                             0.006351  -0.033172  0.072182   -0.080027   \n",
      "VehBCost                          -0.100355   0.146166  0.352131   -0.313643   \n",
      "IsOnlineSale                      -0.002662   0.111111  0.059074   -0.026649   \n",
      "WarrantyCost                       0.053949  -0.036940 -0.265533    0.256667   \n",
      "\n",
      "                                   WheelTypeID    VehOdo  \\\n",
      "IsBadBuy                             -0.042615  0.081803   \n",
      "PurchDate                             0.039308  0.137100   \n",
      "VehYear                               0.258113 -0.279907   \n",
      "VehicleAge                           -0.250807  0.316966   \n",
      "WheelTypeID                           1.000000 -0.212199   \n",
      "VehOdo                               -0.212199  1.000000   \n",
      "MMRAcquisitionAuctionAveragePrice    -0.097526 -0.016251   \n",
      "MMRAcquisitionAuctionCleanPrice      -0.131511  0.025142   \n",
      "MMRAcquisitionRetailAveragePrice     -0.076838  0.033122   \n",
      "MMRAcquisitonRetailCleanPrice        -0.105688  0.064593   \n",
      "MMRCurrentAuctionAveragePrice        -0.090024 -0.028962   \n",
      "MMRCurrentAuctionCleanPrice          -0.122794  0.011445   \n",
      "MMRCurrentRetailAveragePrice         -0.078945  0.016426   \n",
      "MMRCurrentRetailCleanPrice           -0.107178  0.048406   \n",
      "BYRNO                                 0.185025 -0.286507   \n",
      "VNZIP1                                0.008210 -0.058077   \n",
      "VehBCost                             -0.164987 -0.061822   \n",
      "IsOnlineSale                         -0.003142  0.029631   \n",
      "WarrantyCost                         -0.137194  0.409489   \n",
      "\n",
      "                                   MMRAcquisitionAuctionAveragePrice  \\\n",
      "IsBadBuy                                                   -0.107319   \n",
      "PurchDate                                                   0.083618   \n",
      "VehYear                                                     0.582300   \n",
      "VehicleAge                                                 -0.565918   \n",
      "WheelTypeID                                                -0.097526   \n",
      "VehOdo                                                     -0.016251   \n",
      "MMRAcquisitionAuctionAveragePrice                           1.000000   \n",
      "MMRAcquisitionAuctionCleanPrice                             0.990472   \n",
      "MMRAcquisitionRetailAveragePrice                            0.911260   \n",
      "MMRAcquisitonRetailCleanPrice                               0.910965   \n",
      "MMRCurrentAuctionAveragePrice                               0.939200   \n",
      "MMRCurrentAuctionCleanPrice                                 0.933829   \n",
      "MMRCurrentRetailAveragePrice                                0.873593   \n",
      "MMRCurrentRetailCleanPrice                                  0.872822   \n",
      "BYRNO                                                       0.105009   \n",
      "VNZIP1                                                      0.045296   \n",
      "VehBCost                                                    0.790516   \n",
      "IsOnlineSale                                                0.034912   \n",
      "WarrantyCost                                               -0.046557   \n",
      "\n",
      "                                   MMRAcquisitionAuctionCleanPrice  \\\n",
      "IsBadBuy                                                 -0.100992   \n",
      "PurchDate                                                 0.082994   \n",
      "VehYear                                                   0.534557   \n",
      "VehicleAge                                               -0.519387   \n",
      "WheelTypeID                                              -0.131511   \n",
      "VehOdo                                                    0.025142   \n",
      "MMRAcquisitionAuctionAveragePrice                         0.990472   \n",
      "MMRAcquisitionAuctionCleanPrice                           1.000000   \n",
      "MMRAcquisitionRetailAveragePrice                          0.903793   \n",
      "MMRAcquisitonRetailCleanPrice                             0.919041   \n",
      "MMRCurrentAuctionAveragePrice                             0.925053   \n",
      "MMRCurrentAuctionCleanPrice                               0.933398   \n",
      "MMRCurrentRetailAveragePrice                              0.862620   \n",
      "MMRCurrentRetailCleanPrice                                0.873000   \n",
      "BYRNO                                                     0.060173   \n",
      "VNZIP1                                                    0.040235   \n",
      "VehBCost                                                  0.782189   \n",
      "IsOnlineSale                                              0.035200   \n",
      "WarrantyCost                                             -0.017042   \n",
      "\n",
      "                                   MMRAcquisitionRetailAveragePrice  \\\n",
      "IsBadBuy                                                  -0.085481   \n",
      "PurchDate                                                  0.412976   \n",
      "VehYear                                                    0.581727   \n",
      "VehicleAge                                                -0.461259   \n",
      "WheelTypeID                                               -0.076838   \n",
      "VehOdo                                                     0.033122   \n",
      "MMRAcquisitionAuctionAveragePrice                          0.911260   \n",
      "MMRAcquisitionAuctionCleanPrice                            0.903793   \n",
      "MMRAcquisitionRetailAveragePrice                           1.000000   \n",
      "MMRAcquisitonRetailCleanPrice                              0.990261   \n",
      "MMRCurrentAuctionAveragePrice                              0.854267   \n",
      "MMRCurrentAuctionCleanPrice                                0.852059   \n",
      "MMRCurrentRetailAveragePrice                               0.915040   \n",
      "MMRCurrentRetailCleanPrice                                 0.908111   \n",
      "BYRNO                                                      0.106328   \n",
      "VNZIP1                                                     0.034588   \n",
      "VehBCost                                                   0.747813   \n",
      "IsOnlineSale                                               0.077105   \n",
      "WarrantyCost                                              -0.048868   \n",
      "\n",
      "                                   MMRAcquisitonRetailCleanPrice  \\\n",
      "IsBadBuy                                               -0.081688   \n",
      "PurchDate                                               0.390918   \n",
      "VehYear                                                 0.540135   \n",
      "VehicleAge                                             -0.427028   \n",
      "WheelTypeID                                            -0.105688   \n",
      "VehOdo                                                  0.064593   \n",
      "MMRAcquisitionAuctionAveragePrice                       0.910965   \n",
      "MMRAcquisitionAuctionCleanPrice                         0.919041   \n",
      "MMRAcquisitionRetailAveragePrice                        0.990261   \n",
      "MMRAcquisitonRetailCleanPrice                           1.000000   \n",
      "MMRCurrentAuctionAveragePrice                           0.849453   \n",
      "MMRCurrentAuctionCleanPrice                             0.858182   \n",
      "MMRCurrentRetailAveragePrice                            0.905114   \n",
      "MMRCurrentRetailCleanPrice                              0.908845   \n",
      "BYRNO                                                   0.068640   \n",
      "VNZIP1                                                  0.030137   \n",
      "VehBCost                                                0.746738   \n",
      "IsOnlineSale                                            0.073886   \n",
      "WarrantyCost                                           -0.022387   \n",
      "\n",
      "                                   MMRCurrentAuctionAveragePrice  \\\n",
      "IsBadBuy                                               -0.107182   \n",
      "PurchDate                                               0.066836   \n",
      "VehYear                                                 0.592414   \n",
      "VehicleAge                                             -0.576387   \n",
      "WheelTypeID                                            -0.090024   \n",
      "VehOdo                                                 -0.028962   \n",
      "MMRAcquisitionAuctionAveragePrice                       0.939200   \n",
      "MMRAcquisitionAuctionCleanPrice                         0.925053   \n",
      "MMRAcquisitionRetailAveragePrice                        0.854267   \n",
      "MMRAcquisitonRetailCleanPrice                           0.849453   \n",
      "MMRCurrentAuctionAveragePrice                           1.000000   \n",
      "MMRCurrentAuctionCleanPrice                             0.990419   \n",
      "MMRCurrentRetailAveragePrice                            0.915991   \n",
      "MMRCurrentRetailCleanPrice                              0.913199   \n",
      "BYRNO                                                   0.109441   \n",
      "VNZIP1                                                  0.047287   \n",
      "VehBCost                                                0.778115   \n",
      "IsOnlineSale                                            0.036780   \n",
      "WarrantyCost                                           -0.052981   \n",
      "\n",
      "                                   MMRCurrentAuctionCleanPrice  \\\n",
      "IsBadBuy                                             -0.102326   \n",
      "PurchDate                                             0.070540   \n",
      "VehYear                                               0.550240   \n",
      "VehicleAge                                           -0.533496   \n",
      "WheelTypeID                                          -0.122794   \n",
      "VehOdo                                                0.011445   \n",
      "MMRAcquisitionAuctionAveragePrice                     0.933829   \n",
      "MMRAcquisitionAuctionCleanPrice                       0.933398   \n",
      "MMRAcquisitionRetailAveragePrice                      0.852059   \n",
      "MMRAcquisitonRetailCleanPrice                         0.858182   \n",
      "MMRCurrentAuctionAveragePrice                         0.990419   \n",
      "MMRCurrentAuctionCleanPrice                           1.000000   \n",
      "MMRCurrentRetailAveragePrice                          0.909595   \n",
      "MMRCurrentRetailCleanPrice                            0.922402   \n",
      "BYRNO                                                 0.067470   \n",
      "VNZIP1                                                0.041602   \n",
      "VehBCost                                              0.774333   \n",
      "IsOnlineSale                                          0.037423   \n",
      "WarrantyCost                                         -0.024415   \n",
      "\n",
      "                                   MMRCurrentRetailAveragePrice  \\\n",
      "IsBadBuy                                              -0.102807   \n",
      "PurchDate                                              0.365682   \n",
      "VehYear                                                0.599428   \n",
      "VehicleAge                                            -0.504721   \n",
      "WheelTypeID                                           -0.078945   \n",
      "VehOdo                                                 0.016426   \n",
      "MMRAcquisitionAuctionAveragePrice                      0.873593   \n",
      "MMRAcquisitionAuctionCleanPrice                        0.862620   \n",
      "MMRAcquisitionRetailAveragePrice                       0.915040   \n",
      "MMRAcquisitonRetailCleanPrice                          0.905114   \n",
      "MMRCurrentAuctionAveragePrice                          0.915991   \n",
      "MMRCurrentAuctionCleanPrice                            0.909595   \n",
      "MMRCurrentRetailAveragePrice                           1.000000   \n",
      "MMRCurrentRetailCleanPrice                             0.989865   \n",
      "BYRNO                                                  0.109715   \n",
      "VNZIP1                                                 0.038003   \n",
      "VehBCost                                               0.757011   \n",
      "IsOnlineSale                                           0.076163   \n",
      "WarrantyCost                                          -0.054274   \n",
      "\n",
      "                                   MMRCurrentRetailCleanPrice     BYRNO  \\\n",
      "IsBadBuy                                            -0.099052 -0.059045   \n",
      "PurchDate                                            0.348838  0.091170   \n",
      "VehYear                                              0.559956  0.278102   \n",
      "VehicleAge                                          -0.470314 -0.264761   \n",
      "WheelTypeID                                         -0.107178  0.185025   \n",
      "VehOdo                                               0.048406 -0.286507   \n",
      "MMRAcquisitionAuctionAveragePrice                    0.872822  0.105009   \n",
      "MMRAcquisitionAuctionCleanPrice                      0.873000  0.060173   \n",
      "MMRAcquisitionRetailAveragePrice                     0.908111  0.106328   \n",
      "MMRAcquisitonRetailCleanPrice                        0.908845  0.068640   \n",
      "MMRCurrentAuctionAveragePrice                        0.913199  0.109441   \n",
      "MMRCurrentAuctionCleanPrice                          0.922402  0.067470   \n",
      "MMRCurrentRetailAveragePrice                         0.989865  0.109715   \n",
      "MMRCurrentRetailCleanPrice                           1.000000  0.073364   \n",
      "BYRNO                                                0.073364  1.000000   \n",
      "VNZIP1                                               0.033078  0.036803   \n",
      "VehBCost                                             0.756768  0.047775   \n",
      "IsOnlineSale                                         0.073450 -0.142476   \n",
      "WarrantyCost                                        -0.027939 -0.084848   \n",
      "\n",
      "                                     VNZIP1  VehBCost  IsOnlineSale  \\\n",
      "IsBadBuy                           0.006351 -0.100355     -0.002662   \n",
      "PurchDate                         -0.033172  0.146166      0.111111   \n",
      "VehYear                            0.072182  0.352131      0.059074   \n",
      "VehicleAge                        -0.080027 -0.313643     -0.026649   \n",
      "WheelTypeID                        0.008210 -0.164987     -0.003142   \n",
      "VehOdo                            -0.058077 -0.061822      0.029631   \n",
      "MMRAcquisitionAuctionAveragePrice  0.045296  0.790516      0.034912   \n",
      "MMRAcquisitionAuctionCleanPrice    0.040235  0.782189      0.035200   \n",
      "MMRAcquisitionRetailAveragePrice   0.034588  0.747813      0.077105   \n",
      "MMRAcquisitonRetailCleanPrice      0.030137  0.746738      0.073886   \n",
      "MMRCurrentAuctionAveragePrice      0.047287  0.778115      0.036780   \n",
      "MMRCurrentAuctionCleanPrice        0.041602  0.774333      0.037423   \n",
      "MMRCurrentRetailAveragePrice       0.038003  0.757011      0.076163   \n",
      "MMRCurrentRetailCleanPrice         0.033078  0.756768      0.073450   \n",
      "BYRNO                              0.036803  0.047775     -0.142476   \n",
      "VNZIP1                             1.000000  0.014744      0.029770   \n",
      "VehBCost                           0.014744  1.000000      0.028227   \n",
      "IsOnlineSale                       0.029770  0.028227      1.000000   \n",
      "WarrantyCost                      -0.037747 -0.027115     -0.000308   \n",
      "\n",
      "                                   WarrantyCost  \n",
      "IsBadBuy                               0.053949  \n",
      "PurchDate                             -0.036940  \n",
      "VehYear                               -0.265533  \n",
      "VehicleAge                             0.256667  \n",
      "WheelTypeID                           -0.137194  \n",
      "VehOdo                                 0.409489  \n",
      "MMRAcquisitionAuctionAveragePrice     -0.046557  \n",
      "MMRAcquisitionAuctionCleanPrice       -0.017042  \n",
      "MMRAcquisitionRetailAveragePrice      -0.048868  \n",
      "MMRAcquisitonRetailCleanPrice         -0.022387  \n",
      "MMRCurrentAuctionAveragePrice         -0.052981  \n",
      "MMRCurrentAuctionCleanPrice           -0.024415  \n",
      "MMRCurrentRetailAveragePrice          -0.054274  \n",
      "MMRCurrentRetailCleanPrice            -0.027939  \n",
      "BYRNO                                 -0.084848  \n",
      "VNZIP1                                -0.037747  \n",
      "VehBCost                              -0.027115  \n",
      "IsOnlineSale                          -0.000308  \n",
      "WarrantyCost                           1.000000  \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Load the training dataset\n",
    "train_df = pd.read_csv('/data/datasets/kick/split_train.csv')\n",
    "\n",
    "# Display basic information about the dataset.\n",
    "# DataFrame.info() writes its report to stdout itself and returns None, so it is\n",
    "# called directly: wrapping it in print() would append a stray \"None\" line.\n",
    "print(\"Basic Information about the Dataset:\")\n",
    "train_df.info()\n",
    "\n",
    "# Display the first few rows of the dataset\n",
    "print(\"\\nFirst few rows of the Dataset:\")\n",
    "print(train_df.head())\n",
    "\n",
    "# Summary statistics for numerical columns\n",
    "print(\"\\nSummary Statistics for Numerical Columns:\")\n",
    "print(train_df.describe())\n",
    "\n",
    "# Summary statistics for categorical (object dtype) columns\n",
    "print(\"\\nSummary Statistics for Categorical Columns:\")\n",
    "print(train_df.describe(include=['object']))\n",
    "\n",
    "# Count the missing values of every column\n",
    "print(\"\\nMissing Values in the Dataset:\")\n",
    "print(train_df.isnull().sum())\n",
    "\n",
    "# Relative frequency of each class of the target column 'IsBadBuy'\n",
    "print(\"\\nDistribution of the Target Column 'IsBadBuy':\")\n",
    "print(train_df['IsBadBuy'].value_counts(normalize=True))\n",
    "\n",
    "# Identify categorical and numerical columns from their dtypes\n",
    "categorical_cols = train_df.select_dtypes(include=['object']).columns\n",
    "numerical_cols = train_df.select_dtypes(include=['number']).columns\n",
    "\n",
    "print(\"\\nCategorical Columns:\")\n",
    "print(categorical_cols)\n",
    "\n",
    "print(\"\\nNumerical Columns:\")\n",
    "print(numerical_cols)\n",
    "\n",
    "# Pairwise correlation matrix of the numerical columns\n",
    "print(\"\\nCorrelation Matrix for Numerical Columns:\")\n",
    "print(train_df[numerical_cols].corr())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4164c9ad",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T12:48:02.310797Z",
     "iopub.status.busy": "2024-10-18T12:48:02.310400Z",
     "iopub.status.idle": "2024-10-18T12:48:03.578592Z",
     "shell.execute_reply": "2024-10-18T12:48:03.577755Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler, OrdinalEncoder\n",
    "from sklearn.impute import SimpleImputer\n",
    "import pandas as pd\n",
    "\n",
    "# Fit every preprocessing step on a single split (the training data)\n",
    "def fit_preprocessors(df, numeric_cols, categorical_cols):\n",
    "    \"\"\"Fit the imputers, the ordinal encoder and the scaler on `df`.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame the statistics are learned from.\n",
    "        numeric_cols: columns that are mean-imputed and standardised.\n",
    "        categorical_cols: columns that are mode-imputed and ordinal-encoded.\n",
    "\n",
    "    Returns:\n",
    "        dict holding the fitted transformers under the keys 'imputer_numeric',\n",
    "        'imputer_categorical', 'ordinal_encoder' and 'scaler'.\n",
    "    \"\"\"\n",
    "    imputer_numeric = SimpleImputer(strategy='mean').fit(df[numeric_cols])\n",
    "    imputer_categorical = SimpleImputer(strategy='most_frequent').fit(df[categorical_cols])\n",
    "\n",
    "    # The encoder and the scaler must be fitted on the data as it looks after imputation\n",
    "    imputed = df.copy()\n",
    "    imputed[numeric_cols] = imputer_numeric.transform(imputed[numeric_cols])\n",
    "    imputed[categorical_cols] = imputer_categorical.transform(imputed[categorical_cols])\n",
    "    # Convert categorical columns to strings to ensure consistent encoding\n",
    "    imputed[categorical_cols] = imputed[categorical_cols].astype(str)\n",
    "\n",
    "    ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)\n",
    "    ordinal_encoder.fit(imputed[categorical_cols])\n",
    "    scaler = StandardScaler().fit(imputed[numeric_cols])\n",
    "\n",
    "    return {\n",
    "        'imputer_numeric': imputer_numeric,\n",
    "        'imputer_categorical': imputer_categorical,\n",
    "        'ordinal_encoder': ordinal_encoder,\n",
    "        'scaler': scaler,\n",
    "    }\n",
    "\n",
    "# Function to preprocess the dataset\n",
    "def preprocess_data(df, numeric_cols, categorical_cols, preprocessors=None):\n",
    "    \"\"\"Impute, ordinal-encode and standardise a copy of `df`.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame to process; it is not modified.\n",
    "        numeric_cols: columns that are mean-imputed and standardised.\n",
    "        categorical_cols: columns that are mode-imputed and ordinal-encoded.\n",
    "        preprocessors: optional dict returned by fit_preprocessors(). When given,\n",
    "            the already-fitted transformers are applied unchanged, so another\n",
    "            split receives the same fill values, category codes and scaling as\n",
    "            the split they were fitted on. When None (the default), new\n",
    "            transformers are fitted on `df` itself.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (processed copy of `df`, ordinal encoder used, scaler used).\n",
    "    \"\"\"\n",
    "    if preprocessors is None:\n",
    "        preprocessors = fit_preprocessors(df, numeric_cols, categorical_cols)\n",
    "\n",
    "    df_copy = df.copy()\n",
    "\n",
    "    # Handle missing values\n",
    "    df_copy[numeric_cols] = preprocessors['imputer_numeric'].transform(df_copy[numeric_cols])\n",
    "    df_copy[categorical_cols] = preprocessors['imputer_categorical'].transform(df_copy[categorical_cols])\n",
    "\n",
    "    # Convert categorical columns to strings to ensure consistent encoding\n",
    "    df_copy[categorical_cols] = df_copy[categorical_cols].astype(str)\n",
    "\n",
    "    # Encode categorical variables (categories unseen during fitting become -1)\n",
    "    df_copy[categorical_cols] = preprocessors['ordinal_encoder'].transform(df_copy[categorical_cols])\n",
    "\n",
    "    # Scale numerical features\n",
    "    df_copy[numeric_cols] = preprocessors['scaler'].transform(df_copy[numeric_cols])\n",
    "\n",
    "    return df_copy, preprocessors['ordinal_encoder'], preprocessors['scaler']\n",
    "\n",
    "# Define columns\n",
    "numeric_cols = ['PurchDate', 'VehYear', 'VehicleAge', 'WheelTypeID', 'VehOdo', \n",
    "                'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice', \n",
    "                'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice', \n",
    "                'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', \n",
    "                'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice']\n",
    "categorical_cols = ['Auction', 'Make', 'Model', 'Trim', 'SubModel', 'Color', 'Transmission', \n",
    "                    'WheelType', 'Nationality', 'Size', 'TopThreeAmericanName', 'PRIMEUNIT', \n",
    "                    'AUCGUART', 'VNST']\n",
    "\n",
    "# Load the dev and test sets\n",
    "dev_df = pd.read_csv('/data/datasets/kick/split_dev.csv')\n",
    "test_df = pd.read_csv('/data/datasets/kick/split_test_wo_target.csv')\n",
    "\n",
    "# Fit the transformers on the training set only, then apply the very same fitted\n",
    "# transformers to train, dev and test. Each split is imputed, encoded and scaled\n",
    "# exactly once, so the encoded and scaled values mean the same thing in all three.\n",
    "train_preprocessors = fit_preprocessors(train_df, numeric_cols, categorical_cols)\n",
    "train_df_processed, train_ordinal_encoder, train_scaler = preprocess_data(train_df, numeric_cols, categorical_cols, train_preprocessors)\n",
    "dev_df_processed, _, _ = preprocess_data(dev_df, numeric_cols, categorical_cols, train_preprocessors)\n",
    "test_df_processed, _, _ = preprocess_data(test_df, numeric_cols, categorical_cols, train_preprocessors)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a8a152a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T12:48:08.285389Z",
     "iopub.status.busy": "2024-10-18T12:48:08.284716Z",
     "iopub.status.idle": "2024-10-18T12:48:09.244905Z",
     "shell.execute_reply": "2024-10-18T12:48:09.244032Z"
    }
   },
   "outputs": [],
   "source": [
    "from metagpt.tools.libs.data_preprocess import get_column_info\n",
    "\n",
    "# Summarise the column types of the processed training frame\n",
    "column_info = get_column_info(train_df_processed)\n",
    "print(\"column_info\", column_info, sep=\"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6c879083",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T12:48:33.790912Z",
     "iopub.status.busy": "2024-10-18T12:48:33.789951Z",
     "iopub.status.idle": "2024-10-18T12:48:33.907599Z",
     "shell.execute_reply": "2024-10-18T12:48:33.906441Z"
    }
   },
   "outputs": [],
   "source": [
    "# Feature Engineering: Time-Series Analysis on 'PurchDate'\n",
    "\n",
    "# Names of the calendar features derived from the purchase date\n",
    "temporal_features = ['Year', 'Month', 'Day', 'DayOfWeek', 'IsWeekend', 'Quarter']\n",
    "\n",
    "# Extract temporal features\n",
    "def extract_temporal_features(df):\n",
    "    \"\"\"Return a copy of `df` with calendar features added.\n",
    "\n",
    "    `df` must hold a datetime 'PurchDate' column. The added columns are 'Year',\n",
    "    'Month', 'Day', 'DayOfWeek', 'IsWeekend' (1 when 'DayOfWeek' is 5 or 6,\n",
    "    otherwise 0) and 'Quarter'.\n",
    "    \"\"\"\n",
    "    df_copy = df.copy()\n",
    "    purch_date = df_copy['PurchDate'].dt\n",
    "    df_copy['Year'] = purch_date.year\n",
    "    df_copy['Month'] = purch_date.month\n",
    "    df_copy['Day'] = purch_date.day\n",
    "    df_copy['DayOfWeek'] = purch_date.dayofweek\n",
    "    df_copy['IsWeekend'] = df_copy['DayOfWeek'].isin([5, 6]).astype(int)\n",
    "    df_copy['Quarter'] = purch_date.quarter\n",
    "    return df_copy\n",
    "\n",
    "def add_purchase_date_features(processed_df, raw_df):\n",
    "    \"\"\"Add the calendar features to `processed_df` and drop its 'PurchDate' column.\n",
    "\n",
    "    The dates are parsed from the untouched 'PurchDate' column of `raw_df`, read\n",
    "    as seconds since the Unix epoch. The 'PurchDate' column of the processed\n",
    "    frames has been standardised by preprocess_data, so parsing it instead would\n",
    "    place every purchase within seconds of 1970-01-01 and make all calendar\n",
    "    features meaningless. `raw_df` must share its index with `processed_df`.\n",
    "    \"\"\"\n",
    "    dated = processed_df.copy()\n",
    "    dated['PurchDate'] = pd.to_datetime(raw_df['PurchDate'], unit='s')\n",
    "    # Drop the original 'PurchDate' column once the features are extracted\n",
    "    return extract_temporal_features(dated).drop(columns=['PurchDate'])\n",
    "\n",
    "train_df_processed = add_purchase_date_features(train_df_processed, train_df)\n",
    "dev_df_processed = add_purchase_date_features(dev_df_processed, dev_df)\n",
    "test_df_processed = add_purchase_date_features(test_df_processed, test_df)\n",
    "\n",
    "# Update the list of numerical columns to include the new temporal features\n",
    "# (guarded so that re-running this cell does not append duplicates)\n",
    "numeric_cols.extend(col for col in temporal_features if col not in numeric_cols)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3cf03f86",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T12:48:38.524964Z",
     "iopub.status.busy": "2024-10-18T12:48:38.524313Z",
     "iopub.status.idle": "2024-10-18T12:48:38.536565Z",
     "shell.execute_reply": "2024-10-18T12:48:38.535632Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "column_info\n",
      "{'Category': [], 'Numeric': ['IsBadBuy', 'Auction', 'VehYear', 'VehicleAge', 'Make', 'Model', 'Trim', 'SubModel', 'Color', 'Transmission', 'WheelTypeID', 'WheelType', 'VehOdo', 'Nationality', 'Size', 'TopThreeAmericanName', 'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice', 'PRIMEUNIT', 'AUCGUART', 'BYRNO', 'VNZIP1', 'VNST', 'VehBCost', 'IsOnlineSale', 'WarrantyCost', 'Year', 'Month', 'Day', 'DayOfWeek', 'IsWeekend', 'Quarter'], 'Datetime': [], 'Others': []}\n"
     ]
    }
   ],
   "source": [
    "from metagpt.tools.libs.data_preprocess import get_column_info\n",
    "\n",
    "# Summarise the column types of the processed training frame\n",
    "column_info = get_column_info(train_df_processed)\n",
    "print(\"column_info\", column_info, sep=\"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ced1b287",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T12:50:06.605466Z",
     "iopub.status.busy": "2024-10-18T12:50:06.604793Z",
     "iopub.status.idle": "2024-10-18T12:51:47.471373Z",
     "shell.execute_reply": "2024-10-18T12:51:47.469646Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import f1_score\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.ensemble import StackingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n",
    "from sklearn.impute import SimpleImputer\n",
    "\n",
    "# Split each frame into features and the 'IsBadBuy' target; the test frame is taken as is\n",
    "y_train = train_df_processed['IsBadBuy']\n",
    "X_train = train_df_processed.drop(columns=['IsBadBuy'])\n",
    "y_dev = dev_df_processed['IsBadBuy']\n",
    "X_dev = dev_df_processed.drop(columns=['IsBadBuy'])\n",
    "X_test = test_df_processed\n",
    "\n",
    "# Mean-impute the remaining gaps with an imputer fitted on the training features only\n",
    "imputer = SimpleImputer(strategy='mean').fit(X_train)\n",
    "X_train, X_dev, X_test = (imputer.transform(features) for features in (X_train, X_dev, X_test))\n",
    "\n",
    "# Base learners: one gradient-boosted model and two randomised tree ensembles with shared settings\n",
    "forest_params = dict(n_estimators=200, max_depth=10, random_state=42)\n",
    "base_models = [\n",
    "    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, learning_rate=0.1, max_depth=5)),\n",
    "    ('rf', RandomForestClassifier(**forest_params)),\n",
    "    ('et', ExtraTreesClassifier(**forest_params)),\n",
    "]\n",
    "\n",
    "# Logistic regression combines the base-model outputs\n",
    "meta_model = LogisticRegression()\n",
    "\n",
    "# Build the stacking ensemble (cv=5) and train it on the training split\n",
    "stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n",
    "stacking_model.fit(X_train, y_train)\n",
    "\n",
    "# Score the dev-set predictions with F1\n",
    "y_dev_pred = stacking_model.predict(X_dev)\n",
    "dev_f1 = f1_score(y_dev, y_dev_pred)\n",
    "print(f\"Dev Set F1 Score: {dev_f1}\")\n",
    "\n",
    "# Predict the unlabeled test set\n",
    "y_test_pred = stacking_model.predict(X_test)\n",
    "\n",
    "# Write both prediction vectors as single-column CSV files\n",
    "import os\n",
    "output_dir = '../workspace/kick'\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "for csv_name, predictions in (('dev_predictions.csv', y_dev_pred), ('test_predictions.csv', y_test_pred)):\n",
    "    pd.DataFrame({'target': predictions}).to_csv(os.path.join(output_dir, csv_name), index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "5f9bc413",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T12:51:54.885101Z",
     "iopub.status.busy": "2024-10-18T12:51:54.884243Z",
     "iopub.status.idle": "2024-10-18T12:51:55.317798Z",
     "shell.execute_reply": "2024-10-18T12:51:55.315251Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Classification Report for Dev Set:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.88      0.98      0.93     12775\n",
      "           1       0.42      0.09      0.16      1822\n",
      "\n",
      "    accuracy                           0.87     14597\n",
      "   macro avg       0.65      0.54      0.54     14597\n",
      "weighted avg       0.83      0.87      0.83     14597\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the base model on the dev set and print the performance metrics\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "# Refresh the dev-set predictions from the fitted stacking model\n",
    "y_dev_pred = stacking_model.predict(X_dev)\n",
    "\n",
    "# Build the per-class precision / recall / F1 report and print it under its heading\n",
    "dev_report = classification_report(y_dev, y_dev_pred)\n",
    "print(\"Classification Report for Dev Set:\", dev_report, sep=\"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ac911eab",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T12:52:00.974526Z",
     "iopub.status.busy": "2024-10-18T12:52:00.974078Z",
     "iopub.status.idle": "2024-10-18T12:52:00.983041Z",
     "shell.execute_reply": "2024-10-18T12:52:00.981541Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "column_info\n",
      "{'Category': [], 'Numeric': ['IsBadBuy', 'Auction', 'VehYear', 'VehicleAge', 'Make', 'Model', 'Trim', 'SubModel', 'Color', 'Transmission', 'WheelTypeID', 'WheelType', 'VehOdo', 'Nationality', 'Size', 'TopThreeAmericanName', 'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice', 'PRIMEUNIT', 'AUCGUART', 'BYRNO', 'VNZIP1', 'VNST', 'VehBCost', 'IsOnlineSale', 'WarrantyCost', 'Year', 'Month', 'Day', 'DayOfWeek', 'IsWeekend', 'Quarter'], 'Datetime': [], 'Others': []}\n"
     ]
    }
   ],
   "source": [
    "from metagpt.tools.libs.data_preprocess import get_column_info\n",
    "\n",
    "# Summarise the column types of the processed train DataFrame from the finished tasks\n",
    "column_info = get_column_info(train_df_processed)\n",
    "print(\"column_info\", column_info, sep=\"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8769400c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T12:52:40.238093Z",
     "iopub.status.busy": "2024-10-18T12:52:40.237242Z",
     "iopub.status.idle": "2024-10-18T12:54:19.500993Z",
     "shell.execute_reply": "2024-10-18T12:54:19.498649Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import f1_score\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.ensemble import StackingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n",
    "from sklearn.impute import SimpleImputer\n",
    "\n",
    "# Split each frame into features and the 'IsBadBuy' target; the test frame is taken as is\n",
    "y_train = train_df_processed['IsBadBuy']\n",
    "X_train = train_df_processed.drop(columns=['IsBadBuy'])\n",
    "y_dev = dev_df_processed['IsBadBuy']\n",
    "X_dev = dev_df_processed.drop(columns=['IsBadBuy'])\n",
    "X_test = test_df_processed\n",
    "\n",
    "# Mean-impute the remaining gaps with an imputer fitted on the training features only\n",
    "imputer = SimpleImputer(strategy='mean').fit(X_train)\n",
    "X_train, X_dev, X_test = (imputer.transform(features) for features in (X_train, X_dev, X_test))\n",
    "\n",
    "# Base learners: one gradient-boosted model and two randomised tree ensembles with shared settings\n",
    "forest_params = dict(n_estimators=200, max_depth=10, random_state=42)\n",
    "base_models = [\n",
    "    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, learning_rate=0.1, max_depth=5)),\n",
    "    ('rf', RandomForestClassifier(**forest_params)),\n",
    "    ('et', ExtraTreesClassifier(**forest_params)),\n",
    "]\n",
    "\n",
    "# Logistic regression combines the base-model outputs\n",
    "meta_model = LogisticRegression()\n",
    "\n",
    "# Build the stacking ensemble (cv=5) and train it on the training split\n",
    "stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n",
    "stacking_model.fit(X_train, y_train)\n",
    "\n",
    "# Score the dev-set predictions with F1\n",
    "y_dev_pred = stacking_model.predict(X_dev)\n",
    "dev_f1 = f1_score(y_dev, y_dev_pred)\n",
    "print(f\"Dev Set F1 Score: {dev_f1}\")\n",
    "\n",
    "# Predict the unlabeled test set\n",
    "y_test_pred = stacking_model.predict(X_test)\n",
    "\n",
    "# Write both prediction vectors as single-column CSV files\n",
    "import os\n",
    "output_dir = '../workspace/kick'\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "for csv_name, predictions in (('dev_predictions.csv', y_dev_pred), ('test_predictions.csv', y_test_pred)):\n",
    "    pd.DataFrame({'target': predictions}).to_csv(os.path.join(output_dir, csv_name), index=False)\n",
    "\n",
    "# Evaluate the model on the dev set and print the performance metrics\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "# Refresh the dev-set predictions from the trained stacking model\n",
    "y_dev_pred = stacking_model.predict(X_dev)\n",
    "\n",
    "# Build the per-class precision / recall / F1 report and print it under its heading\n",
    "dev_report = classification_report(y_dev, y_dev_pred)\n",
    "print(\"Classification Report for Dev Set:\", dev_report, sep=\"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a7836072",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T12:54:27.555494Z",
     "iopub.status.busy": "2024-10-18T12:54:27.554783Z",
     "iopub.status.idle": "2024-10-18T12:54:28.016957Z",
     "shell.execute_reply": "2024-10-18T12:54:28.014790Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Classification Report for Dev Set:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.88      0.98      0.93     12775\n",
      "           1       0.42      0.09      0.16      1822\n",
      "\n",
      "    accuracy                           0.87     14597\n",
      "   macro avg       0.65      0.54      0.54     14597\n",
      "weighted avg       0.83      0.87      0.83     14597\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the base model on the dev set and print the performance metrics\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "# Refresh the dev-set predictions from the fitted stacking model\n",
    "y_dev_pred = stacking_model.predict(X_dev)\n",
    "\n",
    "# Build the per-class precision / recall / F1 report and print it under its heading\n",
    "dev_report = classification_report(y_dev, y_dev_pred)\n",
    "print(\"Classification Report for Dev Set:\", dev_report, sep=\"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c63d0e50",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T12:54:34.659761Z",
     "iopub.status.busy": "2024-10-18T12:54:34.659076Z",
     "iopub.status.idle": "2024-10-18T12:54:35.109390Z",
     "shell.execute_reply": "2024-10-18T12:54:35.107909Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Classification Report for Dev Set:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.88      0.98      0.93     12775\n",
      "           1       0.42      0.09      0.16      1822\n",
      "\n",
      "    accuracy                           0.87     14597\n",
      "   macro avg       0.65      0.54      0.54     14597\n",
      "weighted avg       0.83      0.87      0.83     14597\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the base model on the dev set and print the performance metrics\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "# Refresh the dev-set predictions from the fitted stacking model\n",
    "y_dev_pred = stacking_model.predict(X_dev)\n",
    "\n",
    "# Build the per-class precision / recall / F1 report and print it under its heading\n",
    "dev_report = classification_report(y_dev, y_dev_pred)\n",
    "print(\"Classification Report for Dev Set:\", dev_report, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "00123973",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T12:54:41.915423Z",
     "iopub.status.busy": "2024-10-18T12:54:41.914744Z",
     "iopub.status.idle": "2024-10-18T12:54:42.333685Z",
     "shell.execute_reply": "2024-10-18T12:54:42.332003Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Classification Report for Dev Set:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.88      0.98      0.93     12775\n",
      "           1       0.42      0.09      0.16      1822\n",
      "\n",
      "    accuracy                           0.87     14597\n",
      "   macro avg       0.65      0.54      0.54     14597\n",
      "weighted avg       0.83      0.87      0.83     14597\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the base model on the dev set and print the performance metrics\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "# Refresh the dev-set predictions from the fitted stacking model\n",
    "y_dev_pred = stacking_model.predict(X_dev)\n",
    "\n",
    "# Build the per-class precision / recall / F1 report and print it under its heading\n",
    "dev_report = classification_report(y_dev, y_dev_pred)\n",
    "print(\"Classification Report for Dev Set:\", dev_report, sep=\"\\n\")"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
