{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b4ffe6e8",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T16:30:31.499553Z",
     "iopub.status.busy": "2024-10-18T16:30:31.499019Z",
     "iopub.status.idle": "2024-10-18T16:30:31.968361Z",
     "shell.execute_reply": "2024-10-18T16:30:31.967463Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Basic Information about the Train Dataset:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 2938 entries, 0 to 2937\n",
      "Data columns (total 12 columns):\n",
      " #   Column  Non-Null Count  Dtype  \n",
      "---  ------  --------------  -----  \n",
      " 0   V1      2938 non-null   float64\n",
      " 1   V2      2938 non-null   float64\n",
      " 2   V3      2938 non-null   float64\n",
      " 3   V4      2938 non-null   float64\n",
      " 4   V5      2938 non-null   float64\n",
      " 5   V6      2938 non-null   float64\n",
      " 6   V7      2938 non-null   float64\n",
      " 7   V8      2938 non-null   float64\n",
      " 8   V9      2938 non-null   float64\n",
      " 9   V10     2938 non-null   float64\n",
      " 10  V11     2938 non-null   float64\n",
      " 11  Class   2938 non-null   int64  \n",
      "dtypes: float64(11), int64(1)\n",
      "memory usage: 275.6 KB\n",
      "None\n",
      "\n",
      "First few rows of the Train Dataset:\n",
      "    V1    V2    V3    V4     V5    V6     V7       V8    V9   V10   V11  Class\n",
      "0  7.0  0.29  0.26   1.6  0.044  12.0   87.0  0.99230  3.08  0.46  10.5      4\n",
      "1  6.5  0.20  0.50  18.1  0.054  50.0  221.0  0.99941  2.94  0.64   8.8      4\n",
      "2  7.4  0.19  0.31  14.5  0.045  39.0  193.0  0.99860  3.10  0.50   9.2      4\n",
      "3  6.0  0.33  0.26   5.1  0.051  16.0  119.0  0.99416  3.15  0.41   9.2      3\n",
      "4  6.0  0.26  0.32   3.8  0.029  48.0  180.0  0.99011  3.15  0.34  12.0      4\n",
      "\n",
      "Summary Statistics for Numerical Columns:\n",
      "                V1           V2           V3           V4           V5  \\\n",
      "count  2938.000000  2938.000000  2938.000000  2938.000000  2938.000000   \n",
      "mean      6.847583     0.277362     0.334704     6.418482     0.045776   \n",
      "std       0.836395     0.101146     0.118871     5.028103     0.021986   \n",
      "min       3.800000     0.080000     0.000000     0.600000     0.009000   \n",
      "25%       6.300000     0.210000     0.270000     1.800000     0.036000   \n",
      "50%       6.800000     0.260000     0.320000     5.400000     0.043000   \n",
      "75%       7.300000     0.320000     0.390000     9.800000     0.050000   \n",
      "max      10.300000     1.005000     1.230000    65.800000     0.290000   \n",
      "\n",
      "                V6           V7           V8           V9          V10  \\\n",
      "count  2938.000000  2938.000000  2938.000000  2938.000000  2938.000000   \n",
      "mean     35.529442   138.489789     0.994068     3.190939     0.490129   \n",
      "std      17.286341    42.925975     0.003026     0.154362     0.112972   \n",
      "min       2.000000     9.000000     0.987110     2.720000     0.250000   \n",
      "25%      23.125000   108.000000     0.991740     3.090000     0.410000   \n",
      "50%      34.000000   134.000000     0.993825     3.180000     0.470000   \n",
      "75%      46.000000   168.000000     0.996200     3.280000     0.550000   \n",
      "max     289.000000   440.000000     1.038980     3.820000     1.080000   \n",
      "\n",
      "               V11        Class  \n",
      "count  2938.000000  2938.000000  \n",
      "mean     10.500447     3.866916  \n",
      "std       1.224625     0.880880  \n",
      "min       8.000000     1.000000  \n",
      "25%       9.400000     3.000000  \n",
      "50%      10.400000     4.000000  \n",
      "75%      11.400000     4.000000  \n",
      "max      14.050000     7.000000  \n",
      "\n",
      "Missing Values in the Train Dataset:\n",
      "V1       0\n",
      "V2       0\n",
      "V3       0\n",
      "V4       0\n",
      "V5       0\n",
      "V6       0\n",
      "V7       0\n",
      "V8       0\n",
      "V9       0\n",
      "V10      0\n",
      "V11      0\n",
      "Class    0\n",
      "dtype: int64\n",
      "\n",
      "Distribution of the Target Column 'Class':\n",
      "Class\n",
      "4    1302\n",
      "3     894\n",
      "5     527\n",
      "2     104\n",
      "6     101\n",
      "1       8\n",
      "7       2\n",
      "Name: count, dtype: int64\n",
      "\n",
      "Correlation Matrix for Numerical Features:\n",
      "             V1        V2        V3        V4        V5        V6        V7  \\\n",
      "V1     1.000000  0.002963  0.289830  0.089740  0.037553 -0.037302  0.098135   \n",
      "V2     0.002963  1.000000 -0.137952  0.074792  0.078061 -0.091265  0.099419   \n",
      "V3     0.289830 -0.137952  1.000000  0.079508  0.128055  0.090799  0.139039   \n",
      "V4     0.089740  0.074792  0.079508  1.000000  0.095135  0.290131  0.408267   \n",
      "V5     0.037553  0.078061  0.128055  0.095135  1.000000  0.085324  0.186074   \n",
      "V6    -0.037302 -0.091265  0.090799  0.290131  0.085324  1.000000  0.612612   \n",
      "V7     0.098135  0.099419  0.139039  0.408267  0.186074  0.612612  1.000000   \n",
      "V8     0.270519  0.056320  0.149570  0.838659  0.258739  0.279164  0.527171   \n",
      "V9    -0.425863 -0.055267 -0.156217 -0.193751 -0.090197 -0.017159  0.003335   \n",
      "V10   -0.001672 -0.033935  0.086039 -0.012097  0.028218  0.060661  0.150862   \n",
      "V11   -0.130240  0.046026 -0.084173 -0.454094 -0.352852 -0.243381 -0.449616   \n",
      "Class -0.104994 -0.209517 -0.010343 -0.088310 -0.204473  0.010767 -0.155251   \n",
      "\n",
      "             V8        V9       V10       V11     Class  \n",
      "V1     0.270519 -0.425863 -0.001672 -0.130240 -0.104994  \n",
      "V2     0.056320 -0.055267 -0.033935  0.046026 -0.209517  \n",
      "V3     0.149570 -0.156217  0.086039 -0.084173 -0.010343  \n",
      "V4     0.838659 -0.193751 -0.012097 -0.454094 -0.088310  \n",
      "V5     0.258739 -0.090197  0.028218 -0.352852 -0.204473  \n",
      "V6     0.279164 -0.017159  0.060661 -0.243381  0.010767  \n",
      "V7     0.527171  0.003335  0.150862 -0.449616 -0.155251  \n",
      "V8     1.000000 -0.077633  0.097006 -0.772220 -0.294456  \n",
      "V9    -0.077633  1.000000  0.161076  0.108778  0.102834  \n",
      "V10    0.097006  0.161076  1.000000 -0.029276  0.062033  \n",
      "V11   -0.772220  0.108778 -0.029276  1.000000  0.435316  \n",
      "Class -0.294456  0.102834  0.062033  0.435316  1.000000  \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Load the train dataset\n",
    "train_df = pd.read_csv('/data/datasets/wine-quality-white/split_train.csv')\n",
    "\n",
    "# Display basic information about the dataset\n",
    "print(\"Basic Information about the Train Dataset:\")\n",
    "print(train_df.info())\n",
    "\n",
    "# Display the first few rows of the dataset\n",
    "print(\"\\nFirst few rows of the Train Dataset:\")\n",
    "print(train_df.head())\n",
    "\n",
    "# Summary statistics for numerical columns\n",
    "print(\"\\nSummary Statistics for Numerical Columns:\")\n",
    "print(train_df.describe())\n",
    "\n",
    "# Check for missing values\n",
    "print(\"\\nMissing Values in the Train Dataset:\")\n",
    "print(train_df.isnull().sum())\n",
    "\n",
    "# Check the distribution of the target column 'Class'\n",
    "print(\"\\nDistribution of the Target Column 'Class':\")\n",
    "print(train_df['Class'].value_counts())\n",
    "\n",
    "# Correlation matrix for numerical features\n",
    "print(\"\\nCorrelation Matrix for Numerical Features:\")\n",
    "print(train_df.corr())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e4b7f1c6",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T16:30:31.984302Z",
     "iopub.status.busy": "2024-10-18T16:30:31.984036Z",
     "iopub.status.idle": "2024-10-18T16:30:32.391274Z",
     "shell.execute_reply": "2024-10-18T16:30:32.390454Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First few rows of the processed Train Dataset:\n",
      "         V1        V2        V3        V4        V5        V6        V7  \\\n",
      "0  0.182261  0.124968 -0.628550 -0.958473 -0.080811 -1.361390 -1.199706   \n",
      "1 -0.415644 -0.764986  1.390783  2.323641  0.374108  0.837252  1.922478   \n",
      "2  0.660586 -0.863870 -0.207856  1.607543 -0.035319  0.200803  1.270082   \n",
      "3 -1.013549  0.520503 -0.628550 -0.262267  0.237632 -1.129954 -0.454110   \n",
      "4 -1.013549 -0.171683 -0.123717 -0.520858 -0.763189  0.721534  0.967183   \n",
      "\n",
      "         V8        V9       V10       V11  Class  \n",
      "0 -0.584401 -0.718819 -0.266743 -0.000365      4  \n",
      "1  1.765831 -1.625934  1.326844 -1.388781      4  \n",
      "2  1.498083 -0.589232  0.087388 -1.062095      4  \n",
      "3  0.030428 -0.265262 -0.709406 -1.062095      3  \n",
      "4 -1.308312 -0.265262 -1.329134  1.224708      4  \n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Function to preprocess the dataset\n",
    "def preprocess_data(df):\n",
    "    df_copy = df.copy()\n",
    "    \n",
    "    # Separate features and target\n",
    "    if 'Class' in df_copy.columns:\n",
    "        X = df_copy.drop(columns=['Class'])\n",
    "        y = df_copy['Class']\n",
    "    else:\n",
    "        X = df_copy\n",
    "        y = None\n",
    "    \n",
    "    # Standardize numerical features\n",
    "    scaler = StandardScaler()\n",
    "    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)\n",
    "    \n",
    "    # Combine features and target if target is present\n",
    "    if y is not None:\n",
    "        df_processed = X_scaled.copy()\n",
    "        df_processed['Class'] = y\n",
    "    else:\n",
    "        df_processed = X_scaled\n",
    "    \n",
    "    return df_processed\n",
    "\n",
    "# Preprocess train, dev, and test sets\n",
    "train_df_processed = preprocess_data(train_df)\n",
    "dev_df = pd.read_csv('/data/datasets/wine-quality-white/split_dev.csv')\n",
    "dev_df_processed = preprocess_data(dev_df)\n",
    "test_df = pd.read_csv('/data/datasets/wine-quality-white/split_test_wo_target.csv')\n",
    "test_df_processed = preprocess_data(test_df)\n",
    "\n",
    "# Display the first few rows of the processed train dataset\n",
    "print(\"First few rows of the processed Train Dataset:\")\n",
    "print(train_df_processed.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "80d78541",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T16:30:32.403514Z",
     "iopub.status.busy": "2024-10-18T16:30:32.403164Z",
     "iopub.status.idle": "2024-10-18T16:30:32.433866Z",
     "shell.execute_reply": "2024-10-18T16:30:32.433031Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First few rows of the engineered Train Dataset:\n",
      "         V1        V2        V3        V4        V5        V6        V7  \\\n",
      "0  0.182261  0.124968 -0.628550 -0.958473 -0.080811 -1.361390 -1.199706   \n",
      "1 -0.415644 -0.764986  1.390783  2.323641  0.374108  0.837252  1.922478   \n",
      "2  0.660586 -0.863870 -0.207856  1.607543 -0.035319  0.200803  1.270082   \n",
      "3 -1.013549  0.520503 -0.628550 -0.262267  0.237632 -1.129954 -0.454110   \n",
      "4 -1.013549 -0.171683 -0.123717 -0.520858 -0.763189  0.721534  0.967183   \n",
      "\n",
      "         V8        V9       V10  ...     V7 V9    V7 V10    V7 V11     V8 V9  \\\n",
      "0 -0.584401 -0.718819 -0.266743  ...  0.862372  0.320013  0.000438  0.420079   \n",
      "1  1.765831 -1.625934  1.326844  ... -3.125822  2.550829 -2.669902 -2.871124   \n",
      "2  1.498083 -0.589232  0.087388  ... -0.748372  0.110989 -1.348947 -0.882718   \n",
      "3  0.030428 -0.265262 -0.709406  ...  0.120458  0.322148  0.482308 -0.008071   \n",
      "4 -1.308312 -0.265262 -1.329134  ... -0.256557 -1.285516  1.184517  0.347046   \n",
      "\n",
      "     V8 V10    V8 V11    V9 V10    V9 V11   V10 V11  Class  \n",
      "0  0.155885  0.000213  0.191740  0.000262  0.000097      4  \n",
      "1  2.342982 -2.452353 -2.157360  2.258066 -1.842696      4  \n",
      "2  0.130914 -1.591107 -0.051491  0.625820 -0.092814      4  \n",
      "3 -0.021586 -0.032317  0.188179  0.281734  0.753456      3  \n",
      "4  1.738922 -1.602300  0.352569 -0.324869 -1.627801      4  \n",
      "\n",
      "[5 rows x 67 columns]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "\n",
    "def engineer_features(df):\n",
    "    df_copy = df.copy()\n",
    "    if 'Class' in df_copy.columns:\n",
    "        X = df_copy.drop(columns=['Class'])\n",
    "        y = df_copy['Class']\n",
    "    else:\n",
    "        X = df_copy\n",
    "        y = None\n",
    "    \n",
    "    # Polynomial Features\n",
    "    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n",
    "    X_poly = poly.fit_transform(X)\n",
    "    X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(X.columns))\n",
    "    \n",
    "    if y is not None:\n",
    "        df_engineered = X_poly_df.copy()\n",
    "        df_engineered['Class'] = y\n",
    "    else:\n",
    "        df_engineered = X_poly_df\n",
    "    \n",
    "    return df_engineered\n",
    "\n",
    "train_df_engineered = engineer_features(train_df_processed)\n",
    "dev_df_engineered = engineer_features(dev_df_processed)\n",
    "test_df_engineered = engineer_features(test_df_processed)\n",
    "\n",
    "print(\"First few rows of the engineered Train Dataset:\")\n",
    "print(train_df_engineered.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5676a93b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T16:30:40.453940Z",
     "iopub.status.busy": "2024-10-18T16:30:40.453297Z",
     "iopub.status.idle": "2024-10-18T16:30:41.518977Z",
     "shell.execute_reply": "2024-10-18T16:30:41.518132Z"
    }
   },
   "outputs": [],
   "source": [
    "from metagpt.tools.libs.data_preprocess import get_column_info\n",
    "\n",
    "# Get column information for the processed and engineered train dataset\n",
    "column_info_train_processed = get_column_info(train_df_processed)\n",
    "print(\"Column Info for Processed Train Dataset:\")\n",
    "print(column_info_train_processed)\n",
    "\n",
    "column_info_train_engineered = get_column_info(train_df_engineered)\n",
    "print(\"\\nColumn Info for Engineered Train Dataset:\")\n",
    "print(column_info_train_engineered)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "acca4f1f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T16:31:03.846404Z",
     "iopub.status.busy": "2024-10-18T16:31:03.845462Z",
     "iopub.status.idle": "2024-10-18T16:31:06.126307Z",
     "shell.execute_reply": "2024-10-18T16:31:06.125359Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F1 Weighted Score on Dev Set: 0.6201733091709429\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import f1_score\n",
    "import joblib\n",
    "\n",
    "# Split the data into features and target\n",
    "X_train = train_df_engineered.drop(columns=['Class'])\n",
    "y_train = train_df_engineered['Class']\n",
    "X_dev = dev_df_engineered.drop(columns=['Class'])\n",
    "y_dev = dev_df_engineered['Class']\n",
    "\n",
    "# Train a base model (RandomForestClassifier)\n",
    "base_model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
    "base_model.fit(X_train, y_train)\n",
    "\n",
    "# Predict on the dev set\n",
    "y_dev_pred = base_model.predict(X_dev)\n",
    "\n",
    "# Evaluate the base model\n",
    "f1_weighted = f1_score(y_dev, y_dev_pred, average='weighted')\n",
    "print(f\"F1 Weighted Score on Dev Set: {f1_weighted}\")\n",
    "\n",
    "# Save the base model\n",
    "joblib.dump(base_model, '../workspace/wine-quality-white/base_model.pkl')\n",
    "\n",
    "# Predict on the test set\n",
    "X_test = test_df_engineered\n",
    "y_test_pred = base_model.predict(X_test)\n",
    "\n",
    "# Save the predictions\n",
    "pd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/wine-quality-white/dev_predictions.csv', index=False)\n",
    "pd.DataFrame({'target': y_test_pred}).to_csv('../workspace/wine-quality-white/test_predictions.csv', index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "bdba15cd",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T16:31:11.701385Z",
     "iopub.status.busy": "2024-10-18T16:31:11.700743Z",
     "iopub.status.idle": "2024-10-18T16:31:11.750615Z",
     "shell.execute_reply": "2024-10-18T16:31:11.749509Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F1 Weighted Score on Dev Set: 0.6201733091709429\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the base model on the dev set and print the f1 weighted score\n",
    "y_dev_pred = base_model.predict(X_dev)\n",
    "f1_weighted = f1_score(y_dev, y_dev_pred, average='weighted')\n",
    "print(f\"F1 Weighted Score on Dev Set: {f1_weighted}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "1160f7dd",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T16:31:17.281116Z",
     "iopub.status.busy": "2024-10-18T16:31:17.280462Z",
     "iopub.status.idle": "2024-10-18T16:31:17.291011Z",
     "shell.execute_reply": "2024-10-18T16:31:17.289869Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "column_info\n",
      "{'Category': [], 'Numeric': ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V1 V2', 'V1 V3', 'V1 V4', 'V1 V5', 'V1 V6', 'V1 V7', 'V1 V8', 'V1 V9', 'V1 V10', 'V1 V11', 'V2 V3', 'V2 V4', 'V2 V5', 'V2 V6', 'V2 V7', 'V2 V8', 'V2 V9', 'V2 V10', 'V2 V11', 'V3 V4', 'V3 V5', 'V3 V6', 'V3 V7', 'V3 V8', 'V3 V9', 'V3 V10', 'V3 V11', 'V4 V5', 'V4 V6', 'V4 V7', 'V4 V8', 'V4 V9', 'V4 V10', 'V4 V11', 'V5 V6', 'V5 V7', 'V5 V8', 'V5 V9', 'V5 V10', 'V5 V11', 'V6 V7', 'V6 V8', 'V6 V9', 'V6 V10', 'V6 V11', 'V7 V8', 'V7 V9', 'V7 V10', 'V7 V11', 'V8 V9', 'V8 V10', 'V8 V11', 'V9 V10', 'V9 V11', 'V10 V11', 'Class'], 'Datetime': [], 'Others': []}\n"
     ]
    }
   ],
   "source": [
    "from metagpt.tools.libs.data_preprocess import get_column_info\n",
    "\n",
    "# Check the latest column information for the processed and engineered train dataset\n",
    "column_info = get_column_info(train_df_engineered)\n",
    "print(\"column_info\")\n",
    "print(column_info)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "701d82aa",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T16:32:16.923357Z",
     "iopub.status.busy": "2024-10-18T16:32:16.922696Z",
     "iopub.status.idle": "2024-10-18T16:32:36.955131Z",
     "shell.execute_reply": "2024-10-18T16:32:36.954345Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import StackingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.metrics import f1_score\n",
    "import joblib\n",
    "\n",
    "# Define base models without CatBoostClassifier\n",
    "base_models = [\n",
    "    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),\n",
    "    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))\n",
    "]\n",
    "\n",
    "# Define the meta-model\n",
    "meta_model = LogisticRegression(random_state=42)\n",
    "\n",
    "# Create the stacking ensemble model\n",
    "stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n",
    "\n",
    "# Train the stacking ensemble model\n",
    "stacking_model.fit(X_train, y_train)\n",
    "\n",
    "# Predict on the dev set\n",
    "y_dev_pred = stacking_model.predict(X_dev)\n",
    "\n",
    "# Evaluate the stacking model\n",
    "f1_weighted = f1_score(y_dev, y_dev_pred, average='weighted')\n",
    "print(f\"F1 Weighted Score on Dev Set with Stacking Model: {f1_weighted}\")\n",
    "\n",
    "# Save the stacking model\n",
    "joblib.dump(stacking_model, '../workspace/wine-quality-white/stacking_model.pkl')\n",
    "\n",
    "# Predict on the test set\n",
    "y_test_pred = stacking_model.predict(X_test)\n",
    "\n",
    "# Save the predictions\n",
    "pd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/wine-quality-white/dev_predictions.csv', index=False)\n",
    "pd.DataFrame({'target': y_test_pred}).to_csv('../workspace/wine-quality-white/test_predictions.csv', index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9b390f63",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T16:32:43.425660Z",
     "iopub.status.busy": "2024-10-18T16:32:43.424994Z",
     "iopub.status.idle": "2024-10-18T16:32:43.500919Z",
     "shell.execute_reply": "2024-10-18T16:32:43.499959Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F1 Weighted Score on Dev Set with Stacking Model: 0.6323695473807042\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the stacking model on the dev set and print the f1 weighted score\n",
    "y_dev_pred = stacking_model.predict(X_dev)\n",
    "f1_weighted = f1_score(y_dev, y_dev_pred, average='weighted')\n",
    "print(f\"F1 Weighted Score on Dev Set with Stacking Model: {f1_weighted}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "5c2f07aa",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T16:32:48.421837Z",
     "iopub.status.busy": "2024-10-18T16:32:48.421186Z",
     "iopub.status.idle": "2024-10-18T16:32:48.432647Z",
     "shell.execute_reply": "2024-10-18T16:32:48.431487Z"
    }
   },
   "outputs": [],
   "source": [
    "# Save the predictions of the ensemble model on the dev set to 'dev_predictions.csv'\n",
    "pd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/wine-quality-white/dev_predictions.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "9773f49d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T16:32:54.388592Z",
     "iopub.status.busy": "2024-10-18T16:32:54.387923Z",
     "iopub.status.idle": "2024-10-18T16:32:54.467805Z",
     "shell.execute_reply": "2024-10-18T16:32:54.466544Z"
    }
   },
   "outputs": [],
   "source": [
    "# Save the predictions of the ensemble model on the test set to 'test_predictions.csv'\n",
    "y_test_pred = stacking_model.predict(X_test)\n",
    "pd.DataFrame({'target': y_test_pred}).to_csv('../workspace/wine-quality-white/test_predictions.csv', index=False)\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
