{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "805f9cfb",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T10:26:32.491281Z",
     "iopub.status.busy": "2024-10-18T10:26:32.490744Z",
     "iopub.status.idle": "2024-10-18T10:26:32.949414Z",
     "shell.execute_reply": "2024-10-18T10:26:32.948565Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train Data Info:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 600 entries, 0 to 599\n",
      "Data columns (total 21 columns):\n",
      " #   Column                  Non-Null Count  Dtype  \n",
      "---  ------                  --------------  -----  \n",
      " 0   checking_status         600 non-null    object \n",
      " 1   duration                600 non-null    int64  \n",
      " 2   credit_history          600 non-null    object \n",
      " 3   purpose                 600 non-null    object \n",
      " 4   credit_amount           600 non-null    float64\n",
      " 5   savings_status          600 non-null    object \n",
      " 6   employment              600 non-null    object \n",
      " 7   installment_commitment  600 non-null    int64  \n",
      " 8   personal_status         600 non-null    object \n",
      " 9   other_parties           600 non-null    object \n",
      " 10  residence_since         600 non-null    int64  \n",
      " 11  property_magnitude      600 non-null    object \n",
      " 12  age                     600 non-null    int64  \n",
      " 13  other_payment_plans     600 non-null    object \n",
      " 14  housing                 600 non-null    object \n",
      " 15  existing_credits        600 non-null    int64  \n",
      " 16  job                     600 non-null    object \n",
      " 17  num_dependents          600 non-null    int64  \n",
      " 18  own_telephone           600 non-null    object \n",
      " 19  foreign_worker          600 non-null    object \n",
      " 20  class                   600 non-null    object \n",
      "dtypes: float64(1), int64(6), object(14)\n",
      "memory usage: 98.6+ KB\n",
      "None\n",
      "\n",
      "Dev Data Info:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 200 entries, 0 to 199\n",
      "Data columns (total 21 columns):\n",
      " #   Column                  Non-Null Count  Dtype  \n",
      "---  ------                  --------------  -----  \n",
      " 0   checking_status         200 non-null    object \n",
      " 1   duration                200 non-null    int64  \n",
      " 2   credit_history          200 non-null    object \n",
      " 3   purpose                 200 non-null    object \n",
      " 4   credit_amount           200 non-null    float64\n",
      " 5   savings_status          200 non-null    object \n",
      " 6   employment              200 non-null    object \n",
      " 7   installment_commitment  200 non-null    int64  \n",
      " 8   personal_status         200 non-null    object \n",
      " 9   other_parties           200 non-null    object \n",
      " 10  residence_since         200 non-null    int64  \n",
      " 11  property_magnitude      200 non-null    object \n",
      " 12  age                     200 non-null    int64  \n",
      " 13  other_payment_plans     200 non-null    object \n",
      " 14  housing                 200 non-null    object \n",
      " 15  existing_credits        200 non-null    int64  \n",
      " 16  job                     200 non-null    object \n",
      " 17  num_dependents          200 non-null    int64  \n",
      " 18  own_telephone           200 non-null    object \n",
      " 19  foreign_worker          200 non-null    object \n",
      " 20  class                   200 non-null    object \n",
      "dtypes: float64(1), int64(6), object(14)\n",
      "memory usage: 32.9+ KB\n",
      "None\n",
      "\n",
      "Train Data Summary Statistics:\n",
      "         duration  credit_amount  installment_commitment  residence_since  \\\n",
      "count  600.000000     600.000000              600.000000       600.000000   \n",
      "mean    21.210000    3322.735000                2.950000         2.871667   \n",
      "std     12.548905    2931.012196                1.131209         1.075698   \n",
      "min      4.000000     250.000000                1.000000         1.000000   \n",
      "25%     12.000000    1336.000000                2.000000         2.000000   \n",
      "50%     18.000000    2330.000000                3.000000         3.000000   \n",
      "75%     24.000000    4025.500000                4.000000         4.000000   \n",
      "max     72.000000   18424.000000                4.000000         4.000000   \n",
      "\n",
      "              age  existing_credits  num_dependents  \n",
      "count  600.000000        600.000000      600.000000  \n",
      "mean    35.571667          1.400000        1.158333  \n",
      "std     11.241388          0.577832        0.365358  \n",
      "min     19.000000          1.000000        1.000000  \n",
      "25%     27.000000          1.000000        1.000000  \n",
      "50%     33.000000          1.000000        1.000000  \n",
      "75%     42.000000          2.000000        1.000000  \n",
      "max     75.000000          4.000000        2.000000  \n",
      "\n",
      "Dev Data Summary Statistics:\n",
      "         duration  credit_amount  installment_commitment  residence_since  \\\n",
      "count  200.000000      200.00000              200.000000       200.000000   \n",
      "mean    20.030000     3005.70500                2.980000         2.815000   \n",
      "std     11.354016     2447.76726                1.107126         1.134553   \n",
      "min      4.000000      368.00000                1.000000         1.000000   \n",
      "25%     12.000000     1390.25000                2.000000         2.000000   \n",
      "50%     18.000000     2247.50000                3.000000         3.000000   \n",
      "75%     24.000000     3666.75000                4.000000         4.000000   \n",
      "max     60.000000    14896.00000                4.000000         4.000000   \n",
      "\n",
      "              age  existing_credits  num_dependents  \n",
      "count  200.000000          200.0000      200.000000  \n",
      "mean    35.745000            1.4100        1.125000  \n",
      "std     11.770416            0.5947        0.331549  \n",
      "min     20.000000            1.0000        1.000000  \n",
      "25%     27.000000            1.0000        1.000000  \n",
      "50%     33.000000            1.0000        1.000000  \n",
      "75%     43.000000            2.0000        1.000000  \n",
      "max     68.000000            4.0000        2.000000  \n",
      "\n",
      "Unique Values in Categorical Columns:\n",
      "checking_status: 4 unique values\n",
      "credit_history: 5 unique values\n",
      "purpose: 10 unique values\n",
      "savings_status: 5 unique values\n",
      "employment: 5 unique values\n",
      "personal_status: 4 unique values\n",
      "other_parties: 3 unique values\n",
      "property_magnitude: 4 unique values\n",
      "other_payment_plans: 3 unique values\n",
      "housing: 3 unique values\n",
      "job: 4 unique values\n",
      "own_telephone: 2 unique values\n",
      "foreign_worker: 2 unique values\n",
      "class: 2 unique values\n",
      "\n",
      "Missing Values in Train Data:\n",
      "checking_status           0\n",
      "duration                  0\n",
      "credit_history            0\n",
      "purpose                   0\n",
      "credit_amount             0\n",
      "savings_status            0\n",
      "employment                0\n",
      "installment_commitment    0\n",
      "personal_status           0\n",
      "other_parties             0\n",
      "residence_since           0\n",
      "property_magnitude        0\n",
      "age                       0\n",
      "other_payment_plans       0\n",
      "housing                   0\n",
      "existing_credits          0\n",
      "job                       0\n",
      "num_dependents            0\n",
      "own_telephone             0\n",
      "foreign_worker            0\n",
      "class                     0\n",
      "dtype: int64\n",
      "\n",
      "Missing Values in Dev Data:\n",
      "checking_status           0\n",
      "duration                  0\n",
      "credit_history            0\n",
      "purpose                   0\n",
      "credit_amount             0\n",
      "savings_status            0\n",
      "employment                0\n",
      "installment_commitment    0\n",
      "personal_status           0\n",
      "other_parties             0\n",
      "residence_since           0\n",
      "property_magnitude        0\n",
      "age                       0\n",
      "other_payment_plans       0\n",
      "housing                   0\n",
      "existing_credits          0\n",
      "job                       0\n",
      "num_dependents            0\n",
      "own_telephone             0\n",
      "foreign_worker            0\n",
      "class                     0\n",
      "dtype: int64\n",
      "\n",
      "Target Distribution in Train Data:\n",
      "class\n",
      "good    0.681667\n",
      "bad     0.318333\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "Target Distribution in Dev Data:\n",
      "class\n",
      "good    0.735\n",
      "bad     0.265\n",
      "Name: proportion, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Load the train and dev splits\n",
    "train_data = pd.read_csv('/data/datasets/credit-g/split_train.csv')\n",
    "dev_data = pd.read_csv('/data/datasets/credit-g/split_dev.csv')\n",
    "\n",
    "# Display basic information about the datasets.\n",
    "# DataFrame.info() writes its report to stdout and returns None, so it is\n",
    "# called directly: wrapping it in print() appends a stray \"None\" line.\n",
    "print(\"Train Data Info:\")\n",
    "train_data.info()\n",
    "print(\"\\nDev Data Info:\")\n",
    "dev_data.info()\n",
    "\n",
    "# Display summary statistics for numerical columns\n",
    "print(\"\\nTrain Data Summary Statistics:\")\n",
    "print(train_data.describe())\n",
    "print(\"\\nDev Data Summary Statistics:\")\n",
    "print(dev_data.describe())\n",
    "\n",
    "# Display the number of unique values for categorical columns.\n",
    "# The selection is by object dtype on the full training frame, so the\n",
    "# object-typed target column 'class' is listed as well.\n",
    "categorical_columns = train_data.select_dtypes(include=['object']).columns\n",
    "print(\"\\nUnique Values in Categorical Columns:\")\n",
    "for col in categorical_columns:\n",
    "    print(f\"{col}: {train_data[col].nunique()} unique values\")\n",
    "\n",
    "# Check for missing values (per-column null counts)\n",
    "print(\"\\nMissing Values in Train Data:\")\n",
    "print(train_data.isnull().sum())\n",
    "print(\"\\nMissing Values in Dev Data:\")\n",
    "print(dev_data.isnull().sum())\n",
    "\n",
    "# Display the distribution of the target column as class proportions\n",
    "print(\"\\nTarget Distribution in Train Data:\")\n",
    "print(train_data['class'].value_counts(normalize=True))\n",
    "print(\"\\nTarget Distribution in Dev Data:\")\n",
    "print(dev_data['class'].value_counts(normalize=True))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dce219e0",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T10:26:32.971580Z",
     "iopub.status.busy": "2024-10-18T10:26:32.971276Z",
     "iopub.status.idle": "2024-10-18T10:26:33.407770Z",
     "shell.execute_reply": "2024-10-18T10:26:33.406880Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler\n",
    "\n",
    "# Function to preprocess data\n",
    "def preprocess_data(train_data, dev_data, test_data):\n",
    "    \"\"\"Encode categorical features and standardise numeric ones.\n",
    "\n",
    "    The encoder and the scaler are fitted on the training split only and are\n",
    "    then applied to the dev and test splits.\n",
    "\n",
    "    Categorical (object) features are integer-coded with ``OrdinalEncoder``.\n",
    "    ``LabelEncoder`` is meant for target values and raises on any category it\n",
    "    did not see during fit; here a category that is absent from the training\n",
    "    split is encoded as -1 instead. Categories seen in training receive the\n",
    "    same codes as before (position in the sorted list of training values).\n",
    "\n",
    "    Args:\n",
    "        train_data: training frame; must contain the ``class`` column.\n",
    "        dev_data: dev frame; must contain the ``class`` column.\n",
    "        test_data: test frame with the same feature columns and no target.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(train_features, dev_features, test_features, train_target,\n",
    "        dev_target)``. The targets are returned as popped, without encoding.\n",
    "        The input frames are not modified.\n",
    "    \"\"\"\n",
    "    # Copy data to avoid modifying the original dataframes\n",
    "    train_data_copy = train_data.copy()\n",
    "    dev_data_copy = dev_data.copy()\n",
    "    test_data_copy = test_data.copy()\n",
    "\n",
    "    # Separate target column\n",
    "    train_target = train_data_copy.pop('class')\n",
    "    dev_target = dev_data_copy.pop('class')\n",
    "\n",
    "    # Column roles are decided from the training features, before any\n",
    "    # encoding changes the dtypes\n",
    "    categorical_columns = train_data_copy.select_dtypes(include=['object']).columns\n",
    "    numerical_columns = train_data_copy.select_dtypes(include=['int64', 'float64']).columns\n",
    "    feature_frames = (train_data_copy, dev_data_copy, test_data_copy)\n",
    "\n",
    "    # Integer-code the categorical columns (skipped when there are none)\n",
    "    if len(categorical_columns) > 0:\n",
    "        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)\n",
    "        encoder.fit(train_data_copy[categorical_columns])\n",
    "        for frame in feature_frames:\n",
    "            codes = encoder.transform(frame[categorical_columns]).astype('int64')\n",
    "            # Assign column by column so each column is replaced by an int64 one\n",
    "            for position, col in enumerate(categorical_columns):\n",
    "                frame[col] = codes[:, position]\n",
    "\n",
    "    # Scaling numerical columns with statistics learned on the training split\n",
    "    scaler = StandardScaler()\n",
    "    scaler.fit(train_data_copy[numerical_columns])\n",
    "    for frame in feature_frames:\n",
    "        frame[numerical_columns] = scaler.transform(frame[numerical_columns])\n",
    "\n",
    "    return train_data_copy, dev_data_copy, test_data_copy, train_target, dev_target\n",
    "\n",
    "# Read the unlabeled test split so it goes through the same transformations\n",
    "test_data = pd.read_csv('/data/datasets/credit-g/split_test_wo_target.csv')\n",
    "\n",
    "# Run the preprocessing on all three splits and keep the separated targets\n",
    "(\n",
    "    train_data_processed,\n",
    "    dev_data_processed,\n",
    "    test_data_processed,\n",
    "    train_target,\n",
    "    dev_target,\n",
    ") = preprocess_data(train_data, dev_data, test_data)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3684361",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T10:26:33.429655Z",
     "iopub.status.busy": "2024-10-18T10:26:33.429338Z",
     "iopub.status.idle": "2024-10-18T10:26:33.494981Z",
     "shell.execute_reply": "2024-10-18T10:26:33.494183Z"
    }
   },
   "outputs": [],
   "source": [
    "from scipy.stats import skew\n",
    "\n",
    "def engineer_features(train_data, dev_data, test_data):\n",
    "    \"\"\"Add log, ratio and squared features to the train, dev and test frames.\n",
    "\n",
    "    Numeric (int64/float64) columns whose skewness on the training frame is\n",
    "    above 0.5 are replaced by ``log_<column>`` features. The log is taken with\n",
    "    ``np.log1p`` after shifting the column so that the training minimum maps\n",
    "    to 0 whenever that minimum is negative; dev/test values that are still\n",
    "    below 0 after the shift are clipped to 0. A plain ``log1p`` returns NaN\n",
    "    or -inf for values at or below -1, which occur in standardised columns.\n",
    "    Columns with a non-negative training minimum get a zero shift, i.e. the\n",
    "    plain ``log1p`` of the value.\n",
    "\n",
    "    Two more features are added to every frame:\n",
    "    ``credit_amount_per_duration`` (``credit_amount / duration``, with\n",
    "    +/-inf from a zero divisor stored as NaN) and ``age_squared``.\n",
    "\n",
    "    Args:\n",
    "        train_data: training features; decides which columns are skewed and\n",
    "            supplies the shift used for the log features.\n",
    "        dev_data: dev features with the same columns.\n",
    "        test_data: test features with the same columns.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(train, dev, test)`` of new frames. The inputs are not modified.\n",
    "    \"\"\"\n",
    "    train_data_copy = train_data.copy()\n",
    "    dev_data_copy = dev_data.copy()\n",
    "    test_data_copy = test_data.copy()\n",
    "    frames = (train_data_copy, dev_data_copy, test_data_copy)\n",
    "\n",
    "    # Decide once, on the training frame only, which columns count as skewed.\n",
    "    # NOTE(review): integer-coded categorical columns are int64 and are picked\n",
    "    # up here too (the stored column listing contains log_job) - confirm that\n",
    "    # this is intended.\n",
    "    numerical_columns = train_data_copy.select_dtypes(include=['int64', 'float64']).columns\n",
    "    skewed_columns = [col for col in numerical_columns if skew(train_data_copy[col]) > 0.5]\n",
    "\n",
    "    # Log transformation for skewed numerical features\n",
    "    for col in skewed_columns:\n",
    "        # Shift learned on train; zero when the training minimum is non-negative\n",
    "        shift = max(0.0, -float(train_data_copy[col].min()))\n",
    "        for frame in frames:\n",
    "            frame[f'log_{col}'] = np.log1p((frame[col] + shift).clip(lower=0))\n",
    "\n",
    "    for frame in frames:\n",
    "        # Interaction feature; a zero duration yields +/-inf, stored as NaN\n",
    "        ratio = frame['credit_amount'] / frame['duration']\n",
    "        frame['credit_amount_per_duration'] = ratio.replace([np.inf, -np.inf], np.nan)\n",
    "        # Polynomial feature\n",
    "        frame['age_squared'] = frame['age'] ** 2\n",
    "\n",
    "    # Drop original skewed columns after transformation\n",
    "    return tuple(frame.drop(columns=skewed_columns) for frame in frames)\n",
    "\n",
    "# Derive the engineered feature frames from the preprocessed splits\n",
    "(\n",
    "    train_data_engineered,\n",
    "    dev_data_engineered,\n",
    "    test_data_engineered,\n",
    ") = engineer_features(train_data_processed, dev_data_processed, test_data_processed)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "074e2854",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T10:26:38.747316Z",
     "iopub.status.busy": "2024-10-18T10:26:38.746642Z",
     "iopub.status.idle": "2024-10-18T10:26:39.810507Z",
     "shell.execute_reply": "2024-10-18T10:26:39.809592Z"
    }
   },
   "outputs": [],
   "source": [
    "from metagpt.tools.libs.data_preprocess import get_column_info\n",
    "\n",
    "# Summarise the column types of the engineered training split\n",
    "column_info = get_column_info(train_data_engineered)\n",
    "print(\"Column Info for Engineered Train Data:\", column_info, sep=\"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5178e635",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T10:27:01.821781Z",
     "iopub.status.busy": "2024-10-18T10:27:01.820822Z",
     "iopub.status.idle": "2024-10-18T10:27:11.533783Z",
     "shell.execute_reply": "2024-10-18T10:27:11.532758Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "# Define the base model\n",
    "base_model = RandomForestClassifier(random_state=42)\n",
    "\n",
    "# Define hyperparameters for grid search\n",
    "param_grid = {\n",
    "    'n_estimators': [100, 200, 300],\n",
    "    'max_depth': [None, 10, 20, 30],\n",
    "    'min_samples_split': [2, 5, 10],\n",
    "    'min_samples_leaf': [1, 2, 4]\n",
    "}\n",
    "\n",
    "# Perform grid search with cross-validation.\n",
    "# The target holds the string labels 'good'/'bad'. The binary 'f1' scorer needs\n",
    "# a positive label and defaults to pos_label=1, which is not one of these\n",
    "# labels, so no candidate can be scored and the selection is meaningless.\n",
    "# 'f1_weighted' needs no positive label and is the same metric that is\n",
    "# reported for the train and dev splits below.\n",
    "grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)\n",
    "grid_search.fit(train_data_engineered, train_target)\n",
    "\n",
    "# Get the best model (refitted on the whole training split by GridSearchCV)\n",
    "best_model = grid_search.best_estimator_\n",
    "\n",
    "# Predict on train and dev sets\n",
    "train_predictions = best_model.predict(train_data_engineered)\n",
    "dev_predictions = best_model.predict(dev_data_engineered)\n",
    "\n",
    "# Calculate weighted F1 scores\n",
    "train_f1 = f1_score(train_target, train_predictions, average='weighted')\n",
    "dev_f1 = f1_score(dev_target, dev_predictions, average='weighted')\n",
    "\n",
    "print(f\"Train F1 Score: {train_f1}\")\n",
    "print(f\"Dev F1 Score: {dev_f1}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f20deac9",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T10:27:25.261879Z",
     "iopub.status.busy": "2024-10-18T10:27:25.260041Z",
     "iopub.status.idle": "2024-10-18T10:27:25.310328Z",
     "shell.execute_reply": "2024-10-18T10:27:25.309237Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train F1 Score: 1.0\n",
      "Dev F1 Score: 0.7713971128913657\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import f1_score\n",
    "\n",
    "# Score the grid-search winner on the train and dev splits\n",
    "train_predictions, dev_predictions = (\n",
    "    best_model.predict(features)\n",
    "    for features in (train_data_engineered, dev_data_engineered)\n",
    ")\n",
    "\n",
    "# Weighted F1 for each split\n",
    "train_f1, dev_f1 = (\n",
    "    f1_score(truth, predicted, average='weighted')\n",
    "    for truth, predicted in (\n",
    "        (train_target, train_predictions),\n",
    "        (dev_target, dev_predictions),\n",
    "    )\n",
    ")\n",
    "\n",
    "print(f\"Train F1 Score: {train_f1}\")\n",
    "print(f\"Dev F1 Score: {dev_f1}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "007a65cb",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T10:27:35.360147Z",
     "iopub.status.busy": "2024-10-18T10:27:35.359485Z",
     "iopub.status.idle": "2024-10-18T10:27:35.376488Z",
     "shell.execute_reply": "2024-10-18T10:27:35.375397Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column Info for Engineered Train Data:\n",
      "{'Category': [], 'Numeric': ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'installment_commitment', 'personal_status', 'other_parties', 'residence_since', 'property_magnitude', 'other_payment_plans', 'housing', 'own_telephone', 'foreign_worker', 'log_duration', 'log_credit_amount', 'log_age', 'log_existing_credits', 'log_job', 'log_num_dependents', 'credit_amount_per_duration', 'age_squared'], 'Datetime': [], 'Others': []}\n",
      "\n",
      "Column Info for Engineered Dev Data:\n",
      "{'Category': [], 'Numeric': ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'installment_commitment', 'personal_status', 'other_parties', 'residence_since', 'property_magnitude', 'other_payment_plans', 'housing', 'own_telephone', 'foreign_worker', 'log_duration', 'log_credit_amount', 'log_age', 'log_existing_credits', 'log_job', 'log_num_dependents', 'credit_amount_per_duration', 'age_squared'], 'Datetime': [], 'Others': []}\n",
      "\n",
      "Column Info for Engineered Test Data:\n",
      "{'Category': [], 'Numeric': ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'installment_commitment', 'personal_status', 'other_parties', 'residence_since', 'property_magnitude', 'other_payment_plans', 'housing', 'own_telephone', 'foreign_worker', 'log_duration', 'log_credit_amount', 'log_age', 'log_existing_credits', 'log_job', 'log_num_dependents', 'credit_amount_per_duration', 'age_squared'], 'Datetime': [], 'Others': []}\n"
     ]
    }
   ],
   "source": [
    "from metagpt.tools.libs.data_preprocess import get_column_info\n",
    "\n",
    "# Report the column-type summary of every engineered split, one after another\n",
    "for _heading, _frame in (\n",
    "    (\"Column Info for Engineered Train Data:\", train_data_engineered),\n",
    "    (\"\\nColumn Info for Engineered Dev Data:\", dev_data_engineered),\n",
    "    (\"\\nColumn Info for Engineered Test Data:\", test_data_engineered),\n",
    "):\n",
    "    column_info = get_column_info(_frame)\n",
    "    print(_heading)\n",
    "    print(column_info)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d2d1fa4f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T10:31:37.912295Z",
     "iopub.status.busy": "2024-10-18T10:31:37.911606Z",
     "iopub.status.idle": "2024-10-18T10:31:45.000919Z",
     "shell.execute_reply": "2024-10-18T10:31:44.999725Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train F1 Score: 0.9899566205329478\n",
      "Dev F1 Score: 0.7740332940595188\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import StackingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.impute import SimpleImputer\n",
    "\n",
    "# Fill missing values with per-column means learned on the training split\n",
    "imputer = SimpleImputer(strategy='mean')\n",
    "train_data_engineered_imputed = imputer.fit_transform(train_data_engineered)\n",
    "dev_data_engineered_imputed = imputer.transform(dev_data_engineered)\n",
    "\n",
    "# First-level learners of the stack\n",
    "base_models = [\n",
    "    (\n",
    "        'rf',\n",
    "        RandomForestClassifier(\n",
    "            random_state=42,\n",
    "            n_estimators=300,\n",
    "            max_depth=30,\n",
    "            min_samples_split=10,\n",
    "            min_samples_leaf=2,\n",
    "        ),\n",
    "    ),\n",
    "    ('gb', GradientBoostingClassifier(random_state=42, n_estimators=200, max_depth=5)),\n",
    "    ('knn', KNeighborsClassifier(n_neighbors=5)),\n",
    "]\n",
    "\n",
    "# Final estimator that combines the first-level predictions\n",
    "meta_model = LogisticRegression(random_state=42)\n",
    "\n",
    "# Build the stacking ensemble and fit it on the imputed training features\n",
    "stacking_model = StackingClassifier(\n",
    "    estimators=base_models,\n",
    "    final_estimator=meta_model,\n",
    "    cv=5,\n",
    ")\n",
    "stacking_model.fit(train_data_engineered_imputed, train_target)\n",
    "\n",
    "# Predictions for the train and dev splits\n",
    "train_predictions, dev_predictions = (\n",
    "    stacking_model.predict(features)\n",
    "    for features in (train_data_engineered_imputed, dev_data_engineered_imputed)\n",
    ")\n",
    "\n",
    "# Weighted F1 for each split\n",
    "train_f1, dev_f1 = (\n",
    "    f1_score(truth, predicted, average='weighted')\n",
    "    for truth, predicted in (\n",
    "        (train_target, train_predictions),\n",
    "        (dev_target, dev_predictions),\n",
    "    )\n",
    ")\n",
    "\n",
    "print(f\"Train F1 Score: {train_f1}\")\n",
    "print(f\"Dev F1 Score: {dev_f1}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "f120af01",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T10:41:14.656158Z",
     "iopub.status.busy": "2024-10-18T10:41:14.655501Z",
     "iopub.status.idle": "2024-10-18T10:41:21.627425Z",
     "shell.execute_reply": "2024-10-18T10:41:21.626264Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train F1 Score: 0.9899566205329478\n",
      "Dev F1 Score: 0.7740332940595188\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.ensemble import StackingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.impute import SimpleImputer\n",
    "\n",
    "# Fill missing values with per-column means learned on the training split\n",
    "imputer = SimpleImputer(strategy='mean')\n",
    "train_data_engineered_imputed = imputer.fit_transform(train_data_engineered)\n",
    "dev_data_engineered_imputed, test_data_engineered_imputed = (\n",
    "    imputer.transform(features)\n",
    "    for features in (dev_data_engineered, test_data_engineered)\n",
    ")\n",
    "\n",
    "# First-level learners of the stack\n",
    "base_models = [\n",
    "    (\n",
    "        'rf',\n",
    "        RandomForestClassifier(\n",
    "            random_state=42,\n",
    "            n_estimators=300,\n",
    "            max_depth=30,\n",
    "            min_samples_split=10,\n",
    "            min_samples_leaf=2,\n",
    "        ),\n",
    "    ),\n",
    "    ('gb', GradientBoostingClassifier(random_state=42, n_estimators=200, max_depth=5)),\n",
    "    ('knn', KNeighborsClassifier(n_neighbors=5)),\n",
    "]\n",
    "\n",
    "# Final estimator that combines the first-level predictions\n",
    "meta_model = LogisticRegression(random_state=42)\n",
    "\n",
    "# Build the stacking ensemble and fit it on the imputed training features\n",
    "stacking_model = StackingClassifier(\n",
    "    estimators=base_models,\n",
    "    final_estimator=meta_model,\n",
    "    cv=5,\n",
    ")\n",
    "stacking_model.fit(train_data_engineered_imputed, train_target)\n",
    "\n",
    "# Predictions for the train and dev splits\n",
    "train_predictions, dev_predictions = (\n",
    "    stacking_model.predict(features)\n",
    "    for features in (train_data_engineered_imputed, dev_data_engineered_imputed)\n",
    ")\n",
    "\n",
    "# Weighted F1 for each split\n",
    "train_f1, dev_f1 = (\n",
    "    f1_score(truth, predicted, average='weighted')\n",
    "    for truth, predicted in (\n",
    "        (train_target, train_predictions),\n",
    "        (dev_target, dev_predictions),\n",
    "    )\n",
    ")\n",
    "\n",
    "print(f\"Train F1 Score: {train_f1}\")\n",
    "print(f\"Dev F1 Score: {dev_f1}\")\n",
    "\n",
    "# Write the dev predictions as a single 'target' column, without the index\n",
    "dev_predictions_df = pd.DataFrame({'target': dev_predictions})\n",
    "dev_predictions_df.to_csv('../workspace/credit-g/dev_predictions.csv', index=False)\n",
    "\n",
    "# Predict the test split and write it in the same layout\n",
    "test_predictions = stacking_model.predict(test_data_engineered_imputed)\n",
    "test_predictions_df = pd.DataFrame({'target': test_predictions})\n",
    "test_predictions_df.to_csv('../workspace/credit-g/test_predictions.csv', index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b5495da8",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T10:42:00.353920Z",
     "iopub.status.busy": "2024-10-18T10:42:00.353230Z",
     "iopub.status.idle": "2024-10-18T10:42:07.468379Z",
     "shell.execute_reply": "2024-10-18T10:42:07.466832Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train F1 Score: 0.9899566205329478\n",
      "Dev F1 Score: 0.7740332940595188\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.ensemble import StackingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.impute import SimpleImputer\n",
    "\n",
    "# Mean imputation: statistics come from the training split only\n",
    "imputer = SimpleImputer(strategy='mean')\n",
    "train_data_engineered_imputed = imputer.fit_transform(train_data_engineered)\n",
    "dev_data_engineered_imputed, test_data_engineered_imputed = (\n",
    "    imputer.transform(features)\n",
    "    for features in (dev_data_engineered, test_data_engineered)\n",
    ")\n",
    "\n",
    "# Estimators whose predictions feed the final estimator\n",
    "base_models = [\n",
    "    (\n",
    "        'rf',\n",
    "        RandomForestClassifier(\n",
    "            random_state=42,\n",
    "            n_estimators=300,\n",
    "            max_depth=30,\n",
    "            min_samples_split=10,\n",
    "            min_samples_leaf=2,\n",
    "        ),\n",
    "    ),\n",
    "    ('gb', GradientBoostingClassifier(random_state=42, n_estimators=200, max_depth=5)),\n",
    "    ('knn', KNeighborsClassifier(n_neighbors=5)),\n",
    "]\n",
    "\n",
    "# Final estimator of the stack\n",
    "meta_model = LogisticRegression(random_state=42)\n",
    "\n",
    "# Assemble the stacking ensemble and train it on the imputed training features\n",
    "stacking_model = StackingClassifier(\n",
    "    estimators=base_models,\n",
    "    final_estimator=meta_model,\n",
    "    cv=5,\n",
    ")\n",
    "stacking_model.fit(train_data_engineered_imputed, train_target)\n",
    "\n",
    "# Train and dev predictions\n",
    "train_predictions, dev_predictions = (\n",
    "    stacking_model.predict(features)\n",
    "    for features in (train_data_engineered_imputed, dev_data_engineered_imputed)\n",
    ")\n",
    "\n",
    "# Weighted F1 per split\n",
    "train_f1, dev_f1 = (\n",
    "    f1_score(truth, predicted, average='weighted')\n",
    "    for truth, predicted in (\n",
    "        (train_target, train_predictions),\n",
    "        (dev_target, dev_predictions),\n",
    "    )\n",
    ")\n",
    "\n",
    "print(f\"Train F1 Score: {train_f1}\")\n",
    "print(f\"Dev F1 Score: {dev_f1}\")\n",
    "\n",
    "# Store the dev predictions under a single 'target' column, index omitted\n",
    "dev_predictions_df = pd.DataFrame({'target': dev_predictions})\n",
    "dev_predictions_df.to_csv('../workspace/credit-g/dev_predictions.csv', index=False)\n",
    "\n",
    "# Predict the test split and store it the same way\n",
    "test_predictions = stacking_model.predict(test_data_engineered_imputed)\n",
    "test_predictions_df = pd.DataFrame({'target': test_predictions})\n",
    "test_predictions_df.to_csv('../workspace/credit-g/test_predictions.csv', index=False)\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
