{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7767213d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T01:17:11.153831Z",
     "iopub.status.busy": "2024-10-18T01:17:11.153303Z",
     "iopub.status.idle": "2024-10-18T01:17:11.829885Z",
     "shell.execute_reply": "2024-10-18T01:17:11.829043Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Basic Information about the Dataset:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 68689 entries, 0 to 68688\n",
      "Data columns (total 22 columns):\n",
      " #   Column             Non-Null Count  Dtype  \n",
      "---  ------             --------------  -----  \n",
      " 0   loc                68689 non-null  float64\n",
      " 1   v(g)               68689 non-null  float64\n",
      " 2   ev(g)              68689 non-null  float64\n",
      " 3   iv(g)              68689 non-null  float64\n",
      " 4   n                  68689 non-null  float64\n",
      " 5   v                  68689 non-null  float64\n",
      " 6   l                  68689 non-null  float64\n",
      " 7   d                  68689 non-null  float64\n",
      " 8   i                  68689 non-null  float64\n",
      " 9   e                  68689 non-null  float64\n",
      " 10  b                  68689 non-null  float64\n",
      " 11  t                  68689 non-null  float64\n",
      " 12  lOCode             68689 non-null  int64  \n",
      " 13  lOComment          68689 non-null  int64  \n",
      " 14  lOBlank            68689 non-null  int64  \n",
      " 15  locCodeAndComment  68689 non-null  int64  \n",
      " 16  uniq_Op            68689 non-null  float64\n",
      " 17  uniq_Opnd          68689 non-null  float64\n",
      " 18  total_Op           68689 non-null  float64\n",
      " 19  total_Opnd         68689 non-null  float64\n",
      " 20  branchCount        68689 non-null  float64\n",
      " 21  defects            68689 non-null  bool   \n",
      "dtypes: bool(1), float64(17), int64(4)\n",
      "memory usage: 11.1 MB\n",
      "None\n",
      "\n",
      "First few rows of the Dataset:\n",
      "    loc  v(g)  ev(g)  iv(g)      n        v     l      d      i         e  \\\n",
      "0  60.0   3.0    1.0    6.0    0.0     0.00  0.00   0.00   0.00      0.00   \n",
      "1  47.0   7.0    6.0    5.0  126.0   621.48  0.04  23.33  27.63  15796.21   \n",
      "2  33.0   4.0    4.0    2.0   86.0   430.00  0.06  16.64  26.95   7672.54   \n",
      "3  12.0   2.0    1.0    2.0   27.0   108.00  0.13   7.86  13.09    840.00   \n",
      "4  85.0   9.0    6.0    6.0  253.0  1482.07  0.03  31.05  42.00  44839.15   \n",
      "\n",
      "   ...  lOCode  lOComment  lOBlank  locCodeAndComment  uniq_Op  uniq_Opnd  \\\n",
      "0  ...       0          0        0                  0      0.0        0.0   \n",
      "1  ...      38          1        6                  0     18.0       18.0   \n",
      "2  ...      24          0        7                  0     14.0       18.0   \n",
      "3  ...       7          0        3                  0     10.0        7.0   \n",
      "4  ...      70          3        5                  0     19.0       31.0   \n",
      "\n",
      "   total_Op  total_Opnd  branchCount  defects  \n",
      "0       0.0         0.0          5.0     True  \n",
      "1      82.0        44.0         13.0     True  \n",
      "2      49.0        38.0          7.0    False  \n",
      "3      16.0        11.0          3.0    False  \n",
      "4     154.0       109.0         17.0     True  \n",
      "\n",
      "[5 rows x 22 columns]\n",
      "\n",
      "Summary Statistics for Numerical Columns:\n",
      "                loc          v(g)         ev(g)         iv(g)             n  \\\n",
      "count  68689.000000  68689.000000  68689.000000  68689.000000  68689.000000   \n",
      "mean      37.457613      5.494461      2.856309      3.493529     96.213033   \n",
      "std       54.486525      7.742780      4.646986      5.307669    167.522447   \n",
      "min        1.000000      1.000000      1.000000      1.000000      0.000000   \n",
      "25%       13.000000      2.000000      1.000000      1.000000     25.000000   \n",
      "50%       22.000000      3.000000      1.000000      2.000000     51.000000   \n",
      "75%       42.000000      6.000000      3.000000      4.000000    110.000000   \n",
      "max     2224.000000    404.000000    165.000000    402.000000   5669.000000   \n",
      "\n",
      "                  v             l             d             i             e  \\\n",
      "count  68689.000000  68689.000000  68689.000000  68689.000000  6.868900e+04   \n",
      "mean     537.551170      0.111698     13.632083     27.511406  2.013269e+04   \n",
      "std     1284.466539      0.100478     14.116092     22.916078  1.696508e+05   \n",
      "min        0.000000      0.000000      0.000000      0.000000  0.000000e+00   \n",
      "25%       96.210000      0.050000      5.500000     15.500000  5.600000e+02   \n",
      "50%      230.750000      0.090000      9.800000     23.280000  2.238010e+03   \n",
      "75%      559.820000      0.150000     18.000000     34.380000  1.019226e+04   \n",
      "max    80843.080000      1.000000    418.200000    569.780000  1.684662e+07   \n",
      "\n",
      "       ...              t        lOCode     lOComment       lOBlank  \\\n",
      "count  ...   68689.000000  68689.000000  68689.000000  68689.000000   \n",
      "mean   ...    1122.063656     22.736595      1.771521      3.979982   \n",
      "std    ...    9367.692814     37.907976      5.868741      6.500387   \n",
      "min    ...       0.000000      0.000000      0.000000      0.000000   \n",
      "25%    ...      31.110000      7.000000      0.000000      1.000000   \n",
      "50%    ...     124.330000     13.000000      0.000000      2.000000   \n",
      "75%    ...     565.920000     26.000000      1.000000      5.000000   \n",
      "max    ...  935923.390000   1739.000000    344.000000    219.000000   \n",
      "\n",
      "       locCodeAndComment       uniq_Op     uniq_Opnd      total_Op  \\\n",
      "count       68689.000000  68689.000000  68689.000000  68689.000000   \n",
      "mean            0.195956     11.857590     15.585421     57.397502   \n",
      "std             0.975014      6.526538     18.548109    103.573808   \n",
      "min             0.000000      0.000000      0.000000      0.000000   \n",
      "25%             0.000000      8.000000      7.000000     15.000000   \n",
      "50%             0.000000     11.000000     12.000000     30.000000   \n",
      "75%             0.000000     16.000000     20.000000     66.000000   \n",
      "max            38.000000    280.000000   1026.000000   3368.000000   \n",
      "\n",
      "         total_Opnd   branchCount  \n",
      "count  68689.000000  68689.000000  \n",
      "mean      39.179971      9.851097  \n",
      "std       71.973961     14.448305  \n",
      "min        0.000000      1.000000  \n",
      "25%       10.000000      3.000000  \n",
      "50%       20.000000      5.000000  \n",
      "75%       44.000000     11.000000  \n",
      "max     3021.000000    503.000000  \n",
      "\n",
      "[8 rows x 21 columns]\n",
      "\n",
      "Missing Values in the Dataset:\n",
      "loc                  0\n",
      "v(g)                 0\n",
      "ev(g)                0\n",
      "iv(g)                0\n",
      "n                    0\n",
      "v                    0\n",
      "l                    0\n",
      "d                    0\n",
      "i                    0\n",
      "e                    0\n",
      "b                    0\n",
      "t                    0\n",
      "lOCode               0\n",
      "lOComment            0\n",
      "lOBlank              0\n",
      "locCodeAndComment    0\n",
      "uniq_Op              0\n",
      "uniq_Opnd            0\n",
      "total_Op             0\n",
      "total_Opnd           0\n",
      "branchCount          0\n",
      "defects              0\n",
      "dtype: int64\n",
      "\n",
      "Distribution of the Target Column 'defects':\n",
      "defects\n",
      "False    53136\n",
      "True     15553\n",
      "Name: count, dtype: int64\n",
      "\n",
      "Data Types of the Columns:\n",
      "loc                  float64\n",
      "v(g)                 float64\n",
      "ev(g)                float64\n",
      "iv(g)                float64\n",
      "n                    float64\n",
      "v                    float64\n",
      "l                    float64\n",
      "d                    float64\n",
      "i                    float64\n",
      "e                    float64\n",
      "b                    float64\n",
      "t                    float64\n",
      "lOCode                 int64\n",
      "lOComment              int64\n",
      "lOBlank                int64\n",
      "locCodeAndComment      int64\n",
      "uniq_Op              float64\n",
      "uniq_Opnd            float64\n",
      "total_Op             float64\n",
      "total_Opnd           float64\n",
      "branchCount          float64\n",
      "defects                 bool\n",
      "dtype: object\n",
      "\n",
      "Numerical Columns:\n",
      "Index(['loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',\n",
      "       'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op',\n",
      "       'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'],\n",
      "      dtype='object')\n",
      "\n",
      "Categorical Columns:\n",
      "Index([], dtype='object')\n",
      "\n",
      "Correlation Matrix for Numerical Columns:\n",
      "                        loc      v(g)     ev(g)     iv(g)         n         v  \\\n",
      "loc                1.000000  0.772659  0.541684  0.698867  0.748678  0.751980   \n",
      "v(g)               0.772659  1.000000  0.748375  0.787846  0.621993  0.603685   \n",
      "ev(g)              0.541684  0.748375  1.000000  0.564268  0.384108  0.360183   \n",
      "iv(g)              0.698867  0.787846  0.564268  1.000000  0.580817  0.574524   \n",
      "n                  0.748678  0.621993  0.384108  0.580817  1.000000  0.920577   \n",
      "v                  0.751980  0.603685  0.360183  0.574524  0.920577  1.000000   \n",
      "l                 -0.390423 -0.394744 -0.325680 -0.302777 -0.303934 -0.250606   \n",
      "d                  0.599495  0.567198  0.352349  0.487529  0.812736  0.746161   \n",
      "i                  0.476834  0.296455  0.154041  0.313030  0.729311  0.662378   \n",
      "e                  0.512336  0.442402  0.264681  0.412279  0.571461  0.617254   \n",
      "b                  0.734862  0.594021  0.351393  0.553917  0.907138  0.926597   \n",
      "t                  0.486477  0.470352  0.266466  0.409694  0.583000  0.586598   \n",
      "lOCode             0.786764  0.642416  0.376763  0.634122  0.908009  0.896261   \n",
      "lOComment          0.516056  0.387442  0.279796  0.344980  0.570743  0.524419   \n",
      "lOBlank            0.670591  0.454375  0.274864  0.433420  0.781996  0.727599   \n",
      "locCodeAndComment  0.248806  0.217624  0.166707  0.193587  0.274331  0.247717   \n",
      "uniq_Op            0.348179  0.338520  0.199378  0.313088  0.614386  0.523154   \n",
      "uniq_Opnd          0.618573  0.519534  0.268827  0.525719  0.806924  0.782823   \n",
      "total_Op           0.758842  0.630817  0.387858  0.576755  0.953443  0.930073   \n",
      "total_Opnd         0.754277  0.601535  0.363201  0.555426  0.949457  0.937952   \n",
      "branchCount        0.783137  0.982072  0.782521  0.778716  0.629025  0.603418   \n",
      "\n",
      "                          l         d         i         e  ...         t  \\\n",
      "loc               -0.390423  0.599495  0.476834  0.512336  ...  0.486477   \n",
      "v(g)              -0.394744  0.567198  0.296455  0.442402  ...  0.470352   \n",
      "ev(g)             -0.325680  0.352349  0.154041  0.264681  ...  0.266466   \n",
      "iv(g)             -0.302777  0.487529  0.313030  0.412279  ...  0.409694   \n",
      "n                 -0.303934  0.812736  0.729311  0.571461  ...  0.583000   \n",
      "v                 -0.250606  0.746161  0.662378  0.617254  ...  0.586598   \n",
      "l                  1.000000 -0.422558 -0.159482 -0.098775  ... -0.099738   \n",
      "d                 -0.422558  1.000000  0.513764  0.509950  ...  0.503662   \n",
      "i                 -0.159482  0.513764  1.000000  0.297187  ...  0.299860   \n",
      "e                 -0.098775  0.509950  0.297187  1.000000  ...  0.758629   \n",
      "b                 -0.248114  0.753145  0.666061  0.660110  ...  0.614011   \n",
      "t                 -0.099738  0.503662  0.299860  0.758629  ...  1.000000   \n",
      "lOCode            -0.292538  0.788745  0.674521  0.602992  ...  0.572249   \n",
      "lOComment         -0.187832  0.491329  0.434801  0.338234  ...  0.330206   \n",
      "lOBlank           -0.283229  0.671518  0.646738  0.463052  ...  0.444448   \n",
      "locCodeAndComment -0.126589  0.270637  0.214721  0.132196  ...  0.125406   \n",
      "uniq_Op           -0.408253  0.788754  0.532853  0.282519  ...  0.294660   \n",
      "uniq_Opnd         -0.275946  0.689269  0.779053  0.438547  ...  0.450263   \n",
      "total_Op          -0.294957  0.809755  0.700437  0.607184  ...  0.601222   \n",
      "total_Opnd        -0.288294  0.783569  0.738072  0.588833  ...  0.574819   \n",
      "branchCount       -0.415343  0.571224  0.306888  0.443906  ...  0.459233   \n",
      "\n",
      "                     lOCode  lOComment   lOBlank  locCodeAndComment   uniq_Op  \\\n",
      "loc                0.786764   0.516056  0.670591           0.248806  0.348179   \n",
      "v(g)               0.642416   0.387442  0.454375           0.217624  0.338520   \n",
      "ev(g)              0.376763   0.279796  0.274864           0.166707  0.199378   \n",
      "iv(g)              0.634122   0.344980  0.433420           0.193587  0.313088   \n",
      "n                  0.908009   0.570743  0.781996           0.274331  0.614386   \n",
      "v                  0.896261   0.524419  0.727599           0.247717  0.523154   \n",
      "l                 -0.292538  -0.187832 -0.283229          -0.126589 -0.408253   \n",
      "d                  0.788745   0.491329  0.671518           0.270637  0.788754   \n",
      "i                  0.674521   0.434801  0.646738           0.214721  0.532853   \n",
      "e                  0.602992   0.338234  0.463052           0.132196  0.282519   \n",
      "b                  0.884061   0.525994  0.731844           0.243357  0.513824   \n",
      "t                  0.572249   0.330206  0.444448           0.125406  0.294660   \n",
      "lOCode             1.000000   0.548126  0.774780           0.267934  0.614097   \n",
      "lOComment          0.548126   1.000000  0.587496           0.255960  0.386643   \n",
      "lOBlank            0.774780   0.587496  1.000000           0.268805  0.562863   \n",
      "locCodeAndComment  0.267934   0.255960  0.268805           1.000000  0.250898   \n",
      "uniq_Op            0.614097   0.386643  0.562863           0.250898  1.000000   \n",
      "uniq_Opnd          0.816109   0.464002  0.681307           0.251326  0.627725   \n",
      "total_Op           0.914482   0.558315  0.770793           0.270124  0.602984   \n",
      "total_Opnd         0.915332   0.542360  0.768461           0.263266  0.580223   \n",
      "branchCount        0.640607   0.400990  0.468438           0.225231  0.343959   \n",
      "\n",
      "                   uniq_Opnd  total_Op  total_Opnd  branchCount  \n",
      "loc                 0.618573  0.758842    0.754277     0.783137  \n",
      "v(g)                0.519534  0.630817    0.601535     0.982072  \n",
      "ev(g)               0.268827  0.387858    0.363201     0.782521  \n",
      "iv(g)               0.525719  0.576755    0.555426     0.778716  \n",
      "n                   0.806924  0.953443    0.949457     0.629025  \n",
      "v                   0.782823  0.930073    0.937952     0.603418  \n",
      "l                  -0.275946 -0.294957   -0.288294    -0.415343  \n",
      "d                   0.689269  0.809755    0.783569     0.571224  \n",
      "i                   0.779053  0.700437    0.738072     0.306888  \n",
      "e                   0.438547  0.607184    0.588833     0.443906  \n",
      "b                   0.776260  0.915538    0.930069     0.596498  \n",
      "t                   0.450263  0.601222    0.574819     0.459233  \n",
      "lOCode              0.816109  0.914482    0.915332     0.640607  \n",
      "lOComment           0.464002  0.558315    0.542360     0.400990  \n",
      "lOBlank             0.681307  0.770793    0.768461     0.468438  \n",
      "locCodeAndComment   0.251326  0.270124    0.263266     0.225231  \n",
      "uniq_Op             0.627725  0.602984    0.580223     0.343959  \n",
      "uniq_Opnd           1.000000  0.791873    0.823089     0.504611  \n",
      "total_Op            0.791873  1.000000    0.962081     0.633643  \n",
      "total_Opnd          0.823089  0.962081    1.000000     0.607089  \n",
      "branchCount         0.504611  0.633643    0.607089     1.000000  \n",
      "\n",
      "[21 rows x 21 columns]\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Load the training dataset\n",
    "train_df = pd.read_csv('/data/datasets/software-defects/split_train.csv')\n",
    "\n",
    "# Display basic information about the dataset\n",
    "print(\"Basic Information about the Dataset:\")\n",
    "print(train_df.info())\n",
    "\n",
    "# Display the first few rows of the dataset\n",
    "print(\"\\nFirst few rows of the Dataset:\")\n",
    "print(train_df.head())\n",
    "\n",
    "# Summary statistics for numerical columns\n",
    "print(\"\\nSummary Statistics for Numerical Columns:\")\n",
    "print(train_df.describe())\n",
    "\n",
    "# Check for missing values\n",
    "print(\"\\nMissing Values in the Dataset:\")\n",
    "print(train_df.isnull().sum())\n",
    "\n",
    "# Check the distribution of the target column 'defects'\n",
    "print(\"\\nDistribution of the Target Column 'defects':\")\n",
    "print(train_df['defects'].value_counts())\n",
    "\n",
    "# Check the data types of the columns\n",
    "print(\"\\nData Types of the Columns:\")\n",
    "print(train_df.dtypes)\n",
    "\n",
    "# Separate numerical and categorical columns\n",
    "numerical_cols = train_df.select_dtypes(include=[np.number]).columns\n",
    "categorical_cols = train_df.select_dtypes(include=['object']).columns\n",
    "\n",
    "print(\"\\nNumerical Columns:\")\n",
    "print(numerical_cols)\n",
    "\n",
    "print(\"\\nCategorical Columns:\")\n",
    "print(categorical_cols)\n",
    "\n",
    "# Check the correlation matrix for numerical columns\n",
    "print(\"\\nCorrelation Matrix for Numerical Columns:\")\n",
    "print(train_df[numerical_cols].corr())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "48d199de",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T01:17:11.842872Z",
     "iopub.status.busy": "2024-10-18T01:17:11.842589Z",
     "iopub.status.idle": "2024-10-18T01:17:12.560986Z",
     "shell.execute_reply": "2024-10-18T01:17:12.560020Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Function to preprocess data\n",
    "def preprocess_data(df):\n",
    "    df_copy = df.copy()\n",
    "    \n",
    "    # Handle missing values (if any, though the dataset has no missing values)\n",
    "    df_copy.fillna(df_copy.median(), inplace=True)\n",
    "    \n",
    "    # Separate target column if it exists\n",
    "    if 'defects' in df_copy.columns:\n",
    "        y = df_copy['defects']\n",
    "        df_copy.drop('defects', axis=1, inplace=True)\n",
    "    else:\n",
    "        y = None\n",
    "    \n",
    "    # Scale numerical features\n",
    "    scaler = StandardScaler()\n",
    "    df_copy[numerical_cols] = scaler.fit_transform(df_copy[numerical_cols])\n",
    "    \n",
    "    # Reattach the target column if it was separated\n",
    "    if y is not None:\n",
    "        df_copy['defects'] = y\n",
    "    \n",
    "    return df_copy\n",
    "\n",
    "# Preprocess train, dev, and test sets\n",
    "train_df = preprocess_data(train_df)\n",
    "dev_df = pd.read_csv('/data/datasets/software-defects/split_dev.csv')\n",
    "dev_df = preprocess_data(dev_df)\n",
    "test_df = pd.read_csv('/data/datasets/software-defects/split_test_wo_target.csv')\n",
    "test_df = preprocess_data(test_df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "794608a2",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T01:17:18.210326Z",
     "iopub.status.busy": "2024-10-18T01:17:18.209543Z",
     "iopub.status.idle": "2024-10-18T01:17:19.241846Z",
     "shell.execute_reply": "2024-10-18T01:17:19.240960Z"
    }
   },
   "outputs": [],
   "source": [
    "from metagpt.tools.libs.data_preprocess import get_column_info\n",
    "\n",
    "column_info = get_column_info(train_df)\n",
    "print(\"column_info\")\n",
    "print(column_info)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ba08c412",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T01:17:54.383705Z",
     "iopub.status.busy": "2024-10-18T01:17:54.382655Z",
     "iopub.status.idle": "2024-10-18T01:17:54.494089Z",
     "shell.execute_reply": "2024-10-18T01:17:54.493142Z"
    }
   },
   "outputs": [],
   "source": [
    "def engineer_features(df):\n",
    "    df_copy = df.copy()\n",
    "    \n",
    "    # Feature Engineering\n",
    "    # Ratio features to reduce multicollinearity\n",
    "    df_copy['loc_to_v(g)'] = df_copy['loc'] / df_copy['v(g)']\n",
    "    df_copy['n_to_v'] = df_copy['n'] / df_copy['v']\n",
    "    df_copy['uniq_Op_to_total_Op'] = df_copy['uniq_Op'] / df_copy['total_Op']\n",
    "    df_copy['uniq_Opnd_to_total_Opnd'] = df_copy['uniq_Opnd'] / df_copy['total_Opnd']\n",
    "    \n",
    "    # Interaction features\n",
    "    df_copy['loc_times_v(g)'] = df_copy['loc'] * df_copy['v(g)']\n",
    "    df_copy['n_times_v'] = df_copy['n'] * df_copy['v']\n",
    "    df_copy['uniq_Op_times_total_Op'] = df_copy['uniq_Op'] * df_copy['total_Op']\n",
    "    df_copy['uniq_Opnd_times_total_Opnd'] = df_copy['uniq_Opnd'] * df_copy['total_Opnd']\n",
    "    \n",
    "    # Polynomial features\n",
    "    df_copy['loc_squared'] = df_copy['loc'] ** 2\n",
    "    df_copy['v(g)_squared'] = df_copy['v(g)'] ** 2\n",
    "    df_copy['n_squared'] = df_copy['n'] ** 2\n",
    "    df_copy['v_squared'] = df_copy['v'] ** 2\n",
    "    \n",
    "    # Logarithmic features\n",
    "    df_copy['log_loc'] = np.log1p(df_copy['loc'])\n",
    "    df_copy['log_v(g)'] = np.log1p(df_copy['v(g)'])\n",
    "    df_copy['log_n'] = np.log1p(df_copy['n'])\n",
    "    df_copy['log_v'] = np.log1p(df_copy['v'])\n",
    "    \n",
    "    return df_copy\n",
    "\n",
    "train_df = engineer_features(train_df)\n",
    "dev_df = engineer_features(dev_df)\n",
    "test_df = engineer_features(test_df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4773ea43",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T01:17:59.531687Z",
     "iopub.status.busy": "2024-10-18T01:17:59.530898Z",
     "iopub.status.idle": "2024-10-18T01:17:59.543351Z",
     "shell.execute_reply": "2024-10-18T01:17:59.542230Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "column_info\n",
      "{'Category': [], 'Numeric': ['loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't', 'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount', 'loc_to_v(g)', 'n_to_v', 'uniq_Op_to_total_Op', 'uniq_Opnd_to_total_Opnd', 'loc_times_v(g)', 'n_times_v', 'uniq_Op_times_total_Op', 'uniq_Opnd_times_total_Opnd', 'loc_squared', 'v(g)_squared', 'n_squared', 'v_squared', 'log_loc', 'log_v(g)', 'log_n', 'log_v'], 'Datetime': [], 'Others': ['defects']}\n"
     ]
    }
   ],
   "source": [
    "from metagpt.tools.libs.data_preprocess import get_column_info\n",
    "\n",
    "column_info = get_column_info(train_df)\n",
    "print(\"column_info\")\n",
    "print(column_info)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "72346abe",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T01:18:23.926063Z",
     "iopub.status.busy": "2024-10-18T01:18:23.925265Z",
     "iopub.status.idle": "2024-10-18T01:18:48.938540Z",
     "shell.execute_reply": "2024-10-18T01:18:48.937594Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dev F1 Score: 0.4930555555555556\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['../workspace/software-defects/model.pkl']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import f1_score\n",
    "import joblib\n",
    "\n",
    "# Prepare the data\n",
    "X_train = train_df.drop('defects', axis=1)\n",
    "y_train = train_df['defects']\n",
    "X_dev = dev_df.drop('defects', axis=1)\n",
    "y_dev = dev_df['defects']\n",
    "X_test = test_df\n",
    "\n",
    "# Train a base model\n",
    "model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "# Predict on dev set\n",
    "y_dev_pred = model.predict(X_dev)\n",
    "dev_f1 = f1_score(y_dev, y_dev_pred)\n",
    "print(f\"Dev F1 Score: {dev_f1}\")\n",
    "\n",
    "# Predict on test set\n",
    "y_test_pred = model.predict(X_test)\n",
    "\n",
    "# Save predictions\n",
    "import os\n",
    "output_dir = '../workspace/software-defects'\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "pd.DataFrame({'target': y_dev_pred}).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n",
    "pd.DataFrame({'target': y_test_pred}).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n",
    "\n",
    "# Save the model\n",
    "joblib.dump(model, os.path.join(output_dir, 'model.pkl'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ddf27b9",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T01:19:14.440375Z",
     "iopub.status.busy": "2024-10-18T01:19:14.439522Z",
     "iopub.status.idle": "2024-10-18T01:23:50.262255Z",
     "shell.execute_reply": "2024-10-18T01:23:50.260214Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import f1_score\n",
    "\n",
    "# Define the parameter grid for hyperparameter tuning\n",
    "param_grid = {\n",
    "    'n_estimators': [100, 200, 300],\n",
    "    'max_depth': [10, 20, 30],\n",
    "    'min_samples_split': [2, 5, 10],\n",
    "    'min_samples_leaf': [1, 2, 4]\n",
    "}\n",
    "\n",
    "# Initialize the GridSearchCV\n",
    "grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)\n",
    "\n",
    "# Fit the GridSearchCV on the training data\n",
    "grid_search.fit(X_train, y_train)\n",
    "\n",
    "# Get the best model from the grid search\n",
    "best_model = grid_search.best_estimator_\n",
    "\n",
    "# Predict on dev set using the best model\n",
    "y_dev_pred_best = best_model.predict(X_dev)\n",
    "dev_f1_best = f1_score(y_dev, y_dev_pred_best)\n",
    "print(f\"Best Dev F1 Score: {dev_f1_best}\")\n",
    "\n",
    "# Predict on test set using the best model\n",
    "y_test_pred_best = best_model.predict(X_test)\n",
    "\n",
    "# Save the best model predictions\n",
    "pd.DataFrame({'target': y_dev_pred_best}).to_csv(os.path.join(output_dir, 'dev_predictions_best.csv'), index=False)\n",
    "pd.DataFrame({'target': y_test_pred_best}).to_csv(os.path.join(output_dir, 'test_predictions_best.csv'), index=False)\n",
    "\n",
    "# Save the best model\n",
    "joblib.dump(best_model, os.path.join(output_dir, 'best_model.pkl'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e4526121",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T01:23:55.669973Z",
     "iopub.status.busy": "2024-10-18T01:23:55.669273Z",
     "iopub.status.idle": "2024-10-18T01:23:55.680982Z",
     "shell.execute_reply": "2024-10-18T01:23:55.679626Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "column_info\n",
      "{'Category': [], 'Numeric': ['loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't', 'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount', 'loc_to_v(g)', 'n_to_v', 'uniq_Op_to_total_Op', 'uniq_Opnd_to_total_Opnd', 'loc_times_v(g)', 'n_times_v', 'uniq_Op_times_total_Op', 'uniq_Opnd_times_total_Opnd', 'loc_squared', 'v(g)_squared', 'n_squared', 'v_squared', 'log_loc', 'log_v(g)', 'log_n', 'log_v'], 'Datetime': [], 'Others': ['defects']}\n"
     ]
    }
   ],
   "source": [
    "from metagpt.tools.libs.data_preprocess import get_column_info\n",
    "\n",
    "column_info = get_column_info(train_df)\n",
    "print(\"column_info\")\n",
    "print(column_info)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e80f69da",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T01:24:53.804716Z",
     "iopub.status.busy": "2024-10-18T01:24:53.803814Z",
     "iopub.status.idle": "2024-10-18T01:27:03.222154Z",
     "shell.execute_reply": "2024-10-18T01:27:03.220918Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import StackingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import f1_score\n",
    "import joblib\n",
    "\n",
    "# Define base models without CatBoostClassifier\n",
    "base_models = [\n",
    "    ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),\n",
    "    ('knn', KNeighborsClassifier(n_neighbors=5)),\n",
    "    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))\n",
    "]\n",
    "\n",
    "# Define the meta-model\n",
    "meta_model = LogisticRegression()\n",
    "\n",
    "# Initialize the StackingClassifier\n",
    "stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n",
    "\n",
    "# Fit the StackingClassifier on the training data\n",
    "stacking_model.fit(X_train, y_train)\n",
    "\n",
    "# Predict on dev set using the stacking model\n",
    "y_dev_pred_stacking = stacking_model.predict(X_dev)\n",
    "dev_f1_stacking = f1_score(y_dev, y_dev_pred_stacking)\n",
    "print(f\"Stacking Dev F1 Score: {dev_f1_stacking}\")\n",
    "\n",
    "# Predict on test set using the stacking model\n",
    "y_test_pred_stacking = stacking_model.predict(X_test)\n",
    "\n",
    "# Save the stacking model predictions\n",
    "pd.DataFrame({'target': y_dev_pred_stacking}).to_csv(os.path.join(output_dir, 'dev_predictions_stacking.csv'), index=False)\n",
    "pd.DataFrame({'target': y_test_pred_stacking}).to_csv(os.path.join(output_dir, 'test_predictions_stacking.csv'), index=False)\n",
    "\n",
    "# Save the stacking model\n",
    "joblib.dump(stacking_model, os.path.join(output_dir, 'stacking_model.pkl'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "0725e9cc",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T01:27:11.566340Z",
     "iopub.status.busy": "2024-10-18T01:27:11.565644Z",
     "iopub.status.idle": "2024-10-18T01:27:12.784242Z",
     "shell.execute_reply": "2024-10-18T01:27:12.782699Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stacking Dev F1 Score: 0.4896435452793834\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the ensemble model on the dev set and print the performance\n",
    "from sklearn.metrics import f1_score\n",
    "\n",
    "# Predict on dev set using the stacking model\n",
    "y_dev_pred_stacking = stacking_model.predict(X_dev)\n",
    "dev_f1_stacking = f1_score(y_dev, y_dev_pred_stacking)\n",
    "print(f\"Stacking Dev F1 Score: {dev_f1_stacking}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "889fe4b4",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T01:27:20.534084Z",
     "iopub.status.busy": "2024-10-18T01:27:20.533393Z",
     "iopub.status.idle": "2024-10-18T01:27:21.807482Z",
     "shell.execute_reply": "2024-10-18T01:27:21.805492Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stacking Dev F1 Score: 0.4896435452793834\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the ensemble model on the dev set and print the performance\n",
    "from sklearn.metrics import f1_score\n",
    "\n",
    "# Predict on dev set using the stacking model\n",
    "y_dev_pred_stacking = stacking_model.predict(X_dev)\n",
    "dev_f1_stacking = f1_score(y_dev, y_dev_pred_stacking)\n",
    "print(f\"Stacking Dev F1 Score: {dev_f1_stacking}\")\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
