{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a0febbbf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.naive_bayes import GaussianNB, BernoulliNB\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
    "from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from xgboost import XGBRegressor\n",
    "\n",
    "from sklearn.metrics import roc_auc_score, average_precision_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fa58054",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 410,
   "id": "eb7339aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train and test splits of the credit data are stored as separate pickles.\n",
    "# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.\n",
    "train_df, test_df = (\n",
    "    pd.read_pickle(\"credit/train_credit.pkl.gz\"),\n",
    "    pd.read_pickle(\"credit/test_credit.pkl.gz\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "580a2f1a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 416,
   "id": "6132b979",
   "metadata": {},
   "outputs": [],
   "source": [
    "min_max_scaler = MinMaxScaler()\n",
    "\n",
    "# The scaler is fit on every column of the training frame (including the last one,\n",
    "# which is used as the target below); the test frame reuses the fitted ranges.\n",
    "train_data = min_max_scaler.fit_transform(train_df)\n",
    "test_data = min_max_scaler.transform(test_df)\n",
    "\n",
    "# All columns but the last are features; the last column is cast to int and used as y.\n",
    "train_X = train_data[:, :-1]\n",
    "train_y = train_data[:, -1].astype(int)\n",
    "\n",
    "test_X = test_data[:, :-1]\n",
    "test_y = test_data[:, -1].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 417,
   "id": "7d2258e2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9680214446498914 0.6997161432931679\n"
     ]
    }
   ],
   "source": [
    "# Baseline: logistic regression with default settings, scored on the test split.\n",
    "model = LogisticRegression().fit(train_X, train_y)\n",
    "\n",
    "# Column 1 of predict_proba is the probability of the second class in model.classes_.\n",
    "predict_y = model.predict_proba(test_X)[:, 1]\n",
    "\n",
    "auc = roc_auc_score(y_true=test_y, y_score=predict_y)\n",
    "apr = average_precision_score(y_true=test_y, y_score=predict_y)\n",
    "print(auc, apr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f96d130",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 420,
   "id": "f52563d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "MODELS = [\n",
    "    \"LogisticRegression\",\n",
    "    \"RandomForest\",\n",
    "    \"GaussianNB\",\n",
    "    \"BernoulliNB\",\n",
    "    \"LinearSVM\",\n",
    "    \"DecisionTree\",\n",
    "    \"LDA\",\n",
    "    \"AdaBoost\",\n",
    "    'Bagging',\n",
    "    'GBM',\n",
    "    \"MLP\",\n",
    "    'XGB'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0bce9f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "Logistic Regression\n",
    "Random Forest\n",
    "Gaussian Naive Bayes\n",
    "Bernoulli Naive Bayes\n",
    "Linear SVM\n",
    "Decision Tree\n",
    "LDA\n",
    "AdaBoost\n",
    "Bagging\n",
    "GBM\n",
    "Multi-layer Perceptron\n",
    "XgBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6765a81e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 421,
   "id": "4ad180b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_results(train_X, train_y, test_X, test_y):\n",
    "    for i, model_name in enumerate(MODELS):\n",
    "        print(i, model_name)\n",
    "\n",
    "        if model_name == 'LogisticRegression':\n",
    "            model = LogisticRegression()\n",
    "        elif model_name == 'RandomForest':      \n",
    "            model = RandomForestClassifier()\n",
    "        elif model_name == 'GaussianNB':  \n",
    "            model = GaussianNB()\n",
    "        elif model_name == 'BernoulliNB':  \n",
    "            model = BernoulliNB()\n",
    "        elif model_name == 'LinearSVM':         \n",
    "            model = svm.LinearSVC()\n",
    "        elif model_name == 'DecisionTree':         \n",
    "            model = DecisionTreeClassifier()\n",
    "        elif model_name == 'LDA':\n",
    "            model = LinearDiscriminantAnalysis()\n",
    "        elif model_name == 'AdaBoost':\n",
    "            model = AdaBoostClassifier()\n",
    "        elif model_name == 'Bagging':\n",
    "            model = BaggingClassifier()\n",
    "        elif model_name == 'gbm':         \n",
    "            model = GradientBoostingClassifier() \n",
    "        elif model_name == 'mlp':         \n",
    "            model = MLPClassifier() \n",
    "        elif model_name == 'xgb':\n",
    "            model = XGBRegressor()\n",
    "            \n",
    "            \n",
    "            \n",
    "        if(model_name=='svmlin' or model_name=='Passive Aggressive'): \n",
    "            model.fit(train_X, train_y)\n",
    "            predict = model.decision_function(test_X)\n",
    "        elif (model_name =='xgb'):\n",
    "            model.fit(np.asarray(train_X), train_y)\n",
    "            predict = model.predict(np.asarray(test_X))\n",
    "        else:\n",
    "            model.fit(train_X, train_y)\n",
    "            predict = model.predict_proba(test_X)[:,1]\n",
    "    \n",
    "#         model.fit(train_X, train_y)\n",
    "#         predict_y = model.predict_proba(test_X)[:,1]\n",
    "\n",
    "        auc = roc_auc_score(test_y, predict_y)\n",
    "        apr = average_precision_score(test_y, predict_y)\n",
    "\n",
    "        results[i, :] = [auc, apr]\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 422,
   "id": "3a690214",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 logisticregression\n",
      "1 randomforest\n",
      "2 gaussiannb\n",
      "3 bernoullinb\n",
      "4 svmlin\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/georgi/venvs/bin-synth/lib/python3.11/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5 Extra Trees\n",
      "6 LDA\n",
      "7 AdaBoostBagging\n",
      "8 gbm\n"
     ]
    },
    {
     "ename": "IndexError",
     "evalue": "index 8 is out of bounds for axis 0 with size 8",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mIndexError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[422], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mget_results\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain_X\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_y\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_X\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_y\u001b[49m\u001b[43m)\u001b[49m\n",
      "Cell \u001b[0;32mIn[421], line 48\u001b[0m, in \u001b[0;36mget_results\u001b[0;34m(train_X, train_y, test_X, test_y)\u001b[0m\n\u001b[1;32m     45\u001b[0m     auc \u001b[38;5;241m=\u001b[39m roc_auc_score(test_y, predict_y)\n\u001b[1;32m     46\u001b[0m     apr \u001b[38;5;241m=\u001b[39m average_precision_score(test_y, predict_y)\n\u001b[0;32m---> 48\u001b[0m     \u001b[43mresults\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;241m=\u001b[39m [auc, apr]\n\u001b[1;32m     49\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m results\n",
      "\u001b[0;31mIndexError\u001b[0m: index 8 is out of bounds for axis 0 with size 8"
     ]
    }
   ],
   "source": [
    "results = get_results(train_X, train_y, test_X, test_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7898ef82",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea88765d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "764c8dc6",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1414d0e9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdb69c5f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "686982c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# paper - 858 x 35"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "id": "49deb625",
   "metadata": {},
   "outputs": [],
   "source": [
    "cancer_path = \"cervical_cancer/data.csv.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "id": "6a0e94e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(cancer_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "id": "c19f8bf8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>N_sex_partners</th>\n",
       "      <th>First_sex_intercourse</th>\n",
       "      <th>N_pregnancies</th>\n",
       "      <th>Smokes</th>\n",
       "      <th>Smokes_years</th>\n",
       "      <th>Smokes_packs_year</th>\n",
       "      <th>Hormonal_Contraceptives</th>\n",
       "      <th>Hormonal_Contraceptives_years</th>\n",
       "      <th>IUD</th>\n",
       "      <th>...</th>\n",
       "      <th>STDs_Time_since_first_diagnosis</th>\n",
       "      <th>STDs_Time_since_last_diagnosis</th>\n",
       "      <th>Dx_Cancer</th>\n",
       "      <th>Dx_CIN</th>\n",
       "      <th>Dx_HPV</th>\n",
       "      <th>Dx</th>\n",
       "      <th>Hinselmann</th>\n",
       "      <th>Schiller</th>\n",
       "      <th>Citology</th>\n",
       "      <th>Biopsy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>18</td>\n",
       "      <td>4.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>14.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>34</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>52</td>\n",
       "      <td>5.0</td>\n",
       "      <td>16.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>46</td>\n",
       "      <td>3.0</td>\n",
       "      <td>21.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>15.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>853</th>\n",
       "      <td>34</td>\n",
       "      <td>3.0</td>\n",
       "      <td>18.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>854</th>\n",
       "      <td>32</td>\n",
       "      <td>2.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>8.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>855</th>\n",
       "      <td>25</td>\n",
       "      <td>2.0</td>\n",
       "      <td>17.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.08</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>856</th>\n",
       "      <td>33</td>\n",
       "      <td>2.0</td>\n",
       "      <td>24.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.08</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>857</th>\n",
       "      <td>29</td>\n",
       "      <td>2.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>858 rows × 36 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Age  N_sex_partners  First_sex_intercourse  N_pregnancies Smokes  \\\n",
       "0     18             4.0                   15.0            1.0    0.0   \n",
       "1     15             1.0                   14.0            1.0    0.0   \n",
       "2     34             1.0                    NaN            1.0    0.0   \n",
       "3     52             5.0                   16.0            4.0    1.0   \n",
       "4     46             3.0                   21.0            4.0    0.0   \n",
       "..   ...             ...                    ...            ...    ...   \n",
       "853   34             3.0                   18.0            0.0    0.0   \n",
       "854   32             2.0                   19.0            1.0    0.0   \n",
       "855   25             2.0                   17.0            0.0    0.0   \n",
       "856   33             2.0                   24.0            2.0    0.0   \n",
       "857   29             2.0                   20.0            1.0    0.0   \n",
       "\n",
       "     Smokes_years  Smokes_packs_year Hormonal_Contraceptives  \\\n",
       "0             0.0                0.0                     0.0   \n",
       "1             0.0                0.0                     0.0   \n",
       "2             0.0                0.0                     0.0   \n",
       "3            37.0               37.0                     1.0   \n",
       "4             0.0                0.0                     1.0   \n",
       "..            ...                ...                     ...   \n",
       "853           0.0                0.0                     0.0   \n",
       "854           0.0                0.0                     1.0   \n",
       "855           0.0                0.0                     1.0   \n",
       "856           0.0                0.0                     1.0   \n",
       "857           0.0                0.0                     1.0   \n",
       "\n",
       "     Hormonal_Contraceptives_years  IUD  ...  STDs_Time_since_first_diagnosis  \\\n",
       "0                             0.00  0.0  ...                              NaN   \n",
       "1                             0.00  0.0  ...                              NaN   \n",
       "2                             0.00  0.0  ...                              NaN   \n",
       "3                             3.00  0.0  ...                              NaN   \n",
       "4                            15.00  0.0  ...                              NaN   \n",
       "..                             ...  ...  ...                              ...   \n",
       "853                           0.00  0.0  ...                              NaN   \n",
       "854                           8.00  0.0  ...                              NaN   \n",
       "855                           0.08  0.0  ...                              NaN   \n",
       "856                           0.08  0.0  ...                              NaN   \n",
       "857                           0.50  0.0  ...                              NaN   \n",
       "\n",
       "    STDs_Time_since_last_diagnosis  Dx_Cancer Dx_CIN Dx_HPV Dx Hinselmann  \\\n",
       "0                              NaN          0      0      0  0          0   \n",
       "1                              NaN          0      0      0  0          0   \n",
       "2                              NaN          0      0      0  0          0   \n",
       "3                              NaN          1      0      1  0          0   \n",
       "4                              NaN          0      0      0  0          0   \n",
       "..                             ...        ...    ...    ... ..        ...   \n",
       "853                            NaN          0      0      0  0          0   \n",
       "854                            NaN          0      0      0  0          0   \n",
       "855                            NaN          0      0      0  0          0   \n",
       "856                            NaN          0      0      0  0          0   \n",
       "857                            NaN          0      0      0  0          0   \n",
       "\n",
       "    Schiller Citology Biopsy  \n",
       "0          0        0      0  \n",
       "1          0        0      0  \n",
       "2          0        0      0  \n",
       "3          0        0      0  \n",
       "4          0        0      0  \n",
       "..       ...      ...    ...  \n",
       "853        0        0      0  \n",
       "854        0        0      0  \n",
       "855        0        1      0  \n",
       "856        0        0      0  \n",
       "857        0        0      0  \n",
       "\n",
       "[858 rows x 36 columns]"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23a4381b",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "id": "6236cf76",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn '?' placeholders into NaN, then discard the two STD diagnosis-timing columns.\n",
    "df = (\n",
    "    df.replace('?', np.nan)\n",
    "    .drop(columns=[\"STDs_Time_since_first_diagnosis\", \"STDs_Time_since_last_diagnosis\"])\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "id": "a093e171",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>N_sex_partners</th>\n",
       "      <th>First_sex_intercourse</th>\n",
       "      <th>N_pregnancies</th>\n",
       "      <th>Smokes</th>\n",
       "      <th>Smokes_years</th>\n",
       "      <th>Smokes_packs_year</th>\n",
       "      <th>Hormonal_Contraceptives</th>\n",
       "      <th>Hormonal_Contraceptives_years</th>\n",
       "      <th>IUD</th>\n",
       "      <th>IUD_years</th>\n",
       "      <th>...</th>\n",
       "      <th>STDs_vaginal_condylomatosis</th>\n",
       "      <th>STDs_vulvo_perineal_condylomatosis</th>\n",
       "      <th>STDs_syphilis</th>\n",
       "      <th>STDs_pelvic_inflammatory_disease</th>\n",
       "      <th>STDs_genital_herpes</th>\n",
       "      <th>STDs_molluscum_contagiosum</th>\n",
       "      <th>STDs_AIDS</th>\n",
       "      <th>STDs_HIV</th>\n",
       "      <th>STDs_Hepatitis_B</th>\n",
       "      <th>STDs_HPV</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>14.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5.0</td>\n",
       "      <td>16.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3.0</td>\n",
       "      <td>21.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>15.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>853</th>\n",
       "      <td>3.0</td>\n",
       "      <td>18.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>854</th>\n",
       "      <td>2.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>8.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>855</th>\n",
       "      <td>2.0</td>\n",
       "      <td>17.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.08</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>856</th>\n",
       "      <td>2.0</td>\n",
       "      <td>24.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.08</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>857</th>\n",
       "      <td>2.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>858 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     N_sex_partners  First_sex_intercourse  N_pregnancies Smokes  \\\n",
       "0               4.0                   15.0            1.0    0.0   \n",
       "1               1.0                   14.0            1.0    0.0   \n",
       "2               1.0                    NaN            1.0    0.0   \n",
       "3               5.0                   16.0            4.0    1.0   \n",
       "4               3.0                   21.0            4.0    0.0   \n",
       "..              ...                    ...            ...    ...   \n",
       "853             3.0                   18.0            0.0    0.0   \n",
       "854             2.0                   19.0            1.0    0.0   \n",
       "855             2.0                   17.0            0.0    0.0   \n",
       "856             2.0                   24.0            2.0    0.0   \n",
       "857             2.0                   20.0            1.0    0.0   \n",
       "\n",
       "     Smokes_years  Smokes_packs_year Hormonal_Contraceptives  \\\n",
       "0             0.0                0.0                     0.0   \n",
       "1             0.0                0.0                     0.0   \n",
       "2             0.0                0.0                     0.0   \n",
       "3            37.0               37.0                     1.0   \n",
       "4             0.0                0.0                     1.0   \n",
       "..            ...                ...                     ...   \n",
       "853           0.0                0.0                     0.0   \n",
       "854           0.0                0.0                     1.0   \n",
       "855           0.0                0.0                     1.0   \n",
       "856           0.0                0.0                     1.0   \n",
       "857           0.0                0.0                     1.0   \n",
       "\n",
       "     Hormonal_Contraceptives_years  IUD  IUD_years  ...  \\\n",
       "0                             0.00  0.0        0.0  ...   \n",
       "1                             0.00  0.0        0.0  ...   \n",
       "2                             0.00  0.0        0.0  ...   \n",
       "3                             3.00  0.0        0.0  ...   \n",
       "4                            15.00  0.0        0.0  ...   \n",
       "..                             ...  ...        ...  ...   \n",
       "853                           0.00  0.0        0.0  ...   \n",
       "854                           8.00  0.0        0.0  ...   \n",
       "855                           0.08  0.0        0.0  ...   \n",
       "856                           0.08  0.0        0.0  ...   \n",
       "857                           0.50  0.0        0.0  ...   \n",
       "\n",
       "    STDs_vaginal_condylomatosis  STDs_vulvo_perineal_condylomatosis  \\\n",
       "0                           0.0                                 0.0   \n",
       "1                           0.0                                 0.0   \n",
       "2                           0.0                                 0.0   \n",
       "3                           0.0                                 0.0   \n",
       "4                           0.0                                 0.0   \n",
       "..                          ...                                 ...   \n",
       "853                         0.0                                 0.0   \n",
       "854                         0.0                                 0.0   \n",
       "855                         0.0                                 0.0   \n",
       "856                         0.0                                 0.0   \n",
       "857                         0.0                                 0.0   \n",
       "\n",
       "    STDs_syphilis STDs_pelvic_inflammatory_disease STDs_genital_herpes  \\\n",
       "0             0.0                              0.0                 0.0   \n",
       "1             0.0                              0.0                 0.0   \n",
       "2             0.0                              0.0                 0.0   \n",
       "3             0.0                              0.0                 0.0   \n",
       "4             0.0                              0.0                 0.0   \n",
       "..            ...                              ...                 ...   \n",
       "853           0.0                              0.0                 0.0   \n",
       "854           0.0                              0.0                 0.0   \n",
       "855           0.0                              0.0                 0.0   \n",
       "856           0.0                              0.0                 0.0   \n",
       "857           0.0                              0.0                 0.0   \n",
       "\n",
       "    STDs_molluscum_contagiosum STDs_AIDS STDs_HIV STDs_Hepatitis_B STDs_HPV  \n",
       "0                          0.0       0.0      0.0              0.0      0.0  \n",
       "1                          0.0       0.0      0.0              0.0      0.0  \n",
       "2                          0.0       0.0      0.0              0.0      0.0  \n",
       "3                          0.0       0.0      0.0              0.0      0.0  \n",
       "4                          0.0       0.0      0.0              0.0      0.0  \n",
       "..                         ...       ...      ...              ...      ...  \n",
       "853                        0.0       0.0      0.0              0.0      0.0  \n",
       "854                        0.0       0.0      0.0              0.0      0.0  \n",
       "855                        0.0       0.0      0.0              0.0      0.0  \n",
       "856                        0.0       0.0      0.0              0.0      0.0  \n",
       "857                        0.0       0.0      0.0              0.0      0.0  \n",
       "\n",
       "[858 rows x 24 columns]"
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.loc[:, df.isna().any()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "id": "fdaabb8f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Age                                     int64\n",
       "N_sex_partners                        float64\n",
       "First_sex_intercourse                 float64\n",
       "N_pregnancies                         float64\n",
       "Smokes                                 object\n",
       "Smokes_years                          float64\n",
       "Smokes_packs_year                     float64\n",
       "Hormonal_Contraceptives                object\n",
       "Hormonal_Contraceptives_years         float64\n",
       "IUD                                    object\n",
       "IUD_years                             float64\n",
       "STDs                                   object\n",
       "STDs_number                           float64\n",
       "STDs_condylomatosis                    object\n",
       "STDs_cervical_condylomatosis           object\n",
       "STDs_vaginal_condylomatosis            object\n",
       "STDs_vulvo_perineal_condylomatosis     object\n",
       "STDs_syphilis                          object\n",
       "STDs_pelvic_inflammatory_disease       object\n",
       "STDs_genital_herpes                    object\n",
       "STDs_molluscum_contagiosum             object\n",
       "STDs_AIDS                              object\n",
       "STDs_HIV                               object\n",
       "STDs_Hepatitis_B                       object\n",
       "STDs_HPV                               object\n",
       "STDs_N_diagnosis                        int64\n",
       "Dx_Cancer                               int64\n",
       "Dx_CIN                                  int64\n",
       "Dx_HPV                                  int64\n",
       "Dx                                      int64\n",
       "Hinselmann                              int64\n",
       "Schiller                                int64\n",
       "Citology                                int64\n",
       "Biopsy                                  int64\n",
       "dtype: object"
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "id": "7a5ef85c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Coerce every column to a numeric dtype, then fill each missing value with that\n",
    "# column's mode (.iloc[0] picks the first row of DataFrame.mode()).\n",
    "df = df.apply(pd.to_numeric).pipe(lambda frame: frame.fillna(frame.mode().iloc[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14341e8c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "id": "1d3a9654",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Age                                     int64\n",
       "N_sex_partners                        float64\n",
       "First_sex_intercourse                 float64\n",
       "N_pregnancies                         float64\n",
       "Smokes                                float64\n",
       "Smokes_years                          float64\n",
       "Smokes_packs_year                     float64\n",
       "Hormonal_Contraceptives               float64\n",
       "Hormonal_Contraceptives_years         float64\n",
       "IUD                                   float64\n",
       "IUD_years                             float64\n",
       "STDs                                  float64\n",
       "STDs_number                           float64\n",
       "STDs_condylomatosis                   float64\n",
       "STDs_cervical_condylomatosis          float64\n",
       "STDs_vaginal_condylomatosis           float64\n",
       "STDs_vulvo_perineal_condylomatosis    float64\n",
       "STDs_syphilis                         float64\n",
       "STDs_pelvic_inflammatory_disease      float64\n",
       "STDs_genital_herpes                   float64\n",
       "STDs_molluscum_contagiosum            float64\n",
       "STDs_AIDS                             float64\n",
       "STDs_HIV                              float64\n",
       "STDs_Hepatitis_B                      float64\n",
       "STDs_HPV                              float64\n",
       "STDs_N_diagnosis                        int64\n",
       "Dx_Cancer                               int64\n",
       "Dx_CIN                                  int64\n",
       "Dx_HPV                                  int64\n",
       "Dx                                      int64\n",
       "Hinselmann                              int64\n",
       "Schiller                                int64\n",
       "Citology                                int64\n",
       "Biopsy                                  int64\n",
       "dtype: object"
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19fb21ce",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "id": "7cc0dc9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target dtype for every column, consumed by df.astype(dtypes) in the next cell.\n",
    "# The four columns named in the tuple below stay float; every other column is cast to int.\n",
    "# The int casts only succeed because missing values were filled beforehand.\n",
    "dtypes = {\n",
    "    column: (\n",
    "        float\n",
    "        if column in (\n",
    "            'Smokes_years',\n",
    "            'Smokes_packs_year',\n",
    "            'Hormonal_Contraceptives_years',\n",
    "            'IUD_years',\n",
    "        )\n",
    "        else int\n",
    "    )\n",
    "    for column in [\n",
    "        'Age',\n",
    "        'N_sex_partners',\n",
    "        'First_sex_intercourse',\n",
    "        'N_pregnancies',\n",
    "        'Smokes',\n",
    "        'Smokes_years',\n",
    "        'Smokes_packs_year',\n",
    "        'Hormonal_Contraceptives',\n",
    "        'Hormonal_Contraceptives_years',\n",
    "        'IUD',\n",
    "        'IUD_years',\n",
    "        'STDs',\n",
    "        'STDs_number',\n",
    "        'STDs_condylomatosis',\n",
    "        'STDs_cervical_condylomatosis',\n",
    "        'STDs_vaginal_condylomatosis',\n",
    "        'STDs_vulvo_perineal_condylomatosis',\n",
    "        'STDs_syphilis',\n",
    "        'STDs_pelvic_inflammatory_disease',\n",
    "        'STDs_genital_herpes',\n",
    "        'STDs_molluscum_contagiosum',\n",
    "        'STDs_AIDS',\n",
    "        'STDs_HIV',\n",
    "        'STDs_Hepatitis_B',\n",
    "        'STDs_HPV',\n",
    "        'STDs_N_diagnosis',\n",
    "        'Dx_Cancer',\n",
    "        'Dx_CIN',\n",
    "        'Dx_HPV',\n",
    "        'Dx',\n",
    "        'Hinselmann',\n",
    "        'Schiller',\n",
    "        'Citology',\n",
    "        'Biopsy',\n",
    "    ]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "id": "63be4abc",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.astype(dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49f2953c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b761d731",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "id": "82234085",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier\n",
    "from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB\n",
    "from sklearn import svm\n",
    "from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n",
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
    "from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier\n",
    "# from xgboost import XGBRegressor\n",
    "\n",
    "from sklearn.metrics import roc_auc_score, average_precision_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "745685f4",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c52dc337",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "id": "6a3c1cdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df, test_df = train_test_split(df, stratify=df[\"Biopsy\"], test_size=0.2, random_state=13)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b19a5df",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "id": "cdc067ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apparently, we assume that we know the bounds (TODO: comment on that)\n",
    "min_max_scaler = MinMaxScaler()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "id": "57f3bce9",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = min_max_scaler.fit_transform(train_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "id": "f4c626ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# use train_data to train the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "id": "6d80bbc0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every column except the last is a feature; the last column holds the label.\n",
    "train_X = train_data[:, :-1]\n",
    "train_y = train_data[:, -1].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "981d39d7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "id": "4dca8f5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "id": "ff450702",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data = min_max_scaler.transform(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "id": "a20fa2c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same layout as the training matrix: features first, label in the last column.\n",
    "test_X = test_data[:, :-1]\n",
    "test_y = test_data[:, -1].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51d6cf9a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e94c81e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c9b8f9a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b148f2f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "id": "a860be62",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline: fit a default logistic regression on the training split\n",
    "# (fit() returns the estimator itself, so construction and fitting can be chained).\n",
    "model = LogisticRegression().fit(train_X, train_y)\n",
    "# Keep only the second predict_proba column as the score for the test rows.\n",
    "predict_y = model.predict_proba(test_X)[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "id": "7e4c5728",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the baseline predictions with ROC AUC and average precision.\n",
    "auc, apr = (\n",
    "    roc_auc_score(test_y, predict_y),\n",
    "    average_precision_score(test_y, predict_y),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "id": "7bd7221c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.9909655561829476, 0.8697116312089574)"
      ]
     },
     "execution_count": 187,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "auc, apr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "425c748b",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "id": "5ee0df1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names of the classifiers to benchmark; each one must match a branch of the\n",
    "# training loop further down.\n",
    "# Every entry needs its own trailing comma: Python silently concatenates adjacent\n",
    "# string literals, which previously merged 'AdaBoost' and 'Bagging' into one name.\n",
    "MODELS = [\n",
    "    'logisticregression',\n",
    "    'randomforest',\n",
    "    'gaussiannb',\n",
    "    'bernoullinb',\n",
    "#     'svmlin',\n",
    "    'Extra Trees',\n",
    "    'LDA',\n",
    "    'AdaBoost',\n",
    "    'Bagging',\n",
    "    'gbm',\n",
    "#     'xgb'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d59e63e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b910cbb8",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "id": "73d6b011",
   "metadata": {},
   "outputs": [],
   "source": [
    "results = np.zeros([len(MODELS), 2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "id": "9b046cbb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 logisticregression\n",
      "1 randomforest\n",
      "2 gaussiannb\n",
      "3 bernoullinb\n",
      "4 Extra Trees\n",
      "5 LDA\n",
      "6 AdaBoostBagging\n",
      "7 gbm\n"
     ]
    }
   ],
   "source": [
    "# Benchmark every classifier named in MODELS on the same train/test split and store\n",
    "# (ROC AUC, average precision) in the matching row of `results`.\n",
    "SEED = 13  # same value as the train/test split, so the stochastic ensembles are repeatable\n",
    "\n",
    "# Factories are lambdas so an estimator class is only touched when its name is requested.\n",
    "MODEL_FACTORIES = {\n",
    "    'logisticregression': lambda: LogisticRegression(),\n",
    "    'randomforest': lambda: RandomForestClassifier(random_state=SEED),\n",
    "    'gaussiannb': lambda: GaussianNB(),\n",
    "    'bernoullinb': lambda: BernoulliNB(),\n",
    "    'multinb': lambda: MultinomialNB(),\n",
    "    'svmlin': lambda: svm.LinearSVC(),\n",
    "    'gbm': lambda: GradientBoostingClassifier(random_state=SEED),\n",
    "    'Extra Trees': lambda: ExtraTreesClassifier(n_estimators=20, random_state=SEED),\n",
    "    'LDA': lambda: LinearDiscriminantAnalysis(),\n",
    "    'Passive Aggressive': lambda: PassiveAggressiveClassifier(),\n",
    "    'AdaBoost': lambda: AdaBoostClassifier(random_state=SEED),\n",
    "    'Bagging': lambda: BaggingClassifier(random_state=SEED),\n",
    "    # NOTE(review): this is a regressor, scored through predict() below;\n",
    "    # confirm whether XGBClassifier was intended before enabling 'xgb'.\n",
    "    'xgb': lambda: XGBRegressor(),\n",
    "}\n",
    "\n",
    "for i, model_name in enumerate(MODELS):\n",
    "    print(i, model_name)\n",
    "\n",
    "    # Fail loudly on an unknown name instead of silently re-scoring whatever\n",
    "    # estimator is still bound to `model` from the previous iteration.\n",
    "    if model_name not in MODEL_FACTORIES:\n",
    "        raise ValueError(\n",
    "            f'Unknown model name {model_name!r}; check MODELS for a typo or a missing comma'\n",
    "        )\n",
    "    model = MODEL_FACTORIES[model_name]()\n",
    "\n",
    "    model.fit(train_X, train_y)\n",
    "\n",
    "    # Prefer class-1 probabilities; fall back to raw scores for estimators that\n",
    "    # do not provide predict_proba (both ranking metrics accept either).\n",
    "    if hasattr(model, 'predict_proba'):\n",
    "        predict_y = model.predict_proba(test_X)[:, 1]\n",
    "    elif hasattr(model, 'decision_function'):\n",
    "        predict_y = model.decision_function(test_X)\n",
    "    else:\n",
    "        predict_y = model.predict(test_X)\n",
    "\n",
    "    auc = roc_auc_score(test_y, predict_y)\n",
    "    apr = average_precision_score(test_y, predict_y)\n",
    "\n",
    "    results[i, :] = [auc, apr]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 205,
   "id": "281d6d81",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.99096556, 0.86971163],\n",
       "       [0.98136646, 0.7032541 ],\n",
       "       [0.94099379, 0.36666667],\n",
       "       [0.99068323, 0.89220779],\n",
       "       [0.98193111, 0.71147437],\n",
       "       [0.98475438, 0.79221514],\n",
       "       [0.98475438, 0.79221514],\n",
       "       [0.97741389, 0.63868707]])"
      ]
     },
     "execution_count": 205,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    " results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 206,
   "id": "43bff37b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.97910785, 0.72080399])"
      ]
     },
     "execution_count": 206,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results.mean(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e06e441",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87707d2f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbbc70a0",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae1ebd9c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb4167d3",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 297,
   "id": "054b760b",
   "metadata": {},
   "outputs": [],
   "source": [
    "seizure_path = \"epileptic_seizure/data.csv.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 298,
   "id": "98758e77",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(seizure_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 299,
   "id": "d894e37d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df = df.astype(\"int\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 300,
   "id": "b51637dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Binarise the label: values 2-5 become 0, every other value is left unchanged.\n",
    "# A single .loc assignment writes to df itself; the chained df[...][mask] = 0 form\n",
    "# may write to a temporary copy and is a no-op under pandas copy-on-write.\n",
    "df.loc[df[\"y\"].isin([2, 3, 4, 5]), \"y\"] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 301,
   "id": "94131a15",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>X1</th>\n",
       "      <th>X2</th>\n",
       "      <th>X3</th>\n",
       "      <th>X4</th>\n",
       "      <th>X5</th>\n",
       "      <th>X6</th>\n",
       "      <th>X7</th>\n",
       "      <th>X8</th>\n",
       "      <th>X9</th>\n",
       "      <th>X10</th>\n",
       "      <th>...</th>\n",
       "      <th>X170</th>\n",
       "      <th>X171</th>\n",
       "      <th>X172</th>\n",
       "      <th>X173</th>\n",
       "      <th>X174</th>\n",
       "      <th>X175</th>\n",
       "      <th>X176</th>\n",
       "      <th>X177</th>\n",
       "      <th>X178</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>135</td>\n",
       "      <td>190</td>\n",
       "      <td>229</td>\n",
       "      <td>223</td>\n",
       "      <td>192</td>\n",
       "      <td>125</td>\n",
       "      <td>55</td>\n",
       "      <td>-9</td>\n",
       "      <td>-33</td>\n",
       "      <td>-38</td>\n",
       "      <td>...</td>\n",
       "      <td>-17</td>\n",
       "      <td>-15</td>\n",
       "      <td>-31</td>\n",
       "      <td>-77</td>\n",
       "      <td>-103</td>\n",
       "      <td>-127</td>\n",
       "      <td>-116</td>\n",
       "      <td>-83</td>\n",
       "      <td>-51</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>386</td>\n",
       "      <td>382</td>\n",
       "      <td>356</td>\n",
       "      <td>331</td>\n",
       "      <td>320</td>\n",
       "      <td>315</td>\n",
       "      <td>307</td>\n",
       "      <td>272</td>\n",
       "      <td>244</td>\n",
       "      <td>232</td>\n",
       "      <td>...</td>\n",
       "      <td>164</td>\n",
       "      <td>150</td>\n",
       "      <td>146</td>\n",
       "      <td>152</td>\n",
       "      <td>157</td>\n",
       "      <td>156</td>\n",
       "      <td>154</td>\n",
       "      <td>143</td>\n",
       "      <td>129</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-32</td>\n",
       "      <td>-39</td>\n",
       "      <td>-47</td>\n",
       "      <td>-37</td>\n",
       "      <td>-32</td>\n",
       "      <td>-36</td>\n",
       "      <td>-57</td>\n",
       "      <td>-73</td>\n",
       "      <td>-85</td>\n",
       "      <td>-94</td>\n",
       "      <td>...</td>\n",
       "      <td>57</td>\n",
       "      <td>64</td>\n",
       "      <td>48</td>\n",
       "      <td>19</td>\n",
       "      <td>-12</td>\n",
       "      <td>-30</td>\n",
       "      <td>-35</td>\n",
       "      <td>-35</td>\n",
       "      <td>-36</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-105</td>\n",
       "      <td>-101</td>\n",
       "      <td>-96</td>\n",
       "      <td>-92</td>\n",
       "      <td>-89</td>\n",
       "      <td>-95</td>\n",
       "      <td>-102</td>\n",
       "      <td>-100</td>\n",
       "      <td>-87</td>\n",
       "      <td>-79</td>\n",
       "      <td>...</td>\n",
       "      <td>-82</td>\n",
       "      <td>-81</td>\n",
       "      <td>-80</td>\n",
       "      <td>-77</td>\n",
       "      <td>-85</td>\n",
       "      <td>-77</td>\n",
       "      <td>-72</td>\n",
       "      <td>-69</td>\n",
       "      <td>-65</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-9</td>\n",
       "      <td>-65</td>\n",
       "      <td>-98</td>\n",
       "      <td>-102</td>\n",
       "      <td>-78</td>\n",
       "      <td>-48</td>\n",
       "      <td>-16</td>\n",
       "      <td>0</td>\n",
       "      <td>-21</td>\n",
       "      <td>-59</td>\n",
       "      <td>...</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>-12</td>\n",
       "      <td>-32</td>\n",
       "      <td>-41</td>\n",
       "      <td>-65</td>\n",
       "      <td>-83</td>\n",
       "      <td>-89</td>\n",
       "      <td>-73</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11495</th>\n",
       "      <td>-22</td>\n",
       "      <td>-22</td>\n",
       "      <td>-23</td>\n",
       "      <td>-26</td>\n",
       "      <td>-36</td>\n",
       "      <td>-42</td>\n",
       "      <td>-45</td>\n",
       "      <td>-42</td>\n",
       "      <td>-45</td>\n",
       "      <td>-49</td>\n",
       "      <td>...</td>\n",
       "      <td>15</td>\n",
       "      <td>16</td>\n",
       "      <td>12</td>\n",
       "      <td>5</td>\n",
       "      <td>-1</td>\n",
       "      <td>-18</td>\n",
       "      <td>-37</td>\n",
       "      <td>-47</td>\n",
       "      <td>-48</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11496</th>\n",
       "      <td>-47</td>\n",
       "      <td>-11</td>\n",
       "      <td>28</td>\n",
       "      <td>77</td>\n",
       "      <td>141</td>\n",
       "      <td>211</td>\n",
       "      <td>246</td>\n",
       "      <td>240</td>\n",
       "      <td>193</td>\n",
       "      <td>136</td>\n",
       "      <td>...</td>\n",
       "      <td>-65</td>\n",
       "      <td>-33</td>\n",
       "      <td>-7</td>\n",
       "      <td>14</td>\n",
       "      <td>27</td>\n",
       "      <td>48</td>\n",
       "      <td>77</td>\n",
       "      <td>117</td>\n",
       "      <td>170</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11497</th>\n",
       "      <td>14</td>\n",
       "      <td>6</td>\n",
       "      <td>-13</td>\n",
       "      <td>-16</td>\n",
       "      <td>10</td>\n",
       "      <td>26</td>\n",
       "      <td>27</td>\n",
       "      <td>-9</td>\n",
       "      <td>4</td>\n",
       "      <td>14</td>\n",
       "      <td>...</td>\n",
       "      <td>-65</td>\n",
       "      <td>-48</td>\n",
       "      <td>-61</td>\n",
       "      <td>-62</td>\n",
       "      <td>-67</td>\n",
       "      <td>-30</td>\n",
       "      <td>-2</td>\n",
       "      <td>-1</td>\n",
       "      <td>-8</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11498</th>\n",
       "      <td>-40</td>\n",
       "      <td>-25</td>\n",
       "      <td>-9</td>\n",
       "      <td>-12</td>\n",
       "      <td>-2</td>\n",
       "      <td>12</td>\n",
       "      <td>7</td>\n",
       "      <td>19</td>\n",
       "      <td>22</td>\n",
       "      <td>29</td>\n",
       "      <td>...</td>\n",
       "      <td>121</td>\n",
       "      <td>135</td>\n",
       "      <td>148</td>\n",
       "      <td>143</td>\n",
       "      <td>116</td>\n",
       "      <td>86</td>\n",
       "      <td>68</td>\n",
       "      <td>59</td>\n",
       "      <td>55</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11499</th>\n",
       "      <td>29</td>\n",
       "      <td>41</td>\n",
       "      <td>57</td>\n",
       "      <td>72</td>\n",
       "      <td>74</td>\n",
       "      <td>62</td>\n",
       "      <td>54</td>\n",
       "      <td>43</td>\n",
       "      <td>31</td>\n",
       "      <td>23</td>\n",
       "      <td>...</td>\n",
       "      <td>-59</td>\n",
       "      <td>-25</td>\n",
       "      <td>-4</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>-2</td>\n",
       "      <td>2</td>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>11500 rows × 179 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  ...  X170  X171  \\\n",
       "0      135  190  229  223  192  125   55   -9  -33  -38  ...   -17   -15   \n",
       "1      386  382  356  331  320  315  307  272  244  232  ...   164   150   \n",
       "2      -32  -39  -47  -37  -32  -36  -57  -73  -85  -94  ...    57    64   \n",
       "3     -105 -101  -96  -92  -89  -95 -102 -100  -87  -79  ...   -82   -81   \n",
       "4       -9  -65  -98 -102  -78  -48  -16    0  -21  -59  ...     4     2   \n",
       "...    ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   ...   \n",
       "11495  -22  -22  -23  -26  -36  -42  -45  -42  -45  -49  ...    15    16   \n",
       "11496  -47  -11   28   77  141  211  246  240  193  136  ...   -65   -33   \n",
       "11497   14    6  -13  -16   10   26   27   -9    4   14  ...   -65   -48   \n",
       "11498  -40  -25   -9  -12   -2   12    7   19   22   29  ...   121   135   \n",
       "11499   29   41   57   72   74   62   54   43   31   23  ...   -59   -25   \n",
       "\n",
       "       X172  X173  X174  X175  X176  X177  X178  y  \n",
       "0       -31   -77  -103  -127  -116   -83   -51  0  \n",
       "1       146   152   157   156   154   143   129  1  \n",
       "2        48    19   -12   -30   -35   -35   -36  0  \n",
       "3       -80   -77   -85   -77   -72   -69   -65  0  \n",
       "4       -12   -32   -41   -65   -83   -89   -73  0  \n",
       "...     ...   ...   ...   ...   ...   ...   ... ..  \n",
       "11495    12     5    -1   -18   -37   -47   -48  0  \n",
       "11496    -7    14    27    48    77   117   170  1  \n",
       "11497   -61   -62   -67   -30    -2    -1    -8  0  \n",
       "11498   148   143   116    86    68    59    55  0  \n",
       "11499    -4     2     5     4    -2     2    20  0  \n",
       "\n",
       "[11500 rows x 179 columns]"
      ]
     },
     "execution_count": 301,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 302,
   "id": "26542d1c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.4609026465028355 0.42785059515141555\n"
     ]
    }
   ],
   "source": [
    "train_df, test_df = train_test_split(df, stratify=df[\"y\"], test_size=0.2, random_state=13)\n",
    "\n",
    "min_max_scaler = MinMaxScaler()\n",
    "train_data = min_max_scaler.fit_transform(train_df)\n",
    "train_X, train_y = train_data[:, :-1], train_data[:, -1].astype(int)\n",
    "\n",
    "test_data = min_max_scaler.transform(test_df)\n",
    "test_X, test_y = test_data[:, :-1], test_data[:, -1].astype(int)\n",
    "\n",
    "model = LogisticRegression(max_iter=2000)\n",
    "model.fit(train_X, train_y)\n",
    "predict_y = model.predict_proba(test_X)[:,1]\n",
    "\n",
    "auc = roc_auc_score(test_y, predict_y)\n",
    "apr = average_precision_score(test_y, predict_y)\n",
    "\n",
    "print(auc, apr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9699f9f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 303,
   "id": "86305b0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "results = np.zeros([len(MODELS), 2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 304,
   "id": "c4b52620",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 logisticregression\n",
      "1 randomforest\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/georgi/venvs/bin-synth/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
      "\n",
      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
      "Please also refer to the documentation for alternative solver options:\n",
      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
      "  n_iter_i = _check_optimize_result(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2 gaussiannb\n",
      "3 bernoullinb\n",
      "4 Extra Trees\n",
      "5 LDA\n",
      "6 AdaBoostBagging\n",
      "7 gbm\n"
     ]
    }
   ],
   "source": [
    "for i, model_name in enumerate(MODELS):\n",
    "    print(i, model_name)\n",
    "\n",
    "    if model_name == 'logisticregression':\n",
    "        model = LogisticRegression()\n",
    "    elif model_name == 'randomforest':      \n",
    "        model = RandomForestClassifier()\n",
    "    elif model_name == 'gaussiannb':  \n",
    "        model = GaussianNB()\n",
    "    elif model_name == 'bernoullinb':  \n",
    "        model = BernoulliNB()\n",
    "    elif model_name == 'multinb':  \n",
    "        model = MultinomialNB()\n",
    "    elif model_name == 'svmlin':         \n",
    "        model = svm.LinearSVC() \n",
    "    elif model_name == 'gbm':         \n",
    "        model = GradientBoostingClassifier()   \n",
    "    elif model_name == 'Extra Trees':\n",
    "        model =  ExtraTreesClassifier(n_estimators=20)\n",
    "    elif model_name == 'LDA':\n",
    "        model =  LinearDiscriminantAnalysis() \n",
    "    elif model_name == 'Passive Aggressive':\n",
    "        model =   PassiveAggressiveClassifier()\n",
    "    elif model_name == 'AdaBoost':\n",
    "        model = AdaBoostClassifier()\n",
    "    elif model_name == 'Bagging':\n",
    "        model = BaggingClassifier()\n",
    "    elif model_name == 'xgb':\n",
    "        model = XGBRegressor()\n",
    "        \n",
    "    model.fit(train_X, train_y)\n",
    "    predict_y = model.predict_proba(test_X)[:,1]\n",
    "    \n",
    "    auc = roc_auc_score(test_y, predict_y)\n",
    "    apr = average_precision_score(test_y, predict_y)\n",
    "    \n",
    "    results[i, :] = [auc, apr]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8904ef57",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 305,
   "id": "5e8f698b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.46077977 0.43347903]\n",
      " [0.99131498 0.97615955]\n",
      " [0.98196479 0.89089344]\n",
      " [0.52282609 0.23652174]\n",
      " [0.9917663  0.97109   ]\n",
      " [0.48111886 0.3933929 ]\n",
      " [0.48111886 0.3933929 ]\n",
      " [0.99183483 0.97252662]]\n"
     ]
    }
   ],
   "source": [
    "print(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 307,
   "id": "30d1aa4f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.73784056, 0.65843202])"
      ]
     },
     "execution_count": 307,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results.mean(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8227817",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6218f9d0",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed0b8b71",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b591a54",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ba5d188",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eac819aa",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee9fa86f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd49ad0a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 359,
   "id": "8adfce5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "isolet_path = \"isolet/data.csv.gz\"\n",
    "isolet_test_path = \"isolet/test_data.csv.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 360,
   "id": "2ffaf8b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(isolet_path)\n",
    "test_df = pd.read_csv(isolet_test_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 361,
   "id": "50771afb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>f1</th>\n",
       "      <th>f2</th>\n",
       "      <th>f3</th>\n",
       "      <th>f4</th>\n",
       "      <th>f5</th>\n",
       "      <th>f6</th>\n",
       "      <th>f7</th>\n",
       "      <th>f8</th>\n",
       "      <th>f9</th>\n",
       "      <th>f10</th>\n",
       "      <th>...</th>\n",
       "      <th>f609</th>\n",
       "      <th>f610</th>\n",
       "      <th>f611</th>\n",
       "      <th>f612</th>\n",
       "      <th>f613</th>\n",
       "      <th>f614</th>\n",
       "      <th>f615</th>\n",
       "      <th>f616</th>\n",
       "      <th>f617</th>\n",
       "      <th>class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.4394</td>\n",
       "      <td>-0.0930</td>\n",
       "      <td>0.1718</td>\n",
       "      <td>0.4620</td>\n",
       "      <td>0.6226</td>\n",
       "      <td>0.4704</td>\n",
       "      <td>0.3578</td>\n",
       "      <td>0.0478</td>\n",
       "      <td>-0.1184</td>\n",
       "      <td>-0.2310</td>\n",
       "      <td>...</td>\n",
       "      <td>0.4102</td>\n",
       "      <td>0.2052</td>\n",
       "      <td>0.3846</td>\n",
       "      <td>0.3590</td>\n",
       "      <td>0.5898</td>\n",
       "      <td>0.3334</td>\n",
       "      <td>0.6410</td>\n",
       "      <td>0.5898</td>\n",
       "      <td>-0.4872</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.4348</td>\n",
       "      <td>-0.1198</td>\n",
       "      <td>0.2474</td>\n",
       "      <td>0.4036</td>\n",
       "      <td>0.5026</td>\n",
       "      <td>0.6328</td>\n",
       "      <td>0.4948</td>\n",
       "      <td>0.0338</td>\n",
       "      <td>-0.0520</td>\n",
       "      <td>-0.1302</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.2954</td>\n",
       "      <td>0.2046</td>\n",
       "      <td>0.4772</td>\n",
       "      <td>0.0454</td>\n",
       "      <td>0.2046</td>\n",
       "      <td>0.4318</td>\n",
       "      <td>0.4546</td>\n",
       "      <td>-0.0910</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.2330</td>\n",
       "      <td>0.2124</td>\n",
       "      <td>0.5014</td>\n",
       "      <td>0.5222</td>\n",
       "      <td>-0.3422</td>\n",
       "      <td>-0.5840</td>\n",
       "      <td>-0.7168</td>\n",
       "      <td>-0.6342</td>\n",
       "      <td>-0.8614</td>\n",
       "      <td>-0.8318</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.1112</td>\n",
       "      <td>-0.0476</td>\n",
       "      <td>-0.1746</td>\n",
       "      <td>0.0318</td>\n",
       "      <td>-0.0476</td>\n",
       "      <td>0.1112</td>\n",
       "      <td>0.2540</td>\n",
       "      <td>0.1588</td>\n",
       "      <td>-0.4762</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.3808</td>\n",
       "      <td>-0.0096</td>\n",
       "      <td>0.2602</td>\n",
       "      <td>0.2554</td>\n",
       "      <td>-0.4290</td>\n",
       "      <td>-0.6746</td>\n",
       "      <td>-0.6868</td>\n",
       "      <td>-0.6650</td>\n",
       "      <td>-0.8410</td>\n",
       "      <td>-0.9614</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.0504</td>\n",
       "      <td>-0.0360</td>\n",
       "      <td>-0.1224</td>\n",
       "      <td>0.1366</td>\n",
       "      <td>0.2950</td>\n",
       "      <td>0.0792</td>\n",
       "      <td>-0.0072</td>\n",
       "      <td>0.0936</td>\n",
       "      <td>-0.1510</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.3412</td>\n",
       "      <td>0.0946</td>\n",
       "      <td>0.6082</td>\n",
       "      <td>0.6216</td>\n",
       "      <td>-0.1622</td>\n",
       "      <td>-0.3784</td>\n",
       "      <td>-0.4324</td>\n",
       "      <td>-0.4358</td>\n",
       "      <td>-0.4966</td>\n",
       "      <td>-0.5406</td>\n",
       "      <td>...</td>\n",
       "      <td>0.1562</td>\n",
       "      <td>0.3124</td>\n",
       "      <td>0.2500</td>\n",
       "      <td>-0.0938</td>\n",
       "      <td>0.1562</td>\n",
       "      <td>0.3124</td>\n",
       "      <td>0.3124</td>\n",
       "      <td>0.2188</td>\n",
       "      <td>-0.2500</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6233</th>\n",
       "      <td>-0.5742</td>\n",
       "      <td>0.1050</td>\n",
       "      <td>0.4936</td>\n",
       "      <td>0.3986</td>\n",
       "      <td>-0.2058</td>\n",
       "      <td>-0.4130</td>\n",
       "      <td>-0.4188</td>\n",
       "      <td>-0.5194</td>\n",
       "      <td>-0.5080</td>\n",
       "      <td>-0.4878</td>\n",
       "      <td>...</td>\n",
       "      <td>0.5000</td>\n",
       "      <td>0.6800</td>\n",
       "      <td>0.8200</td>\n",
       "      <td>0.8400</td>\n",
       "      <td>0.8400</td>\n",
       "      <td>0.7400</td>\n",
       "      <td>0.8200</td>\n",
       "      <td>0.6400</td>\n",
       "      <td>0.3200</td>\n",
       "      <td>22.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6234</th>\n",
       "      <td>-0.4520</td>\n",
       "      <td>0.0154</td>\n",
       "      <td>0.5078</td>\n",
       "      <td>0.8978</td>\n",
       "      <td>0.7956</td>\n",
       "      <td>0.4366</td>\n",
       "      <td>0.2352</td>\n",
       "      <td>0.1300</td>\n",
       "      <td>0.0682</td>\n",
       "      <td>0.3004</td>\n",
       "      <td>...</td>\n",
       "      <td>0.5000</td>\n",
       "      <td>0.2250</td>\n",
       "      <td>0.7500</td>\n",
       "      <td>0.8750</td>\n",
       "      <td>0.6750</td>\n",
       "      <td>0.6000</td>\n",
       "      <td>0.4500</td>\n",
       "      <td>-0.1250</td>\n",
       "      <td>-0.2250</td>\n",
       "      <td>23.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6235</th>\n",
       "      <td>-0.5824</td>\n",
       "      <td>-0.1646</td>\n",
       "      <td>0.1406</td>\n",
       "      <td>0.6224</td>\n",
       "      <td>0.6626</td>\n",
       "      <td>0.3172</td>\n",
       "      <td>0.0924</td>\n",
       "      <td>0.0120</td>\n",
       "      <td>-0.1646</td>\n",
       "      <td>-0.1326</td>\n",
       "      <td>...</td>\n",
       "      <td>0.8068</td>\n",
       "      <td>0.7392</td>\n",
       "      <td>0.7392</td>\n",
       "      <td>0.6908</td>\n",
       "      <td>0.7294</td>\n",
       "      <td>0.7004</td>\n",
       "      <td>0.6812</td>\n",
       "      <td>0.5170</td>\n",
       "      <td>0.3430</td>\n",
       "      <td>24.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6236</th>\n",
       "      <td>0.0160</td>\n",
       "      <td>0.8168</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.7814</td>\n",
       "      <td>0.4084</td>\n",
       "      <td>0.2122</td>\n",
       "      <td>-0.2218</td>\n",
       "      <td>-0.6848</td>\n",
       "      <td>-0.8424</td>\n",
       "      <td>-0.7588</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0344</td>\n",
       "      <td>0.0344</td>\n",
       "      <td>-0.0344</td>\n",
       "      <td>0.4252</td>\n",
       "      <td>0.2874</td>\n",
       "      <td>-0.0114</td>\n",
       "      <td>0.1034</td>\n",
       "      <td>-0.1954</td>\n",
       "      <td>-0.8620</td>\n",
       "      <td>25.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6237</th>\n",
       "      <td>-0.6116</td>\n",
       "      <td>-0.1040</td>\n",
       "      <td>0.2566</td>\n",
       "      <td>0.2316</td>\n",
       "      <td>-0.0568</td>\n",
       "      <td>-0.3648</td>\n",
       "      <td>-0.3870</td>\n",
       "      <td>-0.4868</td>\n",
       "      <td>-0.4674</td>\n",
       "      <td>-0.3232</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.0178</td>\n",
       "      <td>-0.0536</td>\n",
       "      <td>0.5178</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9464</td>\n",
       "      <td>0.2500</td>\n",
       "      <td>-0.0536</td>\n",
       "      <td>0.0714</td>\n",
       "      <td>-0.0892</td>\n",
       "      <td>26.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6238 rows × 618 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          f1      f2      f3      f4      f5      f6      f7      f8      f9  \\\n",
       "0    -0.4394 -0.0930  0.1718  0.4620  0.6226  0.4704  0.3578  0.0478 -0.1184   \n",
       "1    -0.4348 -0.1198  0.2474  0.4036  0.5026  0.6328  0.4948  0.0338 -0.0520   \n",
       "2    -0.2330  0.2124  0.5014  0.5222 -0.3422 -0.5840 -0.7168 -0.6342 -0.8614   \n",
       "3    -0.3808 -0.0096  0.2602  0.2554 -0.4290 -0.6746 -0.6868 -0.6650 -0.8410   \n",
       "4    -0.3412  0.0946  0.6082  0.6216 -0.1622 -0.3784 -0.4324 -0.4358 -0.4966   \n",
       "...      ...     ...     ...     ...     ...     ...     ...     ...     ...   \n",
       "6233 -0.5742  0.1050  0.4936  0.3986 -0.2058 -0.4130 -0.4188 -0.5194 -0.5080   \n",
       "6234 -0.4520  0.0154  0.5078  0.8978  0.7956  0.4366  0.2352  0.1300  0.0682   \n",
       "6235 -0.5824 -0.1646  0.1406  0.6224  0.6626  0.3172  0.0924  0.0120 -0.1646   \n",
       "6236  0.0160  0.8168  1.0000  0.7814  0.4084  0.2122 -0.2218 -0.6848 -0.8424   \n",
       "6237 -0.6116 -0.1040  0.2566  0.2316 -0.0568 -0.3648 -0.3870 -0.4868 -0.4674   \n",
       "\n",
       "         f10  ...    f609    f610    f611    f612    f613    f614    f615  \\\n",
       "0    -0.2310  ...  0.4102  0.2052  0.3846  0.3590  0.5898  0.3334  0.6410   \n",
       "1    -0.1302  ...  0.0000  0.2954  0.2046  0.4772  0.0454  0.2046  0.4318   \n",
       "2    -0.8318  ... -0.1112 -0.0476 -0.1746  0.0318 -0.0476  0.1112  0.2540   \n",
       "3    -0.9614  ... -0.0504 -0.0360 -0.1224  0.1366  0.2950  0.0792 -0.0072   \n",
       "4    -0.5406  ...  0.1562  0.3124  0.2500 -0.0938  0.1562  0.3124  0.3124   \n",
       "...      ...  ...     ...     ...     ...     ...     ...     ...     ...   \n",
       "6233 -0.4878  ...  0.5000  0.6800  0.8200  0.8400  0.8400  0.7400  0.8200   \n",
       "6234  0.3004  ...  0.5000  0.2250  0.7500  0.8750  0.6750  0.6000  0.4500   \n",
       "6235 -0.1326  ...  0.8068  0.7392  0.7392  0.6908  0.7294  0.7004  0.6812   \n",
       "6236 -0.7588  ...  0.0344  0.0344 -0.0344  0.4252  0.2874 -0.0114  0.1034   \n",
       "6237 -0.3232  ... -0.0178 -0.0536  0.5178  1.0000  0.9464  0.2500 -0.0536   \n",
       "\n",
       "        f616    f617  class  \n",
       "0     0.5898 -0.4872    1.0  \n",
       "1     0.4546 -0.0910    1.0  \n",
       "2     0.1588 -0.4762    2.0  \n",
       "3     0.0936 -0.1510    2.0  \n",
       "4     0.2188 -0.2500    3.0  \n",
       "...      ...     ...    ...  \n",
       "6233  0.6400  0.3200   22.0  \n",
       "6234 -0.1250 -0.2250   23.0  \n",
       "6235  0.5170  0.3430   24.0  \n",
       "6236 -0.1954 -0.8620   25.0  \n",
       "6237  0.0714 -0.0892   26.0  \n",
       "\n",
       "[6238 rows x 618 columns]"
      ]
     },
     "execution_count": 361,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 362,
   "id": "15ef67b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 363,
   "id": "6aa20cba",
   "metadata": {},
   "outputs": [],
   "source": [
    "vowels = \"aeiou\"\n",
    "vowels_idx = [i + 1 for i, ch in enumerate(string.ascii_lowercase) if ch in vowels]\n",
    "consonants_idx = [i + 1 for i, ch in enumerate(string.ascii_lowercase) if ch not in vowels]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd82fe34",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 364,
   "id": "8764bd94",
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"class\"][df[\"class\"].isin(consonant_idx)] = 0\n",
    "df[\"class\"][df[\"class\"].isin(vowels_idx)] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 365,
   "id": "3b54e0d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df[\"class\"][test_df[\"class\"].isin(consonant_idx)] = 0\n",
    "test_df[\"class\"][test_df[\"class\"].isin(vowels_idx)] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 367,
   "id": "90ab4052",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "class\n",
       "0.0    5038\n",
       "1.0    1200\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 367,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"class\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 368,
   "id": "4efa7187",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>f1</th>\n",
       "      <th>f2</th>\n",
       "      <th>f3</th>\n",
       "      <th>f4</th>\n",
       "      <th>f5</th>\n",
       "      <th>f6</th>\n",
       "      <th>f7</th>\n",
       "      <th>f8</th>\n",
       "      <th>f9</th>\n",
       "      <th>f10</th>\n",
       "      <th>...</th>\n",
       "      <th>f609</th>\n",
       "      <th>f610</th>\n",
       "      <th>f611</th>\n",
       "      <th>f612</th>\n",
       "      <th>f613</th>\n",
       "      <th>f614</th>\n",
       "      <th>f615</th>\n",
       "      <th>f616</th>\n",
       "      <th>f617</th>\n",
       "      <th>class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.4394</td>\n",
       "      <td>-0.0930</td>\n",
       "      <td>0.1718</td>\n",
       "      <td>0.4620</td>\n",
       "      <td>0.6226</td>\n",
       "      <td>0.4704</td>\n",
       "      <td>0.3578</td>\n",
       "      <td>0.0478</td>\n",
       "      <td>-0.1184</td>\n",
       "      <td>-0.2310</td>\n",
       "      <td>...</td>\n",
       "      <td>0.4102</td>\n",
       "      <td>0.2052</td>\n",
       "      <td>0.3846</td>\n",
       "      <td>0.3590</td>\n",
       "      <td>0.5898</td>\n",
       "      <td>0.3334</td>\n",
       "      <td>0.6410</td>\n",
       "      <td>0.5898</td>\n",
       "      <td>-0.4872</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.4348</td>\n",
       "      <td>-0.1198</td>\n",
       "      <td>0.2474</td>\n",
       "      <td>0.4036</td>\n",
       "      <td>0.5026</td>\n",
       "      <td>0.6328</td>\n",
       "      <td>0.4948</td>\n",
       "      <td>0.0338</td>\n",
       "      <td>-0.0520</td>\n",
       "      <td>-0.1302</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.2954</td>\n",
       "      <td>0.2046</td>\n",
       "      <td>0.4772</td>\n",
       "      <td>0.0454</td>\n",
       "      <td>0.2046</td>\n",
       "      <td>0.4318</td>\n",
       "      <td>0.4546</td>\n",
       "      <td>-0.0910</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.2330</td>\n",
       "      <td>0.2124</td>\n",
       "      <td>0.5014</td>\n",
       "      <td>0.5222</td>\n",
       "      <td>-0.3422</td>\n",
       "      <td>-0.5840</td>\n",
       "      <td>-0.7168</td>\n",
       "      <td>-0.6342</td>\n",
       "      <td>-0.8614</td>\n",
       "      <td>-0.8318</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.1112</td>\n",
       "      <td>-0.0476</td>\n",
       "      <td>-0.1746</td>\n",
       "      <td>0.0318</td>\n",
       "      <td>-0.0476</td>\n",
       "      <td>0.1112</td>\n",
       "      <td>0.2540</td>\n",
       "      <td>0.1588</td>\n",
       "      <td>-0.4762</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.3808</td>\n",
       "      <td>-0.0096</td>\n",
       "      <td>0.2602</td>\n",
       "      <td>0.2554</td>\n",
       "      <td>-0.4290</td>\n",
       "      <td>-0.6746</td>\n",
       "      <td>-0.6868</td>\n",
       "      <td>-0.6650</td>\n",
       "      <td>-0.8410</td>\n",
       "      <td>-0.9614</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.0504</td>\n",
       "      <td>-0.0360</td>\n",
       "      <td>-0.1224</td>\n",
       "      <td>0.1366</td>\n",
       "      <td>0.2950</td>\n",
       "      <td>0.0792</td>\n",
       "      <td>-0.0072</td>\n",
       "      <td>0.0936</td>\n",
       "      <td>-0.1510</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.3412</td>\n",
       "      <td>0.0946</td>\n",
       "      <td>0.6082</td>\n",
       "      <td>0.6216</td>\n",
       "      <td>-0.1622</td>\n",
       "      <td>-0.3784</td>\n",
       "      <td>-0.4324</td>\n",
       "      <td>-0.4358</td>\n",
       "      <td>-0.4966</td>\n",
       "      <td>-0.5406</td>\n",
       "      <td>...</td>\n",
       "      <td>0.1562</td>\n",
       "      <td>0.3124</td>\n",
       "      <td>0.2500</td>\n",
       "      <td>-0.0938</td>\n",
       "      <td>0.1562</td>\n",
       "      <td>0.3124</td>\n",
       "      <td>0.3124</td>\n",
       "      <td>0.2188</td>\n",
       "      <td>-0.2500</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6233</th>\n",
       "      <td>-0.5742</td>\n",
       "      <td>0.1050</td>\n",
       "      <td>0.4936</td>\n",
       "      <td>0.3986</td>\n",
       "      <td>-0.2058</td>\n",
       "      <td>-0.4130</td>\n",
       "      <td>-0.4188</td>\n",
       "      <td>-0.5194</td>\n",
       "      <td>-0.5080</td>\n",
       "      <td>-0.4878</td>\n",
       "      <td>...</td>\n",
       "      <td>0.5000</td>\n",
       "      <td>0.6800</td>\n",
       "      <td>0.8200</td>\n",
       "      <td>0.8400</td>\n",
       "      <td>0.8400</td>\n",
       "      <td>0.7400</td>\n",
       "      <td>0.8200</td>\n",
       "      <td>0.6400</td>\n",
       "      <td>0.3200</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6234</th>\n",
       "      <td>-0.4520</td>\n",
       "      <td>0.0154</td>\n",
       "      <td>0.5078</td>\n",
       "      <td>0.8978</td>\n",
       "      <td>0.7956</td>\n",
       "      <td>0.4366</td>\n",
       "      <td>0.2352</td>\n",
       "      <td>0.1300</td>\n",
       "      <td>0.0682</td>\n",
       "      <td>0.3004</td>\n",
       "      <td>...</td>\n",
       "      <td>0.5000</td>\n",
       "      <td>0.2250</td>\n",
       "      <td>0.7500</td>\n",
       "      <td>0.8750</td>\n",
       "      <td>0.6750</td>\n",
       "      <td>0.6000</td>\n",
       "      <td>0.4500</td>\n",
       "      <td>-0.1250</td>\n",
       "      <td>-0.2250</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6235</th>\n",
       "      <td>-0.5824</td>\n",
       "      <td>-0.1646</td>\n",
       "      <td>0.1406</td>\n",
       "      <td>0.6224</td>\n",
       "      <td>0.6626</td>\n",
       "      <td>0.3172</td>\n",
       "      <td>0.0924</td>\n",
       "      <td>0.0120</td>\n",
       "      <td>-0.1646</td>\n",
       "      <td>-0.1326</td>\n",
       "      <td>...</td>\n",
       "      <td>0.8068</td>\n",
       "      <td>0.7392</td>\n",
       "      <td>0.7392</td>\n",
       "      <td>0.6908</td>\n",
       "      <td>0.7294</td>\n",
       "      <td>0.7004</td>\n",
       "      <td>0.6812</td>\n",
       "      <td>0.5170</td>\n",
       "      <td>0.3430</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6236</th>\n",
       "      <td>0.0160</td>\n",
       "      <td>0.8168</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.7814</td>\n",
       "      <td>0.4084</td>\n",
       "      <td>0.2122</td>\n",
       "      <td>-0.2218</td>\n",
       "      <td>-0.6848</td>\n",
       "      <td>-0.8424</td>\n",
       "      <td>-0.7588</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0344</td>\n",
       "      <td>0.0344</td>\n",
       "      <td>-0.0344</td>\n",
       "      <td>0.4252</td>\n",
       "      <td>0.2874</td>\n",
       "      <td>-0.0114</td>\n",
       "      <td>0.1034</td>\n",
       "      <td>-0.1954</td>\n",
       "      <td>-0.8620</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6237</th>\n",
       "      <td>-0.6116</td>\n",
       "      <td>-0.1040</td>\n",
       "      <td>0.2566</td>\n",
       "      <td>0.2316</td>\n",
       "      <td>-0.0568</td>\n",
       "      <td>-0.3648</td>\n",
       "      <td>-0.3870</td>\n",
       "      <td>-0.4868</td>\n",
       "      <td>-0.4674</td>\n",
       "      <td>-0.3232</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.0178</td>\n",
       "      <td>-0.0536</td>\n",
       "      <td>0.5178</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9464</td>\n",
       "      <td>0.2500</td>\n",
       "      <td>-0.0536</td>\n",
       "      <td>0.0714</td>\n",
       "      <td>-0.0892</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6238 rows × 618 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          f1      f2      f3      f4      f5      f6      f7      f8      f9  \\\n",
       "0    -0.4394 -0.0930  0.1718  0.4620  0.6226  0.4704  0.3578  0.0478 -0.1184   \n",
       "1    -0.4348 -0.1198  0.2474  0.4036  0.5026  0.6328  0.4948  0.0338 -0.0520   \n",
       "2    -0.2330  0.2124  0.5014  0.5222 -0.3422 -0.5840 -0.7168 -0.6342 -0.8614   \n",
       "3    -0.3808 -0.0096  0.2602  0.2554 -0.4290 -0.6746 -0.6868 -0.6650 -0.8410   \n",
       "4    -0.3412  0.0946  0.6082  0.6216 -0.1622 -0.3784 -0.4324 -0.4358 -0.4966   \n",
       "...      ...     ...     ...     ...     ...     ...     ...     ...     ...   \n",
       "6233 -0.5742  0.1050  0.4936  0.3986 -0.2058 -0.4130 -0.4188 -0.5194 -0.5080   \n",
       "6234 -0.4520  0.0154  0.5078  0.8978  0.7956  0.4366  0.2352  0.1300  0.0682   \n",
       "6235 -0.5824 -0.1646  0.1406  0.6224  0.6626  0.3172  0.0924  0.0120 -0.1646   \n",
       "6236  0.0160  0.8168  1.0000  0.7814  0.4084  0.2122 -0.2218 -0.6848 -0.8424   \n",
       "6237 -0.6116 -0.1040  0.2566  0.2316 -0.0568 -0.3648 -0.3870 -0.4868 -0.4674   \n",
       "\n",
       "         f10  ...    f609    f610    f611    f612    f613    f614    f615  \\\n",
       "0    -0.2310  ...  0.4102  0.2052  0.3846  0.3590  0.5898  0.3334  0.6410   \n",
       "1    -0.1302  ...  0.0000  0.2954  0.2046  0.4772  0.0454  0.2046  0.4318   \n",
       "2    -0.8318  ... -0.1112 -0.0476 -0.1746  0.0318 -0.0476  0.1112  0.2540   \n",
       "3    -0.9614  ... -0.0504 -0.0360 -0.1224  0.1366  0.2950  0.0792 -0.0072   \n",
       "4    -0.5406  ...  0.1562  0.3124  0.2500 -0.0938  0.1562  0.3124  0.3124   \n",
       "...      ...  ...     ...     ...     ...     ...     ...     ...     ...   \n",
       "6233 -0.4878  ...  0.5000  0.6800  0.8200  0.8400  0.8400  0.7400  0.8200   \n",
       "6234  0.3004  ...  0.5000  0.2250  0.7500  0.8750  0.6750  0.6000  0.4500   \n",
       "6235 -0.1326  ...  0.8068  0.7392  0.7392  0.6908  0.7294  0.7004  0.6812   \n",
       "6236 -0.7588  ...  0.0344  0.0344 -0.0344  0.4252  0.2874 -0.0114  0.1034   \n",
       "6237 -0.3232  ... -0.0178 -0.0536  0.5178  1.0000  0.9464  0.2500 -0.0536   \n",
       "\n",
       "        f616    f617  class  \n",
       "0     0.5898 -0.4872    1.0  \n",
       "1     0.4546 -0.0910    1.0  \n",
       "2     0.1588 -0.4762    0.0  \n",
       "3     0.0936 -0.1510    0.0  \n",
       "4     0.2188 -0.2500    0.0  \n",
       "...      ...     ...    ...  \n",
       "6233  0.6400  0.3200    0.0  \n",
       "6234 -0.1250 -0.2250    0.0  \n",
       "6235  0.5170  0.3430    0.0  \n",
       "6236 -0.1954 -0.8620    0.0  \n",
       "6237  0.0714 -0.0892    0.0  \n",
       "\n",
       "[6238 rows x 618 columns]"
      ]
     },
     "execution_count": 368,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the frame that the scaler and model are fitted on as training data below.\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 369,
   "id": "a8dbc629",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>f1</th>\n",
       "      <th>f2</th>\n",
       "      <th>f3</th>\n",
       "      <th>f4</th>\n",
       "      <th>f5</th>\n",
       "      <th>f6</th>\n",
       "      <th>f7</th>\n",
       "      <th>f8</th>\n",
       "      <th>f9</th>\n",
       "      <th>f10</th>\n",
       "      <th>...</th>\n",
       "      <th>f609</th>\n",
       "      <th>f610</th>\n",
       "      <th>f611</th>\n",
       "      <th>f612</th>\n",
       "      <th>f613</th>\n",
       "      <th>f614</th>\n",
       "      <th>f615</th>\n",
       "      <th>f616</th>\n",
       "      <th>f617</th>\n",
       "      <th>class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.2080</td>\n",
       "      <td>0.3480</td>\n",
       "      <td>0.3280</td>\n",
       "      <td>0.5040</td>\n",
       "      <td>0.9320</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.8360</td>\n",
       "      <td>0.6680</td>\n",
       "      <td>0.2720</td>\n",
       "      <td>0.2400</td>\n",
       "      <td>...</td>\n",
       "      <td>0.2500</td>\n",
       "      <td>-0.0624</td>\n",
       "      <td>0.2188</td>\n",
       "      <td>0.4532</td>\n",
       "      <td>0.1094</td>\n",
       "      <td>0.1718</td>\n",
       "      <td>0.1562</td>\n",
       "      <td>0.0468</td>\n",
       "      <td>-0.3750</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.2864</td>\n",
       "      <td>0.1992</td>\n",
       "      <td>0.2822</td>\n",
       "      <td>0.4398</td>\n",
       "      <td>0.7012</td>\n",
       "      <td>0.7800</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9792</td>\n",
       "      <td>0.5850</td>\n",
       "      <td>0.4066</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.0078</td>\n",
       "      <td>-0.1472</td>\n",
       "      <td>-0.1782</td>\n",
       "      <td>0.0078</td>\n",
       "      <td>0.1162</td>\n",
       "      <td>-0.0542</td>\n",
       "      <td>-0.0542</td>\n",
       "      <td>-0.0388</td>\n",
       "      <td>-0.7984</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.2348</td>\n",
       "      <td>0.3826</td>\n",
       "      <td>0.6142</td>\n",
       "      <td>0.7492</td>\n",
       "      <td>0.0546</td>\n",
       "      <td>-0.4020</td>\n",
       "      <td>-0.3504</td>\n",
       "      <td>-0.2990</td>\n",
       "      <td>-0.6848</td>\n",
       "      <td>-0.6528</td>\n",
       "      <td>...</td>\n",
       "      <td>0.2834</td>\n",
       "      <td>0.1500</td>\n",
       "      <td>0.0834</td>\n",
       "      <td>-0.2000</td>\n",
       "      <td>-0.1834</td>\n",
       "      <td>0.0500</td>\n",
       "      <td>-0.0166</td>\n",
       "      <td>-0.1834</td>\n",
       "      <td>-0.8666</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.1856</td>\n",
       "      <td>0.3592</td>\n",
       "      <td>0.7126</td>\n",
       "      <td>0.7366</td>\n",
       "      <td>0.3414</td>\n",
       "      <td>0.1018</td>\n",
       "      <td>-0.1556</td>\n",
       "      <td>-0.2514</td>\n",
       "      <td>-0.2514</td>\n",
       "      <td>-0.3892</td>\n",
       "      <td>...</td>\n",
       "      <td>0.2840</td>\n",
       "      <td>0.5556</td>\n",
       "      <td>0.4568</td>\n",
       "      <td>0.4568</td>\n",
       "      <td>0.4568</td>\n",
       "      <td>0.2098</td>\n",
       "      <td>0.0370</td>\n",
       "      <td>-0.0618</td>\n",
       "      <td>-0.3334</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.1814</td>\n",
       "      <td>0.4404</td>\n",
       "      <td>0.8394</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.7564</td>\n",
       "      <td>0.1866</td>\n",
       "      <td>0.0260</td>\n",
       "      <td>-0.0726</td>\n",
       "      <td>-0.2124</td>\n",
       "      <td>-0.3730</td>\n",
       "      <td>...</td>\n",
       "      <td>0.1688</td>\n",
       "      <td>-0.1688</td>\n",
       "      <td>0.2728</td>\n",
       "      <td>0.2988</td>\n",
       "      <td>0.2468</td>\n",
       "      <td>0.1948</td>\n",
       "      <td>-0.0130</td>\n",
       "      <td>-0.2988</td>\n",
       "      <td>-0.7662</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1554</th>\n",
       "      <td>-0.6842</td>\n",
       "      <td>-0.3280</td>\n",
       "      <td>-0.1984</td>\n",
       "      <td>0.2956</td>\n",
       "      <td>0.8786</td>\n",
       "      <td>0.8948</td>\n",
       "      <td>0.3118</td>\n",
       "      <td>0.1822</td>\n",
       "      <td>0.1012</td>\n",
       "      <td>0.1740</td>\n",
       "      <td>...</td>\n",
       "      <td>0.7738</td>\n",
       "      <td>0.7738</td>\n",
       "      <td>0.7142</td>\n",
       "      <td>0.6428</td>\n",
       "      <td>0.5952</td>\n",
       "      <td>0.5714</td>\n",
       "      <td>0.3928</td>\n",
       "      <td>0.4286</td>\n",
       "      <td>0.2858</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1555</th>\n",
       "      <td>-0.5912</td>\n",
       "      <td>-0.2420</td>\n",
       "      <td>0.8174</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.4642</td>\n",
       "      <td>0.6428</td>\n",
       "      <td>0.6944</td>\n",
       "      <td>0.3056</td>\n",
       "      <td>-0.3888</td>\n",
       "      <td>-0.6826</td>\n",
       "      <td>...</td>\n",
       "      <td>0.1924</td>\n",
       "      <td>-0.1154</td>\n",
       "      <td>0.0192</td>\n",
       "      <td>0.2116</td>\n",
       "      <td>-0.0384</td>\n",
       "      <td>0.0192</td>\n",
       "      <td>-0.2308</td>\n",
       "      <td>-0.4230</td>\n",
       "      <td>-0.7116</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1556</th>\n",
       "      <td>-0.6696</td>\n",
       "      <td>-0.3730</td>\n",
       "      <td>0.1584</td>\n",
       "      <td>0.8910</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.9762</td>\n",
       "      <td>0.9762</td>\n",
       "      <td>0.7684</td>\n",
       "      <td>0.4106</td>\n",
       "      <td>0.0154</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0910</td>\n",
       "      <td>0.1818</td>\n",
       "      <td>0.2000</td>\n",
       "      <td>0.1454</td>\n",
       "      <td>0.0182</td>\n",
       "      <td>-0.2910</td>\n",
       "      <td>0.0728</td>\n",
       "      <td>0.0728</td>\n",
       "      <td>-0.5818</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1557</th>\n",
       "      <td>-0.5764</td>\n",
       "      <td>-0.1764</td>\n",
       "      <td>0.5106</td>\n",
       "      <td>0.3742</td>\n",
       "      <td>-0.1670</td>\n",
       "      <td>-0.5858</td>\n",
       "      <td>-0.7882</td>\n",
       "      <td>-0.7224</td>\n",
       "      <td>-0.6330</td>\n",
       "      <td>-0.8212</td>\n",
       "      <td>...</td>\n",
       "      <td>0.4130</td>\n",
       "      <td>0.5870</td>\n",
       "      <td>0.4348</td>\n",
       "      <td>0.5652</td>\n",
       "      <td>0.3478</td>\n",
       "      <td>-0.0434</td>\n",
       "      <td>0.3044</td>\n",
       "      <td>-0.0434</td>\n",
       "      <td>-0.5000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1558</th>\n",
       "      <td>-0.6624</td>\n",
       "      <td>-0.3334</td>\n",
       "      <td>0.3666</td>\n",
       "      <td>0.4292</td>\n",
       "      <td>-0.2084</td>\n",
       "      <td>-0.5374</td>\n",
       "      <td>-0.4542</td>\n",
       "      <td>-0.6208</td>\n",
       "      <td>-0.6376</td>\n",
       "      <td>-0.5042</td>\n",
       "      <td>...</td>\n",
       "      <td>0.2520</td>\n",
       "      <td>0.2846</td>\n",
       "      <td>0.4146</td>\n",
       "      <td>0.3170</td>\n",
       "      <td>0.2520</td>\n",
       "      <td>-0.0244</td>\n",
       "      <td>-0.0894</td>\n",
       "      <td>-0.1708</td>\n",
       "      <td>-0.3170</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1559 rows × 618 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          f1      f2      f3      f4      f5      f6      f7      f8      f9  \\\n",
       "0    -0.2080  0.3480  0.3280  0.5040  0.9320  1.0000  0.8360  0.6680  0.2720   \n",
       "1    -0.2864  0.1992  0.2822  0.4398  0.7012  0.7800  1.0000  0.9792  0.5850   \n",
       "2    -0.2348  0.3826  0.6142  0.7492  0.0546 -0.4020 -0.3504 -0.2990 -0.6848   \n",
       "3    -0.1856  0.3592  0.7126  0.7366  0.3414  0.1018 -0.1556 -0.2514 -0.2514   \n",
       "4    -0.1814  0.4404  0.8394  1.0000  0.7564  0.1866  0.0260 -0.0726 -0.2124   \n",
       "...      ...     ...     ...     ...     ...     ...     ...     ...     ...   \n",
       "1554 -0.6842 -0.3280 -0.1984  0.2956  0.8786  0.8948  0.3118  0.1822  0.1012   \n",
       "1555 -0.5912 -0.2420  0.8174  1.0000  0.4642  0.6428  0.6944  0.3056 -0.3888   \n",
       "1556 -0.6696 -0.3730  0.1584  0.8910  1.0000  0.9762  0.9762  0.7684  0.4106   \n",
       "1557 -0.5764 -0.1764  0.5106  0.3742 -0.1670 -0.5858 -0.7882 -0.7224 -0.6330   \n",
       "1558 -0.6624 -0.3334  0.3666  0.4292 -0.2084 -0.5374 -0.4542 -0.6208 -0.6376   \n",
       "\n",
       "         f10  ...    f609    f610    f611    f612    f613    f614    f615  \\\n",
       "0     0.2400  ...  0.2500 -0.0624  0.2188  0.4532  0.1094  0.1718  0.1562   \n",
       "1     0.4066  ... -0.0078 -0.1472 -0.1782  0.0078  0.1162 -0.0542 -0.0542   \n",
       "2    -0.6528  ...  0.2834  0.1500  0.0834 -0.2000 -0.1834  0.0500 -0.0166   \n",
       "3    -0.3892  ...  0.2840  0.5556  0.4568  0.4568  0.4568  0.2098  0.0370   \n",
       "4    -0.3730  ...  0.1688 -0.1688  0.2728  0.2988  0.2468  0.1948 -0.0130   \n",
       "...      ...  ...     ...     ...     ...     ...     ...     ...     ...   \n",
       "1554  0.1740  ...  0.7738  0.7738  0.7142  0.6428  0.5952  0.5714  0.3928   \n",
       "1555 -0.6826  ...  0.1924 -0.1154  0.0192  0.2116 -0.0384  0.0192 -0.2308   \n",
       "1556  0.0154  ...  0.0910  0.1818  0.2000  0.1454  0.0182 -0.2910  0.0728   \n",
       "1557 -0.8212  ...  0.4130  0.5870  0.4348  0.5652  0.3478 -0.0434  0.3044   \n",
       "1558 -0.5042  ...  0.2520  0.2846  0.4146  0.3170  0.2520 -0.0244 -0.0894   \n",
       "\n",
       "        f616    f617  class  \n",
       "0     0.0468 -0.3750    1.0  \n",
       "1    -0.0388 -0.7984    1.0  \n",
       "2    -0.1834 -0.8666    0.0  \n",
       "3    -0.0618 -0.3334    0.0  \n",
       "4    -0.2988 -0.7662    0.0  \n",
       "...      ...     ...    ...  \n",
       "1554  0.4286  0.2858    0.0  \n",
       "1555 -0.4230 -0.7116    0.0  \n",
       "1556  0.0728 -0.5818    0.0  \n",
       "1557 -0.0434 -0.5000    0.0  \n",
       "1558 -0.1708 -0.3170    0.0  \n",
       "\n",
       "[1559 rows x 618 columns]"
      ]
     },
     "execution_count": 369,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the held-out frame; the cell below only transforms it and scores on it.\n",
    "test_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 371,
   "id": "bf1fa3c8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9808445856499867 0.8881323373150318\n"
     ]
    }
   ],
   "source": [
    "# Fit the scaler on the training frame only, then reuse its statistics for the test frame.\n",
    "min_max_scaler = MinMaxScaler()\n",
    "train_data = min_max_scaler.fit_transform(df)\n",
    "test_data = min_max_scaler.transform(test_df)\n",
    "\n",
    "# The last column is the label; every column before it is a feature.\n",
    "train_X = train_data[:, :-1]\n",
    "train_y = train_data[:, -1].astype(int)\n",
    "test_X = test_data[:, :-1]\n",
    "test_y = test_data[:, -1].astype(int)\n",
    "\n",
    "model = LogisticRegression(max_iter=2000).fit(train_X, train_y)\n",
    "# Column 1 of predict_proba is the score used for the ranking metrics.\n",
    "predict_y = model.predict_proba(test_X)[:, 1]\n",
    "\n",
    "auc = roc_auc_score(y_true=test_y, y_score=predict_y)\n",
    "apr = average_precision_score(y_true=test_y, y_score=predict_y)\n",
    "\n",
    "print(auc, apr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00f2c8f9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 372,
   "id": "84a67684",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 logisticregression\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/georgi/venvs/bin-synth/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
      "\n",
      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
      "Please also refer to the documentation for alternative solver options:\n",
      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
      "  n_iter_i = _check_optimize_result(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 randomforest\n",
      "2 gaussiannb\n",
      "3 bernoullinb\n",
      "4 Extra Trees\n",
      "5 LDA\n",
      "6 AdaBoostBagging\n",
      "7 gbm\n"
     ]
    }
   ],
   "source": [
    "# These three estimators are not imported by the notebook's top import cell,\n",
    "# so import them here to keep this cell runnable on a fresh kernel.\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.linear_model import PassiveAggressiveClassifier\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "# Model name -> zero-argument factory, so each iteration fits a fresh estimator.\n",
    "MODEL_FACTORIES = {\n",
    "    # max_iter=2000 matches the single-model cell; the default iteration limit\n",
    "    # produced a ConvergenceWarning on this data.\n",
    "    'logisticregression': lambda: LogisticRegression(max_iter=2000),\n",
    "    'randomforest': RandomForestClassifier,\n",
    "    'gaussiannb': GaussianNB,\n",
    "    'bernoullinb': BernoulliNB,\n",
    "    'multinb': MultinomialNB,\n",
    "    'svmlin': LinearSVC,\n",
    "    'gbm': GradientBoostingClassifier,\n",
    "    'Extra Trees': lambda: ExtraTreesClassifier(n_estimators=20),\n",
    "    'LDA': LinearDiscriminantAnalysis,\n",
    "    'Passive Aggressive': PassiveAggressiveClassifier,\n",
    "    'AdaBoost': AdaBoostClassifier,\n",
    "    'Bagging': BaggingClassifier,\n",
    "    'xgb': XGBRegressor,\n",
    "}\n",
    "\n",
    "\n",
    "def positive_class_scores(estimator, X):\n",
    "    \"\"\"Return one continuous score per row of X, suitable for ROC AUC / average precision.\n",
    "\n",
    "    Uses predict_proba when the estimator has it, otherwise decision_function,\n",
    "    otherwise predict. LinearSVC, PassiveAggressiveClassifier and XGBRegressor\n",
    "    have no predict_proba, so calling it unconditionally raises AttributeError.\n",
    "    \"\"\"\n",
    "    if hasattr(estimator, 'predict_proba'):\n",
    "        return estimator.predict_proba(X)[:, 1]\n",
    "    if hasattr(estimator, 'decision_function'):\n",
    "        return estimator.decision_function(X)\n",
    "    return estimator.predict(X)\n",
    "\n",
    "\n",
    "# Fit every model named in MODELS and store [AUC, average precision] in row i of results.\n",
    "for i, model_name in enumerate(MODELS):\n",
    "    print(i, model_name)\n",
    "\n",
    "    # Fail loudly on an unknown name. Without this check an unmatched name silently\n",
    "    # re-fitted and re-scored the previous iteration's estimator; the recorded run\n",
    "    # printed 'AdaBoostBagging', presumably two MODELS entries fused by a missing\n",
    "    # comma (TODO confirm in the cell that defines MODELS).\n",
    "    if model_name not in MODEL_FACTORIES:\n",
    "        raise ValueError(\n",
    "            f'unknown model name {model_name!r}; expected one of {sorted(MODEL_FACTORIES)}'\n",
    "        )\n",
    "\n",
    "    model = MODEL_FACTORIES[model_name]()\n",
    "    model.fit(train_X, train_y)\n",
    "    predict_y = positive_class_scores(model, test_X)\n",
    "\n",
    "    auc = roc_auc_score(test_y, predict_y)\n",
    "    apr = average_precision_score(test_y, predict_y)\n",
    "\n",
    "    results[i, :] = [auc, apr]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 373,
   "id": "360aaf5a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.93696998, 0.80278226])"
      ]
     },
     "execution_count": 373,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Column-wise mean over the rows of results: [mean AUC, mean average precision].\n",
    "results.mean(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d47352e9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd755eb1",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 376,
   "id": "47e95b91",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column -> dtype mapping: f1..f617 map to float, followed by the integer label column.\n",
    "dtype = dict.fromkeys((f\"f{i}\" for i in range(1, 618)), float)\n",
    "dtype[\"class\"] = int"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 377,
   "id": "fb87e709",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'f1': float,\n",
       " 'f2': float,\n",
       " 'f3': float,\n",
       " 'f4': float,\n",
       " 'f5': float,\n",
       " 'f6': float,\n",
       " 'f7': float,\n",
       " 'f8': float,\n",
       " 'f9': float,\n",
       " 'f10': float,\n",
       " 'f11': float,\n",
       " 'f12': float,\n",
       " 'f13': float,\n",
       " 'f14': float,\n",
       " 'f15': float,\n",
       " 'f16': float,\n",
       " 'f17': float,\n",
       " 'f18': float,\n",
       " 'f19': float,\n",
       " 'f20': float,\n",
       " 'f21': float,\n",
       " 'f22': float,\n",
       " 'f23': float,\n",
       " 'f24': float,\n",
       " 'f25': float,\n",
       " 'f26': float,\n",
       " 'f27': float,\n",
       " 'f28': float,\n",
       " 'f29': float,\n",
       " 'f30': float,\n",
       " 'f31': float,\n",
       " 'f32': float,\n",
       " 'f33': float,\n",
       " 'f34': float,\n",
       " 'f35': float,\n",
       " 'f36': float,\n",
       " 'f37': float,\n",
       " 'f38': float,\n",
       " 'f39': float,\n",
       " 'f40': float,\n",
       " 'f41': float,\n",
       " 'f42': float,\n",
       " 'f43': float,\n",
       " 'f44': float,\n",
       " 'f45': float,\n",
       " 'f46': float,\n",
       " 'f47': float,\n",
       " 'f48': float,\n",
       " 'f49': float,\n",
       " 'f50': float,\n",
       " 'f51': float,\n",
       " 'f52': float,\n",
       " 'f53': float,\n",
       " 'f54': float,\n",
       " 'f55': float,\n",
       " 'f56': float,\n",
       " 'f57': float,\n",
       " 'f58': float,\n",
       " 'f59': float,\n",
       " 'f60': float,\n",
       " 'f61': float,\n",
       " 'f62': float,\n",
       " 'f63': float,\n",
       " 'f64': float,\n",
       " 'f65': float,\n",
       " 'f66': float,\n",
       " 'f67': float,\n",
       " 'f68': float,\n",
       " 'f69': float,\n",
       " 'f70': float,\n",
       " 'f71': float,\n",
       " 'f72': float,\n",
       " 'f73': float,\n",
       " 'f74': float,\n",
       " 'f75': float,\n",
       " 'f76': float,\n",
       " 'f77': float,\n",
       " 'f78': float,\n",
       " 'f79': float,\n",
       " 'f80': float,\n",
       " 'f81': float,\n",
       " 'f82': float,\n",
       " 'f83': float,\n",
       " 'f84': float,\n",
       " 'f85': float,\n",
       " 'f86': float,\n",
       " 'f87': float,\n",
       " 'f88': float,\n",
       " 'f89': float,\n",
       " 'f90': float,\n",
       " 'f91': float,\n",
       " 'f92': float,\n",
       " 'f93': float,\n",
       " 'f94': float,\n",
       " 'f95': float,\n",
       " 'f96': float,\n",
       " 'f97': float,\n",
       " 'f98': float,\n",
       " 'f99': float,\n",
       " 'f100': float,\n",
       " 'f101': float,\n",
       " 'f102': float,\n",
       " 'f103': float,\n",
       " 'f104': float,\n",
       " 'f105': float,\n",
       " 'f106': float,\n",
       " 'f107': float,\n",
       " 'f108': float,\n",
       " 'f109': float,\n",
       " 'f110': float,\n",
       " 'f111': float,\n",
       " 'f112': float,\n",
       " 'f113': float,\n",
       " 'f114': float,\n",
       " 'f115': float,\n",
       " 'f116': float,\n",
       " 'f117': float,\n",
       " 'f118': float,\n",
       " 'f119': float,\n",
       " 'f120': float,\n",
       " 'f121': float,\n",
       " 'f122': float,\n",
       " 'f123': float,\n",
       " 'f124': float,\n",
       " 'f125': float,\n",
       " 'f126': float,\n",
       " 'f127': float,\n",
       " 'f128': float,\n",
       " 'f129': float,\n",
       " 'f130': float,\n",
       " 'f131': float,\n",
       " 'f132': float,\n",
       " 'f133': float,\n",
       " 'f134': float,\n",
       " 'f135': float,\n",
       " 'f136': float,\n",
       " 'f137': float,\n",
       " 'f138': float,\n",
       " 'f139': float,\n",
       " 'f140': float,\n",
       " 'f141': float,\n",
       " 'f142': float,\n",
       " 'f143': float,\n",
       " 'f144': float,\n",
       " 'f145': float,\n",
       " 'f146': float,\n",
       " 'f147': float,\n",
       " 'f148': float,\n",
       " 'f149': float,\n",
       " 'f150': float,\n",
       " 'f151': float,\n",
       " 'f152': float,\n",
       " 'f153': float,\n",
       " 'f154': float,\n",
       " 'f155': float,\n",
       " 'f156': float,\n",
       " 'f157': float,\n",
       " 'f158': float,\n",
       " 'f159': float,\n",
       " 'f160': float,\n",
       " 'f161': float,\n",
       " 'f162': float,\n",
       " 'f163': float,\n",
       " 'f164': float,\n",
       " 'f165': float,\n",
       " 'f166': float,\n",
       " 'f167': float,\n",
       " 'f168': float,\n",
       " 'f169': float,\n",
       " 'f170': float,\n",
       " 'f171': float,\n",
       " 'f172': float,\n",
       " 'f173': float,\n",
       " 'f174': float,\n",
       " 'f175': float,\n",
       " 'f176': float,\n",
       " 'f177': float,\n",
       " 'f178': float,\n",
       " 'f179': float,\n",
       " 'f180': float,\n",
       " 'f181': float,\n",
       " 'f182': float,\n",
       " 'f183': float,\n",
       " 'f184': float,\n",
       " 'f185': float,\n",
       " 'f186': float,\n",
       " 'f187': float,\n",
       " 'f188': float,\n",
       " 'f189': float,\n",
       " 'f190': float,\n",
       " 'f191': float,\n",
       " 'f192': float,\n",
       " 'f193': float,\n",
       " 'f194': float,\n",
       " 'f195': float,\n",
       " 'f196': float,\n",
       " 'f197': float,\n",
       " 'f198': float,\n",
       " 'f199': float,\n",
       " 'f200': float,\n",
       " 'f201': float,\n",
       " 'f202': float,\n",
       " 'f203': float,\n",
       " 'f204': float,\n",
       " 'f205': float,\n",
       " 'f206': float,\n",
       " 'f207': float,\n",
       " 'f208': float,\n",
       " 'f209': float,\n",
       " 'f210': float,\n",
       " 'f211': float,\n",
       " 'f212': float,\n",
       " 'f213': float,\n",
       " 'f214': float,\n",
       " 'f215': float,\n",
       " 'f216': float,\n",
       " 'f217': float,\n",
       " 'f218': float,\n",
       " 'f219': float,\n",
       " 'f220': float,\n",
       " 'f221': float,\n",
       " 'f222': float,\n",
       " 'f223': float,\n",
       " 'f224': float,\n",
       " 'f225': float,\n",
       " 'f226': float,\n",
       " 'f227': float,\n",
       " 'f228': float,\n",
       " 'f229': float,\n",
       " 'f230': float,\n",
       " 'f231': float,\n",
       " 'f232': float,\n",
       " 'f233': float,\n",
       " 'f234': float,\n",
       " 'f235': float,\n",
       " 'f236': float,\n",
       " 'f237': float,\n",
       " 'f238': float,\n",
       " 'f239': float,\n",
       " 'f240': float,\n",
       " 'f241': float,\n",
       " 'f242': float,\n",
       " 'f243': float,\n",
       " 'f244': float,\n",
       " 'f245': float,\n",
       " 'f246': float,\n",
       " 'f247': float,\n",
       " 'f248': float,\n",
       " 'f249': float,\n",
       " 'f250': float,\n",
       " 'f251': float,\n",
       " 'f252': float,\n",
       " 'f253': float,\n",
       " 'f254': float,\n",
       " 'f255': float,\n",
       " 'f256': float,\n",
       " 'f257': float,\n",
       " 'f258': float,\n",
       " 'f259': float,\n",
       " 'f260': float,\n",
       " 'f261': float,\n",
       " 'f262': float,\n",
       " 'f263': float,\n",
       " 'f264': float,\n",
       " 'f265': float,\n",
       " 'f266': float,\n",
       " 'f267': float,\n",
       " 'f268': float,\n",
       " 'f269': float,\n",
       " 'f270': float,\n",
       " 'f271': float,\n",
       " 'f272': float,\n",
       " 'f273': float,\n",
       " 'f274': float,\n",
       " 'f275': float,\n",
       " 'f276': float,\n",
       " 'f277': float,\n",
       " 'f278': float,\n",
       " 'f279': float,\n",
       " 'f280': float,\n",
       " 'f281': float,\n",
       " 'f282': float,\n",
       " 'f283': float,\n",
       " 'f284': float,\n",
       " 'f285': float,\n",
       " 'f286': float,\n",
       " 'f287': float,\n",
       " 'f288': float,\n",
       " 'f289': float,\n",
       " 'f290': float,\n",
       " 'f291': float,\n",
       " 'f292': float,\n",
       " 'f293': float,\n",
       " 'f294': float,\n",
       " 'f295': float,\n",
       " 'f296': float,\n",
       " 'f297': float,\n",
       " 'f298': float,\n",
       " 'f299': float,\n",
       " 'f300': float,\n",
       " 'f301': float,\n",
       " 'f302': float,\n",
       " 'f303': float,\n",
       " 'f304': float,\n",
       " 'f305': float,\n",
       " 'f306': float,\n",
       " 'f307': float,\n",
       " 'f308': float,\n",
       " 'f309': float,\n",
       " 'f310': float,\n",
       " 'f311': float,\n",
       " 'f312': float,\n",
       " 'f313': float,\n",
       " 'f314': float,\n",
       " 'f315': float,\n",
       " 'f316': float,\n",
       " 'f317': float,\n",
       " 'f318': float,\n",
       " 'f319': float,\n",
       " 'f320': float,\n",
       " 'f321': float,\n",
       " 'f322': float,\n",
       " 'f323': float,\n",
       " 'f324': float,\n",
       " 'f325': float,\n",
       " 'f326': float,\n",
       " 'f327': float,\n",
       " 'f328': float,\n",
       " 'f329': float,\n",
       " 'f330': float,\n",
       " 'f331': float,\n",
       " 'f332': float,\n",
       " 'f333': float,\n",
       " 'f334': float,\n",
       " 'f335': float,\n",
       " 'f336': float,\n",
       " 'f337': float,\n",
       " 'f338': float,\n",
       " 'f339': float,\n",
       " 'f340': float,\n",
       " 'f341': float,\n",
       " 'f342': float,\n",
       " 'f343': float,\n",
       " 'f344': float,\n",
       " 'f345': float,\n",
       " 'f346': float,\n",
       " 'f347': float,\n",
       " 'f348': float,\n",
       " 'f349': float,\n",
       " 'f350': float,\n",
       " 'f351': float,\n",
       " 'f352': float,\n",
       " 'f353': float,\n",
       " 'f354': float,\n",
       " 'f355': float,\n",
       " 'f356': float,\n",
       " 'f357': float,\n",
       " 'f358': float,\n",
       " 'f359': float,\n",
       " 'f360': float,\n",
       " 'f361': float,\n",
       " 'f362': float,\n",
       " 'f363': float,\n",
       " 'f364': float,\n",
       " 'f365': float,\n",
       " 'f366': float,\n",
       " 'f367': float,\n",
       " 'f368': float,\n",
       " 'f369': float,\n",
       " 'f370': float,\n",
       " 'f371': float,\n",
       " 'f372': float,\n",
       " 'f373': float,\n",
       " 'f374': float,\n",
       " 'f375': float,\n",
       " 'f376': float,\n",
       " 'f377': float,\n",
       " 'f378': float,\n",
       " 'f379': float,\n",
       " 'f380': float,\n",
       " 'f381': float,\n",
       " 'f382': float,\n",
       " 'f383': float,\n",
       " 'f384': float,\n",
       " 'f385': float,\n",
       " 'f386': float,\n",
       " 'f387': float,\n",
       " 'f388': float,\n",
       " 'f389': float,\n",
       " 'f390': float,\n",
       " 'f391': float,\n",
       " 'f392': float,\n",
       " 'f393': float,\n",
       " 'f394': float,\n",
       " 'f395': float,\n",
       " 'f396': float,\n",
       " 'f397': float,\n",
       " 'f398': float,\n",
       " 'f399': float,\n",
       " 'f400': float,\n",
       " 'f401': float,\n",
       " 'f402': float,\n",
       " 'f403': float,\n",
       " 'f404': float,\n",
       " 'f405': float,\n",
       " 'f406': float,\n",
       " 'f407': float,\n",
       " 'f408': float,\n",
       " 'f409': float,\n",
       " 'f410': float,\n",
       " 'f411': float,\n",
       " 'f412': float,\n",
       " 'f413': float,\n",
       " 'f414': float,\n",
       " 'f415': float,\n",
       " 'f416': float,\n",
       " 'f417': float,\n",
       " 'f418': float,\n",
       " 'f419': float,\n",
       " 'f420': float,\n",
       " 'f421': float,\n",
       " 'f422': float,\n",
       " 'f423': float,\n",
       " 'f424': float,\n",
       " 'f425': float,\n",
       " 'f426': float,\n",
       " 'f427': float,\n",
       " 'f428': float,\n",
       " 'f429': float,\n",
       " 'f430': float,\n",
       " 'f431': float,\n",
       " 'f432': float,\n",
       " 'f433': float,\n",
       " 'f434': float,\n",
       " 'f435': float,\n",
       " 'f436': float,\n",
       " 'f437': float,\n",
       " 'f438': float,\n",
       " 'f439': float,\n",
       " 'f440': float,\n",
       " 'f441': float,\n",
       " 'f442': float,\n",
       " 'f443': float,\n",
       " 'f444': float,\n",
       " 'f445': float,\n",
       " 'f446': float,\n",
       " 'f447': float,\n",
       " 'f448': float,\n",
       " 'f449': float,\n",
       " 'f450': float,\n",
       " 'f451': float,\n",
       " 'f452': float,\n",
       " 'f453': float,\n",
       " 'f454': float,\n",
       " 'f455': float,\n",
       " 'f456': float,\n",
       " 'f457': float,\n",
       " 'f458': float,\n",
       " 'f459': float,\n",
       " 'f460': float,\n",
       " 'f461': float,\n",
       " 'f462': float,\n",
       " 'f463': float,\n",
       " 'f464': float,\n",
       " 'f465': float,\n",
       " 'f466': float,\n",
       " 'f467': float,\n",
       " 'f468': float,\n",
       " 'f469': float,\n",
       " 'f470': float,\n",
       " 'f471': float,\n",
       " 'f472': float,\n",
       " 'f473': float,\n",
       " 'f474': float,\n",
       " 'f475': float,\n",
       " 'f476': float,\n",
       " 'f477': float,\n",
       " 'f478': float,\n",
       " 'f479': float,\n",
       " 'f480': float,\n",
       " 'f481': float,\n",
       " 'f482': float,\n",
       " 'f483': float,\n",
       " 'f484': float,\n",
       " 'f485': float,\n",
       " 'f486': float,\n",
       " 'f487': float,\n",
       " 'f488': float,\n",
       " 'f489': float,\n",
       " 'f490': float,\n",
       " 'f491': float,\n",
       " 'f492': float,\n",
       " 'f493': float,\n",
       " 'f494': float,\n",
       " 'f495': float,\n",
       " 'f496': float,\n",
       " 'f497': float,\n",
       " 'f498': float,\n",
       " 'f499': float,\n",
       " 'f500': float,\n",
       " 'f501': float,\n",
       " 'f502': float,\n",
       " 'f503': float,\n",
       " 'f504': float,\n",
       " 'f505': float,\n",
       " 'f506': float,\n",
       " 'f507': float,\n",
       " 'f508': float,\n",
       " 'f509': float,\n",
       " 'f510': float,\n",
       " 'f511': float,\n",
       " 'f512': float,\n",
       " 'f513': float,\n",
       " 'f514': float,\n",
       " 'f515': float,\n",
       " 'f516': float,\n",
       " 'f517': float,\n",
       " 'f518': float,\n",
       " 'f519': float,\n",
       " 'f520': float,\n",
       " 'f521': float,\n",
       " 'f522': float,\n",
       " 'f523': float,\n",
       " 'f524': float,\n",
       " 'f525': float,\n",
       " 'f526': float,\n",
       " 'f527': float,\n",
       " 'f528': float,\n",
       " 'f529': float,\n",
       " 'f530': float,\n",
       " 'f531': float,\n",
       " 'f532': float,\n",
       " 'f533': float,\n",
       " 'f534': float,\n",
       " 'f535': float,\n",
       " 'f536': float,\n",
       " 'f537': float,\n",
       " 'f538': float,\n",
       " 'f539': float,\n",
       " 'f540': float,\n",
       " 'f541': float,\n",
       " 'f542': float,\n",
       " 'f543': float,\n",
       " 'f544': float,\n",
       " 'f545': float,\n",
       " 'f546': float,\n",
       " 'f547': float,\n",
       " 'f548': float,\n",
       " 'f549': float,\n",
       " 'f550': float,\n",
       " 'f551': float,\n",
       " 'f552': float,\n",
       " 'f553': float,\n",
       " 'f554': float,\n",
       " 'f555': float,\n",
       " 'f556': float,\n",
       " 'f557': float,\n",
       " 'f558': float,\n",
       " 'f559': float,\n",
       " 'f560': float,\n",
       " 'f561': float,\n",
       " 'f562': float,\n",
       " 'f563': float,\n",
       " 'f564': float,\n",
       " 'f565': float,\n",
       " 'f566': float,\n",
       " 'f567': float,\n",
       " 'f568': float,\n",
       " 'f569': float,\n",
       " 'f570': float,\n",
       " 'f571': float,\n",
       " 'f572': float,\n",
       " 'f573': float,\n",
       " 'f574': float,\n",
       " 'f575': float,\n",
       " 'f576': float,\n",
       " 'f577': float,\n",
       " 'f578': float,\n",
       " 'f579': float,\n",
       " 'f580': float,\n",
       " 'f581': float,\n",
       " 'f582': float,\n",
       " 'f583': float,\n",
       " 'f584': float,\n",
       " 'f585': float,\n",
       " 'f586': float,\n",
       " 'f587': float,\n",
       " 'f588': float,\n",
       " 'f589': float,\n",
       " 'f590': float,\n",
       " 'f591': float,\n",
       " 'f592': float,\n",
       " 'f593': float,\n",
       " 'f594': float,\n",
       " 'f595': float,\n",
       " 'f596': float,\n",
       " 'f597': float,\n",
       " 'f598': float,\n",
       " 'f599': float,\n",
       " 'f600': float,\n",
       " 'f601': float,\n",
       " 'f602': float,\n",
       " 'f603': float,\n",
       " 'f604': float,\n",
       " 'f605': float,\n",
       " 'f606': float,\n",
       " 'f607': float,\n",
       " 'f608': float,\n",
       " 'f609': float,\n",
       " 'f610': float,\n",
       " 'f611': float,\n",
       " 'f612': float,\n",
       " 'f613': float,\n",
       " 'f614': float,\n",
       " 'f615': float,\n",
       " 'f616': float,\n",
       " 'f617': float,\n",
       " 'class': int}"
      ]
     },
     "execution_count": 377,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42f3a6be",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 389,
   "id": "caf3fdf9",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.read_csv(isolet_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 390,
   "id": "8b459969",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the letter label into a binary target: vowel -> 1, everything else -> 0.\n",
    "# Letters are matched by their 1-based alphabet position (a=1, ..., z=26);\n",
    "# this assumes the raw `class` column uses that encoding -- TODO confirm against the dataset docs.\n",
    "vowels = [\n",
    "    position\n",
    "    for position, letter in enumerate(string.ascii_lowercase, start=1)\n",
    "    if letter in \"aeiou\"\n",
    "]\n",
    "\n",
    "# Build the mask once, before any write, so both assignments are driven by the original labels.\n",
    "vowels_idx = train_df[\"class\"].isin(vowels)\n",
    "train_df.loc[vowels_idx, \"class\"] = 1\n",
    "train_df.loc[~vowels_idx, \"class\"] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 391,
   "id": "f8a4b38a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "class\n",
       "0.0    5038\n",
       "1.0    1200\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 391,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df[\"class\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4d897ea",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a62d703b",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 398,
   "id": "a4d52b0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_path = \"credit/train_credit.pkl.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 399,
   "id": "afa45b76",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_pickle(df_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 400,
   "id": "12df482d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>V1</th>\n",
       "      <th>V2</th>\n",
       "      <th>V3</th>\n",
       "      <th>V4</th>\n",
       "      <th>V5</th>\n",
       "      <th>V6</th>\n",
       "      <th>V7</th>\n",
       "      <th>V8</th>\n",
       "      <th>V9</th>\n",
       "      <th>V10</th>\n",
       "      <th>...</th>\n",
       "      <th>V21</th>\n",
       "      <th>V22</th>\n",
       "      <th>V23</th>\n",
       "      <th>V24</th>\n",
       "      <th>V25</th>\n",
       "      <th>V26</th>\n",
       "      <th>V27</th>\n",
       "      <th>V28</th>\n",
       "      <th>Amount</th>\n",
       "      <th>Class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.330078</td>\n",
       "      <td>0.304199</td>\n",
       "      <td>-0.384521</td>\n",
       "      <td>0.166260</td>\n",
       "      <td>0.594238</td>\n",
       "      <td>0.260010</td>\n",
       "      <td>0.002243</td>\n",
       "      <td>0.009941</td>\n",
       "      <td>-0.224609</td>\n",
       "      <td>-0.135010</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.339111</td>\n",
       "      <td>-0.974609</td>\n",
       "      <td>-0.108337</td>\n",
       "      <td>-1.417969</td>\n",
       "      <td>0.441406</td>\n",
       "      <td>0.181274</td>\n",
       "      <td>-0.024673</td>\n",
       "      <td>0.005226</td>\n",
       "      <td>4.488281</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.237305</td>\n",
       "      <td>0.670898</td>\n",
       "      <td>-0.360596</td>\n",
       "      <td>1.358398</td>\n",
       "      <td>0.016846</td>\n",
       "      <td>-1.262695</td>\n",
       "      <td>0.260742</td>\n",
       "      <td>-0.202515</td>\n",
       "      <td>0.023651</td>\n",
       "      <td>-0.659180</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.076416</td>\n",
       "      <td>-0.183960</td>\n",
       "      <td>-0.114441</td>\n",
       "      <td>0.231445</td>\n",
       "      <td>0.665527</td>\n",
       "      <td>-0.324951</td>\n",
       "      <td>0.030304</td>\n",
       "      <td>0.055878</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.144531</td>\n",
       "      <td>-0.228271</td>\n",
       "      <td>0.644043</td>\n",
       "      <td>-0.158325</td>\n",
       "      <td>-0.722168</td>\n",
       "      <td>-0.354248</td>\n",
       "      <td>-0.403564</td>\n",
       "      <td>0.139771</td>\n",
       "      <td>0.216064</td>\n",
       "      <td>-0.077942</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.037079</td>\n",
       "      <td>-0.173462</td>\n",
       "      <td>0.130127</td>\n",
       "      <td>0.269531</td>\n",
       "      <td>-0.042084</td>\n",
       "      <td>0.904297</td>\n",
       "      <td>-0.065857</td>\n",
       "      <td>-0.000086</td>\n",
       "      <td>21.796875</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2.291016</td>\n",
       "      <td>-0.670898</td>\n",
       "      <td>-1.862305</td>\n",
       "      <td>-1.125977</td>\n",
       "      <td>-0.052307</td>\n",
       "      <td>-0.985352</td>\n",
       "      <td>-0.171021</td>\n",
       "      <td>-0.365479</td>\n",
       "      <td>-0.515625</td>\n",
       "      <td>0.882324</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.022263</td>\n",
       "      <td>-0.155884</td>\n",
       "      <td>0.089355</td>\n",
       "      <td>-1.024414</td>\n",
       "      <td>0.045868</td>\n",
       "      <td>-0.250488</td>\n",
       "      <td>-0.055359</td>\n",
       "      <td>-0.072998</td>\n",
       "      <td>19.984375</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-1.887695</td>\n",
       "      <td>-0.770020</td>\n",
       "      <td>-0.697266</td>\n",
       "      <td>1.164062</td>\n",
       "      <td>-1.638672</td>\n",
       "      <td>1.767578</td>\n",
       "      <td>3.052734</td>\n",
       "      <td>0.252686</td>\n",
       "      <td>-0.818359</td>\n",
       "      <td>-1.083008</td>\n",
       "      <td>...</td>\n",
       "      <td>0.550781</td>\n",
       "      <td>0.344238</td>\n",
       "      <td>1.640625</td>\n",
       "      <td>-0.059448</td>\n",
       "      <td>0.361816</td>\n",
       "      <td>-0.497803</td>\n",
       "      <td>0.051147</td>\n",
       "      <td>0.226807</td>\n",
       "      <td>799.000000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>227840</th>\n",
       "      <td>1.259766</td>\n",
       "      <td>0.204346</td>\n",
       "      <td>0.500488</td>\n",
       "      <td>0.627441</td>\n",
       "      <td>-0.537598</td>\n",
       "      <td>-1.008789</td>\n",
       "      <td>0.019669</td>\n",
       "      <td>-0.170166</td>\n",
       "      <td>0.139893</td>\n",
       "      <td>-0.047394</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.264404</td>\n",
       "      <td>-0.795410</td>\n",
       "      <td>0.144043</td>\n",
       "      <td>0.363037</td>\n",
       "      <td>0.196045</td>\n",
       "      <td>0.096619</td>\n",
       "      <td>-0.033203</td>\n",
       "      <td>0.017715</td>\n",
       "      <td>1.980469</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>227841</th>\n",
       "      <td>-1.913086</td>\n",
       "      <td>3.699219</td>\n",
       "      <td>-3.048828</td>\n",
       "      <td>1.005859</td>\n",
       "      <td>0.140625</td>\n",
       "      <td>-1.888672</td>\n",
       "      <td>0.530762</td>\n",
       "      <td>0.238525</td>\n",
       "      <td>1.585938</td>\n",
       "      <td>2.375000</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.417725</td>\n",
       "      <td>-0.320068</td>\n",
       "      <td>0.265381</td>\n",
       "      <td>0.114990</td>\n",
       "      <td>0.014420</td>\n",
       "      <td>-0.405029</td>\n",
       "      <td>1.107422</td>\n",
       "      <td>0.382080</td>\n",
       "      <td>1.790039</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>227842</th>\n",
       "      <td>-2.216797</td>\n",
       "      <td>-0.663086</td>\n",
       "      <td>2.121094</td>\n",
       "      <td>1.121094</td>\n",
       "      <td>1.402344</td>\n",
       "      <td>1.480469</td>\n",
       "      <td>-1.422852</td>\n",
       "      <td>0.846680</td>\n",
       "      <td>0.206299</td>\n",
       "      <td>0.520020</td>\n",
       "      <td>...</td>\n",
       "      <td>0.061096</td>\n",
       "      <td>1.102539</td>\n",
       "      <td>-0.635742</td>\n",
       "      <td>-0.871094</td>\n",
       "      <td>-0.112610</td>\n",
       "      <td>0.939453</td>\n",
       "      <td>0.685059</td>\n",
       "      <td>0.103638</td>\n",
       "      <td>29.984375</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>227843</th>\n",
       "      <td>2.248047</td>\n",
       "      <td>-0.609375</td>\n",
       "      <td>-2.396484</td>\n",
       "      <td>-1.040039</td>\n",
       "      <td>0.253906</td>\n",
       "      <td>-1.316406</td>\n",
       "      <td>0.297852</td>\n",
       "      <td>-0.571289</td>\n",
       "      <td>-0.915527</td>\n",
       "      <td>0.911621</td>\n",
       "      <td>...</td>\n",
       "      <td>0.324707</td>\n",
       "      <td>0.878418</td>\n",
       "      <td>-0.186890</td>\n",
       "      <td>-0.710938</td>\n",
       "      <td>0.519043</td>\n",
       "      <td>0.126343</td>\n",
       "      <td>-0.084290</td>\n",
       "      <td>-0.083008</td>\n",
       "      <td>49.906250</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>227844</th>\n",
       "      <td>0.953613</td>\n",
       "      <td>-0.372314</td>\n",
       "      <td>0.816895</td>\n",
       "      <td>1.576172</td>\n",
       "      <td>-0.874023</td>\n",
       "      <td>-0.186035</td>\n",
       "      <td>-0.171021</td>\n",
       "      <td>0.006126</td>\n",
       "      <td>0.946777</td>\n",
       "      <td>-0.314209</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.046967</td>\n",
       "      <td>-0.046936</td>\n",
       "      <td>-0.199341</td>\n",
       "      <td>0.406250</td>\n",
       "      <td>0.648438</td>\n",
       "      <td>-0.313965</td>\n",
       "      <td>0.036469</td>\n",
       "      <td>0.044647</td>\n",
       "      <td>118.312500</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>227845 rows × 30 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              V1        V2        V3        V4        V5        V6        V7  \\\n",
       "0       1.330078  0.304199 -0.384521  0.166260  0.594238  0.260010  0.002243   \n",
       "1       1.237305  0.670898 -0.360596  1.358398  0.016846 -1.262695  0.260742   \n",
       "2       1.144531 -0.228271  0.644043 -0.158325 -0.722168 -0.354248 -0.403564   \n",
       "3       2.291016 -0.670898 -1.862305 -1.125977 -0.052307 -0.985352 -0.171021   \n",
       "4      -1.887695 -0.770020 -0.697266  1.164062 -1.638672  1.767578  3.052734   \n",
       "...          ...       ...       ...       ...       ...       ...       ...   \n",
       "227840  1.259766  0.204346  0.500488  0.627441 -0.537598 -1.008789  0.019669   \n",
       "227841 -1.913086  3.699219 -3.048828  1.005859  0.140625 -1.888672  0.530762   \n",
       "227842 -2.216797 -0.663086  2.121094  1.121094  1.402344  1.480469 -1.422852   \n",
       "227843  2.248047 -0.609375 -2.396484 -1.040039  0.253906 -1.316406  0.297852   \n",
       "227844  0.953613 -0.372314  0.816895  1.576172 -0.874023 -0.186035 -0.171021   \n",
       "\n",
       "              V8        V9       V10  ...       V21       V22       V23  \\\n",
       "0       0.009941 -0.224609 -0.135010  ... -0.339111 -0.974609 -0.108337   \n",
       "1      -0.202515  0.023651 -0.659180  ... -0.076416 -0.183960 -0.114441   \n",
       "2       0.139771  0.216064 -0.077942  ... -0.037079 -0.173462  0.130127   \n",
       "3      -0.365479 -0.515625  0.882324  ... -0.022263 -0.155884  0.089355   \n",
       "4       0.252686 -0.818359 -1.083008  ...  0.550781  0.344238  1.640625   \n",
       "...          ...       ...       ...  ...       ...       ...       ...   \n",
       "227840 -0.170166  0.139893 -0.047394  ... -0.264404 -0.795410  0.144043   \n",
       "227841  0.238525  1.585938  2.375000  ... -0.417725 -0.320068  0.265381   \n",
       "227842  0.846680  0.206299  0.520020  ...  0.061096  1.102539 -0.635742   \n",
       "227843 -0.571289 -0.915527  0.911621  ...  0.324707  0.878418 -0.186890   \n",
       "227844  0.006126  0.946777 -0.314209  ... -0.046967 -0.046936 -0.199341   \n",
       "\n",
       "             V24       V25       V26       V27       V28      Amount  Class  \n",
       "0      -1.417969  0.441406  0.181274 -0.024673  0.005226    4.488281      0  \n",
       "1       0.231445  0.665527 -0.324951  0.030304  0.055878    1.000000      0  \n",
       "2       0.269531 -0.042084  0.904297 -0.065857 -0.000086   21.796875      0  \n",
       "3      -1.024414  0.045868 -0.250488 -0.055359 -0.072998   19.984375      0  \n",
       "4      -0.059448  0.361816 -0.497803  0.051147  0.226807  799.000000      0  \n",
       "...          ...       ...       ...       ...       ...         ...    ...  \n",
       "227840  0.363037  0.196045  0.096619 -0.033203  0.017715    1.980469      0  \n",
       "227841  0.114990  0.014420 -0.405029  1.107422  0.382080    1.790039      0  \n",
       "227842 -0.871094 -0.112610  0.939453  0.685059  0.103638   29.984375      0  \n",
       "227843 -0.710938  0.519043  0.126343 -0.084290 -0.083008   49.906250      0  \n",
       "227844  0.406250  0.648438 -0.313965  0.036469  0.044647  118.312500      0  \n",
       "\n",
       "[227845 rows x 30 columns]"
      ]
     },
     "execution_count": 400,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d39c0d9c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "bin-synth",
   "language": "python",
   "name": "bin-synth"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
