{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "846eb389",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7f41ed36",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3cf6ae42",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(466285, 75)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loans_data = pd.read_csv(\"../data/loan_data_2007_2014.csv\",low_memory=False)\n",
    "loans_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9fd3a7f6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(43236, 75)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "default_status = ['Charged Off','Does not meet the credit policy. Status:Charged Off']\n",
    "loan_data_defaults = loans_data[loans_data['loan_status'].isin(default_status)]\n",
    "loan_data_defaults.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fd1f9460",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/kb/v0p0ypbd0wx1q2qb1kpxmmcc0000gn/T/ipykernel_60574/3082587556.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  loan_data_defaults['recovery_rate'] = loan_data_defaults['recoveries'] / loan_data_defaults['funded_amnt']\n",
      "/var/folders/kb/v0p0ypbd0wx1q2qb1kpxmmcc0000gn/T/ipykernel_60574/3082587556.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  loan_data_defaults['CCF'] = \\\n"
     ]
    }
   ],
   "source": [
    "loan_data_defaults['recovery_rate'] = loan_data_defaults['recoveries'] / loan_data_defaults['funded_amnt']\n",
    "loan_data_defaults['CCF'] = \\\n",
    "(loan_data_defaults['funded_amnt'] - loan_data_defaults['total_rec_prncp'])/ loan_data_defaults['funded_amnt']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2c8a4e92",
   "metadata": {},
   "source": [
    "### Null rate in columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d387b570",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['desc', 'mths_since_last_delinq', 'mths_since_last_record',\n",
       "       'next_pymnt_d', 'mths_since_last_major_derog', 'annual_inc_joint',\n",
       "       'dti_joint', 'verification_status_joint', 'tot_coll_amt', 'tot_cur_bal',\n",
       "       'open_acc_6m', 'open_il_6m', 'open_il_12m', 'open_il_24m',\n",
       "       'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',\n",
       "       'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',\n",
       "       'total_cu_tl', 'inq_last_12m'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "null_rate = loan_data_defaults.isna().mean()\n",
    "null_rate.index[null_rate>0.1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "07cf4cc4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/kb/v0p0ypbd0wx1q2qb1kpxmmcc0000gn/T/ipykernel_60574/262330903.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  loan_data_defaults[\"mths_since_last_delinq\"].fillna(0,inplace=True)\n",
      "/var/folders/kb/v0p0ypbd0wx1q2qb1kpxmmcc0000gn/T/ipykernel_60574/262330903.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  loan_data_defaults[\"mths_since_last_major_derog\"].fillna(0,inplace=True)\n"
     ]
    }
   ],
   "source": [
    "loan_data_defaults[\"mths_since_last_delinq\"].fillna(0,inplace=True)\n",
    "loan_data_defaults[\"mths_since_last_major_derog\"].fillna(0,inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3a3e6f8c",
   "metadata": {},
   "source": [
    "### emp length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "013e451b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['< 1 year', '4 years', '3 years', '10+ years', '1 year', '9 years',\n",
       "       '2 years', '8 years', '7 years', '5 years', nan, '6 years'],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loan_data_defaults[\"emp_length\"].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2d1844ee",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/kb/v0p0ypbd0wx1q2qb1kpxmmcc0000gn/T/ipykernel_60574/2425163457.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  loan_data_defaults.loc[:,'emp_length_ad'] = \\\n"
     ]
    }
   ],
   "source": [
    "loan_data_defaults.loc[:,'emp_length_ad'] = \\\n",
    "loan_data_defaults.loc[:,'emp_length'].str.replace(' years', '')\n",
    "\n",
    "loan_data_defaults.loc[:,'emp_length_ad'] = \\\n",
    "loan_data_defaults.loc[:,'emp_length_ad'].str.replace(' year', '')\n",
    "\n",
    "loan_data_defaults.loc[:,'emp_length_ad'] = \\\n",
    "loan_data_defaults.loc[:,'emp_length_ad'].str.replace('< 1', str(0))\n",
    "\n",
    "loan_data_defaults.loc[:,'emp_length_ad'] = \\\n",
    "loan_data_defaults.loc[:,'emp_length_ad'].str.replace('10+', str(10))\n",
    "\n",
    "loan_data_defaults.loc[:,'emp_length_ad'] = pd.to_numeric(loan_data_defaults.loc[:,'emp_length_ad'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "48df66e2",
   "metadata": {},
   "source": [
    "### terms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "5e380eee",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([' 60 months', ' 36 months'], dtype=object)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loan_data_defaults['term'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c8e15b23",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/kb/v0p0ypbd0wx1q2qb1kpxmmcc0000gn/T/ipykernel_60574/1588289919.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  loan_data_defaults[\"terms_ad\"] = np.where(loan_data_defaults['term'] == ' 60 months',\\\n"
     ]
    }
   ],
   "source": [
    "loan_data_defaults[\"terms_ad\"] = np.where(loan_data_defaults['term'] == ' 60 months',\\\n",
    "                                         60, 36)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9d7b9a6d",
   "metadata": {},
   "source": [
    "### earliest credit line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "95b588c1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/kb/v0p0ypbd0wx1q2qb1kpxmmcc0000gn/T/ipykernel_60574/287086427.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  loan_data_defaults['earliest_cr_line_date'] = \\\n"
     ]
    }
   ],
   "source": [
    "loan_data_defaults['earliest_cr_line_date'] = \\\n",
    "pd.to_datetime(loan_data_defaults['earliest_cr_line'], format = '%b-%y')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "0ff8a550",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count                            43233\n",
       "mean     1998-09-11 22:51:27.141766912\n",
       "min                1969-01-01 00:00:00\n",
       "25%                1995-03-01 00:00:00\n",
       "50%                1999-10-01 00:00:00\n",
       "75%                2003-03-01 00:00:00\n",
       "max                2068-10-01 00:00:00\n",
       "Name: earliest_cr_line_date, dtype: object"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loan_data_defaults['earliest_cr_line_date'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f7370aed",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/kb/v0p0ypbd0wx1q2qb1kpxmmcc0000gn/T/ipykernel_60574/542785589.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  loan_data_defaults['mths_since_earliest_cr_line'] = (pd.to_datetime('2017-12-01') - loan_data_defaults['earliest_cr_line_date']).dt.days/30\n"
     ]
    }
   ],
   "source": [
    "loan_data_defaults['mths_since_earliest_cr_line'] = (pd.to_datetime('2017-12-01') - loan_data_defaults['earliest_cr_line_date']).dt.days/30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "83b32cff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.002983624757146822"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(loan_data_defaults['mths_since_earliest_cr_line']<0).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "7ab288c1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/kb/v0p0ypbd0wx1q2qb1kpxmmcc0000gn/T/ipykernel_60574/2312019347.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  loan_data_defaults['mths_since_earliest_cr_line'] = np.where(loan_data_defaults['mths_since_earliest_cr_line']<0,0,\\\n"
     ]
    }
   ],
   "source": [
    "loan_data_defaults['mths_since_earliest_cr_line'] = np.where(loan_data_defaults['mths_since_earliest_cr_line']<0,0,\\\n",
    "                                                            loan_data_defaults['mths_since_earliest_cr_line'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7dd6a01f",
   "metadata": {},
   "source": [
    "### issue date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "dbfd6aa2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/kb/v0p0ypbd0wx1q2qb1kpxmmcc0000gn/T/ipykernel_60574/1800607014.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  loan_data_defaults['issue_d_date'] = \\\n",
      "/var/folders/kb/v0p0ypbd0wx1q2qb1kpxmmcc0000gn/T/ipykernel_60574/1800607014.py:4: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  loan_data_defaults['mths_since_issue_d'] = (pd.to_datetime('2017-12-01') - loan_data_defaults['issue_d_date']).dt.days/30\n"
     ]
    }
   ],
   "source": [
    "loan_data_defaults['issue_d_date'] = \\\n",
    "pd.to_datetime(loan_data_defaults['issue_d'], format = '%b-%y')\n",
    "\n",
    "loan_data_defaults['mths_since_issue_d'] = (pd.to_datetime('2017-12-01') - loan_data_defaults['issue_d_date']).dt.days/30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "9e279de5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(loan_data_defaults['mths_since_issue_d']<0).mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca324a50",
   "metadata": {},
   "source": [
    "### one hot encode cat variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "672ce9f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "loan_data_dummies = [pd.get_dummies(loan_data_defaults['grade'], prefix = 'grade', prefix_sep = ':', dtype= \"float\"),\n",
    "                     pd.get_dummies(loan_data_defaults['sub_grade'], prefix = 'sub_grade', prefix_sep = ':', dtype= \"float\"),\n",
    "                     pd.get_dummies(loan_data_defaults['home_ownership'], prefix = 'home_ownership', prefix_sep = ':', dtype= \"float\"),\n",
    "                     pd.get_dummies(loan_data_defaults['verification_status'], prefix = 'verification_status', prefix_sep = ':', dtype= \"float\"),\n",
    "                     pd.get_dummies(loan_data_defaults['loan_status'], prefix = 'loan_status', prefix_sep = ':', dtype= \"float\"),\n",
    "                     pd.get_dummies(loan_data_defaults['purpose'], prefix = 'purpose', prefix_sep = ':', dtype= \"float\"),\n",
    "                     pd.get_dummies(loan_data_defaults['addr_state'], prefix = 'addr_state', prefix_sep = ':', dtype= \"float\"),\n",
    "                     pd.get_dummies(loan_data_defaults['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':', dtype= \"float\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "2e008ea3",
   "metadata": {},
   "outputs": [],
   "source": [
    "loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "384f1f05",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(43236, 200)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loan_data_defaults1 = pd.concat([loan_data_defaults, loan_data_dummies], axis = 1)\n",
    "loan_data_defaults1.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "829471db",
   "metadata": {},
   "source": [
    "### Subset for X columns and y Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "b0497ecc",
   "metadata": {},
   "outputs": [],
   "source": [
    "x_cols = ['grade:A',\n",
    "'grade:B',\n",
    "'grade:C',\n",
    "'grade:D',\n",
    "'grade:E',\n",
    "'grade:F',\n",
    "'grade:G',\n",
    "'home_ownership:MORTGAGE',\n",
    "'home_ownership:NONE',\n",
    "'home_ownership:OTHER',\n",
    "'home_ownership:OWN',\n",
    "'home_ownership:RENT',\n",
    "'verification_status:Not Verified',\n",
    "'verification_status:Source Verified',\n",
    "'verification_status:Verified',\n",
    "'purpose:car',\n",
    "'purpose:credit_card',\n",
    "'purpose:debt_consolidation',\n",
    "'purpose:educational',\n",
    "'purpose:home_improvement',\n",
    "'purpose:house',\n",
    "'purpose:major_purchase',\n",
    "'purpose:medical',\n",
    "'purpose:moving',\n",
    "'purpose:other',\n",
    "'purpose:renewable_energy',\n",
    "'purpose:small_business',\n",
    "'purpose:vacation',\n",
    "'purpose:wedding',\n",
    "'initial_list_status:f',\n",
    "'initial_list_status:w',\n",
    "'terms_ad',\n",
    "'emp_length_ad',\n",
    "'mths_since_issue_d',\n",
    "'mths_since_earliest_cr_line',\n",
    "'funded_amnt',\n",
    "'int_rate',\n",
    "'installment',\n",
    "'annual_inc',\n",
    "'dti',\n",
    "'delinq_2yrs',\n",
    "'inq_last_6mths',\n",
    "'mths_since_last_delinq',\n",
    "'open_acc',\n",
    "'pub_rec',\n",
    "'total_acc',\n",
    "'acc_now_delinq']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "3f6d463d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "grade:A                                   0\n",
       "grade:B                                   0\n",
       "grade:C                                   0\n",
       "grade:D                                   0\n",
       "grade:E                                   0\n",
       "grade:F                                   0\n",
       "grade:G                                   0\n",
       "home_ownership:MORTGAGE                   0\n",
       "home_ownership:NONE                       0\n",
       "home_ownership:OTHER                      0\n",
       "home_ownership:OWN                        0\n",
       "home_ownership:RENT                       0\n",
       "verification_status:Not Verified          0\n",
       "verification_status:Source Verified       0\n",
       "verification_status:Verified              0\n",
       "purpose:car                               0\n",
       "purpose:credit_card                       0\n",
       "purpose:debt_consolidation                0\n",
       "purpose:educational                       0\n",
       "purpose:home_improvement                  0\n",
       "purpose:house                             0\n",
       "purpose:major_purchase                    0\n",
       "purpose:medical                           0\n",
       "purpose:moving                            0\n",
       "purpose:other                             0\n",
       "purpose:renewable_energy                  0\n",
       "purpose:small_business                    0\n",
       "purpose:vacation                          0\n",
       "purpose:wedding                           0\n",
       "initial_list_status:f                     0\n",
       "initial_list_status:w                     0\n",
       "terms_ad                                  0\n",
       "emp_length_ad                          2337\n",
       "mths_since_issue_d                        0\n",
       "mths_since_earliest_cr_line               3\n",
       "funded_amnt                               0\n",
       "int_rate                                  0\n",
       "installment                               0\n",
       "annual_inc                                0\n",
       "dti                                       0\n",
       "delinq_2yrs                               3\n",
       "inq_last_6mths                            3\n",
       "mths_since_last_delinq                    0\n",
       "open_acc                                  3\n",
       "pub_rec                                   3\n",
       "total_acc                                 3\n",
       "acc_now_delinq                            3\n",
       "dtype: int64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loan_data_defaults1[x_cols].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "e4d9f51b",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_cols = [\"recovery_rate\",\"CCF\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "267adf64",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "recovery_rate    0\n",
       "CCF              0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loan_data_defaults1[y_cols].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "5916530b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loan_data_defaults1[\"member_id\"].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "9dc668dc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(43236, 49)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocessed_data = loan_data_defaults1[x_cols+y_cols]\n",
    "preprocessed_data.index = loan_data_defaults1.member_id\n",
    "preprocessed_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "1eee6835",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(40896, 49)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocessed_data = preprocessed_data.dropna()\n",
    "preprocessed_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "fde043b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessed_data.to_csv(\"../data/data_preprocessed_v0.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d667cba",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
