{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "59d41c91-10f1-4907-bb91-4fb7751b4b73",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('german.data', <http.client.HTTPMessage at 0x7efd692db3d0>)"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from urllib.request import urlretrieve\n",
    "urlretrieve('http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data', 'german.data')\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "528d672c-84ca-4010-bbe7-1f9ef2b696c4",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bb0d81ed-3fae-43d3-8e5a-299b5e8b7730",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "german_df_old = pd.read_csv(\"german-train.csv\")\n",
    "german_df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data', \n",
    "                        delimiter=' ',header=None)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a517d2a6-2111-428d-bc53-f9afc6b1a694",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['duration', 'amount', 'installment-rate', 'present-residence', 'age',\n",
       "       'number-credits', 'people-liable', 'foreign-worker', 'status_1',\n",
       "       'status_2', 'status_3', 'status_4', 'credit-history_0',\n",
       "       'credit-history_1', 'credit-history_2', 'credit-history_3',\n",
       "       'credit-history_4', 'purpose_0', 'purpose_1', 'purpose_2', 'purpose_3',\n",
       "       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_9',\n",
       "       'purpose_10', 'savings_1', 'savings_2', 'savings_3', 'savings_4',\n",
       "       'savings_5', 'employment-duration_1', 'employment-duration_2',\n",
       "       'employment-duration_3', 'employment-duration_4',\n",
       "       'employment-duration_5', 'personal-status-sex_1',\n",
       "       'personal-status-sex_2', 'personal-status-sex_3',\n",
       "       'personal-status-sex_5', 'other-debtors_1', 'other-debtors_2',\n",
       "       'other-debtors_3', 'property_1', 'property_2', 'property_3',\n",
       "       'property_4', 'other-installment-plans_1', 'other-installment-plans_2',\n",
       "       'other-installment-plans_3', 'housing_1', 'housing_2', 'housing_3',\n",
       "       'job_1', 'job_2', 'job_3', 'job_4', 'telephone_1', 'telephone_2',\n",
       "       'credit-risk'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "german_df_old.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "22b3aa32-22b9-4d7d-b802-3b32c077c6c5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Description of the German credit dataset.\n",
      "\n",
      "1. Title: German Credit data\n",
      "\n",
      "2. Source Information\n",
      "\n",
      "Professor Dr. Hans Hofmann  \n",
      "Institut f\"ur Statistik und \"Okonometrie  \n",
      "Universit\"at Hamburg  \n",
      "FB Wirtschaftswissenschaften  \n",
      "Von-Melle-Park 5    \n",
      "2000 Hamburg 13 \n",
      "\n",
      "3. Number of Instances:  1000\n",
      "\n",
      "Two datasets are provided.  the original dataset, in the form provided\n",
      "by Prof. Hofmann, contains categorical/symbolic attributes and\n",
      "is in the file \"german.data\".   \n",
      " \n",
      "For algorithms that need numerical attributes, Strathclyde University \n",
      "produced the file \"german.data-numeric\".  This file has been edited \n",
      "and several indicator variables added to make it suitable for \n",
      "algorithms which cannot cope with categorical variables.   Several\n",
      "attributes that are ordered categorical (such as attribute 17) have\n",
      "been coded as integer.    This was the form used by StatLog.\n",
      "\n",
      "\n",
      "6. Number of Attributes german: 20 (7 numerical, 13 categorical)\n",
      "   Number of Attributes german.numer: 24 (24 numerical)\n",
      "\n",
      "\n",
      "7.  Attribute description for german\n",
      "\n",
      "Attribute 1:  (qualitative)\n",
      "\t       Status of existing checking account\n",
      "               A11 :      ... <    0 DM\n",
      "\t       A12 : 0 <= ... <  200 DM\n",
      "\t       A13 :      ... >= 200 DM /\n",
      "\t\t     salary assignments for at least 1 year\n",
      "               A14 : no checking account\n",
      "\n",
      "Attribute 2:  (numerical)\n",
      "\t      Duration in month\n",
      "\n",
      "Attribute 3:  (qualitative)\n",
      "\t      Credit history\n",
      "\t      A30 : no credits taken/\n",
      "\t\t    all credits paid back duly\n",
      "              A31 : all credits at this bank paid back duly\n",
      "\t      A32 : existing credits paid back duly till now\n",
      "              A33 : delay in paying off in the past\n",
      "\t      A34 : critical account/\n",
      "\t\t    other credits existing (not at this bank)\n",
      "\n",
      "Attribute 4:  (qualitative)\n",
      "\t      Purpose\n",
      "\t      A40 : car (new)\n",
      "\t      A41 : car (used)\n",
      "\t      A42 : furniture/equipment\n",
      "\t      A43 : radio/television\n",
      "\t      A44 : domestic appliances\n",
      "\t      A45 : repairs\n",
      "\t      A46 : education\n",
      "\t      A47 : (vacation - does not exist?)\n",
      "\t      A48 : retraining\n",
      "\t      A49 : business\n",
      "\t      A410 : others\n",
      "\n",
      "Attribute 5:  (numerical)\n",
      "\t      Credit amount\n",
      "\n",
      "Attibute 6:  (qualitative)\n",
      "\t      Savings account/bonds\n",
      "\t      A61 :          ... <  100 DM\n",
      "\t      A62 :   100 <= ... <  500 DM\n",
      "\t      A63 :   500 <= ... < 1000 DM\n",
      "\t      A64 :          .. >= 1000 DM\n",
      "              A65 :   unknown/ no savings account\n",
      "\n",
      "Attribute 7:  (qualitative)\n",
      "\t      Present employment since\n",
      "\t      A71 : unemployed\n",
      "\t      A72 :       ... < 1 year\n",
      "\t      A73 : 1  <= ... < 4 years  \n",
      "\t      A74 : 4  <= ... < 7 years\n",
      "\t      A75 :       .. >= 7 years\n",
      "\n",
      "Attribute 8:  (numerical)\n",
      "\t      Installment rate in percentage of disposable income\n",
      "\n",
      "Attribute 9:  (qualitative)\n",
      "\t      Personal status and sex\n",
      "\t      A91 : male   : divorced/separated\n",
      "\t      A92 : female : divorced/separated/married\n",
      "              A93 : male   : single\n",
      "\t      A94 : male   : married/widowed\n",
      "\t      A95 : female : single\n",
      "\n",
      "Attribute 10: (qualitative)\n",
      "\t      Other debtors / guarantors\n",
      "\t      A101 : none\n",
      "\t      A102 : co-applicant\n",
      "\t      A103 : guarantor\n",
      "\n",
      "Attribute 11: (numerical)\n",
      "\t      Present residence since\n",
      "\n",
      "Attribute 12: (qualitative)\n",
      "\t      Property\n",
      "\t      A121 : real estate\n",
      "\t      A122 : if not A121 : building society savings agreement/\n",
      "\t\t\t\t   life insurance\n",
      "              A123 : if not A121/A122 : car or other, not in attribute 6\n",
      "\t      A124 : unknown / no property\n",
      "\n",
      "Attribute 13: (numerical)\n",
      "\t      Age in years\n",
      "\n",
      "Attribute 14: (qualitative)\n",
      "\t      Other installment plans \n",
      "\t      A141 : bank\n",
      "\t      A142 : stores\n",
      "\t      A143 : none\n",
      "\n",
      "Attribute 15: (qualitative)\n",
      "\t      Housing\n",
      "\t      A151 : rent\n",
      "\t      A152 : own\n",
      "\t      A153 : for free\n",
      "\n",
      "Attribute 16: (numerical)\n",
      "              Number of existing credits at this bank\n",
      "\n",
      "Attribute 17: (qualitative)\n",
      "\t      Job\n",
      "\t      A171 : unemployed/ unskilled  - non-resident\n",
      "\t      A172 : unskilled - resident\n",
      "\t      A173 : skilled employee / official\n",
      "\t      A174 : management/ self-employed/\n",
      "\t\t     highly qualified employee/ officer\n",
      "\n",
      "Attribute 18: (numerical)\n",
      "\t      Number of people being liable to provide maintenance for\n",
      "\n",
      "Attribute 19: (qualitative)\n",
      "\t      Telephone\n",
      "\t      A191 : none\n",
      "\t      A192 : yes, registered under the customers name\n",
      "\n",
      "Attribute 20: (qualitative)\n",
      "\t      foreign worker\n",
      "\t      A201 : yes\n",
      "\t      A202 : no\n",
      "\n",
      "\n",
      "\n",
      "8.  Cost Matrix\n",
      "\n",
      "This dataset requires use of a cost matrix (see below)\n",
      "\n",
      "\n",
      "      1        2\n",
      "----------------------------\n",
      "  1   0        1\n",
      "-----------------------\n",
      "  2   5        0\n",
      "\n",
      "(1 = Good,  2 = Bad)\n",
      "\n",
      "the rows represent the actual classification and the columns\n",
      "the predicted classification.\n",
      "\n",
      "It is worse to class a customer as good when they are bad (5), \n",
      "than it is to class a customer as bad when they are good (1).\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "urlretrieve('http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc', 'german.doc')\n",
    "f = open('german.doc')\n",
    "german_doc= f.read()\n",
    "print(german_doc)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1d7554a2-f1e2-41cf-a91a-125b912376d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "german_df.columns=['account_bal','duration','payment_status','purpose',\n",
    "                   'credit_amount','savings_bond_value','employed_since',\n",
    "                   'intallment_rate','sex_marital','guarantor','residence_since',\n",
    "                   'most_valuable_asset','age','concurrent_credits','type_of_housing',\n",
    "                   'number_of_existcr','job','number_of_dependents','telephon',\n",
    "                   'foreign','target']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3f82a858-ad02-41f8-9846-55533f2015e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove non-ordinal features\n",
    "german_p1_df = german_df.drop(['purpose', 'sex_marital', 'guarantor', 'most_valuable_asset', 'concurrent_credits', 'foreign', 'telephon', 'type_of_housing'],axis=1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7af02d66-d00b-4b90-9116-9a583378c848",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000, 13)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "german_p1_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "28d1bb31-ca1b-408a-b348-a8f22273e2c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "german_p2_df= german_p1_df.replace(['A11','A12','A13','A14','A71','A72','A73','A74', 'A75', 'A124', 'A123', 'A122', 'A121', 'A171', 'A172', 'A173', 'A174', 'A30', 'A31',\n",
    "                                    'A32', 'A33', 'A34', 'A61', 'A62', 'A63', 'A64', 'A65'],\n",
    "                  [0,1,2,3, 0,1,2,3,4, 0,1,2,3, 0,1,2,3, 4,3,2,1,0, 0,1,2,3,4])\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "86f96ffe-5ad4-4fe8-ba14-5f495029870a",
   "metadata": {},
   "outputs": [],
   "source": [
    "german_p2_df['target'] = german_p2_df['target'].replace([1,2], [1,0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ac587af0-a2fc-4572-ac5c-9363259107df",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "german_p2_df.isna().any().any()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "5b07b04c-a3cb-42f6-afd5-40a37aa00a8d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1000 entries, 0 to 999\n",
      "Data columns (total 13 columns):\n",
      " #   Column                Non-Null Count  Dtype\n",
      "---  ------                --------------  -----\n",
      " 0   account_bal           1000 non-null   int64\n",
      " 1   duration              1000 non-null   int64\n",
      " 2   payment_status        1000 non-null   int64\n",
      " 3   credit_amount         1000 non-null   int64\n",
      " 4   savings_bond_value    1000 non-null   int64\n",
      " 5   employed_since        1000 non-null   int64\n",
      " 6   intallment_rate       1000 non-null   int64\n",
      " 7   residence_since       1000 non-null   int64\n",
      " 8   age                   1000 non-null   int64\n",
      " 9   number_of_existcr     1000 non-null   int64\n",
      " 10  job                   1000 non-null   int64\n",
      " 11  number_of_dependents  1000 non-null   int64\n",
      " 12  target                1000 non-null   int64\n",
      "dtypes: int64(13)\n",
      "memory usage: 101.7 KB\n"
     ]
    }
   ],
   "source": [
    "german_p2_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "e66f77af-30c6-4bfd-8403-6d666bb3e68f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#store without upsampling\n",
    "\n",
    "x, y = german_p2_df.drop('target', axis=1), german_p2_df['target']\n",
    "x_train, x_test, y_train, y_test= train_test_split(x,y, test_size=.2, random_state=55)\n",
    "x_train_df = pd.concat([x_train, y_train], axis=1)\n",
    "x_test_df = pd.concat([x_test, y_test], axis=1)\n",
    "x_train_df.to_csv(\"german-train-processed.csv\", index = False)\n",
    "x_test_df.to_csv(\"german-test-processed.csv\", index = False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "ce7903f3-9d71-4c82-830f-cdc8cb23aae9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "700"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(german_p2_df[german_p2_df[\"target\"] == 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "2b8aa8e8-a373-478e-9ffe-2461f75654a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Balance Data for German Credit \n",
    "\n",
    "from sklearn.utils import resample\n",
    "# import numpy as np\n",
    "# y = np.array([0,1,2,4])\n",
    "# print(resample(y, n_samples = 10, random_state = 3))\n",
    "german_upsampled = resample(german_p2_df[german_p2_df[\"target\"] == 0], n_samples = 500, random_state = 1)\n",
    "german_balanced = pd.concat([german_p2_df, german_upsampled], axis=0)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "d87dccec-a3d8-419c-9a72-7e4cedec44e0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1500, 13)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "german_balanced.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "6a7e8cb7-97f7-4212-965a-2617e10d7e6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "17a6a38a-ea79-4c6b-aa30-38928ca2ee4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "x, y = german_balanced.drop('target', axis=1), german_balanced['target']\n",
    "x_train, x_test, y_train, y_test= train_test_split(x,y, test_size=.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "4500e235-72bd-4c89-93c5-780357948e54",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(633,)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_train[y_train==0].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "0be36dd3-ae18-492d-9633-898495d03e9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train_df = pd.concat([x_train, y_train], axis=1)\n",
    "x_test_df = pd.concat([x_test, y_test], axis=1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a98fc47d-fce2-4790-bbd7-371235a51e7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train_df.to_csv(\"german-train-unbal51.csv\", index = False)\n",
    "x_test_df.to_csv(\"german-test-unbal51.csv\", index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "6b046539-042d-42ae-89d5-497877bb4a56",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(141, 13)"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_test_df[x_test_df[\"target\"] ==1].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "406d3262-6e53-4b77-b0d0-0b7c955261e7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(59, 13)"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_test_df[x_test_df[\"target\"] ==0].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "44d2c782-94a8-4f15-886f-ebd11a2598ff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['account_bal', 'duration', 'payment_status', 'credit_amount',\n",
       "       'savings_bond_value', 'employed_since', 'intallment_rate',\n",
       "       'residence_since', 'age', 'number_of_existcr', 'job',\n",
       "       'number_of_dependents', 'target'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "german_balanced.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "648cf7d3-dde8-4597-b772-f2f2a817d047",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(700, 13)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "german_balanced[german_balanced[\"target\"]==1].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d21d0c0-ca53-4bc3-9f91-3e3fb4c6fd95",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
