{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "# https://hbiostat.org/data/repo/csupport2\n",
    "# https://hbiostat.org/data/repo/supportdesc\n",
    "# https://archive.ics.uci.edu/dataset/880/support2\n",
    "# Read 'support2.csv' and discard the columns that are dropped up front.\n",
    "df = pd.read_csv('support2.csv').drop(\n",
    "    columns=['dnrday', 'adlp', 'adls', 'totcst', 'totmcst']\n",
    ")\n",
    "print(df.shape)\n",
    "# Keep only the rows that have a recorded 'charges' value.\n",
    "df = df.dropna(subset=['charges'])\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['age', 'death', 'sex', 'hospdead', 'slos', 'd.time', 'dzgroup',\n",
       "       'dzclass', 'num.co', 'edu', 'income', 'scoma', 'charges', 'avtisst',\n",
       "       'race', 'sps', 'aps', 'surv2m', 'surv6m', 'hday', 'diabetes',\n",
       "       'dementia', 'ca', 'prg2m', 'prg6m', 'dnr', 'meanbp', 'wblc', 'hrt',\n",
       "       'resp', 'temp', 'pafi', 'alb', 'bili', 'crea', 'sod', 'ph', 'glucose',\n",
       "       'bun', 'urine', 'sfdm2', 'adlsc'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the column labels currently present in df.\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/sq/2cxnps7s2n30yvs0mnt35vb40000gq/T/ipykernel_11296/2217994135.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
      "\n",
      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
      "\n",
      "\n",
      "  df[col].fillna(value, inplace=True)\n"
     ]
    }
   ],
   "source": [
    "# Baseline Variable\tNormal Fill-in Value\n",
    "# Serum albumin\t3.5\n",
    "# PaO2/FiO2 ratio (pafi)\t333.3\n",
    "# Bilirubin\t1.01\n",
    "# Creatinine\t1.01\n",
    "# BUN\t6.51\n",
    "# White blood count\t9 (thousands)\n",
    "# Urine output\t2502\n",
    "\n",
    "# Column name -> value substituted wherever that column is missing.\n",
    "fill_values = {\n",
    "    \"alb\": 3.5,      # Serum albumin\n",
    "    \"pafi\": 333.3,   # PaO2/FiO2 ratio\n",
    "    \"bili\": 1.01,    # Bilirubin\n",
    "    \"crea\": 1.01,    # Creatinine\n",
    "    \"bun\": 6.51,     # Blood Urea Nitrogen (BUN)\n",
    "    \"wblc\": 9,       # White blood count (in thousands)\n",
    "    \"urine\": 2502    # Urine output\n",
    "}\n",
    "# Fill through the DataFrame instead of `df[col].fillna(value, inplace=True)`:\n",
    "# that chained in-place form raises a FutureWarning and, per the warning,\n",
    "# stops updating `df` from pandas 3.0. Passing the dict fills each named\n",
    "# column with its own value in a single call.\n",
    "df = df.fillna(value=fill_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>scoma</th>\n",
       "      <th>sps</th>\n",
       "      <th>aps</th>\n",
       "      <th>surv2m</th>\n",
       "      <th>surv6m</th>\n",
       "      <th>meanbp</th>\n",
       "      <th>hrt</th>\n",
       "      <th>resp</th>\n",
       "      <th>temp</th>\n",
       "      <th>sod</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5441</th>\n",
       "      <td>0.0</td>\n",
       "      <td>31.398438</td>\n",
       "      <td>60.0</td>\n",
       "      <td>0.7229</td>\n",
       "      <td>0.638916</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      scoma        sps   aps  surv2m    surv6m  meanbp  hrt  resp  temp  sod\n",
       "5441    0.0  31.398438  60.0  0.7229  0.638916     NaN  NaN   NaN   NaN  NaN"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the rows that are missing at least one of these measurements,\n",
    "# restricted to those same columns.\n",
    "check_cols = ['scoma', 'sps', 'aps', 'surv2m', 'surv6m', 'meanbp', 'hrt', 'resp', 'temp', 'sod']\n",
    "df.loc[df[check_cols].isna().any(axis=1), check_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(8933, 42)\n",
      "(8928, 42)\n"
     ]
    }
   ],
   "source": [
    "# Drop every row lacking a value in any of the listed columns and report\n",
    "# the shape before and after.\n",
    "print(df.shape)\n",
    "df = df.dropna(\n",
    "    subset=['scoma', 'sps', 'aps', 'surv2m', 'surv6m', 'meanbp', 'hrt', 'resp', 'temp', 'sod', 'dnr']\n",
    ")\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Summary for column: sex\n",
      "sex\n",
      "female    3895\n",
      "male      5033\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: dzgroup\n",
      "dzgroup\n",
      "Cirrhosis             501\n",
      "Colon Cancer          502\n",
      "Coma                  584\n",
      "MOSF w/Malig          694\n",
      "Lung Cancer           893\n",
      "COPD                  956\n",
      "CHF                  1371\n",
      "ARF/MOSF w/Sepsis    3427\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: dzclass\n",
      "dzclass\n",
      "Coma                   584\n",
      "Cancer                1395\n",
      "COPD/CHF/Cirrhosis    2828\n",
      "ARF/MOSF              4121\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: income\n",
      "income\n",
      ">$50k          672\n",
      "$25-$50k      1028\n",
      "$11-$25k      1496\n",
      "under $11k    2803\n",
      "NaN           2929\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: race\n",
      "race\n",
      "NaN           42\n",
      "asian         79\n",
      "other        111\n",
      "hispanic     286\n",
      "black       1352\n",
      "white       7058\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: ca\n",
      "ca\n",
      "yes           1230\n",
      "metastatic    1821\n",
      "no            5877\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: dnr\n",
      "dnr\n",
      "dnr before sadm     239\n",
      "dnr after sadm     2903\n",
      "no dnr             5786\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: sfdm2\n",
      "sfdm2\n",
      "Coma or Intub            41\n",
      "SIP>=30                 557\n",
      "adl>=4 (>=5 if sur)     891\n",
      "NaN                    1369\n",
      "no(M2 and SIP pres)    3013\n",
      "<2 mo. follow-up       3057\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n"
     ]
    }
   ],
   "source": [
    "# For each categorical column, print its value counts (NaN included) in\n",
    "# ascending order of frequency, followed by a separator line.\n",
    "for col in ('sex', 'dzgroup', 'dzclass', 'income', 'race', 'ca', 'dnr', 'sfdm2'):\n",
    "    counts = df[col].value_counts(dropna=False).sort_values()\n",
    "    print(f\"Summary for column: {col}\", counts, \"-\" * 30, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "age            0\n",
      "death          0\n",
      "sex            0\n",
      "hospdead       0\n",
      "slos           0\n",
      "d.time         0\n",
      "num.co         0\n",
      "edu         1609\n",
      "scoma          0\n",
      "charges        0\n",
      "avtisst       81\n",
      "sps            0\n",
      "aps            0\n",
      "surv2m         0\n",
      "surv6m         0\n",
      "hday           0\n",
      "diabetes       0\n",
      "dementia       0\n",
      "prg2m       1604\n",
      "prg6m       1588\n",
      "meanbp         0\n",
      "wblc           0\n",
      "hrt            0\n",
      "resp           0\n",
      "temp           0\n",
      "pafi           0\n",
      "alb            0\n",
      "bili           0\n",
      "crea           0\n",
      "sod            0\n",
      "ph          2239\n",
      "glucose     4435\n",
      "bun            0\n",
      "urine          0\n",
      "adlsc          0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Report the number of missing values in each of the listed columns.\n",
    "audit_cols = [\n",
    "    'age', 'death', 'sex', 'hospdead', 'slos', 'd.time', 'num.co', 'edu',\n",
    "    'scoma', 'charges', 'avtisst', 'sps', 'aps', 'surv2m', 'surv6m', 'hday',\n",
    "    'diabetes', 'dementia', 'prg2m', 'prg6m', 'meanbp', 'wblc', 'hrt',\n",
    "    'resp', 'temp', 'pafi', 'alb', 'bili', 'crea', 'sod', 'ph', 'glucose',\n",
    "    'bun', 'urine', 'adlsc',\n",
    "]\n",
    "missing_per_column = df[audit_cols].isna().sum()\n",
    "print(missing_per_column)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(8928, 42)\n",
      "(7270, 42)\n",
      "age         0\n",
      "death       0\n",
      "sex         0\n",
      "hospdead    0\n",
      "slos        0\n",
      "d.time      0\n",
      "num.co      0\n",
      "edu         0\n",
      "scoma       0\n",
      "charges     0\n",
      "avtisst     0\n",
      "sps         0\n",
      "aps         0\n",
      "surv2m      0\n",
      "surv6m      0\n",
      "hday        0\n",
      "diabetes    0\n",
      "dementia    0\n",
      "prg2m       0\n",
      "prg6m       0\n",
      "meanbp      0\n",
      "wblc        0\n",
      "hrt         0\n",
      "resp        0\n",
      "temp        0\n",
      "pafi        0\n",
      "alb         0\n",
      "bili        0\n",
      "crea        0\n",
      "sod         0\n",
      "ph          0\n",
      "glucose     0\n",
      "bun         0\n",
      "urine       0\n",
      "adlsc       0\n",
      "dtype: int64\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/sq/2cxnps7s2n30yvs0mnt35vb40000gq/T/ipykernel_11296/846671107.py:4: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
      "\n",
      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
      "\n",
      "\n",
      "  df['edu'].fillna(df['edu'].mean(), inplace=True)\n",
      "/var/folders/sq/2cxnps7s2n30yvs0mnt35vb40000gq/T/ipykernel_11296/846671107.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
      "\n",
      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
      "\n",
      "\n",
      "  df['ph'].fillna(df['ph'].mean(), inplace=True)\n",
      "/var/folders/sq/2cxnps7s2n30yvs0mnt35vb40000gq/T/ipykernel_11296/846671107.py:6: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
      "\n",
      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
      "\n",
      "\n",
      "  df['glucose'].fillna(df['glucose'].mean(), inplace=True)\n"
     ]
    }
   ],
   "source": [
    "print(df.shape)\n",
    "# Rows missing 'prg2m' or 'avtisst' are dropped outright.\n",
    "df = df.dropna(subset=['prg2m', 'avtisst'])\n",
    "print(df.shape)\n",
    "# Mean-impute the remaining gaps in these columns; the means are taken after\n",
    "# the row drop above. The fill goes through the DataFrame (a Series of\n",
    "# per-column means) instead of `df[col].fillna(..., inplace=True)`, which\n",
    "# raises a FutureWarning and, per that warning, stops updating `df` from\n",
    "# pandas 3.0.\n",
    "mean_fill_cols = ['edu', 'ph', 'glucose']\n",
    "df = df.fillna(df[mean_fill_cols].mean())\n",
    "# Confirm that no missing values remain in the listed columns.\n",
    "print(df[['age', 'death', 'sex', 'hospdead', 'slos', 'd.time', 'num.co', 'edu',\n",
    "       'scoma', 'charges', 'avtisst', 'sps', 'aps', 'surv2m', 'surv6m', 'hday',\n",
    "       'diabetes', 'dementia', 'prg2m', 'prg6m', 'meanbp', 'wblc', 'hrt',\n",
    "       'resp', 'temp', 'pafi', 'alb', 'bili', 'crea', 'sod', 'ph', 'glucose',\n",
    "       'bun', 'urine', 'adlsc']].isna().sum())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Summary for column: death\n",
      "death\n",
      "0    2333\n",
      "1    4937\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: dzgroup\n",
      "dzgroup\n",
      "Colon Cancer          392\n",
      "Cirrhosis             408\n",
      "Coma                  458\n",
      "MOSF w/Malig          563\n",
      "Lung Cancer           707\n",
      "COPD                  824\n",
      "CHF                  1020\n",
      "ARF/MOSF w/Sepsis    2898\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: dzclass\n",
      "dzclass\n",
      "Coma                   458\n",
      "Cancer                1099\n",
      "COPD/CHF/Cirrhosis    2252\n",
      "ARF/MOSF              3461\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: income\n",
      "income\n",
      ">$50k          563\n",
      "$25-$50k       835\n",
      "$11-$25k      1216\n",
      "NaN           2251\n",
      "under $11k    2405\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: race\n",
      "race\n",
      "NaN           33\n",
      "asian         60\n",
      "other         73\n",
      "hispanic     233\n",
      "black       1094\n",
      "white       5777\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: ca\n",
      "ca\n",
      "yes           1027\n",
      "metastatic    1453\n",
      "no            4790\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: dnr\n",
      "dnr\n",
      "dnr before sadm     209\n",
      "dnr after sadm     2386\n",
      "no dnr             4675\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n",
      "Summary for column: sfdm2\n",
      "sfdm2\n",
      "Coma or Intub            35\n",
      "SIP>=30                 489\n",
      "adl>=4 (>=5 if sur)     776\n",
      "NaN                    1047\n",
      "<2 mo. follow-up       2451\n",
      "no(M2 and SIP pres)    2472\n",
      "Name: count, dtype: int64\n",
      "------------------------------\n"
     ]
    }
   ],
   "source": [
    "# For each listed column, print its value counts (NaN included) in\n",
    "# ascending order of frequency, followed by a separator line.\n",
    "for col in ('death', 'dzgroup', 'dzclass', 'income', 'race', 'ca', 'dnr', 'sfdm2'):\n",
    "    counts = df[col].value_counts(dropna=False).sort_values()\n",
    "    print(f\"Summary for column: {col}\", counts, \"-\" * 30, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   dzgroup_ARF/MOSF w/Sepsis  dzgroup_CHF  dzgroup_COPD  dzgroup_Cirrhosis  \\\n",
      "1                      False        False         False              False   \n",
      "3                      False        False         False               True   \n",
      "4                      False        False         False              False   \n",
      "5                       True        False         False              False   \n",
      "6                      False        False         False              False   \n",
      "\n",
      "   dzgroup_Colon Cancer  dzgroup_Coma  dzgroup_Lung Cancer  \\\n",
      "1                 False         False                 True   \n",
      "3                 False         False                False   \n",
      "4                 False         False                 True   \n",
      "5                 False         False                False   \n",
      "6                 False          True                False   \n",
      "\n",
      "   dzgroup_MOSF w/Malig  \n",
      "1                 False  \n",
      "3                 False  \n",
      "4                 False  \n",
      "5                 False  \n",
      "6                 False  \n",
      "   dzclass_ARF/MOSF  dzclass_COPD/CHF/Cirrhosis  dzclass_Cancer  dzclass_Coma\n",
      "1             False                       False            True         False\n",
      "3             False                        True           False         False\n",
      "4             False                       False            True         False\n",
      "5              True                       False           False         False\n",
      "6             False                       False           False          True\n",
      "   income_$11-$25k  income_$25-$50k  income_>$50k  income_under $11k\n",
      "1             True            False         False              False\n",
      "3            False            False         False               True\n",
      "4            False            False         False               True\n",
      "5            False            False         False              False\n",
      "6            False            False         False              False\n",
      "   race_asian  race_black  race_hispanic  race_other  race_white\n",
      "1       False       False          False        True       False\n",
      "3       False       False          False       False        True\n",
      "4       False       False          False       False        True\n",
      "5       False       False          False       False        True\n",
      "6       False       False          False       False        True\n",
      "   dnr_dnr after sadm  dnr_dnr before sadm  dnr_no dnr\n",
      "1               False                False        True\n",
      "3               False                False        True\n",
      "4               False                False        True\n",
      "5               False                False        True\n",
      "6               False                False        True\n",
      "   sfdm2_<2 mo. follow-up  sfdm2_Coma or Intub  sfdm2_SIP>=30  \\\n",
      "1                   False                False          False   \n",
      "3                    True                False          False   \n",
      "4                   False                False          False   \n",
      "5                   False                False          False   \n",
      "6                    True                False          False   \n",
      "\n",
      "   sfdm2_adl>=4 (>=5 if sur)  sfdm2_no(M2 and SIP pres)  \n",
      "1                      False                      False  \n",
      "3                      False                      False  \n",
      "4                      False                       True  \n",
      "5                      False                       True  \n",
      "6                      False                      False  \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['age', 'death', 'sex', 'hospdead', 'slos', 'd.time', 'num.co', 'edu',\n",
       "       'scoma', 'charges', 'avtisst', 'sps', 'aps', 'surv2m', 'surv6m', 'hday',\n",
       "       'diabetes', 'dementia', 'prg2m', 'prg6m', 'meanbp', 'wblc', 'hrt',\n",
       "       'resp', 'temp', 'pafi', 'alb', 'bili', 'crea', 'sod', 'ph', 'glucose',\n",
       "       'bun', 'urine', 'adlsc', 'dzgroup_ARF/MOSF w/Sepsis', 'dzgroup_CHF',\n",
       "       'dzgroup_COPD', 'dzgroup_Cirrhosis', 'dzgroup_Colon Cancer',\n",
       "       'dzgroup_Coma', 'dzgroup_Lung Cancer', 'dzgroup_MOSF w/Malig',\n",
       "       'dzclass_ARF/MOSF', 'dzclass_COPD/CHF/Cirrhosis', 'dzclass_Cancer',\n",
       "       'dzclass_Coma', 'income_$11-$25k', 'income_$25-$50k', 'income_>$50k',\n",
       "       'income_under $11k', 'race_asian', 'race_black', 'race_hispanic',\n",
       "       'race_other', 'race_white', 'dnr_dnr after sadm', 'dnr_dnr before sadm',\n",
       "       'dnr_no dnr', 'sfdm2_<2 mo. follow-up', 'sfdm2_Coma or Intub',\n",
       "       'sfdm2_SIP>=30', 'sfdm2_adl>=4 (>=5 if sur)',\n",
       "       'sfdm2_no(M2 and SIP pres)', 'ca_metastatic', 'ca_yes'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One-hot encode each multi-level categorical column. The indicator frame is\n",
    "# built once per column and reused for both the join and the preview.\n",
    "# With get_dummies' default dummy_na=False, a row whose value is NaN gets\n",
    "# False in every indicator column for that variable.\n",
    "for c in ['dzgroup', 'dzclass', 'income', 'race', 'dnr', 'sfdm2']:\n",
    "    dummies = pd.get_dummies(df[c], prefix=c)\n",
    "    df = df.join(dummies)\n",
    "    print(dummies.head())\n",
    "# Binary indicator: 1 where sex is 'male', 0 for any other value.\n",
    "df['sex'] = np.where((df['sex'] == 'male'), 1, 0)\n",
    "# NOTE(review): 'metastatic' maps to 1 in both columns below, so ca_yes is 1\n",
    "# for 'yes' and for 'metastatic' -- confirm this cumulative coding is intended.\n",
    "df['ca_metastatic'] = df[\"ca\"].map({\"metastatic\": 1, \"no\": 0, \"yes\": 0})\n",
    "df['ca_yes'] = df[\"ca\"].map({\"metastatic\": 1, \"no\": 0, \"yes\": 1})\n",
    "# Remove the original categorical columns now that they are encoded.\n",
    "df = df.drop(columns=['dzgroup', 'dzclass', 'income', 'race', 'ca', 'dnr', 'sfdm2'])\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>death</th>\n",
       "      <th>sex</th>\n",
       "      <th>hospdead</th>\n",
       "      <th>slos</th>\n",
       "      <th>d.time</th>\n",
       "      <th>num.co</th>\n",
       "      <th>edu</th>\n",
       "      <th>scoma</th>\n",
       "      <th>charges</th>\n",
       "      <th>...</th>\n",
       "      <th>dnr_dnr after sadm</th>\n",
       "      <th>dnr_dnr before sadm</th>\n",
       "      <th>dnr_no dnr</th>\n",
       "      <th>sfdm2_&lt;2 mo. follow-up</th>\n",
       "      <th>sfdm2_Coma or Intub</th>\n",
       "      <th>sfdm2_SIP&gt;=30</th>\n",
       "      <th>sfdm2_adl&gt;=4 (&gt;=5 if sur)</th>\n",
       "      <th>sfdm2_no(M2 and SIP pres)</th>\n",
       "      <th>ca_metastatic</th>\n",
       "      <th>ca_yes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>62.84998</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>2029</td>\n",
       "      <td>0</td>\n",
       "      <td>11.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9715.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>52.74698</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>17</td>\n",
       "      <td>47</td>\n",
       "      <td>2</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>41094.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>42.38498</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>133</td>\n",
       "      <td>2</td>\n",
       "      <td>11.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3075.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>79.88495</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>2029</td>\n",
       "      <td>1</td>\n",
       "      <td>11.745941</td>\n",
       "      <td>26.0</td>\n",
       "      <td>50127.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>93.01599</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>14.000000</td>\n",
       "      <td>55.0</td>\n",
       "      <td>6884.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9099</th>\n",
       "      <td>70.42297</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>15</td>\n",
       "      <td>17</td>\n",
       "      <td>4</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>80504.0</td>\n",
       "      <td>...</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9101</th>\n",
       "      <td>66.07300</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>23</td>\n",
       "      <td>350</td>\n",
       "      <td>1</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>52870.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9102</th>\n",
       "      <td>55.15399</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>29</td>\n",
       "      <td>347</td>\n",
       "      <td>1</td>\n",
       "      <td>11.000000</td>\n",
       "      <td>41.0</td>\n",
       "      <td>35377.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9103</th>\n",
       "      <td>70.38196</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>346</td>\n",
       "      <td>1</td>\n",
       "      <td>11.745941</td>\n",
       "      <td>0.0</td>\n",
       "      <td>46564.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9104</th>\n",
       "      <td>47.01999</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>13.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>58439.0</td>\n",
       "      <td>...</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7270 rows × 66 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           age  death  sex  hospdead  slos  d.time  num.co        edu  scoma  \\\n",
       "1     62.84998      0    1         0     5    2029       0  11.000000    0.0   \n",
       "3     52.74698      1    0         0    17      47       2  12.000000    0.0   \n",
       "4     42.38498      1    0         0     3     133       2  11.000000    0.0   \n",
       "5     79.88495      0    0         0    16    2029       1  11.745941   26.0   \n",
       "6     93.01599      1    1         1     4       4       1  14.000000   55.0   \n",
       "...        ...    ...  ...       ...   ...     ...     ...        ...    ...   \n",
       "9099  70.42297      1    1         0    15      17       4  12.000000    0.0   \n",
       "9101  66.07300      0    1         0    23     350       1   8.000000    0.0   \n",
       "9102  55.15399      0    0         0    29     347       1  11.000000   41.0   \n",
       "9103  70.38196      0    1         0     8     346       1  11.745941    0.0   \n",
       "9104  47.01999      1    1         1     7       7       1  13.000000    0.0   \n",
       "\n",
       "      charges  ...  dnr_dnr after sadm  dnr_dnr before sadm  dnr_no dnr  \\\n",
       "1      9715.0  ...               False                False        True   \n",
       "3     41094.0  ...               False                False        True   \n",
       "4      3075.0  ...               False                False        True   \n",
       "5     50127.0  ...               False                False        True   \n",
       "6      6884.0  ...               False                False        True   \n",
       "...       ...  ...                 ...                  ...         ...   \n",
       "9099  80504.0  ...                True                False       False   \n",
       "9101  52870.0  ...               False                False        True   \n",
       "9102  35377.0  ...               False                False        True   \n",
       "9103  46564.0  ...               False                False        True   \n",
       "9104  58439.0  ...                True                False       False   \n",
       "\n",
       "      sfdm2_<2 mo. follow-up  sfdm2_Coma or Intub  sfdm2_SIP>=30  \\\n",
       "1                      False                False          False   \n",
       "3                       True                False          False   \n",
       "4                      False                False          False   \n",
       "5                      False                False          False   \n",
       "6                       True                False          False   \n",
       "...                      ...                  ...            ...   \n",
       "9099                    True                False          False   \n",
       "9101                   False                False          False   \n",
       "9102                   False                False          False   \n",
       "9103                   False                False          False   \n",
       "9104                    True                False          False   \n",
       "\n",
       "      sfdm2_adl>=4 (>=5 if sur)  sfdm2_no(M2 and SIP pres)  ca_metastatic  \\\n",
       "1                         False                      False              1   \n",
       "3                         False                      False              0   \n",
       "4                         False                       True              1   \n",
       "5                         False                       True              0   \n",
       "6                         False                      False              0   \n",
       "...                         ...                        ...            ...   \n",
       "9099                      False                      False              0   \n",
       "9101                      False                      False              0   \n",
       "9102                      False                      False              0   \n",
       "9103                      False                      False              0   \n",
       "9104                      False                      False              0   \n",
       "\n",
       "      ca_yes  \n",
       "1          1  \n",
       "3          0  \n",
       "4          1  \n",
       "5          0  \n",
       "6          0  \n",
       "...      ...  \n",
       "9099       1  \n",
       "9101       0  \n",
       "9102       0  \n",
       "9103       0  \n",
       "9104       1  \n",
       "\n",
       "[7270 rows x 66 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df.to_csv('support2_processed.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "struct",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
