{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ed977fa0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from dotenv import load_dotenv\n",
    "import os\n",
    "import re\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "import statsmodels.formula.api as smf\n",
    "import statsmodels.api as sm\n",
    "from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor\n",
    "import seaborn as sns\n",
    "import copy\n",
    "from sklearn import preprocessing\n",
    "import pickle\n",
    "import warnings\n",
    "from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "99b38e6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filtering out warnings in case they appear to prevent flooding of outputs\n",
    "warnings.simplefilter('ignore', ConvergenceWarning)\n",
    "warnings.simplefilter('ignore', HessianInversionWarning)\n",
    "warnings.simplefilter('ignore', pd.errors.DtypeWarning)\n",
    "warnings.simplefilter('ignore', RuntimeWarning)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "79f67a7a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_ci_95(arr):\n",
    "    \"\"\"\n",
    "    Helper function to calculate 95% interval given an array\n",
    "    \"\"\"\n",
    "    return [np.percentile(arr, 2.5), np.percentile(arr, 97.5)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31f898d2",
   "metadata": {},
   "source": [
    "### Merge full mimiic data with mimic data predictions for smoking status"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9c279c24",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Retrieving merged_data -- can't show due to MIMIC Privacy Policy\n",
    "full_data_df = pd.read_csv(\"full_data_df_no_index.csv\")\n",
    "pred_mimic_df = pd.read_csv(\"...\")# Should be the csv file with mimic smoking status predictions for each entry\n",
    "pred_mimic_df = pred_mimic_df.rename(columns={'SUBJECT_ID': 'subject_id'})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "904fc33a",
   "metadata": {},
   "source": [
    "#### full_data_df contains 6361 rows and 130 columns (including subject_id, age, echo, etc...)\n",
    "#### pred_mimic_df contains 34312 rows and 46 columns (including subject_id, SMOKING_STATUS, etc..)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d4b34a8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_df = pd.merge(full_data_df, pred_mimic_df[[\"subject_id\",\"SMOKING_STATUS\"]], on=[\"subject_id\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c6ce87fc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    2058\n",
       "3    1413\n",
       "4    1171\n",
       "2      93\n",
       "0      64\n",
       "Name: SMOKING_STATUS, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged_df[\"SMOKING_STATUS\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "41a900c5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    2058\n",
       "3    1413\n",
       "4    1171\n",
       "2      93\n",
       "Name: SMOKING_STATUS, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Droppping 0 labels to ensure we only have 4 possible smoking status labels\n",
    "# 0 labels derived from merging two dataframes where some entries may not have a prediction\n",
    "merged_df = merged_df.drop(merged_df[merged_df[\"SMOKING_STATUS\"] == 0].index)\n",
    "merged_df[\"SMOKING_STATUS\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6f986148",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Converting integers to weekdays\n",
    "def int_to_weekday(row):\n",
    "    r = int(row)\n",
    "    if r == 0:\n",
    "        return 'sunday'\n",
    "    elif r == 1:\n",
    "        return \"monday\"\n",
    "    elif r == 2:\n",
    "        return \"tuesday\"\n",
    "    elif r == 3:\n",
    "        return \"wednesday\"\n",
    "    elif r == 4:\n",
    "        return \"thursday\"\n",
    "    elif r== 5:\n",
    "        return \"friday\"\n",
    "    else:\n",
    "        return \"saturday\"\n",
    "\n",
    "merged_df[\"icu_adm_weekday\"] = merged_df[\"icu_adm_weekday\"].apply(int_to_weekday)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d22ae029",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_df[\"first_careunit\"] = merged_df[\"first_careunit\"].astype('category')\n",
    "merged_df[\"first_careunit\"] = merged_df[\"first_careunit\"].cat.reorder_categories([\"SICU\", \"MICU\"])\n",
    "\n",
    "merged_df[\"gender\"] = merged_df[\"gender\"].astype(\"category\")\n",
    "merged_df[\"gender\"] = merged_df[\"gender\"].cat.reorder_categories([\"M\", \"F\"])\n",
    "\n",
    "merged_df[\"icu_adm_weekday\"] = merged_df[\"icu_adm_weekday\"].astype(\"category\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "285b9678",
   "metadata": {},
   "source": [
    "### Viewing the finalized merged dataframe\n",
    "\n",
    "#### 4735 rows and 131 columns (including subject_id, echo, mort_28_day, SMOKING_STATUS, etc...)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c217bb7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_df # Unable to show due to MIMIC Privacy Policy"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7a5719af",
   "metadata": {},
   "source": [
    "### Defining helper functions to calculate causal effects w.r.t effect restoration from measurement bias"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a8b14608",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_models(dataframe):\n",
    "    '''\n",
    "    Given a pre-processed MIMIC + proxy prediction dataframe, train three logistic regression models \n",
    "    using smf.logit. \n",
    "    The formula strings will be hard-coded into the function. The assumptions for these models are:\n",
    "        1) Categorical smoking categories \n",
    "        2) Not all feature are binary, but at least the output (mort_28_day) and treatment (echo) \n",
    "           should be binary\n",
    "    '''\n",
    "    \n",
    "    # Calculating P(y | u*, a, c) --> y ~ u* + a + c for each u* label in [1,2,3,4] \n",
    "    fstring = 'mort_28_day ~ echo + age + weight + saps + sofa + elix_score + vent + \\\n",
    "            vaso + icu_adm_hour + icd_chf + icd_afib + icd_renal + icd_liver + icd_copd + \\\n",
    "            icd_cad + icd_stroke + icd_malignancy + vs_heart_rate_first + vs_map_first + vs_temp_first + \\\n",
    "            lab_hemoglobin_first + lab_platelet_first + lab_wbc_first + lab_chloride_first + \\\n",
    "            lab_sodium_first + lab_bun_first + lab_bicarbonate_first + lab_creatinine_first + \\\n",
    "            lab_potassium_first + vs_cvp_flag + lab_creatinine_kinase_flag + lab_bnp_flag + gender + \\\n",
    "            lab_troponin_flag + first_careunit + icu_adm_weekday + lab_ph_first + lab_pco2_first + \\\n",
    "            lab_po2_first + lab_lactate_first + sedative + C(SMOKING_STATUS)'\n",
    "    eq1 = smf.logit(fstring, data=dataframe)\n",
    "    eq1_model = eq1.fit(disp=0)\n",
    "    \n",
    "    # Calculating P(u* | a, c)\n",
    "    f_string2 = \"SMOKING_STATUS ~ echo + first_careunit + age + gender + weight + saps + sofa + elix_score + \\\n",
    "            vent + vaso + icu_adm_weekday + icu_adm_hour + icd_chf + icd_afib + icd_renal + icd_liver + \\\n",
    "            icd_copd + icd_cad + icd_stroke + icd_malignancy + vs_heart_rate_first + vs_map_first + \\\n",
    "            vs_temp_first + lab_hemoglobin_first + lab_platelet_first + lab_wbc_first + lab_ph_first + \\\n",
    "            lab_sodium_first + lab_bun_first + lab_bicarbonate_first + lab_pco2_first + lab_creatinine_first + \\\n",
    "            lab_chloride_first + lab_potassium_first + lab_po2_first + lab_lactate_first + sedative + \\\n",
    "            vs_cvp_flag + lab_creatinine_kinase_flag + lab_bnp_flag + lab_troponin_flag\"\n",
    "    eq2 = smf.mnlogit(f_string2, data=dataframe)\n",
    "    eq2_model = eq2.fit(disp=0)\n",
    "    \n",
    "    # Calculating P(a|c)\n",
    "    f_string3 = \"echo ~ first_careunit + age + gender + weight + saps + sofa + elix_score + vent + \\\n",
    "                vaso + icu_adm_weekday + icu_adm_hour + icd_chf + icd_afib + icd_renal + icd_liver + icd_copd + \\\n",
    "                icd_cad + icd_stroke + icd_malignancy + vs_heart_rate_first + vs_map_first + vs_temp_first + \\\n",
    "                lab_hemoglobin_first + lab_platelet_first + lab_wbc_first + lab_ph_first + lab_chloride_first + \\\n",
    "                lab_sodium_first + lab_bun_first + lab_bicarbonate_first + lab_pco2_first + lab_creatinine_first + \\\n",
    "                lab_potassium_first + lab_po2_first + lab_lactate_first + sedative + vs_cvp_flag + \\\n",
    "                lab_creatinine_kinase_flag + lab_bnp_flag + lab_troponin_flag\"\n",
    "    eq3 = smf.logit(f_string3, data=dataframe)\n",
    "    eq3_model = eq3.fit(disp=0)\n",
    "    \n",
    "    return eq1_model, eq2_model, eq3_model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "36ac2713",
   "metadata": {},
   "source": [
    "### Implementing Risk Ratio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "3db7275b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def risk_ratio(dataframe, model1, model2, model3):\n",
    "    '''\n",
    "    Given a pre-procesesed MIMIC + smoking proxy prediction dataframe as well as three trained models \n",
    "    from generate_models(), calculate the risk ratio as defined by: \n",
    "    causal_effect = summation(c,u){ p(c,u) * ( E[Y=1 | A=1,c,u*] / E[Y=1 | A=0,c,u*] ) }\n",
    "    The assumptions of this function are:\n",
    "        1) Smoking proxy predictions are categorical\n",
    "        2) Treatment variable values are binary --> either 1 for receiving treatment or 0 for not \n",
    "           receiving treatment\n",
    "        3) Order for model inputs matter:\n",
    "            a) model1 = P(y | u*, a, c) --> y ~ u* + a + c\n",
    "            b) model2 = P(u* | a, c)\n",
    "            c) model3 = P(a | c)\n",
    "        4) Default prediction is probability of getting 1 due to how statsmodels works\n",
    "    '''\n",
    "    \n",
    "    tmp_df = None\n",
    "    unique_smoking = [1,2,3,4]\n",
    "    unique_echo = [1,0]\n",
    "    exp_array = []\n",
    "    \n",
    "    # Understanding Matrix of Error Adjustments\n",
    "    confusion = [\n",
    "                    [8, 0, 2, 1],\n",
    "                    [4, 4, 3, 0],\n",
    "                    [1, 0, 14, 1],\n",
    "                    [1, 0, 1, 61]\n",
    "                ] # rows represent the ground truth labels and cols represents the predicted labels\n",
    "\n",
    "    error_mat = [\n",
    "                    [8/11, 0, 2/11, 1/11],\n",
    "                    [4/11, 4/11, 3/11, 0],\n",
    "                    [1/16, 0, 14/16, 1/16],\n",
    "                    [1/63, 0, 1/63, 61/63]\n",
    "                ] # rows represent U* and cols represent U\n",
    "    inverse = np.linalg.pinv(error_mat)\n",
    "    \n",
    "    # Getting P(A, c, y=1, u*) \n",
    "    prob_a1_c_y1_u = []\n",
    "    prob_a0_c_y1_u = []\n",
    "    for s in unique_smoking:\n",
    "        tmp_df = copy.deepcopy(dataframe)\n",
    "    \n",
    "        # Presetting the smoking status in the dataframe to be a cateogrical value in [1,2,3,4]\n",
    "        tmp_df[\"SMOKING_STATUS\"] = [s] * tmp_df.shape[0]\n",
    "        \n",
    "        for e in unique_echo:\n",
    "            tmp_tmp_df = copy.deepcopy(tmp_df)\n",
    "            tmp_tmp_df[\"echo\"] = [e] * tmp_df.shape[0]\n",
    "            \n",
    "            prob_1 = model1.predict(tmp_tmp_df)\n",
    "            prob_2 = model2.predict(tmp_tmp_df)[:][s-1]\n",
    "            prob_3 = model3.predict(tmp_tmp_df)\n",
    "            \n",
    "            \n",
    "            if e == 0:\n",
    "                output = prob_1 * prob_2 * (1 - prob_3)\n",
    "                prob_a0_c_y1_u.append(output)\n",
    "            else:\n",
    "                output = prob_1 * prob_2 * prob_3\n",
    "                prob_a1_c_y1_u.append(output)\n",
    "    \n",
    "    # Getting P(A, c, y=0, u*)\n",
    "    prob_a1_c_y0_u = []\n",
    "    prob_a0_c_y0_u = []\n",
    "    for s in unique_smoking:\n",
    "        tmp_df = copy.deepcopy(dataframe)\n",
    "    \n",
    "        # Presetting the smoking status in the dataframe to be a cateogrical value in [1,2,3,4]\n",
    "        tmp_df[\"SMOKING_STATUS\"] = [s] * tmp_df.shape[0]\n",
    "        \n",
    "        for e in unique_echo:\n",
    "            tmp_tmp_df = copy.deepcopy(tmp_df)\n",
    "            tmp_tmp_df[\"echo\"] = [e] * tmp_df.shape[0]\n",
    "            \n",
    "            prob_1 = 1 - model1.predict(tmp_tmp_df)\n",
    "            prob_2 = model2.predict(tmp_tmp_df)[:][s-1]\n",
    "            prob_3 = model3.predict(tmp_tmp_df)\n",
    "            \n",
    "            \n",
    "            if e == 0:\n",
    "                output = prob_1 * prob_2 * (1 - prob_3)\n",
    "                prob_a0_c_y0_u.append(output)\n",
    "            else:\n",
    "                output = prob_1 * prob_2 * prob_3\n",
    "                prob_a1_c_y0_u.append(output)\n",
    "        \n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=0)\n",
    "    num_0a = prob_a1_c_y1_u[0] * inverse[0][0] + prob_a1_c_y1_u[1] * inverse[1][0] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][0] + prob_a1_c_y1_u[3] * inverse[3][0]\n",
    "    tmp_0a = prob_a1_c_y0_u[0] * inverse[0][0] + prob_a1_c_y0_u[1] * inverse[1][0] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][0] + prob_a1_c_y0_u[3] * inverse[3][0]\n",
    "    denom_0a = num_0a + tmp_0a\n",
    "    upper_0a = num_0a / denom_0a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=0)\n",
    "    num_0b = prob_a0_c_y1_u[0] * inverse[0][0] + prob_a0_c_y1_u[1] * inverse[1][0] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][0] + prob_a0_c_y1_u[3] * inverse[3][0]\n",
    "    tmp_0b = prob_a0_c_y0_u[0] * inverse[0][0] + prob_a0_c_y0_u[1] * inverse[1][0] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][0] + prob_a0_c_y0_u[3] * inverse[3][0]\n",
    "    denom_0b = num_0b + tmp_0b\n",
    "    lower_0b = num_0b / denom_0b\n",
    "    \n",
    "    comp_0 = upper_0a / lower_0b\n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=1)\n",
    "    num_1a = prob_a1_c_y1_u[0] * inverse[0][1] + prob_a1_c_y1_u[1] * inverse[1][1] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][1] + prob_a1_c_y1_u[3] * inverse[3][1]\n",
    "    tmp_1a = prob_a1_c_y0_u[0] * inverse[0][1] + prob_a1_c_y0_u[1] * inverse[1][1] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][1] + prob_a1_c_y0_u[3] * inverse[3][1]\n",
    "    denom_1a = num_1a + tmp_1a\n",
    "    upper_1a = num_1a / denom_1a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=1)\n",
    "    num_1b = prob_a0_c_y1_u[0] * inverse[0][1] + prob_a0_c_y1_u[1] * inverse[1][1] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][1] + prob_a0_c_y1_u[3] * inverse[3][1]\n",
    "    tmp_1b = prob_a0_c_y0_u[0] * inverse[0][1] + prob_a0_c_y0_u[1] * inverse[1][1] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][1] + prob_a0_c_y0_u[3] * inverse[3][1]\n",
    "    denom_1b = num_1b + tmp_1b\n",
    "    lower_1b = num_1b / denom_1b\n",
    "    \n",
    "    comp_1 = upper_1a / lower_1b\n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=2)\n",
    "    num_2a = prob_a1_c_y1_u[0] * inverse[0][2] + prob_a1_c_y1_u[1] * inverse[1][2] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][2] + prob_a1_c_y1_u[3] * inverse[3][2]\n",
    "    tmp_2a = prob_a1_c_y0_u[0] * inverse[0][2] + prob_a1_c_y0_u[1] * inverse[1][2] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][2] + prob_a1_c_y0_u[3] * inverse[3][2]\n",
    "    denom_2a = num_2a + tmp_2a\n",
    "    upper_2a = num_2a / denom_2a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=2)\n",
    "    num_2b = prob_a0_c_y1_u[0] * inverse[0][2] + prob_a0_c_y1_u[1] * inverse[1][2] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][2] + prob_a0_c_y1_u[3] * inverse[3][2]\n",
    "    tmp_2b = prob_a0_c_y0_u[0] * inverse[0][2] + prob_a0_c_y0_u[1] * inverse[1][2] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][2] + prob_a0_c_y0_u[3] * inverse[3][2]\n",
    "    denom_2b = num_2b + tmp_2b\n",
    "    lower_2b = num_2b / denom_2b\n",
    "    \n",
    "    comp_2 = upper_2a / lower_2b\n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=3)\n",
    "    num_3a = prob_a1_c_y1_u[0] * inverse[0][3] + prob_a1_c_y1_u[1] * inverse[1][3] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][3] + prob_a1_c_y1_u[3] * inverse[3][3]\n",
    "    tmp_3a = prob_a1_c_y0_u[0] * inverse[0][3] + prob_a1_c_y0_u[1] * inverse[1][3] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][3] + prob_a1_c_y0_u[3] * inverse[3][3]\n",
    "    denom_3a = num_3a + tmp_3a\n",
    "    upper_3a = num_3a / denom_3a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=3)\n",
    "    num_3b = prob_a0_c_y1_u[0] * inverse[0][3] + prob_a0_c_y1_u[1] * inverse[1][3] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][3] + prob_a0_c_y1_u[3] * inverse[3][3]\n",
    "    tmp_3b = prob_a0_c_y0_u[0] * inverse[0][3] + prob_a0_c_y0_u[1] * inverse[1][3] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][3] + prob_a0_c_y0_u[3] * inverse[3][3]\n",
    "    denom_3b = num_3b + tmp_3b\n",
    "    lower_3b = num_3b / denom_3b\n",
    "    \n",
    "    comp_3 = upper_3a / lower_3b\n",
    "    \n",
    "    # Getting P(u | c) \n",
    "    prob_u0_c = num_0a + tmp_0a + num_0b + tmp_0b\n",
    "    prob_u1_c = num_1a + tmp_1a + num_1b + tmp_1b\n",
    "    prob_u2_c = num_2a + tmp_2a + num_2b + tmp_2b\n",
    "    prob_u3_c = num_3a + tmp_3a + num_3b + tmp_3b\n",
    "    \n",
    "    rr = np.mean(comp_0 * prob_u0_c) + np.mean(comp_1 * prob_u1_c) + np.mean(comp_2 * prob_u2_c) \\\n",
    "            + np.mean(comp_3 * prob_u3_c)\n",
    "    sub_array = [np.mean(comp_0), np.mean(comp_1), np.mean(comp_2), np.mean(comp_3)]\n",
    "    return rr, sub_array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "ae5d0f14",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.8768839707583874,\n",
       " [0.8615914941715498,\n",
       "  0.9221790334361804,\n",
       "  1.1265191373553538,\n",
       "  0.880485625812281])"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m1, m2, m3 = generate_models(merged_df)\n",
    "risk_ratio(merged_df, m1, m2, m3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c2099073",
   "metadata": {},
   "source": [
    "### Bootstrapping Merged Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "fb77c500",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8766765708032103"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def bootstrap_merged_data_rr(dataframe, model1, model2, model3):\n",
    "    '''\n",
    "    Given a pre-procesesed MIMIC + smoking proxy prediction dataframe as well as three trained models \n",
    "    from generate_models(), perform bootstrapping by shuffling the merged dataframe for\n",
    "    risk ratio calculations. Iterations set to 100.\n",
    "    The assumptions of this function are:\n",
    "        1) Smoking proxy predictions are categorical\n",
    "        2) Treatment variable values are binary --> either 1 for receiving treatment or 0 \n",
    "           for not receiving treatment\n",
    "        3) Order for model inputs matter:\n",
    "            a) model1 = P(y | u*, a, c) --> y ~ u* + a + c\n",
    "            b) model2 = P(u* | a, c)\n",
    "            c) model3 = P(a | c)\n",
    "    '''\n",
    "    \n",
    "    \n",
    "    iterations = 100\n",
    "    output = []\n",
    "    sub_matrix = np.zeros((iterations, 4))\n",
    "    for i in range(iterations):\n",
    "        bt_df = dataframe.sample(frac=1, replace=True, ignore_index=True)\n",
    "        rr, sub_array = risk_ratio(bt_df, model1, model2, model3)\n",
    "        output.append(rr)\n",
    "\n",
    "        for idx, c in enumerate(sub_array):\n",
    "            sub_matrix[i, idx] = c\n",
    "        sub_avg = sub_matrix.mean(axis=0)\n",
    "        \n",
    "    res_dict = {\"bs_rr\": sum(output) / len(output), \"bs_arr_rr\": output, \"sub_avg_rr\": sub_avg, \\\n",
    "                \"sub_arr_rr\": sub_matrix}\n",
    "    \n",
    "    return res_dict\n",
    "\n",
    "bt_merged_rr = bootstrap_merged_data_rr(merged_df, m1, m2, m3)\n",
    "bt_merged_rr[\"bs_rr\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "b1769789",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0.8573429757703789, 0.8918207176816655]"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Computing 95% interval for risk ratio while boostrapping merged dataframe \n",
    "compute_ci_95(bt_merged_rr[\"bs_arr_rr\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "50f775ca",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.75513996, 0.9206174 , 0.65203891, 0.69939179],\n",
       "       [0.92084893, 0.92375223, 1.60260195, 1.00687674]])"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Computing 95% interval for risk ratio subgroups while bootstrapping merged dataframe\n",
    "np.apply_along_axis(compute_ci_95, axis=0, arr=bt_merged_rr[\"sub_arr_rr\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f18fa90",
   "metadata": {},
   "source": [
    "### Bootstrapping Error Rate Matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "1c87a84d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def risk_ratio_bootstrap(dataframe, model1, model2, model3, error_mat):\n",
    "    '''\n",
    "    Given a pre-procesesed MIMIC + smoking proxy prediction dataframe, three trained models \n",
    "    from generate_models(), and an error rate matrix, perform bootstrapping on error-rate matrix for\n",
    "    risk-ratio calculations. Helper function for bootstrap()\n",
    "    The assumptions of this function are:\n",
    "        1) Smoking proxy predictions are categorical\n",
    "        2) Treatment variable values are binary --> either 1 for receiving treatment or 0 \n",
    "           for not receiving treatment\n",
    "        3) Order for model inputs matter:\n",
    "            a) model1 = P(y | u*, a, c) --> y ~ u* + a + c\n",
    "            b) model2 = P(u* | a, c)\n",
    "            c) model3 = P(a | c)\n",
    "    '''\n",
    "    \n",
    "    tmp_df = None\n",
    "    unique_smoking = [1,2,3,4]\n",
    "    unique_echo = [1,0]\n",
    "    exp_array = []\n",
    "    \n",
    "    # Inversing Error Rate Matrices\n",
    "    inverse = np.linalg.pinv(error_mat)\n",
    "    \n",
    "    # Getting P(A, c, y=1, u*) \n",
    "    prob_a1_c_y1_u = []\n",
    "    prob_a0_c_y1_u = []\n",
    "    for s in unique_smoking:\n",
    "        tmp_df = copy.deepcopy(dataframe)\n",
    "    \n",
    "        # Presetting the smoking status in the dataframe to be a cateogrical value in [1,2,3,4]\n",
    "        tmp_df[\"SMOKING_STATUS\"] = [s] * tmp_df.shape[0]\n",
    "        \n",
    "        for e in unique_echo:\n",
    "            tmp_tmp_df = copy.deepcopy(tmp_df)\n",
    "            tmp_tmp_df[\"echo\"] = [e] * tmp_df.shape[0]\n",
    "            \n",
    "            prob_1 = model1.predict(tmp_tmp_df)\n",
    "            prob_2 = model2.predict(tmp_tmp_df)[:][s-1]\n",
    "            prob_3 = model3.predict(tmp_tmp_df)\n",
    "            \n",
    "            \n",
    "            if e == 0:\n",
    "                output = prob_1 * prob_2 * (1 - prob_3)\n",
    "                prob_a0_c_y1_u.append(output)\n",
    "            else:\n",
    "                output = prob_1 * prob_2 * prob_3\n",
    "                prob_a1_c_y1_u.append(output)\n",
    "    \n",
    "    # Getting P(A, c, y=0, u*)\n",
    "    prob_a1_c_y0_u = []\n",
    "    prob_a0_c_y0_u = []\n",
    "    for s in unique_smoking:\n",
    "        tmp_df = copy.deepcopy(dataframe)\n",
    "    \n",
    "        # Presetting the smoking status in the dataframe to be a cateogrical value in [1,2,3,4]\n",
    "        tmp_df[\"SMOKING_STATUS\"] = [s] * tmp_df.shape[0]\n",
    "        \n",
    "        for e in unique_echo:\n",
    "            tmp_tmp_df = copy.deepcopy(tmp_df)\n",
    "            tmp_tmp_df[\"echo\"] = [e] * tmp_df.shape[0]\n",
    "            \n",
    "            prob_1 = 1 - model1.predict(tmp_tmp_df)\n",
    "            prob_2 = model2.predict(tmp_tmp_df)[:][s-1]\n",
    "            prob_3 = model3.predict(tmp_tmp_df)\n",
    "            \n",
    "            \n",
    "            if e == 0:\n",
    "                output = prob_1 * prob_2 * (1 - prob_3)\n",
    "                prob_a0_c_y0_u.append(output)\n",
    "            else:\n",
    "                output = prob_1 * prob_2 * prob_3\n",
    "                prob_a1_c_y0_u.append(output)\n",
    "        \n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=0)\n",
    "    num_0a = prob_a1_c_y1_u[0] * inverse[0][0] + prob_a1_c_y1_u[1] * inverse[1][0] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][0] + prob_a1_c_y1_u[3] * inverse[3][0]\n",
    "    tmp_0a = prob_a1_c_y0_u[0] * inverse[0][0] + prob_a1_c_y0_u[1] * inverse[1][0] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][0] + prob_a1_c_y0_u[3] * inverse[3][0]\n",
    "    denom_0a = num_0a + tmp_0a\n",
    "    upper_0a = num_0a / denom_0a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=0)\n",
    "    num_0b = prob_a0_c_y1_u[0] * inverse[0][0] + prob_a0_c_y1_u[1] * inverse[1][0] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][0] + prob_a0_c_y1_u[3] * inverse[3][0]\n",
    "    tmp_0b = prob_a0_c_y0_u[0] * inverse[0][0] + prob_a0_c_y0_u[1] * inverse[1][0] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][0] + prob_a0_c_y0_u[3] * inverse[3][0]\n",
    "    denom_0b = num_0b + tmp_0b\n",
    "    lower_0b = num_0b / denom_0b\n",
    "    \n",
    "    comp_0 = upper_0a / lower_0b\n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=1)\n",
    "    num_1a = prob_a1_c_y1_u[0] * inverse[0][1] + prob_a1_c_y1_u[1] * inverse[1][1] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][1] + prob_a1_c_y1_u[3] * inverse[3][1]\n",
    "    tmp_1a = prob_a1_c_y0_u[0] * inverse[0][1] + prob_a1_c_y0_u[1] * inverse[1][1] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][1] + prob_a1_c_y0_u[3] * inverse[3][1]\n",
    "    denom_1a = num_1a + tmp_1a\n",
    "    upper_1a = num_1a / denom_1a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=1)\n",
    "    num_1b = prob_a0_c_y1_u[0] * inverse[0][1] + prob_a0_c_y1_u[1] * inverse[1][1] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][1] + prob_a0_c_y1_u[3] * inverse[3][1]\n",
    "    tmp_1b = prob_a0_c_y0_u[0] * inverse[0][1] + prob_a0_c_y0_u[1] * inverse[1][1] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][1] + prob_a0_c_y0_u[3] * inverse[3][1]\n",
    "    denom_1b = num_1b + tmp_1b\n",
    "    lower_1b = num_1b / denom_1b\n",
    "    \n",
    "    comp_1 = upper_1a / lower_1b\n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=2)\n",
    "    num_2a = prob_a1_c_y1_u[0] * inverse[0][2] + prob_a1_c_y1_u[1] * inverse[1][2] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][2] + prob_a1_c_y1_u[3] * inverse[3][2]\n",
    "    tmp_2a = prob_a1_c_y0_u[0] * inverse[0][2] + prob_a1_c_y0_u[1] * inverse[1][2] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][2] + prob_a1_c_y0_u[3] * inverse[3][2]\n",
    "    denom_2a = num_2a + tmp_2a\n",
    "    upper_2a = num_2a / denom_2a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=2)\n",
    "    num_2b = prob_a0_c_y1_u[0] * inverse[0][2] + prob_a0_c_y1_u[1] * inverse[1][2] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][2] + prob_a0_c_y1_u[3] * inverse[3][2]\n",
    "    tmp_2b = prob_a0_c_y0_u[0] * inverse[0][2] + prob_a0_c_y0_u[1] * inverse[1][2] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][2] + prob_a0_c_y0_u[3] * inverse[3][2]\n",
    "    denom_2b = num_2b + tmp_2b\n",
    "    lower_2b = num_2b / denom_2b\n",
    "    \n",
    "    comp_2 = upper_2a / lower_2b\n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=3)\n",
    "    num_3a = prob_a1_c_y1_u[0] * inverse[0][3] + prob_a1_c_y1_u[1] * inverse[1][3] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][3] + prob_a1_c_y1_u[3] * inverse[3][3]\n",
    "    tmp_3a = prob_a1_c_y0_u[0] * inverse[0][3] + prob_a1_c_y0_u[1] * inverse[1][3] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][3] + prob_a1_c_y0_u[3] * inverse[3][3]\n",
    "    denom_3a = num_3a + tmp_3a\n",
    "    upper_3a = num_3a / denom_3a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=3)\n",
    "    num_3b = prob_a0_c_y1_u[0] * inverse[0][3] + prob_a0_c_y1_u[1] * inverse[1][3] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][3] + prob_a0_c_y1_u[3] * inverse[3][3]\n",
    "    tmp_3b = prob_a0_c_y0_u[0] * inverse[0][3] + prob_a0_c_y0_u[1] * inverse[1][3] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][3] + prob_a0_c_y0_u[3] * inverse[3][3]\n",
    "    denom_3b = num_3b + tmp_3b\n",
    "    lower_3b = num_3b / denom_3b\n",
    "    \n",
    "    comp_3 = upper_3a / lower_3b\n",
    "    \n",
    "    # Getting P(u | c) \n",
    "    prob_u0_c = num_0a + tmp_0a + num_0b + tmp_0b\n",
    "    prob_u1_c = num_1a + tmp_1a + num_1b + tmp_1b\n",
    "    prob_u2_c = num_2a + tmp_2a + num_2b + tmp_2b\n",
    "    prob_u3_c = num_3a + tmp_3a + num_3b + tmp_3b\n",
    "    \n",
    "    rr = np.mean(comp_0 * prob_u0_c) + np.mean(comp_1 * prob_u1_c) + np.mean(comp_2 * prob_u2_c) \\\n",
    "        + np.mean(comp_3 * prob_u3_c)\n",
    "    sub_array = [np.mean(comp_0), np.mean(comp_1), np.mean(comp_2), np.mean(comp_3)]\n",
    "    return rr, sub_array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "c53e94eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def bootstrap(dataframe, model1, model2, model3):\n",
    "    '''\n",
    "    Given a dataframe and 3 models generated from generate_models(), recompute the risk ratio\n",
    "    causal effect once per bootstrapped confusion matrix of the n2c2 2006 smoking testing set,\n",
    "    to test its robustness against the error rate matrix. Iterations set to 100.\n",
    "    Utilize predict_bootstrap_2006.py to generate pickle files that store the confusion matrices.\n",
    "\n",
    "    Returns a dict with:\n",
    "        bt_rr      --> mean of the risk ratios over all iterations\n",
    "        bt_arr_rr  --> list with the risk ratio of every iteration\n",
    "        sub_avg_rr --> column-wise mean of sub_arr_rr\n",
    "        sub_arr_rr --> (iterations, 4) array; row x holds the sub_array returned by\n",
    "                       risk_ratio_bootstrap() in iteration x\n",
    "    '''\n",
    "\n",
    "    # \"iterations\" must match how many bootstrapped confusion matrices were generated.\n",
    "    # NOTE(review): the original note says the default in predict_bootstrap_2006.py is 10,\n",
    "    # while this value is 100 -- confirm that enough pickle files exist.\n",
    "    iterations = 100\n",
    "    rr_arr = []\n",
    "    sub_matrix = np.zeros((iterations, 4))\n",
    "\n",
    "    for x in range(iterations):\n",
    "        # Access each pickle file containing the confusion matrix.\n",
    "        # The path is a placeholder: it should be the pkl file of the x-th bootstrapped matrix.\n",
    "        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.\n",
    "        # The context manager closes the handle on every iteration, even on errors.\n",
    "        with open(\"...\", \"rb\") as f:\n",
    "            con_matrix = pickle.load(f)\n",
    "\n",
    "        # Normalize every row so that it sums to 1 (confusion counts --> error rates)\n",
    "        res = con_matrix / con_matrix.sum(axis=1)[:, None]\n",
    "\n",
    "        rr, sub_array = risk_ratio_bootstrap(dataframe, model1, model2, model3, res)\n",
    "        rr_arr.append(rr)\n",
    "        sub_matrix[x, :] = sub_array\n",
    "\n",
    "    # Averaged once, after every row of sub_matrix has been filled\n",
    "    sub_avg = sub_matrix.mean(axis=0)\n",
    "\n",
    "    res_dict = {\n",
    "        \"bt_rr\": sum(rr_arr) / len(rr_arr),\n",
    "        \"bt_arr_rr\": rr_arr,\n",
    "        \"sub_avg_rr\": sub_avg,\n",
    "        \"sub_arr_rr\": sub_matrix,\n",
    "    }\n",
    "\n",
    "    return res_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "2ad238fb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8995671878792302"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m1, m2, m3 = generate_models(merged_df)\n",
    "bt_rr = bootstrap(merged_df, m1, m2, m3)\n",
    "bt_rr[\"bt_rr\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "7ecd94f8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0.7708971127593199, 0.9874741092631595]"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Computing 95% interval for risk ratio while bootstrapping error rate matrix\n",
    "compute_ci_95(bt_rr[\"bt_arr_rr\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "848f9c40",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.31628614, 0.92217903, 0.06079537, 0.53900034],\n",
       "       [1.37075198, 0.92217903, 2.39388607, 1.15786023]])"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Computing 95% interval for risk ratio subgroups while\n",
    "# bootstrapping error rate matrix\n",
    "np.apply_along_axis(compute_ci_95, axis=0, arr=bt_rr[\"sub_arr_rr\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2112e2a4",
   "metadata": {},
   "source": [
    "### Combined bootstrapping for RR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "8fbf3012",
   "metadata": {},
   "outputs": [],
   "source": [
    "def combined_bootstrap_rr(dataframe, model1, model2, model3):\n",
    "    '''\n",
    "    Given a dataframe and 3 models generated from generate_models(), do combined bootstrapping to see\n",
    "    robustness of risk ratio calculations: every bootstrapped error rate matrix is paired with\n",
    "    several resamples (with replacement) of the dataframe. Iterations set to 10 * 10 = 100.\n",
    "    Utilize predict_bootstrap_2006.py to generate pickle files that store the confusion matrices.\n",
    "\n",
    "    Returns [mean risk ratio, list with the risk ratio of every iteration].\n",
    "    '''\n",
    "\n",
    "    iterations_em = 10  # number of bootstrapped error matrices\n",
    "    iterations_df = 10  # number of bootstrapped (resampled) dataframes per error matrix\n",
    "    rr_arr = []\n",
    "\n",
    "    for _ in range(iterations_em):\n",
    "        # The path is a placeholder for the pkl file of one bootstrapped confusion matrix.\n",
    "        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.\n",
    "        with open(\"...\", \"rb\") as f:\n",
    "            con_matrix = pickle.load(f)\n",
    "\n",
    "        # Normalize every row so that it sums to 1 (confusion counts --> error rates)\n",
    "        res = con_matrix / con_matrix.sum(axis=1)[:, None]\n",
    "\n",
    "        # The inner loop is bounded by iterations_df (it previously reused iterations_em)\n",
    "        for _ in range(iterations_df):\n",
    "            bt_df = dataframe.sample(frac=1, replace=True, ignore_index=True)\n",
    "            rr, _sub_array = risk_ratio_bootstrap(bt_df, model1, model2, model3, res)\n",
    "            rr_arr.append(rr)\n",
    "\n",
    "    mean_rr = sum(rr_arr) / len(rr_arr)\n",
    "    print(\"Number of calcs:\", len(rr_arr)) # == 100 based on default settings\n",
    "    print(\"Mean combined bootstrap rr:\", mean_rr)\n",
    "    return [mean_rr, rr_arr]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "174a0e18",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of calcs: 100\n",
      "Mean combined bootstrap rr: 0.8735481793257273\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[0.6752182228746604, 0.9440202895674686]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### Compute 95% interval for RR while doing combined bootstrapping\n",
    "combined_rr = combined_bootstrap_rr(merged_df, m1, m2, m3)\n",
    "compute_ci_95(combined_rr[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c6cbf51",
   "metadata": {},
   "source": [
    "### Implementing OR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "8025cedf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def odds_ratio(dataframe, model1, model2, model3):\n",
    "    '''\n",
    "    Given a pre-procesesed MIMIC + smoking proxy prediction dataframe as well as three trained models \n",
    "    from generate_models(), calculate the odds ratio as defined by: \n",
    "    causal_effect = (P(Y^{a=1}=1) * P(Y^{a=0}=0)) / (P(Y^{a=1}=0) * P(Y^{a=0}=1))\n",
    "    The assumptions of this function are:\n",
    "        1) Smoking proxy predictions are categorical\n",
    "        2) Treatment variable values are binary --> either 1 for receiving treatment or 0 \n",
    "           for not receiving treatment\n",
    "        3) Order for model inputs matter:\n",
    "            a) model1 = P(y | u*, a, c) --> y ~ u* + a + c\n",
    "            b) model2 = P(u* | a, c)\n",
    "            c) model5 = P(a | c)\n",
    "        4) Default prediction is probability of getting 1 due to how statsmodels works\n",
    "    '''\n",
    "    \n",
    "    tmp_df = None\n",
    "    unique_smoking = [1,2,3,4]\n",
    "    unique_echo = [1,0]\n",
    "    exp_array = []\n",
    "    \n",
    "    # Creating Matrix of Error Adjustments\n",
    "    confusion = [\n",
    "                    [8, 0, 2, 1],\n",
    "                    [4, 4, 3, 0],\n",
    "                    [1, 0, 14, 1],\n",
    "                    [1, 0, 1, 61]\n",
    "                ] # rows represent the ground truth labels and cols represents the predicted labels\n",
    "\n",
    "    error_mat = [\n",
    "                    [8/11, 0, 2/11, 1/11],\n",
    "                    [4/11, 4/11, 3/11, 0],\n",
    "                    [1/16, 0, 14/16, 1/16],\n",
    "                    [1/63, 0, 1/63, 61/63]\n",
    "                ] # rows represent U* and cols represent U\n",
    "    inverse = np.linalg.pinv(error_mat)\n",
    "    \n",
    "    # Getting P(A, c, y=1, u*) \n",
    "    prob_a1_c_y1_u = []\n",
    "    prob_a0_c_y1_u = []\n",
    "    for s in unique_smoking:\n",
    "        tmp_df = copy.deepcopy(dataframe)\n",
    "    \n",
    "        # Presetting the smoking status in the dataframe to be a cateogrical value in [1,2,3,4]\n",
    "        tmp_df[\"SMOKING_STATUS\"] = [s] * tmp_df.shape[0]\n",
    "        \n",
    "        for e in unique_echo:\n",
    "            tmp_tmp_df = copy.deepcopy(tmp_df)\n",
    "            tmp_tmp_df[\"echo\"] = [e] * tmp_df.shape[0]\n",
    "            \n",
    "            prob_1 = model1.predict(tmp_tmp_df)\n",
    "            prob_2 = model2.predict(tmp_tmp_df)[:][s-1]\n",
    "            prob_3 = model3.predict(tmp_tmp_df)\n",
    "            \n",
    "            \n",
    "            if e == 0:\n",
    "                output = prob_1 * prob_2 * (1 - prob_3)\n",
    "                prob_a0_c_y1_u.append(output)\n",
    "            else:\n",
    "                output = prob_1 * prob_2 * prob_3\n",
    "                prob_a1_c_y1_u.append(output)\n",
    "    \n",
    "    # Getting P(A, c, y=0, u*)\n",
    "    prob_a1_c_y0_u = []\n",
    "    prob_a0_c_y0_u = []\n",
    "    for s in unique_smoking:\n",
    "        tmp_df = copy.deepcopy(dataframe)\n",
    "    \n",
    "        # Presetting the smoking status in the dataframe to either be 1 or 0\n",
    "        tmp_df[\"SMOKING_STATUS\"] = [s] * tmp_df.shape[0]\n",
    "        \n",
    "        for e in unique_echo:\n",
    "            tmp_tmp_df = copy.deepcopy(tmp_df)\n",
    "            tmp_tmp_df[\"echo\"] = [e] * tmp_df.shape[0]\n",
    "            \n",
    "            prob_1 = 1 - model1.predict(tmp_tmp_df)\n",
    "            prob_2 = model2.predict(tmp_tmp_df)[:][s-1]\n",
    "            prob_3 = model3.predict(tmp_tmp_df)\n",
    "            \n",
    "            \n",
    "            if e == 0:\n",
    "                output = prob_1 * prob_2 * (1 - prob_3)\n",
    "                prob_a0_c_y0_u.append(output)\n",
    "            else:\n",
    "                output = prob_1 * prob_2 * prob_3\n",
    "                prob_a1_c_y0_u.append(output)\n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=0)\n",
    "    num_0a = prob_a1_c_y1_u[0] * inverse[0][0] + prob_a1_c_y1_u[1] * inverse[1][0] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][0] + prob_a1_c_y1_u[3] * inverse[3][0]\n",
    "    tmp_0a = prob_a1_c_y0_u[0] * inverse[0][0] + prob_a1_c_y0_u[1] * inverse[1][0] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][0] + prob_a1_c_y0_u[3] * inverse[3][0]\n",
    "    denom_0a = num_0a + tmp_0a\n",
    "    upper_0a = num_0a / denom_0a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=0)\n",
    "    num_0b = prob_a0_c_y1_u[0] * inverse[0][0] + prob_a0_c_y1_u[1] * inverse[1][0] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][0] + prob_a0_c_y1_u[3] * inverse[3][0]\n",
    "    tmp_0b = prob_a0_c_y0_u[0] * inverse[0][0] + prob_a0_c_y0_u[1] * inverse[1][0] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][0] + prob_a0_c_y0_u[3] * inverse[3][0]\n",
    "    denom_0b = num_0b + tmp_0b\n",
    "    lower_0b = num_0b / denom_0b\n",
    "    \n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=1)\n",
    "    num_1a = prob_a1_c_y1_u[0] * inverse[0][1] + prob_a1_c_y1_u[1] * inverse[1][1] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][1] + prob_a1_c_y1_u[3] * inverse[3][1]\n",
    "    tmp_1a = prob_a1_c_y0_u[0] * inverse[0][1] + prob_a1_c_y0_u[1] * inverse[1][1] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][1] + prob_a1_c_y0_u[3] * inverse[3][1]\n",
    "    denom_1a = num_1a + tmp_1a\n",
    "    upper_1a = num_1a / denom_1a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=1)\n",
    "    num_1b = prob_a0_c_y1_u[0] * inverse[0][1] + prob_a0_c_y1_u[1] * inverse[1][1] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][1] + prob_a0_c_y1_u[3] * inverse[3][1]\n",
    "    tmp_1b = prob_a0_c_y0_u[0] * inverse[0][1] + prob_a0_c_y0_u[1] * inverse[1][1] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][1] + prob_a0_c_y0_u[3] * inverse[3][1]\n",
    "    denom_1b = num_1b + tmp_1b\n",
    "    lower_1b = num_1b / denom_1b\n",
    "    \n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=2)\n",
    "    num_2a = prob_a1_c_y1_u[0] * inverse[0][2] + prob_a1_c_y1_u[1] * inverse[1][2] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][2] + prob_a1_c_y1_u[3] * inverse[3][2]\n",
    "    tmp_2a = prob_a1_c_y0_u[0] * inverse[0][2] + prob_a1_c_y0_u[1] * inverse[1][2] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][2] + prob_a1_c_y0_u[3] * inverse[3][2]\n",
    "    denom_2a = num_2a + tmp_2a\n",
    "    upper_2a = num_2a / denom_2a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=2)\n",
    "    num_2b = prob_a0_c_y1_u[0] * inverse[0][2] + prob_a0_c_y1_u[1] * inverse[1][2] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][2] + prob_a0_c_y1_u[3] * inverse[3][2]\n",
    "    tmp_2b = prob_a0_c_y0_u[0] * inverse[0][2] + prob_a0_c_y0_u[1] * inverse[1][2] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][2] + prob_a0_c_y0_u[3] * inverse[3][2]\n",
    "    denom_2b = num_2b + tmp_2b\n",
    "    lower_2b = num_2b / denom_2b\n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=3)\n",
    "    num_3a = prob_a1_c_y1_u[0] * inverse[0][3] + prob_a1_c_y1_u[1] * inverse[1][3] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][3] + prob_a1_c_y1_u[3] * inverse[3][3]\n",
    "    tmp_3a = prob_a1_c_y0_u[0] * inverse[0][3] + prob_a1_c_y0_u[1] * inverse[1][3] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][3] + prob_a1_c_y0_u[3] * inverse[3][3]\n",
    "    denom_3a = num_3a + tmp_3a\n",
    "    upper_3a = num_3a / denom_3a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=3)\n",
    "    num_3b = prob_a0_c_y1_u[0] * inverse[0][3] + prob_a0_c_y1_u[1] * inverse[1][3] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][3] + prob_a0_c_y1_u[3] * inverse[3][3]\n",
    "    tmp_3b = prob_a0_c_y0_u[0] * inverse[0][3] + prob_a0_c_y0_u[1] * inverse[1][3] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][3] + prob_a0_c_y0_u[3] * inverse[3][3]\n",
    "    denom_3b = num_3b + tmp_3b\n",
    "    lower_3b = num_3b / denom_3b\n",
    "    \n",
    "    # Getting P(u | c) \n",
    "    prob_u0_c = num_0a + tmp_0a + num_0b + tmp_0b\n",
    "    prob_u1_c = num_1a + tmp_1a + num_1b + tmp_1b\n",
    "    prob_u2_c = num_2a + tmp_2a + num_2b + tmp_2b\n",
    "    prob_u3_c = num_3a + tmp_3a + num_3b + tmp_3b\n",
    "    \n",
    "    numerator_a = np.sum(upper_0a * prob_u0_c) + np.sum(upper_1a * prob_u1_c) + \\\n",
    "                  np.sum(upper_2a * prob_u2_c) + np.sum(upper_3a * prob_u3_c)\n",
    "    numerator_b = np.sum((1 - lower_0b) * prob_u0_c) + np.sum((1 - lower_1b) * prob_u1_c) + \\\n",
    "                  np.sum((1 - lower_2b) * prob_u2_c) + np.sum((1 - lower_3b) * prob_u3_c)\n",
    "     \n",
    "    denominator_a = np.sum((1 - upper_0a) * prob_u0_c) + np.sum((1 - upper_1a) * prob_u1_c) + \\\n",
    "                    np.sum((1 - upper_2a) * prob_u2_c) + np.sum((1 - upper_3a) * prob_u3_c)\n",
    "    denominator_b = np.sum(lower_0b * prob_u0_c) + np.sum(lower_1b * prob_u1_c) + \\\n",
    "                    np.sum(lower_2b * prob_u2_c) + np.sum(lower_3b * prob_u3_c)\n",
    "    \n",
    "    numerator = numerator_a * numerator_b\n",
    "    denominator = denominator_a * denominator_b\n",
    "    \n",
    "    return numerator / denominator\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "ca656b14",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8864763609817276"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m1, m2, m3 = generate_models(merged_df)\n",
    "odds_ratio(merged_df, m1, m2, m3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38994b36",
   "metadata": {},
   "source": [
    "### Bootstrapping merged dataframe for OR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "48c5ee7b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8851573696534452"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def bootstrap_merged_data_or(dataframe, m1, m2, m3, iterations=100, random_state=None):\n",
    "    '''\n",
    "    Given a pre-processed MIMIC + smoking proxy prediction dataframe as well as three trained models \n",
    "    from generate_models(), perform bootstrapping by resampling the rows of the merged dataframe\n",
    "    with replacement for odds ratio calculations.\n",
    "    The assumptions of this function are:\n",
    "        1) Smoking proxy predictions are categorical\n",
    "        2) Treatment variable values are binary --> either 1 for receiving treatment or 0 \n",
    "           for not receiving treatment\n",
    "        3) Order for model inputs matter:\n",
    "            a) model1 = P(y | u*, a, c) --> y ~ u* + a + c\n",
    "            b) model2 = P(u* | a, c)\n",
    "            c) model3 = P(a | c)\n",
    "\n",
    "    Parameters:\n",
    "        iterations   --> number of bootstrap resamples (default 100, must be >= 1)\n",
    "        random_state --> optional integer seed that makes the resampling reproducible;\n",
    "                         None (default) keeps drawing from numpy's global random state\n",
    "\n",
    "    Returns [mean odds ratio, list with the odds ratio of every resample].\n",
    "    Raises ValueError when iterations < 1.\n",
    "    '''\n",
    "    if iterations < 1:\n",
    "        raise ValueError(\"iterations must be at least 1\")\n",
    "\n",
    "    # A single generator is shared by all resamples; passing the bare seed to every\n",
    "    # sample() call would produce the same resample each time.\n",
    "    rng = None if random_state is None else np.random.RandomState(random_state)\n",
    "    output = []\n",
    "\n",
    "    for _ in range(iterations):\n",
    "        bt_data = dataframe.sample(frac=1, replace=True, ignore_index=True, random_state=rng)\n",
    "        output.append(odds_ratio(bt_data, m1, m2, m3))\n",
    "\n",
    "    return [sum(output) / len(output), output]\n",
    "\n",
    "bt_merged_or = bootstrap_merged_data_or(merged_df, m1, m2, m3)\n",
    "bt_merged_or[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "9c071915",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0.8665715709298584, 0.8999256163715614]"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Compute 95% CI for OR while bootstrapping merged dataframe\n",
    "compute_ci_95(bt_merged_or[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "56844e59",
   "metadata": {},
   "source": [
    "### Bootstrapping Error Matrix for OR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "dafa3abb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def odds_ratio_bootstrap(dataframe, model1, model2, model3, error_mat):\n",
    "    '''\n",
    "    Given a pre-procesesed MIMIC + smoking proxy prediction dataframe, three trained models \n",
    "    from generate_models(), and an error rate matrix, perform bootstrapping on error-rate matrix for\n",
    "    odds ratio calculations. Helper function for bootstrap_or()\n",
    "    The assumptions of this function are:\n",
    "        1) Smoking proxy predictions are categorical\n",
    "        2) Treatment variable values are binary --> either 1 for receiving treatment or 0 \n",
    "           for not receiving treatment\n",
    "        3) Order for model inputs matter:\n",
    "            a) model1 = P(y | u*, a, c) --> y ~ u* + a + c\n",
    "            b) model2 = P(u* | a, c)\n",
    "            c) model3 = P(a | c)\n",
    "    '''\n",
    "    tmp_df = None\n",
    "    unique_smoking = [1,2,3,4]\n",
    "    unique_echo = [1,0]\n",
    "    exp_array = []\n",
    "    \n",
    "    inverse = np.linalg.pinv(error_mat)\n",
    "    \n",
    "    # Getting P(A, c, y=1, u*) \n",
    "    prob_a1_c_y1_u = []\n",
    "    prob_a0_c_y1_u = []\n",
    "    for s in unique_smoking:\n",
    "        tmp_df = copy.deepcopy(dataframe)\n",
    "    \n",
    "        # Presetting the smoking status in the dataframe to be a cateogrical value in [1,2,3,4]\n",
    "        tmp_df[\"SMOKING_STATUS\"] = [s] * tmp_df.shape[0]\n",
    "        \n",
    "        for e in unique_echo:\n",
    "            tmp_tmp_df = copy.deepcopy(tmp_df)\n",
    "            tmp_tmp_df[\"echo\"] = [e] * tmp_df.shape[0]\n",
    "            \n",
    "            prob_1 = model1.predict(tmp_tmp_df)\n",
    "            prob_2 = model2.predict(tmp_tmp_df)[:][s-1]\n",
    "            prob_3 = model3.predict(tmp_tmp_df)\n",
    "            \n",
    "            \n",
    "            if e == 0:\n",
    "                output = prob_1 * prob_2 * (1 - prob_3)\n",
    "                prob_a0_c_y1_u.append(output)\n",
    "            else:\n",
    "                output = prob_1 * prob_2 * prob_3\n",
    "                prob_a1_c_y1_u.append(output)\n",
    "    \n",
    "    # Getting P(A, c, y=0, u*)\n",
    "    prob_a1_c_y0_u = []\n",
    "    prob_a0_c_y0_u = []\n",
    "    for s in unique_smoking:\n",
    "        tmp_df = copy.deepcopy(dataframe)\n",
    "    \n",
    "        # Presetting the smoking status in the dataframe to either be 1 or 0\n",
    "        tmp_df[\"SMOKING_STATUS\"] = [s] * tmp_df.shape[0]\n",
    "        \n",
    "        for e in unique_echo:\n",
    "            tmp_tmp_df = copy.deepcopy(tmp_df)\n",
    "            tmp_tmp_df[\"echo\"] = [e] * tmp_df.shape[0]\n",
    "            \n",
    "            prob_1 = 1 - model1.predict(tmp_tmp_df)\n",
    "            prob_2 = model2.predict(tmp_tmp_df)[:][s-1]\n",
    "            prob_3 = model3.predict(tmp_tmp_df)\n",
    "            \n",
    "            \n",
    "            if e == 0:\n",
    "                output = prob_1 * prob_2 * (1 - prob_3)\n",
    "                prob_a0_c_y0_u.append(output)\n",
    "            else:\n",
    "                output = prob_1 * prob_2 * prob_3\n",
    "                prob_a1_c_y0_u.append(output)\n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=0)\n",
    "    num_0a = prob_a1_c_y1_u[0] * inverse[0][0] + prob_a1_c_y1_u[1] * inverse[1][0] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][0] + prob_a1_c_y1_u[3] * inverse[3][0]\n",
    "    tmp_0a = prob_a1_c_y0_u[0] * inverse[0][0] + prob_a1_c_y0_u[1] * inverse[1][0] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][0] + prob_a1_c_y0_u[3] * inverse[3][0]\n",
    "    denom_0a = num_0a + tmp_0a\n",
    "    upper_0a = num_0a / denom_0a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=0)\n",
    "    num_0b = prob_a0_c_y1_u[0] * inverse[0][0] + prob_a0_c_y1_u[1] * inverse[1][0] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][0] + prob_a0_c_y1_u[3] * inverse[3][0]\n",
    "    tmp_0b = prob_a0_c_y0_u[0] * inverse[0][0] + prob_a0_c_y0_u[1] * inverse[1][0] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][0] + prob_a0_c_y0_u[3] * inverse[3][0]\n",
    "    denom_0b = num_0b + tmp_0b\n",
    "    lower_0b = num_0b / denom_0b\n",
    "    \n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=1)\n",
    "    num_1a = prob_a1_c_y1_u[0] * inverse[0][1] + prob_a1_c_y1_u[1] * inverse[1][1] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][1] + prob_a1_c_y1_u[3] * inverse[3][1]\n",
    "    tmp_1a = prob_a1_c_y0_u[0] * inverse[0][1] + prob_a1_c_y0_u[1] * inverse[1][1] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][1] + prob_a1_c_y0_u[3] * inverse[3][1]\n",
    "    denom_1a = num_1a + tmp_1a\n",
    "    upper_1a = num_1a / denom_1a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=1)\n",
    "    num_1b = prob_a0_c_y1_u[0] * inverse[0][1] + prob_a0_c_y1_u[1] * inverse[1][1] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][1] + prob_a0_c_y1_u[3] * inverse[3][1]\n",
    "    tmp_1b = prob_a0_c_y0_u[0] * inverse[0][1] + prob_a0_c_y0_u[1] * inverse[1][1] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][1] + prob_a0_c_y0_u[3] * inverse[3][1]\n",
    "    denom_1b = num_1b + tmp_1b\n",
    "    lower_1b = num_1b / denom_1b\n",
    "    \n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=2)\n",
    "    num_2a = prob_a1_c_y1_u[0] * inverse[0][2] + prob_a1_c_y1_u[1] * inverse[1][2] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][2] + prob_a1_c_y1_u[3] * inverse[3][2]\n",
    "    tmp_2a = prob_a1_c_y0_u[0] * inverse[0][2] + prob_a1_c_y0_u[1] * inverse[1][2] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][2] + prob_a1_c_y0_u[3] * inverse[3][2]\n",
    "    denom_2a = num_2a + tmp_2a\n",
    "    upper_2a = num_2a / denom_2a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=2)\n",
    "    num_2b = prob_a0_c_y1_u[0] * inverse[0][2] + prob_a0_c_y1_u[1] * inverse[1][2] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][2] + prob_a0_c_y1_u[3] * inverse[3][2]\n",
    "    tmp_2b = prob_a0_c_y0_u[0] * inverse[0][2] + prob_a0_c_y0_u[1] * inverse[1][2] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][2] + prob_a0_c_y0_u[3] * inverse[3][2]\n",
    "    denom_2b = num_2b + tmp_2b\n",
    "    lower_2b = num_2b / denom_2b\n",
    "    \n",
    "    # Getting P(Y=1 | A=1, C, U=3)\n",
    "    num_3a = prob_a1_c_y1_u[0] * inverse[0][3] + prob_a1_c_y1_u[1] * inverse[1][3] + prob_a1_c_y1_u[2] * \\\n",
    "             inverse[2][3] + prob_a1_c_y1_u[3] * inverse[3][3]\n",
    "    tmp_3a = prob_a1_c_y0_u[0] * inverse[0][3] + prob_a1_c_y0_u[1] * inverse[1][3] + prob_a1_c_y0_u[2] * \\\n",
    "             inverse[2][3] + prob_a1_c_y0_u[3] * inverse[3][3]\n",
    "    denom_3a = num_3a + tmp_3a\n",
    "    upper_3a = num_3a / denom_3a\n",
    "    \n",
    "    # Getting P(Y=1 | A=0, C, U=3)\n",
    "    num_3b = prob_a0_c_y1_u[0] * inverse[0][3] + prob_a0_c_y1_u[1] * inverse[1][3] + prob_a0_c_y1_u[2] * \\\n",
    "             inverse[2][3] + prob_a0_c_y1_u[3] * inverse[3][3]\n",
    "    tmp_3b = prob_a0_c_y0_u[0] * inverse[0][3] + prob_a0_c_y0_u[1] * inverse[1][3] + prob_a0_c_y0_u[2] * \\\n",
    "             inverse[2][3] + prob_a0_c_y0_u[3] * inverse[3][3]\n",
    "    denom_3b = num_3b + tmp_3b\n",
    "    lower_3b = num_3b / denom_3b\n",
    "    \n",
    "    # Getting P(u | c) \n",
    "    prob_u0_c = num_0a + tmp_0a + num_0b + tmp_0b\n",
    "    prob_u1_c = num_1a + tmp_1a + num_1b + tmp_1b\n",
    "    prob_u2_c = num_2a + tmp_2a + num_2b + tmp_2b\n",
    "    prob_u3_c = num_3a + tmp_3a + num_3b + tmp_3b\n",
    "    \n",
    "    numerator_a = np.sum(upper_0a * prob_u0_c) + np.sum(upper_1a * prob_u1_c) + \\\n",
    "                  np.sum(upper_2a * prob_u2_c) + np.sum(upper_3a * prob_u3_c)\n",
    "    numerator_b = np.sum((1 - lower_0b) * prob_u0_c) + np.sum((1 - lower_1b) * prob_u1_c) + \\\n",
    "                  np.sum((1 - lower_2b) * prob_u2_c) + np.sum((1 - lower_3b) * prob_u3_c)\n",
    "     \n",
    "    denominator_a = np.sum((1 - upper_0a) * prob_u0_c) + np.sum((1 - upper_1a) * prob_u1_c) + \\\n",
    "                    np.sum((1 - upper_2a) * prob_u2_c) + np.sum((1 - upper_3a) * prob_u3_c)\n",
    "    denominator_b = np.sum(lower_0b * prob_u0_c) + np.sum(lower_1b * prob_u1_c) + \\\n",
    "                    np.sum(lower_2b * prob_u2_c) + np.sum(lower_3b * prob_u3_c)\n",
    "    \n",
    "    numerator = numerator_a * numerator_b\n",
    "    denominator = denominator_a * denominator_b\n",
    "    \n",
    "    return numerator / denominator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "05c1a7c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def bootstrap_or(dataframe, model1, model2, model3):\n",
    "    '''\n",
    "    Given a dataframe and 3 models generated from generate_models(), recompute the odds ratio\n",
    "    causal effect once per bootstrapped confusion matrix of the n2c2 2006 smoking testing set,\n",
    "    to test its robustness against the error rate matrix. Iterations set to 100.\n",
    "    Utilize predict_bootstrap_2006.py to generate pickle files that store the confusion matrices.\n",
    "\n",
    "    Returns [mean odds ratio, list with the odds ratio of every iteration].\n",
    "    '''\n",
    "\n",
    "    # \"iterations\" must match how many bootstrapped confusion matrices were generated.\n",
    "    # NOTE(review): the original note says the default in predict_bootstrap_2006.py is 10,\n",
    "    # while this value is 100 -- confirm that enough pickle files exist.\n",
    "    iterations = 100\n",
    "    o_r_arr = []\n",
    "    for x in range(iterations):\n",
    "        # Access each pickle file containing the confusion matrix.\n",
    "        # The path is a placeholder: it should be the pkl file of the x-th bootstrapped matrix.\n",
    "        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.\n",
    "        # The context manager closes the handle on every iteration, even on errors.\n",
    "        with open(\"...\", \"rb\") as f:\n",
    "            con_matrix = pickle.load(f)\n",
    "\n",
    "        # Normalize every row so that it sums to 1 (confusion counts --> error rates)\n",
    "        res = con_matrix / con_matrix.sum(axis=1)[:, None]\n",
    "        o_r_arr.append(odds_ratio_bootstrap(dataframe, model1, model2, model3, res))\n",
    "\n",
    "    return [sum(o_r_arr) / len(o_r_arr), o_r_arr]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "6e87266c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9196165710847475"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m1, m2, m3 = generate_models(merged_df)\n",
    "bt_or = bootstrap_or(merged_df, m1, m2, m3)\n",
    "bt_or[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "c64d8ba1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0.7734094843134075, 0.9803855037359114]"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Computing 95% interval for odds ratio while bootstrapping error rate matrix\n",
    "compute_ci_95(bt_or[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "89323240",
   "metadata": {},
   "source": [
    "### Combined Bootstrapping for OR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "acbe7639",
   "metadata": {},
   "outputs": [],
   "source": [
    "def combined_bootstrap_or(dataframe, model1, model2, model3):\n",
    "    '''\n",
    "    Given a dataframe and 3 models generated from generate_models(), do combined bootstrapping to see\n",
    "    robustness of odds ratio calculations: every bootstrapped error rate matrix is paired with\n",
    "    several resamples (with replacement) of the dataframe. Iterations set to 10 * 10 = 100.\n",
    "    Utilize predict_bootstrap_2006.py to generate pickle files that store the confusion matrices.\n",
    "\n",
    "    Returns [mean odds ratio, list with the odds ratio of every iteration].\n",
    "    '''\n",
    "\n",
    "    iterations_em = 10  # number of bootstrapped error matrices\n",
    "    iterations_df = 10  # number of bootstrapped (resampled) dataframes per error matrix\n",
    "    or_arr = []\n",
    "\n",
    "    for _ in range(iterations_em):\n",
    "        # The path is a placeholder for the pkl file of one bootstrapped confusion matrix.\n",
    "        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.\n",
    "        with open(\"...\", \"rb\") as f:\n",
    "            con_matrix = pickle.load(f)\n",
    "\n",
    "        # Normalize every row so that it sums to 1 (confusion counts --> error rates)\n",
    "        res = con_matrix / con_matrix.sum(axis=1)[:, None]\n",
    "\n",
    "        for _ in range(iterations_df):\n",
    "            bt_df = dataframe.sample(frac=1, replace=True, ignore_index=True)\n",
    "            or_arr.append(odds_ratio_bootstrap(bt_df, model1, model2, model3, res))\n",
    "\n",
    "    mean_or = sum(or_arr) / len(or_arr)\n",
    "    print(len(or_arr)) # == 100 based on default settings\n",
    "    print(\"Mean combined bootstrap or_v:\", mean_or)\n",
    "    return [mean_or, or_arr]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "ebca1d72",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100\n",
      "Mean combined bootstrap or_v: 0.883528044221483\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[0.5818542201286497, 0.955810700152887]"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### Compute 95% interval for OR while doing combined bootstrapping\n",
    "combined_or = combined_bootstrap_or(merged_df, m1, m2, m3)\n",
    "compute_ci_95(combined_or[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "227ddb8d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
