{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "30a1e642",
   "metadata": {},
   "outputs": [],
   "source": [
    "library(simex)\n",
    "library(reticulate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4eadb5c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_df <- read.csv(\"combined_mimic_smoking_status_0417.csv\")\n",
    "# head(merged_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "35002eef",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<span style=white-space:pre-wrap>'mort_28_day ~ echo + first_careunit + age + gender + weight + saps + sofa + elix_score + vent + \\n            vaso + icu_adm_weekday + icu_adm_hour + icd_chf + icd_afib + icd_renal + icd_liver + icd_copd + \\n            icd_cad + icd_stroke + icd_malignancy + vs_heart_rate_first + vs_map_first + vs_temp_first + \\n            lab_hemoglobin_first + lab_platelet_first + lab_wbc_first + lab_ph_first + lab_chloride_first + \\n            lab_sodium_first + lab_bun_first + lab_bicarbonate_first + lab_pco2_first + lab_creatinine_first + \\n            lab_potassium_first + lab_po2_first + lab_lactate_first + sedative + vs_cvp_flag + \\n            lab_creatinine_kinase_flag + lab_bnp_flag + lab_troponin_flag + SMOKING_STATUS'</span>"
      ],
      "text/latex": [
       "'mort\\_28\\_day \\textasciitilde{} echo + first\\_careunit + age + gender + weight + saps + sofa + elix\\_score + vent + \\textbackslash{}n            vaso + icu\\_adm\\_weekday + icu\\_adm\\_hour + icd\\_chf + icd\\_afib + icd\\_renal + icd\\_liver + icd\\_copd + \\textbackslash{}n            icd\\_cad + icd\\_stroke + icd\\_malignancy + vs\\_heart\\_rate\\_first + vs\\_map\\_first + vs\\_temp\\_first + \\textbackslash{}n            lab\\_hemoglobin\\_first + lab\\_platelet\\_first + lab\\_wbc\\_first + lab\\_ph\\_first + lab\\_chloride\\_first + \\textbackslash{}n            lab\\_sodium\\_first + lab\\_bun\\_first + lab\\_bicarbonate\\_first + lab\\_pco2\\_first + lab\\_creatinine\\_first + \\textbackslash{}n            lab\\_potassium\\_first + lab\\_po2\\_first + lab\\_lactate\\_first + sedative + vs\\_cvp\\_flag + \\textbackslash{}n            lab\\_creatinine\\_kinase\\_flag + lab\\_bnp\\_flag + lab\\_troponin\\_flag + SMOKING\\_STATUS'"
      ],
      "text/markdown": [
       "<span style=white-space:pre-wrap>'mort_28_day ~ echo + first_careunit + age + gender + weight + saps + sofa + elix_score + vent + \\n            vaso + icu_adm_weekday + icu_adm_hour + icd_chf + icd_afib + icd_renal + icd_liver + icd_copd + \\n            icd_cad + icd_stroke + icd_malignancy + vs_heart_rate_first + vs_map_first + vs_temp_first + \\n            lab_hemoglobin_first + lab_platelet_first + lab_wbc_first + lab_ph_first + lab_chloride_first + \\n            lab_sodium_first + lab_bun_first + lab_bicarbonate_first + lab_pco2_first + lab_creatinine_first + \\n            lab_potassium_first + lab_po2_first + lab_lactate_first + sedative + vs_cvp_flag + \\n            lab_creatinine_kinase_flag + lab_bnp_flag + lab_troponin_flag + SMOKING_STATUS'</span>"
      ],
      "text/plain": [
       "[1] \"mort_28_day ~ echo + first_careunit + age + gender + weight + saps + sofa + elix_score + vent + \\n            vaso + icu_adm_weekday + icu_adm_hour + icd_chf + icd_afib + icd_renal + icd_liver + icd_copd + \\n            icd_cad + icd_stroke + icd_malignancy + vs_heart_rate_first + vs_map_first + vs_temp_first + \\n            lab_hemoglobin_first + lab_platelet_first + lab_wbc_first + lab_ph_first + lab_chloride_first + \\n            lab_sodium_first + lab_bun_first + lab_bicarbonate_first + lab_pco2_first + lab_creatinine_first + \\n            lab_potassium_first + lab_po2_first + lab_lactate_first + sedative + vs_cvp_flag + \\n            lab_creatinine_kinase_flag + lab_bnp_flag + lab_troponin_flag + SMOKING_STATUS\""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "fml <- 'mort_28_day ~ echo + first_careunit + age + gender + weight + saps + sofa + elix_score + vent + \\\n",
    "            vaso + icu_adm_weekday + icu_adm_hour + icd_chf + icd_afib + icd_renal + icd_liver + icd_copd + \\\n",
    "            icd_cad + icd_stroke + icd_malignancy + vs_heart_rate_first + vs_map_first + vs_temp_first + \\\n",
    "            lab_hemoglobin_first + lab_platelet_first + lab_wbc_first + lab_ph_first + lab_chloride_first + \\\n",
    "            lab_sodium_first + lab_bun_first + lab_bicarbonate_first + lab_pco2_first + lab_creatinine_first + \\\n",
    "            lab_potassium_first + lab_po2_first + lab_lactate_first + sedative + vs_cvp_flag + \\\n",
    "            lab_creatinine_kinase_flag + lab_bnp_flag + lab_troponin_flag + SMOKING_STATUS'\n",
    "fml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c9aba927",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_df$SMOKING_STATUS <- as.factor(merged_df$SMOKING_STATUS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fefb7297",
   "metadata": {},
   "outputs": [],
   "source": [
    "glm_model = glm(as.formula(fml), data = merged_df, family = binomial, na.action = na.exclude)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "606f1be4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\n",
       "Call:\n",
       "glm(formula = as.formula(fml), family = binomial, data = merged_df, \n",
       "    na.action = na.exclude)\n",
       "\n",
       "Coefficients:\n",
       "                             Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)                 3.5542731  7.4974785   0.474 0.635455    \n",
       "echo                       -0.1629170  0.1284231  -1.269 0.204585    \n",
       "first_careunitSICU         -0.1714718  0.1725299  -0.994 0.320288    \n",
       "age                         0.0196017  0.0049069   3.995 6.48e-05 ***\n",
       "genderM                     0.2627325  0.1292458   2.033 0.042071 *  \n",
       "weight                     -0.0076758  0.0028308  -2.712 0.006697 ** \n",
       "saps                        0.0941357  0.0169318   5.560 2.70e-08 ***\n",
       "sofa                        0.2292723  0.0260908   8.787  < 2e-16 ***\n",
       "elix_score                  0.0016398  0.0382643   0.043 0.965819    \n",
       "vent                        0.2919105  0.2076957   1.405 0.159881    \n",
       "vaso                        0.0096973  0.1592693   0.061 0.951450    \n",
       "icu_adm_weekdaymonday       0.2885723  0.2308834   1.250 0.211350    \n",
       "icu_adm_weekdaysaturday     0.3271973  0.2339191   1.399 0.161884    \n",
       "icu_adm_weekdaysunday       0.3674316  0.2213365   1.660 0.096903 .  \n",
       "icu_adm_weekdaythursday     0.2761743  0.2178751   1.268 0.204948    \n",
       "icu_adm_weekdaytuesday      0.4834773  0.2242950   2.156 0.031119 *  \n",
       "icu_adm_weekdaywednesday    0.3732998  0.2235363   1.670 0.094924 .  \n",
       "icu_adm_hour                0.0101863  0.0083958   1.213 0.225032    \n",
       "icd_chf                     0.0179492  0.1441395   0.125 0.900898    \n",
       "icd_afib                    0.2291138  0.1447864   1.582 0.113552    \n",
       "icd_renal                  -0.0815891  0.1805888  -0.452 0.651417    \n",
       "icd_liver                   0.4885647  0.2067116   2.364 0.018103 *  \n",
       "icd_copd                    0.2186242  0.1690735   1.293 0.195986    \n",
       "icd_cad                    -0.0871206  0.1815129  -0.480 0.631249    \n",
       "icd_stroke                  1.0539960  0.2451065   4.300 1.71e-05 ***\n",
       "icd_malignancy              0.6415184  0.1420419   4.516 6.29e-06 ***\n",
       "vs_heart_rate_first         0.0106343  0.0031247   3.403 0.000666 ***\n",
       "vs_map_first               -0.0055075  0.0031051  -1.774 0.076109 .  \n",
       "vs_temp_first              -0.0264161  0.0554477  -0.476 0.633779    \n",
       "lab_hemoglobin_first       -0.0175549  0.0323383  -0.543 0.587232    \n",
       "lab_platelet_first          0.0006520  0.0005003   1.303 0.192456    \n",
       "lab_wbc_first              -0.0079274  0.0048621  -1.630 0.103009    \n",
       "lab_ph_first               -1.0481758  0.9867278  -1.062 0.288111    \n",
       "lab_chloride_first         -0.0298946  0.0201947  -1.480 0.138788    \n",
       "lab_sodium_first            0.0111143  0.0216535   0.513 0.607758    \n",
       "lab_bun_first               0.0119775  0.0032869   3.644 0.000268 ***\n",
       "lab_bicarbonate_first       0.0243049  0.0235463   1.032 0.301971    \n",
       "lab_pco2_first             -0.0118131  0.0085862  -1.376 0.168875    \n",
       "lab_creatinine_first       -0.3045481  0.0652522  -4.667 3.05e-06 ***\n",
       "lab_potassium_first         0.0745577  0.0822375   0.907 0.364610    \n",
       "lab_po2_first              -0.0008913  0.0006258  -1.424 0.154409    \n",
       "lab_lactate_first           0.1067601  0.0365808   2.918 0.003518 ** \n",
       "sedative                   -0.1557019  0.1820521  -0.855 0.392407    \n",
       "vs_cvp_flag                -0.4690783  0.1415634  -3.314 0.000921 ***\n",
       "lab_creatinine_kinase_flag -0.1289921  0.1685141  -0.765 0.443993    \n",
       "lab_bnp_flag               -0.3880811  0.3466024  -1.120 0.262853    \n",
       "lab_troponin_flag          -0.0035534  0.1753985  -0.020 0.983837    \n",
       "SMOKING_STATUS2             0.9925886  0.3830245   2.591 0.009557 ** \n",
       "SMOKING_STATUS3            -0.5717791  0.1572635  -3.636 0.000277 ***\n",
       "SMOKING_STATUS4             0.2519885  0.1486283   1.695 0.089994 .  \n",
       "---\n",
       "Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n",
       "\n",
       "(Dispersion parameter for binomial family taken to be 1)\n",
       "\n",
       "    Null deviance: 2518.3  on 1992  degrees of freedom\n",
       "Residual deviance: 1779.2  on 1943  degrees of freedom\n",
       "  (2742 observations deleted due to missingness)\n",
       "AIC: 1879.2\n",
       "\n",
       "Number of Fisher Scoring iterations: 5\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(glm_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "35f19038",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"dataframe\">\n",
       "<caption>A matrix: 4 × 4 of type dbl</caption>\n",
       "<thead>\n",
       "\t<tr><th></th><th scope=col>1</th><th scope=col>2</th><th scope=col>3</th><th scope=col>4</th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>1</th><td>0.7272727</td><td>0.3508917</td><td>0.0625</td><td>0.015873</td></tr>\n",
       "\t<tr><th scope=row>2</th><td>0.0000000</td><td>0.3545172</td><td>0.0000</td><td>0.000000</td></tr>\n",
       "\t<tr><th scope=row>3</th><td>0.1818182</td><td>0.2634643</td><td>0.8750</td><td>0.015873</td></tr>\n",
       "\t<tr><th scope=row>4</th><td>0.0909091</td><td>0.0311268</td><td>0.0625</td><td>0.968254</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "A matrix: 4 × 4 of type dbl\n",
       "\\begin{tabular}{r|llll}\n",
       "  & 1 & 2 & 3 & 4\\\\\n",
       "\\hline\n",
       "\t1 & 0.7272727 & 0.3508917 & 0.0625 & 0.015873\\\\\n",
       "\t2 & 0.0000000 & 0.3545172 & 0.0000 & 0.000000\\\\\n",
       "\t3 & 0.1818182 & 0.2634643 & 0.8750 & 0.015873\\\\\n",
       "\t4 & 0.0909091 & 0.0311268 & 0.0625 & 0.968254\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "A matrix: 4 × 4 of type dbl\n",
       "\n",
       "| <!--/--> | 1 | 2 | 3 | 4 |\n",
       "|---|---|---|---|---|\n",
       "| 1 | 0.7272727 | 0.3508917 | 0.0625 | 0.015873 |\n",
       "| 2 | 0.0000000 | 0.3545172 | 0.0000 | 0.000000 |\n",
       "| 3 | 0.1818182 | 0.2634643 | 0.8750 | 0.015873 |\n",
       "| 4 | 0.0909091 | 0.0311268 | 0.0625 | 0.968254 |\n",
       "\n"
      ],
      "text/plain": [
       "  1         2         3      4       \n",
       "1 0.7272727 0.3508917 0.0625 0.015873\n",
       "2 0.0000000 0.3545172 0.0000 0.000000\n",
       "3 0.1818182 0.2634643 0.8750 0.015873\n",
       "4 0.0909091 0.0311268 0.0625 0.968254"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "matrix_error <- matrix(c(8/11, 0, 2/11, 1/11, 4/11, 4/11, 3/11, 0, \n",
    "                         1/16, 0, 14/16, 1/16, 1/63, 0, 1/63, 61/63), nrow=4)\n",
    "matrix_error <- build.mc.matrix(matrix_error)\n",
    "dimnames(matrix_error) <- list(levels(merged_df$SMOKING_STATUS), \n",
    "                               levels(merged_df$SMOKING_STATUS))\n",
    "matrix_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d1e14af0",
   "metadata": {},
   "outputs": [],
   "source": [
    "tte_smoking_mcsimex <- mcsimex(glm_model, \n",
    "                               SIMEXvariable = \"SMOKING_STATUS\",\n",
    "                               mc.matrix=matrix_error, \n",
    "                               asymptotic = FALSE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "92f9840a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Call:\n",
       "mcsimex(model = glm_model, SIMEXvariable = \"SMOKING_STATUS\", \n",
       "    mc.matrix = matrix_error, asymptotic = FALSE)\n",
       "\n",
       "Naive model: \n",
       "glm(formula = as.formula(fml), family = binomial, data = merged_df, \n",
       "    na.action = na.exclude)\n",
       "\n",
       "Simex variable : SMOKING_STATUS \n",
       "Misclassification matrix: \n",
       "          1         2      3        4\n",
       "1 0.7272727 0.3508917 0.0625 0.015873\n",
       "2 0.0000000 0.3545172 0.0000 0.000000\n",
       "3 0.1818182 0.2634643 0.8750 0.015873\n",
       "4 0.0909091 0.0311268 0.0625 0.968254\n",
       "\n",
       "Number of iterations:  100 \n",
       "\n",
       "Residuals: \n",
       "     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. \n",
       "-0.924002 -0.208724 -0.065014  0.003324  0.175681  0.983596 \n",
       "\n",
       "Coefficients: \n",
       "\n",
       "Jackknife variance: \n",
       "                             Estimate Std. Error t value Pr(>|t|)    \n",
       "(Intercept)                 3.321e+00  7.609e+00   0.436 0.662543    \n",
       "echo                       -1.564e-01  1.301e-01  -1.202 0.229595    \n",
       "first_careunitSICU         -1.560e-01  1.748e-01  -0.892 0.372440    \n",
       "age                         2.020e-02  4.990e-03   4.048 5.37e-05 ***\n",
       "genderM                     2.394e-01  1.312e-01   1.825 0.068203 .  \n",
       "weight                     -7.977e-03  2.868e-03  -2.781 0.005468 ** \n",
       "saps                        9.782e-02  1.734e-02   5.642 1.93e-08 ***\n",
       "sofa                        2.281e-01  2.636e-02   8.653  < 2e-16 ***\n",
       "elix_score                  9.166e-03  3.878e-02   0.236 0.813200    \n",
       "vent                        2.037e-01  2.121e-01   0.961 0.336839    \n",
       "vaso                        2.656e-02  1.619e-01   0.164 0.869698    \n",
       "icu_adm_weekdaymonday       3.083e-01  2.350e-01   1.312 0.189802    \n",
       "icu_adm_weekdaysaturday     3.423e-01  2.368e-01   1.446 0.148423    \n",
       "icu_adm_weekdaysunday       3.723e-01  2.240e-01   1.662 0.096650 .  \n",
       "icu_adm_weekdaythursday     2.680e-01  2.210e-01   1.212 0.225524    \n",
       "icu_adm_weekdaytuesday      5.072e-01  2.269e-01   2.236 0.025497 *  \n",
       "icu_adm_weekdaywednesday    4.178e-01  2.267e-01   1.843 0.065428 .  \n",
       "icu_adm_hour                1.031e-02  8.488e-03   1.215 0.224576    \n",
       "icd_chf                    -2.503e-02  1.460e-01  -0.171 0.863885    \n",
       "icd_afib                    2.195e-01  1.461e-01   1.503 0.133123    \n",
       "icd_renal                  -4.639e-02  1.828e-01  -0.254 0.799690    \n",
       "icd_liver                   5.155e-01  2.086e-01   2.471 0.013561 *  \n",
       "icd_copd                    1.704e-01  1.722e-01   0.989 0.322616    \n",
       "icd_cad                    -1.080e-01  1.848e-01  -0.584 0.558985    \n",
       "icd_stroke                  1.046e+00  2.480e-01   4.218 2.58e-05 ***\n",
       "icd_malignancy              6.024e-01  1.446e-01   4.166 3.23e-05 ***\n",
       "vs_heart_rate_first         1.014e-02  3.163e-03   3.206 0.001367 ** \n",
       "vs_map_first               -5.081e-03  3.135e-03  -1.621 0.105281    \n",
       "vs_temp_first              -2.447e-02  5.620e-02  -0.435 0.663360    \n",
       "lab_hemoglobin_first       -2.507e-02  3.289e-02  -0.762 0.446154    \n",
       "lab_platelet_first          7.229e-04  5.076e-04   1.424 0.154563    \n",
       "lab_wbc_first              -8.038e-03  4.997e-03  -1.609 0.107876    \n",
       "lab_ph_first               -1.016e+00  1.002e+00  -1.014 0.310884    \n",
       "lab_chloride_first         -3.473e-02  2.052e-02  -1.692 0.090714 .  \n",
       "lab_sodium_first            1.580e-02  2.191e-02   0.721 0.470870    \n",
       "lab_bun_first               1.090e-02  3.332e-03   3.271 0.001092 ** \n",
       "lab_bicarbonate_first       1.939e-02  2.391e-02   0.811 0.417361    \n",
       "lab_pco2_first             -1.239e-02  8.714e-03  -1.421 0.155342    \n",
       "lab_creatinine_first       -2.967e-01  6.566e-02  -4.518 6.60e-06 ***\n",
       "lab_potassium_first         6.392e-02  8.335e-02   0.767 0.443258    \n",
       "lab_po2_first              -8.621e-04  6.311e-04  -1.366 0.172099    \n",
       "lab_lactate_first           9.990e-02  3.725e-02   2.682 0.007378 ** \n",
       "sedative                   -1.204e-01  1.858e-01  -0.648 0.517161    \n",
       "vs_cvp_flag                -4.611e-01  1.432e-01  -3.219 0.001305 ** \n",
       "lab_creatinine_kinase_flag -1.576e-01  1.708e-01  -0.923 0.356276    \n",
       "lab_bnp_flag               -3.961e-01  3.498e-01  -1.132 0.257642    \n",
       "lab_troponin_flag           3.658e-02  1.777e-01   0.206 0.836976    \n",
       "SMOKING_STATUS2             2.146e+00  1.309e+02   0.016 0.986923    \n",
       "SMOKING_STATUS3            -8.393e-01  2.192e-01  -3.828 0.000133 ***\n",
       "SMOKING_STATUS4             3.875e-01  1.715e-01   2.259 0.023992 *  \n",
       "---\n",
       "Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(tte_smoking_mcsimex)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62c9119d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot(tte_smoking_mcsimex)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7652ce27",
   "metadata": {},
   "source": [
    "### Calculating Risk Ratio using MC-SIMEX"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "c136ca66",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_df_0 <- read.csv(\"combined_mimic_smoking_status_0417.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "7602fb17",
   "metadata": {},
   "outputs": [],
   "source": [
    "data <- replace(merged_df_0[\"echo\"], merged_df_0[\"echo\"]>0, 0) \n",
    "# print(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "c3cc45fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_df_0[\"echo\"] <- data\n",
    "merged_df_0$SMOKING_STATUS <- as.factor(merged_df_0$SMOKING_STATUS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "540e4e7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions_0 = predict(tte_smoking_mcsimex, merged_df_0, type=\"response\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "67135762",
   "metadata": {},
   "outputs": [],
   "source": [
    "p0_total = sum(predictions_0, na.rm=T)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "310ecb5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_df_1 <- read.csv(\"combined_mimic_smoking_status_0417.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "9be4bf38",
   "metadata": {},
   "outputs": [],
   "source": [
    "data <- replace(merged_df_1[\"echo\"], merged_df_1[\"echo\"]>-1, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "3ff3a800",
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_df_1[\"echo\"] <- data\n",
    "merged_df_1$SMOKING_STATUS <- as.factor(merged_df_1$SMOKING_STATUS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "d0a13bcc",
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions_1 = predict(tte_smoking_mcsimex, merged_df_1, type=\"response\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "a639297c",
   "metadata": {},
   "outputs": [],
   "source": [
    "p1_total = sum(predictions_1, na.rm=T)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "f9ac36bb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] 0.9343328\n"
     ]
    }
   ],
   "source": [
    "rr = p1_total / p0_total\n",
    "print(rr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "afb72831",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] 0.930508\n",
      "[1] 0.9677904\n",
      "[1] 0.9114576\n",
      "[1] 0.9443506\n"
     ]
    }
   ],
   "source": [
    "merged_df_0$predictions <- predictions_0\n",
    "merged_df_1$predictions <- predictions_1\n",
    "\n",
    "status_1a <- merged_df_0[merged_df_0$SMOKING_STATUS==1,]\n",
    "status_1a_total = sum(status_1a$predictions, na.rm=T)\n",
    "status_1b <- merged_df_1[merged_df_1$SMOKING_STATUS==1,]\n",
    "status_1b_total = sum(status_1b$predictions, na.rm=T)\n",
    "rr_1 <- status_1b_total / status_1a_total\n",
    "print(rr_1)\n",
    "    \n",
    "status_2a <- merged_df_0[merged_df_0$SMOKING_STATUS==2,]\n",
    "status_2a_total = sum(status_2a$predictions, na.rm=T)\n",
    "status_2b <- merged_df_1[merged_df_1$SMOKING_STATUS==2,]\n",
    "status_2b_total = sum(status_2b$predictions, na.rm=T)\n",
    "rr_2 <- status_2b_total / status_2a_total\n",
    "print(rr_2)\n",
    "    \n",
    "status_3a <- merged_df_0[merged_df_0$SMOKING_STATUS==3,]\n",
    "status_3a_total = sum(status_3a$predictions, na.rm=T)\n",
    "status_3b <- merged_df_1[merged_df_1$SMOKING_STATUS==3,]\n",
    "status_3b_total = sum(status_3b$predictions, na.rm=T)\n",
    "rr_3 <- status_3b_total / status_3a_total\n",
    "print(rr_3)\n",
    "    \n",
    "status_4a <- merged_df_0[merged_df_0$SMOKING_STATUS==4,]\n",
    "status_4a_total = sum(status_4a$predictions, na.rm=T)\n",
    "status_4b <- merged_df_1[merged_df_1$SMOKING_STATUS==4,]\n",
    "status_4b_total = sum(status_4b$predictions, na.rm=T)\n",
    "rr_4 <- status_4b_total / status_4a_total\n",
    "print(rr_4)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60c67d90",
   "metadata": {},
   "source": [
    "### Calculating Odds Ratio using MC-SIMEX"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "03fd7c3a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] 0.9043839\n"
     ]
    }
   ],
   "source": [
    "# (p(M=1 | E=1) * P(M=0 | E=0)) / (p(M=0 | E = 1) * p(M=1 | E=0))\n",
    "predictions_0b <- 1 - predictions_0\n",
    "p0b_total <- sum(predictions_0b, na.rm=T)\n",
    "\n",
    "predictions_1b <- 1 - predictions_1\n",
    "p1b_total <- sum(predictions_1b, na.rm=T)\n",
    "\n",
    "or <- (p1_total * p0b_total) / (p1b_total * p0_total)\n",
    "print(or)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8073ea7e",
   "metadata": {},
   "source": [
    "### Bootstrapping Risk Ratio Via Error Rate Matrices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "db61a048",
   "metadata": {},
   "outputs": [],
   "source": [
    "require(\"reticulate\")\n",
    "source_python(\"matrix_script_reader.py\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "c47bf5f1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] 0\n",
      "[1] 0.9316413\n",
      "[1] 1\n",
      "[1] 0.9343548\n",
      "[1] 2\n",
      "[1] 0.9320158\n",
      "[1] 3\n",
      "[1] 0.933391\n",
      "[1] 4\n",
      "[1] 0.9366119\n",
      "[1] 5\n",
      "[1] 0.9303805\n",
      "[1] 6\n",
      "[1] 0.9303805\n",
      "[1] 7\n",
      "[1] 0.9325448\n",
      "[1] 8\n",
      "[1] 0.9311083\n",
      "[1] 9\n",
      "[1] 0.9312361\n"
     ]
    }
   ],
   "source": [
    "rr_arr <- list()\n",
    "status_1_rr_arr <- list()\n",
    "status_2_rr_arr <- list()\n",
    "status_3_rr_arr <- list()\n",
    "status_4_rr_arr <- list()\n",
    "\n",
    "for (x in 0:9){\n",
    "    print(x)\n",
    "    tmp_m_error <- matrix_script_reader(paste(\"INSERT FILE PATH\",\n",
    "                                              as.character(x),\".pkl\", \n",
    "                                              sep=''))\n",
    "    tmp_m_error <- build.mc.matrix(tmp_m_error, method=\"log\")\n",
    "    dimnames(tmp_m_error) <- list(levels(merged_df$SMOKING_STATUS), levels(merged_df$SMOKING_STATUS))\n",
    "    \n",
    "    tryCatch({boot_mcsimex <- mcsimex(glm_model, \n",
    "                                      SIMEXvariable = \"SMOKING_STATUS\", \n",
    "                                      mc.matrix=tmp_m_error, \n",
    "                                      asymptotic = FALSE)}\n",
    "            , error = function(e) {tmp_m_error <- build.mc.matrix(tmp_m_error, method=\"jlt\"); \n",
    "                                   boot_mcsimex <- mcsimex(glm_model, \n",
    "                                                           SIMEXvariable = \"SMOKING_STATUS\", \n",
    "                                                           mc.matrix=tmp_m_error, \n",
    "                                                           asymptotic = FALSE)} )\n",
    "    \n",
    "    merged_df_0 <- read.csv(\"combined_mimic_smoking_status_0417.csv\")\n",
    "    data_0 <- replace(merged_df_0[\"echo\"], merged_df_0[\"echo\"]>0, 0) \n",
    "    merged_df_0[\"echo\"] <- data_0\n",
    "    merged_df_0$SMOKING_STATUS <- as.factor(merged_df_0$SMOKING_STATUS)\n",
    "    predictions_0 = predict(boot_mcsimex, merged_df_0, type=\"response\")\n",
    "    p0_total = sum(predictions_0, na.rm=T)\n",
    "    \n",
    "    merged_df_1 <- read.csv(\"combined_mimic_smoking_status_0417.csv\")\n",
    "    data_1 <- replace(merged_df_1[\"echo\"], merged_df_1[\"echo\"]>-1, 1)\n",
    "    merged_df_1[\"echo\"] <- data_1\n",
    "    merged_df_1$SMOKING_STATUS <- as.factor(merged_df_1$SMOKING_STATUS)\n",
    "    predictions_1 = predict(boot_mcsimex, merged_df_1, type=\"response\")\n",
    "    p1_total = sum(predictions_1, na.rm=T)\n",
    "    \n",
    "    merged_df_0$predictions <- predictions_0\n",
    "    merged_df_1$predictions <- predictions_1\n",
    "    \n",
    "    status_1a <- merged_df_0[merged_df_0$SMOKING_STATUS==1,]\n",
    "    status_1a_total = sum(status_1a$predictions, na.rm=T)\n",
    "    status_1b <- merged_df_1[merged_df_1$SMOKING_STATUS==1,]\n",
    "    status_1b_total = sum(status_1b$predictions, na.rm=T)\n",
    "    status_1_rr_arr <- append(status_1_rr_arr, status_1b_total / status_1a_total)\n",
    "    \n",
    "    status_2a <- merged_df_0[merged_df_0$SMOKING_STATUS==2,]\n",
    "    status_2a_total = sum(status_2a$predictions, na.rm=T)\n",
    "    status_2b <- merged_df_1[merged_df_1$SMOKING_STATUS==2,]\n",
    "    status_2b_total = sum(status_2b$predictions, na.rm=T)\n",
    "    status_2_rr_arr <- append(status_2_rr_arr, status_2b_total / status_2a_total)\n",
    "    \n",
    "    status_3a <- merged_df_0[merged_df_0$SMOKING_STATUS==3,]\n",
    "    status_3a_total = sum(status_3a$predictions, na.rm=T)\n",
    "    status_3b <- merged_df_1[merged_df_1$SMOKING_STATUS==3,]\n",
    "    status_3b_total = sum(status_3b$predictions, na.rm=T)\n",
    "    status_3_rr_arr <- append(status_3_rr_arr, status_3b_total / status_3a_total)\n",
    "    \n",
    "    status_4a <- merged_df_0[merged_df_0$SMOKING_STATUS==4,]\n",
    "    status_4a_total = sum(status_4a$predictions, na.rm=T)\n",
    "    status_4b <- merged_df_1[merged_df_1$SMOKING_STATUS==4,]\n",
    "    status_4b_total = sum(status_4b$predictions, na.rm=T)\n",
    "    status_4_rr_arr <- append(status_4_rr_arr, status_4b_total / status_4a_total)\n",
    "    \n",
    "    rr = p1_total / p0_total\n",
    "    rr_arr <- append(rr_arr, rr)\n",
    "    print(rr)\n",
    "    \n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "a81d427e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.930380528522845</dd><dt>97.5%</dt><dd>0.936104072030073</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.930380528522845\n",
       "\\item[97.5\\textbackslash{}\\%] 0.936104072030073\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.93038052852284597.5%\n",
       ":   0.936104072030073\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.9303805 0.9361041 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "quantile(unlist(rr_arr), c(0.025, 0.975))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "3e4eeb76",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.927214696837548</dd><dt>97.5%</dt><dd>0.93264259624629</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.927214696837548\n",
       "\\item[97.5\\textbackslash{}\\%] 0.93264259624629\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.92721469683754897.5%\n",
       ":   0.93264259624629\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.9272147 0.9326426 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Smoking Status 1 CI\n",
    "quantile(unlist(status_1_rr_arr), c(0.025, 0.975))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "ab135750",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.975513648123908</dd><dt>97.5%</dt><dd>0.981191747421394</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.975513648123908\n",
       "\\item[97.5\\textbackslash{}\\%] 0.981191747421394\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.97551364812390897.5%\n",
       ":   0.981191747421394\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.9755136 0.9811917 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Smoking Status 2 CI\n",
    "quantile(unlist(status_2_rr_arr), c(0.025, 0.975))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "ef588036",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.908143720387407</dd><dt>97.5%</dt><dd>0.913195408727953</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.908143720387407\n",
       "\\item[97.5\\textbackslash{}\\%] 0.913195408727953\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.90814372038740797.5%\n",
       ":   0.913195408727953\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.9081437 0.9131954 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Smoking Status 3 CI\n",
    "quantile(unlist(status_3_rr_arr), c(0.025, 0.975))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "bfbce83e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.938843348442111</dd><dt>97.5%</dt><dd>0.944035069933909</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.938843348442111\n",
       "\\item[97.5\\textbackslash{}\\%] 0.944035069933909\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.93884334844211197.5%\n",
       ":   0.944035069933909\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.9388433 0.9440351 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Smoking Status 4 CI\n",
    "quantile(unlist(status_4_rr_arr), c(0.025, 0.975))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13cc7a52",
   "metadata": {},
   "source": [
    "### Bootstrapping Risk Ratio via Sampling Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "b8da0d8d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] 0.9177163\n",
      "[1] 0.9192531\n",
      "[1] 0.918995\n",
      "[1] 0.8905925\n",
      "[1] 0.9624975\n",
      "[1] 0.9466161\n",
      "[1] 0.903054\n",
      "[1] 0.9608147\n",
      "[1] 0.9609693\n",
      "[1] 0.9609696\n"
     ]
    }
   ],
   "source": [
    "rr_df_arr <- list()\n",
    "status_1_rr_df_arr <- list()\n",
    "status_2_rr_df_arr <- list()\n",
    "status_3_rr_df_arr <- list()\n",
    "status_4_rr_df_arr <- list()\n",
    "for (x in 0:9){\n",
    "\n",
    "    sampled_df <- merged_df[sample(nrow(merged_df), size=nrow(merged_df), replace=TRUE), ]\n",
    "    sampled_df$SMOKING_STATUS <- as.factor(sampled_df$SMOKING_STATUS)\n",
    "    sampled_glm_model <- glm(as.formula(fml), \n",
    "                             data = sampled_df, \n",
    "                             family = binomial, \n",
    "                             na.action = na.exclude)\n",
    "    \n",
    "    matrix_error <- matrix(c(8/11, 0, 2/11, 1/11, 4/11, 4/11, 3/11, \n",
    "                             0, 1/16, 0, 14/16, 1/16, 1/63, 0, 1/63, \n",
    "                             61/63), nrow=4)\n",
    "    matrix_error <- build.mc.matrix(matrix_error)\n",
    "    dimnames(matrix_error) <- list(levels(merged_df$SMOKING_STATUS), levels(merged_df$SMOKING_STATUS))\n",
    "    \n",
    "    tryCatch({sampled_mc_simex_model <- mcsimex(sampled_glm_model, \n",
    "                                                SIMEXvariable = \"SMOKING_STATUS\", \n",
    "                                                mc.matrix=matrix_error, \n",
    "                                                asymptotic = FALSE)}\n",
    "            , error = function(e) {sampled_df <- merged_df[sample(nrow(merged_df), \n",
    "                                                                  size=nrow(merged_df)-1, \n",
    "                                                                  replace=TRUE), ]; \n",
    "                                   sampled_df$SMOKING_STATUS <- as.factor(sampled_df$SMOKING_STATUS)\n",
    "                                   sampled_glm_model <- glm(as.formula(fml), \n",
    "                                                            data = sampled_df, \n",
    "                                                            family = binomial, \n",
    "                                                            na.action = na.exclude); \n",
    "                                   sampled_mc_simex_model <- mcsimex(sampled_glm_model, \n",
    "                                                                     SIMEXvariable = \"SMOKING_STATUS\", \n",
    "                                                                     mc.matrix=matrix_error, \n",
    "                                                                     asymptotic = FALSE)} )\n",
    "    \n",
    "    sampled_df_0 <- sampled_df\n",
    "    sampled_data_0 <- replace(sampled_df_0[\"echo\"], sampled_df_0[\"echo\"]>0, 0) \n",
    "    sampled_df_0[\"echo\"] <- sampled_data_0\n",
    "    sampled_df_0$SMOKING_STATUS <- as.factor(sampled_df_0$SMOKING_STATUS)\n",
    "    sampled_predictions_0 = predict(sampled_mc_simex_model, sampled_df_0, type=\"response\")\n",
    "    sampled_p0_total = sum(sampled_predictions_0, na.rm=T)\n",
    "    \n",
    "    sampled_df_1 <- sampled_df\n",
    "    sampled_data_1 <- replace(sampled_df_1[\"echo\"], sampled_df_1[\"echo\"]>-1, 1)\n",
    "    sampled_df_1[\"echo\"] <- sampled_data_1\n",
    "    sampled_df_1$SMOKING_STATUS <- as.factor(sampled_df_1$SMOKING_STATUS)\n",
    "    sampled_predictions_1 = predict(sampled_mc_simex_model, sampled_df_1, type=\"response\")\n",
    "    sampled_p1_total = sum(sampled_predictions_1, na.rm=T)\n",
    "    \n",
    "    sampled_df_0$predictions <- sampled_predictions_0\n",
    "    sampled_df_1$predictions <- sampled_predictions_1\n",
    "    \n",
    "    status_1a <- sampled_df_0[sampled_df_0$SMOKING_STATUS==1,]\n",
    "    status_1a_total = sum(status_1a$predictions, na.rm=T)\n",
    "    status_1b <- sampled_df_1[sampled_df_1$SMOKING_STATUS==1,]\n",
    "    status_1b_total = sum(status_1b$predictions, na.rm=T)\n",
    "    status_1_rr_df_arr <- append(status_1_rr_df_arr, status_1b_total / status_1a_total)\n",
    "    \n",
    "    status_2a <- sampled_df_0[sampled_df_0$SMOKING_STATUS==2,]\n",
    "    status_2a_total = sum(status_2a$predictions, na.rm=T)\n",
    "    status_2b <- sampled_df_1[sampled_df_1$SMOKING_STATUS==2,]\n",
    "    status_2b_total = sum(status_2b$predictions, na.rm=T)\n",
    "    status_2_rr_df_arr <- append(status_2_rr_df_arr, status_2b_total / status_2a_total)\n",
    "    \n",
    "    status_3a <- sampled_df_0[sampled_df_0$SMOKING_STATUS==3,]\n",
    "    status_3a_total = sum(status_3a$predictions, na.rm=T)\n",
    "    status_3b <- sampled_df_1[sampled_df_1$SMOKING_STATUS==3,]\n",
    "    status_3b_total = sum(status_3b$predictions, na.rm=T)\n",
    "    status_3_rr_df_arr <- append(status_3_rr_df_arr, status_3b_total / status_3a_total)\n",
    "    \n",
    "    status_4a <- sampled_df_0[sampled_df_0$SMOKING_STATUS==4,]\n",
    "    status_4a_total = sum(status_4a$predictions, na.rm=T)\n",
    "    status_4b <- sampled_df_1[sampled_df_1$SMOKING_STATUS==4,]\n",
    "    status_4b_total = sum(status_4b$predictions, na.rm=T)\n",
    "    status_4_rr_df_arr <- append(status_4_rr_df_arr, status_4b_total / status_4a_total)\n",
    "\n",
    "    sample_rr = sampled_p1_total / sampled_p0_total\n",
    "    rr_df_arr <- append(rr_df_arr, sample_rr)\n",
    "    print(sample_rr)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "d858a784",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.893396353104605</dd><dt>97.5%</dt><dd>0.96215376151139</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.893396353104605\n",
       "\\item[97.5\\textbackslash{}\\%] 0.96215376151139\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.89339635310460597.5%\n",
       ":   0.96215376151139\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.8933964 0.9621538 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "quantile(unlist(rr_df_arr), c(.025, 0.975))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "ee400f2e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.889599843587598</dd><dt>97.5%</dt><dd>0.960854152636325</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.889599843587598\n",
       "\\item[97.5\\textbackslash{}\\%] 0.960854152636325\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.88959984358759897.5%\n",
       ":   0.960854152636325\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.8895998 0.9608542 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Smoking Status 1 CI\n",
    "quantile(unlist(status_1_rr_df_arr), c(0.025, 0.975))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "a04b4808",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.909412380790716</dd><dt>97.5%</dt><dd>0.973342196388496</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.909412380790716\n",
       "\\item[97.5\\textbackslash{}\\%] 0.973342196388496\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.90941238079071697.5%\n",
       ":   0.973342196388496\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.9094124 0.9733422 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Smoking Status 2 CI\n",
    "quantile(unlist(status_2_rr_df_arr), c(0.025, 0.975))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "235f247b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.842802419395224</dd><dt>97.5%</dt><dd>0.95155216399804</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.842802419395224\n",
       "\\item[97.5\\textbackslash{}\\%] 0.95155216399804\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.84280241939522497.5%\n",
       ":   0.95155216399804\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.8428024 0.9515522 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Smoking Status 3 CI\n",
    "quantile(unlist(status_3_rr_df_arr), c(0.025, 0.975))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "fa7bc68e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.905982992335071</dd><dt>97.5%</dt><dd>0.970227253814938</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.905982992335071\n",
       "\\item[97.5\\textbackslash{}\\%] 0.970227253814938\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.90598299233507197.5%\n",
       ":   0.970227253814938\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.9059830 0.9702273 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Smoking Status 4 CI\n",
    "quantile(unlist(status_4_rr_df_arr), c(0.025, 0.975))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "46139064",
   "metadata": {},
   "source": [
    "### Bootstrapping Risk Ratio combining both strategies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "735ba20c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] 0.8498759\n",
      "[1] 0.8547966\n",
      "[1] 0.8494987\n",
      "[1] 0.8505417\n",
      "[1] 0.857892\n",
      "[1] 0.8506836\n",
      "[1] 0.8506836\n",
      "[1] 0.8522792\n",
      "[1] 0.8505662\n",
      "[1] 0.8507709\n",
      "[1] 0.8829385\n",
      "[1] 0.8815192\n",
      "[1] 0.8804538\n",
      "[1] 0.881803\n",
      "[1] 0.8782177\n",
      "[1] 0.8834176\n",
      "[1] 0.8834176\n",
      "[1] 0.8824677\n",
      "[1] 0.8830197\n",
      "[1] 0.8812688\n",
      "[1] 0.9178056\n",
      "[1] 0.9169759\n",
      "[1] 0.9148318\n",
      "[1] 0.9172119\n",
      "[1] 0.9174504\n",
      "[1] 0.9179122\n",
      "[1] 0.9179122\n",
      "[1] 0.9168176\n",
      "[1] 0.9168748\n",
      "[1] 0.9166274\n",
      "[1] 0.8943567\n",
      "[1] 0.8936341\n",
      "[1] 0.8909847\n",
      "[1] 0.8910933\n",
      "[1] 0.8968332\n",
      "[1] 0.8923147\n",
      "[1] 0.8923147\n",
      "[1] 0.8946779\n",
      "[1] 0.8939735\n",
      "[1] 0.8914655\n",
      "[1] 0.9099752\n",
      "[1] 0.9074819\n",
      "[1] 0.9072579\n",
      "[1] 0.9080405\n",
      "[1] 0.9102136\n",
      "[1] 0.9089579\n",
      "[1] 0.9089579\n",
      "[1] 0.9092853\n",
      "[1] 0.9083634\n",
      "[1] 0.909059\n",
      "[1] 0.9191958\n",
      "[1] 0.9199039\n",
      "[1] 0.9164768\n",
      "[1] 0.9179599\n",
      "[1] 0.9192645\n",
      "[1] 0.918949\n",
      "[1] 0.918949\n",
      "[1] 0.9189645\n",
      "[1] 0.9166585\n",
      "[1] 0.918716\n",
      "[1] 1.009724\n",
      "[1] 1.014891\n",
      "[1] 1.010643\n",
      "[1] 1.012132\n",
      "[1] 1.017634\n",
      "[1] 1.007861\n",
      "[1] 1.007861\n",
      "[1] 1.008069\n",
      "[1] 1.00921\n",
      "[1] 1.008941\n",
      "[1] 0.9159847\n",
      "[1] 0.9121385\n",
      "[1] 0.9114735\n",
      "[1] 0.912814\n",
      "[1] 0.9151667\n",
      "[1] 0.9157711\n",
      "[1] 0.9157711\n",
      "[1] 0.9140224\n",
      "[1] 0.9118952\n",
      "[1] 0.9105027\n",
      "[1] 0.99289\n",
      "[1] 0.9981164\n",
      "[1] 0.9987793\n",
      "[1] 0.9982318\n",
      "[1] 1.000453\n",
      "[1] 0.9893768\n",
      "[1] 0.9893768\n",
      "[1] 0.9902771\n",
      "[1] 0.9966094\n",
      "[1] 0.9974126\n",
      "[1] 0.9814112\n",
      "[1] 0.9830389\n",
      "[1] 0.9864121\n",
      "[1] 0.9852063\n",
      "[1] 0.9791975\n",
      "[1] 0.9820864\n",
      "[1] 0.9820864\n",
      "[1] 0.9840233\n",
      "[1] 0.9863468\n",
      "[1] 0.9857723\n"
     ]
    }
   ],
   "source": [
    "rr_combined_arr <- list()\n",
    "for (x in 0:9){\n",
    "\n",
    "    sampled_df <- merged_df[sample(nrow(merged_df), size=nrow(merged_df), replace=TRUE), ]\n",
    "    sampled_df$SMOKING_STATUS <- as.factor(sampled_df$SMOKING_STATUS)\n",
    "    sampled_glm_model <- glm(as.formula(fml), data = sampled_df, family = binomial, na.action = na.exclude)\n",
    "    \n",
    "    \n",
    "    for (y in 0:9){\n",
    "        \n",
    "        tmp_m_error <- matrix_script_reader(paste(\"INSERT FILE PATH\",\n",
    "                                                  as.character(y),\".pkl\", \n",
    "                                                  sep=''))\n",
    "        tmp_m_error <- build.mc.matrix(tmp_m_error, method=\"log\")\n",
    "        dimnames(tmp_m_error) <- list(levels(sampled_df$SMOKING_STATUS), levels(sampled_df$SMOKING_STATUS))\n",
    "        \n",
    "        tryCatch({sampled_mc_simex_model <- mcsimex(sampled_glm_model, \n",
    "                                                    SIMEXvariable = \"SMOKING_STATUS\", \n",
    "                                                    mc.matrix=tmp_m_error, \n",
    "                                                    asymptotic = FALSE)}\n",
    "            , error = function(e) {sampled_df <- merged_df[sample(nrow(merged_df), \n",
    "                                                                  size=nrow(merged_df)-1, \n",
    "                                                                  replace=TRUE), ]; \n",
    "                                   sampled_df$SMOKING_STATUS <- as.factor(sampled_df$SMOKING_STATUS)\n",
    "                                   sampled_glm_model <- glm(as.formula(fml), \n",
    "                                                            data = sampled_df, \n",
    "                                                            family = binomial, \n",
    "                                                            na.action = na.exclude);\n",
    "                                   tmp_m_error <- build.mc.matrix(tmp_m_error, method=\"jlt\");\n",
    "                                   sampled_mc_simex_model <- mcsimex(sampled_glm_model, \n",
    "                                                                     SIMEXvariable = \"SMOKING_STATUS\", \n",
    "                                                                     mc.matrix=tmp_m_error, \n",
    "                                                                     asymptotic = FALSE)} )\n",
    "        sampled_df_0 <- sampled_df\n",
    "        sampled_data_0 <- replace(sampled_df_0[\"echo\"], sampled_df_0[\"echo\"]>0, 0) \n",
    "        sampled_df_0[\"echo\"] <- sampled_data_0\n",
    "        sampled_df_0$SMOKING_STATUS <- as.factor(sampled_df_0$SMOKING_STATUS)\n",
    "        sampled_predictions_0 = predict(sampled_mc_simex_model, sampled_df_0, type=\"response\")\n",
    "        sampled_p0_total = sum(sampled_predictions_0, na.rm=T)\n",
    "\n",
    "        sampled_df_1 <- sampled_df\n",
    "        sampled_data_1 <- replace(sampled_df_1[\"echo\"], sampled_df_1[\"echo\"]>-1, 1)\n",
    "        sampled_df_1[\"echo\"] <- sampled_data_1\n",
    "        sampled_df_1$SMOKING_STATUS <- as.factor(sampled_df_1$SMOKING_STATUS)\n",
    "        sampled_predictions_1 = predict(sampled_mc_simex_model, sampled_df_1, type=\"response\")\n",
    "        sampled_p1_total = sum(sampled_predictions_1, na.rm=T)\n",
    "\n",
    "        sample_rr = sampled_p1_total / sampled_p0_total\n",
    "        rr_combined_arr <- append(rr_combined_arr, sample_rr)\n",
    "        print(sample_rr)\n",
    "           \n",
    "    }\n",
    "    \n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "eec0097a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>0%</dt><dd>0.849498655836156</dd><dt>25%</dt><dd>0.893304214395079</dd><dt>50%</dt><dd>0.915877892454088</dd><dt>75%</dt><dd>0.983284973521591</dd><dt>100%</dt><dd>1.01763420731108</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[0\\textbackslash{}\\%] 0.849498655836156\n",
       "\\item[25\\textbackslash{}\\%] 0.893304214395079\n",
       "\\item[50\\textbackslash{}\\%] 0.915877892454088\n",
       "\\item[75\\textbackslash{}\\%] 0.983284973521591\n",
       "\\item[100\\textbackslash{}\\%] 1.01763420731108\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "0%\n",
       ":   0.84949865583615625%\n",
       ":   0.89330421439507950%\n",
       ":   0.91587789245408875%\n",
       ":   0.983284973521591100%\n",
       ":   1.01763420731108\n",
       "\n"
      ],
      "text/plain": [
       "       0%       25%       50%       75%      100% \n",
       "0.8494987 0.8933042 0.9158779 0.9832850 1.0176342 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "quantile(unlist(rr_combined_arr))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "da95d674",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.850553342088739</dd><dt>97.5%</dt><dd>1.01142506789169</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.850553342088739\n",
       "\\item[97.5\\textbackslash{}\\%] 1.01142506789169\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.85055334208873997.5%\n",
       ":   1.01142506789169\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.8505533 1.0114251 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "quantile(unlist(rr_combined_arr), c(0.025, 0.975))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09f604e4",
   "metadata": {},
   "source": [
    "### Bootstrapping Odds Ratio via Error Rate Matrices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "6d00bbb8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] 0\n",
      "[1] 0.9013468\n",
      "[1] 1\n",
      "[1] 0.9044345\n",
      "[1] 2\n",
      "[1] 0.8999322\n",
      "[1] 3\n",
      "[1] 0.9024599\n",
      "[1] 4\n",
      "[1] 0.9085702\n",
      "[1] 5\n",
      "[1] 0.8986585\n",
      "[1] 6\n",
      "[1] 0.8986585\n",
      "[1] 7\n",
      "[1] 0.898196\n",
      "[1] 8\n",
      "[1] 0.8997136\n",
      "[1] 9\n",
      "[1] 0.8980627\n"
     ]
    }
   ],
   "source": [
    "or_arr <- list()\n",
    "for (x in 0:9){\n",
    "    print(x)\n",
    "    tmp_m_error <- matrix_script_reader(paste(\"INSERT FILE PATH\",\n",
    "                                              as.character(x),\".pkl\", \n",
    "                                              sep=''))\n",
    "    tmp_m_error <- build.mc.matrix(tmp_m_error, method=\"log\")\n",
    "    dimnames(tmp_m_error) <- list(levels(merged_df$SMOKING_STATUS), levels(merged_df$SMOKING_STATUS))\n",
    "    \n",
    "    tryCatch({boot_mcsimex <- mcsimex(glm_model, \n",
    "                                      SIMEXvariable = \"SMOKING_STATUS\", \n",
    "                                      mc.matrix=tmp_m_error, \n",
    "                                      asymptotic = FALSE)}\n",
    "            , error = function(e) {tmp_m_error <- build.mc.matrix(tmp_m_error, method=\"jlt\"); \n",
    "                                   boot_mcsimex <- mcsimex(glm_model, \n",
    "                                                           SIMEXvariable = \"SMOKING_STATUS\", \n",
    "                                                           mc.matrix=tmp_m_error, \n",
    "                                                           asymptotic = FALSE)} )\n",
    "    \n",
    "    merged_df_0 <- read.csv(\"combined_mimic_smoking_status_0417.csv\")\n",
    "    data_0 <- replace(merged_df_0[\"echo\"], merged_df_0[\"echo\"]>0, 0) \n",
    "    merged_df_0[\"echo\"] <- data_0\n",
    "    merged_df_0$SMOKING_STATUS <- as.factor(merged_df_0$SMOKING_STATUS)\n",
    "    predictions_0 = predict(boot_mcsimex, merged_df_0, type=\"response\")\n",
    "    p0_total = sum(predictions_0, na.rm=T)\n",
    "    \n",
    "    merged_df_1 <- read.csv(\"combined_mimic_smoking_status_0417.csv\")\n",
    "    data_1 <- replace(merged_df_1[\"echo\"], merged_df_1[\"echo\"]>-1, 1)\n",
    "    merged_df_1[\"echo\"] <- data_1\n",
    "    merged_df_1$SMOKING_STATUS <- as.factor(merged_df_1$SMOKING_STATUS)\n",
    "    predictions_1 = predict(boot_mcsimex, merged_df_1, type=\"response\")\n",
    "    p1_total = sum(predictions_1, na.rm=T)\n",
    "    \n",
    "    \n",
    "    predictions_0b <- 1 - predictions_0\n",
    "    p0b_total <- sum(predictions_0b, na.rm=T)\n",
    "\n",
    "    predictions_1b <- 1 - predictions_1\n",
    "    p1b_total <- sum(predictions_1b, na.rm=T)\n",
    "\n",
    "    or <- (p1_total * p0b_total) / (p1b_total * p0_total)\n",
    "    or_arr <- append(or_arr, or)\n",
    "    print(or)\n",
    "    \n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "9404a385",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.898092667144413</dd><dt>97.5%</dt><dd>0.907639694885957</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.898092667144413\n",
       "\\item[97.5\\textbackslash{}\\%] 0.907639694885957\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.89809266714441397.5%\n",
       ":   0.907639694885957\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.8980927 0.9076397 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "quantile(unlist(or_arr), c(0.025, 0.975))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4dfd6b33",
   "metadata": {},
   "source": [
    "### Bootstrapping Odds Ratio via Sampling Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "cd53c011",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] 0.9597897\n",
      "[1] 0.8355193\n",
      "[1] 0.7804283\n",
      "[1] 0.7817963\n",
      "[1] 0.7811384\n",
      "[1] 0.7555916\n",
      "[1] 0.8333176\n",
      "[1] 0.838733\n",
      "[1] 0.9276948\n",
      "[1] 0.9031998\n"
     ]
    }
   ],
   "source": [
    "# Bootstrap the MC-SIMEX-corrected odds ratio for `echo`.\n",
    "# Each replicate resamples merged_df with replacement, fits the naive logistic\n",
    "# model `fml`, corrects it for SMOKING_STATUS misclassification with mcsimex(),\n",
    "# and contrasts the predicted risks with echo set to 1 versus echo set to 0.\n",
    "n_boot <- 10\n",
    "\n",
    "# Misclassification matrix for SMOKING_STATUS, filled column by column (4 x 4).\n",
    "# It is the same for every replicate, so it is built once outside the loop.\n",
    "matrix_error <- matrix(c(8/11, 0, 2/11, 1/11,\n",
    "                         4/11, 4/11, 3/11, 0,\n",
    "                         1/16, 0, 14/16, 1/16,\n",
    "                         1/63, 0, 1/63, 61/63), nrow=4)\n",
    "matrix_error <- build.mc.matrix(matrix_error)\n",
    "# as.factor() makes this work whether or not the column is already a factor.\n",
    "smoking_levels <- levels(as.factor(merged_df$SMOKING_STATUS))\n",
    "dimnames(matrix_error) <- list(smoking_levels, smoking_levels)\n",
    "\n",
    "# Draw one bootstrap sample of `n_rows` rows, fit the naive GLM and correct it.\n",
    "# The sample is returned together with both models so that predictions are\n",
    "# always made with a model that was fitted to that very sample.\n",
    "fit_simex_replicate <- function(n_rows) {\n",
    "    boot_df <- merged_df[sample(nrow(merged_df), size=n_rows, replace=TRUE), ]\n",
    "    boot_df$SMOKING_STATUS <- as.factor(boot_df$SMOKING_STATUS)\n",
    "    naive_model <- glm(as.formula(fml),\n",
    "                       data = boot_df,\n",
    "                       family = binomial,\n",
    "                       na.action = na.exclude)\n",
    "    simex_model <- mcsimex(naive_model,\n",
    "                           SIMEXvariable = \"SMOKING_STATUS\",\n",
    "                           mc.matrix=matrix_error,\n",
    "                           asymptotic = FALSE)\n",
    "    list(data = boot_df, naive = naive_model, simex = simex_model)\n",
    "}\n",
    "\n",
    "or_df_arr <- list()\n",
    "for (x in seq_len(n_boot)){\n",
    "\n",
    "    # If the fit fails, retry once on a fresh sample (one row smaller, as before).\n",
    "    # The handler's result is returned from tryCatch(): assignments made with `<-`\n",
    "    # inside a handler are local to it and would otherwise be lost, leaving a\n",
    "    # stale model from an earlier replicate in use.\n",
    "    replicate_fit <- tryCatch(\n",
    "        fit_simex_replicate(nrow(merged_df)),\n",
    "        error = function(e) {\n",
    "            message(\"SIMEX fit failed (\", conditionMessage(e), \"); retrying on a new sample\")\n",
    "            fit_simex_replicate(nrow(merged_df) - 1)\n",
    "        })\n",
    "    sampled_df <- replicate_fit$data\n",
    "    sampled_glm_model <- replicate_fit$naive\n",
    "    sampled_mc_simex_model <- replicate_fit$simex\n",
    "\n",
    "    # Predicted risks with every positive echo value set to 0.\n",
    "    sampled_df_0 <- sampled_df\n",
    "    sampled_df_0$echo[which(sampled_df_0$echo > 0)] <- 0\n",
    "    sampled_predictions_0 <- predict(sampled_mc_simex_model, sampled_df_0, type=\"response\")\n",
    "    sampled_p0_total <- sum(sampled_predictions_0, na.rm=TRUE)\n",
    "\n",
    "    # Predicted risks with every echo value above -1 set to 1.\n",
    "    sampled_df_1 <- sampled_df\n",
    "    sampled_df_1$echo[which(sampled_df_1$echo > -1)] <- 1\n",
    "    sampled_predictions_1 <- predict(sampled_mc_simex_model, sampled_df_1, type=\"response\")\n",
    "    sampled_p1_total <- sum(sampled_predictions_1, na.rm=TRUE)\n",
    "\n",
    "    # Complementary (1 - risk) totals for both scenarios.\n",
    "    sampled_p0b_total <- sum(1 - sampled_predictions_0, na.rm=TRUE)\n",
    "    sampled_p1b_total <- sum(1 - sampled_predictions_1, na.rm=TRUE)\n",
    "\n",
    "    # Odds ratio of echo = 1 versus echo = 0 from the summed predictions.\n",
    "    sample_or <- (sampled_p1_total * sampled_p0b_total) / (sampled_p1b_total * sampled_p0_total)\n",
    "    or_df_arr <- append(or_df_arr, sample_or)\n",
    "    print(sample_or)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "4611fc82",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.761179850151689</dd><dt>97.5%</dt><dd>0.952568332633718</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.761179850151689\n",
       "\\item[97.5\\textbackslash{}\\%] 0.952568332633718\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.76117985015168997.5%\n",
       ":   0.952568332633718\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.7611799 0.9525683 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 2.5% and 97.5% quantiles of the bootstrapped odds ratios (percentile interval).\n",
    "quantile(x = unlist(or_df_arr), probs = c(0.025, 0.975))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b326fc39",
   "metadata": {},
   "source": [
    "### Bootstrapping Odds Ratio combining both strategies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "d4c99f07",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] 0.8307577\n",
      "[1] 0.8349587\n",
      "[1] 0.8326337\n",
      "[1] 0.8305974\n",
      "[1] 0.8358218\n",
      "[1] 0.8253645\n",
      "[1] 0.8253645\n",
      "[1] 0.8284771\n",
      "[1] 0.8315031\n",
      "[1] 0.8288004\n",
      "[1] 0.8571802\n",
      "[1] 0.8613284\n",
      "[1] 0.8622972\n",
      "[1] 0.8616072\n",
      "[1] 0.8593255\n",
      "[1] 0.8558702\n",
      "[1] 0.8558702\n",
      "[1] 0.8586496\n",
      "[1] 0.8608632\n",
      "[1] 0.8631997\n",
      "[1] 0.9027386\n",
      "[1] 0.9036023\n",
      "[1] 0.9011807\n",
      "[1] 0.9030251\n",
      "[1] 0.9060547\n",
      "[1] 0.894438\n",
      "[1] 0.894438\n",
      "[1] 0.898146\n",
      "[1] 0.9020912\n",
      "[1] 0.8995272\n",
      "[1] 0.8718458\n",
      "[1] 0.8761887\n",
      "[1] 0.871458\n",
      "[1] 0.873121\n",
      "[1] 0.8759731\n",
      "[1] 0.8715824\n",
      "[1] 0.8715824\n",
      "[1] 0.8706098\n",
      "[1] 0.8708969\n",
      "[1] 0.8723712\n",
      "[1] 0.9971386\n",
      "[1] 1.001183\n",
      "[1] 0.9943017\n",
      "[1] 0.9971899\n",
      "[1] 1.006179\n",
      "[1] 0.991759\n",
      "[1] 0.991759\n",
      "[1] 0.9912891\n",
      "[1] 0.9969101\n",
      "[1] 0.9953207\n",
      "[1] 0.9042866\n",
      "[1] 0.9051599\n",
      "[1] 0.9023571\n",
      "[1] 0.9060004\n",
      "[1] 0.9001079\n",
      "[1] 0.9051268\n",
      "[1] 0.9051268\n",
      "[1] 0.9106352\n",
      "[1] 0.9033449\n",
      "[1] 0.9080424\n",
      "[1] 0.8678863\n",
      "[1] 0.8728354\n",
      "[1] 0.8699545\n",
      "[1] 0.8687646\n",
      "[1] 0.875112\n",
      "[1] 0.8633982\n",
      "[1] 0.8633982\n",
      "[1] 0.8664942\n",
      "[1] 0.8669841\n",
      "[1] 0.8667989\n",
      "[1] 0.8566376\n",
      "[1] 0.8689318\n",
      "[1] 0.861951\n",
      "[1] 0.8625971\n",
      "[1] 0.8706221\n",
      "[1] 0.8505624\n",
      "[1] 0.8505624\n",
      "[1] 0.8530929\n",
      "[1] 0.861981\n",
      "[1] 0.8615712\n",
      "[1] 0.9951312\n",
      "[1] 0.9975826\n",
      "[1] 0.9966223\n",
      "[1] 0.9979961\n",
      "[1] 1.000989\n",
      "[1] 0.9929112\n",
      "[1] 0.9929112\n",
      "[1] 0.9923031\n",
      "[1] 0.995292\n",
      "[1] 0.9951903\n",
      "[1] 0.9289358\n",
      "[1] 0.9384615\n",
      "[1] 0.9343147\n",
      "[1] 0.9390983\n",
      "[1] 0.9401863\n",
      "[1] 0.9242886\n",
      "[1] 0.9242886\n",
      "[1] 0.9282305\n",
      "[1] 0.9316203\n",
      "[1] 0.9324297\n"
     ]
    }
   ],
   "source": [
    "# Bootstrap the corrected odds ratio while also varying the misclassification\n",
    "# matrix: each bootstrap sample of merged_df (outer loop) is combined with each\n",
    "# error matrix returned by matrix_script_reader() (inner loop).\n",
    "n_boot <- 10\n",
    "\n",
    "# The error matrices do not depend on the bootstrap sample, so they are read\n",
    "# and adjusted once instead of once per sample.\n",
    "# NOTE(review): the files are .pkl; only load pickles from a trusted source.\n",
    "error_matrices <- lapply(0:9, function(y) {\n",
    "    raw_m_error <- matrix_script_reader(paste(\"INSERT FILE PATH\",\n",
    "                                              as.character(y),\".pkl\",\n",
    "                                              sep=''))\n",
    "    build.mc.matrix(raw_m_error, method=\"log\")\n",
    "})\n",
    "\n",
    "or_combined_arr <- list()\n",
    "for (x in seq_len(n_boot)){\n",
    "\n",
    "    sampled_df <- merged_df[sample(nrow(merged_df), size=nrow(merged_df), replace=TRUE), ]\n",
    "    sampled_df$SMOKING_STATUS <- as.factor(sampled_df$SMOKING_STATUS)\n",
    "    sampled_glm_model <- glm(as.formula(fml), data = sampled_df, family = binomial, na.action = na.exclude)\n",
    "    smoking_levels <- levels(sampled_df$SMOKING_STATUS)\n",
    "\n",
    "    for (y in seq_along(error_matrices)){\n",
    "\n",
    "        tmp_m_error <- error_matrices[[y]]\n",
    "        dimnames(tmp_m_error) <- list(smoking_levels, smoking_levels)\n",
    "\n",
    "        # tryCatch() returns either the regular fit or the fallback fit, each paired\n",
    "        # with the data it was fitted on. Assignments made with `<-` inside an error\n",
    "        # handler are local to the handler, so the result has to be returned;\n",
    "        # otherwise a failed fit silently reuses the model of the previous matrix.\n",
    "        simex_fit <- tryCatch(\n",
    "            list(data = sampled_df,\n",
    "                 model = mcsimex(sampled_glm_model,\n",
    "                                 SIMEXvariable = \"SMOKING_STATUS\",\n",
    "                                 mc.matrix=tmp_m_error,\n",
    "                                 asymptotic = FALSE)),\n",
    "            error = function(e) {\n",
    "                message(\"SIMEX fit failed (\", conditionMessage(e), \"); retrying with method 'jlt' on a new sample\")\n",
    "                # Fallback: fresh sample (one row smaller, as before) and a matrix\n",
    "                # rebuilt with method \"jlt\". It is kept local to this iteration so the\n",
    "                # outer bootstrap sample stays the same for the remaining matrices.\n",
    "                retry_df <- merged_df[sample(nrow(merged_df),\n",
    "                                             size=nrow(merged_df)-1,\n",
    "                                             replace=TRUE), ]\n",
    "                retry_df$SMOKING_STATUS <- as.factor(retry_df$SMOKING_STATUS)\n",
    "                retry_glm_model <- glm(as.formula(fml),\n",
    "                                       data = retry_df,\n",
    "                                       family = binomial,\n",
    "                                       na.action = na.exclude)\n",
    "                retry_m_error <- build.mc.matrix(tmp_m_error, method=\"jlt\")\n",
    "                retry_levels <- levels(retry_df$SMOKING_STATUS)\n",
    "                dimnames(retry_m_error) <- list(retry_levels, retry_levels)\n",
    "                list(data = retry_df,\n",
    "                     model = mcsimex(retry_glm_model,\n",
    "                                     SIMEXvariable = \"SMOKING_STATUS\",\n",
    "                                     mc.matrix=retry_m_error,\n",
    "                                     asymptotic = FALSE))\n",
    "            })\n",
    "        fit_df <- simex_fit$data\n",
    "        sampled_mc_simex_model <- simex_fit$model\n",
    "\n",
    "        # Predicted risks with every positive echo value set to 0.\n",
    "        sampled_df_0 <- fit_df\n",
    "        sampled_df_0$echo[which(sampled_df_0$echo > 0)] <- 0\n",
    "        sampled_predictions_0 <- predict(sampled_mc_simex_model, sampled_df_0, type=\"response\")\n",
    "        sampled_p0_total <- sum(sampled_predictions_0, na.rm=TRUE)\n",
    "\n",
    "        # Predicted risks with every echo value above -1 set to 1.\n",
    "        sampled_df_1 <- fit_df\n",
    "        sampled_df_1$echo[which(sampled_df_1$echo > -1)] <- 1\n",
    "        sampled_predictions_1 <- predict(sampled_mc_simex_model, sampled_df_1, type=\"response\")\n",
    "        sampled_p1_total <- sum(sampled_predictions_1, na.rm=TRUE)\n",
    "\n",
    "        # Complementary (1 - risk) totals for both scenarios.\n",
    "        sampled_p0b_total <- sum(1 - sampled_predictions_0, na.rm=TRUE)\n",
    "        sampled_p1b_total <- sum(1 - sampled_predictions_1, na.rm=TRUE)\n",
    "\n",
    "        # Odds ratio of echo = 1 versus echo = 0 from the summed predictions.\n",
    "        sample_or <- (sampled_p1_total * sampled_p0b_total) / (sampled_p1b_total * sampled_p0_total)\n",
    "        or_combined_arr <- append(or_combined_arr, sample_or)\n",
    "        print(sample_or)\n",
    "\n",
    "    }\n",
    "\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "4450ca16",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".dl-inline {width: auto; margin:0; padding: 0}\n",
       ".dl-inline>dt, .dl-inline>dd {float: none; width: auto; display: inline-block}\n",
       ".dl-inline>dt::after {content: \":\\0020\"; padding-right: .5ex}\n",
       ".dl-inline>dt:not(:first-of-type) {padding-left: .5ex}\n",
       "</style><dl class=dl-inline><dt>2.5%</dt><dd>0.828630678546708</dd><dt>97.5%</dt><dd>0.999567202429758</dd></dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[2.5\\textbackslash{}\\%] 0.828630678546708\n",
       "\\item[97.5\\textbackslash{}\\%] 0.999567202429758\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "2.5%\n",
       ":   0.82863067854670897.5%\n",
       ":   0.999567202429758\n",
       "\n"
      ],
      "text/plain": [
       "     2.5%     97.5% \n",
       "0.8286307 0.9995672 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 2.5% and 97.5% quantiles of the combined bootstrap odds ratios (percentile interval).\n",
    "quantile(x = unlist(or_combined_arr), probs = c(0.025, 0.975))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b9ed26b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "4.3.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
