{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Proximal Causal Inference Synthetic Simulations\n",
    "\n",
    "In this set of experiments, we attempt to simulate text-based proximal causal inference when two pieces of text data are independent and representative of the underlying distribution of text data. Below are the steps of the experiment:\n",
    "\n",
    "1. Make a training dataset DTrain. For each row of data in DTrain, generate two realizations of X1, X2, X3, and X4. \n",
    "2. Generate X = [mean(X11, X12), mean(X21, X22), mean(X31, X32), mean(X41, X42)]. Train a single linear logistic regression on this data to predict U. This simulates training a zero-shot classifier on some broad background/training data.\n",
    "3. Make a dataset, DInference, that simulates inference time. For each row of data, generate two realizations of X1, X2, X3, and X4. By construction, these two realizations of X1 through X4 are conditionally independent given U and the baseline confounder C, and they represent independent realizations driven by the oracle U.\n",
    "4. Apply the previously trained 'zero-shot model' to obtain Z and W from the two realizations of X1, X2, X3, and X4.\n",
    "5. Proceed with proximal pipeline as usual."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.special import expit\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "import pickle\n",
    "from create_latex_table import *\n",
    "\n",
    "import warnings\n",
    "\n",
    "# Suppress perfect separation warnings (note: the filter below silences ALL warnings)\n",
    "# we are suppressing these warnings because perfect separation is expected when we ignore\n",
    "# Gotcha 3 in our P1M experiments\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "np.random.seed(7)\n",
    "size = 20000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Code implementing future helper functions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def proximal_find_ace(A, Y, W, Z, covariates, data, random_state=None):\n",
    "    \"\"\"\n",
    "    Estimate the effect of A on Y with a two-stage proximal regression.\n",
    "\n",
    "    The rows of data are split in half. On the first half an unpenalized logistic\n",
    "    regression predicts the proxy W from A, Z, and the covariates. On the second half\n",
    "    a linear regression predicts Y from A, the predicted probability of W, and the\n",
    "    covariates.\n",
    "\n",
    "    A, Y, W, Z are column names of data; covariates is a list of column names.\n",
    "    random_state is forwarded to train_test_split (None keeps the global NumPy RNG).\n",
    "\n",
    "    Returns the coefficient of A in the second-stage linear regression.\n",
    "    \"\"\"\n",
    "    # split the dataset into two halves, one for each stage\n",
    "    X_train, X_test, y_train, y_test = train_test_split(\n",
    "        data.drop(columns=[Y]), data[Y], test_size=0.5, random_state=random_state\n",
    "    )\n",
    "\n",
    "    # first stage: features are A, Z, and the covariates; the outcome is W\n",
    "    stage1_columns = [A, Z] + covariates\n",
    "    model1_outcome = X_train[W]\n",
    "\n",
    "    # if there is a high amount of class imbalance, rebalance the class weights\n",
    "    positive_rate = np.mean(model1_outcome)\n",
    "    if positive_rate < 0.2 or positive_rate > 0.8:\n",
    "        model1 = LogisticRegression(class_weight=\"balanced\", penalty=None)\n",
    "    else:\n",
    "        model1 = LogisticRegression(penalty=None)\n",
    "\n",
    "    model1.fit(X_train[stage1_columns], model1_outcome)\n",
    "\n",
    "    # predicted probability of W = 1 on the held-out half; assign() returns a new\n",
    "    # frame, so the split returned by train_test_split is not mutated in place\n",
    "    What = model1.predict_proba(X_test[stage1_columns])[:, 1]\n",
    "    stage2_data = X_test.assign(What=What)\n",
    "\n",
    "    # second stage: linear regression of Y on A, the predicted W, and the covariates\n",
    "    model2 = LinearRegression()\n",
    "    model2.fit(stage2_data[[A, \"What\"] + covariates], y_test)\n",
    "\n",
    "    # A is the first feature, so coef_[0] is its coefficient\n",
    "    return model2.coef_[0]\n",
    "\n",
    "def compute_confidence_intervals(A, Y, W, Z, covariates, data, num_bootstraps=200, alpha=0.05):\n",
    "    \"\"\"\n",
    "    Percentile-bootstrap confidence interval for the proximal_find_ace estimate.\n",
    "\n",
    "    Each of the num_bootstraps rounds resamples the rows of data with replacement\n",
    "    (keeping the same number of rows) and re-runs proximal_find_ace on the resample.\n",
    "\n",
    "    Returns tuple (q_low, q_up): the alpha/2 and 1 - alpha/2 quantiles of the\n",
    "    bootstrap estimates.\n",
    "    \"\"\"\n",
    "    n_rows = len(data)\n",
    "\n",
    "    def one_round():\n",
    "        # draw a bootstrap resample and give it a fresh 0..n-1 index\n",
    "        resample = data.sample(n_rows, replace=True).reset_index(drop=True)\n",
    "        return proximal_find_ace(A, Y, W, Z, covariates, resample)\n",
    "\n",
    "    boot_estimates = [one_round() for _ in range(num_bootstraps)]\n",
    "\n",
    "    # lower and upper percentile bounds of the bootstrap distribution\n",
    "    bounds = np.quantile(boot_estimates, q=[alpha/2, 1 - alpha/2])\n",
    "    return (bounds[0], bounds[1])\n",
    "\n",
    "def odds_ratio(X, Y, Z, data):\n",
    "    \"\"\"\n",
    "    Odds ratio between column X and column Y, adjusting for the columns listed in Z.\n",
    "\n",
    "    Fits an unpenalized logistic regression of X on [Y] + Z and returns the\n",
    "    exponentiated coefficient of Y. Class weights are balanced when the mean of X\n",
    "    is below 0.2 or above 0.8.\n",
    "    \"\"\"\n",
    "    target = data[X]\n",
    "    predictors = data[[Y]+Z]\n",
    "\n",
    "    # only rebalance when the outcome classes are heavily skewed\n",
    "    rate = np.mean(target)\n",
    "    skewed = rate < 0.2 or rate > 0.8\n",
    "    weighting = {\"class_weight\": \"balanced\"} if skewed else {}\n",
    "    fitted = LogisticRegression(penalty=None, **weighting).fit(predictors, target)\n",
    "\n",
    "    # Y is the first predictor, so its coefficient is coef_[0][0]\n",
    "    return np.exp(fitted.coef_[0][0])\n",
    "\n",
    "def odds_ratio_confidence_intervals(X, Y, Z, data, num_bootstraps=200, alpha=0.05):\n",
    "    \"\"\"\n",
    "    Get bootstrap confidence intervals for the value of the odds ratio.\n",
    "\n",
    "    Each of the num_bootstraps rounds resamples the rows of data with replacement\n",
    "    and recomputes odds_ratio(X, Y, Z, ...) on the resample.\n",
    "\n",
    "    Returns tuple (q_low, q_up): the alpha/2 and 1 - alpha/2 quantiles of the\n",
    "    bootstrap odds ratios.\n",
    "    \"\"\"\n",
    "    tail = alpha/2\n",
    "\n",
    "    boot_ratios = []\n",
    "    for _ in range(num_bootstraps):\n",
    "        # resample with replacement, then renumber the rows from zero\n",
    "        resample = data.sample(len(data), replace=True).reset_index(drop=True)\n",
    "        boot_ratios.append(odds_ratio(X, Y, Z, resample))\n",
    "\n",
    "    # percentile bounds of the bootstrap distribution\n",
    "    bounds = np.quantile(boot_ratios, q=[tail, 1 - tail])\n",
    "    return (bounds[0], bounds[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a list to save results into; it is eventually written to a pickle file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "save = []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define the data generating process."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_data(n=None):\n",
    "    \"\"\"\n",
    "    Draw one synthetic dataset from the data generating process.\n",
    "\n",
    "    n is the number of rows to generate; when None, the notebook-level `size` is used.\n",
    "\n",
    "    Returns a DataFrame with the binary variable U, the baseline confounder C, two\n",
    "    realizations of each covariate (X11/X12, X21/X22, X31/X32, X41/X42), the binary\n",
    "    treatment A, and the outcome Y. The coefficient of A in the outcome equation is 1.3.\n",
    "    \"\"\"\n",
    "    if n is None:\n",
    "        n = size\n",
    "\n",
    "    # NOTE: the order of the random draws below is kept fixed so that seeded runs are reproducible\n",
    "    U = np.random.binomial(1, 0.48, n)\n",
    "\n",
    "    # create a baseline confounder\n",
    "    C = np.random.normal(0, 1, n)\n",
    "    C_coefficient = 3\n",
    "\n",
    "    X11 = np.random.normal(0, 1, n) + 1.95*U + C_coefficient*C\n",
    "    X12 = np.random.normal(0, 1, n) + 1.95*U + C_coefficient*C\n",
    "\n",
    "    # make sure that X2 is some non-linear function\n",
    "    X21 = np.random.normal(0, 1, n) + np.exp(X11) + U + C_coefficient*C\n",
    "    X22 = np.random.normal(0, 1, n) + np.exp(X12) + U + C_coefficient*C\n",
    "\n",
    "    X31 = np.random.normal(0, 1, n) + 1.25*U + C_coefficient*C\n",
    "    X32 = np.random.normal(0, 1, n) + 1.25*U + C_coefficient*C\n",
    "\n",
    "    # make sure that X4 is some non-linear function\n",
    "    X41 = np.random.normal(0, 1, n) + X31**2 + 0.5*X31**3 + U + C_coefficient*C\n",
    "    X42 = np.random.normal(0, 1, n) + X32**2 + 0.5*X32**3 + U + C_coefficient*C\n",
    "\n",
    "    # treatment assignment depends on both U and C\n",
    "    A = np.random.binomial(1, expit(0.8*U+C-0.3), n)\n",
    "\n",
    "    # outcome depends on the treatment, U, and C\n",
    "    Y = np.random.normal(0, 1, n) + 1.3*A + 0.8*U + 1*C\n",
    "\n",
    "    data = pd.DataFrame({\"U\": U, \"X11\": X11, \"X21\": X21, \"X31\": X31, \"X41\": X41, \"X12\": X12, \"X22\": X22, \"X32\": X32, \"X42\": X42,\n",
    "                         \"A\": A, \"Y\": Y, \"C\": C})\n",
    "\n",
    "    return data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Generate training dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "Dtrain = generate_data()\n",
    "\n",
    "# simulate the creation of a 'broad zero-shot predictor':\n",
    "# each training feature is the mean of the two realizations of that covariate\n",
    "for k in (1, 2, 3, 4):\n",
    "    Dtrain[f'X{k}'] = (Dtrain[f'X{k}1']+Dtrain[f'X{k}2'])/2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train a linear logistic regressor to simulate a zero-shot classifier."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LogisticRegression(penalty=None)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LogisticRegression</label><div class=\"sk-toggleable__content\"><pre>LogisticRegression(penalty=None)</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "LogisticRegression(penalty=None)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features = Dtrain[['X1', 'X2', 'X3', 'X4']]\n",
    "outcome = Dtrain['U']\n",
    "\n",
    "zero_shot_model = LogisticRegression(penalty=None)\n",
    "zero_shot_model.fit(features, outcome)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Generate the inference dataset, and generate Z and W by using the two realizations of the covariates."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "DInference = generate_data()\n",
    "\n",
    "# create the two realizations of the covariates to generate Z and W\n",
    "realization1 = pd.DataFrame()\n",
    "realization1['X1'] = DInference['X11']\n",
    "realization1['X2'] = DInference['X21']\n",
    "realization1['X3'] = DInference['X31']\n",
    "realization1['X4'] = DInference['X41']\n",
    "\n",
    "W = zero_shot_model.predict(realization1[['X1', 'X2', 'X3', 'X4']])\n",
    "\n",
    "realization2 = pd.DataFrame()\n",
    "realization2['X1'] = DInference['X12']\n",
    "realization2['X2'] = DInference['X22']\n",
    "realization2['X3'] = DInference['X32']\n",
    "realization2['X4'] = DInference['X42']\n",
    "\n",
    "Z = zero_shot_model.predict(realization2[['X1', 'X2', 'X3', 'X4']])\n",
    "\n",
    "DInference['Z'] = Z\n",
    "DInference['W'] = W"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Verify odds ratio values between W and Z."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "odds ratio, nominal: 2.036667012105297\n",
      "odds ratio, condition C: 1.3819210904788075\n",
      "odds ratio, condition C and U: 1.0004344685417592\n",
      "odds ratio confidence interval: (1.350918677956896, 1.4159820364036846)\n"
     ]
    }
   ],
   "source": [
    "print(\"odds ratio, nominal:\", odds_ratio(\"W\", \"Z\", [], DInference))\n",
    "print(\"odds ratio, condition C:\", odds_ratio(\"W\", \"Z\", [\"C\"], DInference))\n",
    "print(\"odds ratio, condition C and U:\", odds_ratio(\"W\", \"Z\", [\"C\", \"U\"], DInference))\n",
    "or_ci = odds_ratio_confidence_intervals(\"W\", \"Z\", [\"C\"], DInference)\n",
    "print(\"odds ratio confidence interval:\", or_ci)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Proceed with the causal inference pipeline as normal."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.3035777535463435\n",
      "(1.2092251113030785, 1.3939507481471058)\n"
     ]
    }
   ],
   "source": [
    "# true effect of A on Y: the coefficient of A in the outcome equation of generate_data\n",
    "true_ace = 1.3\n",
    "\n",
    "est_ace = proximal_find_ace(\"A\", \"Y\", \"W\", \"Z\", [\"C\"], DInference)\n",
    "bias = abs(true_ace-est_ace)\n",
    "ace_ci = compute_confidence_intervals(\"A\", \"Y\", \"W\", \"Z\", [\"C\"], DInference)\n",
    "# derive coverage from the interval itself instead of hard-coding the table entry\n",
    "ci_cov = '{\\\\bf Yes}' if ace_ci[0] <= true_ace <= ace_ci[1] else 'No'\n",
    "\n",
    "print(est_ace)\n",
    "print(ace_ci)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "save.append({\n",
    "    'pipeline': 'P1M',\n",
    "    'or_ci_low': or_ci[0],\n",
    "    'or_ci_high': or_ci[1],\n",
    "    'est_ace': est_ace,\n",
    "    'bias': bias,\n",
    "    'ace_ci': ace_ci,\n",
    "    'ci_cov': ci_cov\n",
    "})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Generate the proxies from the same realization of data; this violates P1 on purpose and should produce biased results.\n",
    "\n",
    "*Note:* The following procedure by definition will generate the same proxies. This explains why the proximal pipeline is throwing perfect separation warnings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "W = zero_shot_model.predict(realization1[['X1', 'X2', 'X3', 'X4']])\n",
    "\n",
    "Z = zero_shot_model.predict(realization1[['X1', 'X2', 'X3', 'X4']])\n",
    "\n",
    "DInference['Z'] = Z\n",
    "DInference['W'] = W"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Verify odds ratio values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "odds ratio, nominal: 6226206818464955.0\n",
      "odds ratio, condition C: 1.4934648023714422e+16\n",
      "odds ratio, condition C and U: 1.4618008444686512e+16\n",
      "odds ratio confidence interval: (1.3192862015421102e+16, 1.615047285264014e+16)\n"
     ]
    }
   ],
   "source": [
    "print(\"odds ratio, nominal:\", odds_ratio(\"W\", \"Z\", [], DInference))\n",
    "print(\"odds ratio, condition C:\", odds_ratio(\"W\", \"Z\", [\"C\"], DInference))\n",
    "print(\"odds ratio, condition C and U:\", odds_ratio(\"W\", \"Z\", [\"C\", \"U\"], DInference))\n",
    "or_ci = odds_ratio_confidence_intervals(\"W\", \"Z\", [\"C\"], DInference)\n",
    "print(\"odds ratio confidence interval:\", or_ci)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.4304801592918137\n",
      "(1.4045551394655196, 1.4953321912501123)\n"
     ]
    }
   ],
   "source": [
    "# true effect of A on Y: the coefficient of A in the outcome equation of generate_data\n",
    "true_ace = 1.3\n",
    "\n",
    "est_ace = proximal_find_ace(\"A\", \"Y\", \"W\", \"Z\", ['C'], DInference)\n",
    "bias = abs(true_ace-est_ace)\n",
    "ace_ci = compute_confidence_intervals(\"A\", \"Y\", \"W\", \"Z\", ['C'], DInference)\n",
    "# derive coverage from the interval itself instead of hard-coding the table entry\n",
    "ci_cov = '{\\\\bf Yes}' if ace_ci[0] <= true_ace <= ace_ci[1] else 'No'\n",
    "\n",
    "print(est_ace)\n",
    "print(ace_ci)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "save.append({\n",
    "    'pipeline': 'P1M, same',\n",
    "    'or_ci_low': or_ci[0],\n",
    "    'or_ci_high': or_ci[1],\n",
    "    'est_ace': est_ace,\n",
    "    'bias': bias,\n",
    "    'ace_ci': ace_ci,\n",
    "    'ci_cov': ci_cov\n",
    "})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Part 2\n",
    "\n",
    "Use the zero-shot predictor to generate a proxy W and a simple heuristic based predictor to generate the proxy Z. Then, proceed with proximal causal inference as normal."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "DInference = generate_data()\n",
    "\n",
    "# create the first realization of the covariates to generate W\n",
    "realization1 = pd.DataFrame()\n",
    "realization1['X1'] = DInference['X11']\n",
    "realization1['X2'] = DInference['X21']\n",
    "realization1['X3'] = DInference['X31']\n",
    "realization1['X4'] = DInference['X41']\n",
    "\n",
    "W = zero_shot_model.predict(realization1[['X1', 'X2', 'X3', 'X4']])\n",
    "\n",
    "# use a simple heuristic for the second realization of the covariates to generate the proxy Z\n",
    "# (this heuristic attains roughly accuracy 0.84)\n",
    "# vectorized threshold on the whole column: 1 where X12 > 1.1, otherwise 0\n",
    "Z = (DInference['X12'] > 1.1).astype(int)\n",
    "\n",
    "DInference['W'] = W\n",
    "DInference['Z'] = Z"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Verify odds ratio values between W and Z."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "odds ratio, nominal: 3.63752559165995\n",
      "odds ratio, condition C: 1.880269157051084\n",
      "odds ratio, condition C and U: 1.0363406022327513\n",
      "odds ratio confidence intervals: (1.820170440932906, 1.9363521194561315)\n"
     ]
    }
   ],
   "source": [
    "print(\"odds ratio, nominal:\", odds_ratio(\"W\", \"Z\", [], DInference))\n",
    "print(\"odds ratio, condition C:\", odds_ratio(\"W\", \"Z\", [\"C\"], DInference))\n",
    "print(\"odds ratio, condition C and U:\", odds_ratio(\"W\", \"Z\", [\"C\", \"U\"], DInference))\n",
    "or_ci = odds_ratio_confidence_intervals(\"W\", \"Z\", [\"C\"], DInference)\n",
    "print(\"odds ratio confidence intervals:\", or_ci)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Proceed with proximal causal inference pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.3431202293299802\n",
      "(1.2731587609051653, 1.4246635248211406)\n"
     ]
    }
   ],
   "source": [
    "# true effect of A on Y: the coefficient of A in the outcome equation of generate_data\n",
    "true_ace = 1.3\n",
    "\n",
    "est_ace = proximal_find_ace(\"A\", \"Y\", \"W\", \"Z\", [\"C\"], DInference)\n",
    "bias = abs(true_ace-est_ace)\n",
    "ace_ci = compute_confidence_intervals(\"A\", \"Y\", \"W\", \"Z\", [\"C\"], DInference)\n",
    "# derive coverage from the interval itself instead of hard-coding the table entry\n",
    "ci_cov = '{\\\\bf Yes}' if ace_ci[0] <= true_ace <= ace_ci[1] else 'No'\n",
    "\n",
    "print(est_ace)\n",
    "print(ace_ci)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "save.append({\n",
    "    'pipeline': 'P2M',\n",
    "    'or_ci_low': or_ci[0],\n",
    "    'or_ci_high': or_ci[1],\n",
    "    'est_ace': est_ace,\n",
    "    'bias': bias,\n",
    "    'ace_ci': ace_ci,\n",
    "    'ci_cov': ci_cov\n",
    "})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Generate the proxies from the same realization of data; this violates P1 on purpose and should produce biased results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "W = zero_shot_model.predict(realization1[['X1', 'X2', 'X3', 'X4']])\n",
    "\n",
    "# use a simple heuristic on the same realization of the covariates to generate the proxy Z\n",
    "# (this heuristic attains roughly accuracy 0.84)\n",
    "# vectorized threshold on the whole column: 1 where X11 > 1.1, otherwise 0\n",
    "Z = (DInference['X11'] > 1.1).astype(int)\n",
    "\n",
    "DInference['W'] = W\n",
    "DInference['Z'] = Z"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Verify odds ratio values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "odds ratio, nominal: 7.45302627844804\n",
      "odds ratio, condition C: 8.162085461375293\n",
      "odds ratio, condition C and U: 5.643790451130861\n",
      "odds ratio confidence interval: (7.901714409322671, 8.40806602828209)\n"
     ]
    }
   ],
   "source": [
    "print(\"odds ratio, nominal:\", odds_ratio(\"W\", \"Z\", [], DInference))\n",
    "print(\"odds ratio, condition C:\", odds_ratio(\"W\", \"Z\", [\"C\"], DInference))\n",
    "print(\"odds ratio, condition C and U:\", odds_ratio(\"W\", \"Z\", [\"C\", \"U\"], DInference))\n",
    "or_ci = odds_ratio_confidence_intervals(\"W\", \"Z\", [\"C\"], DInference)\n",
    "print(\"odds ratio confidence interval:\", or_ci)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.4074315971024316\n",
      "(1.3764490464840136, 1.4786932779637072)\n"
     ]
    }
   ],
   "source": [
    "# true effect of A on Y: the coefficient of A in the outcome equation of generate_data\n",
    "true_ace = 1.3\n",
    "\n",
    "est_ace = proximal_find_ace(\"A\", \"Y\", \"W\", \"Z\", [\"C\"], DInference)\n",
    "bias = abs(true_ace-est_ace)\n",
    "ace_ci = compute_confidence_intervals(\"A\", \"Y\", \"W\", \"Z\", [\"C\"], DInference)\n",
    "# derive coverage from the interval itself instead of hard-coding the table entry\n",
    "ci_cov = '{\\\\bf Yes}' if ace_ci[0] <= true_ace <= ace_ci[1] else 'No'\n",
    "\n",
    "print(est_ace)\n",
    "print(ace_ci)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "save.append({\n",
    "    'pipeline': 'P2M, same',\n",
    "    'or_ci_low': or_ci[0],\n",
    "    'or_ci_high': or_ci[1],\n",
    "    'est_ace': est_ace,\n",
    "    'bias': bias,\n",
    "    'ace_ci': ace_ci,\n",
    "    'ci_cov': ci_cov\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# use a context manager so the file is flushed and closed before it is read back\n",
    "with open('fully_synthetic_save.p', 'wb') as save_file:\n",
    "    pickle.dump(save, save_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\t\tP1M & {\\color{blue} $(1.35, 1.42)^\\checkmark$} & $1.304$ & $0.004$ & $(1.209, 1.394)$ & {\\bf Yes} \\\\\n",
      "\t\tP1M, same & {\\color{red} $(1.32e+16, 1.62e+16)$} & $1.430$ & $0.130$ & $(1.405, 1.495)$ & No \\\\\n",
      "\t\tP2M & {\\color{blue} $(1.82, 1.94)^\\checkmark$} & $1.343$ & $0.043$ & $(1.273, 1.425)$ & {\\bf Yes} \\\\\n",
      "\t\tP2M, same & {\\color{red} $(7.9, 8.41)$} & $1.407$ & $0.107$ & $(1.376, 1.479)$ & No \\\\\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(create_table('fully_synthetic_save.p'))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
