# -*- coding: utf-8 -*-
"""dml_actg_100524.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.corp.google.com/drive/1cKhxd1XFd2IpCgVjcP45CsYfFkQBTKzm
"""

import io
import random

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from colabtools import drive

# @title Load data

# Raw contents of the CSV file as returned by drive.LoadFile; decoded to text below.
df = drive.LoadFile('ACTG175.csv')

# @title Preprocess data

# Parse with pandas' CSV reader instead of splitting every line on ',' by hand.
# The manual split broke on quoted fields and CRLF line endings, and the
# unconditional "drop the last row" only worked when the file ended with exactly
# one trailing newline. read_csv skips blank lines by default.
data = pd.read_csv(io.StringIO(df.decode()))
columns = data.columns.tolist()  # raw header names, kept as a module-level name

# Map the raw column names onto the variable names used in the rest of the file
# and keep only those eight columns.
data = data.rename(columns={'gender': 'C1', 'age': 'C2', 'wtkg': 'C3', 'homo': 'C4', 'cd40': 'W', 'cd420': 'Y', 'str2': 'X1', 'treat': 'X2'})
data = data[['C1', 'C2', 'C3', 'C4', 'W', 'Y', 'X1', 'X2']]
data = data.astype('float32')

# Drop the rows whose W is exactly zero.
data = data[data['W']!=0]

# Min-max scale every column to [0, 1]; columns holding only 0/1 keep their values.
data=(data-data.min())/(data.max()-data.min())
data.describe()

"""# Construct data for different domains"""

# @title Construct data for S=2

'''S=2'''
# Domain S=2 is the ACTG data itself, taken as an unmodified copy.
data_S2 = data.copy()

# Fit a classifier for the X2 policy in domain S=2, P(X2 | C1,C2,C3,C4,X1,W).
# The data come from a trial, so this should be close to random allocation.
est_X2 = GradientBoostingClassifier()
est_X2.fit(data_S2[['C1', 'C2', 'C3', 'C4','X1','W']], np.ravel(data_S2['X2']))
def pi_X2(data):
  """Return, for every row of `data`, the fitted probability of that row's own X2 value."""
  proba = est_X2.predict_proba(data[['C1', 'C2', 'C3', 'C4', 'X1','W']])
  # The observed X2 (truncated to int) selects the probability column row by row.
  observed = np.array(data['X2']).flatten().astype(int)
  return proba[np.arange(len(observed)), observed]

pi_X2(data_S2)

# @title Construct data for S=1

'''S=1'''
# Domain S=1 starts as a copy of the ACTG trial data. Three successive weighted
# resamplings (2000 rows each, with replacement) then change the dependence between
# X2 and (X1, W), the dependence between X1 and C1, and the distribution of C3,
# to mimic an observational study.
data_S1 = data.copy()

# Step 1: resampling weights that tie X2 to (X1, W).
# Rows with X2 == 1 get weight 0.7 when (W > 0.5, X1 == 1) or (W < 0.5, X1 == 0),
# and weight 0.3 in the two opposite cases; every other row keeps weight 0.5.
prob_X2_X1W1 = np.ones_like(data_S1['X2'])*0.5
prob_X2_X1W1[(data_S1['W'] > 0.5) & (data_S1['X1'] == 1) & (data_S1['X2'] == 1)] = 0.7
prob_X2_X1W1[(data_S1['W'] > 0.5) & (data_S1['X1'] == 0) & (data_S1['X2'] == 1)] = 0.3
prob_X2_X1W1[(data_S1['W'] < 0.5) & (data_S1['X1'] == 1) & (data_S1['X2'] == 1)] = 0.3
prob_X2_X1W1[(data_S1['W'] < 0.5) & (data_S1['X1'] == 0) & (data_S1['X2'] == 1)] = 0.7
# NOTE(review): `include` is never read afterwards (only commented-out code used it);
# the call still draws from NumPy's global random generator.
include = np.random.uniform(size=len(data_S1)) < prob_X2_X1W1
data_S1 = data_S1.sample(n = 2000, weights=prob_X2_X1W1, replace=True)

# Step 2: induce a different policy on X1, computed on the already resampled data_S1.
# Weight 0.7 when X1 agrees with the side of C1 (C1 > 0.5 with X1 == 1, or C1 < 0.5
# with X1 == 0), weight 0.3 when it disagrees; any other row keeps weight 1.
prob_X1_C1 = np.ones_like(data_S1['C1'])
prob_X1_C1[(data_S1['C1'] > 0.5) & (data_S1['X1'] == 1)] = 0.7
prob_X1_C1[(data_S1['C1'] > 0.5) & (data_S1['X1'] == 0)] = 0.3
prob_X1_C1[(data_S1['C1'] < 0.5) & (data_S1['X1'] == 1)] = 0.3
prob_X1_C1[(data_S1['C1'] < 0.5) & (data_S1['X1'] == 0)] = 0.7
# NOTE(review): unused, as above.
include = np.random.uniform(size=len(data_S1)) < prob_X1_C1
data_S1 = data_S1.sample(n = 2000, weights=prob_X1_C1, replace=True) # draw 2000 rows with replacement

# Step 3: change the distribution of C3 (the renamed 'wtkg' column) by favouring rows
# with small C3: weight 0.7 for C3 <= 0.4 and weight 0.3 for C3 > 0.4.
prob_C3 = np.ones_like(data_S1['C3'])
prob_C3[(data_S1['C3'] > 0.4)] = 0.3
prob_C3[(data_S1['C3'] <= 0.4)] = 0.7
data_S1 = data_S1.sample(n = 2000, weights=prob_C3, replace=True) # draw 2000 rows with replacement

# Estimate the policy for X1 in domain S1, given by P(X1|C1,C2,C3,C4)
# This assumes no unobserved confounding in the assignment of X1 in the original data
est_X1 = GradientBoostingClassifier()
est_X1.fit(data_S1[['C1', 'C2', 'C3', 'C4']], np.ravel(data_S1['X1']))
def pi_X1(data):
  """Return, for every row of `data`, the fitted probability of that row's own X1 value."""
  w = est_X1.predict_proba(data[['C1', 'C2', 'C3', 'C4']])
  # Row i picks the probability column given by its observed X1 (truncated to int).
  return np.array([w[i, int(x1)] for i, x1 in enumerate(np.array(data['X1']).flatten())])

# @title Construct data for S=0

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated elementwise through NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

'''S=0'''
# Domain S=0 starts as a copy of the ACTG trial data. Its covariate distribution and
# the assignment of X1 and X2 are then altered by three successive weighted
# resamplings (2000 rows each, with replacement).
data_S0 = data.copy()

# Change the distribution of C3 (the renamed 'wtkg' column) by favouring rows with
# large C3: weight 0.7 for C3 > 0.4 and weight 0.3 for C3 <= 0.4.
prob_C3 = np.ones_like(data_S0['C3'])
prob_C3[(data_S0['C3'] > 0.4)] = 0.7
prob_C3[(data_S0['C3'] <= 0.4)] = 0.3
data_S0 = data_S0.sample(n = 2000, weights=prob_C3, replace=True) # draw 2000 rows with replacement

# Change distribution of X1: fit the current P(X1 | C1,C2,C3,C4) and resample with
# weights pi_X1_new / pi_X1_.
est_X1_ = GradientBoostingClassifier()
est_X1_.fit(data_S0[['C1', 'C2', 'C3', 'C4']], np.ravel(data_S0['X1']))
# Fitted probability of each row's own observed X1 value.
def pi_X1_(data):
  w = est_X1_.predict_proba(data[['C1', 'C2', 'C3', 'C4']])
  return np.array([w[i, int(x1)] for i, x1 in enumerate(np.array(data['X1']).flatten())])

# sigmoid(C1 + C2 - 1) for every row; the value does not depend on the row's X1.
def pi_X1_new(data):
	return sigmoid(1*(data['C1']+data['C2'])-1)

# NOTE(review): the denominator is the probability of the observed X1, while the
# numerator is the same for X1 == 0 and X1 == 1 -- confirm this weighting is intended.
prob_X1_new = pi_X1_new(data_S0) / pi_X1_(data_S0)
data_S0 = data_S0.sample(n = 2000, weights=prob_X1_new, replace=True) # draw 2000 rows with replacement

# Change distribution of X2: same construction, conditioning on C1..C4, X1 and W.
est_X2_ = GradientBoostingClassifier()
est_X2_.fit(data_S0[['C1', 'C2', 'C3', 'C4','X1','W']], np.ravel(data_S0['X2']))
# Fitted probability of each row's own observed X2 value.
def pi_X2_(data):
  w = est_X2_.predict_proba(data[['C1', 'C2', 'C3', 'C4', 'X1','W']])
  return np.array([w[i, int(x2)] for i, x2 in enumerate(np.array(data['X2']).flatten())])

# sigmoid(C3 + X1 - 1) for every row; the value does not depend on the row's X2.
def pi_X2_new(data):
	return sigmoid(1*(data['C3']+data['X1'])-1)

# NOTE(review): same remark as for prob_X1_new -- the numerator ignores the observed X2.
prob_X2_new = pi_X2_new(data_S0) / pi_X2_(data_S0)
data_S0 = data_S0.sample(n = 2000, weights=prob_X2_new, replace=True) # draw 2000 rows with replacement



# The policy \pi* to be evaluated is taken to be the pair
# {P(X1|C1,C2,C3,C4,S=0), P(X2|C1,C2,C3,C4,X1,W,S=0)}, both re-estimated below from
# the final data_S0.
# E_\pi*[Y|S=0] is therefore taken to be the empirical mean of Y in data_S0.

est_star_X1 = GradientBoostingClassifier()
est_star_X1.fit(data_S0[['C1', 'C2', 'C3', 'C4']], np.ravel(data_S0['X1']))
def pi_star_X1(data):
  """Return column 1 of predict_proba for every row (presumably P(X1 = 1 | C1..C4))."""
  w = est_star_X1.predict_proba(data[['C1', 'C2', 'C3', 'C4']])
  return w[:,1]

est_star_X2 = GradientBoostingClassifier()
est_star_X2.fit(data_S0[['C1', 'C2', 'C3', 'C4','X1','W']], np.ravel(data_S0['X2']))
def pi_star_X2(data):
  """Return column 1 of predict_proba for every row (presumably P(X2 = 1 | C1..C4, X1, W))."""
  w = est_star_X2.predict_proba(data[['C1', 'C2', 'C3', 'C4', 'X1','W']])
  return  w[:,1]

# Notebook display cells: bare expressions whose results are not used.
pi_star_X2(data_S0)

len(data_S0), len(data_S1), len(data_S2)

data_S0.describe()

# @title Utilities

def add_noise(vec,add_noise_TF):
    """Optionally perturb `vec` with Gaussian noise whose size shrinks with len(vec).

    Args:
        vec: array-like of numbers (NumPy array, pandas Series, ...).
        add_noise_TF: when falsy, `vec` is returned as is.

    Returns:
        `vec` itself when no noise is requested or `vec` is empty; otherwise a
        new object equal to `vec + noise`, where `noise` holds len(vec) draws
        from a normal distribution with mean and standard deviation both equal
        to len(vec) ** (-1/4), taken from NumPy's global random generator.
    """
    if not add_noise_TF:
        return vec
    n = len(vec)
    if n == 0:
        # n ** (-1/4) is undefined for n == 0 and there is nothing to perturb.
        return vec
    level = n ** (-1 / 4)
    noise = np.random.normal(loc=level, scale=level, size=n)
    # Build a new object instead of `vec += noise`: the in-place form modified
    # the caller's array and failed on integer arrays.
    return vec + noise

def learn_mu(obs, col_feature, col_label):
    """Fit a scikit-learn gradient-boosting regressor of `col_label` on `col_feature`.

    Args:
        obs: DataFrame holding both the feature and the label columns.
        col_feature: list of feature column names.
        col_label: list with the label column name; the selection is flattened
            with np.ravel before fitting.

    Returns:
        The fitted GradientBoostingRegressor.
    """
    regressor = GradientBoostingRegressor(
        learning_rate=0.02,
        max_depth=3,
        min_samples_leaf=2,
        n_estimators=200,
    )
    features = obs[col_feature]
    target = np.ravel(obs[col_label])
    regressor.fit(features, target)
    return regressor

def learn_pi(obs, col_feature, col_label):
    """Fit a scikit-learn gradient-boosting classifier of `col_label` on `col_feature`.

    The classifier uses the library's default hyper-parameters. `col_label` is a
    list with the label column name; the selection is flattened with np.ravel.
    Returns the fitted GradientBoostingClassifier.
    """
    classifier = GradientBoostingClassifier()
    features = obs[col_feature]
    labels = np.ravel(obs[col_label])
    classifier.fit(features, labels)
    return classifier

def estimate_odds_ratio(data_0, data_1, col_feature, n_sample):
    """Train a classifier that tells rows of `data_0` (label 0) from rows of `data_1` (label 1).

    `n_sample` rows are drawn without replacement from each DataFrame (fixed
    random_state=42), labelled in a column 'L', stacked, and passed to learn_pi
    with `col_feature` as features.

    Returns:
        The fitted classifier; column 1 of its predict_proba is the estimated
        probability that a row belongs to `data_1`.
    """
    label_column = ['L']
    kept_columns = col_feature + label_column

    labelled_parts = []
    for label, frame in enumerate((data_0, data_1)):
        drawn = frame.sample(n=n_sample, random_state=42)
        drawn['L'] = label
        labelled_parts.append(drawn[kept_columns])

    pooled = pd.concat(labelled_parts, axis=0)
    return learn_pi(pooled, col_feature, label_column)

# param_grid={'n_estimators':[100,200],
#             'learning_rate': [0.1,0.05,0.02],
#             'max_depth':[3, 4],
#             'min_samples_leaf':[1, 2] }

# samples_0 = data_S0
# samples_1 = data_S1

# # Step 3: Create a new dataframe with labels
# samples_0['L'] = 0
# samples_1['L'] = 1
# col_label = ['L']
# total_features = ['C1','C2','C3', 'C4'] + col_label
# total_samples = pd.concat([samples_0[total_features], samples_1[total_features]], axis=0)

# estimator = GradientBoostingRegressor()
# est = GridSearchCV(estimator=estimator, cv=3, param_grid=param_grid, verbose=3)
# est.fit(total_samples[['C1','C2','C3', 'C4']], total_samples[col_label])

# @title Evaluation script for actg


def evaluate_DML(data_S0_cov, data_S1, data_S2, seednum, L=5, add_noise_TF = False):
    """Compute cross-fitted OM, PW and DML estimates of E_{pi*}[Y | S=0].

    The rows of `data_S1` are split into `L` shuffled folds and the same
    positional indices are applied to `data_S2`, so `data_S2` needs at least as
    many rows as `data_S1`. In every fold the nuisance models are fitted on the
    training part; the weights and the quantities they multiply are evaluated
    on the held-out part.

    Relies on the module-level helpers learn_mu, estimate_odds_ratio and
    add_noise and on the module-level policies pi_X1, pi_X2, pi_star_X1 and
    pi_star_X2.

    Args:
        data_S0_cov: DataFrame with the covariates C1..C4 of the target domain.
        data_S1: DataFrame of domain S=1; columns C1..C4, X1 and W are read.
        data_S2: DataFrame of domain S=2; columns C1..C4, X1, W, X2 and Y are read.
        seednum: seed passed to np.random.seed (fold shuffling and noise).
        L: number of folds.
        add_noise_TF: when true, add_noise perturbs the regression predictions
            and the weights.

    Returns:
        Tuple (OM, PW, DML): each estimate averaged over the L folds. Every
        per-fold estimate is clipped to [0, 1] before averaging.

    Side effects:
        Reseeds NumPy's global random generator and Python's `random` module.
    """
    cols_C = ['C1','C2','C3','C4']
    col_feature_mu1 = cols_C + ['X1']
    cols_CX1W = cols_C + ['X1','W']
    col_feature_mu2 = cols_CX1W + ['X2']

    def compute_check_mu2(col_feature_mu2, mu2_model, data):
        '''
        Compute sum_{x2} mu2(C1,C2,C3,C4,X1,W,x2) * pi_star_X2(x2 | C1,C2,C3,C4,X1,W)
        on the rows of `data`.
        '''
        # mu2 with X2 forced to 0
        data_X2_x0 = data.copy()
        data_X2_x0['X2'] = 0
        mu2_X2_x0 = add_noise( mu2_model.predict(data_X2_x0[col_feature_mu2]), add_noise_TF )

        # mu2 with X2 forced to 1
        data_X2_x1 = data.copy()
        data_X2_x1['X2'] = 1
        mu2_X2_x1 = add_noise( mu2_model.predict(data_X2_x1[col_feature_mu2]), add_noise_TF )

        # Average the two predictions under pi_star_X2
        pi_star_X2_val = np.array( pi_star_X2(data) )
        return (mu2_X2_x1 * pi_star_X2_val) + (mu2_X2_x0 * (1-pi_star_X2_val))

    def compute_check_mu1(col_feature_mu1, mu1_model, data):
        '''
        Compute sum_{x1} mu1(C1,C2,C3,C4,x1) * pi_star_X1(x1 | C1,C2,C3,C4)
        on the rows of `data`.
        '''
        # mu1 with X1 forced to 0
        data_X1_x0 = data.copy()
        data_X1_x0['X1'] = 0
        mu1_X1_x0 = add_noise( mu1_model.predict(data_X1_x0[col_feature_mu1]), add_noise_TF )

        # mu1 with X1 forced to 1
        data_X1_x1 = data.copy()
        data_X1_x1['X1'] = 1
        mu1_X1_x1 = add_noise( mu1_model.predict(data_X1_x1[col_feature_mu1]), add_noise_TF )

        # Average the two predictions under pi_star_X1
        pi_star_X1_val = pi_star_X1(data)
        return (mu1_X1_x1 * pi_star_X1_val) + mu1_X1_x0 * (1-pi_star_X1_val)

    def density_ratio(data_num, data_den, cols, data_eval):
        '''
        Estimate P_num(cols) / P_den(cols) on the rows of `data_eval`.

        A classifier is trained to separate equally sized samples of `data_num`
        (label 0) and `data_den` (label 1); the ratio is P(label 0 | cols) / P(label 1 | cols).
        '''
        model = estimate_odds_ratio(data_num, data_den, cols, len(data_den))
        p_den = np.array(model.predict_proba(data_eval[cols]))[:,1]
        return (1-p_den)/p_den

    np.random.seed(seednum)
    random.seed(123)

    results_OM = []
    results_PW = []
    results_DML = []
    kf = KFold(n_splits=L, shuffle=True)

    for train_index, test_index in kf.split(data_S1):
        # The folds of data_S1 are reused positionally for data_S2.
        data_S1_train, data_S1_test = data_S1.iloc[train_index], data_S1.iloc[test_index]
        data_S2_train, data_S2_test = data_S2.iloc[train_index], data_S2.iloc[test_index]

        '''
        Estimate OM
        '''
        # mu2(C,X1,W,X2) := E[Y | C,X1,W,X2] in domain S=2. It is fitted on the
        # training fold only, so the held-out rows of data_S2 used in the
        # weighted terms below never enter the model.
        mu2_model = learn_mu(data_S2_train, col_feature_mu2, ['Y'])

        # check_mu2 on the S=1 training rows: the regression target for mu1
        check_mu2_S1_train = compute_check_mu2(col_feature_mu2, mu2_model, data_S1_train)

        # mu1(C,X1) := E[check_mu2 | C,X1] in domain S=1
        data_S1_train_mu1 = data_S1_train.copy()
        data_S1_train_mu1['check_mu2'] = check_mu2_S1_train
        mu1_model = learn_mu(data_S1_train_mu1, col_feature_mu1, ['check_mu2'])

        # check_mu1 on the target-domain covariates
        check_mu1_S0 = compute_check_mu1(col_feature_mu1, mu1_model, data_S0_cov)

        # OM
        result_OM = np.clip( np.mean(check_mu1_S0), 0, 1)
        results_OM.append(result_OM)

        '''
        Estimate PW
        '''
        # omega_2 = (P0(C) / P2(C)) * (pi_star_X1 / pi_X1)
        #           * (P1(C,X1,W) / P2(C,X1,W)) * (P2(C,X1) / P1(C,X1))
        #           * (pi_star_X2 / pi_X2),
        # with all factors evaluated on the held-out rows of data_S2.
        ratio_P0_over_P2_C1C2 = density_ratio(data_S0_cov, data_S2_train, cols_C, data_S2_test)
        ratio_P1_over_P2_WC1C2X1 = density_ratio(data_S1_train, data_S2_train, cols_CX1W, data_S2_test)
        ratio_P2_over_P1_C1C2X1 = density_ratio(data_S2_train, data_S1_train, col_feature_mu1, data_S2_test)

        pi_star_X1_over_pi_X1 = np.array( pi_star_X1(data_S2_test) / pi_X1(data_S2_test) )
        pi_star_X2_over_pi_X2 = np.array( pi_star_X2(data_S2_test) / pi_X2(data_S2_test) )

        # PW
        omega_2 = add_noise( ratio_P0_over_P2_C1C2 * (ratio_P1_over_P2_WC1C2X1 * ratio_P2_over_P1_C1C2X1) * pi_star_X1_over_pi_X1 * pi_star_X2_over_pi_X2, add_noise_TF)
        result_PW = np.clip( np.mean(data_S2_test['Y'] * omega_2), 0, 1)
        results_PW.append(result_PW)

        '''
        Estimate DML
        OM + PW - E_{S2}[omega2 check_mu2_S2] + E_{S1}[omega1 (check_mu2_S1 - check_mu1_S1)]
        '''
        # omega_2 lives on the held-out rows of data_S2, so check_mu2 has to be
        # evaluated on those same rows. Pairing it with training-fold values
        # mixes unrelated rows and only even broadcasts when L == 2.
        check_mu2_S2_test = compute_check_mu2(col_feature_mu2, mu2_model, data_S2_test)

        # omega_1 := (P0(C) / P1(C)) * (pi_star_X1 / pi_X1) on the held-out rows of data_S1
        ratio_P0_over_P1_C1C2 = density_ratio(data_S0_cov, data_S1_train, cols_C, data_S1_test)
        pi_star_X1_over_pi_X1 = np.array( pi_star_X1(data_S1_test) / pi_X1(data_S1_test) )
        omega_1 = add_noise(ratio_P0_over_P1_C1C2 * pi_star_X1_over_pi_X1, add_noise_TF)

        # Both regressions evaluated on the rows omega_1 refers to
        check_mu2_S1_test = compute_check_mu2(col_feature_mu2, mu2_model, data_S1_test)
        check_mu1_S1_test = compute_check_mu1(col_feature_mu1, mu1_model, data_S1_test)

        result_DML = np.clip((result_OM + result_PW) - np.mean(omega_2 * check_mu2_S2_test) + np.mean(omega_1 * (check_mu2_S1_test - check_mu1_S1_test)), 0, 1)
        results_DML.append(result_DML)

    return np.mean(results_OM), np.mean(results_PW), np.mean(results_DML)

def performance(truth, est_OM, est_PW, est_DML):
    """Collect the three estimates and their absolute errors against `truth`.

    Returns:
        (table_data, error_data): `table_data` maps 'Truth', 'OM', 'PW' and
        'DML' to the given values; `error_data` maps 'OM', 'PW' and 'DML' to
        |truth - estimate|.
    """
    estimates = {'OM': est_OM, 'PW': est_PW, 'DML': est_DML}
    table_data = {'Truth': truth, **estimates}
    error_data = {name: np.abs(truth - value) for name, value in estimates.items()}
    return table_data, error_data

# @title Results

# Experiment settings: sample sizes, number of repetitions, folds and noise switch.
n_list = [500, 1000, 2000]
rounds_simulations = 50
seednum_idx = 1
L = 2
add_noise_TF = True

METHODS = ['OM', 'PW', 'DML']

# One seed per repetition; the same seeds are reused for every sample size.
seednum_list = np.random.randint(1000000, size=rounds_simulations)
avg_acc = {method: [] for method in METHODS}
ci_acc = {method: [] for method in METHODS}

for n in n_list:
  avg_acc_at_n = {method: [] for method in METHODS}
  for seednum in seednum_list:
    # Draw n rows from each domain with the repetition's seed.
    data_S0_ = data_S0.sample(n, random_state=seednum)
    data_S1_ = data_S1.sample(n, random_state=seednum)
    data_S2_ = data_S2.sample(n, random_state=seednum)
    data_S0_cov_ = data_S0_[['C1','C2','C3','C4']]

    # The reference value is the mean of Y in the sampled target-domain rows.
    truth = np.mean(data_S0_['Y'])
    est_OM, est_PW, est_DML = evaluate_DML(data_S0_cov_, data_S1_, data_S2_, seednum, L, add_noise_TF)

    table_data, error_data = performance(truth, est_OM, est_PW, est_DML)
    for method in METHODS:
      avg_acc_at_n[method].append( error_data[method] )

    seednum_idx += 1

  # Mean absolute error per method and 1.96 times the standard deviation of the
  # per-seed errors (not divided by the square root of the number of rounds).
  for method in METHODS:
    mean = np.mean(avg_acc_at_n[method])
    margin_of_error = 1.96*np.std(np.array(avg_acc_at_n[method]), axis=0)
    avg_acc[method].append(mean)
    ci_acc[method].append(margin_of_error)

mean_OM, err_OM = avg_acc['OM'], ci_acc['OM']
mean_PW, err_PW = avg_acc['PW'], ci_acc['PW']
mean_DML, err_DML = avg_acc['DML'], ci_acc['DML']

# Error-bar plot of the mean error against the sample size, one line per method.
errorbar_style = dict(marker='o', capsize=10, markersize=16, linewidth=5, elinewidth=3)
plt.figure(figsize=(12, 10))  # 12 inches wide, 10 inches tall
plt.errorbar(n_list, mean_OM, yerr=err_OM, label='OM', **errorbar_style)
plt.errorbar(n_list, mean_PW, yerr=err_PW, label='PW', **errorbar_style)
plt.errorbar(n_list, mean_DML, yerr=err_DML, label='DML', **errorbar_style)
plt.xticks(ticks=n_list, labels=n_list, size=35)
plt.yticks(size=45)
plt.legend(prop={'size': 30})
plt.grid(False)
plt.show()

