'error_dse_out' = 0, 'error_arbe_out' = 0, 'error_tmle_linear_out' = 0, 'error_tmle_ensemble_out' = 0,
'sd_dse_in' = 0, 'sd_arbe_in' = 0, 'sd_tmle_linear_in' = 0, 'sd_tmle_ensemble_in' = 0,
'sd_dse_out' = 0, 'sd_arbe_out' = 0, 'sd_tmle_linear_out' = 0, 'sd_tmle_ensemble_out' = 0)
# Per-DGP absolute ATE errors for each estimator, on the training
# (in-sample) and test (out-of-sample) splits; filled in by the
# simulation loop below.
eps.dse.train <- numeric(10)
eps.dse.test <- numeric(10)
eps.arbe.train <- numeric(10)
eps.arbe.test <- numeric(10)
eps.tmle.linear.train <- numeric(10)
eps.tmle.linear.test <- numeric(10)
eps.tmle.ensemble.train <- numeric(10)
eps.tmle.ensemble.test <- numeric(10)
# Experiment for simulation: for each of the num simulated DGPs, estimate
# the ATE with four semiparametric estimators (DSE, ARBE, TMLE-linear,
# TMLE-ensemble) and record the absolute error against the known true ATE
# on both the train (in-sample) and test (out-of-sample) splits.
for (dgp in 1:num){
print(dgp)
data <- read_csv(paste0("sim/sim", dgp, ".csv"))
# split dataset: ~90% train / 10% test, fixed seed for reproducibility
set.seed(10)
sample <- sample(c(TRUE, FALSE), replace=TRUE, nrow(data), prob=c(0.9,0.1))
train <- data[sample, ]
test <- data[!sample, ]
# reformat data -- assumed column layout: 1 = factual outcome,
# 2 = counterfactual outcome, 3 = treatment indicator, 4+ = covariates
# (TODO confirm against the sim CSV generator)
y.train <- unlist(train[1])
y.test <- unlist(test[1])
y_count.train <- unlist(train[2])
y_count.test <- unlist(test[2])
treat.train <- unlist(train[3])
treat.test <- unlist(test[3])
x.train <- as.matrix(train[4:dim(train)[2]])
x.test <- as.matrix(test[4:dim(test)[2]])
# true ATE: (factual - counterfactual), signed by treatment arm
# (2*treat-1 maps treatment {0,1} to {-1,+1})
ate.true.train <- mean((y.train-y_count.train)*(2*treat.train-1))
ate.true.test <- mean((y.test-y_count.test)*(2*treat.test-1))
# Double selection estimator (DSE): residualize outcome and treatment on
# lasso / logistic-lasso fits, then regress residual on residual
set.seed(10)
out.mod <- rlasso(y.train~x.train)
treat.mod <- rlassologit(treat.train ~ x.train)
out.res.train <- y.train - predict(out.mod, x.train)
out.res.test <- y.test - predict(out.mod, x.test)
treat.res.train <- treat.train - predict(treat.mod, x.train)
treat.res.test <- treat.test - predict(treat.mod, x.test)
ate.dse.train <- sum(out.res.train * treat.res.train)/sum(treat.res.train^2)
ate.dse.test <- sum(out.res.test * treat.res.test)/sum(treat.res.test^2)
eps.dse.train[dgp] <- abs(ate.dse.train - ate.true.train)
eps.dse.test[dgp] <- abs(ate.dse.test - ate.true.test)
print(eps.dse.train[dgp])
print(eps.dse.test[dgp])
# Approximately residual balancing estimators (ARBE); helper returns
# c(train estimate, test estimate)
set.seed(10)
ate.arbe <- residualBalance.ate(x.train, y.train, x.test, y.test, treat.train, treat.test)
ate.arbe.train <- ate.arbe[1]
ate.arbe.test <- ate.arbe[2]
eps.arbe.train[dgp] <- abs(ate.arbe.train - ate.true.train)
eps.arbe.test[dgp] <- abs(ate.arbe.test - ate.true.test)
print(eps.arbe.train[dgp])
print(eps.arbe.test[dgp])
# Targeted maximum likelihood estimator - linear SuperLearner library
set.seed(10)
sl_libs <- c('SL.glmnet', 'SL.glm')
ate.linear.tmle <- TMLE(y.train, y.test, train[3:dim(train)[2]], test[3:dim(test)[2]], sl_libs)
ate.tmle.linear.train <- ate.linear.tmle[1]
ate.tmle.linear.test <- ate.linear.tmle[2]
eps.tmle.linear.train[dgp] <- abs(ate.tmle.linear.train - ate.true.train)
eps.tmle.linear.test[dgp] <- abs(ate.tmle.linear.test - ate.true.test)
print(eps.tmle.linear.train[dgp])
print(eps.tmle.linear.test[dgp])
# Targeted maximum likelihood estimator - ensemble SuperLearner library
set.seed(10)
sl_libs <- c('SL.glmnet', 'SL.xgboost')
ate.ensemble.tmle <- TMLE(y.train, y.test, train[3:dim(train)[2]], test[3:dim(test)[2]], sl_libs)
ate.tmle.ensemble.train <- ate.ensemble.tmle[1]
ate.tmle.ensemble.test <- ate.ensemble.tmle[2]
eps.tmle.ensemble.train[dgp] <- abs(ate.tmle.ensemble.train - ate.true.train)
# BUG FIX: the test-set error previously subtracted the TRAIN estimate
# (ate.tmle.ensemble.train), overstating/understating the out-of-sample error
eps.tmle.ensemble.test[dgp] <- abs(ate.tmle.ensemble.test - ate.true.test)
print(eps.tmle.ensemble.train[dgp])
print(eps.tmle.ensemble.test[dgp])
}
# Aggregate across the num simulated DGPs: mean absolute ATE error per
# estimator plus its Monte Carlo standard error, in-sample ("_in") and
# out-of-sample ("_out").
results['error_dse_in'] <- mean(eps.dse.train)
results['error_dse_out'] <- mean(eps.dse.test)
results['error_arbe_in'] <- mean(eps.arbe.train)
results['error_arbe_out'] <- mean(eps.arbe.test)
results['error_tmle_linear_in'] <- mean(eps.tmle.linear.train)
results['error_tmle_linear_out'] <- mean(eps.tmle.linear.test)
results['error_tmle_ensemble_in'] <- mean(eps.tmle.ensemble.train)
results['error_tmle_ensemble_out'] <- mean(eps.tmle.ensemble.test)
# Standard error of the mean: sd / sqrt(num).
results['sd_dse_in'] <- sqrt(var(eps.dse.train) / num)
results['sd_dse_out'] <- sqrt(var(eps.dse.test) / num)
results['sd_arbe_in'] <- sqrt(var(eps.arbe.train) / num)
results['sd_arbe_out'] <- sqrt(var(eps.arbe.test) / num)
results['sd_tmle_linear_in'] <- sqrt(var(eps.tmle.linear.train) / num)
results['sd_tmle_linear_out'] <- sqrt(var(eps.tmle.linear.test) / num)
results['sd_tmle_ensemble_in'] <- sqrt(var(eps.tmle.ensemble.train) / num)
results['sd_tmle_ensemble_out'] <- sqrt(var(eps.tmle.ensemble.test) / num)
# Display and persist the summary.
results
saveRDS(results, file='semipara_sim.rds')
# --- interactive console history: quick sanity checks and re-runs ---
# NOTE(review): this reads the ACIC results file before the ACIC loop in
# this file has run; presumably the file existed from an earlier session.
readRDS('semipara_acic.rds')
# Scratch check of indexing into a DGP-id vector (throwaway).
dgp = c(1, 2, 3, 4, 5, 7, 9, 11, 13, 15)
dgp[1]
dgp[2]
dgp[10]
# Re-runs of the ACIC benchmark script, echoing each statement.
source("~/Documents/Purdue/sparse_stonet/code/benchmark/semipara_acic.R", echo=TRUE)
source("~/Documents/Purdue/sparse_stonet/code/benchmark/semipara_acic.R", echo=TRUE)
# --- ACIC benchmark setup: dependencies, helpers, and accumulators ---
#setwd("~/Documents/Purdue/sparse_stonet/code/benchmark")
#install.packages('hdm')
library(hdm)
library(readr)
source("residual.balance.R")
source("TMLE.R")
num <- 10
# Named summary vector: mean errors and standard errors per estimator,
# in-sample ("_in") and out-of-sample ("_out").
results <- setNames(
  rep(0, 16),
  c('error_dse_in', 'error_arbe_in', 'error_tmle_linear_in', 'error_tmle_ensemble_in',
    'error_dse_out', 'error_arbe_out', 'error_tmle_linear_out', 'error_tmle_ensemble_out',
    'sd_dse_in', 'sd_arbe_in', 'sd_tmle_linear_in', 'sd_tmle_ensemble_in',
    'sd_dse_out', 'sd_arbe_out', 'sd_tmle_linear_out', 'sd_tmle_ensemble_out')
)
# Per-DGP absolute errors, filled by the ACIC loop.
eps.dse.train <- numeric(10)
eps.dse.test <- numeric(10)
eps.arbe.train <- numeric(10)
eps.arbe.test <- numeric(10)
eps.tmle.linear.train <- numeric(10)
eps.tmle.linear.test <- numeric(10)
eps.tmle.ensemble.train <- numeric(10)
eps.tmle.ensemble.test <- numeric(10)
# Experiment for ACIC: ate.true holds the known ATE of each of the 16
# ACIC settings; dgp selects which settings to run, in this order.
ate.true <- c(0.2, 0.8, -0.8, 2.1, -0.3429, -1.1039, 0, -1.432, 12.62, 9.134, 10.77, -3.159, -0.8486, 61.11, -0.16058, 1)
dgp <- c(16, 13, 5, 15, 8, 10, 3, 12, 2, 7)
# ACIC experiment loop: for each selected DGP id, estimate the ATE with
# DSE, ARBE, and linear TMLE; errors are measured against the known
# ate.true[dgp[i]] on both the train and test splits.
for (i in c(1:num)){
print(dgp[i])
data <- read_csv(paste0("../raw_data/acic/acic_homo", dgp[i], ".csv"))
# normalize the ACIC column names (Y -> y, A -> treat)
colnames(data)[which(names(data) == 'Y')] <- 'y'
colnames(data)[which(names(data) == 'A')] <- 'treat'
# split dataset: ~66% train / 34% test, fixed seed
set.seed(10)
sample <- sample(c(TRUE, FALSE),  replace=TRUE, nrow(data), prob=c(0.66,0.34))
train <- data[sample, ]
test <- data[!sample, ]
# reformat data
y.train <- unlist(train['y'])
y.test <- unlist(test['y'])
treat.train <- unlist(train['treat'])
treat.test <- unlist(test['treat'])
# NOTE: x.test intentionally selects by colnames(train); both splits share
# the same columns, so this is equivalent to using colnames(test)
x.train <- as.matrix(train[, !(colnames(train) %in% c('y', 'treat'))])
x.test <- as.matrix(test[, !(colnames(train) %in% c('y', 'treat'))])
# Double selection estimator (DSE): residual-on-residual regression
set.seed(10)
out.mod <- rlasso(y.train~x.train)
treat.mod <- rlassologit(treat.train ~ x.train)
out.res.train <- y.train - predict(out.mod, x.train)
out.res.test <- y.test - predict(out.mod, x.test)
treat.res.train <- treat.train - predict(treat.mod, x.train)
treat.res.test <- treat.test - predict(treat.mod, x.test)
ate.dse.train <- sum(out.res.train * treat.res.train)/sum(treat.res.train^2)
ate.dse.test <- sum(out.res.test * treat.res.test)/sum(treat.res.test^2)
eps.dse.train[i] <- abs(ate.dse.train - ate.true[dgp[i]])
eps.dse.test[i] <- abs(ate.dse.test - ate.true[dgp[i]])
print(eps.dse.train[i])
print(eps.dse.test[i])
# Approximately residual balancing estimators (ARBE)
set.seed(10)
ate.arbe <- residualBalance.ate(x.train, y.train, x.test, y.test, treat.train, treat.test)
ate.arbe.train <- ate.arbe[1]
ate.arbe.test <- ate.arbe[2]
eps.arbe.train[i] <- abs(ate.arbe.train - ate.true[dgp[i]])
# consistency fix: was ate.true[dgp][i] -- same value, but confusing; use
# the same indexing form as every other eps line
eps.arbe.test[i] <- abs(ate.arbe.test - ate.true[dgp[i]])
print(eps.arbe.train[i])
print(eps.arbe.test[i])
# Targeted maximum likelihood estimators - linear (TMLE)
set.seed(10)
sl_libs <- c('SL.glmnet', 'SL.glm')
ate.linear.tmle <- TMLE(y.train, y.test, train[2:dim(train)[2]], test[2:dim(test)[2]], sl_libs)
ate.tmle.linear.train <- ate.linear.tmle[1]
ate.tmle.linear.test <- ate.linear.tmle[2]
eps.tmle.linear.train[i] <- abs(ate.tmle.linear.train - ate.true[dgp[i]])
eps.tmle.linear.test[i] <- abs(ate.tmle.linear.test - ate.true[dgp[i]])
print(eps.tmle.linear.train[i])
print(eps.tmle.linear.test[i])
# Targeted maximum likelihood estimators - ensemble (TMLE), disabled.
# NOTE(review): the commented test-error line originally subtracted the
# TRAIN estimate (ate.tmle.ensemble.train); corrected below in case this
# section is re-enabled.
# set.seed(10)
#
# sl_libs <- c('SL.glmnet', 'SL.xgboost')
# ate.ensemble.tmle <- TMLE(y.train, y.test, train[2:dim(train)[2]], test[2:dim(test)[2]], sl_libs)
# ate.tmle.ensemble.train <- ate.ensemble.tmle[1]
# ate.tmle.ensemble.test <- ate.ensemble.tmle[2]
#
# eps.tmle.ensemble.train[i] <- abs(ate.tmle.ensemble.train - ate.true[dgp[i]])
# eps.tmle.ensemble.test[i] <- abs(ate.tmle.ensemble.test - ate.true[dgp[i]])
#
# print(eps.tmle.ensemble.train[i])
# print(eps.tmle.ensemble.test[i])
}
# --- ACIC benchmark setup (repeat of the run above) ---
#setwd("~/Documents/Purdue/sparse_stonet/code/benchmark")
#install.packages('hdm')
library(hdm)
library(readr)
source("residual.balance.R")
source("TMLE.R")
num <- 10
# Named summary vector: mean errors and standard errors per estimator.
results <- setNames(
  rep(0, 16),
  c('error_dse_in', 'error_arbe_in', 'error_tmle_linear_in', 'error_tmle_ensemble_in',
    'error_dse_out', 'error_arbe_out', 'error_tmle_linear_out', 'error_tmle_ensemble_out',
    'sd_dse_in', 'sd_arbe_in', 'sd_tmle_linear_in', 'sd_tmle_ensemble_in',
    'sd_dse_out', 'sd_arbe_out', 'sd_tmle_linear_out', 'sd_tmle_ensemble_out')
)
# Per-DGP absolute errors, filled by the loop below.
eps.dse.train <- numeric(10)
eps.dse.test <- numeric(10)
eps.arbe.train <- numeric(10)
eps.arbe.test <- numeric(10)
eps.tmle.linear.train <- numeric(10)
eps.tmle.linear.test <- numeric(10)
eps.tmle.ensemble.train <- numeric(10)
eps.tmle.ensemble.test <- numeric(10)
# Experiment for ACIC: known true ATEs for the 16 settings, and the
# subset of settings to run.
ate.true <- c(0.2, 0.8, -0.8, 2.1, -0.3429, -1.1039, 0, -1.432, 12.62, 9.134, 10.77, -3.159, -0.8486, 61.11, -0.16058, 1)
dgp <- c(16, 13, 5, 15, 8, 10, 3, 12, 2, 7)
# ACIC experiment loop (repeat run): DSE, ARBE, and linear TMLE per DGP;
# errors measured against the known ate.true[dgp[i]].
for (i in c(1:num)){
print(dgp[i])
data <- read_csv(paste0("../raw_data/acic/acic_homo", dgp[i], ".csv"))
# normalize the ACIC column names (Y -> y, A -> treat)
colnames(data)[which(names(data) == 'Y')] <- 'y'
colnames(data)[which(names(data) == 'A')] <- 'treat'
# split dataset: ~66% train / 34% test, fixed seed
set.seed(10)
sample <- sample(c(TRUE, FALSE),  replace=TRUE, nrow(data), prob=c(0.66,0.34))
train <- data[sample, ]
test <- data[!sample, ]
# reformat data
y.train <- unlist(train['y'])
y.test <- unlist(test['y'])
treat.train <- unlist(train['treat'])
treat.test <- unlist(test['treat'])
# NOTE: x.test selects by colnames(train); both splits share columns
x.train <- as.matrix(train[, !(colnames(train) %in% c('y', 'treat'))])
x.test <- as.matrix(test[, !(colnames(train) %in% c('y', 'treat'))])
# Double selection estimator (DSE)
set.seed(10)
out.mod <- rlasso(y.train~x.train)
treat.mod <- rlassologit(treat.train ~ x.train)
out.res.train <- y.train - predict(out.mod, x.train)
out.res.test <- y.test - predict(out.mod, x.test)
treat.res.train <- treat.train - predict(treat.mod, x.train)
treat.res.test <- treat.test - predict(treat.mod, x.test)
ate.dse.train <- sum(out.res.train * treat.res.train)/sum(treat.res.train^2)
ate.dse.test <- sum(out.res.test * treat.res.test)/sum(treat.res.test^2)
eps.dse.train[i] <- abs(ate.dse.train - ate.true[dgp[i]])
eps.dse.test[i] <- abs(ate.dse.test - ate.true[dgp[i]])
print(eps.dse.train[i])
print(eps.dse.test[i])
# Approximately residual balancing estimators (ARBE)
set.seed(10)
ate.arbe <- residualBalance.ate(x.train, y.train, x.test, y.test, treat.train, treat.test)
ate.arbe.train <- ate.arbe[1]
ate.arbe.test <- ate.arbe[2]
eps.arbe.train[i] <- abs(ate.arbe.train - ate.true[dgp[i]])
# consistency fix: was ate.true[dgp][i] -- same value, clearer indexing
eps.arbe.test[i] <- abs(ate.arbe.test - ate.true[dgp[i]])
print(eps.arbe.train[i])
print(eps.arbe.test[i])
# Targeted maximum likelihood estimators - linear (TMLE)
set.seed(10)
sl_libs <- c('SL.glmnet', 'SL.glm')
ate.linear.tmle <- TMLE(y.train, y.test, train[2:dim(train)[2]], test[2:dim(test)[2]], sl_libs)
ate.tmle.linear.train <- ate.linear.tmle[1]
ate.tmle.linear.test <- ate.linear.tmle[2]
eps.tmle.linear.train[i] <- abs(ate.tmle.linear.train - ate.true[dgp[i]])
eps.tmle.linear.test[i] <- abs(ate.tmle.linear.test - ate.true[dgp[i]])
print(eps.tmle.linear.train[i])
print(eps.tmle.linear.test[i])
# Targeted maximum likelihood estimators - ensemble (TMLE), disabled.
# NOTE(review): the commented test-error line originally subtracted the
# TRAIN estimate; corrected below in case this section is re-enabled.
# set.seed(10)
#
# sl_libs <- c('SL.glmnet', 'SL.xgboost')
# ate.ensemble.tmle <- TMLE(y.train, y.test, train[2:dim(train)[2]], test[2:dim(test)[2]], sl_libs)
# ate.tmle.ensemble.train <- ate.ensemble.tmle[1]
# ate.tmle.ensemble.test <- ate.ensemble.tmle[2]
#
# eps.tmle.ensemble.train[i] <- abs(ate.tmle.ensemble.train - ate.true[dgp[i]])
# eps.tmle.ensemble.test[i] <- abs(ate.tmle.ensemble.test - ate.true[dgp[i]])
#
# print(eps.tmle.ensemble.train[i])
# print(eps.tmle.ensemble.test[i])
}
# Aggregate across the num ACIC DGPs: mean absolute ATE error per
# estimator and its Monte Carlo standard error. The ensemble-TMLE entries
# stay at their initial zeros because that section is commented out in the
# loop above.
results['error_dse_in'] <- mean(eps.dse.train)
results['error_dse_out'] <- mean(eps.dse.test)
results['error_arbe_in'] <- mean(eps.arbe.train)
results['error_arbe_out'] <- mean(eps.arbe.test)
results['error_tmle_linear_in'] <- mean(eps.tmle.linear.train)
results['error_tmle_linear_out'] <- mean(eps.tmle.linear.test)
results['error_tmle_ensemble_in'] <- mean(eps.tmle.ensemble.train)
results['error_tmle_ensemble_out'] <- mean(eps.tmle.ensemble.test)
# Standard error of the mean: sd / sqrt(num).
results['sd_dse_in'] <- sqrt(var(eps.dse.train) / num)
results['sd_dse_out'] <- sqrt(var(eps.dse.test) / num)
results['sd_arbe_in'] <- sqrt(var(eps.arbe.train) / num)
results['sd_arbe_out'] <- sqrt(var(eps.arbe.test) / num)
results['sd_tmle_linear_in'] <- sqrt(var(eps.tmle.linear.train) / num)
results['sd_tmle_linear_out'] <- sqrt(var(eps.tmle.linear.test) / num)
results['sd_tmle_ensemble_in'] <- sqrt(var(eps.tmle.ensemble.train) / num)
results['sd_tmle_ensemble_out'] <- sqrt(var(eps.tmle.ensemble.test) / num)
# Display, persist, then re-run the benchmark script.
results
saveRDS(results, file='semipara_acic.rds')
source("~/Documents/Purdue/sparse_stonet/code/benchmark/semipara_acic.R", echo=TRUE)
# --- ACIC benchmark setup (third run) ---
#setwd("~/Documents/Purdue/sparse_stonet/code/benchmark")
#install.packages('hdm')
library(hdm)
library(readr)
source("residual.balance.R")
source("TMLE.R")
num <- 10
# Named summary vector: mean errors and standard errors per estimator.
results <- setNames(
  rep(0, 16),
  c('error_dse_in', 'error_arbe_in', 'error_tmle_linear_in', 'error_tmle_ensemble_in',
    'error_dse_out', 'error_arbe_out', 'error_tmle_linear_out', 'error_tmle_ensemble_out',
    'sd_dse_in', 'sd_arbe_in', 'sd_tmle_linear_in', 'sd_tmle_ensemble_in',
    'sd_dse_out', 'sd_arbe_out', 'sd_tmle_linear_out', 'sd_tmle_ensemble_out')
)
# Per-DGP absolute errors, filled by the loop below.
eps.dse.train <- numeric(10)
eps.dse.test <- numeric(10)
eps.arbe.train <- numeric(10)
eps.arbe.test <- numeric(10)
eps.tmle.linear.train <- numeric(10)
eps.tmle.linear.test <- numeric(10)
eps.tmle.ensemble.train <- numeric(10)
eps.tmle.ensemble.test <- numeric(10)
# Experiment for ACIC: known true ATEs for the 16 settings, and the
# subset of settings to run.
ate.true <- c(0.2, 0.8, -0.8, 2.1, -0.3429, -1.1039, 0, -1.432, 12.62, 9.134, 10.77, -3.159, -0.8486, 61.11, -0.16058, 1)
dgp <- c(16, 13, 5, 15, 8, 10, 3, 12, 2, 7)
# ACIC experiment loop (third run): all four estimators active, including
# ensemble TMLE; errors measured against the known ate.true[dgp[i]].
for (i in c(1:num)){
print(dgp[i])
data <- read_csv(paste0("../raw_data/acic/acic_homo", dgp[i], ".csv"))
# normalize the ACIC column names (Y -> y, A -> treat)
colnames(data)[which(names(data) == 'Y')] <- 'y'
colnames(data)[which(names(data) == 'A')] <- 'treat'
# split dataset: ~66% train / 34% test, fixed seed
set.seed(10)
sample <- sample(c(TRUE, FALSE),  replace=TRUE, nrow(data), prob=c(0.66,0.34))
train <- data[sample, ]
test <- data[!sample, ]
# reformat data
y.train <- unlist(train['y'])
y.test <- unlist(test['y'])
treat.train <- unlist(train['treat'])
treat.test <- unlist(test['treat'])
# NOTE: x.test selects by colnames(train); both splits share columns
x.train <- as.matrix(train[, !(colnames(train) %in% c('y', 'treat'))])
x.test <- as.matrix(test[, !(colnames(train) %in% c('y', 'treat'))])
# Double selection estimator (DSE)
set.seed(10)
out.mod <- rlasso(y.train~x.train)
treat.mod <- rlassologit(treat.train ~ x.train)
out.res.train <- y.train - predict(out.mod, x.train)
out.res.test <- y.test - predict(out.mod, x.test)
treat.res.train <- treat.train - predict(treat.mod, x.train)
treat.res.test <- treat.test - predict(treat.mod, x.test)
ate.dse.train <- sum(out.res.train * treat.res.train)/sum(treat.res.train^2)
ate.dse.test <- sum(out.res.test * treat.res.test)/sum(treat.res.test^2)
eps.dse.train[i] <- abs(ate.dse.train - ate.true[dgp[i]])
eps.dse.test[i] <- abs(ate.dse.test - ate.true[dgp[i]])
print(eps.dse.train[i])
print(eps.dse.test[i])
# Approximately residual balancing estimators (ARBE)
set.seed(10)
ate.arbe <- residualBalance.ate(x.train, y.train, x.test, y.test, treat.train, treat.test)
ate.arbe.train <- ate.arbe[1]
ate.arbe.test <- ate.arbe[2]
eps.arbe.train[i] <- abs(ate.arbe.train - ate.true[dgp[i]])
# consistency fix: was ate.true[dgp][i] -- same value, clearer indexing
eps.arbe.test[i] <- abs(ate.arbe.test - ate.true[dgp[i]])
print(eps.arbe.train[i])
print(eps.arbe.test[i])
# Targeted maximum likelihood estimators - linear (TMLE)
set.seed(10)
sl_libs <- c('SL.glmnet', 'SL.glm')
ate.linear.tmle <- TMLE(y.train, y.test, train[2:dim(train)[2]], test[2:dim(test)[2]], sl_libs)
ate.tmle.linear.train <- ate.linear.tmle[1]
ate.tmle.linear.test <- ate.linear.tmle[2]
eps.tmle.linear.train[i] <- abs(ate.tmle.linear.train - ate.true[dgp[i]])
eps.tmle.linear.test[i] <- abs(ate.tmle.linear.test - ate.true[dgp[i]])
print(eps.tmle.linear.train[i])
print(eps.tmle.linear.test[i])
# Targeted maximum likelihood estimators - ensemble (TMLE)
set.seed(10)
sl_libs <- c('SL.glmnet', 'SL.xgboost')
ate.ensemble.tmle <- TMLE(y.train, y.test, train[2:dim(train)[2]], test[2:dim(test)[2]], sl_libs)
ate.tmle.ensemble.train <- ate.ensemble.tmle[1]
ate.tmle.ensemble.test <- ate.ensemble.tmle[2]
eps.tmle.ensemble.train[i] <- abs(ate.tmle.ensemble.train - ate.true[dgp[i]])
eps.tmle.ensemble.test[i] <- abs(ate.tmle.ensemble.test - ate.true[dgp[i]])
print(eps.tmle.ensemble.train[i])
print(eps.tmle.ensemble.test[i])
}
# --- Twins benchmark setup (first attempt) ---
readRDS('semipara_twins.rds')
source("~/Documents/Purdue/sparse_stonet/code/benchmark/semipara_acic.R", echo=TRUE)
setwd("~/Documents/Purdue/sparse_stonet/code/benchmark")
#install.packages('hdm')
#library(devtools)
#install_github("swager/balanceHD")
library(hdm)
#library(balanceHD)
library(readr)
library(caret)
#library(performanceEstimation)
#library(tmle)
source("residual.balance.R")
source("TMLE.R")
# 3-fold cross-fitting for the Twins data
num <- 3
results <- c('ate_dse' = 0, 'ate_arbe' = 0, 'ate_tmle_linear' = 0, 'ate_tmle_ensemble' = 0,
'sd_dse' = 0, 'sd_arbe' = 0, 'sd_tmle_linear' = 0, 'sd_tmle_ensemble' = 0)
# experiment for Twins data: per-fold ATE estimates for each estimator
dse <- rep(0,3)
arbe <- rep(0,3)
tmle.linear <- rep(0,3)
tmle.ensemble <- rep(0,3)
# read data
# BUG FIX: relative to code/benchmark (the setwd() target above) the data
# lives one directory up -- the original path "raw_data/..." lacked "../",
# matching neither the corrected rerun below nor the ACIC reads.
data <- read_csv("../raw_data/twins/twins_data.csv")
# --- Twins benchmark setup (corrected rerun): deps, helpers, data ---
setwd("~/Documents/Purdue/sparse_stonet/code/benchmark")
#install.packages('hdm')
#library(devtools)
#install_github("swager/balanceHD")
library(hdm)
#library(balanceHD)
library(readr)
library(caret)
#library(performanceEstimation)
#library(tmle)
source("residual.balance.R")
source("TMLE.R")
num <- 3
# Summary slots: cross-fitted ATE and its standard error per estimator.
results <- setNames(
  rep(0, 8),
  c('ate_dse', 'ate_arbe', 'ate_tmle_linear', 'ate_tmle_ensemble',
    'sd_dse', 'sd_arbe', 'sd_tmle_linear', 'sd_tmle_ensemble')
)
# Per-fold ATE estimates for the Twins experiment.
dse <- numeric(3)
arbe <- numeric(3)
tmle.linear <- numeric(3)
tmle.ensemble <- numeric(3)
# Load the Twins data and drop the 'counter' (counterfactual) column.
data <- read_csv("../raw_data/twins/twins_data.csv")
data <- subset(data, select = -counter)
# Shuffle rows once with a fixed seed before carving out the CV folds.
set.seed(1)
data <- data[sample(seq_len(nrow(data))), ]
# Target fold size for the 3 contiguous test blocks.
size <- round(nrow(data) / 3)
# Twins experiment: 3-fold cross-fitting. Each iteration holds out one
# contiguous block of the shuffled data as the test set and records each
# estimator's test-set ATE. set.seed() is re-run before every estimator so
# each gets an identical RNG state -- do not reorder these sections.
for (cv in 1:num){
print(cv)
# split dataset: fold cv is rows [size*(cv-1)+1, size*cv], clamped to n
test.start <- size*(cv-1)+1
test.end <- min(dim(data)[1], size*cv)
test.idx <- c(test.start:test.end)
test <- data[test.idx, ]
train <- data[-test.idx, ]
# # use SMOTE to create a balanced dataset
# train <- data.frame(lapply(train, as.factor))
# train <- smote(y~., train, perc.over = 2, perc.under = 2)
# train <- train[sample(1:nrow(train)), ]
# train <- data.frame(lapply(train, function(x) as.numeric(x)-1))
# reformat data
y.train <- unlist(train$y)
y.test <- unlist(test$y)
treat.train <- unlist(train$treat)
treat.test <- unlist(test$treat)
# x.test selects by colnames(train); both splits share the same columns
x.train <- as.matrix(train[, !(colnames(train) %in% c('y', 'treat'))])
x.test <- as.matrix(test[, !(colnames(train) %in% c('y', 'treat'))])
# Double selection estimator (DSE). The outcome model uses rlassologit
# here (unlike the continuous-outcome experiments) because the Twins
# outcome y is binary.
set.seed(4)
out.mod <- rlassologit(y.train~x.train)
treat.mod <- rlassologit(treat.train ~ x.train)
out.res.train <- y.train - predict(out.mod, x.train)
out.res.test <- y.test - predict(out.mod, x.test)
treat.res.train <- treat.train - predict(treat.mod, x.train)
treat.res.test <- treat.test - predict(treat.mod, x.test)
# residual-on-residual regression coefficient = ATE estimate
ate.dse.train <- sum(out.res.train * treat.res.train)/sum(treat.res.train^2)
ate.dse.test <- sum(out.res.test * treat.res.test)/sum(treat.res.test^2)
print(ate.dse.train)
print(ate.dse.test)
dse[cv] <- ate.dse.test
# Approximately residual balancing estimators (ARBE)
set.seed(4)
ate.arbe <- residualBalance.ate(x.train, y.train, x.test, y.test, treat.train, treat.test, binary=TRUE, scale.X = FALSE)
ate.arbe.train <- ate.arbe[1]
ate.arbe.test <- ate.arbe[2]
print(ate.arbe.train)
print(ate.arbe.test)
arbe[cv] <- ate.arbe.test
# Targeted maximum likelihood estimators - linear (TMLE)
# NOTE(review): sl_libs is passed positionally AFTER the named binary=TRUE
# argument -- verify against the TMLE() signature in TMLE.R that it lands
# in the intended parameter slot.
set.seed(4)
sl_libs <- c('SL.glmnet', 'SL.glm')
ate.linear.tmle <- TMLE(y.train, y.test, train[2:dim(train)[2]], test[2:dim(test)[2]], binary=TRUE, sl_libs)
ate.tmle.linear.train <- ate.linear.tmle[1]
ate.tmle.linear.test <- ate.linear.tmle[2]
print(ate.tmle.linear.train)
print(ate.tmle.linear.test)
tmle.linear[cv] <- ate.tmle.linear.test
# Targeted maximum likelihood estimators - ensemble (TMLE)
set.seed(4)
sl_libs <- c('SL.glmnet', 'SL.xgboost')
ate.ensemble.tmle <- TMLE(y.train, y.test, train[2:dim(train)[2]], test[2:dim(test)[2]], binary=TRUE, sl_libs)
ate.tmle.ensemble.train <- ate.ensemble.tmle[1]
ate.tmle.ensemble.test <- ate.ensemble.tmle[2]
print(ate.tmle.ensemble.train)
print(ate.tmle.ensemble.test)
tmle.ensemble[cv] <- ate.tmle.ensemble.test
}
# Patch the saved Twins results with the manually recorded TMLE numbers,
# then re-save and spot-check every saved benchmark file.
# BUG FIXES vs the original console history: (1) 'sd_tmle_ensembler' typo
# appended a 9th element instead of setting 'sd_tmle_ensemble'; (2)
# result[:-1] is Python, not R, and made the file unparseable; (3)
# readRDS(result, file=...) is an invalid call (readRDS takes only the
# file); (4) saves went to inconsistent paths ('./benchmark/...' and the
# hidden '.semipara_twins.rds') -- presumably 'semipara_twins.rds' was
# intended, matching every other read in this file (TODO confirm).
result <- readRDS('semipara_twins.rds')
result['ate_tmle_linear'] <- -0.11034189
result['ate_tmle_ensemble'] <- -0.129035142
result['sd_tmle_linear'] <- 0.072694750
# 0.050747825 is the value the history ultimately assigned to the
# correctly spelled key, so it is kept as the final value here.
result['sd_tmle_ensemble'] <- 0.050747825
# Keep only the 8 canonical entries (drops any stray appended element).
result <- result[1:8]
saveRDS(result, file='semipara_twins.rds')
result
# Final sanity check of all saved benchmark result files.
readRDS('semipara_sim.rds')
readRDS('semipara_acic.rds')
readRDS('semipara_twins.rds')
readRDS('semipara_tcga.rds')
