# install.packages('SuperLearner')
library(SuperLearner)
library(rsample)
library(tidyverse)
library(ggplot2)
library(foreach)
library(doParallel)

dr_sequential <- function(RY, R, covariates, cid, K=2, sl_lib=c("SL.glmnet", 
                                                             "SL.mean", "SL.ranger")){
  epsilon <- 0.2
  n <- length(R)
  G <- max(cid)
  # sample(unique(cid), size = G, replace = TRUE)
  # ngs <- table(cid)
  split <- sample(1:K, G, replace=TRUE)
  pis <- numeric(n)
  mus <- numeric(n)
  for (k in 1:K){
    train_g <- (1:G)[split == k]
    test_g <- (1:G)[split != k]
    train_ind <- which(cid %in% train_g)
    test_ind <- which(cid %in% test_g)
    pi_mod = SuperLearner(Y = R[train_ind],
                          X = covariates[train_ind,], newX = covariates[test_ind,],
                          SL.library = sl_lib, family=binomial())
    temp <- pi_mod$SL.predict
    pis[test_ind] <- ifelse(temp>epsilon, temp, epsilon)
    # pis[test_ind] <- pi_mod$SL.predict
    train_ind_nonmiss <- which(cid %in% train_g & R==1)
    mu_mod = SuperLearner(Y = RY[train_ind_nonmiss],
                          X = covariates[train_ind_nonmiss,], newX = covariates[test_ind,],
                          SL.library = sl_lib)
    mus[test_ind] <- mu_mod$SL.predict
  }
  ifs <- R*(RY-mus)/pis + mus
  theta_hat <- mean(ifs)
  sd1 <- sd(ifs)/sqrt(n)
  return(list(est=theta_hat,sd_iid=sd1))
}



simulate_missingness <- function(X, seed = 42) {
  set.seed(seed)
  n <- dim(X)[1]
  d <- dim(X)[2]
  
  # Generate coefficients with scaling 1/sqrt(d)
  # beta <- (2*rbinom(d, 1, 0.5)-1)*10
  beta <- rnorm(d, mean = 0, sd = 2)
  
  # Compute probabilities using the logistic function
  logits <- X %*% beta
  probs <- 1 / (1 + exp(-logits))  # Sigmoid function
  
  # Simulate binary response variable Y ~ Bernoulli(probs)
  R <- rbinom(n, size = 1, prob = probs)
  
  return(R)
}



data_sequential <- read_csv("data_sequential.csv")
data_sequential$cid <- as.numeric(factor(data_sequential$cid))
data_sequential$S65 <- factor(data_sequential$S65)
data_sequential$X <- factor(data_sequential$X)
S <- as.matrix(data_sequential[,paste0('S',1:64)])
# diffs <- numeric(100)
# for (i in 1:100){
#   R <- simulate_missingness(W, seed=i)
#   t1 <- mean(data_cluster$creativity)-mean(data_cluster$creativity[R==1])
#   t2 <- mean(data_cluster$toxicity)-mean(data_cluster$toxicity[R==1])
#   t3 <- mean(data_cluster$humor)-mean(data_cluster$humor[R==1])
#   t4 <- mean(data_cluster$quality)-mean(data_cluster$quality[R==1])
#   diffs[i] <- abs(t1) + abs(t2)+abs(t3)+abs(t4)
# }
# max(diffs)

# Generate missingness
R <- simulate_missingness(S, seed=1)
# Original seed = 1

# Check bias
t1 <- mean(data_sequential$creativity)-mean(data_sequential$creativity[R==1])
t2 <- mean(data_sequential$toxicity)-mean(data_sequential$toxicity[R==1])
t3 <- mean(data_sequential$humor)-mean(data_sequential$humor[R==1])
t4 <- mean(data_sequential$quality)-mean(data_sequential$quality[R==1])
mean(data_sequential$humor)
c(mean(data_sequential$humor[R==1])-1.96*sd(data_sequential$humor[R==1])/sqrt(length(data_sequential$humor[R==1])), mean(data_sequential$humor[R==1])+1.96*sd(data_sequential$humor[R==1])/sqrt(length(data_sequential$humor[R==1])))

mean(data_sequential$creativity)
c(mean(data_sequential$creativity[R==1])-1.96*sd(data_sequential$creativity[R==1])/sqrt(length(data_sequential$creativity[R==1])), mean(data_sequential$creativity[R==1])+1.96*sd(data_sequential$creativity[R==1])/sqrt(length(data_sequential$creativity[R==1])))

mean(data_sequential$quality)
c(mean(data_sequential$quality[R==1])-1.96*sd(data_sequential$quality[R==1])/sqrt(length(data_sequential$quality[R==1])), mean(data_sequential$quality[R==1])+1.96*sd(data_sequential$quality[R==1])/sqrt(length(data_sequential$quality[R==1])))

mean(data_sequential$toxicity)
c(mean(data_sequential$toxicity[R==1])-1.96*sd(data_sequential$toxicity[R==1])/sqrt(length(data_sequential$toxicity[R==1])), mean(data_sequential$toxicity[R==1])+1.96*sd(data_sequential$toxicity[R==1])/sqrt(length(data_sequential$toxicity[R==1])))

# DR estimation
covariates <- data_sequential[, c(paste0('S', 1:65), 'X')]

set.seed(521)
res_sequential_quality <- dr_sequential(R*data_sequential$quality, R, covariates, data_sequential$cid)
c(res_sequential_quality$est-1.96*res_sequential_quality$sd_iid, res_sequential_quality$est+1.96*res_sequential_quality$sd_iid)
# 0.6433523 0.6480195
# 

# R <- simulate_missingness(S, seed=100)
set.seed(521)
res_sequential_humor <- dr_sequential(R*data_sequential$humor, R, covariates, data_sequential$cid)
c(res_sequential_humor$est-1.96*res_sequential_humor$sd_iid, res_sequential_humor$est+1.96*res_sequential_humor$sd_iid)
# 0.1855584 0.1895143 seed=1
# 0.1820259 0.1859772 seed=100
# # nuisances <- dr_sequential(R*data_sequential$humor, R, covariates, data_sequential$cid)
# # epsilon <- 0.28
# # pis <- ifelse(nuisances$pis>epsilon, nuisances$pis, epsilon)
# # mus <- nuisances$mus
# # ifs <- R*(RY-mus)/pis + mus
# # theta_hat <- mean(ifs)
# # sd1 <- sd(ifs)/sqrt(n)
# # sum_sequential <- aggregate(ifs, list(cid = cid), sum)$x
# # sd2 <- sqrt(sum(sum_sequential^2)/(n^2)-sum(ngs^2)*theta_hat^2/(n^2))
# # mean(ifs)-1.96*sd1
# # mean(ifs)-1.96*sd2
# # mean(data_sequential$humor)
# 
set.seed(666)
res_sequential_creativity <- dr_sequential(R*data_sequential$creativity, R, covariates, data_sequential$cid)
c(res_sequential_creativity$est-1.96*res_sequential_creativity$sd_iid, res_sequential_creativity$est+1.96*res_sequential_creativity$sd_iid)
# 0.3812045 0.3861917
# 
# 
set.seed(521)
res_sequential_toxicity <- dr_sequential(R*data_sequential$toxicity, R, covariates, data_sequential$cid)
c(res_sequential_toxicity$est-1.96*res_sequential_toxicity$sd_iid, res_sequential_toxicity$est+1.96*res_sequential_toxicity$sd_iid)
# 0.1229990 0.1258389

# Cluster Bootstrap
B <- 50
num_batch <- 10
boot_humor <- matrix(0, ncol=B, nrow=num_batch)
# boot_humor <- numeric(B)
data_sequential$R <- R
df <- data_sequential %>% nest(-cid)
G <- max(data_sequential$cid)
ngs <- table(data_sequential$cid)
set.seed(666)


num_cores <- detectCores() - 1  # Use all but one core
cl <- makeCluster(num_cores)  # Create cluster
registerDoParallel(cl)  # Register the parallel backend
on.exit(stopCluster(cl), add = TRUE)
for (i in 1:B){
  bs <- bootstraps(df, times = num_batch)
  boot_humor[,i] <- foreach(i=1:num_batch, .combine=c, .packages = c("tidyverse", 'rsample','SuperLearner')) %dopar%{
    boot_data <- as_tibble(bs$splits[[i]]) %>% unnest(cols=c(data))
    new_cid <- rep(1:G, ngs[analysis(bs$splits[[i]])$cid])
    temp <- dr_sequential(boot_data$R*boot_data$humor, boot_data$R, boot_data[,c(paste0('S', 1:65), 'X')], new_cid)$est
    temp
  }
}

write.csv(as.data.frame(boot_humor), file = "boot_humor.csv", row.names = FALSE)

# boot_humor <- foreach(i=1:B, .combine=c, .packages = c("tidyverse", 'rsample','SuperLearner')) %dopar%{
#   boot_data <- as_tibble(bs$splits[[i]]) %>% unnest(cols=c(data))
#   new_cid <- rep(1:G, ngs[analysis(bs$splits[[i]])$cid])
#   temp <- dr_sequential(boot_data$R*boot_data$humor, boot_data$R, boot_data[,c(paste0('S', 1:65), 'X')], new_cid)
#   temp
# }
# stopCluster(cl)
write.csv(as.data.frame(boot_humor), file = "boot_humor.csv", row.names = FALSE)


# Do similar bootstrap for other outcomes




norm_quantile <- qnorm(0.975)
# Quality

boot_quality <- read_csv("boot_quality.csv")
sd_quality <- sd(as.matrix(boot_quality))
# 0.6429779 0.6483939
data_plot <- data.frame(
  Parameter = rep(c("Unadjusted", "Adjusted(i.i.d)", "Adjusted(sequential)"), each = 1),
  Estimate = c(mean(data_sequential$quality[R==1]), res_sequential_quality$est, res_sequential_quality$est),       # Estimated values
  Lower = c(mean(data_sequential$quality[R==1])-norm_quantile*sd(data_sequential$quality[R==1])/sqrt(length(data_sequential$quality[R==1])), res_sequential_quality$est-norm_quantile*res_sequential_quality$sd_iid, res_sequential_quality$est-norm_quantile*sd_quality),          # Lower bound of CI
  Upper = c(mean(data_sequential$quality[R==1])+norm_quantile*sd(data_sequential$quality[R==1])/sqrt(length(data_sequential$quality[R==1])), res_sequential_quality$est+norm_quantile*res_sequential_quality$sd_iid, res_sequential_quality$est+norm_quantile*sd_quality)           # Upper bound of CI
)

true_value <- mean(data_sequential$quality)  # Adjust this as needed
data_plot$Parameter <- factor(data_plot$Parameter, levels = c("Adjusted(sequential)", "Adjusted(i.i.d)", "Unadjusted"))


# Create the plot
ggplot(data_plot, aes(x = Parameter, y = Estimate)) +
  geom_point(size = 3, color = "blue") +  # Plot estimates
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.2, color = "blue") +  # CI bars
  geom_hline(yintercept = true_value, linetype = "dashed", color = "red", size = 1) +  # True parameter line
  labs(y = "Quality",
       x = "Types of CI") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.title = element_text(size = 14))


# Humor
boot_humor <- read_csv("boot_humor.csv")
sd_humor <- sd(as.matrix(boot_humor))

data_plot <- data.frame(
  Parameter = rep(c("Unadjusted", "Adjusted(i.i.d)", "Adjusted(sequential)"), each = 1),
  Estimate = c(mean(data_sequential$humor[R==1]), res_sequential_humor$est, res_sequential_humor$est),       # Estimated values
  Lower = c(mean(data_sequential$humor[R==1])-norm_quantile*sd(data_sequential$humor[R==1])/sqrt(length(data_sequential$humor[R==1])), res_sequential_humor$est-norm_quantile*res_sequential_humor$sd_iid, res_sequential_humor$est-norm_quantile*sd_humor),          # Lower bound of CI
  Upper = c(mean(data_sequential$humor[R==1])+norm_quantile*sd(data_sequential$humor[R==1])/sqrt(length(data_sequential$humor[R==1])), res_sequential_humor$est+norm_quantile*res_sequential_humor$sd_iid, res_sequential_humor$est+norm_quantile*sd_humor)           # Upper bound of CI
)

true_value <- mean(data_sequential$humor)  # Adjust this as needed
data_plot$Parameter <- factor(data_plot$Parameter, levels = c("Adjusted(sequential)", "Adjusted(i.i.d)", "Unadjusted"))


ggplot(data_plot, aes(x = Parameter, y = Estimate)) +
  geom_point(size = 3, color = "blue") +  # Plot estimates
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.2, color = "blue") +  # CI bars
  geom_hline(yintercept = true_value, linetype = "dashed", color = "red", size = 1) +  # True parameter line
  labs(y = "Humor",
       x = "Types of CI") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.title = element_text(size = 14))


# Creativity
boot_creativity <- read_csv("boot_creativity.csv")
sd_creativity <- sd(as.matrix(boot_creativity))
# 0.3806965 0.3866997
data_plot <- data.frame(
  Parameter = rep(c("Unadjusted", "Adjusted(i.i.d)", "Adjusted(sequential)"), each = 1),
  Estimate = c(mean(data_sequential$creativity[R==1]), res_sequential_creativity$est, res_sequential_creativity$est),       # Estimated values
  Lower = c(mean(data_sequential$creativity[R==1])-norm_quantile*sd(data_sequential$creativity[R==1])/sqrt(length(data_sequential$creativity[R==1])), res_sequential_creativity$est-norm_quantile*res_sequential_creativity$sd_iid, res_sequential_creativity$est-norm_quantile*sd_creativity),          # Lower bound of CI
  Upper = c(mean(data_sequential$creativity[R==1])+norm_quantile*sd(data_sequential$creativity[R==1])/sqrt(length(data_sequential$creativity[R==1])), res_sequential_creativity$est+norm_quantile*res_sequential_creativity$sd_iid, res_sequential_creativity$est+norm_quantile*sd_creativity)           # Upper bound of CI
)

true_value <- mean(data_sequential$creativity)  # Adjust this as needed
data_plot$Parameter <- factor(data_plot$Parameter, levels = c("Adjusted(sequential)", "Adjusted(i.i.d)", "Unadjusted"))



ggplot(data_plot, aes(x = Parameter, y = Estimate)) +
  geom_point(size = 3, color = "blue") +  # Plot estimates
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.2, color = "blue") +  # CI bars
  geom_hline(yintercept = true_value, linetype = "dashed", color = "red", size = 1) +  # True parameter line
  labs(y = "Creativity",
       x = "Types of CI") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.title = element_text(size = 14))



# toxicity
boot_toxicity <- read_csv("boot_toxicity.csv")
sd_toxicity <- sd(as.matrix(boot_toxicity))
# 0.1224297 0.1256935
data_plot <- data.frame(
  Parameter = rep(c("Unadjusted", "Adjusted(i.i.d)", "Adjusted(sequential)"), each = 1),
  Estimate = c(mean(data_sequential$toxicity[R==1]), res_sequential_toxicity$est, res_sequential_toxicity$est),       # Estimated values
  Lower = c(mean(data_sequential$toxicity[R==1])-norm_quantile*sd(data_sequential$toxicity[R==1])/sqrt(length(data_sequential$toxicity[R==1])), res_sequential_toxicity$est-norm_quantile*res_sequential_toxicity$sd_iid, res_sequential_toxicity$est-norm_quantile*sd_toxicity),          # Lower bound of CI
  Upper = c(mean(data_sequential$toxicity[R==1])+norm_quantile*sd(data_sequential$toxicity[R==1])/sqrt(length(data_sequential$toxicity[R==1])), res_sequential_toxicity$est+norm_quantile*res_sequential_toxicity$sd_iid, res_sequential_toxicity$est+norm_quantile*sd_toxicity)           # Upper bound of CI
)

true_value <- mean(data_sequential$toxicity)  # Adjust this as needed
data_plot$Parameter <- factor(data_plot$Parameter, levels = c("Adjusted(sequential)", "Adjusted(i.i.d)", "Unadjusted"))



ggplot(data_plot, aes(x = Parameter, y = Estimate)) +
  geom_point(size = 3, color = "blue") +  # Plot estimates
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.2, color = "blue") +  # CI bars
  geom_hline(yintercept = true_value, linetype = "dashed", color = "red", size = 1) +  # True parameter line
  labs(y = "Toxicity",
       x = "Types of CI") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.title = element_text(size = 14))




