library(SuperLearner)



dr_cluster <- function(RY, R, covariates, cid, K=2, sl_lib=c("SL.glmnet", 
                                                             "SL.mean", "SL.ranger")){
  epsilon <- 0.28 # 20% quantile of the propensity scores
  n <- length(R)
  G <- max(cid)
  ngs <- table(cid)
  split <- sample(1:K, G, replace=TRUE)
  pis <- numeric(n)
  mus <- numeric(n)
  for (k in 1:K){
    train_g <- (1:G)[split == k]
    test_g <- (1:G)[split != k]
    train_ind <- which(cid %in% train_g)
    test_ind <- which(cid %in% test_g)
    # train_data <- data[train_ind,]
    # test_data <- data[test_ind,]
    # pi_mod <- glm(R ~ X+W, family=binomial(), data=train_data)
    pi_mod = SuperLearner(Y = R[train_ind],
                             X = covariates[train_ind,], newX = covariates[test_ind,],
                             SL.library = sl_lib, family=binomial())
    temp <- pi_mod$SL.predict
    pis[test_ind] <- ifelse(temp>epsilon, temp, epsilon)
    # pis[test_ind] <- pi_mod$SL.predict
    # mu_mod <- glm(RY ~ X+W, data=train_data, subset=train_data$R==1)
    train_ind_nonmiss <- which(cid %in% train_g & R==1)
    mu_mod = SuperLearner(Y = RY[train_ind_nonmiss],
                          X = covariates[train_ind_nonmiss,], newX = covariates[test_ind,],
                          SL.library = sl_lib)
    mus[test_ind] <- mu_mod$SL.predict
  }
  ifs <- R*(RY-mus)/pis + mus
  theta_hat <- mean(ifs)
  sd1 <- sd(ifs)/sqrt(n)
  sum_cluster <- aggregate(ifs, list(cid = cid), sum)$x
  sd2 <- sqrt(sum(sum_cluster^2)/(n^2)-sum(ngs^2)*theta_hat^2/(n^2))
  # return(list(pis=pis, mus=mus))
  return(list(est=theta_hat, sd_iid=sd1, sd_cluster=sd2))
}



simulate_missingness <- function(X, seed = 42) {
  set.seed(seed)
  n <- dim(X)[1]
  d <- dim(X)[2]
  
  # Generate coefficients with scaling 1/sqrt(d)
  # beta <- (2*rbinom(d, 1, 0.5)-1)*10
  beta <- rnorm(d, mean = 0, sd = 2)
  
  # Compute probabilities using the logistic function
  logits <- X %*% beta
  probs <- 1 / (1 + exp(-logits))  # Sigmoid function
  
  # Simulate binary response variable Y ~ Bernoulli(probs)
  R <- rbinom(n, size = 1, prob = probs)

  return(R)
}



data_cluster <- read_csv("data_cluster.csv")
data_cluster$cid <- as.numeric(factor(data_cluster$cid))
data_cluster$W65 <- factor(data_cluster$W65)
data_cluster$X <- factor(data_cluster$X)
W <- as.matrix(data_cluster[,paste0('W',1:64)])
# diffs <- numeric(100)
# for (i in 1:100){
#   R <- simulate_missingness(W, seed=i)
#   t1 <- mean(data_cluster$creativity)-mean(data_cluster$creativity[R==1])
#   t2 <- mean(data_cluster$toxicity)-mean(data_cluster$toxicity[R==1])
#   t3 <- mean(data_cluster$humor)-mean(data_cluster$humor[R==1])
#   t4 <- mean(data_cluster$quality)-mean(data_cluster$quality[R==1])
#   diffs[i] <- abs(t1) + abs(t2)+abs(t3)+abs(t4)
# }
# max(diffs)

# Generate missingness
R <- simulate_missingness(W, seed=14)

# Check bias
t1 <- mean(data_cluster$creativity)-mean(data_cluster$creativity[R==1])
t2 <- mean(data_cluster$toxicity)-mean(data_cluster$toxicity[R==1])
t3 <- mean(data_cluster$humor)-mean(data_cluster$humor[R==1])
t4 <- mean(data_cluster$quality)-mean(data_cluster$quality[R==1])
mean(data_cluster$humor)
c(mean(data_cluster$humor[R==1])-1.96*sd(data_cluster$humor[R==1])/sqrt(length(data_cluster$humor[R==1])), mean(data_cluster$humor[R==1])+1.96*sd(data_cluster$humor[R==1])/sqrt(length(data_cluster$humor[R==1])))

mean(data_cluster$creativity)
c(mean(data_cluster$creativity[R==1])-1.96*sd(data_cluster$creativity[R==1])/sqrt(length(data_cluster$creativity[R==1])), mean(data_cluster$creativity[R==1])+1.96*sd(data_cluster$creativity[R==1])/sqrt(length(data_cluster$creativity[R==1])))

mean(data_cluster$quality)
c(mean(data_cluster$quality[R==1])-1.96*sd(data_cluster$quality[R==1])/sqrt(length(data_cluster$quality[R==1])), mean(data_cluster$quality[R==1])+1.96*sd(data_cluster$quality[R==1])/sqrt(length(data_cluster$quality[R==1])))

mean(data_cluster$toxicity)
c(mean(data_cluster$toxicity[R==1])-1.96*sd(data_cluster$toxicity[R==1])/sqrt(length(data_cluster$toxicity[R==1])), mean(data_cluster$toxicity[R==1])+1.96*sd(data_cluster$toxicity[R==1])/sqrt(length(data_cluster$toxicity[R==1])))

# DR estimation
covariates <- data_cluster[, c(paste0('W', 1:65), 'X')]

set.seed(521)
res_cluster_quality <- dr_cluster(R*data_cluster$quality, R, covariates, data_cluster$cid)

set.seed(521)
res_cluster_humor <- dr_cluster(R*data_cluster$humor, R, covariates, data_cluster$cid)
# nuisances <- dr_cluster(R*data_cluster$humor, R, covariates, data_cluster$cid)
# epsilon <- 0.28
# pis <- ifelse(nuisances$pis>epsilon, nuisances$pis, epsilon)
# mus <- nuisances$mus
# ifs <- R*(RY-mus)/pis + mus
# theta_hat <- mean(ifs)
# sd1 <- sd(ifs)/sqrt(n)
# sum_cluster <- aggregate(ifs, list(cid = cid), sum)$x
# sd2 <- sqrt(sum(sum_cluster^2)/(n^2)-sum(ngs^2)*theta_hat^2/(n^2))
# mean(ifs)-1.96*sd1
# mean(ifs)-1.96*sd2
# mean(data_cluster$humor)

set.seed(521)
res_cluster_creativity <- dr_cluster(R*data_cluster$creativity, R, covariates, data_cluster$cid)

set.seed(521)
res_cluster_toxicity <- dr_cluster(R*data_cluster$toxicity, R, covariates, data_cluster$cid)


library(ggplot2)

norm_quantile <- qnorm(0.975)
# Quality
data_plot <- data.frame(
  Parameter = rep(c("Unadjusted", "Adjusted(i.i.d)", "Adjusted(cluster)"), each = 1),
  Estimate = c(mean(data_cluster$quality[R==1]), res_cluster_quality$est, res_cluster_quality$est),       # Estimated values
  Lower = c(mean(data_cluster$quality[R==1])-norm_quantile*sd(data_cluster$quality[R==1])/sqrt(length(data_cluster$quality[R==1])), res_cluster_quality$est-norm_quantile*res_cluster_quality$sd_iid, res_cluster_quality$est-norm_quantile*res_cluster_quality$sd_cluster),          # Lower bound of CI
  Upper = c(mean(data_cluster$quality[R==1])+norm_quantile*sd(data_cluster$quality[R==1])/sqrt(length(data_cluster$quality[R==1])), res_cluster_quality$est+norm_quantile*res_cluster_quality$sd_iid, res_cluster_quality$est+norm_quantile*res_cluster_quality$sd_cluster)           # Upper bound of CI
)

true_value <- mean(data_cluster$quality)  # Adjust this as needed

# Create the plot
ggplot(data_plot, aes(x = Parameter, y = Estimate)) +
  geom_point(size = 3, color = "blue") +  # Plot estimates
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.2, color = "blue") +  # CI bars
  geom_hline(yintercept = true_value, linetype = "dashed", color = "red", size = 1) +  # True parameter line
  labs(y = "Quality",
       x = "Types of CI") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.title = element_text(size = 14))


# Humor
data_plot <- data.frame(
  Parameter = rep(c("Unadjusted", "Adjusted(i.i.d)", "Adjusted(cluster)"), each = 1),
  Estimate = c(mean(data_cluster$humor[R==1]), res_cluster_humor$est, res_cluster_humor$est),       # Estimated values
  Lower = c(mean(data_cluster$humor[R==1])-norm_quantile*sd(data_cluster$humor[R==1])/sqrt(length(data_cluster$humor[R==1])), res_cluster_humor$est-norm_quantile*res_cluster_humor$sd_iid, res_cluster_humor$est-norm_quantile*res_cluster_humor$sd_cluster),          # Lower bound of CI
  Upper = c(mean(data_cluster$humor[R==1])+norm_quantile*sd(data_cluster$humor[R==1])/sqrt(length(data_cluster$humor[R==1])), res_cluster_humor$est+norm_quantile*res_cluster_humor$sd_iid, res_cluster_humor$est+norm_quantile*res_cluster_humor$sd_cluster)           # Upper bound of CI
)

true_value <- mean(data_cluster$humor)  # Adjust this as needed



ggplot(data_plot, aes(x = Parameter, y = Estimate)) +
  geom_point(size = 3, color = "blue") +  # Plot estimates
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.2, color = "blue") +  # CI bars
  geom_hline(yintercept = true_value, linetype = "dashed", color = "red", size = 1) +  # True parameter line
  labs(y = "Humor",
       x = "Types of CI") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.title = element_text(size = 14))


# Creativity
data_plot <- data.frame(
  Parameter = rep(c("Unadjusted", "Adjusted(i.i.d)", "Adjusted(cluster)"), each = 1),
  Estimate = c(mean(data_cluster$creativity[R==1]), res_cluster_creativity$est, res_cluster_creativity$est),       # Estimated values
  Lower = c(mean(data_cluster$creativity[R==1])-norm_quantile*sd(data_cluster$creativity[R==1])/sqrt(length(data_cluster$creativity[R==1])), res_cluster_creativity$est-norm_quantile*res_cluster_creativity$sd_iid, res_cluster_creativity$est-norm_quantile*res_cluster_creativity$sd_cluster),          # Lower bound of CI
  Upper = c(mean(data_cluster$creativity[R==1])+norm_quantile*sd(data_cluster$creativity[R==1])/sqrt(length(data_cluster$creativity[R==1])), res_cluster_creativity$est+norm_quantile*res_cluster_creativity$sd_iid, res_cluster_creativity$est+norm_quantile*res_cluster_creativity$sd_cluster)           # Upper bound of CI
)

true_value <- mean(data_cluster$creativity)  # Adjust this as needed



ggplot(data_plot, aes(x = Parameter, y = Estimate)) +
  geom_point(size = 3, color = "blue") +  # Plot estimates
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.2, color = "blue") +  # CI bars
  geom_hline(yintercept = true_value, linetype = "dashed", color = "red", size = 1) +  # True parameter line
  labs(y = "Creativity",
       x = "Types of CI") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.title = element_text(size = 14))



# toxicity
data_plot <- data.frame(
  Parameter = rep(c("Unadjusted", "Adjusted(i.i.d)", "Adjusted(cluster)"), each = 1),
  Estimate = c(mean(data_cluster$toxicity[R==1]), res_cluster_toxicity$est, res_cluster_toxicity$est),       # Estimated values
  Lower = c(mean(data_cluster$toxicity[R==1])-norm_quantile*sd(data_cluster$toxicity[R==1])/sqrt(length(data_cluster$toxicity[R==1])), res_cluster_toxicity$est-norm_quantile*res_cluster_toxicity$sd_iid, res_cluster_toxicity$est-norm_quantile*res_cluster_toxicity$sd_cluster),          # Lower bound of CI
  Upper = c(mean(data_cluster$toxicity[R==1])+norm_quantile*sd(data_cluster$toxicity[R==1])/sqrt(length(data_cluster$toxicity[R==1])), res_cluster_toxicity$est+norm_quantile*res_cluster_toxicity$sd_iid, res_cluster_toxicity$est+norm_quantile*res_cluster_toxicity$sd_cluster)           # Upper bound of CI
)

true_value <- mean(data_cluster$toxicity)  # Adjust this as needed



ggplot(data_plot, aes(x = Parameter, y = Estimate)) +
  geom_point(size = 3, color = "blue") +  # Plot estimates
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.2, color = "blue") +  # CI bars
  geom_hline(yintercept = true_value, linetype = "dashed", color = "red", size = 1) +  # True parameter line
  labs(y = "Toxicity",
       x = "Types of CI") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.title = element_text(size = 14))




