library(MASS)
library(ggplot2)

#' Simulate one cluster of partially observed outcomes
#'
#' Draws a single cluster-level covariate `X`, a vector of unit-level
#' covariates `W` that are correlated within the cluster, a response
#' indicator `R` from a logistic model, and an outcome `Y` from a linear
#' model with unit-variance Gaussian noise. Only `R * Y` is returned, so the
#' outcome is zeroed out wherever `R == 0`.
#'
#' @param ng Number of units in the cluster (at least 1).
#' @param rho Correlation parameter; the correlation between units `i` and
#'   `j` of `W` is `rho^abs(i - j)`.
#' @param beta_pi Length-2 slope vector on `(X, W)` in the logistic model
#'   for `R`.
#' @param beta_mu Length-2 slope vector on `(X, W)` in the linear model for
#'   the mean of `Y`.
#' @param intercept_pi Intercept of the logistic model for `R`.
#' @param intercept_mu Intercept of the linear model for `Y`.
#' @param sigma Scale of `W` around `X`; the covariance of `W` is
#'   `sigma^2` times the correlation matrix.
#' @param epsilon Lower bound applied to the response probabilities.
#' @return A data frame with `ng` rows and numeric columns `X`, `W`, `R`
#'   and `RY`.
generate_cluster <- function(ng, rho, beta_pi, beta_mu, intercept_pi,
                             intercept_mu, sigma, epsilon) {
  stopifnot(length(ng) == 1L, ng >= 1)

  # Cluster-level covariate shared by every unit in the cluster.
  X <- rnorm(1)

  # Correlation matrix with entry (i, j) equal to rho^|i - j|, built in one
  # vectorised step instead of a double loop.
  idx <- seq_len(ng)
  cov_matrix <- rho^abs(outer(idx, idx, "-"))

  # Unit-level covariates centred at X.
  W <- mvrnorm(1, rep(X, ng), sigma^2 * cov_matrix)
  covariates <- cbind(rep(X, ng), W)

  # plogis() evaluates the logistic function without overflowing;
  # exp(t) / (1 + exp(t)) turns into NaN once exp(t) is Inf, which would
  # propagate NA into R.
  lin_pi <- drop(covariates %*% beta_pi) + intercept_pi
  pis <- pmax(plogis(lin_pi), epsilon)
  R <- rbinom(ng, 1, pis)

  mus <- drop(covariates %*% beta_mu) + intercept_mu
  Y <- rnorm(ng, mus)

  # R is stored as double so that all four columns are numeric.
  data.frame(X = rep(X, ng), W = W, R = as.numeric(R), RY = R * Y)
}


# ---- Example: draw a single cluster ----
# Settings passed to generate_cluster() for a quick one-off draw.
ng <- 100
rho <- 0.8
sigma <- 2
epsilon <- 0.05
beta_pi <- c(1, 0.5)
intercept_pi <- 0
beta_mu <- c(-1, 1)
intercept_mu <- 0.5
data_g <- generate_cluster(
  ng = ng,
  rho = rho,
  beta_pi = beta_pi,
  beta_mu = beta_mu,
  intercept_pi = intercept_pi,
  intercept_mu = intercept_mu,
  sigma = sigma,
  epsilon = epsilon
)


#' Simulate a full sample made of equally sized clusters
#'
#' The cluster size is `round(n^alpha)` and the number of clusters is
#' `round(n / cluster size)`, so the returned sample has approximately (not
#' necessarily exactly) `n` rows. Each cluster is drawn independently with
#' `generate_cluster()` and tagged with its index in a `cid` column.
#'
#' @param n Target total sample size.
#' @param alpha Exponent controlling the cluster size, `round(n^alpha)`.
#' @param rho,beta_pi,beta_mu,intercept_pi,intercept_mu,sigma,epsilon
#'   Passed unchanged to `generate_cluster()`.
#' @return A data frame with the columns produced by `generate_cluster()`
#'   plus an integer cluster id `cid` running from 1 to the number of
#'   clusters.
generate_data <- function(n, alpha, rho, beta_pi, beta_mu, intercept_pi,
                          intercept_mu, sigma, epsilon) {
  ng <- round(n^alpha)
  G <- round(n / ng)
  # Guard the degenerate designs: ng == 0 makes G infinite/NaN, and G == 0
  # would make a `1:G` loop iterate over c(1, 0).
  if (!is.finite(G) || G < 1) {
    stop("n and alpha must yield at least one cluster of size >= 1",
         call. = FALSE)
  }

  # Build all clusters first, then bind once, rather than growing a list.
  clusters <- lapply(seq_len(G), function(g) {
    cluster <- generate_cluster(ng, rho, beta_pi, beta_mu, intercept_pi,
                                intercept_mu, sigma, epsilon)
    cluster$cid <- g
    cluster
  })
  do.call(rbind, clusters)
}

# ---- Example: draw one full sample ----
# Reuses the model settings defined above; only the total sample size and
# the cluster-size exponent are set here.
alpha <- 0.25
n <- 10000
data <- generate_data(
  n = n,
  alpha = alpha,
  rho = rho,
  beta_pi = beta_pi,
  beta_mu = beta_mu,
  intercept_pi = intercept_pi,
  intercept_mu = intercept_mu,
  sigma = sigma,
  epsilon = epsilon
)


#' Cross-fitted doubly robust estimate of a mean with missing outcomes
#'
#' Whole clusters are randomly assigned to `K` folds. For each fold, a
#' logistic regression of `R` on `X + W` and a linear regression of `RY` on
#' `X + W` (fitted on rows with `R == 1`) are trained on the *other* folds
#' and used to predict on the held-out fold. The estimate is the mean of
#' `R * (RY - mu) / pi + mu`.
#'
#' Two standard errors are returned: one treating rows as independent, and
#' a cluster-robust one based on the centred per-cluster sums,
#' `sqrt(sum_g (S_g - n_g * est)^2) / n`, where `S_g` and `n_g` are the sum
#' of the per-row terms and the number of rows in cluster `g`.
#'
#' @param data Data frame with columns `X`, `W`, `R`, `RY` and a cluster id
#'   `cid`.
#' @param K Number of cross-fitting folds (at least 2).
#' @param epsilon Lower bound applied to the predicted response
#'   probabilities before they are used as inverse weights.
#' @return A list with elements `est` (point estimate), `sd_iid` and
#'   `sd_cluster` (the two standard errors).
dr_est <- function(data, K = 2, epsilon = 0.05) {
  stopifnot(length(K) == 1L, K >= 2)
  n <- nrow(data)

  # Sorted ids so that, for ids 1..G, cluster g is assigned split[g].
  cluster_ids <- sort(unique(data$cid))
  G <- length(cluster_ids)
  split <- sample(1:K, G, replace = TRUE)
  fold <- split[match(data$cid, cluster_ids)]

  pis <- numeric(n)
  mus <- numeric(n)
  for (k in seq_len(K)) {
    # Hold out fold k and train on the rest, so every row is predicted
    # exactly once by models that never saw its cluster.
    test_ind <- which(fold == k)
    if (length(test_ind) == 0L) {
      next
    }
    train_data <- data[fold != k, , drop = FALSE]
    test_data <- data[test_ind, , drop = FALSE]

    pi_mod <- glm(R ~ X + W, family = binomial(), data = train_data)
    pi_hat <- predict(pi_mod, newdata = test_data, type = "response")
    pis[test_ind] <- pmax(pi_hat, epsilon)

    # Outcome model uses respondents only; RY equals Y on those rows.
    mu_mod <- glm(RY ~ X + W, data = train_data, subset = R == 1)
    mus[test_ind] <- predict(mu_mod, newdata = test_data)
  }

  ifs <- data$R * (data$RY - mus) / pis + mus
  theta_hat <- mean(ifs)
  sd_iid <- sd(ifs) / sqrt(n)

  # Summing (ifs - theta_hat) within a cluster gives S_g - n_g * theta_hat.
  # Using the centred sums keeps the variance non-negative and correct for
  # any number and size of clusters.
  centred_sums <- tapply(ifs - theta_hat, data$cid, sum)
  sd_cluster <- sqrt(sum(centred_sums^2)) / n

  list(est = theta_hat, sd_iid = sd_iid, sd_cluster = sd_cluster)
}




# ---- Monte Carlo study: confidence-interval coverage across alpha ----
# For every replication m and every cluster-size exponent alphas[i], a fresh
# sample is generated, the doubly robust estimate is computed, and we record
# whether the 95% interval built from each standard error covers the truth.
n <- 10000
alphas <- seq(from = 0.1, to = 0.5, by = 0.05)
rho <- 0.8
beta_pi <- c(1, 0.5)
beta_mu <- c(-1, 1)
intercept_pi <- 0
intercept_mu <- 0.5
sigma <- 2
epsilon <- 0.05
M <- 500

# Target of inference: the marginal mean of Y. X and W both have mean zero
# in the simulation design, so it equals the outcome-model intercept.
theta_true <- intercept_mu
z_crit <- qnorm(0.975)

coverage_iid <- matrix(0, M, length(alphas))
coverage_cluster <- matrix(0, M, length(alphas))
set.seed(521)
for (m in seq_len(M)) {
  for (i in seq_along(alphas)) {
    # Use the i-th exponent of the grid; passing the global `alpha` here
    # would simulate the same design for every column.
    data <- generate_data(n, alphas[i], rho, beta_pi, beta_mu, intercept_pi,
                          intercept_mu, sigma, epsilon)
    ests <- dr_est(data, 2)
    abs_err <- abs(ests$est - theta_true)
    # The interval est +/- z * se covers the truth iff |est - truth| < z * se.
    coverage_iid[m, i] <- as.numeric(abs_err < z_crit * ests$sd_iid)
    coverage_cluster[m, i] <- as.numeric(abs_err < z_crit * ests$sd_cluster)
  }
}


# ---- Summarise and plot the estimated coverage ----
# Column means over replications give one coverage estimate per alpha.
res_iid <- colMeans(coverage_iid)
res_cluster <- colMeans(coverage_cluster)

# Long format: the i.i.d. results first, then the cluster results.
df <- data.frame(
  x = rep(alphas, times = 2),
  y = c(res_iid, res_cluster),
  line = factor(rep(c("i.i.d.", "cluster"), each = length(alphas)))
)

ggplot(data = df, mapping = aes(x = x, y = y, color = line)) +
  # One line with markers per standard-error type.
  geom_line(size = 1) +
  geom_point(size = 2) +
  # Dashed reference at the nominal 95% level.
  geom_hline(yintercept = 0.95, linetype = "dashed", color = "red") +
  # Coverage is a probability, so fix the y-axis to [0, 1].
  ylim(0, 1) +
  labs(
    x = expression(alpha),
    y = "Estimated Coverage Probability",
    color = "Estimated Coverage Probability"
  ) +
  theme_minimal()


