library(MASS)
library(ggplot2)

generate_cluster <- function(ng, beta_ar, beta_pi, beta_mu, intercept_pi, intercept_mu, sigma, epsilon){
  X <- rnorm(1)
  W <- c(arima.sim(model = list(ar = beta_ar), n = ng, sd = sigma))
  S1 <- cummax(W)
  S2 <- cummin(W)
  S3 <- cumsum(W)/(1:ng)
  covariates <- cbind(rep(X, ng), S1, S2, S3)
  temp1 <- covariates %*% beta_pi + intercept_pi
  temp2 <- exp(temp1)/(1+exp(temp1))
  pis <- ifelse(temp2>epsilon, temp2, epsilon)
  R <- rbinom(ng, 1, pis)
  mus <- covariates %*% beta_mu + intercept_mu
  Y <- rnorm(ng, mus)
  data_g <- data.frame(cbind(rep(X, ng), W, S1,S2,S3, R, R*Y))
  colnames(data_g) <- c("X", "W", "S1", "S2", "S3", "R", "RY")
  return(data_g)
}

ng <- 100
sigma <- 2
beta_ar <- c(1,-0.5)
beta_pi <- c(1, 1, 0.8, -0.5)
beta_mu <- c(-1, 1, 1, -0.5)
intercept_pi <- 0
intercept_mu <- 1
epsilon <- 0.05
data_g <- generate_cluster(ng, beta_ar, beta_pi, beta_mu, intercept_pi, intercept_mu, sigma, epsilon)

generate_cluster_var <- function(ng, A1, A2,  beta_pi, beta_mu, intercept_pi, intercept_mu, sigma, epsilon){
  X <- rnorm(1)
  k <- dim(A1)[1]
  # Generate VAR(2)
  W <- matrix(0, nrow = ng, ncol = k)
  errors <- matrix(rnorm(ng * k, sd=sigma), nrow = ng, ncol = k)
  W[1, ] <- errors[1, ]  
  W[2, ] <- A1 %*% W[1, ] + errors[2, ]
  for (t in 3:ng) {
    W[t, ] <- A1 %*% W[t - 1, ] + A2 %*% W[t - 2, ] + errors[t, ]
  }
  S1 <- apply(apply(W, 2, cummax), 1, max) 
  S2 <- apply(apply(W, 2, cummin), 1, min) 
  S3 <- apply(W, 2, cumsum)/(1:ng)
  covariates <- cbind(rep(X, ng), S1, S2, S3)
  temp1 <- covariates %*% beta_pi + intercept_pi
  temp2 <- exp(temp1)/(1+exp(temp1))
  pis <- ifelse(temp2>epsilon, temp2, epsilon)
  R <- rbinom(ng, 1, pis)
  mus <- covariates %*% beta_mu + intercept_mu
  Y <- rnorm(ng, mus)
  data_g <- data.frame(cbind(rep(X, ng), W, S1,S2,S3, R, R*Y))
  colnames(data_g) <- c("X", "W1","W2", "S1", "S2", "S31", "S32", "R", "RY")
  return(data_g)
}

ng <- 100
sigma <- 2
A1 <- matrix(c(0.5, 0.2,
               0.3, 0.4), nrow = 2, byrow = TRUE)

A2 <- matrix(c(0.1, 0.05,
               0.05, 0.1), nrow = 2, byrow = TRUE)
beta_pi <- c(1, 1, 0.8, -0.5, 0.3)
beta_mu <- c(-1, 1, 1, -0.5, -0.4)
intercept_pi <- 0
intercept_mu <- 1
epsilon <- 0.05
data_g <- generate_cluster_var(ng, A1, A2, beta_pi, beta_mu, intercept_pi, intercept_mu, sigma, epsilon)


generate_data <- function(n, alpha, beta_ar, beta_pi, beta_mu, intercept_pi, intercept_mu, sigma, epsilon){
  ng <- round(n^alpha)
  G <- round(n/ng)
  L <- list()
  for (g in 1:G){
    L[[g]] <- generate_cluster(ng, beta_ar, beta_pi, beta_mu, intercept_pi, intercept_mu, sigma, epsilon)
    L[[g]]$cid <- g
  }
  data <- do.call(rbind, L)
  return(data)
}

generate_data_var <- function(n, alpha, A1, A2, beta_pi, beta_mu, intercept_pi, intercept_mu, sigma, epsilon){
  ng <- round(n^alpha)
  G <- round(n/ng)
  L <- list()
  for (g in 1:G){
    L[[g]] <- generate_cluster_var(ng, A1, A2, beta_pi, beta_mu, intercept_pi, intercept_mu, sigma, epsilon)
    L[[g]]$cid <- g
  }
  data <- do.call(rbind, L)
  return(data)
}

n <- 10000
alpha <- 0.25
data <- generate_data_var(n, alpha, A1, A2, beta_pi, beta_mu, intercept_pi, intercept_mu, sigma, epsilon)

dr_est <- function(data, K=2){
  n <- dim(data)[1]
  G <- max(data$cid)
  ng <- n/G
  split <- sample(1:K, G, replace=TRUE)
  pis <- numeric(n)
  mus <- numeric(n)
  for (k in 1:K){
    train_g <- (1:G)[split == k]
    test_g <- (1:G)[split != k]
    train_ind <- which(data$cid %in% train_g)
    test_ind <- which(data$cid %in% test_g)
    train_data <- data[train_ind,]
    test_data <- data[test_ind,]
    pi_mod <- glm(R ~ X+W1+W2, family=binomial(), data=train_data)
    temp <- predict(pi_mod, newdata=test_data[,1:3], type="response")
    pis[test_ind] <- ifelse(temp>epsilon, temp, epsilon)
    mu_mod <- glm(RY ~ X+W1+W2, data=train_data, subset=train_data$R==1)
    mus[test_ind] <- predict(mu_mod, newdata=test_data[,1:3])
  }
  data$ifs <- data$R*(data$RY-mus)/pis + mus
  theta_hat <- mean(data$ifs)
  return(theta_hat)
}


dr_seq_est <- function(data, K=2){
  n <- dim(data)[1]
  G <- max(data$cid)
  ng <- n/G
  split <- sample(1:K, G, replace=TRUE)
  pis <- numeric(n)
  mus <- numeric(n)
  for (k in 1:K){
    train_g <- (1:G)[split == k]
    test_g <- (1:G)[split != k]
    train_ind <- which(data$cid %in% train_g)
    test_ind <- which(data$cid %in% test_g)
    train_data <- data[train_ind,]
    test_data <- data[test_ind,]
    pi_mod <- glm(R ~ X+S1+S2+S31+S32, family=binomial(), data=train_data)
    temp <- predict(pi_mod, newdata=test_data[,c(1,4:7)], type="response")
    pis[test_ind] <- ifelse(temp>epsilon, temp, epsilon)
    mu_mod <- glm(RY ~ X+S1+S2+S31+S32, data=train_data, subset=train_data$R==1)
    mus[test_ind] <- predict(mu_mod, newdata=test_data[,c(1,4:7)])
  }
  data$ifs <- data$R*(data$RY-mus)/pis + mus
  theta_hat <- mean(data$ifs)
  return(theta_hat)
}

# AR(2)
n <- seq(from=2000, to=16000, by=2000)
alpha <- 0.4
sigma <- 2
beta_ar <- c(1,-0.5)
beta_pi <- c(1, 1, 0.8, -0.5, 0.3)
beta_mu <- c(-1, 1, 1, -0.5, -0.4)
intercept_pi <- 0
intercept_mu <- 1
epsilon <- 0.05
M <- 500
MSE_cur <- matrix(0, M, length(n))
MSE_seq <- matrix(0, M, length(n))
set.seed(521)
for (m in 1:M){
  for(i in seq_along(n)){
    data <- generate_data(n[i], alpha, beta_ar, beta_pi, beta_mu, intercept_pi, intercept_mu, sigma, epsilon)
    ests1 <- dr_est(data,2)
    MSE_cur[m,i] <- (ests1-1)^2
    ests2 <- dr_seq_est(data,2)
    MSE_seq[m,i] <- (ests2-1)^2
  }
}

res_cur <- colMeans(MSE_cur)
res_cluster <- colMeans(MSE_seq)

df <- data.frame(
  x = rep(n, 2),
  y = sqrt(c(res_cur, res_cluster)),
  line = factor(rep(c("current", "history summary"), each = length(n)))
)
ggplot(df, aes(x = x, y = y, color = line)) +
  geom_line(size = 1) +       # Plot lines
  geom_point(size = 2) +      # Optionally add points
  labs(x = "Sample size",
       y = "Estimated RMSE",
       color = "Information adjusted for") +       # Customize legend title
  ylim(0, 1.5) + 
  theme_minimal()              # Use a minimal them


# VAR(2)
A1 <- matrix(c(0.5, 0.2,
               0.3, 0.4), nrow = 2, byrow = TRUE)

A2 <- matrix(c(0.1, 0.05,
               0.05, 0.1), nrow = 2, byrow = TRUE)

n <- seq(from=2000, to=16000, by=2000)
alpha <- 0.4
sigma <- 2
beta_pi <- c(1, 1, 0.8, -0.5, 0.3)
beta_mu <- c(-1, 1, 1, -0.5, -0.4)
intercept_pi <- 0
intercept_mu <- 1
epsilon <- 0.05
M <- 500
MSE_cur <- matrix(0, M, length(n))
ests_cur <- matrix(0, M, length(n))
MSE_seq <- matrix(0, M, length(n))
ests_seq <- matrix(0, M, length(n))
MSE_unadj <- matrix(0, M, length(n))
ests_unadj <- matrix(0, M, length(n))

set.seed(521)
for (m in 1:M){
  for(i in seq_along(n)){
    data <- generate_data_var(n[i], alpha, A1, A2, beta_pi, beta_mu, intercept_pi, intercept_mu, sigma, epsilon)
    ests_cur[m,i] <- dr_est(data,2)
    MSE_cur[m,i] <- (ests_cur[m,i]-1)^2
    ests_seq[m,i] <- dr_seq_est(data,2)
    MSE_seq[m,i] <- (ests_seq[m,i]-1)^2
    ests_unadj[m,i] <- mean(data$RY[data$R==1])
    MSE_unadj[m,i] <- (ests_unadj[m,i]-1)^2
  }
}

res_cur <- colMeans(MSE_cur)
res_cluster <- colMeans(MSE_seq)
res_unadj <- colMeans(MSE_unadj)
mean_cur <- colMeans(ests_cur)
mean_seq <- colMeans(ests_seq)
mean_unadj <- colMeans(ests_unadj)

df <- data.frame(
  x = rep(n, 3),
  y = sqrt(c(res_cur, res_cluster, res_unadj)),
  line = factor(rep(c("current", "historical summary", "unadjusted"), each = length(n)))
)
ggplot(df, aes(x = x, y = y, color = line)) +
  geom_line(size = 1) +       # Plot lines
  geom_point(size = 2) +      # Optionally add points
  labs(x = "Sample size",
       y = "Estimated RMSE",
       color = "Information adjusted for") +       # Customize legend title
  ylim(0, 1.2) + 
  theme_minimal()              # Use a minimal them







