### Compute bounds of the MMD and perform the test
# Clear the global environment first: clusterExport() below ships everything
# returned by ls(), so leftovers from an earlier session would be sent too.
rm(list = ls())
library(parallel)
# NOTE(review): presumably provides MedianHeuristic() and
# permutation_testing_with_missing_data(), which are called below but are
# not defined in this file -- confirm.
source('MMD using permutation with Missing data.R')

# Start a cluster with a hard-coded 32 workers. This equals "all cores" only
# on a 32-core machine.
mcluster <- parallel::makeCluster(spec = 32)
print(mcluster)

## Impose missing-not-at-random (MNAR) missingness on two univariate samples.
##
## Entries of X that are negative and entries of Y that are positive are
## masked preferentially:
##   * if there are at least n * s such preferred entries, n * s of them are
##     drawn at random and set to NA;
##   * otherwise all preferred entries are set to NA and the shortfall is
##     drawn at random from the remaining positions.
##
## Arguments:
##   X, Y      numeric vectors (the two samples)
##   s_1, s_2  proportion of X and of Y, respectively, to set to NA
## Value:
##   c(X, Y) after masking; the first length(X) entries belong to X and the
##   remaining length(Y) entries to Y.
MNAR_Univariate <- function(X, Y, s_1, s_2){
  # Draw `size` elements of `pool` by position. sample(pool, size) must not
  # be used here: for a length-one numeric pool such as 4 it samples from
  # 1:4, which can mask the wrong position. For longer pools this performs
  # exactly the same sample.int() call that sample() would.
  draw_positions <- function(pool, size) {
    pool[sample.int(length(pool), size)]
  }

  # Mask `proportion` of `values`, taking positions from `preferred` first.
  mask_preferred <- function(values, preferred, proportion) {
    n_total <- length(values)
    n_missing <- n_total * proportion
    if (length(preferred) >= n_missing) {
      hit <- draw_positions(preferred, n_missing)
    } else {
      others <- setdiff(seq_len(n_total), preferred)
      shortfall <- n_missing - length(preferred)
      hit <- union(preferred, draw_positions(others, shortfall))
    }
    values[hit] <- NA
    values
  }

  # X is processed before Y so random numbers are consumed in the same order
  # as before.
  X <- mask_preferred(X, which(X < 0), s_1)
  Y <- mask_preferred(Y, which(Y > 0), s_2)
  return(c(X, Y))
}

### type I error: case I
# Sample sizes visited by every simulation replicate.
test_Sample_sizes <- c(100, 200, 500, 1000, 1500, 2000, 2500, 5000)
# Zero-initialised rejection counters, one slot per sample size, for each
# method: case deletion, mean imputation, hot deck imputation and the
# permutation test applied directly to the data with missing values.
rejection_times_cd <- numeric(length(test_Sample_sizes))
rejection_times_mean <- numeric(length(test_Sample_sizes))
rejection_times_hd <- numeric(length(test_Sample_sizes))
reject_times_bounds_perm <- numeric(length(test_Sample_sizes))

## Run one simulation replicate: for every entry of test_Sample_sizes, draw
## two standard-normal samples, mask them with MNAR_Univariate() and record
## which of the four procedures reject at level 0.05.
##
## Arguments:
##   num  not used inside the body; parLapply() passes the replicate index.
## Value:
##   c(rejection_times_cd, rejection_times_mean, rejection_times_hd,
##     reject_times_bounds_perm), i.e. four consecutive segments of length
##   length(test_Sample_sizes) whose entries were incremented by 1 wherever
##   the corresponding test rejected.
##
## The counters are read from the enclosing environment and modified as
## local copies, so the globals themselves are never changed.
##
## NOTE(review): `m` (size of Y) and `s` (missingness proportion used for
## both groups) are free variables that are not defined in this function;
## they must exist in the environment the workers see -- confirm where they
## are set.
test_typeIerror <- function(num){
  n_perm <- 100   # permutations per test
  level <- 0.05   # nominal significance level

  # TRUE when the permutation test on (x, y) with the given bandwidth
  # rejects at `level`.
  rejects <- function(x, y, bandwidth) {
    permutation_testing_with_missing_data(x, y, bandwidth, n_perm)$pval < level
  }

  for (k in seq_along(test_Sample_sizes)) {
    n <- test_Sample_sizes[k]

    # generate random samples and impose missingness
    X <- rnorm(n, mean = 0, sd = 1)
    Y <- rnorm(m, mean = 0, sd = 1)
    masked <- MNAR_Univariate(X, Y, s, s)
    Missing_X <- masked[1:n]
    Missing_Y <- masked[(n + 1):(n + m)]
    na_X <- is.na(Missing_X)
    na_Y <- is.na(Missing_Y)

    ## case deletion: keep observed values only
    deleted_X <- Missing_X[!na_X]
    deleted_Y <- Missing_Y[!na_Y]
    beta_delete <- MedianHeuristic(deleted_X, deleted_Y)
    if (rejects(deleted_X, deleted_Y, beta_delete)) {
      rejection_times_cd[k] <- rejection_times_cd[k] + 1
    }

    ## mean imputation: fill gaps with the mean of the observed values
    mean_imputed_X <- replace(Missing_X, na_X, mean(deleted_X))
    mean_imputed_Y <- replace(Missing_Y, na_Y, mean(deleted_Y))
    beta_mean <- MedianHeuristic(mean_imputed_X, mean_imputed_Y)
    if (rejects(mean_imputed_X, mean_imputed_Y, beta_mean)) {
      rejection_times_mean[k] <- rejection_times_mean[k] + 1
    }

    ## hd imputation: fill gaps with draws from the observed values
    ## (X is filled before Y to keep the order of random draws)
    hd_imputed_X <- replace(Missing_X, na_X,
                            sample(deleted_X, size = sum(na_X)))
    hd_imputed_Y <- replace(Missing_Y, na_Y,
                            sample(deleted_Y, size = sum(na_Y)))
    beta_hd <- MedianHeuristic(hd_imputed_X, hd_imputed_Y)
    if (rejects(hd_imputed_X, hd_imputed_Y, beta_hd)) {
      rejection_times_hd[k] <- rejection_times_hd[k] + 1
    }

    ## Proposed: Perm -- test the data with NAs directly, reusing the
    ## bandwidth computed from the observed values
    if (rejects(Missing_X, Missing_Y, beta_delete)) {
      reject_times_bounds_perm[k] <- reject_times_bounds_perm[k] + 1
    }
  }

  return(c(rejection_times_cd, rejection_times_mean,
           rejection_times_hd, reject_times_bounds_perm))
}
# Number of simulation replicates distributed over the cluster.
num <- 100
# Copy every object currently in the global environment to the workers.
clusterExport(cl = mcluster, varlist = ls())
start_time <- Sys.time()
# One call of test_typeIerror() per replicate index; the result is a list of
# `num` numeric vectors.
test_res_case_I <- parLapply(
  cl = mcluster,
  X = seq_len(num),
  fun = test_typeIerror
)
end_time <- Sys.time()
# Elapsed wall-clock time (shown when top-level values are auto-printed).
end_time - start_time


#### case deletion
# Each replicate returns the four per-method counter vectors concatenated;
# the case-deletion counts occupy the first length(test_Sample_sizes)
# positions. For every sample size, sum its count over the `num` replicates
# and divide by `num` to get the empirical rejection rate.
Type_I_error_cd <- vapply(
  seq_along(test_Sample_sizes),
  function(i) {
    sum(vapply(test_res_case_I[seq_len(num)],
               function(res) res[[i]],
               numeric(1)))
  },
  numeric(1)
) / num
# Type I Error
df <- data.frame(test_Sample_sizes, Type_I_error_cd)
# NOTE(review): write.csv() emits CSV text although the file name ends in
# .xlsx; the name is kept unchanged so existing consumers keep working.
write.csv(df, 'Univariate_Type_I_Error_cd_Gaussian_MNAR_Proportion_0dot05_batch_1.xlsx')


#### mean imputation
# Each replicate returns the four per-method counter vectors concatenated;
# the mean-imputation counts form the second segment, i.e. they start after
# the first length(test_Sample_sizes) positions. For every sample size, sum
# its count over the `num` replicates and divide by `num`.
Type_I_error_mean_impute <- vapply(
  seq_along(test_Sample_sizes),
  function(i) {
    sum(vapply(test_res_case_I[seq_len(num)],
               function(res) res[[i + length(test_Sample_sizes)]],
               numeric(1)))
  },
  numeric(1)
) / num
# Type I Error
df <- data.frame(test_Sample_sizes, Type_I_error_mean_impute)
# NOTE(review): write.csv() emits CSV text although the file name ends in
# .xlsx; the name is kept unchanged so existing consumers keep working.
write.csv(df, 'Univariate_Type_I_Error_mean_Gaussian_MNAR_Proportion_0dot05_batch_1.xlsx')


#### hot deck imputation
# Each replicate returns the four per-method counter vectors concatenated;
# the hot-deck counts form the third segment, i.e. they start after the
# first 2 * length(test_Sample_sizes) positions. For every sample size, sum
# its count over the `num` replicates and divide by `num`.
Type_I_error_hd_impute <- vapply(
  seq_along(test_Sample_sizes),
  function(i) {
    sum(vapply(test_res_case_I[seq_len(num)],
               function(res) res[[i + 2 * length(test_Sample_sizes)]],
               numeric(1)))
  },
  numeric(1)
) / num
# Type I Error
df <- data.frame(test_Sample_sizes, Type_I_error_hd_impute)
# NOTE(review): write.csv() emits CSV text although the file name ends in
# .xlsx; the name is kept unchanged so existing consumers keep working.
write.csv(df, 'Univariate_Type_I_Error_hd_Gaussian_MNAR_Proportion_0dot05_batch_1.xlsx')



#### bounds: perm
# Each replicate returns the four per-method counter vectors concatenated;
# the counts of the permutation test applied to the data with missing values
# form the fourth segment, i.e. they start after the first
# 3 * length(test_Sample_sizes) positions. For every sample size, sum its
# count over the `num` replicates and divide by `num`.
Type_I_error_bounds <- vapply(
  seq_along(test_Sample_sizes),
  function(i) {
    sum(vapply(test_res_case_I[seq_len(num)],
               function(res) res[[i + 3 * length(test_Sample_sizes)]],
               numeric(1)))
  },
  numeric(1)
) / num
# Type I Error
df <- data.frame(test_Sample_sizes, Type_I_error_bounds)
# NOTE(review): write.csv() emits CSV text although the file name ends in
# .xlsx; the name is kept unchanged so existing consumers keep working.
write.csv(df, 'Univariate_Type_I_Error_bounds_perm_Gaussian_MNAR_Proportion_0dot05_batch_1.xlsx')