### Compute bounds of MMD and performing test
rm(list = ls())
library(parallel)
source('MMD using permutation with Missing data.R')
source('MMD using CLT with Missing data.R')

mcluster <- parallel::makeCluster((32))
print(mcluster)

Incomplete_Mixture_Gaussian <- function(n,m,d,S_1,s_1,S_2,s_2){
  ### function for generating incompletely observed samples
  
  ## generate X
  sigma_1 <- diag(d)
  X <- rep(0,d)
  locations_X <- 0
  X <- MASS::mvrnorm(n, rep(0,d),sigma_1)
  
  ### generate incomplete X
  if(sum(rowSums(X)/sqrt(d) < -0.8) >= n*S_1){
    missing_location_X <- sample(which(rowSums(X)/sqrt(d) < -0.8), n*S_1)
    for (i in missing_location_X) {
      missing_location_vector <- sample(which(rank(X[i,]) <= d*0.5), d*s_1)
      X[i, missing_location_vector] <- NA
    } 
  }else{
    missing_location_X <- which(rowSums(X)/sqrt(d) < -0.8)
    for (i in missing_location_X) {
      missing_location_vector <- sample(which(rank(X[i,]) <= d*0.5), d*s_1)
      X[i, missing_location_vector] <- NA
    } 
    
    missing_index <- sample(setdiff(1:n,missing_location_X),(n*S_1 - length(missing_location_X)))
    
    for (i in missing_index) {
      missing_location_vector <- sample(seq(1,d), d*s_1)
      X[i,missing_location_vector] <- NA
    }
    
  }
  
  ## generate Y
  sigma_2 <- diag(d)
  Y <- rep(0,d)
  Y<- MASS::mvrnorm(m, rep(0,d),sigma_1)
  
  ### generate incomplete Y
  if(sum(rowSums(Y)/sqrt(d) > 0.8) >= m*S_2){
    missing_location_Y <- sample(which(rowSums(Y)/sqrt(d) > 0.8), m*S_2)
    for (i in missing_location_Y) {
      missing_location_vector <- sample(which(rank(Y[i,]) >= d*0.5), d*s_2)
      Y[i, missing_location_vector] <- NA
    }
  }else{
    missing_location_Y <- which(rowSums(Y)/sqrt(d) > 0.8)
    for (i in missing_location_Y) {
      missing_location_vector <- sample(which(rank(Y[i,]) >= d*0.5), d*s_2)
      Y[i, missing_location_vector] <- NA
    }
    
    missing_index <- sample(setdiff(1:m,missing_location_Y),(m*S_2 - length(missing_location_Y)))
    
    for (i in missing_index) {
      missing_location_vector <- sample(seq(1,d), d*s_2)
      Y[i,missing_location_vector] <- NA
    }
    
  }
  
  return(list(Incomplete_X = X, Incomplete_Y = Y))
}

### Type I Error with increasing sample sizes
test_Sample_sizes <- c(100,200,500,1000,1500,2000,2500,5000)
reject_times_cd <- rep(0,length(test_Sample_sizes))
reject_times_mean_row <- rep(0,length(test_Sample_sizes))
reject_times_hd_row <- rep(0,length(test_Sample_sizes))
reject_times_bounds_CLT <- rep(0,length(test_Sample_sizes))
reject_times_bounds_perm <- rep(0,length(test_Sample_sizes))

test_typeIerror <- function(num){
  # num: number of permutations
  d <- 50
  perm <- 100
  alpha <- 0.05
  flag <- 1
  for (n in test_Sample_sizes) {
    m <- n
    S <- 0.05  ## fixed proportion of incompletely observed samples
    S_1 <- S
    s_1 <- 0.3
    S_2 <- S
    s_2 <- 0.3
    IncompleteData <- Incomplete_Mixture_Gaussian(n,m,d,S_1,s_1,S_2,s_2)
    Incomplete_X <- IncompleteData$Incomplete_X
    Incomplete_Y <- IncompleteData$Incomplete_Y

    DeletedX <- Incomplete_X[! rowSums(is.na(Incomplete_X)) > 0,]
    DeletedY <- Incomplete_Y[! rowSums(is.na(Incomplete_Y)) > 0,]
    
    beta_delete <- MedianHeuristic(DeletedX,DeletedY)
    
    if(permutation_testing_with_missing_data(DeletedX,DeletedY,beta_delete,perm)$pval < alpha){
      reject_times_cd[flag] <- 1
    }
    ## mean imputation
    Mean_imputedX_row <- Incomplete_X
    for (i in 1:n) {
      if(sum(is.na(Incomplete_X[i,])) > 0){
        Mean_imputedX_row[i, is.na(Mean_imputedX_row[i,])] <- mean(Mean_imputedX_row[i, !is.na(Mean_imputedX_row[i,])])
      }
    }
    Mean_imputedY_row <- Incomplete_Y
    for (i in 1:m) {
      if(sum(is.na(Incomplete_Y[i,])) > 0){
        Mean_imputedY_row[i, is.na(Mean_imputedY_row[i,])] <- mean(Mean_imputedY_row[i, !is.na(Mean_imputedY_row[i,])])
      }
    }
    
    beta_mean <- MedianHeuristic(Mean_imputedX_row, Mean_imputedY_row)
    if(permutation_testing_with_missing_data(Mean_imputedX_row,Mean_imputedY_row,beta_mean,perm)$pval < alpha){
      reject_times_mean_row[flag] <- 1
    }
    
    ## hot deck imputation
    HD_imputedX_row <- Incomplete_X
    for (i in 1:n) {
      if(sum(is.na(Incomplete_X[i,])) > 0){
        HD_imputedX_row[i, is.na(HD_imputedX_row[i,])] <- sample(HD_imputedX_row[i, !is.na(HD_imputedX_row[i,])], sum(is.na(HD_imputedX_row[i,])), replace = TRUE)
      }
    }
    HD_imputedY_row <- Incomplete_Y
    for (i in 1:m) {
      if(sum(is.na(Incomplete_Y[i,])) > 0){
        HD_imputedY_row[i, is.na(HD_imputedY_row[i,])] <- sample(HD_imputedY_row[i, !is.na(HD_imputedY_row[i,])], sum(is.na(HD_imputedY_row[i,])), replace = TRUE)
      }
    }
    beta_hotdeck <- MedianHeuristic(HD_imputedX_row, HD_imputedY_row)
    if(permutation_testing_with_missing_data(HD_imputedX_row,HD_imputedY_row,beta_hotdeck,perm)$pval < alpha){
      reject_times_hd_row[flag] <- 1
    }
    
    # bounds: CLT
    if(testing_with_missing_using_CLT(Incomplete_X, Incomplete_Y,beta_delete)$pval < alpha){
      reject_times_bounds_CLT[flag] <- 1
    }

    #bounds: perm
    if(permutation_testing_with_missing_data(Incomplete_X, Incomplete_Y,beta_delete,perm)$pval < alpha){
      reject_times_bounds_perm[flag] <- 1
    }
    
    flag <- flag + 1
    
  }
  return( c(reject_times_cd,reject_times_mean_row, reject_times_hd_row, reject_times_bounds_CLT, reject_times_bounds_perm) )
}
num <- 100
clusterExport(mcluster, c(ls()) ) 
start_time <- Sys.time()
test_res_case_I <- parLapply(mcluster,1:num,test_typeIerror)
end_time <- Sys.time() 
end_time - start_time

#### case deletion
Type_I_error_cd <- rep(0, length(test_Sample_sizes))
for (i in 1:length(test_Sample_sizes)) {
  
  for (j in 1:num) {
    
    Type_I_error_cd[i] <- Type_I_error_cd[i] + test_res_case_I[[j]][i]
    
  }
  
}
Type_I_error_cd <- Type_I_error_cd/num 
# Type I Error
df <- data.frame(test_Sample_sizes, Type_I_error_cd)
write.csv(df, 'Multivariate_Type_I_Error_cd_Gaussian_MNAR_Proportion_0dot05_d_50_batch_1.xlsx')


#### mean imputation
Type_I_error_mean_impute <- rep(0, length(test_Sample_sizes))
for (i in 1:length(test_Sample_sizes)) {
  
  for (j in 1:num) {
    
    Type_I_error_mean_impute[i] <- Type_I_error_mean_impute[i] + test_res_case_I[[j]][(i + length(test_Sample_sizes)) ]
    
  }
  
}
Type_I_error_mean_impute <- Type_I_error_mean_impute/num 
# Type I Error
df <- data.frame(test_Sample_sizes, Type_I_error_mean_impute)
write.csv(df, 'Multivariate_Type_I_Error_mean_Gaussian_MNAR_Proportion_0dot05_d_50_batch_1.xlsx')


#### hot deck imputation
Type_I_error_hd_impute <- rep(0, length(test_Sample_sizes))
for (i in 1:length(test_Sample_sizes)) {
  
  for (j in 1:num) {
    
    Type_I_error_hd_impute[i] <- Type_I_error_hd_impute[i] + test_res_case_I[[j]][(i + 2*length(test_Sample_sizes)) ]
    
  }
  
}
Type_I_error_hd_impute <- Type_I_error_hd_impute/num 
# Type I Error
df <- data.frame(test_Sample_sizes, Type_I_error_hd_impute)
write.csv(df, 'Multivariate_Type_I_Error_hd_Gaussian_MNAR_Proportion_0dot05_d_50_batch_1.xlsx')


### bounds: CLT
Type_I_error_bounds <- rep(0, length(test_Sample_sizes))
for (i in 1:length(test_Sample_sizes)) {

  for (j in 1:num) {

    Type_I_error_bounds[i] <- Type_I_error_bounds[i] + test_res_case_I[[j]][(i + 3*length(test_Sample_sizes)) ]

  }

}
Type_I_error_bounds <- Type_I_error_bounds/num
# Type I Error
df <- data.frame(test_Sample_sizes, Type_I_error_bounds)
write.csv(df, 'Multivariate_Type_I_Error_bounds_CLT_Gaussian_MNAR_Proportion_0dot05_d_50_batch_1.xlsx')

#### bounds: perm
Type_I_error_bounds <- rep(0, length(test_Sample_sizes))
for (i in 1:length(test_Sample_sizes)) {
  
  for (j in 1:num) {
    
    Type_I_error_bounds[i] <- Type_I_error_bounds[i] + test_res_case_I[[j]][(i + 4*length(test_Sample_sizes)) ]
    
  }
  
}
Type_I_error_bounds <- Type_I_error_bounds/num 
# Type I Error
df <- data.frame(test_Sample_sizes, Type_I_error_bounds)
write.csv(df, 'Multivariate_Type_I_Error_bounds_perm_Gaussian_MNAR_Proportion_0dot05_d_50_batch_1.xlsx')

