### Compute bounds on the MMD and perform the tests
# Start from an empty workspace. Further down the script every object in the
# global environment is exported to the workers via ls(), so this keeps that
# export limited to what this script (and the sourced helpers) define.
rm(list = ls())
library(dslabs)
# Helper scripts; presumably they provide the MMD test routines called
# below (MedianHeuristic, permutation_testing_with_missing_data,
# testing_with_missing_using_CLT) -- verify against those files.
source('MMD using permutation with Missing data.R')
source('MMD using CLT with Missing data.R')
# Worker pool with a hard-coded size of 32; this equals "all cores" only on
# a machine that actually has 32 of them.
mcluster <- parallel::makeCluster(spec = 32)

print(mcluster)


#' Inject missing values into a fraction of the rows of a data matrix
#'
#' Rows with at least 130 entries above 0.5 ("dense" rows) are the preferred
#' targets. If there are more dense rows than `S * nrow(Y)`, that many dense
#' rows are drawn at random; otherwise every dense row is used and the
#' shortfall is drawn uniformly from the remaining rows. In a dense row,
#' strictly positive entries are five times as likely to be dropped as the
#' other entries; in a top-up row all entries are equally likely.
#'
#' @param Y Numeric matrix, one observation per row.
#'   NOTE(review): assumed to contain no `NA` on input -- confirm with caller.
#' @param S Fraction of rows that receive missing values.
#' @param s Fraction of entries set to `NA` inside each selected row.
#' @return `Y` with `NA` written into the selected entries.
missingness_mechanisms <- function(Y, S, s) {
  n2 <- nrow(Y)
  # Take the width from Y itself instead of assuming 28 * 28 pixels, so the
  # sampled column indices can never fall outside the matrix.
  d <- ncol(Y)

  # sample() silently truncates fractional sizes; do the same, explicitly.
  n_rows_missing <- floor(S * n2)
  n_cols_missing <- floor(d * s)

  # Draw `size` elements of `pool` by position. Unlike sample(pool, size),
  # this never reinterprets a length-one `pool` as the range 1:pool.
  pick <- function(pool, size) pool[sample.int(length(pool), size)]

  # Column indices to blank in a dense row (positive entries weighted 5:1).
  weighted_cols <- function(row) {
    weights <- rep(1, d)
    weights[which(row > 0)] <- 5
    sample.int(d, n_cols_missing, prob = weights)
  }

  # Evaluate the row predicate once, before any entry is overwritten.
  dense_rows <- which(rowSums(Y > 0.5) >= 130)

  if (length(dense_rows) > S * n2) {
    # Enough dense rows: the whole quota comes from them.
    for (i in pick(dense_rows, n_rows_missing)) {
      Y[i, weighted_cols(Y[i, ])] <- NA
    }
  } else {
    # Not enough dense rows: use all of them ...
    for (i in dense_rows) {
      Y[i, weighted_cols(Y[i, ])] <- NA
    }
    # ... and fill the quota with uniformly chosen rows from the rest.
    other_rows <- setdiff(seq_len(n2), dense_rows)
    n_extra <- n_rows_missing - length(dense_rows)
    for (i in pick(other_rows, n_extra)) {
      Y[i, sample.int(d, n_cols_missing)] <- NA
    }
  }

  Y
}

####### read data using package ``dslabs''
# Load MNIST through dslabs and keep the training split only.
mnist <- read_mnist()
training_set <- mnist$train
training_set_labels <- training_set$labels
training_set_images <- training_set$images

### Images with label 0, pixel values divided by 255
training_set_images_labels_0 <- training_set_images[training_set_labels == 0, ] / 255
# Count the same label the subset above was built from. The previous code
# counted label 5 here, so N1 did not match the number of rows in
# training_set_images_labels_0.
total_number_equals_0 <- sum(training_set_labels == 0)
N1 <- total_number_equals_0

### Images with label 3, pixel values divided by 255
training_set_images_labels_3 <- training_set_images[training_set_labels == 3, ] / 255
total_number_equals_3 <- sum(training_set_labels == 3)
N2 <- total_number_equals_3


### Type I error study
# Fractions of rows of the second sample that receive missing values.
test_Sparsity <- c(0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30)

# One zero-initialised rejection indicator per sparsity level, for each of
# the five procedures (case deletion, row-mean imputation, row hot-deck
# imputation, CLT bounds, permutation bounds).
reject_times_cd <- numeric(length(test_Sparsity))
reject_times_mean_row <- numeric(length(test_Sparsity))
reject_times_hd_row <- numeric(length(test_Sparsity))
reject_times_bounds_CLT <- numeric(length(test_Sparsity))
reject_times_bounds_perm <- numeric(length(test_Sparsity))

#' Run one replicate of the Type I error experiment
#'
#' For every level in the global `test_Sparsity`, two samples of 500 rows are
#' drawn with replacement from `training_set_images_labels_3` (so both come
#' from the same population), missing values are injected into the second
#' sample with `missingness_mechanisms()`, and five procedures are applied at
#' level 0.05: case deletion, row-mean imputation, row hot-deck imputation,
#' the CLT-based test on the incomplete data and the permutation test on the
#' incomplete data.
#'
#' Reads the globals `test_Sparsity`, `N2`, `training_set_images_labels_3`
#' and the helpers `missingness_mechanisms`, `MedianHeuristic`,
#' `permutation_testing_with_missing_data`, `testing_with_missing_using_CLT`.
#'
#' @param num Replicate index supplied by the caller (e.g. `parLapply`). It
#'   is not used inside the function; it is NOT the number of permutations,
#'   which is fixed by `perm` below.
#' @return Numeric vector of 0/1 rejection indicators with length
#'   `5 * length(test_Sparsity)`, laid out as five consecutive blocks in the
#'   order: case deletion, mean imputation, hot deck, CLT bounds,
#'   permutation bounds.
test_typeIerror <- function(num) {
  perm <- 100    # forwarded to permutation_testing_with_missing_data()
  alpha <- 0.05  # rejection threshold for every p-value
  n1 <- 500
  n2 <- 500
  s <- 0.2       # fraction of entries removed within an affected row

  # Local accumulators: the result no longer depends on same-named global
  # vectors having been created (and exported to the workers) beforehand.
  n_levels <- length(test_Sparsity)
  reject_times_cd <- numeric(n_levels)
  reject_times_mean_row <- numeric(n_levels)
  reject_times_hd_row <- numeric(n_levels)
  reject_times_bounds_CLT <- numeric(n_levels)
  reject_times_bounds_perm <- numeric(n_levels)

  for (flag in seq_along(test_Sparsity)) {
    S <- test_Sparsity[flag]

    locations_X <- sample.int(N2, n1, replace = TRUE)
    locations_Y <- sample.int(N2, n2, replace = TRUE)
    X <- training_set_images_labels_3[locations_X, ]
    Y <- training_set_images_labels_3[locations_Y, ]
    Incomplete_Y <- missingness_mechanisms(Y, S, s)

    has_na <- rowSums(is.na(Incomplete_Y)) > 0

    ## case deletion: keep complete rows only (drop = FALSE keeps a matrix
    ## even if a single row survives)
    DeletedY <- Incomplete_Y[!has_na, , drop = FALSE]
    beta_delete <- MedianHeuristic(X, DeletedY)
    if (permutation_testing_with_missing_data(X, DeletedY, beta_delete, perm)$pval < alpha) {
      reject_times_cd[flag] <- 1
    }

    ## mean imputation: fill each gap with the mean of the observed entries
    ## of the same row
    Mean_imputedY_row <- Incomplete_Y
    for (i in which(has_na)) {
      gaps <- is.na(Incomplete_Y[i, ])
      Mean_imputedY_row[i, gaps] <- mean(Incomplete_Y[i, !gaps])
    }
    beta_mean <- MedianHeuristic(X, Mean_imputedY_row)
    if (permutation_testing_with_missing_data(X, Mean_imputedY_row, beta_mean, perm)$pval < alpha) {
      reject_times_mean_row[flag] <- 1
    }

    ## hot deck imputation: fill each gap with a random observed entry of the
    ## same row (drawn by position, so a single donor value is never
    ## misread by sample() as a range)
    HD_imputedY_row <- Incomplete_Y
    for (i in which(has_na)) {
      gaps <- is.na(Incomplete_Y[i, ])
      donors <- Incomplete_Y[i, !gaps]
      HD_imputedY_row[i, gaps] <-
        donors[sample.int(length(donors), sum(gaps), replace = TRUE)]
    }
    beta_hotdeck <- MedianHeuristic(X, HD_imputedY_row)
    if (permutation_testing_with_missing_data(X, HD_imputedY_row, beta_hotdeck, perm)$pval < alpha) {
      reject_times_hd_row[flag] <- 1
    }

    ## bounds: CLT -- incomplete data, bandwidth from the complete cases
    if (testing_with_missing_using_CLT(X, Incomplete_Y, beta_delete)$pval < alpha) {
      reject_times_bounds_CLT[flag] <- 1
    }

    ## bounds: perm -- incomplete data, bandwidth from the complete cases
    if (permutation_testing_with_missing_data(X, Incomplete_Y, beta_delete, perm)$pval < alpha) {
      reject_times_bounds_perm[flag] <- 1
    }
  }

  c(
    reject_times_cd,
    reject_times_mean_row,
    reject_times_hd_row,
    reject_times_bounds_CLT,
    reject_times_bounds_perm
  )
}
# Number of independent replicates of the experiment.
num <- 100
# Copy every object currently in the global environment (data, settings and
# functions) to the workers. The calls are namespace-qualified because the
# script creates the cluster with parallel::makeCluster() and never attaches
# the parallel package itself.
parallel::clusterExport(mcluster, ls())
start_time <- Sys.time()
test_res_case_I <- tryCatch(
  parallel::parLapply(mcluster, seq_len(num), test_typeIerror),
  # Shut the worker processes down whether or not the run succeeded;
  # nothing after this point uses the cluster.
  finally = parallel::stopCluster(mcluster)
)
end_time <- Sys.time()
end_time - start_time


# Average the 0/1 rejection indicators of one procedure over all replicates.
#   results  : list of result vectors, one per replicate (from parLapply)
#   block    : zero-based position of the procedure inside each result
#              vector (0 = case deletion, 1 = mean imputation, 2 = hot deck,
#              3 = CLT bounds, 4 = permutation bounds)
#   n_levels : number of sparsity levels, i.e. the width of one block
# Returns one rejection rate per sparsity level. The divisor is the number
# of replicates in `results`, which equals `num` for the run above.
rejection_rate <- function(results, block, n_levels) {
  cols <- block * n_levels + seq_len(n_levels)
  Reduce(`+`, lapply(results, function(r) r[cols])) / length(results)
}

n_levels <- length(test_Sparsity)

# NOTE(review): write.csv() produces comma-separated text even though the
# file names below end in ".xlsx". The names are left unchanged so that
# anything already reading these files keeps working.

#### case deletion
Type_I_error_cd <- rejection_rate(test_res_case_I, 0, n_levels)
# Type I Error
df <- data.frame(test_Sparsity, Type_I_error_cd)
write.csv(df, 'MNIST_Type_I_Error_cd_MNAR_CaseII_n_m_500_batch_1.xlsx')


#### mean imputation
Type_I_error_mean_impute <- rejection_rate(test_res_case_I, 1, n_levels)
# Type I Error
df <- data.frame(test_Sparsity, Type_I_error_mean_impute)
write.csv(df, 'MNIST_Type_I_Error_mean_MNAR_CaseII_n_m_500_batch_1.xlsx')


#### hot deck imputation
Type_I_error_hd_impute <- rejection_rate(test_res_case_I, 2, n_levels)
# Type I Error
df <- data.frame(test_Sparsity, Type_I_error_hd_impute)
write.csv(df, 'MNIST_Type_I_Error_hd_MNAR_CaseII_n_m_500_batch_1.xlsx')


#### bounds: CLT
Type_I_error_bounds <- rejection_rate(test_res_case_I, 3, n_levels)
# Type I Error
df <- data.frame(test_Sparsity, Type_I_error_bounds)
write.csv(df, 'MNIST_Type_I_Error_bounds_CLT_MNAR_CaseII_n_m_500_batch_1.xlsx')

#### bounds: perm
# Reuses the name Type_I_error_bounds on purpose: it is the column header
# written to the CSV, and it matches the CLT file.
Type_I_error_bounds <- rejection_rate(test_res_case_I, 4, n_levels)
# Type I Error
df <- data.frame(test_Sparsity, Type_I_error_bounds)
write.csv(df, 'MNIST_Type_I_Error_bounds_perm_MNAR_CaseII_n_m_500_batch_1.xlsx')

