library(MASS)
library(glmnet)
library(Matrix)
library(foreach)
library(doParallel)
library(doRNG)
library(KRLS)
setwd("")
set.seed(521)

N1=10000 # 训练集样本量
N2=2000 # 测试集样本量
n<-c(500,1000,1500)
p<-4
c = 0.01 ##LGS 的参数  
d = 1## LGS的参数  
a = b = 1
q=0.1# 0.1,0.2,0.3,0.4
burn=300
M <-100# 实验重复次数
source_file_path <- ""
# Set up parallel backend  
cl <- makeCluster(detectCores() -4)  # Use one less core than available
registerDoParallel(cl)
# 初始化一个列表来存储所有结果
all_results <- list()
for (n_val in n) {
  results <- foreach(m = 1:M, .combine = 'rbind',.packages = c('MASS','glmnet','Matrix','kernlab','KRLS')) %dorng% {
    source(source_file_path)
    train_clean <- clean_data(N1, p)
    X_train    <- train_clean$X
    Y_train    <- train_clean$Y
    #污染
    outliers <- sample(1:N1, N1*q) # 随机产生N1*q个污染的下标
    corr <- corrupted_data(X_train, Y_train, outliers, N1, p, q)
    X_train_tilde <- corr$X_tilde
    Y_train_tilde <- corr$Y_tilde
    #干净的测试集
    test_clean <- clean_data(N2, p)
    X_test<-test_clean$X
    Y_test<-test_clean$Y
    # 假设 p=列数
    ##############################训练集中抽样##################################
    ###########################基于核的方法################
    
    ######################均匀抽样################
    idx <- sample(1:N1, n_val, replace = FALSE)
    # 使用相同的索引来选取X0和Y0
    X_UNIF = X_train_tilde[idx, ]
    Y_UNIF = Y_train_tilde[idx]
    # 定义高斯核
    rbf_kernel  <- rbfdot(sigma = p)
    #均匀采样干净比例
    ps_UNIF <- intersect(idx, outliers)
    psr_UNIF <- 1-length(ps_UNIF)/n_val
    fit_final_f0 <- krls(X = X_UNIF, y = Y_UNIF, derivative = FALSE, vcov= FALSE)
    # 使用 predict 函数进行预测
    Yhat_train_tilde <- predict(fit_final_f0, newdata = X_train_tilde)$fit
    ## 计算残差
    res0<-abs(Y_train_tilde-Yhat_train_tilde)
    res0_2<-res0^2
    ##计算训练集在初始模型下的核矩阵
    K_train_tilde<-kernelMatrix(rbf_kernel,X_train_tilde,X_UNIF)
    # 计算每个样本的核范数，即计算每行的L2范数
    K_norms_train_tilde <-sqrt(rowSums( K_train_tilde^2))
    rm(K_train_tilde) # 删除核矩阵
    
    
    #######################Li2024################
    # 抽样概率
    GK<-exp(res0_2)
    max_iter <- 1 #（迭代次数）
    iter <- 0
    while(iter < max_iter) {
      iter <- iter + 1
      ## 开始抽样 
      L2MS_result<-LGMS(X_train_tilde,Y_train_tilde, GK, 1, burn, n_val)
      X_L2MS<-L2MS_result$X_selected
      Y_L2MS<-L2MS_result$Y_selected
      indices_L2MS<-L2MS_result$indices
      # 拟合最终模型
      fit_final_L2MS <- krls(X = X_L2MS, y = Y_L2MS,derivative=FALSE,vcov=FALSE)
      # 子样本污染数据比例
      ps_L2MS <- intersect(indices_L2MS, outliers)
      psr_L2MS <- 1-length(ps_L2MS)/n_val
    }
    
    ################KRMS##################
    
    # 抽样概率
    G_KRMS<-res0/K_norms_train_tilde
    max_iter <- 15 #（迭代次数）
    tolerance <- 0.03
    iter <- 0
    diff_norm<-Inf
    fit_final_KRMS<-NULL
    psr_KRMS<-NA
    while(iter < max_iter & diff_norm > tolerance) {
      iter <- iter + 1
      KRMS_result<-LGMS(X_train_tilde,Y_train_tilde, G_KRMS, 1, burn, n_val)
      X_KRMS<-KRMS_result$X_selected
      Y_KRMS<-KRMS_result$Y_selected
      indices_KRMS<-KRMS_result$indices
      # 检查 Y_KRMS 是否为常数或方差过小
      if (length(Y_KRMS) < 2 || sd(Y_KRMS) < 1e-9) {
        warning(paste0("KRMS (n_val=", n_val, ", m_rep=", m, ", iter=", iter, "): Y_KRMS 是常数或方差接近于零。正在停止此轮KRMS迭代。"))
        break # 立即跳出while循环）
      }
      # 拟合最终模型
      fit_final_KRMS<-krls(X = X_KRMS, y = Y_KRMS,derivative = FALSE, vcov= FALSE)
      #更新训练集的核以及二范数
      K_train_tilde<-kernelMatrix(rbf_kernel,X_train_tilde,X_KRMS)
      K_norms_train_tilde <-sqrt(rowSums( K_train_tilde^2))
      rm(K_train_tilde) # 释放内存
      # 用干净样本得到的模型预测预测训练集实现迭代
      Yhat_train_tilde_new <- predict(fit_final_KRMS, newdata =X_train_tilde)$fit
      #更新残差
      res<-abs(Y_train_tilde-Yhat_train_tilde_new)
      #更新抽样概率
      G_KRMS<-res/K_norms_train_tilde
      #迭代停止条件
      diff_norm<- mean((Yhat_train_tilde-Yhat_train_tilde_new)^2)
      # 在循环内部计算差异后更新
      Yhat_train_tilde <- Yhat_train_tilde_new
      # 子样本污染数据比例
      ps_KRMS <- intersect(indices_KRMS, outliers)
      psr_KRMS <- 1-length(ps_KRMS)/n_val
    }
    mse_KRMS <- NA # 默认为 NA
    if (!is.null(fit_final_KRMS)) {
      Y_test_hat_KRMS <- predict(fit_final_KRMS, newdata = X_test)$fit
      mse_KRMS <- mean((Y_test - Y_test_hat_KRMS)^2)
    } else {
      warning(paste0("KRMS (n_val=", n_val, ", m_rep=", m, "): 最终KRMS模型为NULL。MSE将为NA。"))
    }
    iter_KRMS<-iter
    
    
    
    
    #####################线性模型方法##########################
    
    ######用线性回归模型拟合均匀抽样的结果来获得初始beta####
    colnames(X_train_tilde) <- paste0("X", 1:ncol(X_train_tilde))
    colnames(X_test)        <- paste0("X", 1:ncol(X_test))
    
    ########均匀抽样拟合线性回归模型########
    
    beta0 = solve(t(X_UNIF)%*%X_UNIF)%*%t(X_UNIF)%*% Y_UNIF
    Y_train_hat_UNIF <- X_train_tilde%*%beta0
    
    ############ GMS Gong2020 ###########
    # 计算梯度  
    G <- gradient(X_train_tilde, Y_train_tilde, Y_train_hat_UNIF, p)$G
    GMS_result <- GMS(X_train_tilde, Y_train_tilde, G, 1,burn, n_val)
    X_GMS <- GMS_result$X_selected
    Y_GMS <- GMS_result$Y_selected
    beta_GMS = solve(t(X_GMS)%*%X_GMS)%*%t(X_GMS)%*% Y_GMS
    indices_GMS<-GMS_result$indices
    ps_GMS <- intersect(indices_GMS, outliers)
    psr_GMS <- 1-length(ps_GMS)/n_val
    
    ########## LGS Jing2023 #############
    iter <- 0
    diff_norm <- Inf
    max_iter <- 15
    tolerance <- 0.03
    Y_hat_prev_LGS <-Y_train_hat_UNIF# 上一次迭代的系数估计值
    beta_former <- beta0 # 初始化 beta_former
    nu <- rep(0, p)
    Gradient_result <- gradient( X_train_tilde, Y_train_tilde, Y_hat_prev_LGS, p)
    G<-Gradient_result$G
    while(iter < max_iter & diff_norm > tolerance) {
      iter <- iter + 1
      # LGS  
      LGS_result <- LGS(X_train_tilde, Y_train_tilde, G, 0,  n_val,N1)
      X_LGS <- LGS_result$X_selected
      Y_LGS <- LGS_result$Y_selected
      beta.hat = solve( t( X_LGS)%*% X_LGS/n_val + diag(nu) ) %*%
        ( t( X_LGS)%*% Y_LGS/n_val + diag(nu)%*%beta_former )
      Y_hat_new_LGS<-X_train_tilde%*%beta.hat
      #更新梯度矩阵  
      Gradient_result <- gradient(X_train_tilde, Y_train_tilde, Y_hat_new_LGS, p)
      G<-Gradient_result$G
      G_matrix<-Gradient_result$G_matrix
      #求梯度矩阵的均值  
      mu <- G_matrix |> apply(2, mean)
      #更新nu  
      nu <- c * (iter / log(1+abs(mu)))^d
      # 计算差的二范数  
      diff_norm <- mean((Y_hat_prev_LGS - Y_hat_new_LGS)^2)
      # 更新估计值  
      Y_hat_prev_LGS <-Y_hat_new_LGS
      beta_former = beta.hat
      # 使用当前迭代的样本来计算干净样本的比例  
      indices_LGS<-LGS_result$indices
      ps_LGS <- intersect(indices_LGS, outliers)
      psr_LGS <- 1-length(ps_LGS)/n_val
    }
    beta_LGS<-beta.hat
    iter_LGS<-iter
    #核方法下的均匀抽样在测试集的MSE
    Y_test_hat_UNIF<-predict(fit_final_f0, newdata = X_test)$fit
    mse_UNIF_kernel  <- mean((Y_test-Y_test_hat_UNIF)^2)
    #Li2024年测试集的MSE
    Y_test_hat_L2MS<-predict(fit_final_L2MS, newdata = X_test)$fit
    mse_L2MS  <- mean((Y_test-Y_test_hat_L2MS)^2)
  
    # KRMS测试集的MSE
    #Y_test_hat_KRMS<-predict(fit_final_KRMS, newdata = X_test)$fit
    #mse_KRMS  <- mean((Y_test-Y_test_hat_KRMS)^2)
    
    ##############线性模型下的#############
    
    #均匀抽样在测试集上的MSE
    Y_test_hat_UNIF<-X_test%*%beta0
    mse_UNIF<-mean((Y_test-Y_test_hat_UNIF)^2)
    #GMS在测试集上的MSE

    Y_test_hat_GMS<-X_test%*%beta_GMS
    mse_GMS  <- mean((Y_test-Y_test_hat_GMS)^2)
    
    #LGS在测试集上的MSE
    Y_test_hat_LGS<-X_test%*%beta_LGS
    mse_LGS  <- mean((Y_test-Y_test_hat_LGS)^2)
    
    # 返回结果作为一行数据
    data.frame(n_val = n_val, m = m, iter_KRMS=iter_KRMS,iter_LGS=iter_LGS,
               mse_UNIF_kernel=mse_UNIF_kernel, mse_L2MS = mse_L2MS, mse_KRMS = mse_KRMS,mse_UNIF=mse_UNIF,mse_GMS = mse_GMS,mse_LGS = mse_LGS,
               psr_UNIF_kernel=psr_UNIF        ,psr_L2MS = psr_L2MS, psr_KRMS = psr_KRMS,psr_UNIF=psr_UNIF,psr_GMS = psr_GMS,psr_LGS = psr_LGS)
  }
  
  
  # 计算M次重复实验的均值和标准差
  mse_mean_UNIF_kernel <- mean(results$mse_UNIF_kernel)
  mse_sd_UNIF_kernel <- sd(results$mse_UNIF_kernel)
  psr_mean_UNIF_kernel <- mean(results$psr_UNIF_kernel)
  
  mse_mean_L2MS <- mean(results$mse_L2MS)
  mse_sd_L2MS <- sd(results$mse_L2MS)
  psr_mean_L2MS <- mean(results$psr_L2MS)
  
  iter_mean_KRMS<-mean(results$iter_KRMS)
  mse_mean_KRMS <- mean(results$mse_KRMS, na.rm = TRUE)
  mse_sd_KRMS <- sd(results$mse_KRMS, na.rm = TRUE)
  psr_mean_KRMS <- mean(results$psr_KRMS, na.rm = TRUE)
  
  mse_mean_UNIF <- mean(results$mse_UNIF)
  mse_sd_UNIF <- sd(results$mse_UNIF)
  psr_mean_UNIF <- mean(results$psr_UNIF)
  
  mse_mean_GMS <- mean(results$mse_GMS)
  mse_sd_GMS <- sd(results$mse_GMS)
  psr_mean_GMS <- mean(results$psr_GMS)
  
  iter_mean_LGS<-mean(results$iter_LGS)
  mse_mean_LGS <- mean(results$mse_LGS)
  mse_sd_LGS <- sd(results$mse_LGS)
  psr_mean_LGS <- mean(results$psr_LGS)
  # 创建汇总数据框
  summary_df <- data.frame(
    n_val = n_val,
    mse_mean_UNIF_kernel=mse_mean_UNIF_kernel,
    mse_sd_UNIF_kernel=mse_sd_UNIF_kernel,
    psr_mean_UNIF_kernel=psr_mean_UNIF_kernel,
    
    mse_mean_L2MS=mse_mean_L2MS,
    mse_sd_L2MS=mse_sd_L2MS,
    psr_mean_L2MS=psr_mean_L2MS,
    
    iter_mean_KRMS= iter_mean_KRMS,
    mse_mean_KRMS=mse_mean_KRMS,
    mse_sd_KRMS=mse_sd_KRMS,
    psr_mean_KRMS=psr_mean_KRMS,
    
    mse_mean_UNIF=mse_mean_UNIF,
    mse_sd_UNIF=mse_sd_UNIF,
    psr_mean_UNIF=psr_mean_UNIF,
    
    mse_mean_GMS=mse_mean_GMS,
    mse_sd_GMS=mse_sd_GMS,
    psr_mean_GMS=psr_mean_GMS,
    
    iter_mean_LGS=iter_mean_LGS,
    mse_mean_LGS=mse_mean_LGS,
    mse_sd_LGS=mse_sd_LGS,
    psr_mean_LGS=psr_mean_LGS
  )
  # 创建格式化版本用于保存CSV
  formatted_summary <- data.frame(
    n_val = n_val,
    mse_mean_UNIF_kernel=sprintf("%.3f",mse_mean_UNIF_kernel),
    mse_sd_UNIF_kernel=sprintf("%.3f",mse_sd_UNIF_kernel),
    psr_mean_UNIF_kernel=sprintf("%.2f",psr_mean_UNIF_kernel* 100), 
    
    mse_mean_L2MS=sprintf("%.3f",mse_mean_L2MS),
    mse_sd_L2MS=sprintf("%.3f",mse_sd_L2MS),
    psr_mean_L2MS=sprintf("%.2f",psr_mean_L2MS* 100), 
    
    # KRMS 结果格式化
    iter_mean_KRMS = sprintf("%.2f", iter_mean_KRMS),
    mse_mean_KRMS = sprintf("%.3f", mse_mean_KRMS),
    mse_sd_KRMS = sprintf("%.3f", mse_sd_KRMS),
    psr_mean_KRMS = sprintf("%.2f", psr_mean_KRMS * 100),  # 转换为百分比
    
    mse_mean_UNIF=sprintf("%.3f",mse_mean_UNIF),
    mse_sd_UNIF=sprintf("%.3f",mse_sd_UNIF),
    psr_mean_UNIF=sprintf("%.2f",psr_mean_UNIF* 100),
    
    mse_mean_GMS=sprintf("%.3f",mse_mean_GMS),
    mse_sd_GMS=sprintf("%.3f",mse_sd_GMS),
    psr_mean_GMS=sprintf("%.2f",psr_mean_GMS* 100),
    
    # LGS 结果格式化
    iter_mean_LGS = sprintf("%.2f", iter_mean_LGS),
    mse_mean_LGS = sprintf("%.3f", mse_mean_LGS),
    mse_sd_LGS = sprintf("%.3f", mse_sd_LGS),
    psr_mean_LGS = sprintf("%.2f", psr_mean_LGS * 100)   # 转换为百分比
  )
  
  
  
  # 保存数值结果
  all_results[[as.character(n_val)]] <- list(
    individual = results,
    summary = formatted_summary)
  # 可选：将每个n_val的结果保存为CSV文件
  write.csv(results, file = paste0("results_n_", n_val, ".csv"), row.names = FALSE)
  #write.csv(summary_df, file = paste0("summary_n_", n_val, ".csv"), row.names = FALSE)
  # 保存格式化后的汇总结果
  write.csv(formatted_summary, file = paste0("summary_n_", n_val, ".csv"), row.names = FALSE)
}
stopCluster(cl)
registerDoSEQ()
# 可选：将所有结果保存为RData文件
save(all_results, file = "simulation_results.RData")

# 打印汇总结果
for (n_val in n) {
  cat("\nSummary for n =", n_val, ":\n")
  print(all_results[[as.character(n_val)]]$summary)
}
