library(MASS)
library(glmnet)
library(Matrix)
library(foreach)
library(doParallel)
library(doRNG)
library(kernlab)
library(KRLS)
library(caret)
library(tidyverse)
library(lubridate)
# ---- Global setup ----
# The script expects to be run from the directory that contains function.R and
# the input CSV. The former `setwd("")` call was removed: an empty path is not
# a valid directory, so the call could not succeed, and changing the working
# directory from inside a script is a side effect callers do not expect.
# function.R presumably defines LGMS(), GMS(), LGS() and gradient(), which are
# called later in this script -- TODO confirm.
source("function.R")
set.seed(521)

# Fifth argument passed to LGMS()/GMS(); presumably the burn-in length of the
# Markov sampler -- verify against function.R.
burn <- 500
# Fractions of training rows that get replaced by synthetic outliers.
q_values <- c(0, 0.1, 0.2, 0.3)
# Subsample sizes drawn from the training set.
n_vals <- c(500, 1000, 1500)
# LGS parameter: exponent in the update of the penalty vector `nu`.
d <- 1
# Not referenced in this script; kept in case function.R reads them as globals.
a <- b <- 1
# LGS parameter: scale factor in the update of `nu`. The name shadows base::c
# as a variable only; calls such as c(1, 2) still resolve to the function.
c <- 0.01
# Number of repetitions per (q, n_val) combination.
M <- 1
# ---- Data loading and preparation ----
# Names assigned to the columns of the header-less CSV, in file order.
col_names <- c(
  "Month", "Day", "Year", "Hour",
  "CO_GT", "PT08_S1", "NMHC_GT", "C6H6_GT", "PT08_S2",
  "NOx_GT", "PT08_S3", "NO2_GT", "PT08_S4", "PT08_S5",
  "Temperature", "RelativeHumidity", "AbsoluteHumidity",
  "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
)

raw <- read_csv("AirQuality_clean.csv", col_names = FALSE, col_types = cols())
colnames(raw) <- col_names

# Basic cleaning and timestamp construction.
air <- raw %>%
  mutate(across(everything(), ~ na_if(.x, -200))) %>%   # sentinel -200 -> NA
  mutate(Datetime = make_datetime(Year, Month, Day, Hour)) %>%
  relocate(Datetime)                                     # move Datetime to front

# Response variable modelled by every estimator below.
target_pollutant <- "NO2_GT"

# Cyclic encodings of hour and month. They are added to `air` but are not part
# of `selected_features`, so the models below do not use them.
air <- air %>%
  mutate(
    Hour_sin   = sin(2 * pi * Hour   / 24),
    Hour_cos   = cos(2 * pi * Hour   / 24),
    Month_sin  = sin(2 * pi * Month  / 12),
    Month_cos  = cos(2 * pi * Month  / 12)
  )

# Predictors used by all models.
selected_features <- c(
  "Month", "Hour",
  "PT08_S1", "PT08_S2", "PT08_S3", "PT08_S4", "PT08_S5",
  "Temperature", "RelativeHumidity", "AbsoluteHumidity"
)
feature_cols <- selected_features

# Keep only rows that are complete in the target AND in every predictor.
# The sentinel replacement above can introduce NA in any column; an NA left in
# a predictor would turn the per-column mean/sd used for standardisation into
# NA and break every downstream fit. Rows are unchanged when the file has no
# missing predictors.
air_model <- air %>% drop_na(all_of(c(target_pollutant, feature_cols)))

# Accumulators filled by the simulation loop, one entry per (q, n_val) pair.
all_runs_error_list <- list()
all_summary_stats_list <- list()
# ---- Main simulation ----
# For every corruption rate `q` and subsample size `n_val`, repeat M times:
#   1. split the data 70/30 into training and test sets and standardise,
#   2. overwrite a fraction `q` of the training rows with synthetic outliers,
#   3. fit three kernel estimators (uniform, L2MS, KRMS) and three linear
#      estimators (uniform, GMS, LGS) on subsamples of size `n_val`,
#   4. record the test RMSE of each estimator (in standardised response units).
# Per-repetition errors go to `all_runs_error_list`; their mean/sd per
# combination go to `all_summary_stats_list`.
for (q in q_values) {
  for (n_val in n_vals) {
    errors_current_combination <- data.frame(
      q_value = rep(q, M),
      n_value = rep(n_val, M),
      iteration = seq_len(M),
      rmse_UNIF_kernel = numeric(M),
      rmse_L2MS = numeric(M),
      rmse_KRMS = numeric(M),
      rmse_UNIF = numeric(M),
      rmse_GMS = numeric(M),
      rmse_LGS = numeric(M))
    for (i in seq_len(M)) {
      # Reseeding per repetition means repetition i uses the same train/test
      # split for every (q, n_val) combination.
      set.seed(i)
      # Split into training (70%) and test (30%) sets.
      N <- nrow(air_model)
      train_index <- sample(1:N, size = 0.7 * N)
      train_data <- air_model[train_index, ]
      test_data <- air_model[-train_index, ]
      X_train <- data.matrix(subset(train_data, select = feature_cols))
      Y_train <- data.matrix(subset(train_data, select = target_pollutant))
      X_test <- data.matrix(subset(test_data, select = feature_cols))
      Y_test <- data.matrix(subset(test_data, select = target_pollutant))
      # Standardisation: column means and sds come from the training set only.
      train_mean <- apply(X_train, 2, mean)
      train_sd <- apply(X_train, 2, sd)
      # Standardise the training predictors.
      X_train <- scale(X_train, center = train_mean, scale = train_sd)
      # Standardise the test predictors with the training statistics.
      X_test <- scale(X_test, center = train_mean, scale = train_sd)
      Y_mean <- mean(Y_train)
      Y_sd   <- sd(Y_train)
      Y_train <- (Y_train - Y_mean) / Y_sd        # standardised training response
      Y_test  <- (Y_test  - Y_mean) / Y_sd        # test response on the same scale
      p_val <- ncol(X_train)
      N_train <- nrow(X_train)
      W <- matrix(0, nrow = N_train, ncol = p_val)
      O <- matrix(0, nrow = N_train, ncol = 1)
      # Number of training rows to corrupt.
      num_filled_rows <- ceiling(q * N_train)
      filled_rows <- sample(1:N_train, num_filled_rows)
      # Corruption: replace the chosen rows of X and Y by Gaussian outliers.
      W[filled_rows, ] <- matrix(rnorm(num_filled_rows * p_val, mean = -10, sd = sqrt(3)), nrow = num_filled_rows, ncol = p_val)
      O[filled_rows, ] <- matrix(rnorm(num_filled_rows * 1, mean = -3, sd = sqrt(3)), nrow = num_filled_rows, ncol = 1)
      X_train[filled_rows, ] <- W[filled_rows, ]
      Y_train[filled_rows, ] <- O[filled_rows, ]
      rbf_kernel <- rbfdot(sigma = 0.01)
      ####################### Uniform subsampling ###################
      idx <- sample(1:N_train, n_val, replace = FALSE)
      # Use the same indices for predictors and response.
      X_UNIF <- X_train[idx, ]
      Y_UNIF <- Y_train[idx, ]
      # Pilot kernel model fitted on the uniform subsample.
      fit_final_f0 <- krls(X = X_UNIF, y = Y_UNIF, derivative = FALSE, vcov = TRUE)
      Yhat_train <- predict(fit_final_f0, newdata = X_train)$fit
      # Absolute residuals of the pilot model on the full training set.
      res0 <- abs(Y_train - Yhat_train)
      # Squared residuals.
      res0_2 <- res0^2
      # Kernel matrix between all training rows and the uniform subsample.
      K_train <- kernelMatrix(rbf_kernel, X_train, X_UNIF)
      # L2 norm of each row of the kernel matrix.
      K_norms_train <- sqrt(rowSums(K_train^2))
      # Avoid division by zero, matching the guard used inside the KRMS loop.
      K_norms_train[K_norms_train == 0] <- 1e-9
      rm(K_train) # free the kernel matrix
      ######################### Li2014 ############
      # Sampling scores passed to LGMS().
      G <- exp(res0^2)
      max_iter <- 1 # number of iterations
      tolerance <- 0.001
      iter <- 0
      while (iter < max_iter) {
        iter <- iter + 1
        # Draw the subsample with LGMS() (defined outside this file).
        L2MS_result <- LGMS(X_train, Y_train, G, 1, burn, n_val)
        X_L2MS <- L2MS_result$X_selected
        Y_L2MS <- L2MS_result$Y_selected
        indices_L2MS <- L2MS_result$indices
        # Fit the final L2MS model.
        fit_final_L2MS <- krls(X = X_L2MS, y = Y_L2MS, derivative = FALSE, vcov = TRUE)
        # Share of selected rows that are not corrupted (diagnostic only).
        ps_L2MS <- intersect(indices_L2MS, filled_rows)
        psr_L2MS <- 1 - length(ps_L2MS) / n_val
      }
      ############################## KRMS ####################
      G_KRMS <- res0 / K_norms_train
      max_iter <- 15 # number of iterations
      tolerance <- 0.001
      iter <- 0
      diff_norm <- Inf
      # Start from the pilot fit so that "use previous estimate" is well
      # defined even when the very first KRMS iteration fails; previously the
      # model stayed NULL in that case and the test-set predict() crashed.
      fit_final_KRMS <- fit_final_f0
      # Fitted values of the previous iterate, used by the stopping rule.
      Yhat_prev_KRMS <- Yhat_train
      while (iter < max_iter && diff_norm > tolerance) {
        iter <- iter + 1
        possible_outcome <- tryCatch({
          # 1. Draw the subsample with LGMS().
          KRMS_result <- LGMS(X_train, Y_train, G_KRMS, 1, burn, n_val)
          X_KRMS <- KRMS_result$X_selected
          Y_KRMS <- KRMS_result$Y_selected
          # 2. Before calling krls(), check whether the sampled response is constant.
          if (length(unique(Y_KRMS)) < 2) {
            # Constant response: warn and raise a dedicated error for tryCatch.
            warning(sprintf("KRMS (Our Method) iter %d for q=%.1f, n_val=%d: Y_KRMS is constant. Will use previous estimate.", iter, q, n_val))
            stop("Y_KRMS_is_constant_in_iter") # custom error message
          }
          # 3. Response is not constant: fit the krls model.
          current_fit_KRMS <- krls(X = X_KRMS, y = Y_KRMS, derivative = FALSE, vcov = FALSE)
          # 4. On success return the model together with the X subsample used.
          list(success = TRUE, fit = current_fit_KRMS, X_subsample = X_KRMS)

        }, error = function(e) {
          # 5. Any error, including the custom "Y_KRMS_is_constant_in_iter".
          warning(sprintf("Error or Y_KRMS constant in KRMS (Our Method) iter %d for q=%.1f, n_val=%d: %s. Using previous estimate for KRMS method.", iter, q, n_val, e$message))
          # Return a list that marks the failure.
          list(success = FALSE, error_message = e$message)
        })

        # Act on the tryCatch result.
        if (possible_outcome$success) {
          # This iteration succeeded.
          fit_final_KRMS <- possible_outcome$fit # update the final KRMS model
          X_KRMS <- possible_outcome$X_subsample # X subsample of this iteration

          # Refresh kernel row norms, residuals and sampling scores for the
          # next iteration.
          K_train <- kernelMatrix(rbf_kernel, X_train, X_KRMS)
          K_norms_train <- sqrt(rowSums(K_train^2))
          K_norms_train[K_norms_train == 0] <- 1e-9 # avoid division by zero
          rm(K_train)

          Yhat_train_new <- predict(fit_final_KRMS, newdata = X_train)$fit
          res <- abs(Y_train - Yhat_train_new)
          G_KRMS <- res / K_norms_train
          # Stopping rule: mean squared change between successive iterates.
          # The previous fitted values must be updated each round; comparing
          # against the pilot fit every time would not measure convergence.
          diff_norm <- mean((Yhat_prev_KRMS - Yhat_train_new)^2)
          Yhat_prev_KRMS <- Yhat_train_new

        } else {
          cat(sprintf("KRMS (Our Method) while loop stopped at iteration %d for q=%.1f, n_val=%d due to error/constant Y. Final LGMS model is from previous successful state.\n", iter, q, n_val))
          break # leave the while loop
        }
      }

      #################### Linear methods #####################
      # Least-squares fit (no intercept) on the uniform subsample gives the
      # initial coefficient vector.

      beta_UNIF <- solve(t(X_UNIF) %*% X_UNIF) %*% t(X_UNIF) %*% Y_UNIF
      Y_train_hat_UNIF <- X_train %*% beta_UNIF
      ########## GMS ########
      # Gradient-based scores from gradient() (defined outside this file).
      G <- gradient(X_train, Y_train, Y_train_hat_UNIF, p_val)$G
      GMS_result <- GMS(X_train, Y_train, G, 1, burn, n_val)
      X_GMS <- GMS_result$X_selected
      Y_GMS <- GMS_result$Y_selected
      #train_GMS <- data.frame(Y = Y_GMS,  X_GMS )
      #model_GMS <- lm(Y ~ ., data = train_GMS)
      beta_GMS <- solve(t(X_GMS) %*% X_GMS) %*% t(X_GMS) %*% Y_GMS
      ########## LGS #############
      iter <- 0
      diff_norm <- Inf
      max_iter <- 15
      tolerance <- 0.001
      Y_hat_prev_LGS <- Y_train_hat_UNIF # fitted values of the previous iterate
      beta_former <- beta_UNIF # initial value of beta_former
      nu <- rep(0, p_val)
      Gradient_result <- gradient(X_train, Y_train, Y_hat_prev_LGS, p_val)
      G <- Gradient_result$G
      while (iter < max_iter && diff_norm > tolerance) {
        iter <- iter + 1
        # Draw the subsample with LGS() (defined outside this file).
        LGS_result <- LGS(X_train, Y_train, G, 0, n_val, N_train)
        X_LGS <- LGS_result$X_selected
        Y_LGS <- LGS_result$Y_selected
        #train_LGS <- data.frame(Y = Y_LGS,  X_LGS )
        #model_LGS<- lm(Y ~ ., data = train_LGS)
        #Y_hat_new_LGS<-predict(model_LGS, newdata = X_train_df)
        # Least squares with a diagonal penalty `nu` pulling towards beta_former.
        beta.hat <- solve(t(X_LGS) %*% X_LGS / n_val + diag(nu)) %*%
          (t(X_LGS) %*% Y_LGS / n_val + diag(nu) %*% beta_former)
        Y_hat_new_LGS <- X_train %*% beta.hat
        # Update the gradient quantities.
        Gradient_result <- gradient(X_train, Y_train, Y_hat_new_LGS, p_val)
        G <- Gradient_result$G
        G_matrix <- Gradient_result$G_matrix
        # Column means of the gradient matrix.
        mu <- G_matrix |> apply(2, mean)
        # Update nu.
        nu <- c * (iter / log(1 + abs(mu)))^d
        # Mean squared change between successive fitted values.
        diff_norm <- mean((Y_hat_prev_LGS - Y_hat_new_LGS)^2)
        # Carry the current estimates over to the next iteration.
        Y_hat_prev_LGS <- Y_hat_new_LGS
        beta_former <- beta.hat
      }
      beta_LGS <- beta.hat
      # Kernel method, uniform subsample: test error.
      Y_test_hat_UNIF_kernel <- predict(fit_final_f0, newdata = X_test)$fit
      mse_UNIF_kernel <- mean((Y_test - Y_test_hat_UNIF_kernel)^2)
      rmse_UNIF_kernel <- sqrt(mse_UNIF_kernel)
      # Kernel method, Li2014 (L2MS): test error.
      Y_test_hat_L2MS <- predict(fit_final_L2MS, newdata = X_test)$fit
      mse_L2MS <- mean((Y_test - Y_test_hat_L2MS)^2)
      rmse_L2MS <- sqrt(mse_L2MS)
      # Kernel method, KRMS (our method) after the iterations stop: test error.
      Y_test_hat_KRMS <- predict(fit_final_KRMS, newdata = X_test)$fit
      mse_KRMS <- mean((Y_test - Y_test_hat_KRMS)^2)
      rmse_KRMS <- sqrt(mse_KRMS)


      ############## Linear models #########


      # Uniform subsample: test error.
      Y_test_hat_UNIF <- X_test %*% beta_UNIF
      mse_UNIF <- mean((Y_test - Y_test_hat_UNIF)^2)
      rmse_UNIF <- sqrt(mse_UNIF)
      # GMS: test error.

      Y_test_hat_GMS <- X_test %*% beta_GMS
      mse_GMS <- mean((Y_test - Y_test_hat_GMS)^2)
      rmse_GMS <- sqrt(mse_GMS)
      # LGS: test error.
      Y_test_hat_LGS <- X_test %*% beta_LGS
      mse_LGS <- mean((Y_test - Y_test_hat_LGS)^2)
      rmse_LGS <- sqrt(mse_LGS)

      errors_current_combination$rmse_UNIF_kernel[i] <- rmse_UNIF_kernel
      errors_current_combination$rmse_L2MS[i] <- rmse_L2MS
      errors_current_combination$rmse_KRMS[i] <- rmse_KRMS
      errors_current_combination$rmse_UNIF[i] <- rmse_UNIF
      errors_current_combination$rmse_GMS[i] <- rmse_GMS
      errors_current_combination$rmse_LGS[i] <- rmse_LGS
      cat(sprintf("Finished iteration %d for q=%.1f, n_val=%d\n", i, q, n_val))
    }
    # Store the per-repetition errors of this (q, n_val) combination.
    all_runs_error_list[[paste0("q", q, "_n", n_val)]] <- errors_current_combination

    # Summary statistics of this (q, n_val) combination. The first three
    # columns (q, n, iteration) are excluded. With M = 1 the sd is NA.
    mean_errors_current <- colMeans(errors_current_combination[, -(1:3)])
    sd_errors_current <- apply(errors_current_combination[, -(1:3)], 2, sd)

    summary_stats_current <- data.frame(
      q_value = q,
      n_value = n_val,
      Method = names(mean_errors_current),
      Mean_RMSE = mean_errors_current,
      SD_RMSE = sd_errors_current,
      row.names = NULL
    )
    all_summary_stats_list[[paste0("q_summary", q, "_n_summary", n_val)]] <- summary_stats_current

    cat(sprintf("\n--- Summary for q = %.1f, n_val = %d ---\n", q, n_val))
    print(summary_stats_current)

  } # End of n_val loop
} # End of q loop

# ---- Collect and export results ----
# Stack the per-combination tables into one data frame each and discard the
# list-derived row names.
final_all_iterations_df <- all_runs_error_list |>
  do.call(what = rbind)
final_summary_statistics_df <- all_summary_stats_list |>
  do.call(what = rbind)
rownames(final_all_iterations_df) <- NULL
rownames(final_summary_statistics_df) <- NULL

# Write both combined tables to CSV files in the working directory.
write.csv(
  final_all_iterations_df,
  file = "all_iterations_errors_combined.csv",
  row.names = FALSE
)
write.csv(
  final_summary_statistics_df,
  file = "summary_statistics_all_params_combined.csv",
  row.names = FALSE
)

# Report completion and echo the summary table to the console.
cat("\n\n--- 所有模拟完成。组合结果已保存。 ---\n")
cat("所有参数组合的摘要:\n")
print(final_summary_statistics_df)


