library(MASS)
library(glmnet)
library(Matrix)
library(foreach)
library(doParallel)
library(doRNG)
library(kernlab)
library(KRLS)
library(caret)
# Load the project helpers. LGMS(), GMS(), LGS() and gradient() are called
# further down but are not defined in this script; presumably they come from
# function.R -- TODO confirm.
#
# The path is relative, so run the script from the directory that contains
# function.R and the input CSV. The former `setwd("")` call was removed: an
# empty string is not a valid directory, so the call raised
# "cannot change working directory" and aborted the script before this line.
source("function.R")
# ---- Experiment configuration ----
# Global seed; it governs every random draw made before the first per-replication
# re-seed inside the simulation loop.
set.seed(521)

# Passed to LGMS()/GMS() as their fifth argument (presumably a burn-in length
# for the Markov sampler -- TODO confirm against function.R).
burn <- 300

# Contamination ratios: share of training rows replaced by outliers.
q_values <- c(0, 0.1, 0.2, 0.3)

# Subsample sizes.
n_vals <- c(500, 1000, 1500)

# LGS tuning constants. `c` and `d` enter the nu update in the LGS loop;
# `a` and `b` are not read in this script (possibly read by function.R).
# NOTE(review): the variable `c` shares its name with base::c(); function
# calls such as c(1, 2) still resolve to the base function.
d <- 1
a <- b <- 1
c <- 0.01

# Number of Monte Carlo replications per (q, n_val) combination.
M <- 100
# ---- Data loading ----
# Read the input table (path is relative to the working directory) and record
# the total number of observations.
data <- read.csv("nasdaq_processed_R.csv", header = TRUE)
N <- nrow(data)

# Predictor columns used to build the design matrix.
feature_names <- c(
  # economic indicators
  "InterestRate", "ExchangeRate", "VIX", "TEDSpread", "EFFR", "Gold", "Oil",
  # lagged returns
  "Return_Lag_1", "Return_Lag_2", "Return_Lag_3", "Return_Lag_4",
  "Return_Lag_5",
  # moving averages
  "MA_5", "MA_20"
)

# Response as a one-column numeric matrix, predictors as a numeric matrix.
Y <- data.matrix(subset(data, select = Target_Return))
X <- data.matrix(subset(data, select = feature_names))

# Accumulators filled by the simulation loop: one entry per (q, n_val)
# combination, holding the per-replication errors and the summary statistics.
all_runs_error_list <- list()
all_summary_stats_list <- list()

# ---- Simulation loop ----
# For every contamination ratio q and subsample size n_val, run M replications.
# Each replication: split into train/test, standardize, contaminate the
# training set, fit six estimators on subsamples of size n_val and record their
# test-set RMSE:
#   kernel: UNIF_kernel (uniform subsample), L2MS, KRMS
#   linear: UNIF (uniform subsample), GMS, LGS
# LGMS(), GMS(), LGS() and gradient() are not defined in this script
# (presumably provided by function.R -- TODO confirm their contracts).
for (q in q_values) {
  for (n_val in n_vals) {
    # Per-replication RMSE table for the current (q, n_val) combination.
    errors_current_combination <- data.frame(
      q_value = rep(q, M),
      n_value = rep(n_val, M),
      iteration = seq_len(M),
      rmse_UNIF_kernel = numeric(M),
      rmse_L2MS = numeric(M),
      rmse_KRMS = numeric(M),
      rmse_UNIF = numeric(M),
      rmse_GMS = numeric(M),
      rmse_LGS = numeric(M)
    )

    for (i in seq_len(M)) {
      # ---- Train (70%) / test (30%) split ----
      # floor() makes explicit the truncation that sample() applies to a
      # fractional size.
      train_index <- sample(seq_len(N), size = floor(0.7 * N))
      X_train <- X[train_index, ]
      X_test <- X[-train_index, ]
      # drop = FALSE keeps the responses as one-column matrices.
      Y_train <- Y[train_index, , drop = FALSE]
      Y_test <- Y[-train_index, , drop = FALSE]

      # ---- Standardization ----
      # Center/scale both sets with the training-set mean and sd.
      train_mean <- apply(X_train, 2, mean)
      train_sd <- apply(X_train, 2, sd)
      X_train <- scale(X_train, center = train_mean, scale = train_sd)
      X_test <- scale(X_test, center = train_mean, scale = train_sd)

      # Re-seed per replication: contamination and all subsampling draws below
      # depend only on i. Note the train/test split above is drawn BEFORE this
      # call, so it depends on the RNG state left by the previous replication.
      set.seed(i)

      # ---- Contamination of the training set ----
      p_val <- ncol(X_train)
      N_train <- nrow(X_train)
      W <- matrix(0, nrow = N_train, ncol = p_val)
      O <- matrix(0, nrow = N_train, ncol = 1)
      # Number of training rows to overwrite with outliers.
      num_filled_rows <- ceiling(q * N_train)
      filled_rows <- sample(1:N_train, num_filled_rows)
      # Outlier predictors ~ U(2, 3); outlier responses ~ N(1, variance 3).
      W[filled_rows, ] <- matrix(
        runif(num_filled_rows * p_val, 2, 3),
        nrow = num_filled_rows, ncol = p_val
      )
      O[filled_rows, ] <- matrix(
        rnorm(num_filled_rows * 1, mean = 1, sd = sqrt(3)),
        nrow = num_filled_rows, ncol = 1
      )
      X_train[filled_rows, ] <- W[filled_rows, ]
      Y_train[filled_rows, ] <- O[filled_rows, ]

      # Gaussian (RBF) kernel used for the kernel-norm weights.
      rbf_kernel <- rbfdot(sigma = p_val)

      # ---- Uniform subsample and initial kernel fit ----
      idx <- sample(1:N_train, n_val, replace = FALSE)
      # The same indices select predictors and responses.
      X_UNIF <- X_train[idx, ]
      Y_UNIF <- Y_train[idx, ]
      # NOTE(review): K_UNIF is never read in this script; kept only in case a
      # helper in function.R reads it from the global environment -- confirm
      # and remove (it costs an n_val x n_val kernel evaluation per replication).
      K_UNIF <- kernelMatrix(rbf_kernel, X_UNIF)
      # Initial KRLS model on the uniform subsample.
      fit_final_f0 <- krls(
        X = X_UNIF, y = Y_UNIF, derivative = FALSE, vcov = FALSE
      )
      Yhat_train <- predict(fit_final_f0, newdata = X_train)$fit
      # Absolute residuals of the initial model on the full training set.
      res0 <- abs(Y_train - Yhat_train)
      # NOTE(review): res0_2 is never read in this script (see K_UNIF note).
      res0_2 <- res0^2
      # Kernel matrix between all training rows and the uniform subsample;
      # its row-wise L2 norms are the per-sample kernel norms.
      K_train <- kernelMatrix(rbf_kernel, X_train, X_UNIF)
      K_norms_train <- sqrt(rowSums(K_train^2))
      # Same zero guard as in the KRMS update below: a row whose kernel values
      # all underflow to 0 would otherwise make G_KRMS Inf/NaN.
      K_norms_train[K_norms_train == 0] <- 1e-9
      rm(K_train) # free the kernel matrix

      # ---- L2MS (labelled "Li2014" in the original comments) ----
      # Sampling weights.
      G <- exp(res0^2)
      max_iter <- 1 # number of iterations
      tolerance <- 0.001
      iter <- 0
      while (iter < max_iter) {
        iter <- iter + 1
        # Draw the subsample (original comment: small-gradient Markov sampling).
        L2MS_result <- LGMS(X_train, Y_train, G, 1, burn, n_val)
        X_L2MS <- L2MS_result$X_selected
        Y_L2MS <- L2MS_result$Y_selected
        indices_L2MS <- L2MS_result$indices
        # Fit the final model on the selected subsample.
        fit_final_L2MS <- krls(
          X = X_L2MS, y = Y_L2MS, derivative = FALSE, vcov = FALSE
        )
      }

      # ---- KRMS (labelled "Our Method" in the log messages) ----
      # Weights: absolute residual divided by the kernel norm.
      G_KRMS <- res0 / K_norms_train
      max_iter <- 15 # maximum number of iterations
      tolerance <- 0.001
      iter <- 0
      diff_norm <- Inf
      # Start from the initial uniform fit so that a failure in the very first
      # iteration still leaves a usable "previous estimate" (previously NULL,
      # which made the predict() call on the test set fail).
      fit_final_KRMS <- fit_final_f0
      # Training predictions of the previous iterate, used by the stopping rule.
      Yhat_prev_KRMS <- Yhat_train
      while (iter < max_iter && diff_norm > tolerance) {
        iter <- iter + 1
        # possible_outcome holds either the fitted model or a failure marker.
        possible_outcome <- tryCatch({
          # 1. Draw the subsample.
          KRMS_result <- LGMS(X_train, Y_train, G_KRMS, 1, burn, n_val)
          X_KRMS <- KRMS_result$X_selected
          Y_KRMS <- KRMS_result$Y_selected
          # 2. Check for a constant response before calling krls(); if so,
          #    warn and raise a dedicated error handled below.
          if (length(unique(Y_KRMS)) < 2) {
            warning(sprintf("KRMS (Our Method) iter %d for q=%.1f, n_val=%d: Y_KRMS is constant. Will use previous estimate.", iter, q, n_val))
            stop("Y_KRMS_is_constant_in_iter")
          }

          # 3. Fit KRLS on the subsample.
          current_fit_KRMS <- krls(
            X = X_KRMS, y = Y_KRMS, derivative = FALSE, vcov = FALSE
          )
          # 4. Return the model together with the subsample it was fitted on.
          list(success = TRUE, fit = current_fit_KRMS, X_subsample = X_KRMS)
        }, error = function(e) {
          # 5. Any error (including the constant-response one) ends up here.
          warning(sprintf("Error or Y_KRMS constant in KRMS (Our Method) iter %d for q=%.1f, n_val=%d: %s. Using previous estimate for KRMS method.", iter, q, n_val, conditionMessage(e)))
          list(success = FALSE, error_message = conditionMessage(e))
        })

        if (possible_outcome$success) {
          # Accept this iterate.
          fit_final_KRMS <- possible_outcome$fit
          X_KRMS <- possible_outcome$X_subsample

          # Refresh kernel norms, residuals and weights for the next draw.
          K_train <- kernelMatrix(rbf_kernel, X_train, X_KRMS)
          K_norms_train <- sqrt(rowSums(K_train^2))
          K_norms_train[K_norms_train == 0] <- 1e-9 # avoid division by zero
          rm(K_train)

          Yhat_train_new <- predict(fit_final_KRMS, newdata = X_train)$fit
          res <- abs(Y_train - Yhat_train_new)
          G_KRMS <- res / K_norms_train
          # Stopping rule: mean squared change of the training predictions
          # between consecutive iterates (mirrors the LGS loop). Previously the
          # comparison was always against the initial uniform fit, so it did
          # not measure convergence.
          diff_norm <- mean((Yhat_prev_KRMS - Yhat_train_new)^2)
          Yhat_prev_KRMS <- Yhat_train_new
        } else {
          cat(sprintf("KRMS (Our Method) while loop stopped at iteration %d for q=%.1f, n_val=%d due to error/constant Y. Final LGMS model is from previous successful state.\n", iter, q, n_val))
          break
        }
      }

      # ---- Linear methods ----
      # Least squares on the uniform subsample gives the initial beta.
      beta_UNIF <- solve(t(X_UNIF) %*% X_UNIF) %*% t(X_UNIF) %*% Y_UNIF
      Y_train_hat_UNIF <- X_train %*% beta_UNIF

      # ---- GMS ----
      # Gradient-based weights from the initial linear fit.
      G <- gradient(X_train, Y_train, Y_train_hat_UNIF, p_val)$G
      GMS_result <- GMS(X_train, Y_train, G, 1, burn, n_val)
      X_GMS <- GMS_result$X_selected
      Y_GMS <- GMS_result$Y_selected
      beta_GMS <- solve(t(X_GMS) %*% X_GMS) %*% t(X_GMS) %*% Y_GMS

      # ---- LGS ----
      iter <- 0
      diff_norm <- Inf
      max_iter <- 15
      tolerance <- 0.001
      Y_hat_prev_LGS <- Y_train_hat_UNIF # predictions of the previous iterate
      beta_former <- beta_UNIF # coefficients of the previous iterate
      nu <- rep(0, p_val)
      Gradient_result <- gradient(X_train, Y_train, Y_hat_prev_LGS, p_val)
      G <- Gradient_result$G
      while (iter < max_iter && diff_norm > tolerance) {
        iter <- iter + 1
        LGS_result <- LGS(X_train, Y_train, G, 0, n_val, N_train)
        X_LGS <- LGS_result$X_selected
        Y_LGS <- LGS_result$Y_selected
        # Least squares with a per-coordinate penalty nu that pulls the
        # solution toward the previous iterate beta_former.
        beta.hat <- solve(t(X_LGS) %*% X_LGS / n_val + diag(nu)) %*%
          (t(X_LGS) %*% Y_LGS / n_val + diag(nu) %*% beta_former)
        Y_hat_new_LGS <- X_train %*% beta.hat
        # Refresh the gradient quantities at the new iterate.
        Gradient_result <- gradient(X_train, Y_train, Y_hat_new_LGS, p_val)
        G <- Gradient_result$G
        G_matrix <- Gradient_result$G_matrix
        # Column means of the gradient matrix.
        mu <- G_matrix |> apply(2, mean)
        # Update nu.
        # NOTE(review): log(1 + abs(mu)) is 0 for a component of mu that is
        # exactly 0, which would make the matching nu infinite.
        nu <- c * (iter / log(1 + abs(mu)))^d
        # Mean squared change of the training predictions.
        diff_norm <- mean((Y_hat_prev_LGS - Y_hat_new_LGS)^2)
        # Carry the current iterate forward.
        Y_hat_prev_LGS <- Y_hat_new_LGS
        beta_former <- beta.hat
      }
      beta_LGS <- beta.hat

      # ---- Test-set evaluation ----
      # Kernel model on the uniform subsample.
      Y_test_hat_UNIF_kernel <- predict(fit_final_f0, newdata = X_test)$fit
      mse_UNIF_kernel <- mean((Y_test - Y_test_hat_UNIF_kernel)^2)
      rmse_UNIF_kernel <- sqrt(mse_UNIF_kernel)
      # L2MS kernel model.
      Y_test_hat_L2MS <- predict(fit_final_L2MS, newdata = X_test)$fit
      mse_L2MS <- mean((Y_test - Y_test_hat_L2MS)^2)
      rmse_L2MS <- sqrt(mse_L2MS)
      # KRMS kernel model (last successful iterate, or the initial uniform
      # fit if no iteration succeeded).
      Y_test_hat_KRMS <- predict(fit_final_KRMS, newdata = X_test)$fit
      mse_KRMS <- mean((Y_test - Y_test_hat_KRMS)^2)
      rmse_KRMS <- sqrt(mse_KRMS)

      # Linear model on the uniform subsample.
      Y_test_hat_UNIF <- X_test %*% beta_UNIF
      mse_UNIF <- mean((Y_test - Y_test_hat_UNIF)^2)
      rmse_UNIF <- sqrt(mse_UNIF)
      # GMS linear model.
      Y_test_hat_GMS <- X_test %*% beta_GMS
      mse_GMS <- mean((Y_test - Y_test_hat_GMS)^2)
      rmse_GMS <- sqrt(mse_GMS)
      # LGS linear model.
      Y_test_hat_LGS <- X_test %*% beta_LGS
      mse_LGS <- mean((Y_test - Y_test_hat_LGS)^2)
      rmse_LGS <- sqrt(mse_LGS)

      # Record this replication.
      errors_current_combination$rmse_UNIF_kernel[i] <- rmse_UNIF_kernel
      errors_current_combination$rmse_L2MS[i] <- rmse_L2MS
      errors_current_combination$rmse_KRMS[i] <- rmse_KRMS
      errors_current_combination$rmse_UNIF[i] <- rmse_UNIF
      errors_current_combination$rmse_GMS[i] <- rmse_GMS
      errors_current_combination$rmse_LGS[i] <- rmse_LGS
      cat(sprintf("Finished iteration %d for q=%.1f, n_val=%d\n", i, q, n_val))
    }

    # Store the per-replication errors of this (q, n_val) combination.
    all_runs_error_list[[paste0("q", q, "_n", n_val)]] <- errors_current_combination

    # Mean and standard deviation of each RMSE column (the first three columns
    # are q, n and the iteration index).
    mean_errors_current <- colMeans(errors_current_combination[, -(1:3)])
    sd_errors_current <- apply(errors_current_combination[, -(1:3)], 2, sd)

    summary_stats_current <- data.frame(
      q_value = q,
      n_value = n_val,
      Method = names(mean_errors_current),
      Mean_RMSE = mean_errors_current,
      SD_RMSE = sd_errors_current,
      row.names = NULL
    )
    all_summary_stats_list[[paste0("q_summary", q, "_n_summary", n_val)]] <- summary_stats_current

    cat(sprintf("\n--- Summary for q = %.1f, n_val = %d ---\n", q, n_val))
    print(summary_stats_current)
  } # end of n_val loop
} # end of q loop

# ---- Combine and save results ----
# Row-bind a list of data frames and reset the row names to the default
# 1..n sequence.
stack_results <- function(pieces) {
  stacked <- do.call(rbind, pieces)
  rownames(stacked) <- NULL
  stacked
}

# One row per replication, and one row per (q, n_val, method) summary.
final_all_iterations_df <- stack_results(all_runs_error_list)
final_summary_statistics_df <- stack_results(all_summary_stats_list)

# Write both combined tables to CSV in the working directory.
write.csv(
  final_all_iterations_df,
  file = "all_iterations_errors_combined.csv",
  row.names = FALSE
)
write.csv(
  final_summary_statistics_df,
  file = "summary_statistics_all_params_combined.csv",
  row.names = FALSE
)

# Final console report.
cat("\n\n--- 所有模拟完成。组合结果已保存。 ---\n")
cat("所有参数组合的摘要:\n")
print(final_summary_statistics_df)
