library(MASS)
library(glmnet)
library(Matrix)
library(kernlab)


# Generate clean data from the nonlinear model.
#
# Draws an N x p design matrix with i.i.d. Uniform(0, 1) entries and N
# standard-normal noise terms, then builds the response from the first four
# columns of the design:
#   Y = 2 * exp(-X1) + 3 * sin(pi * X2) + 2 * X3^2 + 3 * X4 + noise
#
# Args:
#   N: number of observations (rows).
#   p: number of covariates (columns); columns 1 to 4 are indexed, so p must
#      be at least 4.
#
# Returns:
#   A list with the design matrix `X` and the response vector `Y`.
#
# NOTE: a later definition in this file (linear model) uses the same name and
# replaces this one when the whole file is sourced.
clean_data <- function(N, p) {
  # The design is drawn before the noise so the random stream is consumed in
  # a fixed order.
  design <- matrix(runif(N * p, 0, 1), ncol = p)
  noise <- rnorm(N, mean = 0, sd = 1)

  signal <- 2 * exp(-design[, 1]) +
    3 * sin(pi * design[, 2]) +
    2 * (design[, 3]^2) +
    3 * design[, 4]

  list(X = design, Y = signal + noise)
}

# Generate clean data from the linear model.
#
# Draws an N x p design matrix with i.i.d. Uniform(0, 1) entries and N
# standard-normal noise terms, then builds the response from the first four
# columns of the design:
#   Y = 1 * X1 + 2 * X2 + 3 * X3 + 4 * X4 + noise
#
# Args:
#   N: number of observations (rows).
#   p: number of covariates (columns); columns 1 to 4 are indexed, so p must
#      be at least 4.
#
# Returns:
#   A list with the design matrix `X` and the response vector `Y`.
#
# NOTE: this file also defines a nonlinear `clean_data` earlier; when the
# whole file is sourced, this later definition is the one that stays bound.
clean_data <- function(N, p) {
  # The design is drawn before the noise so the random stream is consumed in
  # a fixed order.
  design <- matrix(runif(N * p, 0, 1), ncol = p)
  noise <- rnorm(N, mean = 0, sd = 1)

  linear_part <- 1 * design[, 1] +
    2 * design[, 2] +
    3 * design[, 3] +
    4 * design[, 4]

  list(X = design, Y = linear_part + noise)
}


# Huber contamination for the nonlinear-model setting.
#
# Replaces the rows of X and the entries of Y selected by `outliers` with
# draws from ONE contamination scheme; all other observations are returned
# untouched.
#
# The previous implementation applied the three schemes one after another, so
# schemes 1 and 2 were overwritten and only scheme 3 reached the caller.
# `scheme` now selects the scheme explicitly and defaults to 3, which keeps
# the same output distribution for existing callers. The seeded random stream
# does change, because the discarded draws are no longer generated.
#
# Args:
#   X: design matrix.
#   Y: response vector.
#   outliers: index of the observations to contaminate.
#   N1, q: kept for backward compatibility only. The number of contaminated
#     rows is taken from `outliers` itself instead of N1 * q, which could be
#     truncated when the product is not an exact integer in floating point.
#   p: number of columns of X.
#   scheme: contamination scheme.
#     1 = background noise: X ~ Uniform(-10, 10), Y ~ N(0, 5).
#     2 = negatively centred design: X ~ N(-5, 10), Y ~ N(0, 5).
#     3 = "mixed" design (default): X is the average of an N(-10, 5) and an
#         N(10, 5) draw, Y ~ N(0, 10).
#
# Returns:
#   A list with the contaminated design `X_tilde` and response `Y_tilde`.
corrupted_data <- function(X, Y, outliers, N1, p, q, scheme = 3) {
  if (length(scheme) != 1 || !(scheme %in% 1:3)) {
    stop("scheme must be 1, 2 or 3")
  }

  # Count the rows that will be overwritten, whatever kind of index
  # `outliers` is.
  n_out <- nrow(X[outliers, , drop = FALSE])
  n_cells <- n_out * p
  y_sd <- sqrt(5)

  if (scheme == 1) {
    x_draws <- runif(n_cells, -10, 10)
  } else if (scheme == 2) {
    x_draws <- rnorm(n_cells, mean = -5, sd = sqrt(10))
  } else {
    # NOTE(review): this averages two normal draws rather than sampling from
    # a two-component mixture; kept exactly as in the original -- confirm
    # that this is the intended "mixed" design.
    x_draws <- 0.5 * rnorm(n_cells, mean = -10, sd = sqrt(5)) +
      0.5 * rnorm(n_cells, mean = 10, sd = sqrt(5))
    y_sd <- sqrt(10)
  }

  X[outliers, ] <- matrix(x_draws, ncol = p)
  # Outlying responses are pure noise, unrelated to the contaminated design.
  Y[outliers] <- rnorm(n_out, mean = 0, sd = y_sd)

  list(X_tilde = X, Y_tilde = Y)
}


# Huber contamination for the linear-model setting.
#
# Replaces the rows of X and the entries of Y selected by `outliers` with
# draws from ONE contamination scheme; all other observations are returned
# untouched.
#
# The previous implementation applied the three schemes one after another, so
# schemes 1 and 2 were overwritten and only scheme 3 reached the caller.
# `scheme` now selects the scheme explicitly and defaults to 3, which keeps
# the same output distribution for existing callers. The seeded random stream
# does change, because the discarded draws are no longer generated.
#
# Args:
#   X: design matrix.
#   Y: response vector.
#   outliers: index of the observations to contaminate.
#   N1, q: kept for backward compatibility only. The number of contaminated
#     rows is taken from `outliers` itself instead of N1 * q, which could be
#     truncated when the product is not an exact integer in floating point.
#   p: number of columns of X.
#   scheme: contamination scheme for the design.
#     1 = t distribution with 1 degree of freedom.
#     2 = exponential distribution with rate 1.
#     3 = F distribution with df1 = 1, df2 = 1 (default).
#     In every scheme the outlying responses are drawn from N(0, 10).
#
# Returns:
#   A list with the contaminated design `X_tilde` and response `Y_tilde`.
#
# NOTE: this file also defines a nonlinear-setting `corrupted_data` earlier;
# when the whole file is sourced, this later definition is the one that stays
# bound.
corrupted_data <- function(X, Y, outliers, N1, p, q, scheme = 3) {
  if (length(scheme) != 1 || !(scheme %in% 1:3)) {
    stop("scheme must be 1, 2 or 3")
  }

  # Count the rows that will be overwritten, whatever kind of index
  # `outliers` is.
  n_out <- nrow(X[outliers, , drop = FALSE])
  n_cells <- n_out * p

  if (scheme == 1) {
    x_draws <- rt(n_cells, df = 1)
  } else if (scheme == 2) {
    x_draws <- rexp(n_cells, rate = 1)
  } else {
    x_draws <- rf(n_cells, df1 = 1, df2 = 1)
  }

  X[outliers, ] <- matrix(x_draws, ncol = p)
  # Outlying responses are pure noise, unrelated to the contaminated design.
  Y[outliers] <- rnorm(n_out, mean = 0, sd = sqrt(10))

  list(X_tilde = X, Y_tilde = Y)
}


### Markov sampling framework (LGMS)
#
# Runs a chain over observation indices. At every step, m candidate indices
# are drawn uniformly with replacement from 1..nrow(X); among the candidates
# with a usable score (non-missing, non-zero G) the one with the smallest G
# is proposed and accepted with probability min(1, G_current / G_candidate).
# Proposals are redrawn until one is accepted, so every step ends on an
# accepted state.
#
# Args:
#   X: design matrix; its row count defines the index range.
#   Y: response vector.
#   G: per-observation score, indexed like the rows of X
#      (assumed to have one entry per row -- TODO confirm against callers).
#   m: number of candidates drawn per proposal.
#   burn: number of initial steps whose states are not reported in `indices`.
#   n: number of steps recorded after the burn-in.
#
# Returns:
#   A list with
#     X_selected, Y_selected: rows of X / entries of Y at the recorded states,
#     indices: the n recorded states,
#     all_indices: the state after every step (burn-in included) followed by
#       the final state once more, i.e. length burn + n + 1.
LGMS <- function(X, Y, G, m, burn, n) {
  N <- nrow(X)

  # Observations that can serve as a chain state.
  valid_indices <- which(!is.na(G) & G != 0)
  if (length(valid_indices) == 0) {
    stop("No valid initial sample found with non-zero G values")
  }
  # Index into valid_indices explicitly: sample(x, 1) treats a length-one
  # numeric x as 1:x, which could start the chain on an index whose score is
  # zero or NA and leave the acceptance loop below unable to terminate.
  current_index <- valid_indices[sample.int(length(valid_indices), 1)]
  G_0 <- G[current_index]

  total_steps <- burn + n
  # Preallocate instead of growing the vectors inside the loop.
  all_indices <- integer(total_steps + 1L)
  selected_indices <- integer(n)

  # seq_len() makes the loop a no-op when burn + n is 0 (1:0 would iterate).
  for (i in seq_len(total_steps)) {
    accepted <- FALSE
    while (!accepted) {
      # The original also had a "draw without replacement from not-yet-
      # selected indices" branch, but it could never run: fewer than n states
      # are recorded for as long as the loop is active.
      candidate_indices <- sample.int(N, m, replace = TRUE)
      candidate_scores <- G[candidate_indices]
      valid_candidates <-
        candidate_indices[!is.na(candidate_scores) & candidate_scores != 0]
      if (length(valid_candidates) == 0) {
        next # no usable candidate in this draw; draw again
      }
      candidate_index <- valid_candidates[which.min(G[valid_candidates])]
      G_min <- G[candidate_index]

      # Acceptance probability.
      P <- min(1, G_0 / G_min)
      if (is.na(P)) {
        next
      }
      if (runif(1) < P) {
        current_index <- candidate_index
        G_0 <- G_min
        accepted <- TRUE
      }
    }
    all_indices[i] <- current_index
    # Only states reached after the burn-in are reported as the sample.
    if (i > burn) {
      selected_indices[i - burn] <- current_index
    }
  }

  # NOTE(review): the final state is stored a second time, as in the original
  # implementation; confirm whether callers rely on the extra entry.
  all_indices[total_steps + 1L] <- current_index

  list(
    X_selected = X[selected_indices, ],
    Y_selected = Y[selected_indices],
    indices = selected_indices,
    all_indices = all_indices
  )
}


##### LGS sampling (Jing 2023)
#
# Draws n observations without replacement, where observation i gets the
# sampling weight
#   alpha / N + (1 - alpha) * (1 / G_i) / sum(1 / G)
# i.e. a blend of uniform weights and weights proportional to 1 / G.
#
# Args:
#   X: design matrix.
#   Y: response vector.
#   G: per-observation score; its reciprocal drives the sampling weights.
#   alpha: weight of the uniform component.
#   n: subsample size.
#   N: number of observations to sample from (indices 1..N).
#
# Returns:
#   A list with the selected rows `X_selected`, the selected responses
#   `Y_selected` and the drawn `indices`.
#
# NOTE(review): zero or NA entries of G are not filtered here, unlike in
# LGMS / GMS; 1 / G is used as given.
LGS <- function(X, Y, G, alpha, n, N) {
  inverse_score <- 1 / G
  sampling_weight <- alpha / N +
    (1 - alpha) * inverse_score / sum(inverse_score)

  drawn <- sample(1:N, n, replace = FALSE, prob = sampling_weight)

  list(
    X_selected = X[drawn, ],
    Y_selected = Y[drawn],
    indices = drawn
  )
}
###### GMS sampling (Gong 2020)
#
# Runs a chain over observation indices. At every step, m candidate indices
# are drawn uniformly with replacement from 1..nrow(X); among the candidates
# with a usable score (non-missing, non-zero G) the one with the smallest G
# is proposed and accepted with probability min(1, G_current / G_candidate).
# Proposals are redrawn until one is accepted, so every step ends on an
# accepted state.
#
# Args:
#   X: design matrix; its row count defines the index range.
#   Y: response vector.
#   G: per-observation score, indexed like the rows of X
#      (assumed to have one entry per row -- TODO confirm against callers).
#   m: number of candidates drawn per proposal.
#   burn: number of initial steps whose states are not reported in `indices`.
#   n: number of steps recorded after the burn-in.
#
# Returns:
#   A list with
#     X_selected, Y_selected: rows of X / entries of Y at the recorded states,
#     indices: the n recorded states,
#     all_indices: the state after every step (burn-in included) followed by
#       the final state once more, i.e. length burn + n + 1.
GMS <- function(X, Y, G, m, burn, n) {
  N <- nrow(X)

  # Observations that can serve as a chain state.
  valid_indices <- which(!is.na(G) & G != 0)
  if (length(valid_indices) == 0) {
    stop("No valid initial sample found with non-zero G values")
  }
  # Index into valid_indices explicitly: sample(x, 1) treats a length-one
  # numeric x as 1:x, which could start the chain on an index whose score is
  # zero or NA and leave the acceptance loop below unable to terminate.
  current_index <- valid_indices[sample.int(length(valid_indices), 1)]
  G_0 <- G[current_index]

  total_steps <- burn + n
  # Preallocate instead of growing the vectors inside the loop.
  all_indices <- integer(total_steps + 1L)
  selected_indices <- integer(n)

  # seq_len() makes the loop a no-op when burn + n is 0 (1:0 would iterate).
  for (i in seq_len(total_steps)) {
    accepted <- FALSE
    while (!accepted) {
      # The original also had a "draw without replacement from not-yet-
      # selected indices" branch, but it could never run: fewer than n states
      # are recorded for as long as the loop is active.
      candidate_indices <- sample.int(N, m, replace = TRUE)
      candidate_scores <- G[candidate_indices]
      valid_candidates <-
        candidate_indices[!is.na(candidate_scores) & candidate_scores != 0]
      if (length(valid_candidates) == 0) {
        next # no usable candidate in this draw; draw again
      }
      candidate_index <- valid_candidates[which.min(G[valid_candidates])]
      G_min <- G[candidate_index]

      # Acceptance probability.
      P <- min(1, G_0 / G_min)
      if (is.na(P)) {
        next
      }
      if (runif(1) < P) {
        current_index <- candidate_index
        G_0 <- G_min
        accepted <- TRUE
      }
    }
    all_indices[i] <- current_index
    # Only states reached after the burn-in are reported as the sample.
    if (i > burn) {
      selected_indices[i - burn] <- current_index
    }
  }

  # NOTE(review): the final state is stored a second time, as in the original
  # implementation; confirm whether callers rely on the extra entry.
  all_indices[total_steps + 1L] <- current_index

  list(
    X_selected = X[selected_indices, ],
    Y_selected = Y[selected_indices],
    indices = selected_indices,
    all_indices = all_indices
  )
}

#### Per-observation gradient scores
#
# Multiplies every row of X by that observation's residual (Y - Y_hat) and
# returns the Euclidean norm of each resulting row.
#
# Args:
#   X: design matrix with p columns.
#   Y: observed responses.
#   Y_hat: fitted responses, same length as Y.
#   p: number of columns of X.
#
# Returns:
#   A list with
#     G: the L2 norm of each row of the gradient matrix,
#     G_matrix: the gradient matrix itself (X scaled row-wise by residuals).
gradient <- function(X, Y, Y_hat, p) {
  residual <- Y - Y_hat
  # Repeat the residual vector down each of the p columns so that the
  # element-wise product scales row i of X by residual i; keeping this as a
  # matrix means a mismatch with the shape of X is an error rather than a
  # silent recycle.
  residual_matrix <- matrix(rep(residual, p), ncol = p, byrow = FALSE)
  gradient_matrix <- X * residual_matrix

  # Row-wise L2 norm, vectorised instead of calling a closure per row.
  G <- sqrt(rowSums(gradient_matrix^2))
  # Alternative score left commented out in the original: the absolute
  # residual, abs(Y - X %*% beta).

  list(G = G, G_matrix = gradient_matrix)
}
