library(tidyverse)
library(ggplot2)
# ---- Stage 1: build the neighbourhood subset used by the experiments ----
# Reads the raw housing data, explores SalePrice by neighbourhood, keeps a
# hand-picked set of neighbourhoods, labels each as "low" or "high" priced and
# writes the result to all_data_filter.csv.
#
# The workspace wipe (`rm(list = ls())`) that used to open this stage was
# dropped: every object used below is assigned here, and a script should not
# delete whatever the caller has in their session.
house <- read_csv("train.csv") %>%
  mutate(Neighborhood = as.factor(Neighborhood))

# Number of houses per neighbourhood, largest first (kept for inspection).
a <- house %>%
  count(Neighborhood) %>%
  arrange(desc(n))

# Box plot of SalePrice per neighbourhood, ordered by median price.
ggplot(house, aes(x = reorder(Neighborhood, SalePrice, median), y = SalePrice)) +
  geom_boxplot(fill = "#69b3a2") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Neighborhood", y = "SalePrice", title = "House Price by Neighborhood")

# Mean / standard deviation / count of SalePrice per neighbourhood.
house %>%
  group_by(Neighborhood) %>%
  summarise(
    MeanPrice = mean(SalePrice, na.rm = TRUE),
    SDPrice = sd(SalePrice, na.rm = TRUE),
    n = n()
  ) %>%
  arrange(desc(MeanPrice))

# Neighbourhoods kept for the experiments.
# Fix: the code was spelled "SaWyer", which `%in%` (case-sensitive) can never
# match against the dataset's "Sawyer" level, so that neighbourhood was
# silently excluded even though it is listed here and in the "low" group.
selected_hoods <- c(
  "Edwards", "OldTown", "IDOTRR", "Sawyer",
  "Timber", "NridgHt", "CollgCr", "NoRidge", "StoneBr"
)

house_sub <- house[house$Neighborhood %in% selected_hoods, ]

# Price tier of each neighbourhood. The lists are deliberately wider than
# `selected_hoods`; anything in neither list would be labelled NA.
low_hoods <- c(
  "MeadowV", "IDOTRR", "BrDale", "OldTown", "Sawyer", "Edwards", "NAmes"
)
high_hoods <- c(
  "Somerst", "Timber", "StoneBr", "NridgHt", "CollgCr", "NoRidge"
)
house_sub$Neighborhood_group <- ifelse(
  house_sub$Neighborhood %in% low_hoods, "low",
  ifelse(house_sub$Neighborhood %in% high_hoods, "high", NA)
)
count(house_sub, Neighborhood_group)

# Numeric indicator: 1 for "high", 0 otherwise.
house_sub$Neighborhood_binary <- ifelse(
  house_sub$Neighborhood_group == "high", 1, 0
)

# Turn the label into a factor so the level order is always low < high.
house_sub$Neighborhood_group <- factor(
  house_sub$Neighborhood_group,
  levels = c("low", "high")
)

write.csv(house_sub, "all_data_filter.csv")

######### res_result: stage 2 (detection helpers and simulation study)
# NOTE(review): this wipes every object in the workspace, including anything
# the caller had defined before running the script. Nothing defined after this
# line appears to depend on the wipe -- confirm and consider removing it.
rm(list = ls())
library(randomForest)
library(tidyverse)
library(doSNOW)
library(randomForest)
library(grf)
library(MASS)

# Progress callback: prints a timestamped line reporting how many tasks have
# finished so far. `nfin` is the number of completed tasks (formatted with %d).
progress <- function(nfin) {
  stamp <- Sys.time()
  status_line <- sprintf('%s: tasks completed: %d.\n', stamp, nfin)
  cat(status_line)
}

# Option list carrying the callback under the name `progress`.
opts <- list(progress = progress)

# Pairwise squared differences of a vector.
#
# vec: atomic numeric (or integer/logical) vector of length n.
# Returns an n x n matrix (without dimnames) whose (i, j) entry is
# (vec[i] - vec[j])^2. A zero-length input gives a 0 x 0 matrix.
dis_vec <- function(vec) {
  # as.vector() strips names so that outer() does not attach dimnames.
  values <- as.vector(vec)
  outer(values, values, "-")^2
}

# Pairwise squared Euclidean distances between observations.
#
# mat: either a matrix with one observation per row and one feature per
#      column, or a plain vector (a single feature).
# Returns an n x n matrix: for a matrix input, the sum over columns of
# dis_vec(column); for a vector input, dis_vec(mat) itself.
# A matrix with zero columns yields an n x n matrix of zeros.
dis_matrix <- function(mat) {
  if (!is.matrix(mat)) {
    return(dis_vec(mat))
  }
  n_obs <- nrow(mat)
  # lapply() instead of apply(..., simplify = FALSE): the `simplify` argument
  # of apply() only exists from R 4.1 on.
  per_feature <- lapply(seq_len(ncol(mat)), function(j) dis_vec(mat[, j]))
  # Seeding the sum with a zero matrix leaves non-empty results unchanged and
  # gives a well-defined answer when there are no columns.
  Reduce(`+`, per_feature, matrix(0, n_obs, n_obs))
}

# Benjamini-Hochberg style rejection count.
#
# p_k:   numeric vector of p-values (must not contain NA).
# alpha: target level.
# Returns the largest rank r (ties take the maximal rank) among the p-values
# that satisfy p < alpha * rank / length(p_k), or 0 when none does. Note the
# strict inequality. An empty p_k gives 0.
#
# The defaults `p_k = p_k` / `alpha = alpha` are kept only so the signature is
# unchanged; they are self-referential and cannot be evaluated, so both
# arguments must always be supplied by the caller.
BH <- function(p_k = p_k, alpha = alpha) {
  n_hyp <- length(p_k)
  if (n_hyp == 0L) {
    return(0)
  }
  if (anyNA(p_k)) {
    stop("BH(): p_k must not contain missing values", call. = FALSE)
  }
  # rank(ties.method = "max") equals "number of p-values <= this one", which
  # the pairwise comparison computed before in quadratic time.
  rank_p <- rank(p_k, ties.method = "max")
  below_line <- p_k < alpha * rank_p / n_hyp
  if (!any(below_line)) {
    return(0)
  }
  max(rank_p[below_line])
}



# QLCP_au_detect: computes one weighted rank-type p-value per test point, with a
# per-test-point choice among the candidate bandwidths in `h_sel`, and returns
# the test indices selected by a BH step at level `alpha`.
#
# Arguments
#   train_score, cal_score, test_score: score vectors of the training,
#       calibration and test sets.
#   h_sel: vector of candidate bandwidths.
#   r1:    weight of the calibration-based term; (1 - r1) weights the
#       ratio-weighted test-based term.
#   alpha: level passed to BH().
#   The defaults of the form `x = x` are self-referential and cannot be
#   evaluated, so every argument except `alpha` must be supplied.
#
# Globals read (not passed as arguments): X_train, X_test, d, n_train, n_cal,
# n_test, dis_mat.
#
# Returns: integer vector of positions within the test set.
QLCP_au_detect<-function(train_score=train_score,cal_score=cal_score,test_score=test_score,h_sel=h_sel,r1=r1,alpha=0.2){
  # Logistic regression separating training points (label 0) from test points
  # (label 1) using column `d` of the covariates and the score.
  ra_two_class<-data.frame(X=c(X_train[,d],X_test[,d]),S=c(train_score,test_score))
  ra_tar<-c(rep(0,n_train),rep(1,n_test))
  ra_model<-glm(tar ~ ., data = data.frame(ra_two_class,tar=ra_tar), family = binomial)
  ra_predictions_test <- predict(ra_model, newdata = data.frame(X=X_test[,d],S=test_score), type = "response")
  ra_predictions_test<-as.double(ra_predictions_test)
  # Per-test-point ratio weight derived from the fitted probability p:
  # (1 - p) * n_test / (n_train * p).
  ra_test<-(1-ra_predictions_test)*(n_test)/(n_train*ra_predictions_test)
  # Matrix computations
  # score_all[i, j] is the score of point j (calibration points first, then
  # test points), so sign_matrix[i, j] is 1 when score_i <= score_j.
  score_all<-matrix(rep(c(cal_score,test_score),each=(n_cal+n_test)),ncol = (n_cal+n_test))
  sign_matrix<-matrix(as.numeric(t(score_all)<=score_all),ncol = (n_cal+n_test))
  rm(score_all)
  # One row per candidate bandwidth, one column per test point.
  h_dis_nu<-matrix(NA,ncol = n_test,nrow = length(h_sel))
  R_con<-matrix(NA,ncol = n_test,nrow = length(h_sel))
  # Per-bandwidth caches filled by the first loop below.
  weight_matrix_lis<-list()
  eme_lis<-list()
  weight_cal_lis<-list()
  weight_test_lis<-list()
  ra_weight_test_lis<-list()
  ra_weight_sum_lis<-list()
  K_sum_cal_lis<-list()
  K_sum_test_lis<-list()
  ra_K_sum_test_lis<-list()
  ra_K_sum_lis<-list()
  for (i in 1:length(h_sel)) {
    # NOTE(review): the exponential kernel is commented out and dis_mat is used
    # as the weight matrix directly, so the value h_sel[i] does not influence
    # the weights in this function -- confirm this is intended.
    # weight_matrix_lis[[i]]<-exp(-dis_mat/h_sel[i])
    weight_matrix_lis[[i]]<- dis_mat
    # Weights masked by the score comparison.
    eme_lis[[i]]<-weight_matrix_lis[[i]]*sign_matrix
    # Row sums over the calibration columns / the test columns kept as a matrix.
    weight_cal_lis[[i]]<-rowSums(eme_lis[[i]][,1:n_cal])
    weight_test_lis[[i]]<-eme_lis[[i]][,-(1:n_cal)]
    # Scale test column k by ra_test[k], then sum over test columns.
    ra_weight_test_lis[[i]]<-t(t(weight_test_lis[[i]])*ra_test)
    ra_weight_sum_lis[[i]]<-rowSums(ra_weight_test_lis[[i]])
    # Same quantities without the score mask (used as denominators).
    K_sum_cal_lis[[i]]<-rowSums(weight_matrix_lis[[i]][,1:n_cal])
    K_sum_test_lis[[i]]<-weight_matrix_lis[[i]][,-(1:n_cal)]
    ra_K_sum_test_lis[[i]]<-t(t(K_sum_test_lis[[i]])*ra_test)
    ra_K_sum_lis[[i]]<-rowSums(ra_K_sum_test_lis[[i]])
  }
  # For every bandwidth and every test point k, compute p-values for the other
  # test points and record the resulting BH counts.
  for (i in 1:length((h_sel))) {
    weight_matrix<-weight_matrix_lis[[i]]
    eme<-eme_lis[[i]]
    weight_cal<-weight_cal_lis[[i]]
    weight_test<-weight_test_lis[[i]]
    ra_weight_test<-ra_weight_test_lis[[i]]
    ra_weight_sum<-ra_weight_sum_lis[[i]]
    K_sum_cal<-K_sum_cal_lis[[i]]
    K_sum_test<-K_sum_test_lis[[i]]
    ra_K_sum_test<- ra_K_sum_test_lis[[i]]
    ra_K_sum<- ra_K_sum_lis[[i]]
    # NOTE(review): `eme` is copied above and removed again without being used.
    rm(eme);gc()
    dis_num<-vector(,length = n_test)
    R_con_k<-vector(,length = n_test)
    for (k in 1:n_test) {
      # K_cal<-((weight_cal[1:n_cal]+weight_test[1:n_cal,k]+weight_test[1:n_cal,])/(K_sum_cal[1:n_cal]+K_sum_test[1:n_cal,k]+K_sum_test[1:n_cal,]))[,-k]
      # K_sel<-((weight_cal[n_cal+k]+weight_test[n_cal+k,]+weight_test[n_cal+k,k])/(K_sum_cal[n_cal+k]+K_sum_test[n_cal+k,]+K_sum_test[n_cal+k,k]))[-k]
      # K_te<-((weight_cal[-(1:n_cal)]+diag(weight_test[-(1:n_cal),])+weight_test[-(1:n_cal),k])/(K_sum_cal[-(1:n_cal)]+diag(K_sum_test[-(1:n_cal),])+K_sum_test[-(1:n_cal),k]))[-k]
      # K_ra_cal<-((ra_weight_sum[1:n_cal]-ra_weight_test[1:n_cal,k]-ra_weight_test[1:n_cal,])[,-k])/((ra_K_sum[1:n_cal]-ra_K_sum_test[1:n_cal,k]-ra_K_sum_test[1:n_cal,])[,-k])
      # K_ra_sel<-(sapply(1:n_test, function(x){ra_weight_sum[n_cal+k]-ra_weight_test[n_cal+k,k]-ra_weight_test[n_cal+x,k]})[-k])/(sapply(1:n_test, function(x){ra_K_sum[n_cal+k]-ra_K_sum_test[n_cal+k,k]-ra_K_sum_test[n_cal+x,k]})[-k])
      # K_ra_te<-(sapply(1:n_test,function(x){ra_weight_sum[n_cal+x]-ra_weight_test[n_cal+x,k]-ra_weight_test[n_cal+x,x]})[-k])/(sapply(1:n_test,function(x){ra_K_sum[n_cal+x]-ra_K_sum_test[n_cal+x,k]-ra_K_sum_test[n_cal+x,x]})[-k])
      # p_k_all<-r1*rbind(K_cal,K_sel,K_te)+(1-r1)*rbind(K_ra_cal,K_ra_sel,K_ra_te)
      # Numerators (nu_*) and denominators (de_*) of the mixed statistic, with
      # test point k dropped from the columns: calibration rows (_cal), the
      # row of test point k (_sel) and the rows of the other test points (_te).
      nu_cal=r1*(weight_cal[1:n_cal]+weight_test[1:n_cal,k]+weight_test[1:n_cal,])[,-k]+(1-r1)*(ra_weight_sum[1:n_cal]-ra_weight_test[1:n_cal,k]-ra_weight_test[1:n_cal,])[,-k]
      nu_sel=r1*(weight_cal[n_cal+k]+weight_test[n_cal+k,]+weight_test[n_cal+k,k])[-k]+(1-r1)*(sapply(1:n_test, function(x){ra_weight_sum[n_cal+k]-ra_weight_test[n_cal+k,k]-ra_weight_test[n_cal+x,k]})[-k])
      nu_te=r1*((weight_cal[-(1:n_cal)]+diag(weight_test[-(1:n_cal),])+weight_test[-(1:n_cal),k]))[-k]+(1-r1)*((sapply(1:n_test,function(x){ra_weight_sum[n_cal+x]-ra_weight_test[n_cal+x,k]-ra_weight_test[n_cal+x,x]})[-k]))
      de_cal=r1*((K_sum_cal[1:n_cal]+K_sum_test[1:n_cal,k]+K_sum_test[1:n_cal,]))[,-k]+(1-r1)*(((ra_K_sum[1:n_cal]-ra_K_sum_test[1:n_cal,k]-ra_K_sum_test[1:n_cal,])[,-k]))
      de_sel=r1*((K_sum_cal[n_cal+k]+K_sum_test[n_cal+k,]+K_sum_test[n_cal+k,k]))[-k]+(1-r1)*((sapply(1:n_test, function(x){ra_K_sum[n_cal+k]-ra_K_sum_test[n_cal+k,k]-ra_K_sum_test[n_cal+x,k]})[-k]))
      de_te=r1*((K_sum_cal[-(1:n_cal)]+diag(K_sum_test[-(1:n_cal),])+K_sum_test[-(1:n_cal),k]))[-k]+(1-r1)*((sapply(1:n_test,function(x){ra_K_sum[n_cal+x]-ra_K_sum_test[n_cal+x,k]-ra_K_sum_test[n_cal+x,x]})[-k]))
      p_k_all<-rbind(nu_cal/de_cal,nu_sel/de_sel,nu_te/de_te)
      # Per column: share of entries that are <= the entry in the last row.
      p_k<-apply(p_k_all, 2, function(x){sum(x[length(x)]>=x)/length(x)})
      p_k[is.na(p_k)]<-1
      # BH count without test point k, and with k added as a p-value of 0.
      dis_num[k]<-BH(p_k,alpha)
      R_con_k[k]<-BH(c(p_k,0),alpha)
    }
    h_dis_nu[i,]<-dis_num
    R_con[i,]<-R_con_k
  }
  # For each test point pick the first bandwidth with the largest BH count and
  # remember the matching R_con value.
  R_selected<-sapply(1:n_test, function(x){R_con[,x][which(h_dis_nu[,x]==max(h_dis_nu[,x]))[1]]})
  h_selected<-apply(h_dis_nu, 2, function(x){h_sel[which(x==max(x))[1]]})
  p_k<-rep(NA,n_test)
  
  
  # Final p-value of each test point, using the caches of its selected bandwidth.
  for (k in 1:n_test) {
    # Number of candidates <= the selected bandwidth.
    # NOTE(review): this equals the position of the selected bandwidth only if
    # h_sel is sorted in increasing order without duplicates -- confirm.
    order_k<-sum(h_selected[k]>=h_sel)
    weight_matrix<-weight_matrix_lis[[order_k]]
    eme<-eme_lis[[order_k]]
    weight_cal<-weight_cal_lis[[order_k]]
    weight_test<-weight_test_lis[[order_k]]
    ra_weight_test<-ra_weight_test_lis[[order_k]]
    ra_weight_sum<-ra_weight_sum_lis[[order_k]]
    K_sum_cal<-K_sum_cal_lis[[order_k]]
    K_sum_test<-K_sum_test_lis[[order_k]]
    ra_K_sum_test<- ra_K_sum_test_lis[[order_k]]
    ra_K_sum<- ra_K_sum_lis[[order_k]]
    # NOTE(review): `eme` is unused here as well; gc() runs once per test point.
    rm(eme);gc()
    # K_cal<-((weight_cal[1:n_cal]+weight_test[1:n_cal,k])/(K_sum_cal[1:n_cal]+K_sum_test[1:n_cal,k]))
    # K_test<-(weight_cal[n_cal+k]+1)/(K_sum_cal[n_cal+k]+1)
    # K_ra_cal<-((ra_weight_sum[1:n_cal]-ra_weight_test[1:n_cal,k]))/((ra_K_sum[1:n_cal]-ra_K_sum_test[1:n_cal,k]))
    # K_ra_te<-(ra_weight_sum[n_cal+k]-ra_weight_test[n_cal+k,k])/(ra_K_sum[n_cal+k]-ra_K_sum_test[n_cal+k,k])
    # p_k_all<-r1*c(K_cal,K_test)+(1-r1)*c(K_ra_cal,K_ra_te)
    nu_cal<-r1*(weight_cal[1:n_cal]+weight_test[1:n_cal,k])+(1-r1)*(ra_weight_sum[1:n_cal]-ra_weight_test[1:n_cal,k])
    nu_test<-r1*((weight_cal[n_cal+k]+1))+(1-r1)*(ra_weight_sum[n_cal+k]-ra_weight_test[n_cal+k,k])
    de_cal<-r1*((K_sum_cal[1:n_cal]+K_sum_test[1:n_cal,k]))+(1-r1)*(ra_K_sum[1:n_cal]-ra_K_sum_test[1:n_cal,k])
    de_test<-r1*((K_sum_cal[n_cal+k]+1))+(1-r1)*((ra_K_sum[n_cal+k]-ra_K_sum_test[n_cal+k,k]))
    p_k_all<-c(nu_cal/de_cal,nu_test/de_test)
    # Share of statistics that are <= the statistic of test point k.
    p_k[k]<-sum(p_k_all[length(p_k_all)]>=p_k_all)/length(p_k_all)
  }
  # Undefined p-values (e.g. 0/0) are treated as 1, i.e. never rejected.
  p_k[is.na(p_k)]<-1
  # R_selected<-sapply(1:n_test, function(x){R_con[,x][which(h_dis_nu[,x]==max(h_dis_nu[,x]))[1]]})
  # K_selected<-apply(h_dis_nu, 2, function(x){h_sel[which(x==max(x))[1]]})
  BH_result<-which(p_k<=alpha*BH(p_k,alpha)/length(p_k))
  # BH_result<-which(p_k<=alpha*R_selected/length(p_k))
  # Randomised pruning of BH_result based on R_selected.
  # NOTE(review): whatever this if/else assigns to detection_result is
  # overwritten unconditionally right after it, so the pruning has no effect on
  # the returned value (it still consumes random numbers via runif()).
  if(length(BH_result)>=max(max(R_selected[BH_result]),1)){
    detection_result<-BH_result
  }else{
    u<-runif(length(BH_result))
    p_til<-u*R_selected[BH_result]/length(BH_result)
    detection_result<-BH_result[which(sapply(1:length(p_til), function(x){sum(p_til[x]>=p_til)})<BH(p_til,1))]
  }
  detection_result<-BH_result
  
  # rank_p<-sapply(1:length(p_k), function(x){sum(p_k[x]>=p_k)})
  # detection_result<-rank_p<=max(rank_p[p_k<=alpha*rank_p/length(p_k)])
  return(detection_result)
}

# RLCP: randomised, kernel-weighted conformal p-values for the test points,
# followed by a data-dependent BH-type selection with random pruning.
#
# Arguments
#   X_cal, X_test: calibration / test covariates; only column `d` is used.
#   cal_score, test_score: scores of the calibration / test points.
#   h:     kernel bandwidth (also the standard deviation of the random
#          perturbation applied to each test covariate).
#   alpha: target level.
# Globals read: n_cal, n_test, d.
# Returns: integer vector with the positions of the rejected test points.
RLCP <- function(X_cal, X_test, cal_score, test_score, h, alpha) {
  h_sq <- h^2
  self_col <- n_cal + 1
  gauss <- function(delta) exp(-(delta^2) / h_sq)

  # One random centre per test point, drawn around its own covariate value.
  centres <- vapply(
    1:n_test,
    function(i) mvrnorm(1, as.numeric(X_test[i, d]), h_sq * diag(1)),
    numeric(1)
  )

  # Kernel weight of each calibration point (columns 1..n_cal) and of the test
  # point itself (last column), measured from the random centre.
  weight <- matrix(0, nrow = n_test, ncol = self_col)
  weight[, 1:n_cal] <- gauss(outer(centres, X_cal[1:n_cal, d], "-"))
  weight[, self_col] <- gauss(centres - X_test[, d])

  # Indicator that the test score does not exceed the calibration score; the
  # test point's own column gets a uniform random tie-breaker instead.
  IndQR <- matrix(1, nrow = n_test, ncol = self_col)
  IndQR[, 1:n_cal] <- outer(
    as.vector(test_score), as.vector(cal_score[1:n_cal]), "<="
  )
  IndQR[, self_col] <- runif(n_test)

  WQR <- weight * IndQR
  unnorm_p <- rowSums(WQR)
  sum_weight <- rowSums(weight)
  pvalues <- unnorm_p / sum_weight
  pvalues[is.na(pvalues)] <- 1

  # For each test point j: recompute all p-values with the random self term
  # replaced by a comparison against score j, force p_j to 0, and count the BH
  # rejections at level alpha.
  without_self <- unnorm_p - WQR[, self_col]
  self_weight <- weight[, self_col]
  Rtild <- vapply(
    1:n_test,
    function(j) {
      p_j <- (without_self +
        self_weight * ifelse(test_score <= test_score[j], 1, 0)) / sum_weight
      p_j[is.na(p_j)] <- 1
      p_j[j] <- 0
      m <- length(p_j)
      as.numeric(max(which(sort(p_j) <= ((1:m) / m) * alpha)))
    },
    numeric(1)
  )

  # Per-point thresholds and first-stage candidates.
  S <- alpha * Rtild / n_test
  passes <- pvalues <= S
  R1 <- which(passes)

  # Random pruning: keep the last r for which at least r candidates also
  # satisfy xi * Rtild <= r.
  xi <- runif(n_test)
  scaled <- xi * Rtild
  R <- 0
  for (r in 1:length(R1)) {
    if (sum(passes & scaled <= r) >= r) {
      R <- r
    }
  }
  which(passes & scaled <= R)
}

# cutoff_ALCP: kernel-weighted conformal p-values that mix a calibration-based
# term with a density-ratio-weighted test-based term, followed by BH selection.
#
# Arguments
#   train_score, cal_score, test_score: scores of the training, calibration
#       and test sets.
#   h_sel: bandwidth(s); only the first element is used.
#   r1:    weight of the calibration-based term; (1 - r1) weights the
#       ratio-weighted test-based term.
#   alpha: level passed to BH().
#   The defaults of the form `x = x` are kept only for signature compatibility;
#   they are self-referential and cannot be evaluated, so every argument except
#   `alpha` must be supplied.
#
# Globals read (not passed as arguments): X_train, X_test, d, n_train, n_cal,
# n_test, dis_mat, loss_index.
#
# Returns: integer vector of the selected positions within the test set.
#
# Changes from the previous version: all quantities that do not depend on the
# test point are computed once instead of once per test point (together with a
# forced gc()), caches are no longer built for bandwidths that are never used,
# and the result is returned explicitly.
cutoff_ALCP <- function(train_score = train_score, cal_score = cal_score,
                        test_score = test_score, h_sel = h_sel, r1 = r1,
                        alpha = 0.2) {
  stopifnot(length(h_sel) >= 1L)

  # Logistic regression separating training points (label 0) from test points
  # (label 1) using column `d` of the covariates and the score.
  ra_two_class <- data.frame(
    X = c(X_train[, d], X_test[, d]),
    S = c(train_score, test_score)
  )
  ra_tar <- c(rep(0, n_train), rep(1, n_test))
  ra_model <- glm(
    tar ~ .,
    data = data.frame(ra_two_class, tar = ra_tar),
    family = binomial
  )
  ra_predictions_test <- as.double(predict(
    ra_model,
    newdata = data.frame(X = X_test[, d], S = test_score),
    type = "response"
  ))
  # Ratio weight of each test point from its fitted probability p:
  # (1 - p) * n_test / (n_train * p).
  ra_test <- (1 - ra_predictions_test) * (n_test) /
    (n_train * ra_predictions_test)

  # sign_matrix[i, j] is 1 when score_i >= score_j (calibration points first,
  # then test points).
  n_all <- n_cal + n_test
  score_all <- matrix(rep(c(cal_score, test_score), each = n_all), ncol = n_all)
  sign_matrix <- matrix(as.numeric(t(score_all) >= score_all), ncol = n_all)
  rm(score_all)

  cal_cols <- 1:n_cal

  # Kernel weights for the first bandwidth, and the same weights masked by the
  # score comparison.
  weight_matrix <- exp(-dis_mat / h_sel[1])
  masked <- weight_matrix * sign_matrix

  # Numerator pieces: row sums over calibration columns, and the test columns
  # scaled by the ratio weights with the `loss_index` columns zeroed out.
  weight_cal <- rowSums(masked[, cal_cols, drop = FALSE])
  weight_test <- masked[, -cal_cols, drop = FALSE]
  ra_weight_test <- t(t(weight_test) * ra_test)
  ra_weight_test[, loss_index] <- 0
  ra_weight_sum <- rowSums(ra_weight_test)

  # Denominator pieces: the same construction without the score mask.
  K_sum_cal <- rowSums(weight_matrix[, cal_cols, drop = FALSE])
  K_sum_test <- weight_matrix[, -cal_cols, drop = FALSE]
  ra_K_sum_test <- t(t(K_sum_test) * ra_test)
  ra_K_sum_test[, loss_index] <- 0
  ra_K_sum <- rowSums(ra_K_sum_test)

  # p-value of test point k: share of statistics (calibration points plus the
  # test point itself) that are >= the test point's own statistic.
  p_k <- vapply(
    seq_len(n_test),
    function(k) {
      row_k <- n_cal + k
      nu_cal <- r1 * (weight_cal[cal_cols] + weight_test[cal_cols, k]) +
        (1 - r1) * (ra_weight_sum[cal_cols] - ra_weight_test[cal_cols, k])
      nu_test <- r1 * (weight_cal[row_k] + 1) +
        (1 - r1) * (ra_weight_sum[row_k] - ra_weight_test[row_k, k])
      de_cal <- r1 * (K_sum_cal[cal_cols] + K_sum_test[cal_cols, k]) +
        (1 - r1) * (ra_K_sum[cal_cols] - ra_K_sum_test[cal_cols, k])
      de_test <- r1 * (K_sum_cal[row_k] + 1) +
        (1 - r1) * (ra_K_sum[row_k] - ra_K_sum_test[row_k, k])
      p_k_all <- c(nu_cal / de_cal, nu_test / de_test)
      sum(p_k_all[length(p_k_all)] <= p_k_all) / length(p_k_all)
    },
    numeric(1)
  )
  # Undefined p-values (e.g. 0/0) are set to 0, i.e. always eligible.
  p_k[is.na(p_k)] <- 0

  BH_result <- which(p_k <= alpha * BH(p_k, alpha) / length(p_k))
  BH_result
}

# ---- Simulation: compare CP, RLCP and ALCP on the filtered housing data ----
# Each replicate
#   1. scores every house by the absolute residual of a random forest fitted
#      on the full data and sets aside the top 10% per neighbourhood group as
#      ground-truth outliers,
#   2. splits the remaining houses 40/40/20 into train / calibration / test and
#      appends the outliers to the test set,
#   3. re-scores all three sets with a forest fitted on the training part only,
#   4. runs the three detection procedures and records their false discovery
#      proportion (FDP) and power.
#
# The detection helpers read X_train, X_test, n_train, n_cal, n_test, d,
# dis_mat and loss_index from the global environment, so these are assigned at
# top level on purpose.
num_sim <- 100
FDP_alcp <- numeric(num_sim)
Power_alcp <- numeric(num_sim)
FDP_cp <- numeric(num_sim)
Power_cp <- numeric(num_sim)
FDP_rlcp <- numeric(num_sim)
Power_rlcp <- numeric(num_sim)

# The input file does not change between replicates, so it is read once.
raw_data <- read.csv("all_data_filter.csv")

for (times in seq_len(num_sim)) {
  data <- raw_data
  rf_model <- randomForest(
    SalePrice ~ Neighborhood + GrLivArea + TotalBsmtSF + LowQualFinSF,
    data = data,
    ntree = 500,       # number of trees
    importance = TRUE  # compute variable importance
  )
  data$score <- abs(data$SalePrice - predict(rf_model, newdata = data))

  # Drop extreme residuals: above 2e5 in the high group, above 1e5 in the low
  # group. which() is used so that rows with a missing comparison are kept.
  extreme_high <- which((data$score > 2 * 10^5) & (data$Neighborhood_binary == 1))
  if (length(extreme_high) > 0) {
    data <- data[-extreme_high, ]
  }
  extreme_low <- which((data$score > 1 * 10^5) & (data$Neighborhood_binary == 0))
  if (length(extreme_low) > 0) {
    data <- data[-extreme_low, ]
  }

  # Keep the modelling columns; the binary group indicator takes over the name
  # "Neighborhood" (column 5, see `d` below).
  data <- data[, c("SalePrice", "GrLivArea", "TotalBsmtSF", "LowQualFinSF",
                   "Neighborhood_binary", "YearBuilt", "score")]
  colnames(data) <- c("SalePrice", "GrLivArea", "TotalBsmtSF", "LowQualFinSF",
                      "Neighborhood", "YearBuilt", "score")

  # Within each group (Neighborhood = 1 / 0), the rows at or above the 90%
  # score quantile are the outliers.
  outlier_index_1 <- which(data$Neighborhood == 1)
  outlier_index_0 <- which(data$Neighborhood == 0)

  threshold_1 <- quantile(data$score[outlier_index_1], 0.9)
  threshold_0 <- quantile(data$score[outlier_index_0], 0.9)

  outlier_1 <- outlier_index_1[data$score[outlier_index_1] >= threshold_1]
  outlier_0 <- outlier_index_0[data$score[outlier_index_0] >= threshold_0]

  outlier_index <- c(outlier_1, outlier_0)
  X_outlier <- data[outlier_index, ]
  data_remaining <- data[-outlier_index, ]

  # Split the remaining rows 4:4:2 into train : calibration : test.
  n_remaining <- nrow(data_remaining)
  train_size <- floor(0.4 * n_remaining)
  cal_size <- floor(0.4 * n_remaining)

  train_index <- sample(1:n_remaining, train_size)
  X_train <- data_remaining[train_index, ]
  data_after_train <- data_remaining[-train_index, ]

  cal_index <- sample(1:nrow(data_after_train), cal_size)
  X_cal <- data_after_train[cal_index, ]
  X_test_split <- data_after_train[-cal_index, ]

  # Append the outliers to the clean test split.
  X_test <- rbind(X_test_split, X_outlier)

  rf_model <- randomForest(
    SalePrice ~ Neighborhood + GrLivArea + TotalBsmtSF + LowQualFinSF,
    data = X_train,
    ntree = 500,       # number of trees
    importance = TRUE  # compute variable importance
  )

  # Scores: absolute residuals of the training-only forest.
  predicted_train <- predict(rf_model, newdata = X_train)
  train_score <- abs(X_train$SalePrice - predicted_train)

  predicted_cal <- predict(rf_model, newdata = X_cal)
  cal_score <- abs(X_cal$SalePrice - predicted_cal)

  predicted_test <- predict(rf_model, newdata = X_test)
  test_score <- abs(X_test$SalePrice - predicted_test)

  # Positions of the outliers inside the test set: because
  # X_test = rbind(X_test_split, X_outlier), they are the last
  # nrow(X_outlier) rows.
  mislabel_index <- (nrow(X_test_split) + 1):nrow(X_test)

  # Test points whose score exceeds the 70% quantile of their own group.
  a <- quantile(test_score[X_test$Neighborhood == 1], 0.7)
  b <- quantile(test_score[X_test$Neighborhood == 0], 0.7)
  loss_index <- c(
    which(test_score > a & X_test$Neighborhood == 1),
    which(test_score > b & X_test$Neighborhood == 0)
  )

  n_cal <- nrow(X_cal)
  n_test <- nrow(X_test)
  n_train <- nrow(X_train)
  d <- 5  # column index of "Neighborhood" in X_train / X_cal / X_test
  # dis_mat<-1-dis_matrix(c(X_cal[,"Neighborhood"],X_test[,"Neighborhood"]))
  dis_mat <- dis_matrix(c(X_cal[, "Neighborhood"], X_test[, "Neighborhood"]))
  r1 <- 0.7
  d_con <- 1
  h_sel <- n_cal^(-1 / (d_con + 2))

  alpha <- 0.2
  ############ CP #########
  p_CP <- vapply(
    test_score,
    function(x) (sum(x <= cal_score) + 1) / (n_cal + 1),
    numeric(1)
  )
  CP_result <- which(p_CP < BH(p_CP, alpha) * alpha / length(p_CP))
  ########### RLCP ########
  RLCP_result <- RLCP(
    X_cal, X_test, cal_score, test_score,
    h = 10 * n_cal^(-1 / (d_con + 2)), alpha
  )
  ########## ALCP #########
  # detection_result<-QLCP_au_detect(train_score=train_score,cal_score=cal_score,test_score=test_score,h_sel=h_sel,r1=r1,alpha)
  detection_result <- cutoff_ALCP(
    train_score = train_score, cal_score = cal_score, test_score = test_score,
    h_sel = h_sel, r1 = r1, alpha
  )

  # FDP: share of detections that are not true outliers (0 when nothing is
  # detected). Power: share of true outliers that are detected.
  FDP_rlcp[times] <- length(setdiff(RLCP_result, mislabel_index)) /
    max(1, length(RLCP_result))
  FDP_alcp[times] <- length(setdiff(detection_result, mislabel_index)) /
    max(1, length(detection_result))
  FDP_cp[times] <- length(setdiff(CP_result, mislabel_index)) /
    max(length(CP_result), 1)
  Power_alcp[times] <- length(intersect(detection_result, mislabel_index)) /
    length(mislabel_index)
  Power_cp[times] <- length(intersect(CP_result, mislabel_index)) /
    length(mislabel_index)
  Power_rlcp[times] <- length(intersect(RLCP_result, mislabel_index)) /
    length(mislabel_index)
}

# Summary tables: one row per method, columns FDR and Power.
method_names <- c("CP", "RLCP", "ALCP")
metric_names <- c("FDR", "Power")
result <- matrix(NA, 3, 2, dimnames = list(method_names, metric_names))
result_sd <- matrix(NA, 3, 2, dimnames = list(method_names, metric_names))

result["CP", ] <- c(mean(FDP_cp), mean(Power_cp))
result["RLCP", ] <- c(mean(FDP_rlcp), mean(Power_rlcp))
result["ALCP", ] <- c(mean(FDP_alcp), mean(Power_alcp))

# NOTE(review): despite its name, result_sd holds var / num_sim, i.e. the
# squared standard error of the mean. If a standard error is intended, a
# sqrt() is missing -- confirm before reporting these numbers.
result_sd["CP", ] <- c(var(FDP_cp), var(Power_cp)) / num_sim
result_sd["RLCP", ] <- c(var(FDP_rlcp), var(Power_rlcp)) / num_sim
result_sd["ALCP", ] <- c(var(FDP_alcp), var(Power_alcp)) / num_sim
# The simulation loop and the result / result_sd tables that were repeated
# here were a line-for-line copy of the run directly above. Executing the copy
# re-ran all 100 replicates and overwrote FDP_*, Power_*, result and result_sd
# with a second, statistically equivalent draw, doubling the run time without
# adding information. The copy has been removed; the objects produced by the
# run above are the final results.
