rm(list = ls());gc()
library(doSNOW)
library(MASS)
library(randomForest)
library(grf)
library(tidyverse)

progress <- function(nfin){
  cat(sprintf('%s: tasks completed: %d.\n', Sys.time(), nfin))
}

opts <- list(progress = progress)

dis_vec<-function(vec){
  n<-length(vec)
  dis_mat<-(matrix(rep(vec,each=n), nrow = n, ncol = n,byrow = T)-matrix(rep(vec,each=n), nrow = n, ncol = n))^2
  return(dis_mat)
}

dis_matrix<-function(mat){
  if(is.matrix(mat)){
    result_list<-apply(mat, 2, dis_vec,simplify=F)
    result_matrix<-Reduce("+",result_list)
    return(result_matrix)
  }else{
    result_matrix<-dis_vec(mat)
    return(result_matrix)
  }
}

BH<-function(p_k=p_k,alpha=alpha){
  rank_p<-sapply(1:length(p_k), function(x){sum(p_k[x]>=p_k)})
  if(sum(p_k<alpha*rank_p/length(p_k))==0){
    return(0)
  }else{
    return(max(rank_p[p_k<alpha*rank_p/length(p_k)]))
  }
}

BH_index<-function(pval,alpha){
  m=length(pval)
  rej <- sort(pval)<((1:m)/m)*alpha
  rejnum <- max(which(rej==T))
  reject <- which(pval<=sort(pval)[rejnum])
  return(reject)
}

LCP_au_detect<-function(train_score=train_score,cal_score=cal_score,test_score=test_score,h_sel=h_sel,r1=r1,alpha=0.2){
  # 计算密度比
  ra_two_class<-data.frame(X=c(X_train[,1],X_test[,1]),S=c(train_score,test_score))
  ra_tar<-c(rep(0,n_train),rep(1,n_test))
  ra_model<-glm(tar ~ ., data = data.frame(ra_two_class,tar=ra_tar), family = binomial)
  ra_predictions_test <- predict(ra_model, newdata = data.frame(X=X_test[,1],S=test_score), type = "response")
  ra_predictions_test<-as.double(ra_predictions_test)
  ra_test<-(1-ra_predictions_test)*(n_test)/(n_train*ra_predictions_test)
  # 进行矩阵运算
  score_all<-matrix(rep(c(cal_score,test_score),each=(n_cal+n_test)),ncol = (n_cal+n_test))
  sign_matrix<-matrix(as.numeric(t(score_all)<=score_all),ncol = (n_cal+n_test))
  rm(score_all)
  h_dis_nu<-matrix(NA,ncol = n_test,nrow = length(h_sel))
  R_con<-matrix(NA,ncol = n_test,nrow = length(h_sel))
  for (i in 1:length((h_sel))) {
    weight_matrix<-exp(-dis_mat/h_sel[i])
    eme<-weight_matrix*sign_matrix
    weight_cal<-rowSums(eme[,1:n_cal])
    weight_test<-eme[,-(1:n_cal)]
    ra_weight_test<-t(t(weight_test)*ra_test)
    ra_weight_sum<-rowSums(ra_weight_test)
    K_sum_cal<-rowSums(weight_matrix[,1:n_cal])
    K_sum_test<-weight_matrix[,-(1:n_cal)]
    ra_K_sum_test<-t(t(K_sum_test)*ra_test)
    ra_K_sum<-rowSums(ra_K_sum_test)
    rm(eme);gc()
    dis_num<-vector(,length = n_test)
    R_con_k<-vector(,length = n_test)
    for (k in 1:n_test) {
      # K_cal<-((weight_cal[1:n_cal]+weight_test[1:n_cal,k]+weight_test[1:n_cal,])/(K_sum_cal[1:n_cal]+K_sum_test[1:n_cal,k]+K_sum_test[1:n_cal,]))[,-k]
      # K_sel<-((weight_cal[n_cal+k]+weight_test[n_cal+k,]+weight_test[n_cal+k,k])/(K_sum_cal[n_cal+k]+K_sum_test[n_cal+k,]+K_sum_test[n_cal+k,k]))[-k]
      # K_te<-((weight_cal[-(1:n_cal)]+diag(weight_test[-(1:n_cal),])+weight_test[-(1:n_cal),k])/(K_sum_cal[-(1:n_cal)]+diag(K_sum_test[-(1:n_cal),])+K_sum_test[-(1:n_cal),k]))[-k]
      # K_ra_cal<-((ra_weight_sum[1:n_cal]-ra_weight_test[1:n_cal,k]-ra_weight_test[1:n_cal,])[,-k])/((ra_K_sum[1:n_cal]-ra_K_sum_test[1:n_cal,k]-ra_K_sum_test[1:n_cal,])[,-k])
      # K_ra_sel<-(sapply(1:n_test, function(x){ra_weight_sum[n_cal+k]-ra_weight_test[n_cal+k,k]-ra_weight_test[n_cal+x,k]})[-k])/(sapply(1:n_test, function(x){ra_K_sum[n_cal+k]-ra_K_sum_test[n_cal+k,k]-ra_K_sum_test[n_cal+x,k]})[-k])
      # K_ra_te<-(sapply(1:n_test,function(x){ra_weight_sum[n_cal+x]-ra_weight_test[n_cal+x,k]-ra_weight_test[n_cal+x,x]})[-k])/(sapply(1:n_test,function(x){ra_K_sum[n_cal+x]-ra_K_sum_test[n_cal+x,k]-ra_K_sum_test[n_cal+x,x]})[-k])
      # p_k_all<-r1*rbind(K_cal,K_sel,K_te)+(1-r1)*rbind(K_ra_cal,K_ra_sel,K_ra_te)
      nu_cal=r1*(weight_cal[1:n_cal]+weight_test[1:n_cal,k]+weight_test[1:n_cal,])[,-k]+(1-r1)*(ra_weight_sum[1:n_cal]-ra_weight_test[1:n_cal,k]-ra_weight_test[1:n_cal,])[,-k]
      nu_sel=r1*(weight_cal[n_cal+k]+weight_test[n_cal+k,]+weight_test[n_cal+k,k])[-k]+(1-r1)*(sapply(1:n_test, function(x){ra_weight_sum[n_cal+k]-ra_weight_test[n_cal+k,k]-ra_weight_test[n_cal+x,k]})[-k])
      nu_te=r1*((weight_cal[-(1:n_cal)]+diag(weight_test[-(1:n_cal),])+weight_test[-(1:n_cal),k]))[-k]+(1-r1)*((sapply(1:n_test,function(x){ra_weight_sum[n_cal+x]-ra_weight_test[n_cal+x,k]-ra_weight_test[n_cal+x,x]})[-k]))
      de_cal=r1*((K_sum_cal[1:n_cal]+K_sum_test[1:n_cal,k]+K_sum_test[1:n_cal,]))[,-k]+(1-r1)*(((ra_K_sum[1:n_cal]-ra_K_sum_test[1:n_cal,k]-ra_K_sum_test[1:n_cal,])[,-k]))
      de_sel=r1*((K_sum_cal[n_cal+k]+K_sum_test[n_cal+k,]+K_sum_test[n_cal+k,k]))[-k]+(1-r1)*((sapply(1:n_test, function(x){ra_K_sum[n_cal+k]-ra_K_sum_test[n_cal+k,k]-ra_K_sum_test[n_cal+x,k]})[-k]))
      de_te=r1*((K_sum_cal[-(1:n_cal)]+diag(K_sum_test[-(1:n_cal),])+K_sum_test[-(1:n_cal),k]))[-k]+(1-r1)*((sapply(1:n_test,function(x){ra_K_sum[n_cal+x]-ra_K_sum_test[n_cal+x,k]-ra_K_sum_test[n_cal+x,x]})[-k]))
      p_k_all<-rbind(nu_cal/de_cal,nu_sel/de_sel,nu_te/de_te)
      p_k<-apply(p_k_all, 2, function(x){sum(x[length(x)]>=x)/length(x)})
      p_k[is.na(p_k)]<-1
      dis_num[k]<-BH(p_k,alpha)
      R_con_k[k]<-BH(c(p_k,0),alpha)
    }
    h_dis_nu[i,]<-dis_num
    R_con[i,]<-R_con_k
  }
  R_selected<-sapply(1:n_test, function(x){R_con[,x][which(h_dis_nu[,x]==max(h_dis_nu[,x]))[1]]})
  h_selected<-apply(h_dis_nu, 2, function(x){h_sel[which(x==max(x))[1]]})
  p_k<-rep(NA,n_test)
  for (k in 1:n_test) {
    weight_matrix<-exp(-dis_mat/h_selected[k])
    eme<-weight_matrix*sign_matrix
    weight_cal<-rowSums(eme[,1:n_cal])
    weight_test<-eme[,-(1:n_cal)]
    ra_weight_test<-t(t(weight_test)*ra_test)
    ra_weight_sum<-rowSums(ra_weight_test)
    K_sum_cal<-rowSums(weight_matrix[,1:n_cal])
    K_sum_test<-weight_matrix[,-(1:n_cal)]
    ra_K_sum_test<-t(t(K_sum_test)*ra_test)
    ra_K_sum<-rowSums(ra_K_sum_test)
    rm(eme);gc()
    # K_cal<-((weight_cal[1:n_cal]+weight_test[1:n_cal,k])/(K_sum_cal[1:n_cal]+K_sum_test[1:n_cal,k]))
    # K_test<-(weight_cal[n_cal+k]+1)/(K_sum_cal[n_cal+k]+1)
    # K_ra_cal<-((ra_weight_sum[1:n_cal]-ra_weight_test[1:n_cal,k]))/((ra_K_sum[1:n_cal]-ra_K_sum_test[1:n_cal,k]))
    # K_ra_te<-(ra_weight_sum[n_cal+k]-ra_weight_test[n_cal+k,k])/(ra_K_sum[n_cal+k]-ra_K_sum_test[n_cal+k,k])
    # p_k_all<-r1*c(K_cal,K_test)+(1-r1)*c(K_ra_cal,K_ra_te)
    nu_cal<-r1*(weight_cal[1:n_cal]+weight_test[1:n_cal,k])+(1-r1)*(ra_weight_sum[1:n_cal]-ra_weight_test[1:n_cal,k])
    nu_test<-r1*((weight_cal[n_cal+k]+1))+(1-r1)*(ra_weight_sum[n_cal+k]-ra_weight_test[n_cal+k,k])
    de_cal<-r1*((K_sum_cal[1:n_cal]+K_sum_test[1:n_cal,k]))+(1-r1)*(ra_K_sum[1:n_cal]-ra_K_sum_test[1:n_cal,k])
    de_test<-r1*((K_sum_cal[n_cal+k]+1))+(1-r1)*((ra_K_sum[n_cal+k]-ra_K_sum_test[n_cal+k,k]))
    p_k_all<-c(nu_cal/de_cal,nu_test/de_test)
    p_k[k]<-sum(p_k_all[length(p_k_all)]>=p_k_all)/length(p_k_all)
  }
  p_k[is.na(p_k)]<-1
  # R_selected<-sapply(1:n_test, function(x){R_con[,x][which(h_dis_nu[,x]==max(h_dis_nu[,x]))[1]]})
  # K_selected<-apply(h_dis_nu, 2, function(x){h_sel[which(x==max(x))[1]]})
  # BH_result<-which(p_k<=alpha*BH(p_k,alpha)/length(p_k))
  BH_result<-which(p_k<=alpha*R_selected/length(p_k))
  if(length(BH_result)>=max(max(R_selected[BH_result]),1)){
    detection_result<-BH_result
  }else{
    u<-runif(length(BH_result))
    p_til<-u*R_selected[BH_result]/length(BH_result)
    detection_result<-BH_result[which(sapply(1:length(p_til), function(x){sum(p_til[x]>=p_til)})<BH(p_til,1))]
  }
  
  
  # rank_p<-sapply(1:length(p_k), function(x){sum(p_k[x]>=p_k)})
  # detection_result<-rank_p<=max(rank_p[p_k<=alpha*rank_p/length(p_k)])
  return(detection_result)
}



RLCP<-function(X_cal,X_test,cal_score,test_score,h,alpha){
  n <- nrow(X_cal)
  N <- nrow(X_test)
  s_sam <- matrix(0, ncol = 1, nrow = N)
  for (i in 1:N) {
    s_sam[i,] <- mvrnorm(1, as.numeric(X_test[i, 1:1]), (h^2)*diag(1))
    # rand <- runif(2)
    # s_sam[i,] <- c(datatest$s1[i], datatest$s2[i]) + h*sqrt(rand[1])*c(cos(2*pi*rand[2]), sin(2*pi*rand[2]))
  }
  
  weight <- matrix(0, nrow = N, ncol = n+1)
  for (j in 1:n) {
    diffmat <- matrix(0, nrow = N, ncol = 2)
    for (k in 1:1) {
      diffmat[, k] <- s_sam[, k] - X_cal[j, k]
    }
    #weight[, j] <- ifelse(apply((cbind(s_sam[, 1]-datacal$s1[j], s_sam[, 2]-datacal$s2[j]))^2, 1, sum)<=h^2, 1, 0)
    weight[, j] <- exp(-apply(diffmat^2, 1, sum)/(2*h^2))
  }
  diffmat <- matrix(0, nrow = N, ncol = 1)
  for (k in 1:1) {
    diffmat[, k] <- s_sam[, k] - X_test[, k]
  }
  weight[, n+1] <- exp(-apply(diffmat^2, 1, sum)/(2*h^2))
  
  IndQR <- matrix(1, nrow = N, ncol = n+1)
  for (j in 1:n) {
    IndQR[,j] <- ifelse(test_score<cal_score[j], 1, 0)
  }
  IndQR[, n+1] <- runif(N)
  
  WQR <- weight*IndQR
  pvalues <- (apply(WQR, 1, sum))/(apply(weight, 1, sum))
  pvalues[is.na(pvalues)] <- 1
  
  
  Rtild <- rep(0, N)
  unnorm_p <- apply(WQR, 1, sum)
  sum_weight <- apply(weight, 1, sum)
  for (j in 1:N) {
    pvalues_j <- (unnorm_p - WQR[, n+1] + weight[, n+1]*ifelse(test_score<=test_score[j], 1, 0))/sum_weight
    pvalues_j[is.na(pvalues_j)] <- 1
    pvalues_j[j] <- 0
    rej_j <- sort(pvalues_j)<((1:length(pvalues_j))/length(pvalues_j))*alpha
    rejnum_j <- max(which(rej_j==T))
    Rtild[j] <- rejnum_j
  }
  S <- alpha*Rtild/N
  R1 <- which(pvalues<=S)
  xi <- runif(N)
  R <- 0
  for (r in 1:length(R1)) {
    if(sum(ifelse(pvalues<=S&xi*Rtild<=r, 1, 0))>=r){
      R <- r
    }
  }
  reject <- which(pvalues<=S&xi*Rtild<=R)
  return(reject)
}



get_conditional_quantile <- function(s1_value, result_df) {
  filtered_result <- result_df %>%
    filter(lat == s1_value)
  
  if (nrow(filtered_result) == 0) {
    return(NA)
  } else {
    return(filtered_result$quantile)
  }
}

conditional_quantile <- function(data, prob) {
  data %>%
    group_by(lat) %>%
    summarise(quantile = quantile(price, probs = prob)) %>%
    ungroup()
}





data <- read.csv("kc_house_data.csv")

cols_with_na <- apply(data, 2, function(x) any(is.na(x)))

data1 <- data [, !cols_with_na]
data1 <- data1[, -c(1,2,17)]

condquan <- conditional_quantile(data1, 0.8)



d=18
r1=0.8 #分布函数两部分的比例
alpha<-0.2
n_1=1500;n_2=500


cl <- makeCluster(50, type = "SOCK")
registerDoSNOW(cl)
times<-200
Result <- foreach(i = 1:times,.packages = c("grf","randomForest","tidyverse","MASS"), .combine = "rbind",.options.snow = opts) %dopar% {
  
  resultfr <- data.frame()
  
  X <- data1[,-c(1,15)]
  Y <- data1[,1]
  s <- data1[,c(15)]
  
  
  data2 <- data.frame(s = s, X = X, y = Y)
  head(data2)
  names(data2) <- c("lat",
                    "X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12","X13","X14","X15","X16",
                    "y")
  data2 <- data2[sample(nrow(data2)), ]
  datacal <- data2[(n_1/2+1):n_1,]
  datatest <- data2[(n_1+1):(n_1+n_2),]
  datatrain <- data2[1:(n_1/2),]
  
  outlier <- sample(1:n_2, 0.1*n_2)
  
  for(i in outlier){
    datatest$y[i] <- datatest$y[i] +
      0.75*get_conditional_quantile(datatest$lat[i], condquan)
  }
  
  
  X_train <- datatrain[, 1:(d-1)];X_cal=datacal[, 1:(d-1)]
  Y_train <- datatrain$y;Y_cal=datacal$y
  X_test <- datatest[, 1:(d-1)];Y_test<-datatest$y
  n_cal <- length(Y_cal);n_test=length(Y_test);n_train<-length(Y_train)
  dis_mat <- dis_matrix(c(X_cal[,1],X_test[,1]))
  
  
  # calculate score
  rf_model <- randomForest(Y~.,data = data.frame(X=X_train,Y=Y_train))
  cal_score <- abs(Y_cal-predict(rf_model, newdata = data.frame(X=X_cal)))
  train_score <- abs(Y_train-predict(rf_model, newdata = data.frame(X=X_train)))
  test_score <- abs(Y_test-predict(rf_model, newdata = data.frame(X=X_test)))
  modelQR <- quantile_forest(X_train,Y_train,quantiles = c(0.1,0.9))
  quan_cal <- predict(modelQR,X_cal)$predictions
  quan_cal_score <- apply(cbind(Y_cal-quan_cal[,2],quan_cal[,1]-Y_cal), 1, max)
  quan_train <- predict(modelQR,X_train)$predictions
  quan_train_score <- apply(cbind(Y_train-quan_train[,2],quan_train[,1]-Y_train), 1, max)
  quan_test <- predict(modelQR,X_test)$predictions
  quan_test_score <- apply(cbind(Y_test-quan_test[,2], quan_test[,1]-Y_test), 1, max)
  
  plot(datatest$lat, quan_test_score, col = ifelse(1:n_2%in%outlier, 'red', 'black'))
  # CP
  p_res_CP <- sapply(test_score, function(x){(sum(x<=cal_score)+1)/(n_cal+1)})
  result_res_CP <- BH_index(p_res_CP,alpha)
  FDP_CP <- (length(result_res_CP) - sum(result_res_CP%in%outlier))/max(c(length(result_res_CP), 1))
  power_CP <- sum(result_res_CP%in%outlier)/(n_2*0.1)
  resultfr <- rbind(resultfr, data.frame(FDP = FDP_CP, power = power_CP, method = 'CP'))
  
  p_quan_CP <- sapply(quan_test_score, function(x){(sum(x<=quan_cal_score)+1)/(n_cal+1)})
  result_quan_CP <- BH_index(p_quan_CP,alpha)
  FDP_CQ <- (length(result_quan_CP) - sum(result_quan_CP%in%outlier))/max(c(length(result_quan_CP), 1))
  power_CQ <- sum(result_quan_CP%in%outlier)/(n_2*0.1)
  resultfr <- rbind(resultfr, data.frame(FDP = FDP_CQ, power = power_CQ, method = 'CQ'))
  
  # RLCP
  result_res_RLCP <- RLCP(X_cal,X_test,cal_score,test_score,h=0.1,alpha = alpha)
  FDP_RLCP <- (length(result_res_RLCP) - sum(result_res_RLCP%in%outlier))/max(c(length(result_res_RLCP), 1))
  power_RLCP <- sum(result_res_RLCP%in%outlier)/(n_2*0.1)
  resultfr <- rbind(resultfr, data.frame(FDP = FDP_RLCP, power = power_RLCP, method = 'RLCP'))
  
  result_quan_RLCP <- RLCP(X_cal,X_test,quan_cal_score,quan_test_score,h=0.1,alpha = alpha)
  FDP_RLCQ <- (length(result_quan_RLCP) - sum(result_quan_RLCP%in%outlier))/max(c(length(result_quan_RLCP), 1))
  power_RLCQ <- sum(result_quan_RLCP%in%outlier)/(n_2*0.1)
  resultfr <- rbind(resultfr, data.frame(FDP = FDP_RLCQ, power = power_RLCQ, method = 'RLCQ'))
  
  # ALCP
  h_sel <- c(0.05, 0.1, 0.2, 0.4)^2
  result_res_ALCP <- LCP_au_detect(train_score,cal_score,test_score,h_sel,r1=r1,alpha = alpha)
  FDP_ALCP <- (length(result_res_ALCP) - sum(result_res_ALCP%in%outlier))/max(c(length(result_res_ALCP), 1))
  power_ALCP <- sum(result_res_ALCP%in%outlier)/(n_2*0.1)
  resultfr <- rbind(resultfr, data.frame(FDP = FDP_ALCP, power = power_ALCP, method = 'ALCP'))
  
  result_quan_ALCP <- LCP_au_detect(quan_train_score,quan_cal_score,quan_test_score,h_sel,r1=r1,alpha = alpha)
  FDP_ALCQ <- (length(result_quan_ALCP) - sum(result_quan_ALCP%in%outlier))/max(c(length(result_quan_ALCP), 1))
  power_ALCQ <- sum(result_quan_ALCP%in%outlier)/(n_2*0.1)
  resultfr <- rbind(resultfr, data.frame(FDP = FDP_ALCQ, power = power_ALCQ, method = 'ALCQ'))
  
  return(resultfr)
}
stopCluster(cl)
save(Result,file = "HouseSale.RData")


load('HouseSale.RData')

pp <- Result%>%
  group_by(method)%>%
  dplyr::summarize(FDR = mean(FDP), Power = mean(power), sdFDR = sd(FDP)/sqrt(200), sdPower = sd(power)/sqrt(200))
pp



