# basic_functions.R
# This file contains core functions for the conformal prediction experiment.

# Load required packages
library(predictionBands)
library(FlexCoDE)
library(FNN)
library(ggplot2)

#' Smooth density using Fast Fourier Transform (FFT)
#' @param density Numeric vector of density values
#' @param y_grid Numeric vector of grid points
#' @param sigma Smoothing parameter
#' @return Smoothed density vector
smooth_fft <- function(density, y_grid, sigma) {
  n <- length(density)
  pad_length <- n  
  density_padded <- c(rep(0, pad_length), density, rep(0, pad_length))
  y_grid_padded <- seq(min(y_grid) - pad_length * (y_grid[2] - y_grid[1]),
                       max(y_grid) + pad_length * (y_grid[2] - y_grid[1]),
                       length.out = n + 2 * pad_length)
  
  fft_density <- fft(density_padded)
  delta_y <- y_grid_padded[2] - y_grid_padded[1]
  freq <- fftfreq(length(density_padded), d = delta_y)
  #gaussian_filter <- exp(-0.5 * (freq / sigma)^2)
  gaussian_filter <- exp(-2 * pi^2 * sigma^2 * freq^2)
  fft_filtered <- fft_density * gaussian_filter
  smoothed_density_padded <- Re(fft(fft_filtered, inverse = TRUE)) / length(density_padded)
  
  smoothed_density <- smoothed_density_padded[(pad_length + 1):(pad_length + n)]
  
  smoothed_density <- pmax(smoothed_density, 0)
  delta_y <- y_grid[2] - y_grid[1]
  smoothed_density <- smoothed_density / sum(smoothed_density) / delta_y
  
  return(smoothed_density)
}

#' Generate frequency vector for FFT
#' @param n Length of the sequence
#' @param d Sampling interval
#' @return Frequency vector
fftfreq <- function(n, d = 1.0) {
  val <- 1.0 / (n * d)
  N <- (n - 1) %/% 2 + 1
  p1 <- seq(0, N - 1) * val
  p2 <- seq(-n %/% 2, -1) * val
  return(c(p1, p2))
}

#' Find indices of k-nearest neighbors
#' @param xTrain Training data matrix
#' @param xTest Test data matrix
#' @param k Number of neighbors
#' @return Matrix of neighbor indices
which_neighbors <- function(xTrain, xTest, k) {
  return(FNN::get.knnx(data = xTrain, query = xTest, k = k)$nn.index)
}

#' Compute profile density for conditional density estimation
#' @param t_grid Grid of threshold values
#' @param y_grid Grid of response values
#' @param cde_estimate Conditional density estimate
#' @return Profile density vector
profile_density <- function(t_grid, y_grid, cde_estimate) {
  v2 <- cde_estimate[order(cde_estimate)]
  v2s <- rev(cumsum(rev(v2))) * (y_grid[2] - y_grid[1])
  v2s <- v2s[findInterval(t_grid, v2) + 1]
  v2s[which(is.na(v2s))] <- 0
  return(v2s)
}

#' Fit prediction bands model
#' @param x Predictor matrix
#' @param y Response vector
#' @param per_train Proportion of training data
#' @param per_val Proportion of validation data
#' @param per_ths Proportion of threshold data
#' @param k Number of clusters
#' @param sigma Smoothing parameter (optional)
#' @param ... Additional arguments for FlexCoDE
#' @return Object of class 'predictionBands'
fit_predictionBands_smooth <- function(x, y,
                                per_train = 0.4,
                                per_val = 0.1,
                                per_ths = 1 - per_train - per_val,
                                k = max(round(per_ths * nrow(as.matrix(x)) / 200), 1),
                                #regressionFunction = FlexCoDE::regressionFunction.Forest,
                                sigma = NULL,
                                ...) {
  x <- as.matrix(x)
  n_levels <- 5000
  splits <- sample(c("Train", "Validation", "Threshold"), size = nrow(x),
                   prob = c(per_train, per_val, per_ths),
                   replace = TRUE)
  
  fit <- FlexCoDE::fitFlexCoDE(xTrain = x[splits == "Train", ],
                               zTrain = y[splits == "Train"],
                               xValidation = x[splits == "Validation", ],
                               zValidation = y[splits == "Validation"],
                               regressionFunction = regressionFunction.Forest,
                               #regressionFunction.extra=list(ntree=1000),
                               #nIMax = 40,
                               ...)
  
  pred_train_cde <- FlexCoDE::predict.FlexCoDE(fit, x[splits != "Threshold", ])
  if (!is.null(sigma)) {
    y_grid <- pred_train_cde$z
    densities <- pred_train_cde$CDE
    n_calib <- nrow(pred_train_cde$CDE)
    smoothed_densities <- matrix(NA, nrow = n_calib, ncol = length(y_grid))
    for (i in 1:n_calib) {
      smoothed_densities[i, ] <- smooth_fft(densities[i, ], y_grid, sigma)
    }
    pred_train_cde$CDE <- smoothed_densities
  }
  t_grid <- seq(0, max(pred_train_cde$CDE), length.out = n_levels)
  g_train_cde <- matrix(NA, nrow(pred_train_cde$CDE), length(t_grid))
  for (ii in 1:nrow(pred_train_cde$CDE)) {
    g_train_cde[ii, ] <- profile_density(t_grid, pred_train_cde$z, pred_train_cde$CDE[ii, ])
  }
  
  kmeans_result <- try(kmeanspp(g_train_cde, k = k), silent = TRUE)
  if (class(kmeans_result) == "try-error") {
    kmeans_result <- kmeans(g_train_cde, centers = k)
  }
  centers_kmeans <- kmeans_result$centers
  rm(g_train_cde)
  rm(pred_train_cde)
  
  pred_train <- FlexCoDE::predict.FlexCoDE(fit, x[splits == "Threshold", ])
  
  if (!is.null(sigma)) {
    y_grid <- pred_train$z
    densities <- pred_train$CDE
    n_calib <- nrow(pred_train$CDE)
    smoothed_densities <- matrix(NA, nrow = n_calib, ncol = length(y_grid))
    for (i in 1:n_calib) {
      smoothed_densities[i, ] <- smooth_fft(densities[i, ], y_grid, sigma)
    }
    pred_train$CDE <- smoothed_densities
  }
  
  t_grid <- seq(0, max(pred_train$CDE), length.out = n_levels)
  which_select <- cbind(1:length(y[splits == "Threshold"]),
                        which_neighbors(as.matrix(pred_train$z),
                                        as.matrix(y[splits == "Threshold"]),
                                        k = 1))
  conformity_score_train <- pred_train$CDE[which_select]
  
  g_train <- matrix(NA, length(conformity_score_train), length(t_grid))
  for (ii in 1:length(conformity_score_train)) {
    g_train[ii, ] <- profile_density(t_grid, pred_train$z, pred_train$CDE[ii, ])
  }
  
  return_value <- NULL
  return_value$density_fit <- fit
  return_value$conformity_score_train <- conformity_score_train
  return_value$t_grid <- t_grid
  return_value$g_train <- g_train
  return_value$centers_kmeans <- centers_kmeans
  class(return_value) <- "predictionBands"
  return(return_value)
}

#' Predict using fitted prediction bands model
#' @param cd_split_fit Fitted predictionBands object
#' @param xnew New predictor matrix
#' @param type Prediction type ("dist", "cd", or "cd_fourier")
#' @param alpha Significance level
#' @param sigma Smoothing parameter (optional)
#' @return Object of class 'bands'
predict_smooth <- function(cd_split_fit, xnew, type = "dist", alpha = 0.1, sigma = NULL) {
  pred_test <- FlexCoDE::predict.FlexCoDE(cd_split_fit$density_fit, xnew)
  
  if (!is.null(sigma)) {
    y_grid <- pred_test$z
    densities <- pred_test$CDE
    n_new <- nrow(xnew)
    smoothed_densities <- matrix(NA, nrow = n_new, ncol = length(y_grid))
    for (i in 1:n_new) {
      smoothed_densities[i, ] <- smooth_fft(densities[i, ], y_grid, sigma)
    }
    pred_test$CDE <- smoothed_densities
  }
  
  if (type == "cd") {
    prediction_bands_which_belong <- list()
    intervals <- list()
    
    ths <- rep(NA, length(cd_split_fit$conformity_score_train))
    g_test <- matrix(NA, nrow(xnew), length(cd_split_fit$t_grid))
    for (ii in 1:nrow(xnew)) {
      g_test[ii, ] <- profile_density(cd_split_fit$t_grid,
                                      pred_test$z,
                                      pred_test$CDE[ii, ])
    }
    which_partition_test <- which_neighbors(cd_split_fit$centers_kmeans,
                                            g_test, 1)
    which_partition_train <- which_neighbors(cd_split_fit$centers_kmeans,
                                             cd_split_fit$g_train, 1)
    
    ths_partition <- rep(NA, nrow(cd_split_fit$centers_kmeans))
    for (ii in 1:nrow(cd_split_fit$centers_kmeans)) {
      ths_partition[ii] <- quantile(cd_split_fit$conformity_score_train[which_partition_train == ii],
                                    probs = alpha)
    }
    ths_partition[is.na(ths_partition)] <- quantile(cd_split_fit$conformity_score_train, probs = alpha)
    ths <- ths_partition[which_partition_test]
    for (ii in 1:nrow(xnew)) {
      prediction_bands_which_belong[[ii]] <- pred_test$CDE[ii, ] >= ths[ii]
      intervals[[ii]] <- compute_intervals(prediction_bands_which_belong[[ii]], pred_test$z)
    }
    
    return_value <- list(y_grid = pred_test$z, ths = ths, densities = pred_test$CDE,
                         prediction_bands_which_belong = prediction_bands_which_belong,
                         intervals = intervals, ths = ths, type = "cd", alpha = alpha)
  } else if (type == "cd_fourier") {
    if (is.null(sigma)) {
      stop("For type='cd_fourier', sigma must be specified for smoothing.")
    }
    
    prediction_bands_which_belong <- list()
    intervals <- list()
    
    ths <- rep(NA, length(cd_split_fit$conformity_score_train))
    g_test <- matrix(NA, nrow(xnew), length(cd_split_fit$t_grid))
    for (ii in 1:nrow(xnew)) {
      g_test[ii, ] <- profile_density(cd_split_fit$t_grid,
                                      pred_test$z,
                                      pred_test$CDE[ii, ])
    }
    which_partition_test <- which_neighbors(cd_split_fit$centers_kmeans,
                                            g_test, 1)
    which_partition_train <- which_neighbors(cd_split_fit$centers_kmeans,
                                             cd_split_fit$g_train, 1)
    
    ths_partition <- rep(NA, nrow(cd_split_fit$centers_kmeans))
    for (ii in 1:nrow(cd_split_fit$centers_kmeans)) {
      ths_partition[ii] <- quantile(cd_split_fit$conformity_score_train[which_partition_train == ii],
                                    probs = alpha)
    }
    ths_partition[is.na(ths_partition)] <- quantile(cd_split_fit$conformity_score_train, probs = alpha)
    ths <- ths_partition[which_partition_test]
    for (ii in 1:nrow(xnew)) {
      prediction_bands_which_belong[[ii]] <- pred_test$CDE[ii, ] >= ths[ii]
      intervals[[ii]] <- compute_intervals(prediction_bands_which_belong[[ii]], pred_test$z)
    }
    
    return_value <- list(y_grid = pred_test$z, ths = ths, densities = pred_test$CDE,
                         prediction_bands_which_belong = prediction_bands_which_belong,
                         intervals = intervals, ths = ths, type = "cd_fourier", alpha = alpha)
  } else {
    stop(paste0("Type of distribution not implemented", " (", type, ")"))
  }
  class(return_value) <- "bands"
  return(return_value)
}



#' Compute intervals from binary sequence
#' @param sequence Binary vector indicating inclusion in prediction band
#' @param t_grid Grid of response values
#' @return List of intervals
compute_intervals <- function(sequence, t_grid) {
  aux <- rle(sequence)
  cum_aux <- cumsum(aux$lengths)
  if (aux$values[1] == TRUE) {
    start <- cum_aux[aux$values == FALSE]
    start <- c(1, start[1:sum(aux$values == TRUE)] + 1)
    start <- start[-length(start)]
    end <- cum_aux[aux$values == TRUE]
  } else {
    start <- cum_aux[aux$values == FALSE]
    start <- start[1:sum(aux$values == TRUE)] + 1
    end <- cum_aux[aux$values == TRUE]
  }
  intervals <- mapply(function(s, e) c(t_grid[s], t_grid[e]), start, end, SIMPLIFY = FALSE)
  return(intervals)
}

compute_intervals_hpd <- function(sequence,t_grid) {
  aux <- rle(sequence)
  cum_aux <- cumsum(aux$lengths)
  if(aux$values[1]==TRUE)
  {
    start <- cum_aux[aux$values==FALSE]
    start <-c(1,start[1:sum(aux$values==TRUE)]+1)
    start <- start[-length(start)]
    end <- cum_aux[aux$values==TRUE]
  } else {
    start <- cum_aux[aux$values==FALSE]
    start <-start[1:sum(aux$values==TRUE)]+1
    end <- cum_aux[aux$values==TRUE]
  }
  return(paste(paste0("(",paste(t_grid[start],
                                t_grid[end],sep=","),")"),
               collapse =  "U"))
}


findThresholdHPD=function(binSize,estimates,confidence)
{
  estimates=as.vector(estimates)
  maxDensity=max(estimates)
  minDensity=min(estimates)
  newCut=(maxDensity+minDensity)/2
  eps=1
  ii=1
  while(ii<=1000)
  {
    prob=sum(binSize*estimates*(estimates>newCut))
    eps=abs(confidence-prob)
    if(eps<0.0000001) break; # level found
    if(confidence>prob) maxDensity=newCut
    if(confidence<prob) minDensity=newCut
    newCut=(maxDensity+minDensity)/2
    ii=ii+1
  }
  return(newCut)
}

#' Plot prediction bands
#' @param bands Object of class 'bands'
#' @param ynew Optional true response values for plotting
#' @return ggplot object (invisibly)
plot.bands <- function(bands, ynew = NULL) {
  data_plot_list <- NULL
  for (ii in 1:length(bands$prediction_bands_which_belong)) {
    pred_region <- bands$y_grid[bands$prediction_bands_which_belong[[ii]]]
    if (length(pred_region) == 0) {
      data_plot_list[[ii]] <- data.frame(x = ii, y = median(bands$y_grid))
      next
    }
    data_plot_list[[ii]] <- data.frame(x = ii, y = pred_region)
  }
  data_plot_regions <- do.call("rbind", data_plot_list)
  
  if (bands$type == "dist") {
    type <- "Dist-split"
  } else if (bands$type == "cd") {
    type <- "CD-split"
  } else if (bands$type == "cd_fourier") {
    type <- "CD-split (Fourier Smoothed)"
  } else {
    type <- "HPD-split"
  }
  title <- paste0(type, "; alpha=", bands$alpha)
  g <- ggplot() +
    geom_point(data = data_plot_regions,
               aes(x = x, y = y), alpha = 0.02, color = "red") +
    theme_bw() +
    xlab("Sample id") +
    theme(axis.text = element_text(size = 20),
          axis.title = element_text(size = 24, face = "bold"),
          legend.title = element_blank(),
          legend.position = c(0.01, 0.3),
          legend.justification = c(0, 1),
          plot.title = element_text(size = 28, face = "bold"),
          legend.text = element_text(size = 22)) +
    ggtitle(title)
  
  if (!is.null(ynew)) {
    data_plot_y <- data.frame(x = 1:length(bands$prediction_bands_which_belong), y = ynew)
    g <- g + geom_point(data = data_plot_y, aes(x = x, y = y))
  }
  
  plot(g)
  invisible(g)
}

predict_hpd <- function(cd_split_fit,xnew,type="dist",alpha=0.1)
{
  pred_test <- FlexCoDE::predict.FlexCoDE(cd_split_fit$density_fit,xnew)
  
  if(type=="cd")
  {
    prediction_bands_which_belong <- list()
    intervals <- list()
    
    ths <- rep(NA,length(cd_split_fit$conformity_score_train))
    g_test <- matrix(NA,nrow(xnew),length(cd_split_fit$t_grid))
    for(ii in 1:nrow(xnew))
    {
      g_test[ii,] <- profile_density(cd_split_fit$t_grid,
                                     pred_test$z,
                                     pred_test$CDE[ii,])
    }
    which_partition_test <- which_neighbors(cd_split_fit$centers_kmeans,
                                            g_test,1)
    which_partition_train <- which_neighbors(cd_split_fit$centers_kmeans,
                                             cd_split_fit$g_train,1)
    
    ths_partition <- rep(NA,nrow(cd_split_fit$centers_kmeans))
    for(ii in 1:nrow(cd_split_fit$centers_kmeans))
    {
      ths_partition[ii] <-  quantile(cd_split_fit$conformity_score_train[which_partition_train==ii],
                                     probs=alpha)
    }
    ths_partition[is.na(ths_partition)] <- quantile(cd_split_fit$conformity_score_train,probs=alpha)
    ths <- ths_partition[which_partition_test]
    for(ii in 1:nrow(xnew))
    {
      prediction_bands_which_belong[[ii]] <- pred_test$CDE[ii,]>=ths[ii]
      intervals[[ii]] <- compute_intervals(prediction_bands_which_belong[[ii]],
                                           pred_test$z)
    }
    
    return_value <- list(y_grid=pred_test$z,ths=ths,densities=pred_test$CDE,
                         prediction_bands_which_belong=prediction_bands_which_belong,
                         intervals=intervals,ths=ths,type="cd",alpha=alpha)
    
  } else if(type=="dist"){
    ths <-  quantile(cd_split_fit$cum_dist_evaluated_train,
                     probs = c(alpha/2,1-alpha/2))
    prediction_bands_which_belong <- list()
    intervals <- list()
    FTest <- matrix(NA,nrow(xnew),length(pred_test$z))
    for (ii in 1:nrow(xnew)){
      FTest[ii,] <- cumsum(pred_test$CDE[ii,])*diff(pred_test$z)[1]
      prediction_bands_which_belong[[ii]] <- FTest[ii,]>=ths[1]&FTest[ii,]<=ths[2]
      intervals[[ii]] <- paste0("(",min(pred_test$z[prediction_bands_which_belong[[ii]]]),
                                ",",max(pred_test$z[prediction_bands_which_belong[[ii]]]),")")
    }
    return_value <-list(y_grid=pred_test$z,ths=ths,densities=pred_test$CDE,
                        prediction_bands_which_belong=prediction_bands_which_belong,
                        intervals=intervals,type="dist",alpha=alpha)
    
  }   else if(type=="hpd") {
    th <- quantile(cd_split_fit$conformity_score_train_hpd, probs=alpha)
    prediction_bands_which_belong <- list()
    intervals <- list()
    th_hpd <- rep(NA, nrow(xnew))
    
    for(ii in 1:nrow(xnew)) {
      density <- pred_test$CDE[ii,] 
      hpd_values <- rep(NA, length(density))
      band <- diff(pred_test$z)[1]  
      
      for(j in 1:length(density)) {
        indices <- which(density <= density[j])
        if(length(indices) > 0) {
          hpd_values[j] <- sum(density[indices]) * band
        } else {
          hpd_values[j] <- 0  
        }
      }
      
      prediction_bands_which_belong[[ii]] <- hpd_values >= th
      intervals[[ii]] <- compute_intervals_hpd(prediction_bands_which_belong[[ii]], pred_test$z)
      th_hpd[ii] <- th  
    }
    
    return_value <- list(y_grid=pred_test$z, th_hpd=th_hpd, densities=pred_test$CDE, intervals=intervals,
                         prediction_bands_which_belong=prediction_bands_which_belong, type="hpd", alpha=alpha)
  }   else {
    stop(paste0("Type of distribution not implemented", " (",type,")"))
  }
  class(return_value) <- "bands"
  return(return_value)
  
  
}
