# Remove every object that ls() reports in the current environment so the
# script starts from an empty workspace (dot-prefixed objects and attached
# packages are not affected).
rm(list=ls())
# Seed the RNG for reproducibility; the seed is reset again further down
# before the sampling and splitting steps.
set.seed(20250410)
# Wall-clock start time. Not read anywhere in the visible part of the script;
# presumably used for a run-time report later on.
timestart <- Sys.time()

source("functions.R")
source("run_online.R")
source("run_offline.R")

library(MASS)
library(ggplot2)
library(parallel)

# Load the full dataset. `TRUE` is spelled out because `T` is an ordinary
# variable that any of the files sourced above could have reassigned.
dat <- read.csv('health_fitness_dataset.csv', header = TRUE)

# Flag influential observations: fit a simple linear regression on all rows
# and keep those whose Cook's distance exceeds the 4 / n cutoff.
lm_all <- lm(fitness_level ~ endurance_level, data = dat)
cooks_d <- cooks.distance(lm_all)
n_all <- nrow(dat)
threshold <- 4 / n_all
# NOTE(review): which() returns positions within `cooks_d`. They coincide with
# row numbers of `dat` only if lm() dropped no rows, i.e. neither column
# contains NA -- confirm for this dataset.
influential_index <- which(cooks_d > threshold)

# Draw the 40000 training indices without replacement. The first 10% come
# from the influential rows only; the remaining 90% come from every row that
# was not picked in that first draw (so further influential rows may appear).
set.seed(123)
sample_indices1 <- sample(influential_index, size = 40000 * 0.1)
sample_remain <- setdiff(seq_len(n_all), sample_indices1)
sample_indices2 <- sample(sample_remain, size = 40000 * 0.9)
sample_indices <- c(sample_indices1, sample_indices2)

# Split the data into training, validation and test sets.
# The seed is reset so the two draws below are reproducible on their own.
set.seed(20266410)

# Training set: the indices drawn earlier.
idx_train <- sample_indices
dat_train <- dat[idx_train, ]
dat_remain <- dat[-idx_train, ]

# Validation set: 1000 rows drawn from what is left after training.
idx_vali <- sample.int(nrow(dat_remain), 1000)
dat_vali <- dat_remain[idx_vali, ]
dat_remain2 <- dat_remain[-idx_vali, ]

# Test set: 1000 rows drawn from what is left after validation.
idx_test <- sample.int(nrow(dat_remain2), 1000)
dat_test <- dat_remain2[idx_test, ]

# Covariates are divided by the maximum endurance level of the FULL dataset,
# so all three splits share one scale.
max_val <- max(dat[["endurance_level"]])

# Training split
x_train <- dat_train[["endurance_level"]]
y_train <- dat_train[["fitness_level"]]
x_scale_train <- x_train / max_val

# Test split
x_test <- dat_test[["endurance_level"]]
y_test <- dat_test[["fitness_level"]]
x_scale_test <- x_test / max_val

# Validation split
x_vali <- dat_vali[["endurance_level"]]
y_vali <- dat_vali[["fitness_level"]]
x_scale_vali <- x_vali / max_val

# Training sample size and default privacy parameters.
n <- length(x_train)
eps_privacy <- 2
delta_privacy <- 0.2

# Step-size configuration. Supported labels: 'constant', 'non-constant'.
# Any label other than 'constant' takes the non-constant settings.
gamma_type <- 'constant'
if (gamma_type != 'constant') {
  gamma0 <- 4
  gamma_exp <- 0.6
} else {
  # Constant step size that depends on n through the exponent zeta.
  zeta <- 0.6
  gamma0 <- 1.6 * n ^ (-zeta)
  gamma_exp <- 0
}


# Evaluation grid: `lent` equally spaced points on [0, 1].
lent <- 101
grid <- seq(from = 0, to = 1, length.out = lent)

# Recording schedule: `record_num` equally spaced sample sizes between
# n_drop + 1 and n.
n_drop <- 999
record_num <- 40
record_size <- seq(from = n_drop + 1, to = n, length.out = record_num)

# Kernel wrapper around Rep_K (defined outside this file) with h fixed at 0.1.
kernel <- function(x, y) Rep_K(x, y, h = 0.1)

# Kernel matrix on the grid, a copy with 1e-8 added to the diagonal, and the
# inverse of that copy.
Ker_cov <- outer(grid, grid, FUN = kernel)
K_grid <- Ker_cov + 1e-8 * diag(lent)
Kinv_grid <- solve(K_grid)

# Pilot step: run a non-private L2 fit on the validation set and use its
# residuals to pick the Huber threshold `tau`.
#
# NOTE(review): `tau` is passed here before it is assigned below. This only
# works if online_updating() never evaluates `tau` for the 'L2' loss (R
# arguments are evaluated lazily) -- confirm against run_online.R.
res_L2_non_tmp <- online_updating(
  x_scale_vali, y_vali, gamma0, gamma_exp, kernel, inner_product_H, grid,
  eps_privacy, delta_privacy, tau, n_drop = n_drop, 'L2', 'non'
)

# Residuals of the first stored estimate, read off at the grid point nearest
# to each validation covariate. Vectorised over the whole validation set
# rather than looping over a hard-coded count of 1000.
pilot_fit <- res_L2_non_tmp[[1]]
nearest_idx <- vapply(
  x_scale_vali,
  function(xi) which.min(abs(grid - xi)),
  integer(1)
)
res_all <- y_vali - pilot_fit[nearest_idx]

# Robust residual scale: median absolute deviation divided by 0.6745,
# then multiplied by 1.345 to give the Huber threshold.
sigma0_hat <- median(abs(res_all - median(res_all))) / 0.6745
tau <- sigma0_hat * 1.345

# Fit all estimators on the training sample. The calls stay in their original
# order; the 'DP' fits presumably draw random noise, so reordering could
# change results -- TODO confirm.

# Huber loss, no privacy mechanism.
res_huber_non <- online_updating(
  x_scale_train, y_train, gamma0, gamma_exp, kernel, inner_product_H, grid,
  eps_privacy = 2, delta_privacy = 0.2, tau, n_drop, 'huber', 'non'
)

# Huber loss, 'DP' mode with (eps, delta) = (3, 0.1).
res_huber_DP301 <- online_updating(
  x_scale_train, y_train, gamma0, gamma_exp, kernel, inner_product_H, grid,
  eps_privacy = 3, delta_privacy = 0.1, tau, n_drop, 'huber', 'DP'
)

# Huber loss, 'DP' mode with (eps, delta) = (2, 0.2).
res_huber_DP202 <- online_updating(
  x_scale_train, y_train, gamma0, gamma_exp, kernel, inner_product_H, grid,
  eps_privacy = 2, delta_privacy = 0.2, tau, n_drop, 'huber', 'DP'
)

# L2 loss, no privacy mechanism.
res_L2_non <- online_updating(
  x_scale_train, y_train, gamma0, gamma_exp, kernel, inner_product_H, grid,
  eps_privacy, delta_privacy, tau, n_drop, 'L2', 'non'
)

# Offline counterpart.
res_offline <- offline_updating(
  x_scale_train, y_train, gamma0, gamma_exp, kernel, grid, n_drop
)

#' Prediction mean squared error of each stored estimate on a held-out set.
#'
#' Every test covariate is mapped to its nearest grid point, and each
#' estimate's value at that grid point is used as the prediction.
#'
#' @param res List (or vector) of estimates; `res[[i]]` is indexed by grid
#'   position, so it is expected to hold one value per element of `grid`.
#' @param grid Numeric vector of grid locations.
#' @param xtest Numeric vector of test covariates, on the same scale as `grid`.
#' @param ytest Numeric vector of test responses, same length as `xtest`.
#' @return Numeric vector with one MSE per element of `res`; `numeric(0)`
#'   when `res` is empty.
pred_mse <- function(res, grid, xtest, ytest) {
  # Nearest grid index for each test point. vapply() pins the result to an
  # integer vector, whereas sapply() changes its return type on empty or
  # ragged input.
  xtest_grid <- vapply(
    xtest,
    function(xi) which.min(abs(grid - xi)),
    integer(1),
    USE.NAMES = FALSE
  )
  # seq_along() iterates zero times for an empty `res`; `1:length(res)` would
  # visit indices 1 and 0 and fail.
  vapply(
    seq_along(res),
    function(i) mean((res[[i]][xtest_grid] - ytest)^2),
    numeric(1)
  )
}

# Baseline: MSE of predicting every test response by the test-set mean.
# It is the denominator of the R^2 values below.
aa <- mean((y_test - mean(y_test))^2)

# Test-set MSE of every stored estimate, followed by R^2 = 1 - MSE / baseline.
mse_huber_non <- pred_mse(res_huber_non, grid, x_scale_test, y_test)
r2_huber_non <- 1 - mse_huber_non / aa

mse_huber_DP301 <- pred_mse(res_huber_DP301, grid, x_scale_test, y_test)
r2_huber_DP301 <- 1 - mse_huber_DP301 / aa

mse_huber_DP202 <- pred_mse(res_huber_DP202, grid, x_scale_test, y_test)
r2_huber_DP202 <- 1 - mse_huber_DP202 / aa

mse_L2_non <- pred_mse(res_L2_non, grid, x_scale_test, y_test)
r2_L2_non <- 1 - mse_L2_non / aa

mse_offline <- pred_mse(res_offline, grid, x_scale_test, y_test)
r2_offline <- 1 - mse_offline / aa

## Linear-regression baseline: fit on the scaled training data and score it
## on the test set with the same MSE and R^2 definitions used above.
train_df <- data.frame(fitness_level = y_train, endurance_level = x_scale_train)
test_df <- data.frame(fitness_level = y_test, endurance_level = x_scale_test)

res_lm <- lm(fitness_level ~ endurance_level, data = train_df)
summary(res_lm)

pred_lm <- predict(res_lm, newdata = test_df)
mse_lm <- mean((y_test - pred_lm)^2)
r2_lm <- 1 - mse_lm / aa
