# data_loader.R
# -------------------------------------------------------------------
# Loader for SP_RV daily returns dataset and lag/MA covariates.
#
# Dataset expectation:
#   - CSV with columns: date, SQRT_RV (other columns are ignored)
#   - date format: year/month/day, e.g. "2010/1/4"
#
# Output:
#   - X_full: matrix of covariates built from the SQRT_RV series
#       (1) ret_lag1: previous day's value
#       (2) ret_ma5:  average of the previous 5 days (excluding today)
#       (3) ret_ma21: average of the previous 21 days (excluding today)
#   - Y_full: vector (today's value)
#   - dates: Date vector aligned with X_full/Y_full
# -------------------------------------------------------------------

suppressPackageStartupMessages({
  library(readr)
  library(dplyr)
  library(zoo)
})

# Load the RV dataset and build lag / moving-average covariates.
#
# Arguments:
#   file_path  Path to a CSV file with at least the columns `date`
#              (parsed with format "%Y/%m/%d") and `SQRT_RV`.
#
# Returns a list with:
#   X_full  numeric matrix with columns ret_lag1, ret_ma5, ret_ma21
#   Y_full  numeric vector holding the same-row SQRT_RV value
#   dates   Date vector aligned row-for-row with X_full / Y_full
#
# Only rows whose full 21-observation lag history is available (and
# non-missing) are returned; if the file is too short the result is empty.
# Stops with an error when a required column is missing.
load_RV_dataset <- function(file_path = "SPY_RV.csv") {
  dat <- read_csv(file_path, show_col_types = FALSE)

  required_cols <- c("date", "SQRT_RV")
  missing_cols <- setdiff(required_cols, names(dat))
  if (length(missing_cols) > 0) {
    stop(
      "Missing required columns: ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Sort chronologically so that "previous row" means "previous date".
  # Rows whose date fails to parse become NA, sort last, and are dropped below.
  dat <- dat %>%
    mutate(
      date = as.Date(date, format = "%Y/%m/%d"),
      rv = as.numeric(SQRT_RV)
    ) %>%
    arrange(date)

  r <- dat$rv

  # Right-aligned rolling mean that yields NA whenever the window is not
  # completely filled with observed values. The anonymous wrapper around
  # mean() keeps rollapply on its generic code path, so NAs propagate
  # instead of being averaged over.
  lagged_mean <- function(x, width) {
    if (length(x) < width) {
      return(rep(NA_real_, length(x)))
    }
    zoo::rollapply(
      x,
      width = width,
      FUN = function(w) mean(w),
      align = "right",
      fill = NA
    )
  }

  # Covariates for time t use information available up to t-1:
  #   x1_t  = r_{t-1}
  #   x5_t  = mean(r_{t-1},...,r_{t-5})
  #   x21_t = mean(r_{t-1},...,r_{t-21})
  r_lag1 <- dplyr::lag(r, 1)

  feat_df <- tibble(
    ret_lag1 = r_lag1,
    ret_ma5  = lagged_mean(r_lag1, 5),
    ret_ma21 = lagged_mean(r_lag1, 21)
  )

  # Align and drop rows without full lag history, a target, or a date.
  keep <- complete.cases(feat_df) & !is.na(r) & !is.na(dat$date)

  list(
    X_full = as.matrix(feat_df[keep, ]),
    Y_full = as.numeric(r[keep]),
    dates = dat$date[keep]
  )
}
