# Install required packages
rm(list=ls())
library(dplyr)
library(quantmod)
library(zoo)
library(glmnet)
library(ipflasso)
library(ebmr.alpha)
library(mr.ash.alpha)
library(ashr)
library(nnet)

# Cleaned symbol list: Replace outdated ones like FB → META
symbols <- c("AAPL", "GOOG", "MSFT", "TSLA", "AMZN",
             "META", "NFLX", "NVDA", "JPM", "V",
             "BAC", "DIS", "INTC", "CSCO", "ADBE",
             "PYPL", "CMCSA", "PEP", "KO", "PFE",
             "MRK", "ABT", "T", "XOM", "CVX",
             "BA", "GE", "MMM", "IBM", "MCD",
             "SBUX", "NKE", "WMT", "PG", "UNH",
             "LLY", "TMO", "MDT", "BMY", "DHR",
             "HON", "JNJ", "AMGN", "AVGO", "TXN",
             "QCOM", "GILD", "ORCL", "ACN", "INTU",
             "WFC", "GS", "BLK", "AXP", "MS",
             "C", "USB", "SCHW", "CB", "BK",
             "ICE", "TFC", "PNC", "AIG", "MMC",
             "MET", "PRU", "ALL", "PGR", "TLT",
             "SPY", "QQQ", "IWM", "VTI", "VOO",
             "BND", "LQD", "HYG", "VUG", "VFINX",
             "FXAIX", "SWPPX", "VTSAX", "VFIAX", "FZROX")
industries <- c("Technology", "Technology", "Technology", "Automotive", "Consumer Discretionary",
                "Communication Services", "Communication Services", "Technology", "Financials", "Financials",
                "Financials", "Communication Services", "Technology", "Technology", "Technology",
                "Financials", "Communication Services", "Consumer Staples", "Consumer Staples", "Healthcare",
                "Healthcare", "Healthcare", "Communication Services", "Energy", "Energy",
                "Industrials", "Industrials", "Industrials", "Technology", "Consumer Discretionary",
                "Consumer Discretionary", "Consumer Discretionary", "Consumer Staples", "Consumer Staples", "Healthcare",
                "Healthcare", "Healthcare", "Healthcare", "Healthcare", "Healthcare",
                "Industrials", "Healthcare", "Healthcare", "Technology", "Technology",
                "Technology", "Healthcare", "Technology", "Technology", "Technology",
                "Financials", "Financials", "Financials", "Financials", "Financials",
                "Financials", "Financials", "Financials", "Financials", "Financials",
                "Financials", "Financials", "Financials", "Financials", "Financials",
                "Financials", "Financials", "Financials", "Financials", "Bonds",
                "Equity Funds", "Equity Funds", "Equity Funds", "Equity Funds", "Equity Funds",
                "Bonds", "Bonds", "Bonds", "Equity Funds", "Equity Funds",
                "Equity Funds", "Equity Funds", "Equity Funds", "Equity Funds", "Equity Funds")

# Combine metadata
stock_info <- data.frame(Symbol = symbols, Industry = industries)

# --- Download data robustly ---
available_symbols <- c()
price_list <- list()
for (sym in symbols) {
  tryCatch({
    getSymbols(sym, from = "2020-01-01", to = "2021-01-01", auto.assign = TRUE)
    price_list[[sym]] <- Ad(get(sym))
    available_symbols <- c(available_symbols, sym)
  }, error = function(e) {
    message(paste("Skipping", sym, ":", e$message))
  })
}

# --- Merge prices ---
prices <- do.call(merge, price_list)
prices <- na.locf(prices)
prices <- na.omit(prices)

# --- Final X matrix and matching metadata ---
X <- as.matrix(prices)
valid_symbols <- colnames(X) <- gsub("\\.Adjusted", "", colnames(X))  # remove suffix
X <- X[, colnames(X) %in% stock_info$Symbol]

# Match industries to columns of X
industry_labels <- stock_info %>%
  filter(Symbol %in% colnames(X)) %>%
  arrange(match(Symbol, colnames(X))) %>%
  pull(Industry)

# --- Example: use glmnet with grouped plot by industry
set.seed(123)
lt= list()
write.csv(stock_info ,  "C:/Document/Serieux/Travail/Data_analysis_and_papers/nash_experiement/data_split/SNP500/infocov.csv" , row.names = FALSE)
for ( k in 1:10){
set.seed(k)
print(k)
  y <-  X[,1]


  y=scale(y)

  idx_test = sample (1:length(y), size= floor (length(y)*0.2))
  fit <- cv.glmnet(X[-idx_test,-1], y[-idx_test], alpha = 0)
  predictions_lasso <- predict(fit, newx = X[idx_test,-1])

  fit <- cv.glmnet(X[-idx_test,-1], y[-idx_test], alpha = 0.5)
  predictions_enet <- predict(fit, newx = X[idx_test,-1])

  fit <- cv.glmnet(X[-idx_test,-1], y[-idx_test], alpha = 1)
  predictions_ridge <- predict(fit, newx = X[idx_test,-1])

  # Create industry-specific penalty factors
  industry_table <- table(industry_labels)
  n_industries <- length(industry_table)

  # Assign higher penalties to more common industries (optional logic)
  # Here, we normalize so total penalty = number of predictors
  penalty_by_industry <- 1 / sqrt(industry_table[-1])
  penalty_by_industry <- penalty_by_industry / sum(penalty_by_industry) * length(industry_labels)

  # Create penalty.factor vector matching X columns
  penalty_vector <- sapply(industry_labels, function(ind) penalty_by_industry[ind])

  # --- Fit IPF  ---
  set.seed(123)
  target_symbol <- "AAPL"

  industry_labels_predictors <- industry_labels[stock_info$Symbol != target_symbol]
  unique_inds <- unique(industry_labels_predictors)
  blocks_list <- lapply(unique_inds, function(ind) which(industry_labels_predictors == ind))
  names(blocks_list) <- unique_inds
  fit_ipf <- cvr.ipflasso(
    X = X[-idx_test,-1],
    Y = y [-idx_test],
    family = "gaussian",
    type.measure = "mse",
    standardize = TRUE,
    alpha = 1,
    blocks = blocks_list,
    pf = rep(1, length(blocks_list)),  # required placeholder
    nfolds = 5,
    ncv = 5
  )
  coef_ipf= fit_ipf$coeff[,fit_ipf$ind.bestlambda]
  rmse= function(y,yt){
    sqrt(mean ((y-yt)^2))
  }

  pred_ipf= coef_ipf[1]+ X[idx_test,-1]%*%coef_ipf[-1]

  library(mr.ash.alpha)
  fitmr= mr.ash(X[-idx_test,-1],
                y[-idx_test] )
  #fit nnet



  # --- Standardize data (nnet is sensitive to scale)
  X_train_scaled <- scale(X[-idx_test,-1])
  X_test_scaled  <- scale(X[ idx_test,-1], center = attr(X_train_scaled, "scaled:center"),
                          scale  = attr(X_train_scaled, "scaled:scale"))

  # Combine into training data frame (nnet requires data.frame or matrix input)
  train_df <- as.data.frame(X_train_scaled)
  train_df$y <- X[ -idx_test, 1]

  # --- Fit neural network (4 hidden units, 1 hidden layer)
  set.seed(123)
  fit_nnet <- nnet(
    y ~ .,
    data = train_df,
    size = 9,       # 4 hidden units
    linout = TRUE,  # regression
    maxit = 500,    # increase if needed
    trace = FALSE   # suppress output
  )

  # --- Predict on test set
  pred_nnet <- predict(fit_nnet, newdata = as.data.frame(X_test_scaled))
  # --- GROUP LASSO ---
  library(grpreg)



  predictions_mr <- X[idx_test,-1]%*%fitmr$beta


  colScale <- function(x, center = TRUE, scale = TRUE, add_attr = TRUE, rows = NULL, cols = NULL) {
    if (!is.null(rows) || !is.null(cols)) {
      x <- x[rows %||% seq_len(nrow(x)), cols %||% seq_len(ncol(x)), drop = FALSE]
    }

    cm <- if (center) colMeans(x, na.rm = TRUE) else rep(0, ncol(x))
    csd <- if (scale) matrixStats::colSds(x, center = cm) else rep(1, length(cm))
    csd[csd == 0] <- 1  # Prevent division by zero

    x <- sweep(x, 2, cm, "-")
    x <- sweep(x, 2, csd, "/")

    if (add_attr) {
      if (center) attr(x, "scaled:center") <- cm
      if (scale) attr(x, "scaled:scale") <- csd
      n <- nrow(x)
      d <- (n - 1) * csd^2 / csd^2
      attr(x, "d") <- d
    }
    x
  }

  method_list  = c("nash")
  nash_noinfo_dynamic_td= function(X, y, maxit = 100, tol=10e-3){


    y.fit.ebr = ebmr(X , y , maxiter = 20, ebnv_fn = ebnv.pm)
    y.fit.nash=y.fit.ebr
    elbo=c(-Inf)
    for ( k in 1:maxit){


      tt= ash( y.fit.nash$mu,  (y.fit.nash$Sigma_diag))


      elbo=c(elbo,
             y.fit.nash$elbo[length(y.fit.nash$elbo)]-tt$loglik)

      if(  (k>1) &   (elbo[k+1]-elbo[k ]< tol) ){

        break
      }
      y.fit.nash= ebmr.update(y.fit.nash,
                              mu0=tt$result$PosteriorMean,
                              maxiter = 20)
    }

    y.fit.nash$elbo=elbo
    y.fit.nash$b= tt$result$PosteriorMean
    return( y.fit.nash)
  }

  # Visualize prediction

  fitmnash= nash_noinfo_dynamic_td(X=X[-idx_test,-1],
                                   y= y[-idx_test] )

  #fit nnet
res= c(
  rmse(y[idx_test],predictions_lasso),
  rmse(y[idx_test],predictions_enet),
  rmse(y[idx_test],predictions_ridge),
  rmse(y[idx_test],predictions_mr),

  rmse(pred_ipf,y[idx_test]),
  # View metadata
  rmse(y[  idx_test],  X[ idx_test,-1]%*%fitmnash$mu))

name = c( "Lasso",
   "Enet",
   "Ridge",
   "MRash",
   "IPF",
   "Nash")


lt[[k]]= list(rmse=res,
              name=name)

  write.csv(y[-idx_test], paste0("C:/Document/Serieux/Travail/Data_analysis_and_papers/nash_experiement/data_split/SNP500/y_train",k,".csv"), row.names = FALSE)
  write.csv(X[-idx_test,-1], paste0("C:/Document/Serieux/Travail/Data_analysis_and_papers/nash_experiement/data_split/SNP500/X_train",k,".csv"), row.names = FALSE)
  write.csv(y[idx_test], paste0("C:/Document/Serieux/Travail/Data_analysis_and_papers/nash_experiement/data_split/SNP500/y_test",k,".csv"), row.names = FALSE)
  write.csv(X[idx_test,-1], paste0("C:/Document/Serieux/Travail/Data_analysis_and_papers/nash_experiement/data_split/SNP500/X_test",k,".csv"), row.names = FALSE)
save(lt, file="C:/Document/Serieux/Travail/Data_analysis_and_papers/nash_experiement/results_realdata/SNP500.RData")
}
