real.data.cv.test <- function(data, target, model, train.prop, n.iter, methods, lambdas, n.samples = 1e3, burnin = 1e3, seed = NULL,use.interactions2=F, use.interactions3 = F,use.logs=F,use.squares=F,use.cubics=F, rep.cv.fold = 1, e = 1e-8)
{
  # CV test
  n.methods = length(methods)
  p = ncol(data)
  
  empty = matrix(nrow = n.iter, ncol = n.methods, dimnames = list(NULL, methods))
  
  return.stats = c("MSE", "run.time", "n.coefs", "converge", "lambda.min", "R2", "R2_OOS", "pre.time")
  
  if ("glmnet.ridge.10.fold" %in% methods){
    return.stats = c(return.stats, "MSE.min","MSE.max")
  }
  
  for(i in return.stats){
    assign(i, empty)
  }
  
  MSE.10.fold = matrix(nrow = rep.cv.fold, ncol = 1)
  lambda.10.fold = matrix(nrow = rep.cv.fold, ncol = 1)
  CA.10.fold = matrix(nrow = rep.cv.fold, ncol = 1)
  nlogloss.10.fold = matrix(nrow = rep.cv.fold, ncol = 1)
  
  if(!is.null(seed)){
    set.seed(seed)
  }
  
  tms = matrix(nrow = n.iter, ncol = 1)
  tms_OOS = matrix(nrow = n.iter, ncol = 1)
  rv.cv.pre = list()
  plots = list()
  for (j in 1:n.iter)
  {
    rv.cv.pre[[j]] = real.data.CV.split(data, target, model, train.prop,use.interactions2, use.interactions3,use.logs,use.squares,use.cubics)
  }
  
  for (j in 1:n.iter)
  {
    cat(j,", ")
    
    rv.cv = rv.cv.pre[[j]]
    
    tms[j] = sum((rv.cv$test.data[,target]- mean(rv.cv$test.data[,target]))^2)
    tms_OOS[j] = sum((rv.cv$test.data[,target]- mean(rv.cv$train.data[,target]))^2)
    
    for (i in 1:n.methods)
    {
      if(n.methods == 0){
        break
      }
      #cat(methods[i])
      
      if (methods[i] == "glmnet.ridge")
      {
        s = system.time({rv.glmnet = cv.glmnet.f(rv.cv$my.form, data=rv.cv$train.data, family=model, alpha = 0, nfolds = rv.cv$n.train, grouped = F)})
        y.pred = predict.glmnet.f(rv.glmnet, rv.cv$test.data, type="response", s = "lambda.min")
        
        lambda_glm = rv.glmnet$lambda
        
        n.coefs[j,i] = length(which(as.numeric(coef(rv.glmnet))!=0))-1
        lambda.min[j,i] = rv.glmnet$lambda.min
        
      }
      else if (methods[i] == "glmnet.ridge.10.fold")
      {
        s = system.time({rv.glmnet.10.fold = cv.glmnet.rep.f(rv.cv$my.form, data=rv.cv$train.data, rep = rep.cv.fold, family=model, alpha = 0)})
        y.pred = predict.glmnet.rep.f(rv.glmnet.10.fold, rv.cv$test.data, type="response", s = "lambda.min")
        
        lambda_glm = rv.glmnet.10.fold[[1]]$lambda
        
        for (k in 1:rep.cv.fold) {
          
          lambda.10.fold[k] = rv.glmnet.10.fold[[k]]$lambda.min
            
          MSE.10.fold[k] = sum( (y.pred[,k] - rv.cv$test.data[,target])^2 )
          
        }
        
        MSE[j,i] = mean(MSE.10.fold)
        R2[j,i] = 1-(MSE[j,i]/tms[j])
        MSE.min[j,i] = min(MSE.10.fold)
        MSE.max[j,i] = max(MSE.10.fold)
        run.time[j,i] = s[[3]]/rep.cv.fold
        
        #n.tr = rv.cv$n.train
        #sd_trainy = c(sqrt(var(rv.cv$train.data[,target])*(n.tr-1)/n.tr))
        #lambda.min[j,i] = mean(lambda.10.fold)*n.tr/sd_trainy
        lambda.min[j,i] = mean(lambda.10.fold)
        
        n.coefs[j,i] = length(which(as.numeric(coef(rv.glmnet.10.fold[[1]]))!=0))-1
        
      }
      else if (methods[i] == "bayesreg.ridge")
      {
        s = system.time({rv.br = bayesreg(rv.cv$my.form, data=rv.cv$train.data, model=model, prior="ridge", n.samples=5e3, burnin = 1e4)})
        y.pred = predict(rv.br, rv.cv$test.data[,-which(colnames(rv.cv$test.data) == target)])
        
      }
      else if (methods[i] == "lm"){
        # only for gaussian
        s = system.time({rv.lr = lm(rv.cv$my.form, data=rv.cv$train.data)})
        y.pred = predict(rv.lr, rv.cv$test.data)
      }
      else if (methods[i] == "fastLOO.glm"){
        
        s = system.time({rv.LOOCV =  fastLOO(rv.cv$my.form, rv.cv$train.data, lambda = lambda_glm, glm_lambda = T)})
        y.pred = em.predict(rv.LOOCV, rv.cv$test.data)
        
        n.coefs[j,i] = sum(rv.LOOCV$beta!=0)
        converge[j,i] = rv.LOOCV$num.iter
        lambda.min[j,i] = rv.LOOCV$lambda.min
        pre.time[j,i] = rv.LOOCV$preprocessing_time
        
      }
      else if (methods[i] == "fastLOO.glm.bign"){
        
        s = system.time({rv.LOOCV =  fastLOO(rv.cv$my.form, rv.cv$train.data, lambda = lambda_glm, glm_lambda = T, bign = T)})
        y.pred = em.predict(rv.LOOCV, rv.cv$test.data)
        
        n.coefs[j,i] = sum(rv.LOOCV$beta!=0)
        converge[j,i] = rv.LOOCV$num.iter
        lambda.min[j,i] = rv.LOOCV$lambda.min
        pre.time[j,i] = rv.LOOCV$preprocessing_time
        
      }
      else if (methods[i] == "fastLOO"){
        
        s = system.time({rv.LOOCV =  fastLOO(rv.cv$my.form, rv.cv$train.data, lambda = lambdas[[i]])})
        y.pred = em.predict(rv.LOOCV, rv.cv$test.data)
        
        n.coefs[j,i] = sum(rv.LOOCV$beta!=0)
        converge[j,i] = rv.LOOCV$num.iter
        lambda.min[j,i] = rv.LOOCV$lambda.min
        pre.time[j,i] = rv.LOOCV$preprocessing_time
        
      }
      else if (methods[i] == "fastLOO.bign"){
        
        s = system.time({rv.LOOCV =  fastLOO(rv.cv$my.form, rv.cv$train.data, lambda = lambdas[[i]], bign = T)})
        y.pred = em.predict(rv.LOOCV, rv.cv$test.data)
        
        n.coefs[j,i] = sum(rv.LOOCV$beta!=0)
        converge[j,i] = rv.LOOCV$num.iter
        lambda.min[j,i] = rv.LOOCV$lambda.min
        pre.time[j,i] = rv.LOOCV$preprocessing_time
        
      }
      else if (methods[i] == "tau2.bign"){
        
        s = system.time({rv.em =  EM_ridge_svd(rv.cv$my.form, rv.cv$train.data, model=model, prior="tau2", bign = T, e = e)})
        y.pred = em.predict(rv.em, rv.cv$test.data)
        
        n.coefs[j,i] = sum(rv.em$beta!=0)
        converge[j,i] = rv.em$num.iter
        lambda.min[j,i] = 1/rv.em$tau2
        pre.time[j,i] = rv.em$preprocessing_time
      }
      else
      {
        # RidgeEM
        s = system.time({rv.em =  EM_ridge_svd(rv.cv$my.form, rv.cv$train.data, model=model, prior=methods[i], e = e)})
        y.pred = em.predict(rv.em, rv.cv$test.data)
        
        n.coefs[j,i] = sum(rv.em$beta!=0)
        converge[j,i] = rv.em$num.iter
        lambda.min[j,i] = 1/rv.em$tau2
        pre.time[j,i] = rv.em$preprocessing_time
        
      }
      
      if(!(methods[i] %in% c("glmnet.ridge.10.fold"))){
        
        run.time[j,i] = s[[3]]
        MSE[j,i] = sum( (y.pred - rv.cv$test.data[,target])^2 )
        R2[j,i] = 1-(MSE[j,i]/tms[j])
        R2_OOS[j,i] = 1-(MSE[j,i]/tms_OOS[j])
      }
    }
  }
  
  #
  scores = matrix(nrow = length(return.stats), ncol = n.methods, dimnames = list(return.stats, methods))
  out = list(n.train = nrow(rv.cv$train.data))
  
  for(i in 1:length(return.stats)){
    
    scores[i,] = colMeans(get(return.stats[i]))
    out[[return.stats[i]]] = get(return.stats[i])
  }
  
  out[["scores"]] = scores
  #out[["plots"]] = plots
  out[["tms"]] = tms
  
  cat("\n")
  
  return(out)
}

library(MASS) 
library(reshape2) 
library(reshape)
library(ggplot2)

real.data.CV.split <- function(data, target, model, train.prop,use.interactions2=F, use.interactions3 = F,use.logs=F,use.squares=F,use.cubics=F)
{
  
  # Get the CV indices
  n = nrow(data)
  p = ncol(data)
  n.train = floor(n*train.prop)
  ix = sample(n)
  
  if(model == "binomial"){
    
    class1 = which(data[,target] == unique(data[,target])[1])
    class2 = which(data[,target] == unique(data[,target])[2])
    
    
    tr.ix = c(sample(class1, size = ceiling(n.train/2)), sample(class2, size = floor(n.train/2)))
    tst.ix = ix[!(ix %in% tr.ix)]
    
  }else{
    
    tr.ix = ix[1:n.train]
    tst.ix = ix[(n.train+1):n]
    
  }
  
  my.form = my.make.formula(target,data,use.interactions2=use.interactions2, use.interactions3 = use.interactions3,use.logs=use.logs,use.squares=use.squares,use.cubics=use.cubics)
  p.int = length(attr(terms(my.form, data = data), which = "term.labels"))
  
  if(n.train * p.int > 35000000){
    p.new = ceiling(35000000/n.train)
    #set.seed(n.train)
    subsample = sample((p+1):p.int, (p.int - p.new))
    t.forms = terms(my.form, data = data)
    my.form = formula(drop.terms(t.forms, subsample, keep.response = T))
  }
  
  return(list(train.data=data[tr.ix,], test.data=data[tst.ix,], my.form = my.form, tr.ix=tr.ix, tst.ix=tst.ix, n.train = n.train))
}
