########################################################
# This file contains two functions that are used in the simulation:
# 1. The function **DataGen** is the main function; it generates the training and test sets, which share the same distribution.
# 2. The function **TestSet** generates a specific set of test points, e.g. for variable selection or coefficient estimation.
########################################################

# Simulate one data set from a Gaussian linear model and split it into a
# training part and a testing part.
#
# Input:
#   sqrt_Sigma : matrix that right-multiplies the column-centred standard
#                normal draws, i.e. X = Z %*% sqrt_Sigma (presumably a square
#                root of the target covariance matrix -- confirm with caller).
#   n_train    : number of training observations (first rows of the sample).
#   n_test     : number of testing observations (remaining rows).
#   p          : number of covariates.
#   beta_true  : true coefficient vector of the linear model (length p).
# Output:
#   a named list with elements X.train, X.test (always matrices, also when
#   they contain a single row or column) and Y.train, Y.test (plain vectors).
DataGen <- function(sqrt_Sigma, n_train, n_test, p, beta_true) {
  n_total <- n_train + n_test

  # Standard normal draws, centred column-wise over the *whole* sample
  # (training and testing rows together).
  Z <- matrix(rnorm(n_total * p), ncol = p)
  Z <- sweep(Z, 2, apply(Z, 2, mean))

  # Covariate mean is fixed at zero; kept explicit so it is easy to change.
  mu <- rep(0, p)
  X <- matrix(mu, nrow = n_total, ncol = p, byrow = TRUE) + Z %*% sqrt_Sigma

  # Unit-variance Gaussian noise.
  Err <- rnorm(n_total)
  Y <- X %*% beta_true + Err

  # Positive index vectors: seq_len() is safe for a zero-sized split, whereas
  # 1:n_train would yield c(1, 0) and a negated empty index selects nothing.
  train_idx <- seq_len(n_train)
  test_idx <- n_train + seq_len(n_test)

  # drop = FALSE keeps the design matrices two-dimensional even when a split
  # has a single row (or p == 1), so callers can rely on dim()/nrow()/ncol().
  list(
    X.train = X[train_idx, , drop = FALSE],
    X.test = X[test_idx, , drop = FALSE],
    Y.train = Y[train_idx],
    Y.test = Y[test_idx]
  )
}


# Build the matrix of test points at which the fitted model is evaluated.
#
# Input:
#   var_sel     : TRUE (or 1) returns one scaled unit vector per variable,
#                 for variable selection on all variables; FALSE (or 0)
#                 returns four fixed test points followed by randomly
#                 sparsified test points.
#   X.train     : covariate matrix of the training set; only used when
#                 var_sel is TRUE, to set the scale of the unit vectors.
#   X.test.star : initial dense test points (n_test x p) from which the random
#                 sparse test points are generated; its column count defines p
#                 in both modes.
#   spar        : number of coordinates kept (not set to zero) in each random
#                 sparse test point; only used when var_sel is FALSE.
#   p_0         : row of the p x p identity matrix used as fixed point 2
#                 (presumably the number of active coefficients -- confirm
#                 with caller). When NULL, a variable named p_0 is looked up
#                 in the environment where this function was defined, which is
#                 what the function relied on before this argument existed.
# Output:
#   var_sel TRUE  : p x p matrix, the identity scaled by the mean Euclidean
#                   norm of the rows of X.train.
#   var_sel FALSE : (4 + n_test) x p matrix; rows 1-4 are e_1, e_{p_0}, e_p and
#                   e_1 - e_3 (so p >= 3 is required), the remaining rows are
#                   the sparsified, re-centred rows of X.test.star.
TestSet <- function(var_sel, X.train, X.test.star, spar, p_0 = NULL) {
  n_test <- nrow(X.test.star)
  p <- ncol(X.test.star)

  # Same coercion rules as `var_sel == TRUE` / `var_sel == FALSE`, but any
  # value that matches neither mode is rejected instead of falling through.
  select_all <- isTRUE(var_sel == TRUE)
  select_points <- isTRUE(var_sel == FALSE)
  if (!select_all && !select_points) {
    stop("`var_sel` must be a single TRUE/FALSE (or 1/0) value.",
         call. = FALSE)
  }

  if (select_all) {
    # Mean row norm of X.train; rowSums(X^2) is the diagonal of X %*% t(X)
    # without forming the full n x n product.
    X_norm <- mean(sqrt(rowSums(as.matrix(X.train)^2)))
    return(X_norm * diag(p))
  }

  if (is.null(p_0)) {
    # Backward compatibility: fall back to a p_0 defined outside the function.
    enclosure <- parent.env(environment())
    if (!exists("p_0", envir = enclosure)) {
      stop("`p_0` must be supplied when `var_sel` is FALSE.", call. = FALSE)
    }
    p_0 <- get("p_0", envir = enclosure)
  }

  identity_p <- diag(p)
  point1 <- identity_p[1, , drop = FALSE]
  point2 <- identity_p[p_0, , drop = FALSE]
  point3 <- identity_p[p, , drop = FALSE]
  point4 <- identity_p[1, , drop = FALSE] - identity_p[3, , drop = FALSE]

  # Coordinates kept in each random test point. All draws are made up front,
  # in row order, so results for a given seed do not change.
  te.spar.id <- lapply(seq_len(n_test), function(i) sample.int(p, spar))

  X.test.end <- matrix(0, nrow = n_test, ncol = p)
  colnames(X.test.end) <- colnames(X.test.star)
  for (i in seq_len(n_test)) {
    # Zero every coordinate that was not drawn. setdiff() gives positive
    # indices, so spar == 0 correctly zeroes the whole row (a negated empty
    # index would select nothing).
    sparse_row <- X.test.star[i, ]
    sparse_row[setdiff(seq_len(p), te.spar.id[[i]])] <- 0

    # Re-centre using the column means of X.test.star in which only row i has
    # been sparsified; each row is centred independently of the other
    # sparsified rows, and is generally no longer sparse afterwards.
    with_sparse_row <- X.test.star
    with_sparse_row[i, ] <- sparse_row
    X.test.end[i, ] <- sparse_row - apply(with_sparse_row, 2, mean)
  }

  rbind(point1, point2, point3, point4, X.test.end)
}