###############################################################
# Code for paper 
# Dynamic COVID risk assessment accounting for community virus exposure from a spatial-temporal transmission model

# This file contains code to simulate data based on real NYC COVID-19 data
###############################################################




library(tidyverse)
library(mgcv)
n_area = 176
n_time = 160

##### Prepare X and neighborhood matrix #####
# read in observed case
case_df = read_csv("nyc_case_by_zip.csv", skip = 1, col_names = c("zipcode", "case", "date", "borough")) 
nyc_zip = unique(case_df$zipcode)
nyc_zip = nyc_zip[1:176]
mean(nyc_zip == sort(nyc_zip)) # increasing 

# unchd_X: read in SVI
social_df = read_csv("social_dat.csv") %>% janitor::clean_names()

NYC_social_df = social_df %>%
  filter(modzcta %in% nyc_zip) %>%
  select(modzcta, e_minrty) %>%
  arrange(modzcta)

# only use unchanged X, so one file to save it

# if use both time changing X and time constant X
# # combine X
# totalX2 = list()
# n_time = 154
# for (ix in 1:n_time){
#   temp = cbind(NYC_social_df[,2], simul_chdX[,ix])
#   totalX2[[ix]] = temp
# }
# 
# for (ix in 1:n_time){
#   write_csv(as.data.frame(totalX2[[ix]]), paste0("coMatrix/cov_", ix, ".csv"))
# }

# read in neighborhood matrix
nyc_Nei = read.table("nyc_Nei.txt", header = FALSE )

##### simul_STmodel() function#####
# some functions
## survival function, t can be a vector
Surv <- function(t, mean = 5.2){
  Sm = ifelse(t < 21, (exp(-t/mean) - exp(-21/mean))/(1 - exp(-21/mean)), 0)
  return(Sm)
}

## function to calculate N(t) based on M(t-1), Y(t-1), a(t-1) 
Nt_MYam1 <- function(atm1, Mtm1, Ytm1){ 
  # atm1: a(t-1) vector, length of n_area
  Nt = as.numeric(atm1)*as.numeric(Mtm1 - Ytm1) # N(t): vector, length of n_area
  return(Nt)
}

## function to calculate M(t), Y(t) based on N(t), N(t-1), ..., N(1)
MYt_Ntb <- function(Nt_b, t, t0=1, C1 = 21){ 
  # Nt_b, a matrix: N(t) from all the time points at t and before t, 
  # t0, a number: first infected case(patient zero)
  # C1: max stay infectious days
  C = min(t - t0, C1 )
  Sm = Surv((0:C)) # S(m) for m=0:C
  Smp1 = Surv((1:(C + 1))) # S(m+1) for m=0:C
  
  Mt = t(Nt_b[t - (0:C),]) %*% Sm # vector, length of n_area
  Yt = t(Nt_b[t - (0:C),]) %*% (Sm - Smp1) # vector, length of n_area
  
  return(cbind(Mt, Yt))
}

## function to calculate a_bar(t) given N(t), Y(t-1), M(t-1)
atbar_MN <- function(Nt, Ytm1, Mtm1){
  # Nt, Ytm1, Mtm1: vector, length of n_area, Y, M and N at time t-1 or t
  atbar = sum(Nt)/sum(Mtm1 - Ytm1)
  return(atbar) # a number
}

## function to calculate E(t) from M(t-1), Y(t-1), abar(t-1)
Et_MYabar <- function(Mtm1, Ytm1, abar){
  Et = (Mtm1 - Ytm1)*abar
  return(Et)
}  

## function to generate H matrix based on E(t) and neighborhood matrix G for Sigma_t
Hmatrix <- function(E_t, G){
  n_area = ncol(G)
  H0 = matrix(0, nrow = n_area, ncol = n_area)
  for (i in 1:(n_area - 1)){
    for (j in (i + 1):n_area){
      if (G[i,j] == 1){
        H0[i,j] = sqrt((E_t[j])/(E_t[i])) # add 1 in case N=0?
        H0[j,i] = sqrt((E_t[i])/(E_t[j]))
      }
    }
  }
  return(H0)
}

# !Note: Z is log(lambda), not log(a) 
## generate lambda(t) from E(t), Z(t-1) and Xbeta, rho, tau, Gnei
Zt_EZtm1 <- function(t, Et, X, Ztm1, beta, beta0, alpha, tau, rho, G_nei){
  # t: time ix for Zt
  # Et: Et for n_area areas 
  # X: covariate matrix at time t
  # Ztm1: log infect rate at (t-1) 
  # here beta is only a vector is only one covariate
  # beta(vector), beta0(vector), alpha(vector), tau(vector), rho(vector): parameters for all time points, it will use t to extract the corresponding time para.
  # beta: effect for covariate like demographics, SVI...
  # alpha: effect for log(a_{t-1})
  n_area = dim(G_nei)[1]
  com_X = cbind(X, Ztm1, 1) # combine covariate, Z(t-1), and constant col ( so X can be a vector, only 1 covariate or X can be a matrix with more than 1 covariate)
  com_beta = c(beta[t],alpha[t], beta0[t])
  mu = as.numeric(as.matrix(com_X) %*% com_beta)
  Sigma = (tau[t]^2)*solve(diag(n_area) - rho[t]*Hmatrix(Et, G_nei)) %*% diag(1/Et) 
  Zt = rmvn(1,mu, Sigma)
  return(Zt)
}


# Wrap up
simul_STmodel <- function(n_time, n_area, Xmatrix, beta, beta0, alpha, tau, rho, G_nei, t0=1){
  # Xmatrix not changing, so not use list here
  # no incub argument, n_time should include incubation
  
  # initialize
  Z = matrix(nrow = n_time, ncol = n_area)
  a = matrix(nrow = n_time, ncol = n_area)
  Et = matrix(nrow = n_time, ncol = n_area)
  N = matrix(nrow = n_time, ncol = n_area)
  M = matrix(nrow = n_time, ncol = n_area)
  Y = matrix(nrow = n_time , ncol = n_area)
  abar = rep(NA, n_time)
  
  # calculate t=1
  N[1,] = rep(1, n_area) # patient 0 appears at time 1
  M[1,] = Surv(0)*N[1,] # for time 1, C = t-t0=0
  Y[1,] = (Surv(0) - Surv(1))*N[1,] # for time 1, C = t-t0=0
  # N0 = rep(0, n_area) # no patient infected at time 0
  Z0 = rep(0,n_area) # assume at equals across area, so lambda(t)=0, Z0=0 at time 0 (thus no effect on Z1)
  Et[1,] = 1 # set expected number of infections is 1 at t=1
  Ztemp = Zt_EZtm1(1, Et[1,], Xmatrix, Z0, beta, beta0, alpha, tau, rho, G_nei) # doesn't matter what is Xmatrix, as beta for first14 days are 0
  
  Z[1,] = Ztemp # a_i(t) always varies around 1, so Z_i[t] always varies around 0
  a[1,] = exp(Ztemp)*0.2 # assume abar[0] are 0.2
  abar[1] = mean(a[1,]) # M0, Y0 are all 0, so can't use the formula to calculate
  
  for (ix in 2:n_time){
    N[ix,] = Nt_MYam1(a[(ix - 1),], M[(ix - 1),], Y[(ix - 1),]) #calculate N(t) based on M(t-1), Y(t-1), a(t-1) 
    temp = MYt_Ntb(N[(1:ix),], ix)
    M[ix,] = temp[,1]
    Y[ix,] = temp[,2]
    abar[ix] = atbar_MN(N[ix,], Y[ix - 1,], M[ix - 1,])
    Et[ix, ] = Et_MYabar(M[ix - 1,], Y[ix - 1,], abar[ix - 1])
    Ztemp = Zt_EZtm1(ix, Et[ix,], Xmatrix, Z[(ix - 1),], beta, beta0, alpha, tau, rho, G_nei)
    Z[ix,] = ifelse(Ztemp > 2, 2, Ztemp) # truncate at 2 in case explode
    a[ix,] = exp(Z[ix,])*abar[ix - 1]
    abar[ix] = mean(a[ix,])
  }
  return(list(Z, N, M, Y, a, abar, Et))
}

##### setting parameter, simulate data #####

# parameter similar to estimated on real data

n_area = 176
n_time = 160 # counting begin at patient 0 infected
n_cov = 1 

totalX_simul = as.matrix(NYC_social_df$e_minrty - mean(NYC_social_df$e_minrty)) # centered X
write.table(totalX_simul, "Xmatrix.txt", col.names = FALSE, row.names = FALSE) # prepare data_X for model 
# lambda(t) should be around 1, so Z(t) should be around 0
beta = c(rep(c(0, 0.06, 0.06, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01), each = 14), rep(0,6))
beta0 = c(rep(c(0.05, 0, 0, -0.05, -0.05, -0.05, -0.01, -0.01, -0.01, -0.01, -0.01), each = 14) , rep(-0.01,6))
alpha = rep(0, 160)
tau = c(rep(c(0.04, 0.08, 0.08, 0.04, 0.04, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01), each = 14), rep(0.01,6))
rho = rep(0.12, 160) 
para_mat_true = cbind(beta0, beta, tau, rho)

simul_data = simul_STmodel(n_time, n_area, totalX_simul, beta, beta0, alpha, tau, rho, nyc_Nei)
simul_Z = simul_data[[1]]
simul_N = simul_data[[2]]
simul_M = simul_data[[3]]
simul_Y = simul_data[[4]]
simul_a = simul_data[[5]]
simul_abar = simul_data[[6]]
simul_Et = simul_data[[7]]

### check range of simulated data
par(mfrow=c(1,1))
plot(1:160,simul_abar, type = "l") 
mean(as.numeric(simul_Z) == 2)
quantile((as.numeric(simul_Z)))
quantile(exp((as.numeric(simul_Z))))
quantile(as.numeric(simul_Y))
apply(exp(simul_Z), 2, max)
apply(exp(simul_Z), 2, min)
apply(simul_Y, 2, max)
plot(x = 1:160, y = as.numeric(simul_Y[,1]), type = "l")
# check simulated Y
par(mfrow = c(4,4))
for (i in 1:176){
  plot(x = 1:160, y = simul_Y[,i], type = "l", main = paste0("region ix:", i))
}
# check simulated a
par(mfrow = c(4,4))
for (i in 1:176){
  plot(x = 1:160, y = simul_a[,i], type = "l", main = paste0("region ix:", i))
}

# set parameter
beta = c(rep(c(0, 0.06, 0.06, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01), each = 14), rep(0,6))
beta0 = c(rep(c(0.05, 0, 0, -0.05, -0.05, -0.05, -0.01, -0.01, -0.01, -0.01, -0.01), each = 14) , rep(-0.01,6))
alpha = rep(0, 160)
tau = c(rep(c(0.02, 0.02, 0.02, 0.02, 0.02, 0.04, 0.04, 0.06, 0.06, 0.06, 0.06), each = 14), rep(0.06,6))
rho = rep(0.12, 160) 
para_mat_tru = cbind(beta0, beta, tau, rho)
# regenerate if Z or Y is in unreasonable range
for (ix in 1:100){
  simul_data = simul_STmodel(n_time, n_area, totalX_simul, beta, beta0, alpha, tau, rho, nyc_Nei)
  simul_Z = simul_data[[1]]
  simul_N = simul_data[[2]]
  simul_M = simul_data[[3]]
  simul_Y = simul_data[[4]]
  simul_a = simul_data[[5]]
  simul_abar = simul_data[[6]]
  simul_Et = simul_data[[7]]
  a = round(mean(as.numeric(simul_Z) == 2), 3) # check proportion of extreme Z
  b = round(range(apply(simul_Y, 2, max))) # check range of peak Y in each area
  print(paste0("ix: ", ix, "; Z=2: ",  a, "; Y range: ", b[1], ", ", b[2]))
  
  setwd("simul_data")
  path = paste0("data", ix)
  dir.create(path)
  setwd(path)
  write.table(simul_Y, "simul_Y.txt", row.names = FALSE, col.names = FALSE )
  write.table(simul_a, "simul_a.txt", row.names = FALSE, col.names = FALSE )
  write.table(simul_Et, "simul_E.txt", row.names = FALSE, col.names = FALSE )
  write.table(para_mat_true, "para_mat.txt", row.names = FALSE, col.names = FALSE )
  write.table(simul_N, "simul_N.txt", row.names = FALSE, col.names = FALSE )
  write.table(simul_Z, "simul_Z.txt", row.names = FALSE, col.names = FALSE )
  write.table(simul_M, "simul_M.txt", row.names = FALSE, col.names = FALSE )
  
  path2 = paste0("../dataY/", ix, ".txt")
  write.table(simul_Y, path2, row.names = FALSE, col.names = FALSE )
  
  path3 = paste0("../initial_a/", ix, ".txt")
  write.table(simul_a, path3, row.names = FALSE, col.names = FALSE )
}

