#### Setup ####

library(tidyverse)
library(ncdf4)
library(Matrix)
#also need cdo installed for command line https://code.mpimet.mpg.de/projects/cdo/wiki

#make sure these are set to the correct locations
#project_dir: folder holding 00_functions.R; the sliced output folder is created beneath it
#data_dir: root of the input data (the era5/, ncep/, gpcp/, cmip5* and cmip6* folders read below)
#both need a trailing slash because every path in this script is assembled with paste0
project_dir = '/mnt/r/wasserstein/'
data_dir = '/mnt/r/historical/'
#load the project helper functions
source(paste0(project_dir,'00_functions.R'))


#### Grid approximation parameters ####

#kernel range parameter: the desired kernel radius in km
range_km = 1000

#resolution of the regular grid that every climate field is resized to
n_lat_up = 361
n_long_up = 720

#longitudes start at 0 and stop one step short of 360;
#latitudes run from +90 down to -90 and include both poles
up_longs = seq(0,360-360/n_long_up,360/n_long_up)
up_lats = rev(seq(-90,90,180/(n_lat_up-1)))

#stride length for the strided convolution
stride = 6

#offset chosen so that convo_lats is symmetric around the equator
offset = 2

#kernel centres: every stride-th point of the upsampled grid
#(integer division drops the incomplete final stride)
convo_lats = up_lats[seq_len(n_lat_up %/% stride)*stride - offset]
convo_longs = up_longs[(seq_len(n_long_up %/% stride) - 1)*stride + 1]

n_long_convo = length(convo_longs)
n_lat_convo = length(convo_lats)

#directory name to save results (encodes the kernel radius and stride used)
slice_dir = paste0(project_dir,'sliced_',range_km,'km_',stride,'stride/')

#### Setup slicing method ####

#create folders to store sliced data.
#Each folder is created individually and recursively, so a partially created tree
#(e.g. from an interrupted earlier run) is completed instead of being skipped;
#folders that already exist are left untouched.
for(sub_dir in c('tas/weights','tas/cmip5','tas/cmip6',
                 'pr/weights','pr/cmip5','pr/cmip6')){
  dir.create(paste0(slice_dir,sub_dir),recursive = TRUE,showWarnings = FALSE)
}
rm(sub_dir)

#calculate coordinate grids and convert to radians
#expand.grid varies dlat fastest, so every vector over a grid is ordered
#latitude-within-longitude (all latitudes of the first longitude, then the second, ...)
image_coords = expand.grid(dlat = up_lats, dlong = up_longs)
image_grid = (pi/180)*image_coords

convo_coords = expand.grid(dlat = convo_lats, dlong = convo_longs)
saveRDS(convo_coords,paste0(slice_dir,'convo_coords.RDS'))
convo_grid = (pi/180)*convo_coords

#latitude area weights for image
lat_w = cos(image_grid$dlat)

#store kernel convolution matrices:
#one row per upsampled grid cell, one column per kernel centre (slice)
n_slice = nrow(convo_grid)
n_up = nrow(image_coords)
convo_matrix = Matrix::Matrix(0,nrow = n_lat_up*n_long_up,ncol = n_slice,sparse = TRUE)

#Reference implementation (slow): evaluate every kernel directly
#convo_matrix = matrix(0,n_lat_up*n_long_up,n_slice)
# for(i in 1:n_slice){
#   #calculate chordal distance and final kernel for the basis
#   cd = distChordal(convo_grid[i,], image_grid)
#   kernel = lat_w*wendland_kernel(cd, range_km)
#   convo_matrix[,i] = kernel/sum(kernel) #sum to 1 to keep data scale
# }

# Precompute the kernels at each latitude to save computation.
# Rows 1..n_lat_convo of convo_grid are the kernel latitudes at the first kernel
# longitude, so column k of kernels is the kernel for latitude k at that longitude.
kernels = sapply(seq_len(n_lat_convo), function(i){
  cd = distChordal(convo_grid[i,], image_grid)
  kernel = lat_w*wendland_kernel(cd, range_km)
  kernel/sum(kernel) #sum to 1 to keep data scale
})
for(i in seq_len(n_slice)){
  #lat (1-based) and lon (0-based) indices of slice i on the convolution grid.
  #Exact integer arithmetic: the former floor(i/n_lat_convo-0.01) returned the
  #wrong longitude as soon as n_lat_convo exceeded 100 (e.g. with a smaller stride).
  lat = (i-1)%%n_lat_convo + 1
  lon = (i-1)%/%n_lat_convo

  #choose kernel shape based on latitude
  kernel = kernels[,lat]
  if(lon==0){
    convo_matrix[,i] = kernel
  }else{
    #if not at the first longitude, need to shift the kernel over:
    #rotate the flattened image by lon*stride longitude columns (n_lat_up cells each),
    #wrapping around the end of the grid
    shift = c((n_up-lon*stride*n_lat_up+1):n_up,
              1:(n_up-lon*stride*n_lat_up))
    convo_matrix[,i] = kernel[shift]
  }
}

saveRDS(convo_matrix,paste0(slice_dir,'convo_matrix.RDS'))

#clean up, but keep convo_matrix in memory for slicing
rm(kernels,shift,lat_w,image_grid,image_coords,i,up_lats,up_longs,convo_grid)
gc()

#test convo_matrix (1st location should be centered in top edge, last in bottom edge)
#each column is reshaped back onto the n_lat_up x n_long_up image grid for plotting.
#NOTE(review): image.spatial is presumably defined in 00_functions.R, and "/" presumably
#stacks the two plots (patchwork-style) - confirm
image.spatial(matrix(convo_matrix[,1],n_lat_up,n_long_up))/
  image.spatial(matrix(convo_matrix[,n_slice],n_lat_up,n_long_up))

#two further slices picked by hard-coded column index.
#NOTE(review): 1008 and 1990 are only valid while n_slice >= 1990 - revisit if the stride changes
image.spatial(matrix(convo_matrix[,1008],n_lat_up,n_long_up))/
  image.spatial(matrix(convo_matrix[,1990],n_lat_up,n_long_up))


#### Daily Average Surface Air Temperature (TAS) ####

#### Slicing ERA5 ####

#use cdo to create our upsampling grid and save the template for later
#(gennn output is read back below for its src_address/dst_address variables)
system(paste0('cdo gennn,r',n_long_up,'x',n_lat_up,' ',
              data_dir,'era5/tas/hourly/era5_tas_hourly_1979_01.nc ',
              slice_dir,'tas/weights/era5_tas_hourly_1979_01.nc'))

#set wd to location of daily files from 02_hourly_to_daily.R
setwd(paste0(data_dir,'era5/tas/daily/'))
Sys.sleep(1)

#calculate # of days ending in November, 2005 since each file has a month of data
n_days = length(seq(as.Date('1979-01-01'),as.Date('2005-11-30'),by='1 day'))

#array for the full sequence of convolved ERA5 values (day, lat, long)
era5_sliced = array(NA,dim=c(n_days,n_lat_convo,n_long_convo))

#prep upsampling indices: upsampled cell era5_dst[k] is filled from source cell era5_src[k]
era5_nn = nc_open(paste0(slice_dir,'tas/weights/era5_tas_hourly_1979_01.nc'))
era5_src = ncvar_get(era5_nn,'src_address')
era5_dst = ncvar_get(era5_nn,'dst_address')
nc_close(era5_nn)

#t is the first unfilled day of era5_sliced
t = 1
#sort by year and subset files we need
#(323 monthly files, presumably January 1979 through November 2005)
era5_files = sort(list.files())[1:323]
for(f in era5_files){
  #open daily file
  era5_temp = aperm(readRDS(f),3:1) #lon, lat, day

  #flatten spatial dimension for indexing
  d = dim(era5_temp)
  dim(era5_temp) = c(d[1]*d[2],d[3])

  #index using the source and destination addresses from cdo
  era5_up = array(0,dim =c(n_lat_up*n_long_up,d[3]))
  era5_up[era5_dst,] = era5_temp[era5_src,]

  #switch the time and space dimensions to match the convo matrix
  dim(era5_up) = c(n_long_up,n_lat_up,d[3])
  era5_up = aperm(era5_up,3:1)
  era5_up = era5_up[,n_lat_up:1,] #flip latitude dimension
  dim(era5_up) = c(d[3],n_long_up*n_lat_up)

  #Compute all kernel convolutions
  era5_subset = as.matrix(era5_up%*%convo_matrix)
  rm(era5_up)
  gc()

  #reshape to the approximation grid and save
  dim(era5_subset) = c(d[3], n_lat_convo, n_long_convo)
  era5_sliced[t:(t + d[3] - 1),,] = era5_subset
  t = t + d[3]
  rm(era5_subset)
  gc()
}

saveRDS(era5_sliced,paste0(slice_dir,'tas/era5.RDS'))

#test that first and last day are looking good
image.spatial(era5_sliced[1,,])/
  image.spatial(era5_sliced[n_days,,])

#clean up, remove all era5 objects from memory
#(era5_up and era5_subset were already removed inside the loop, so they are not
# listed here; naming them again only produced "object not found" warnings)
rm(era5_sliced,era5_temp,era5_files,f,t,d,n_days,era5_dst,era5_src,era5_nn)
gc()



#### Slicing NCEP ####

#list the NCEP temperature files
setwd(paste0(data_dir,'ncep/tas/'))
Sys.sleep(1)
#keep the first 27 files in sorted order (presumably one file per year, 1979-2005)
ncep_files = sort(list.files())[1:27]

#build the cdo remapping template from the first file's grid and keep it for later
system(paste0('cdo gennn,r',n_long_up,'x',n_lat_up,' ',
              data_dir,'ncep/tas/',ncep_files[1],' ',
              slice_dir,'tas/weights/',ncep_files[1]))

#resizing indices: upsampled cell ncep_dst[k] is filled from source cell ncep_src[k]
ncep_nn = nc_open(paste0(slice_dir,'tas/weights/',ncep_files[1]))
ncep_dst = ncvar_get(ncep_nn,'dst_address')
ncep_src = ncvar_get(ncep_nn,'src_address')
nc_close(ncep_nn)

#files are read whole, so allocate through the end of 2005 and trim afterwards
n_days = length(seq(as.Date('1979-01-01'),as.Date('2005-12-31'),by='1 day'))
ncep_sliced = array(NA,dim=c(n_days,n_lat_convo,n_long_convo))

#t is the first unfilled day of ncep_sliced
t = 1
for(f in ncep_files){
  #read the file and convert to celsius
  ncep_nc = nc_open(f)
  ncep_temp = ncvar_get(ncep_nc,'air') - 273.15
  nc_close(ncep_nc)

  #collapse the two spatial dimensions into one cell dimension
  d = dim(ncep_temp)
  dim(ncep_temp) = c(d[1]*d[2],d[3])

  #fill the upsampled grid through the cdo addresses
  ncep_up = array(0,dim =c(n_lat_up*n_long_up,d[3]))
  ncep_up[ncep_dst,] = ncep_temp[ncep_src,]

  #put time first, flip the latitude dimension, then flatten space to match convo_matrix
  dim(ncep_up) = c(n_long_up,n_lat_up,d[3])
  ncep_up = aperm(ncep_up,3:1)[,n_lat_up:1,]
  dim(ncep_up) = c(d[3],n_long_up*n_lat_up)

  #apply every kernel at once, then reshape to the approximation grid
  ncep_subset = as.matrix(ncep_up%*%convo_matrix)
  dim(ncep_subset) = c(d[3], n_lat_convo, n_long_convo)

  #store this file's days and advance the write position
  ncep_sliced[seq(t, length.out = d[3]),,] = ncep_subset
  t = t + d[3]
}

#drop December 2005 so the series ends on 2005-11-30
pre_dec_2005 = length(seq(as.Date('1979-01-01'),as.Date('2005-11-30'),by='1 day'))
ncep_sliced = ncep_sliced[seq_len(pre_dec_2005),,]

#test: first and last retained day
(image.spatial(ncep_sliced[1,,])/
    image.spatial(ncep_sliced[pre_dec_2005,,]))

saveRDS(ncep_sliced,paste0(slice_dir,'tas/ncep.RDS'))

#clean up, remove all ncep objects from memory
rm(ncep_sliced,ncep_up,ncep_temp,ncep_subset,ncep_files,f,t,d,n_days,pre_dec_2005,ncep_src,ncep_dst,ncep_nc,ncep_nn)
gc()



#### Slicing CMIP5 ####

setwd(paste0(data_dir,'cmip5tas'))
Sys.sleep(1)

#start date and end date of the study period as yyyymmdd numbers
#(the reason for ending in November is shown a few lines below)
sd = 19790101
ed = 20051130

#file names are underscore-separated metadata fields: one data frame row per file
ncs = sort(list.files())
meta5 = do.call(rbind.data.frame,strsplit(ncs,"_"))
names(meta5) = c('variable','frequency','model','experiment','ensemble','years')

#split the date range into start/end and tag each file with its temporal contents for filtering
meta5 = meta5 %>%
  separate(years,c('start','end')) %>%
  mutate(start = as.numeric(start),
         end = as.numeric(end)) %>%
  mutate(pre1979 = start<=sd,
         post2005 = end>=ed,
         contains_full = pre1979 & post2005,
         out_of_range = (start>ed)|(end<sd),
         has200511 = (end>=ed)&(start<=ed))

#Show the overall date range of every model, earliest end date first
meta5 %>% group_by(model) %>% summarise(start = min(start),end = max(end)) %>% arrange(end)
#Two models, HadGEM2-CC and HadGEM2-ES, end in November 2005, so we'll take that as our end date

#attach the file names (rows are still in ncs order), sort by model and start date,
#and drop files that contain no relevant dates
meta5 = meta5 %>%
  mutate(file_name = ncs) %>%
  arrange(model,start) %>%
  dplyr::filter(!out_of_range)

#gather grid size and calendar for every file (preallocated, one entry per row of meta5)
n_files = nrow(meta5)
lon_res = rep(NA_integer_,n_files)
lat_res = rep(NA_integer_,n_files)
calendar = rep(NA_character_,n_files)
last_model='none'

for(i in seq_len(n_files)){
  curr_model = meta5$model[i]
  mod = nc_open(meta5$file_name[i])
  calendar[i] = mod$dim$time$calendar
  lon_res[i] = mod$dim$lon$len
  lat_res[i] = mod$dim$lat$len
  nc_close(mod)

  #use cdo to create our upsampling grid and save the template for later
  #(once per model, from the first file of that model)
  if(curr_model!=last_model){
    cdo_status = system(paste0('cdo gennn,r',n_long_up,'x',n_lat_up,' -selvar,tas -seltimestep,1 ',
                               data_dir,'cmip5tas/',meta5$file_name[i],' ',
                               slice_dir,'tas/weights/',meta5$file_name[i]),ignore.stderr = TRUE)
    #stderr is suppressed above, so surface a failure here instead of
    #discovering a missing weights file later in the slicing loop
    if(cdo_status != 0){
      warning('cdo gennn failed for ',meta5$file_name[i],' (exit status ',cdo_status,')')
    }
  }

  last_model = curr_model
}

#final metadata data frame
meta5$lon_res = lon_res
meta5$lat_res = lat_res
meta5$calendar = calendar
#keep1/keep2: first and last time index to read from each file, found by matching the
#study start/end dates (sd, ed) against the file's dates
#NOTE(review): ymd_range is a project helper; presumably it returns the yyyymmdd dates
#from start to end under the given calendar - confirm
meta5 = meta5 %>%
  rowwise() %>%
  mutate(keep1 = ifelse(pre1979,match(sd,ymd_range(start,end,calendar)),1),
         keep2 = ifelse(post2005,match(ed,ymd_range(start,end,calendar)),length(ymd_range(start,end,calendar))))

#save one row of grid metadata per model
meta5 %>% 
  dplyr::select(variable,model,lon_res,lat_res,calendar) %>% 
  group_by(model) %>%
  slice(1) %>%
  saveRDS(file = paste0(slice_dir,'tas/meta5.RDS'))

#clean up before processing the output
rm(lon_res,lat_res,calendar,sd,ed,i,ncs,n_files,cdo_status)
gc()

#length of the study period under each calendar used by the models
n_days = length(ymd_range(19790101,20051130, calendar = 'standard'))
n_days_365 = length(ymd_range(19790101,20051130, calendar = '365_day'))
n_days_360 = length(ymd_range(19790101,20051130, calendar = '360_day'))

#load projection matrices
convo_matrix = readRDS(paste0(slice_dir,'convo_matrix.RDS'))
last_model = "none"

#loop over files: open, upsample, project, save to the array.
#A model's files are accumulated in mod_sliced, which is written to disk when the
#next model starts (the final model is written after the loop).
for(i in seq_len(nrow(meta5))){

  print(i)

  #read in model output
  curr_model = meta5$model[i]
  mod = nc_open(meta5$file_name[i])

  #if it's a new model, set some parameters
  if(curr_model != last_model){
    if(last_model != 'none'){
      #clean up last loop
      saveRDS(mod_sliced,paste0(slice_dir,'tas/cmip5/',last_model,'.RDS'))
      rm(mod_sliced)
      gc()
    }

    last_model = curr_model
    t_offset = 0 #days of this model already written to mod_sliced

    #size the output by the model's calendar
    if(mod$dim$time$calendar %in% c('noleap','365_day')){
      mod_sliced = array(NA,dim=c(n_days_365,n_lat_convo,n_long_convo))
    }else if(mod$dim$time$calendar == '360_day'){
      mod_sliced = array(NA,dim=c(n_days_360,n_lat_convo,n_long_convo))
    }else{
      mod_sliced = array(NA,dim=c(n_days,n_lat_convo,n_long_convo))
    }

    #indexes for quick resizing: upsampled cell mod_dst[k] is filled from source cell mod_src[k]
    mod_nn = nc_open(paste0(slice_dir,'tas/weights/',meta5$file_name[i]))
    mod_src = ncvar_get(mod_nn,'src_address')
    mod_dst = ncvar_get(mod_nn,'dst_address')
    #close the weights file before dropping the handle (rm alone leaves the file open)
    nc_close(mod_nn)
    rm(mod_nn)
    gc()
  }

  #read in the days keep1..keep2, convert to celsius, flatten spatial dimension for indexing
  mod_temp = ncvar_get(mod, "tas", c(1,1,meta5$keep1[i]), c(-1,-1,1+meta5$keep2[i]-meta5$keep1[i])) - 273.15 #long, lat, day
  d = dim(mod_temp)
  dim(mod_temp) = c(d[1]*d[2],d[3])

  #some of these files get big... take only a year at a time
  interval = 366
  first = seq(1,d[3],by=interval)
  last = interval*(1:length(first))
  last[length(last)]=d[3] #final chunk ends at the last day read

  #iterate over subsets, convolve, save results
  for(j in seq_along(first)){
    #time indices of the subset
    t = first[j]:last[j]
    nt = length(t)

    #create upsampled array and query indices for resizing
    mod_up = array(0,dim =c(n_lat_up*n_long_up,nt))
    mod_up[mod_dst,] = mod_temp[mod_src,t]

    #switch the time and space dimensions to match the convo matrix
    dim(mod_up) = c(n_long_up,n_lat_up,nt)
    mod_up = aperm(mod_up,3:1)
    mod_up = mod_up[,n_lat_up:1,] #flip latitude dimension
    dim(mod_up) = c(nt,n_long_up*n_lat_up) #day, lat, long

    #Compute all kernel convolutions
    mod_subset = as.matrix(mod_up%*%convo_matrix)
    rm(mod_up)
    gc()

    #save to the sliced matrix
    dim(mod_subset) = c(nt, n_lat_convo, n_long_convo)
    mod_sliced[t_offset + t, , ] = mod_subset
    rm(mod_subset)
    gc()
  }

  #set offset for next loop
  t_offset = t_offset + d[3]
  rm(mod_temp)
  nc_close(mod)
  gc()
}
#write the final model, which no later iteration flushes
saveRDS(mod_sliced,paste0(slice_dir,'tas/cmip5/',curr_model,'.RDS'))

#test first and last
image.spatial(mod_sliced[1,,])/
  image.spatial(mod_sliced[t_offset,,])

rm(mod_sliced,mod,t_offset,t,d,first,last,interval,curr_model,last_model)
gc()



#### Slicing CMIP6 ####

setwd(paste0(data_dir,'cmip6tas'))
Sys.sleep(1)

#start date and end date of the study period as yyyymmdd numbers
sd = 19790101
ed = 20051130

#file names are underscore-separated metadata fields: one data frame row per file
#(CMIP6 names carry an extra 'grid' field)
ncs = sort(list.files())
meta6 = do.call(rbind.data.frame,strsplit(ncs,"_"))
names(meta6) = c('variable','frequency','model','experiment','ensemble','grid','years')

#split the date range into start/end and tag each file with its temporal contents for filtering
meta6 = meta6 %>%
  separate(years,c('start','end')) %>%
  mutate(start = as.numeric(start),
         end = as.numeric(end)) %>%
  mutate(pre1979 = start<=sd,
         post2005 = end>=ed,
         contains_full = pre1979 & post2005,
         out_of_range = (start>ed)|(end<sd),
         has200511 = (end>=ed)&(start<=ed))

#Show the overall date range of every model, earliest end date first
meta6 %>% group_by(model) %>% summarise(start = min(start),end = max(end)) %>% arrange(end)
#Two models, HadGEM2-CC and HadGEM2-ES, end in November 2005, so we'll take that as our end date
#NOTE(review): this remark matches the CMIP5 section word for word - confirm it applies to CMIP6

#attach the file names (rows are still in ncs order), sort by model and start date,
#and drop files that contain no relevant dates
meta6 = meta6 %>%
  mutate(file_name = ncs) %>%
  arrange(model,start) %>%
  dplyr::filter(!out_of_range)

#overlapping model dates, need to end early to compensate
#(hard-coded end dates for the CESM2-WACCM-FV2 files that remain after filtering)
meta6$end[which(meta6$model=='CESM2-WACCM-FV2')] = c(19791211,19891221,19991231,20100110)

#gather grid size and calendar for every file (preallocated, one entry per row of meta6)
n_files = nrow(meta6)
lon_res = rep(NA_integer_,n_files)
lat_res = rep(NA_integer_,n_files)
calendar = rep(NA_character_,n_files)
last_model='none'

for(i in seq_len(n_files)){
  curr_model = meta6$model[i]
  mod = nc_open(meta6$file_name[i])
  calendar[i] = mod$dim$time$calendar
  #ICON-ESM-LR is special-cased: its lon/lat resolution is recorded as NA
  if(curr_model != "ICON-ESM-LR"){
    lon_res[i] = mod$dim$lon$len
    lat_res[i] = mod$dim$lat$len
  }
  nc_close(mod)

  #use cdo to create our upsampling grid and save the template for later
  #(once per model, from the first file of that model)
  if(curr_model!=last_model){
    cdo_status = system(paste0('cdo gennn,r',n_long_up,'x',n_lat_up,' -selvar,tas -seltimestep,1 ',
                               data_dir,'cmip6tas/',meta6$file_name[i],' ',
                               slice_dir,'tas/weights/',meta6$file_name[i]),ignore.stderr = TRUE)
    #stderr is suppressed above, so surface a failure here instead of
    #discovering a missing weights file later in the slicing loop
    if(cdo_status != 0){
      warning('cdo gennn failed for ',meta6$file_name[i],' (exit status ',cdo_status,')')
    }
  }

  last_model = curr_model
}


#final metadata data frame
meta6$calendar = calendar
meta6$lon_res = lon_res
meta6$lat_res = lat_res

#keep1/keep2: first and last time index to read from each file, found by matching the
#study start/end dates (sd, ed) against the file's dates
#NOTE(review): ymd_range is a project helper; presumably it returns the yyyymmdd dates
#from start to end under the given calendar - confirm
meta6 = meta6 %>%
  rowwise() %>%
  mutate(keep1 = ifelse(pre1979,match(sd,ymd_range(start,end,calendar)),1),
         keep2 = ifelse(post2005,match(ed,ymd_range(start,end,calendar)),length(ymd_range(start,end,calendar))))

#save one row of grid metadata per model
meta6 %>% 
  dplyr::select(variable,model,lon_res,lat_res,calendar) %>% 
  group_by(model) %>%
  slice(1) %>%
  saveRDS(file = paste0(slice_dir,'tas/meta6.RDS'))

#clean up before processing the output
rm(lon_res,lat_res,calendar,sd,ed,i,ncs,n_files,cdo_status)
gc()


#length of the study period under each calendar used by the models
n_days = length(ymd_range(19790101,20051130, calendar = 'standard'))
n_days_365 = length(ymd_range(19790101,20051130, calendar = '365_day'))
n_days_360 = length(ymd_range(19790101,20051130, calendar = '360_day'))

#load projection matrices
convo_matrix = readRDS(paste0(slice_dir,'convo_matrix.RDS'))
last_model = "none"

#loop over files: open, upsample, project, save to the array.
#A model's files are accumulated in mod_sliced, which is written to disk when the
#next model starts (the final model is written after the loop).
for(i in seq_len(nrow(meta6))){

  print(i)

  #read in model output
  curr_model = meta6$model[i]
  mod = nc_open(meta6$file_name[i])

  #if it's a new model, set some parameters
  if(curr_model != last_model){
    if(last_model != 'none'){
      #clean up last loop
      saveRDS(mod_sliced,paste0(slice_dir,'tas/cmip6/',last_model,'.RDS'))
      rm(mod_sliced)
      gc()
    }

    last_model = curr_model
    t_offset = 0 #days of this model already written to mod_sliced

    #size the output by the model's calendar
    if(mod$dim$time$calendar %in% c('noleap','365_day')){
      mod_sliced = array(NA,dim=c(n_days_365,n_lat_convo,n_long_convo))
    }else if(mod$dim$time$calendar == '360_day'){
      mod_sliced = array(NA,dim=c(n_days_360,n_lat_convo,n_long_convo))
    }else{
      mod_sliced = array(NA,dim=c(n_days,n_lat_convo,n_long_convo))
    }

    #indexes for quick resizing: upsampled cell mod_dst[k] is filled from source cell mod_src[k]
    mod_nn = nc_open(paste0(slice_dir,'tas/weights/',meta6$file_name[i]))
    mod_src = ncvar_get(mod_nn,'src_address')
    mod_dst = ncvar_get(mod_nn,'dst_address')
    #close the weights file before dropping the handle (rm alone leaves the file open)
    nc_close(mod_nn)
    rm(mod_nn)
    gc()
  }

  #special case handling for ICON icosahedron model (no need to flatten spatial dimension)
  if(curr_model == "ICON-ESM-LR"){
    #read in the days keep1..keep2, convert to celsius
    mod_temp = ncvar_get(mod, "tas", c(1,meta6$keep1[i]), c(-1,1+meta6$keep2[i]-meta6$keep1[i])) - 273.15 #cell, day

    #only the number of days (d[3]) is used below
    d = c(NA,NA,dim(mod_temp)[2])
  }else{
    #read in the days keep1..keep2, convert to celsius
    mod_temp = ncvar_get(mod, "tas", c(1,1,meta6$keep1[i]), c(-1,-1,1+meta6$keep2[i]-meta6$keep1[i])) - 273.15 #long, lat, day

    #flatten spatial dimension for indexing
    d = dim(mod_temp)
    dim(mod_temp) = c(d[1]*d[2],d[3])
  }

  #some of these files get big... take only a year at a time
  interval = 366
  first = seq(1,d[3],by=interval)
  last = interval*(1:length(first))
  last[length(last)]=d[3] #final chunk ends at the last day read

  #iterate over subsets, convolve, save results
  for(j in seq_along(first)){
    #time indices of the subset
    t = first[j]:last[j]
    nt = length(t)

    #create upsampled array and query indices for resizing
    mod_up = array(0,dim =c(n_lat_up*n_long_up,nt))
    mod_up[mod_dst,] = mod_temp[mod_src,t]

    #switch the time and space dimensions to match the convo matrix
    dim(mod_up) = c(n_long_up,n_lat_up,nt)
    mod_up = aperm(mod_up,3:1)
    mod_up = mod_up[,n_lat_up:1,] #flip latitude dimension
    dim(mod_up) = c(nt,n_long_up*n_lat_up) #day, lat, long

    #Compute all kernel convolutions
    mod_subset = as.matrix(mod_up%*%convo_matrix)
    rm(mod_up)
    gc()

    #save to the sliced matrix
    dim(mod_subset) = c(nt, n_lat_convo, n_long_convo)
    mod_sliced[t_offset + t, , ] = mod_subset
    rm(mod_subset)
    gc()
  }

  #set offset for next loop
  t_offset = t_offset + d[3]
  rm(mod_temp)
  nc_close(mod)
  gc()
}
#write the final model, which no later iteration flushes
saveRDS(mod_sliced,paste0(slice_dir,'tas/cmip6/',curr_model,'.RDS'))

#test first and last
image.spatial(mod_sliced[1,,])/
  image.spatial(mod_sliced[t_offset,,])

rm(mod_sliced,mod,t_offset,t,d,first,last,interval,curr_model,last_model)
gc()



#### Daily Total Precipitation (PR) ####

#### Slicing ERA5 ####

#build the cdo remapping template for the ERA5 precipitation grid and keep it for later
system(paste0('cdo gennn,r',n_long_up,'x',n_lat_up,' ',
              data_dir,'era5/pr/hourly/era5_pr_hourly_1979_01.nc ',
              slice_dir,'pr/weights/era5_pr_hourly_1979_01.nc'))

#work from the folder of daily files written by 02_hourly_to_daily.R
setwd(paste0(data_dir,'era5/pr/daily/'))
Sys.sleep(1)

#each file holds one month, so the series ends on the last day of November 2005
n_days = length(seq(as.Date('1979-01-01'),as.Date('2005-11-30'),by='1 day'))

#array for the full sequence of convolved ERA5 values (day, lat, long)
era5_sliced = array(NA,dim=c(n_days,n_lat_convo,n_long_convo))

#upsampling indices: upsampled cell era5_dst[k] is filled from source cell era5_src[k]
era5_nn = nc_open(paste0(slice_dir,'pr/weights/era5_pr_hourly_1979_01.nc'))
era5_dst = ncvar_get(era5_nn,'dst_address')
era5_src = ncvar_get(era5_nn,'src_address')
nc_close(era5_nn)

#files in sorted order, restricted to the first 323 that we need
era5_files = sort(list.files())[1:323]

#t is the first unfilled day of era5_sliced
t = 1
for(f in era5_files){
  #read the daily file, reorder to lon, lat, day and convert to mm/day
  era5_temp = aperm(readRDS(f),3:1)*1000

  #collapse the two spatial dimensions into one cell dimension
  d = dim(era5_temp)
  dim(era5_temp) = c(d[1]*d[2],d[3])

  #fill the upsampled grid through the cdo addresses
  era5_up = array(0,dim =c(n_lat_up*n_long_up,d[3]))
  era5_up[era5_dst,] = era5_temp[era5_src,]

  #put time first, flip the latitude dimension, then flatten space to match convo_matrix
  dim(era5_up) = c(n_long_up,n_lat_up,d[3])
  era5_up = aperm(era5_up,3:1)[,n_lat_up:1,]
  dim(era5_up) = c(d[3],n_long_up*n_lat_up)

  #apply every kernel at once, then reshape to the approximation grid
  era5_subset = as.matrix(era5_up%*%convo_matrix)
  dim(era5_subset) = c(d[3], n_lat_convo, n_long_convo)

  #store this file's days and advance the write position
  era5_sliced[seq(t, length.out = d[3]),,] = era5_subset
  t = t + d[3]
}

saveRDS(era5_sliced,paste0(slice_dir,'pr/era5.RDS'))

#test that first and last day are looking good
image.spatial(era5_sliced[1,,])/
  image.spatial(era5_sliced[n_days,,])

#clean up, remove all era5 objects from memory
rm(era5_sliced,era5_up,era5_temp,era5_subset,era5_files,f,t,d,n_days,era5_dst,era5_src,era5_nn)
gc()



#### Slicing NCEP ####

#list the NCEP precipitation files
setwd(paste0(data_dir,'ncep/pr/'))
Sys.sleep(1)
#keep the first 27 files in sorted order (presumably one file per year, 1979-2005)
ncep_files = sort(list.files())[1:27]

#build the cdo remapping template from the first file's grid and keep it for later
system(paste0('cdo gennn,r',n_long_up,'x',n_lat_up,' ',
              data_dir,'ncep/pr/',ncep_files[1],' ',
              slice_dir,'pr/weights/',ncep_files[1]))

#resizing indices: upsampled cell ncep_dst[k] is filled from source cell ncep_src[k]
ncep_nn = nc_open(paste0(slice_dir,'pr/weights/',ncep_files[1]))
ncep_dst = ncvar_get(ncep_nn,'dst_address')
ncep_src = ncvar_get(ncep_nn,'src_address')
nc_close(ncep_nn)

#files are read whole, so allocate through the end of 2005 and trim afterwards
n_days = length(seq(as.Date('1979-01-01'),as.Date('2005-12-31'),by='1 day'))
ncep_sliced = array(NA,dim=c(n_days,n_lat_convo,n_long_convo))

#t is the first unfilled day of ncep_sliced
t = 1
for(f in ncep_files){
  #read the file and convert to mm/day
  ncep_nc = nc_open(f)
  ncep_temp = ncvar_get(ncep_nc,'prate')*86400
  nc_close(ncep_nc)

  #collapse the two spatial dimensions into one cell dimension
  d = dim(ncep_temp)
  dim(ncep_temp) = c(d[1]*d[2],d[3])

  #fill the upsampled grid through the cdo addresses
  ncep_up = array(0,dim =c(n_lat_up*n_long_up,d[3]))
  ncep_up[ncep_dst,] = ncep_temp[ncep_src,]

  #put time first, flip the latitude dimension, then flatten space to match convo_matrix
  dim(ncep_up) = c(n_long_up,n_lat_up,d[3])
  ncep_up = aperm(ncep_up,3:1)[,n_lat_up:1,]
  dim(ncep_up) = c(d[3],n_long_up*n_lat_up)

  #apply every kernel at once, then reshape to the approximation grid
  ncep_subset = as.matrix(ncep_up%*%convo_matrix)
  dim(ncep_subset) = c(d[3], n_lat_convo, n_long_convo)

  #store this file's days and advance the write position
  ncep_sliced[seq(t, length.out = d[3]),,] = ncep_subset
  t = t + d[3]
}

#drop December 2005 so the series ends on 2005-11-30
pre_dec_2005 = length(seq(as.Date('1979-01-01'),as.Date('2005-11-30'),by='1 day'))
ncep_sliced = ncep_sliced[seq_len(pre_dec_2005),,]

#test: first and last retained day
(image.spatial(ncep_sliced[1,,])/
    image.spatial(ncep_sliced[pre_dec_2005,,]))

saveRDS(ncep_sliced,paste0(slice_dir,'pr/ncep.RDS'))

#clean up, remove all ncep objects from memory
rm(ncep_sliced,ncep_up,ncep_temp,ncep_subset,ncep_files,f,t,d,n_days,pre_dec_2005,ncep_src,ncep_dst,ncep_nc,ncep_nn)
gc()



#### Slicing GPCP ####

#get file names
setwd(paste0(data_dir,'gpcp/daily/'))
Sys.sleep(1)
#sorted so that the first n_days files are read in order (one file is read per day below)
gpcp_files = sort(list.files())

#use cdo to create our upsampling grid and save the template for later
#(skipped when the weights file already exists)
if(!file.exists(paste0(slice_dir,'pr/weights/',gpcp_files[1]))){
  system(paste0('cdo gennn,r',n_long_up,'x',n_lat_up,' ',
                data_dir,'gpcp/daily/',gpcp_files[1],' ',
                slice_dir,'pr/weights/',gpcp_files[1]))
}

#calculate # of days from 1996-10-01 through 2005-11-30
n_days = length(seq(as.Date('1996-10-01'),as.Date('2005-11-30'),by='1 day'))

#save matrix for full sequence of convolved gpcp values (day, lat, long)
gpcp_sliced = array(NA,dim=c(n_days,n_lat_convo,n_long_convo))

#save indices for resizing: upsampled cell gpcp_dst[k] is filled from source cell gpcp_src[k]
gpcp_nn = nc_open(paste0(slice_dir,'pr/weights/',gpcp_files[1]))
gpcp_src = ncvar_get(gpcp_nn,'src_address')
gpcp_dst = ncvar_get(gpcp_nn,'dst_address')
nc_close(gpcp_nn)

# Slices for each day
convo_matrix = readRDS(paste0(slice_dir,'convo_matrix.RDS'))
t = 1
missing = numeric(0)      #upsampled cells with missing data on the current day
conv_missing = numeric(0) #slices with too much missing data on the current day
for(f in gpcp_files[1:n_days]){
  #open daily file, already in mm/day
  gpcp_nc = nc_open(f)
  gpcp_temp = ncvar_get(gpcp_nc,'precip')
  d = dim(gpcp_temp)
  dim(gpcp_temp) = c(d[1]*d[2],1)
  
  #index using the source and destination addresses from cdo
  gpcp_up = array(0,dim =c(n_lat_up*n_long_up,1))
  gpcp_up[gpcp_dst,] = gpcp_temp[gpcp_src,]
  
  #switch the time and space dimensions to match the convo matrix
  dim(gpcp_up) = c(n_long_up,n_lat_up,1)
  gpcp_up = aperm(gpcp_up,3:1)
  gpcp_up = gpcp_up[,n_lat_up:1,] #flip latitude dimension
  dim(gpcp_up) = c(1,n_long_up*n_lat_up)
  
  #Find any missing data (negative values are treated as missing)
  convo_matrix_missing = convo_matrix
  missing = which(gpcp_up<0)
  if(length(missing) > 0){
    # Set convolution weight of missing data to 0
    # (@i holds the 0-based row index of every stored entry, hence the +1)
    convo_matrix_missing@x[(convo_matrix_missing@i+1) %in% missing]=0
    #transpose so that rows are slices
    convo_matrix_missing = t(convo_matrix_missing)
    
    #check if any convolutions are missing 50% or more of data
    #(kernels sum to 1, so the remaining weight is the fraction of data present)
    #can tune this threshold as desired
    conv_missing = which(rowSums(convo_matrix_missing)<=0.5)
    
    #if this happens, we'll return NA for that location and day
    if(length(conv_missing) > 0){
      #First, print the file where this occurs.
      #If you see this message print, one of the convolutions has too much missing data per above level.
      #A value of NA will be returned, you may avoid this issue with a larger convolution radius.
      print(paste(length(conv_missing),'missing slices in file',f))
      
      #for the convolutions with all data missing, give a weight of 1 to avoid dividing by 0
      #later, we'll manually set these slices to NA, so this is just a nonzero placeholder 
      convo_matrix_missing@x[(convo_matrix_missing@i+1) %in% conv_missing] = 1
    }
    
    #reweight all convolutions to sum to 1, ignoring missing data
    #the reason for the above case handling is so that we never divide by 0 here.
    convo_matrix_missing = sweep_sparse(convo_matrix_missing, 1,
                                        rowSums(convo_matrix_missing), fun="/")
    #transpose back so that columns are slices again
    convo_matrix_missing = t(convo_matrix_missing)
  }
  
  #Compute all kernel convolutions
  gpcp_subset = as.matrix(gpcp_up%*%convo_matrix_missing)
  #For the missing convolutions, set value to NA
  #When calculating quantiles, NA values will be left out
  if(length(conv_missing) > 0){
    gpcp_subset[,conv_missing] = NA
  }
  
  #Reset missing data indices for next loop
  missing = numeric(0)
  conv_missing = numeric(0)
  
  #reshape to the approximation grid and save
  dim(gpcp_subset) = c(1, n_lat_convo, n_long_convo)
  gpcp_sliced[t,,] = gpcp_subset
  nc_close(gpcp_nc)
  t = t + 1
}

#test: first day, a hard-coded day in between, and the last day
(image.spatial(gpcp_sliced[1,,])/
    image.spatial(gpcp_sliced[1807,,])/
    image.spatial(gpcp_sliced[n_days,,]))

saveRDS(gpcp_sliced,paste0(slice_dir,'pr/gpcp.RDS'))

#clean up, remove all gpcp objects from memory
#(pre_dec_2005 is not created in this section and was already removed after the
# NCEP slicing, so it is no longer listed; naming it only produced a warning)
rm(gpcp_sliced,gpcp_up,gpcp_temp,gpcp_subset,gpcp_files,f,t,d,n_days,gpcp_src,gpcp_dst,gpcp_nc,gpcp_nn)
gc()


#### Slicing CMIP5 ####

setwd(paste0(data_dir,'cmip5pr'))
Sys.sleep(1)

#start date and end date of the study period as yyyymmdd numbers
#(the reason for ending in November is shown a few lines below)
sd = 19790101
ed = 20051130

#file names are underscore-separated metadata fields: one data frame row per file
ncs = sort(list.files())
meta5 = do.call(rbind.data.frame,strsplit(ncs,"_"))
names(meta5) = c('variable','frequency','model','experiment','ensemble','years')

#split the date range into start/end and tag each file with its temporal contents for filtering
meta5 = meta5 %>%
  separate(years,c('start','end')) %>%
  mutate(start = as.numeric(start),
         end = as.numeric(end)) %>%
  mutate(pre1979 = start<=sd,
         post2005 = end>=ed,
         contains_full = pre1979 & post2005,
         out_of_range = (start>ed)|(end<sd),
         has200511 = (end>=ed)&(start<=ed))

#Show the overall date range of every model, earliest end date first
meta5 %>% group_by(model) %>% summarise(start = min(start),end = max(end)) %>% arrange(end)
#Two models, HadGEM2-CC and HadGEM2-ES, end in November 2005, so we'll take that as our end date

#attach the file names (rows are still in ncs order), sort by model and start date,
#and drop files that contain no relevant dates
meta5 = meta5 %>%
  mutate(file_name = ncs) %>%
  arrange(model,start) %>%
  dplyr::filter(!out_of_range)

#gather grid size and calendar for every file (preallocated, one entry per row of meta5)
n_files = nrow(meta5)
lon_res = rep(NA_integer_,n_files)
lat_res = rep(NA_integer_,n_files)
calendar = rep(NA_character_,n_files)
last_model='none'

for(i in seq_len(n_files)){
  curr_model = meta5$model[i]
  mod = nc_open(meta5$file_name[i])
  calendar[i] = mod$dim$time$calendar
  lon_res[i] = mod$dim$lon$len
  lat_res[i] = mod$dim$lat$len
  nc_close(mod)
  
  #use cdo to create our upsampling grid and save the template for later
  #(once per model, from the first file of that model)
  if(curr_model!=last_model){
    cdo_status = system(paste0('cdo gennn,r',n_long_up,'x',n_lat_up,' -selvar,pr -seltimestep,1 ',
                               data_dir,'cmip5pr/',meta5$file_name[i],' ',
                               slice_dir,'pr/weights/',meta5$file_name[i]),ignore.stderr = TRUE)
    #stderr is suppressed above, so surface a failure here instead of
    #discovering a missing weights file later in the slicing loop
    if(cdo_status != 0){
      warning('cdo gennn failed for ',meta5$file_name[i],' (exit status ',cdo_status,')')
    }
  }
  
  last_model = curr_model
}

#final metadata data frame
meta5$lon_res = lon_res
meta5$lat_res = lat_res
meta5$calendar = calendar
#keep1/keep2: first and last time index to read from each file, found by matching the
#study start/end dates (sd, ed) against the file's dates
#NOTE(review): ymd_range is a project helper; presumably it returns the yyyymmdd dates
#from start to end under the given calendar - confirm
meta5 = meta5 %>%
  rowwise() %>%
  mutate(keep1 = ifelse(pre1979,match(sd,ymd_range(start,end,calendar)),1),
         keep2 = ifelse(post2005,match(ed,ymd_range(start,end,calendar)),length(ymd_range(start,end,calendar))))

#save one row of grid metadata per model
meta5 %>% 
  dplyr::select(variable,model,lon_res,lat_res,calendar) %>% 
  group_by(model) %>%
  slice(1) %>%
  saveRDS(file = paste0(slice_dir,'pr/meta5.RDS'))

#clean up before processing the output
rm(lon_res,lat_res,calendar,sd,ed,i,ncs,n_files,cdo_status)
gc()

#length of the study period under each calendar; used below to size each model's output array
#NOTE(review): ymd_range is a project helper; presumably it returns the yyyymmdd dates
#in the range under the given calendar - confirm
n_days = length(ymd_range(19790101,20051130, calendar = 'standard'))
n_days_365 = length(ymd_range(19790101,20051130, calendar = '365_day'))
n_days_360 = length(ymd_range(19790101,20051130, calendar = '360_day'))

#load projection matrices
convo_matrix = readRDS(paste0(slice_dir,'convo_matrix.RDS'))
#sentinel meaning "no model processed yet" for the loop below
last_model = "none"

#loop, open files, project, save to the array
for(i in 1:nrow(meta5)){
  
  print(i)
  
  #read in model output
  curr_model = meta5$model[i]
  
  #nc_open to read data, rast for fast upsampling
  mod = nc_open(meta5$file_name[i])
  
  #if it's a new model, set some parameters
  if(curr_model != last_model){
    if(last_model != 'none'){
      #clean up last loop
      saveRDS(mod_sliced,paste0(slice_dir,'pr/cmip5/',last_model,'.RDS'))
      rm(mod_sliced)
      gc()
    }
    
    last_model = curr_model
    t_offset = 0
    
    if(mod$dim$time$calendar %in% c('noleap','365_day')){
      mod_sliced = array(NA,dim=c(n_days_365,n_lat_convo,n_long_convo))
    }else if(mod$dim$time$calendar == '360_day'){
      mod_sliced = array(NA,dim=c(n_days_360,n_lat_convo,n_long_convo))
    }else{
      mod_sliced = array(NA,dim=c(n_days,n_lat_convo,n_long_convo))
    }
    
    #indexes for quick resizing
    mod_nn = nc_open(paste0(slice_dir,'pr/weights/',meta5$file_name[i]))
    mod_src = ncvar_get(mod_nn,'src_address')
    mod_dst = ncvar_get(mod_nn,'dst_address')
    rm(mod_nn)
    gc()
  }
  
  #read in data, convert to mm/day, flatten spatial dimension for indexing
  mod_temp = ncvar_get(mod, "pr", c(1,1,meta5$keep1[i]), c(-1,-1,1+meta5$keep2[i]-meta5$keep1[i]))*86400 #long, lat, day
  d = dim(mod_temp)
  dim(mod_temp) = c(d[1]*d[2],d[3])
  
  #some of these files get big... take only a year at a time
  interval = 366
  first = seq(1,d[3],by=interval)
  last = interval*(1:length(first))
  last[length(last)]=d[3]
  
  #iterate over subsets, convolve, save results
  for(j in 1:length(first)){
    #time indices of the subset
    t = first[j]:last[j]
    nt = length(t)
    
    #create upsamlped array and query indices for resizing
    mod_up = array(0,dim =c(n_lat_up*n_long_up,nt))
    mod_up[mod_dst,] = mod_temp[mod_src,t]
    
    #switch the time and space dimensions to match the convo matrix
    dim(mod_up) = c(n_long_up,n_lat_up,nt)
    mod_up = aperm(mod_up,3:1)
    mod_up = mod_up[,n_lat_up:1,] #flip latitude dimension
    dim(mod_up) = c(nt,n_long_up*n_lat_up) #day, lat, long
    
    #Compute all kernel convolutions
    mod_subset = as.matrix(mod_up%*%convo_matrix)
    rm(mod_up)
    gc()
    
    #save to the sliced matrix
    dim(mod_subset) = c(nt, n_lat_convo, n_long_convo)
    mod_sliced[t_offset + t, , ] = mod_subset
    rm(mod_subset)
    gc()
  }
  
  #set offset for next loop
  t_offset = t_offset + d[3]
  rm(mod_temp)
  nc_close(mod)
  gc()
}
saveRDS(mod_sliced,paste0(slice_dir,'pr/cmip5/',curr_model,'.RDS'))

#test first and last
image.spatial(mod_sliced[1,,])/
  image.spatial(mod_sliced[t_offset,,])

rm(mod_sliced,mod,t_offset,t,d,first,last,interval,curr_model,last_model)
gc()



#### Slicing CMIP6 ####

#work from the CMIP6 precipitation directory; the files below are opened by bare name
setwd(paste0(data_dir,'cmip6pr'))
Sys.sleep(1)

#build one metadata row per file by splitting each (sorted) file name on "_"
ncs = sort(list.files())
meta6 = do.call(rbind.data.frame, strsplit(ncs,"_"))
colnames(meta6) = c('variable','frequency','model','experiment','ensemble','grid','years')

#start date and end date of the window (yyyymmdd); the reason for the
#November end is given a few lines further down
sd = 19790101
ed = 20051130

#Split 'years' into numeric start/end dates, then tag each file by how its
#date span relates to the window so it can be filtered later:
#  pre1979       file starts on or before sd
#  post2005      file ends on or after ed
#  contains_full file covers the whole window
#  out_of_range  file lies entirely before sd or entirely after ed
#  has200511     file contains ed itself
meta6 = separate(meta6, years, c('start','end'))
meta6 = mutate(
  meta6,
  start = as.numeric(start),
  end = as.numeric(end),
  pre1979 = start <= sd,
  post2005 = end >= ed,
  contains_full = pre1979 & post2005,
  out_of_range = (start > ed) | (end < sd),
  has200511 = (end >= ed) & (start <= ed)
)

#Print each model's overall date span, ordered so the earliest end date comes first
meta6 %>%
  group_by(model) %>%
  summarise(start = min(start), end = max(end)) %>%
  arrange(end)
#NOTE(review): the November 2005 end date was motivated in the CMIP5 section by
#HadGEM2-CC and HadGEM2-ES; presumably the same window is reused here - confirm

#attach the file name to each row, order by model then start date,
#and drop the files that lie entirely outside the window
meta6[['file_name']] = ncs
meta6 = dplyr::filter(arrange(meta6, model, start), !out_of_range)

#this model's files overlap in time, so each one is given an earlier end date
#to compensate (one value per remaining file, in start order; assumes four files)
meta6$end[which(meta6$model == 'CESM2-WACCM-FV2')] = c(
  19791211, 19891221, 19991231, 20100110
)

#gather per-file info: calendar type and lon/lat dimension lengths
#preallocate one NA slot per file; each vector takes on the type of the values
#assigned below, and a file missing one of these fields now fails at that file
#instead of silently shortening the vector and misaligning it with meta6
lon_res = rep(NA, nrow(meta6))
lat_res = rep(NA, nrow(meta6))
calendar = rep(NA, nrow(meta6))
last_model='none'

#seq_len() so that an empty meta6 skips the loop instead of iterating over c(1,0)
for(i in seq_len(nrow(meta6))){
  curr_model = meta6$model[i]
  mod = nc_open(meta6$file_name[i])
  calendar[i] = mod$dim$time$calendar
  #ICON-ESM-LR keeps the preallocated NA for its lon/lat resolution
  if(curr_model != "ICON-ESM-LR"){
    lon_res[i] = mod$dim$lon$len
    lat_res[i] = mod$dim$lat$len
  }
  
  #use cdo to create our upsampling grid and save the template for later
  #(done once per model, on the first file of each model in meta6's current order)
  if(curr_model!=last_model){
    #system() returns the command's exit status; stderr is discarded, so report
    #a failure here rather than when the weights file is opened later on
    if(system(paste0('cdo gennn,r',n_long_up,'x',n_lat_up,' -selvar,pr -seltimestep,1 ',
                     data_dir,'cmip6pr/',meta6$file_name[i],' ',
                     slice_dir,'pr/weights/',meta6$file_name[i]),ignore.stderr = TRUE) != 0){
      warning('cdo gennn returned a non-zero exit status for ',meta6$file_name[i],call. = FALSE)
    }
  }
  
  last_model = curr_model
  nc_close(mod)
}


#attach the gathered per-file info to the metadata data frame
meta6[['calendar']] = calendar
meta6[['lon_res']] = lon_res
meta6[['lat_res']] = lat_res

#row by row, work out the first (keep1) and last (keep2) time index to read:
#keep1 is the position of sd within ymd_range(start,end,calendar) when pre1979 is TRUE, otherwise 1
#keep2 is the position of ed within that range when post2005 is TRUE, otherwise the length of the range
meta6 = mutate(
  rowwise(meta6),
  keep1 = ifelse(pre1979, match(sd, ymd_range(start, end, calendar)), 1),
  keep2 = ifelse(post2005,
                 match(ed, ymd_range(start, end, calendar)),
                 length(ymd_range(start, end, calendar)))
)

#save the first row of each model (variable, model, grid size, calendar)
saveRDS(
  object = meta6 %>%
    dplyr::select(variable, model, lon_res, lat_res, calendar) %>%
    group_by(model) %>%
    slice(1),
  file = paste0(slice_dir, 'pr/meta6.RDS')
)

#drop the temporaries before processing the output
rm(list = c('lon_res', 'lat_res', 'calendar', 'sd', 'ed', 'i', 'ncs'))
gc()


#number of time steps in the 19790101-20051130 window under each calendar type
n_days = length(ymd_range(19790101,20051130, calendar = 'standard'))
n_days_365 = length(ymd_range(19790101,20051130, calendar = '365_day'))
n_days_360 = length(ymd_range(19790101,20051130, calendar = '360_day'))

#load projection matrices
convo_matrix = readRDS(paste0(slice_dir,'convo_matrix.RDS'))
last_model = "none"

#nothing to slice (and nothing to save after the loop) without at least one file
if(nrow(meta6) == 0){
  stop('meta6 contains no files to process',call. = FALSE)
}

#loop, open files, project, save to the array
#consecutive rows with the same model fill consecutive time rows of mod_sliced,
#which is written to disk once a row with a different model is reached
for(i in seq_len(nrow(meta6))){
  
  print(i)
  
  #read in model output
  curr_model = meta6$model[i]
  
  #nc_open to read data
  mod = nc_open(meta6$file_name[i])
  
  #if it's a new model, set some parameters
  if(curr_model != last_model){
    if(last_model != 'none'){
      #clean up last loop
      saveRDS(mod_sliced,paste0(slice_dir,'pr/cmip6/',last_model,'.RDS'))
      rm(mod_sliced)
      gc()
    }
    
    last_model = curr_model
    t_offset = 0
    
    #one (day, convo lat, convo long) array per model, sized by its calendar
    #NA_real_ allocates the array as double from the start
    if(mod$dim$time$calendar %in% c('noleap','365_day')){
      mod_sliced = array(NA_real_,dim=c(n_days_365,n_lat_convo,n_long_convo))
    }else if(mod$dim$time$calendar == '360_day'){
      mod_sliced = array(NA_real_,dim=c(n_days_360,n_lat_convo,n_long_convo))
    }else{
      mod_sliced = array(NA_real_,dim=c(n_days,n_lat_convo,n_long_convo))
    }
    
    #indexes for quick resizing: source/destination addresses stored in the
    #cdo weights file that was generated for this model
    mod_nn = nc_open(paste0(slice_dir,'pr/weights/',meta6$file_name[i]))
    mod_src = ncvar_get(mod_nn,'src_address')
    mod_dst = ncvar_get(mod_nn,'dst_address')
    nc_close(mod_nn) #close the file; rm() only drops the R object
    rm(mod_nn)
    gc()
  }
  
  #collapse_degen = FALSE in both reads keeps the time dimension even when a
  #single day is read, so that d[3] below is always defined
  #special case handling for ICON icosahedron model (no need to flatten spatial dimension)
  if(curr_model == "ICON-ESM-LR"){
    #read in time indices keep1..keep2, convert to mm/day
    mod_temp = ncvar_get(mod, "pr", c(1,meta6$keep1[i]), c(-1,1+meta6$keep2[i]-meta6$keep1[i]),
                         collapse_degen = FALSE)*86400 #cell, day
    
    #only the number of time steps is needed further down
    d = c(NA,NA,dim(mod_temp)[2])
  }else{
    #read in time indices keep1..keep2, convert to mm/day
    mod_temp = ncvar_get(mod, "pr", c(1,1,meta6$keep1[i]), c(-1,-1,1+meta6$keep2[i]-meta6$keep1[i]),
                         collapse_degen = FALSE)*86400 #long, lat, day
    
    #flatten spatial dimension for indexing
    d = dim(mod_temp)
    dim(mod_temp) = c(d[1]*d[2],d[3])
  }
  
  #some of these files get big... process at most 366 time steps at a time
  interval = 366
  first = seq(1,d[3],by=interval)
  last = pmin(first+interval-1,d[3])
  
  #iterate over subsets, convolve, save results
  for(j in seq_along(first)){
    #time indices of the subset
    t = first[j]:last[j]
    nt = length(t)
    
    #create upsampled array and fill it by address: destination cell mod_dst[k]
    #receives the value of source cell mod_src[k]
    mod_up = array(0,dim =c(n_lat_up*n_long_up,nt))
    mod_up[mod_dst,] = mod_temp[mod_src,t,drop=FALSE]
    
    #switch the time and space dimensions to match the convo matrix
    dim(mod_up) = c(n_long_up,n_lat_up,nt)
    mod_up = aperm(mod_up,3:1)
    #flip latitude dimension (presumably because the cdo grid and up_lats run in
    #opposite directions - TODO confirm)
    mod_up = mod_up[,n_lat_up:1,,drop=FALSE]
    dim(mod_up) = c(nt,n_long_up*n_lat_up) #day, lat, long
    
    #Compute all kernel convolutions
    mod_subset = as.matrix(mod_up%*%convo_matrix)
    rm(mod_up)
    gc()
    
    #save to the sliced matrix
    dim(mod_subset) = c(nt, n_lat_convo, n_long_convo)
    mod_sliced[t_offset + t, , ] = mod_subset
    rm(mod_subset)
    gc()
  }
  
  #set offset for next loop
  t_offset = t_offset + d[3]
  rm(mod_temp)
  nc_close(mod)
  gc()
}
#the last model is never followed by a model change, so save it here
saveRDS(mod_sliced,paste0(slice_dir,'pr/cmip6/',curr_model,'.RDS'))

#test first and last
#NOTE(review): `/` presumably stacks the two plots returned by image.spatial - confirm
image.spatial(mod_sliced[1,,])/
  image.spatial(mod_sliced[t_offset,,])

rm(mod_sliced,mod,t_offset,t,d,first,last,interval,curr_model,last_model)
gc()

