#### Obtaining CMIP and reanalysis data ####

# This script walks through the process of obtaining each dataset.
# For CMIP5, CMIP6, and NCEP, daily data is easily obtainable via tools on the website.
# For ERA5, no daily data is available, and the hourly data must be downloaded one month at a time.
# Code is provided for the CDS API in python, run in R using the reticulate package.
# Data from each source should be put in a separate subdirectory of data_dir
# Be aware that this involves downloading a ton of data (over 2TB)
# Some of the download speeds can be slow depending on demand, which may make it hard to reproduce the full results

# Obtaining CMIP5:
# https://esgf-node.llnl.gov/search/cmip5/
# Filter by...
# Project: CMIP5
# Experiment: historical
# Time Frequency: day
# Ensemble: r1i1p1
# Variable: tas, pr
# Add all results to data cart, obtain the WGET scripts from there

# Obtaining CMIP6:
# https://esgf-node.llnl.gov/search/cmip6/
# Filter by...
# MIP Era: CMIP6
# Experiment ID: historical
# Variant Label: r1i1p1f1
# Frequency: day
# Variable: tas, pr
# Add all results to data cart, obtain the WGET scripts from there

# For both CMIPs, there may be models which have daily output for one variable, but not the other.
# We ran into this in four cases, here is how we dealt with it:
# FGOALS-g2 had daily output for CMIP5 pr but only 3hr intervals for CMIP5 tas
# AWI-CM-1-1-MR had daily output for CMIP6 tas but only 3hr intervals for CMIP6 pr
# For the above two models, we aggregated the 3 hourly data to daily, see 02_prep_data.R 
# CanCM4 had daily output for CMIP5 pr but only monthly intervals for CMIP5 tas
# IPSL-CM6A-LR-INCA had daily output for CMIP6 pr but only monthly intervals for CMIP6 tas
# For the above two models, we cannot obtain or compute the daily data. Exclude these for joint comparisons

# Obtaining NCEP:
# https://psl.noaa.gov/data/gridded/data.ncep.reanalysis2.html
# Air temperature 2m daily means
# Precipitation rate daily means

# Obtaining GPCP:
# daily: https://www.ncei.noaa.gov/products/climate-data-records/precipitation-gpcp-daily
# monthly: https://psl.noaa.gov/data/gridded/data.gpcp.html

# Obtaining ERA5:
# first follow: https://cds.climate.copernicus.eu/api-how-to
# then run below script to obtain hourly data
# script 02_prep_data converts hourly to daily
# **Alternative option: run the same scripts in the cds toolbox and
# use the daily_mean function to aggregate before downloading as in:
# https://earthscience.stackexchange.com/questions/18679/does-climate-data-store-provide-daily-mean-of-era-5-reanalysis-data
# Note you may need to obtain the data one month at a time this way.
# WARNING: the ERA5 hourly data is huge, about 1.5GB per file

#### Prep API and define temporal extent ####

library(reticulate)
# Load the Python `cdsapi` module through reticulate and open a client.
# The CDS API setup linked in the ERA5 notes above must be completed first.
cds <- import('cdsapi')
client <- cds$Client()

# Temporal extent of the download: every month of 1979-2005.
# Both vectors are character, because they are sent as strings in the request.
years <- as.character(1979:2005)
months <- sprintf('%02d', 1:12)

# Root directory under which each data source gets its own subdirectory.
data_dir <- '/mnt/r/historical/'


#### ERA5 Temperature Request ####

# Download hourly ERA5 2m temperature, one netCDF file per year/month.
# Uses `client`, `years`, `months` and `data_dir` defined in the setup section.

# Create the output directory up front, so that a missing folder does not
# make a download fail only after the CDS request has been processed.
tas_dir <- paste0(data_dir, 'era5/tas/hourly/')
dir.create(tas_dir, recursive = TRUE, showWarnings = FALSE)

# Every day of month ('01'..'31') and every hour ('00:00'..'23:00').
# All 31 days are requested for every month, exactly as before.
# NOTE(review): assumes CDS ignores non-existent dates (e.g. Feb 30) - confirm.
tas_days <- sprintf('%02d', 1:31)
tas_hours <- sprintf('%02d:00', 0:23)

for (y in years) {
  for (m in months) {
    tas_file <- paste0(tas_dir, 'era5_tas_hourly_', y, '_', m, '.nc')

    # Skip months that are already on disk so an interrupted run can be
    # resumed without re-downloading; delete a file to force a re-download.
    if (file.exists(tas_file)) {
      next
    }

    request <- list(
      variable = '2m_temperature',
      year = y,
      month = m,
      product_type = 'reanalysis',
      day = tas_days,
      time = tas_hours,
      format = 'netcdf'
    )

    # Download under a temporary name and rename only on success, so a
    # partial file is never mistaken for a finished one by the check above.
    tas_tmp <- paste0(tas_file, '.part')
    client$retrieve(
      'reanalysis-era5-single-levels',
      r_to_py(request),
      tas_tmp
    )
    if (!file.rename(tas_tmp, tas_file)) {
      stop('Could not move ', tas_tmp, ' to ', tas_file, call. = FALSE)
    }
  }
}


#### ERA5 Precipitation Request ####

# Download hourly ERA5 total precipitation, one netCDF file per year/month.
# Uses `client`, `years`, `months` and `data_dir` defined in the setup section.

# Create the output directory up front, so that a missing folder does not
# make a download fail only after the CDS request has been processed.
pr_dir <- paste0(data_dir, 'era5/pr/hourly/')
dir.create(pr_dir, recursive = TRUE, showWarnings = FALSE)

# Every day of month ('01'..'31') and every hour ('00:00'..'23:00').
# All 31 days are requested for every month, exactly as before.
# NOTE(review): assumes CDS ignores non-existent dates (e.g. Feb 30) - confirm.
pr_days <- sprintf('%02d', 1:31)
pr_hours <- sprintf('%02d:00', 0:23)

for (y in years) {
  for (m in months) {
    pr_file <- paste0(pr_dir, 'era5_pr_hourly_', y, '_', m, '.nc')

    # Skip months that are already on disk so an interrupted run can be
    # resumed without re-downloading; delete a file to force a re-download.
    if (file.exists(pr_file)) {
      next
    }

    request <- list(
      variable = 'total_precipitation',
      year = y,
      month = m,
      product_type = 'reanalysis',
      day = pr_days,
      time = pr_hours,
      format = 'netcdf'
    )

    # Download under a temporary name and rename only on success, so a
    # partial file is never mistaken for a finished one by the check above.
    pr_tmp <- paste0(pr_file, '.part')
    client$retrieve(
      'reanalysis-era5-single-levels',
      r_to_py(request),
      pr_tmp
    )
    if (!file.rename(pr_tmp, pr_file)) {
      stop('Could not move ', pr_tmp, ' to ', pr_file, call. = FALSE)
    }
  }
}
