# /*
#  * Adapted from:
#  * https://github.com/learnedsystems/SOSD/blob/master/scripts/download.sh
#  * https://github.com/anikristo/LearnedSort
#  */

# Calculate md5 checksum of FILE and stores it in MD5_RESULT
function get_checksum() {
   FILE=$1

   if [ -x "$(command -v md5sum)" ]; then
      # Linux
      MD5_RESULT=`md5sum ${FILE} | awk '{ print $1 }'`
   else
      # OS X
      MD5_RESULT=`md5 -q ${FILE}`
   fi
}


# Makes sure that a file with the expected MD5 checksum exists in the current
# directory. If the file is missing, or is present with a different checksum,
# the URL is fetched with wget, decompressed with zstd and written to the
# file. The result is then validated against the expected checksum.
#
# Arguments:
#    $1 - name of the (decompressed) output file
#    $2 - expected MD5 checksum of the decompressed file
#    $3 - URL of the zstd-compressed download
# Globals:
#    MD5_RESULT - overwritten by get_checksum
# Exits the shell with status 1 if the checksum does not match after the
# download step.
function download_file_zst() {
   local file=$1
   local checksum=$2
   local url=$3
   local need_download=1

   echo "---------------------------------"
   echo "Downloading ${file} ..."

   # An existing file is reused only if its checksum already matches.
   if [ -f "${file}" ]; then
      get_checksum "${file}"

      echo "checksum: " "${MD5_RESULT}"
      echo "expected: " "${checksum}"

      if [ "${MD5_RESULT}" = "${checksum}" ]; then
         need_download=0
      fi
   fi

   if [ "${need_download}" -eq 1 ]; then
      # wget writes the archive to stdout and zstd decompresses the stream,
      # so only the decompressed data is stored. The URL is quoted because
      # it may contain glob characters such as '?'.
      wget -O - "${url}" | zstd -d > "${file}"
   fi

   # Validate (at this point the file should really exist)
   get_checksum "${file}"

   echo "checksum: " "${MD5_RESULT}"
   echo "expected: " "${checksum}"

   if [ "${MD5_RESULT}" != "${checksum}" ]; then
      echo "error checksum does not match: run download again" >&2
      # Exit statuses must be in the range 0-255; -1 is not portable.
      exit 1
   fi

   echo "${file}" "checksum ok"
}

# Main script code: creates the data/ directory below the current working
# directory, downloads and verifies every dataset inside it via
# download_file_zst, then changes back to the parent directory.
#
# Returns:
#    1 if data/ cannot be created or entered (nothing is downloaded then)
function main() {
   echo "downloading data ..."

   # Stop here if data/ is unusable; otherwise the downloads would be
   # written into whatever directory the script was started from.
   if ! mkdir -p data; then
      echo "error: cannot create directory 'data'" >&2
      return 1
   fi
   if ! cd data; then
      echo "error: cannot change into directory 'data'" >&2
      return 1
   fi

   # URLs are quoted: several contain '?', which the shell would otherwise
   # treat as a glob character.

   # Wiki
   download_file_zst wiki_ts_200M_uint64 4f1402b1c476d67f77d2da4955432f7d 'https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/JGVF9A/SVN8PI'

   # Books
   download_file_zst books_800M_uint64 8708eb3e1757640ba18dcd3a0dbb53bc 'https://www.dropbox.com/s/y2u3nbanbnbmg7n/books_800M_uint64.zst?dl=1'

   # OSM
   download_file_zst osm_cellids_800M_uint64 70670bf41196b9591e07d0128a281b9a 'https://www.dropbox.com/s/j1d4ufn4fyb4po2/osm_cellids_800M_uint64.zst?dl=1'

   # FB
   download_file_zst fb_200M_uint64 3b0f820caa0d62150e87ce94ec989978 'https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/JGVF9A/EATHF7'

   # NYC Taxi
   download_file_zst nyc_pickup 8365c1cb65aec672ed00dabfaf83107a 'https://dataverse.harvard.edu/api/access/datafile/4550793'
   download_file_zst nyc_dist a825fae9a1beb6d80b9ff940c7f68976 'https://dataverse.harvard.edu/api/access/datafile/4550859'
   download_file_zst nyc_tot d3a1143c4537f20743bec7fc9a1be2ee 'https://dataverse.harvard.edu/api/access/datafile/4550815'

   # Stock data
   download_file_zst stks_vol f9b0ac38265607962a7816beedb06e96 'https://dataverse.harvard.edu/api/access/datafile/4550832'
   download_file_zst stks_open 5d6fd871751c48dfcc1e6818b211e4ce 'https://dataverse.harvard.edu/api/access/datafile/4550835'
   download_file_zst stks_date 5384837ccaa9b12bad558b29013110b4 'https://dataverse.harvard.edu/api/access/datafile/4550834'
   download_file_zst stks_low 01978c404ed9217cf44a7795e31503e9 'https://dataverse.harvard.edu/api/access/datafile/4550833'

   # Sof
   download_file_zst sof_hum d9d0a97bb3cb3cf5902f0c5aef1569cc 'https://dataverse.harvard.edu/api/access/datafile/4550824'
   download_file_zst sof_press 7177fec72b1e2d0c08fb75d6f481a85f 'https://dataverse.harvard.edu/api/access/datafile/4550826'
   download_file_zst sof_temp 32e8ec5f9b5a8e93c91126b7bb1d3dc1 'https://dataverse.harvard.edu/api/access/datafile/4550825'

   # Chicago Taxi
   download_file_zst chic_start 0f26dcbf2bd3cd959da480882b5aeaa2 'https://dataverse.harvard.edu/api/access/datafile/4550785'
   download_file_zst chic_tot 4a87ee51c4c6349f04dfca38692ad3f6 'https://dataverse.harvard.edu/api/access/datafile/4550787'

   cd ..
   echo "done"
}

# Entry point: start the download sequence, handing over any script arguments.
main "$@"
