#!/usr/bin/env bash

# Abort on the first failing command, on any use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Download and unpack the dataset archives into
# emergent_communication_at_scale/emcom_datasets/.
#
# The work runs in a subshell so the caller's working directory is restored
# automatically, even if a later edit adds or removes a `cd`.
(
  cd emergent_communication_at_scale
  mkdir -p emcom_datasets
  cd emcom_datasets

  for archive in byol_celeb_a2.tar.gz byol_imagenet2012.tar.gz; do
    # --continue resumes a partial download and is a no-op for a complete one.
    # Without it, re-running the script saves the new copy as "<name>.1" and
    # tar below would keep extracting the stale (possibly truncated) archive.
    wget --continue \
      "https://storage.googleapis.com/dm_emcom_at_scale_dataset/${archive}"
    tar xf "${archive}"
  done
)

# Python cannot find the CUDA libraries without manually inserting the conda
# environment's lib/ path into LD_LIBRARY_PATH.
#
# The lib/ directory is derived from the interpreter location
# (<env>/bin/python -> <env>/lib). Fail loudly if no interpreter is on PATH
# instead of exporting a bogus library path.
python_bin=$(command -v python) || {
  echo "error: python not found on PATH" >&2
  exit 1
}
CONDA_LIB_DIR="$(dirname "$(dirname "${python_bin}")")/lib"
# Append to any existing value; the ${VAR:+...} form avoids a leading ':'
# (an empty path entry) when LD_LIBRARY_PATH is unset or empty.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${CONDA_LIB_DIR}"

# If this is unset, the code will OOM on an 11 GiB card, possibly due to jax
# and TensorFlow both preallocating.
export XLA_PYTHON_CLIENT_PREALLOCATE=false

python helper.py

# Copy corpus.jsonl and metadata.json from every checkpoint/<name>/ directory
# into ../data/<name>/, creating the destination directory as needed.
copied=0
for dir in checkpoint/*/; do
  # When the glob matches nothing, bash leaves the literal pattern in place;
  # skip it rather than creating a directory literally named "*".
  [[ -d "${dir}" ]] || continue
  target="../data/$(basename -- "${dir}")"
  mkdir -p -- "${target}"
  cp -- "${dir}"/{corpus.jsonl,metadata.json} "${target}"
  copied=1
done

# A run that leaves no checkpoint directories is an error (the unguarded loop
# would have failed in cp here); report it explicitly.
if (( copied == 0 )); then
  echo "error: no checkpoint directories found under checkpoint/" >&2
  exit 1
fi
