# Run identifier for this regression job.
name: streaming-regression-test-deterministic-shuffling
compute:
  # Number of GPUs to use; the command below launches at most 8 ranks
  # (`composer -n 8`).
  gpus: 8
  # cluster: TODO  # Name of the cluster to use for this run
# Shell script executed by the run. It generates a synthetic dataset, iterates
# it under two different launch configurations (rank count, canonical nodes,
# batch size), and fails the run if the recorded sample orders differ. The
# comparison is done once without shuffling and once with shuffling.
command: |-
  # Replace any preinstalled streaming package with an editable install of the
  # checked-out repository.
  pip uninstall -y mosaicml-streaming
  cd streaming
  pip install -e '.[dev]'

  python regression/synthetic_dataset.py --create --out /tmp/streaming_dataset/ --num-samples 10000 --size_limit 4096

  # Check sample order without shuffling
  composer -n 2 regression/iterate_data.py --num_canonical_nodes 128 --batch_size 32 --local /tmp/local_dataset/ \
    --remote /tmp/streaming_dataset/ --sample-order-file /tmp/test_samples_0.txt
  # Clear the local cache so the next run starts from the remote copy.
  rm -Rf /tmp/local_dataset/
  composer -n 8 regression/iterate_data.py --num_canonical_nodes 1 --batch_size 1 --local /tmp/local_dataset/ --remote \
    /tmp/streaming_dataset/ --sample-order-file /tmp/test_samples_1.txt
  rm -Rf /tmp/local_dataset/
  # Any difference between the two recorded orders fails the run.
  cmp --silent /tmp/test_samples_0.txt /tmp/test_samples_1.txt || exit 1

  rm -Rf /tmp/test_samples_0.txt
  rm -Rf /tmp/test_samples_1.txt

  # Check sample order with shuffling
  # NOTE: --shuffle must be passed to iterate_data.py, not to rm. On the rm
  # line it is rejected as an unknown option (so the cache is never removed)
  # and the iteration silently runs unshuffled.
  # NOTE(review): assumes iterate_data.py accepts a --shuffle flag - confirm.
  composer -n 2 regression/iterate_data.py --num_canonical_nodes 1 --batch_size 1 --local /tmp/local_dataset/ --remote \
    /tmp/streaming_dataset/ --sample-order-file /tmp/test_samples_0.txt --shuffle
  rm -Rf /tmp/local_dataset/
  composer -n 8 regression/iterate_data.py --num_canonical_nodes 256 --batch_size 16 --local /tmp/local_dataset/ \
    --remote /tmp/streaming_dataset/ --sample-order-file /tmp/test_samples_1.txt --shuffle
  rm -Rf /tmp/local_dataset/
  cmp --silent /tmp/test_samples_0.txt /tmp/test_samples_1.txt || exit 1

  rm -Rf /tmp/test_samples_0.txt
  rm -Rf /tmp/test_samples_1.txt

  # Remove the synthetic dataset created at the start.
  python regression/synthetic_dataset.py --delete --out /tmp/streaming_dataset/

# Container image reference for the run.
image: mosaicml/composer:latest
scheduling:
  resumable: true
  priority: medium
integrations:
  # Git checkout of mosaicml/streaming at branch main, cloned without SSH.
  # The command's `cd streaming` presumably enters this checkout - confirm.
  - integration_type: git_repo
    git_repo: mosaicml/streaming
    git_branch: main
    ssh_clone: false
