#!/bin/bash
#
# Launch train.py on a single node through torch.distributed.run.
#
# Usage: <this-script> NUM_PROC [train.py args...]
#   NUM_PROC   value handed to --nproc_per_node
#   remaining  arguments are forwarded verbatim to train.py
#
# Exit status: 2 on a usage error, otherwise the exit status of the
# torch.distributed.run launcher.

if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s NUM_PROC [train.py args...]\n' "${0##*/}" >&2
  exit 2
fi

NUM_PROC=$1
shift  # drop NUM_PROC so "$@" holds only the train.py arguments

# Threading/runtime settings exported to every worker process.
export OMP_NUM_THREADS=18
export MKL_NUM_THREADS=18
export LD_PRELOAD=/opt/conda/lib/libiomp5.so
export OMP_SCHEDULE=STATIC
export KMP_BLOCKTIME=1

# Capture the launcher's status instead of aborting, so the cleanup below
# always runs and the real result can still be reported to the caller.
status=0
python -m torch.distributed.run \
  --standalone \
  --nnodes=1 \
  --nproc_per_node="$NUM_PROC" \
  train.py "$@" || status=$?

# Cleanup of leftover workers. NOTE(review): this sends SIGKILL to every
# process named "python" that this user may signal, not only the ones started
# above - confirm that is acceptable on shared machines.
pkill -9 python

exit "$status"
