#!/bin/bash
#SBATCH -A m4416_g
#SBATCH -C gpu
#SBATCH -q premium
#SBATCH -t 24:00:00
#SBATCH -n 1024
#SBATCH --ntasks-per-node=4
#SBATCH -c 32   ##### 2 * [64/ntasks-per-node]
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=map_gpu:0,1,2,3


# Ask srun to bind each task to cores. Slurm only honours the upper-case
# SLURM_CPU_BIND input variable; environment variable names are case-sensitive.
export SLURM_CPU_BIND="cores"

# Number of Dask workers passed to the training driver. Follow the task count
# of the allocation so it cannot drift from the "#SBATCH -n" header; fall back
# to the header value when SLURM_NTASKS is not set.
number_of_workers="${SLURM_NTASKS:-1024}"

# glibc reads MALLOC_TRIM_THRESHOLD_ (upper case, trailing underscore) from the
# environment, so it must be exported for the processes launched by this
# script to see it.
export MALLOC_TRIM_THRESHOLD_=0

module load python/3.11

# Stop here if the virtualenv cannot be activated: continuing would launch the
# whole allocation with the wrong Python environment.
source ./3denv/bin/activate || {
  echo "error: could not activate ./3denv/bin/activate" >&2
  exit 1
}


# Log the nodes of this allocation. Slurm exports the list as
# SLURM_JOB_NODELIST (upper case). The expansion is quoted because a compressed
# nodelist contains brackets, which the shell would otherwise treat as a glob.
echo "we have nodes: ${SLURM_JOB_NODELIST:-}"

# NOTE(review): sdn_ip_addr is not assigned anywhere in the visible script, so
# this prints an empty line unless the site exports it under exactly this
# name - confirm the intended variable name.
echo "$sdn_ip_addr"

# The scheduler listens on this node (short hostname) and a fixed port; the
# workers and the training driver connect to "${hn}:${port}".
hn=$(hostname -s)
port="8786"
echo "${port}"
echo "starting scheduler"

# Dask maps DASK_<SECTION>__<KEY> environment variables onto its configuration
# tree. They are exported so the scheduler, the workers started through srun,
# and the client all inherit the same settings.
export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT=3600s
export DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP=3600s
export DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING=True
export DASK_DISTRIBUTED__SCHEDULER__WORKER_SATURATION=10000

# Run the scheduler in the background so the script can go on to start the
# workers and the client.
dask scheduler \
    --preload climate_kernel.py \
    --host "${hn}" --port "${port}" &


echo "starting workers"

# Launch the Dask workers through srun in the background, each with a single
# worker process and a single thread; srun's output goes to
# dask_worker_info.txt. No per-command DASK_* prefix is needed here: the comm
# timeouts are exported earlier in the script and inherited by srun.
srun -o dask_worker_info.txt dask-worker --memory-limit="30 GiB" \
    --preload climate_kernel.py "${hn}:${port}" \
    --nworkers 1 \
    --nthreads 1 &

# Client stage: print the effective Dask configuration to the job log, then
# run the training driver in the foreground with unbuffered output, handing it
# the scheduler address and the worker count.
printf '%s\n' "starting gp2Scale"
python -c "import dask; print(dask.config.config)"
scheduler_address="${hn}:${port}"
python -u climateGPUtraining.py "${scheduler_address}" "${number_of_workers}"

