#!/bin/bash
#SBATCH -A m1517_m
#SBATCH -C gpu
#SBATCH -q regular  #### debug regular premium
#SBATCH -t 00:30:00
#SBATCH -n 256  #### 16 on debug
#SBATCH --ntasks-per-node=4
#SBATCH -c 32   ##### 2 * [64/ntasks-per-node]
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=map_gpu:0,1,2,3


# Ask Slurm to bind each task to cores. Environment variable names are
# case-sensitive and Slurm reads SLURM_CPU_BIND, so the name must be upper-case.
export SLURM_CPU_BIND="cores"

# File the scheduler writes its contact info to; the workers read it.
# Remove any stale copy so the startup wait loop cannot match an old file.
scheduler_file="${SCRATCH}/scheduler_file.json"
rm -f -- "$scheduler_file"

# glibc malloc tunable: it must be upper-case and exported so that the
# scheduler and worker processes launched below inherit it.
export MALLOC_TRIM_THRESHOLD_=0
module load python/3.11
source ./venv/bin/activate

# SLURM_JOB_NODELIST is set by Slurm inside an allocation.
echo "we have nodes: ${SLURM_JOB_NODELIST:-}"

# NOTE(review): presumably the site-provided SDN_IP_ADDR variable; falls back
# to the original lower-case name if that is what the environment defines.
echo "${SDN_IP_ADDR:-${sdn_ip_addr:-}}"



# Host and port reported at the end of the script for notebook clients.
hn=$(hostname -s)
port="8786"
echo "${port}"
echo "starting scheduler"
# Dask configuration via environment: long comm timeouts, no work stealing,
# and a very high worker-saturation factor.
export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT=3600s
export DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP=3600s
export DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING=False
export DASK_DISTRIBUTED__SCHEDULER__WORKER_SATURATION=10000

# Launch the Dask scheduler in the background on the hsn0 interface.
# It writes its address to $scheduler_file once it is up.
dask scheduler \
    --interface hsn0 \
    --preload climate_kernel.py \
    --scheduler-file "$scheduler_file" &

# PID of the background scheduler, used for liveness checks and shutdown.
dask_pid=$!

# Wait for the scheduler to start: poll for the scheduler file every 5 s.
# Abort if the scheduler process has already exited, otherwise a failed
# start would leave this loop spinning until the job's wall-time limit.
sleep 5
until [[ -f "$scheduler_file" ]]; do
  if ! kill -0 "$dask_pid" 2>/dev/null; then
    echo "dask scheduler (pid ${dask_pid}) exited before writing ${scheduler_file}" >&2
    exit 1
  fi
  sleep 5
done

echo "starting workers"
# The timeout overrides are a prefix of the srun command itself. There must be
# no blank line between them and srun: a blank line ends the continuation and
# turns them into a separate assignment-only command.
# srun is not backgrounded, so the script only continues past this point once
# the worker step has ended. Worker output goes to dask_worker_info.txt.
DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT=3600s \
DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP=3600s \
srun -o dask_worker_info.txt dask worker \
    --scheduler-file "$scheduler_file" \
    --interface hsn0 \
    --nworkers 1 \
    --preload climate_kernel.py



echo "checking dask config..."
python -c "import dask; print(dask.config.config)"

# printf interprets \n; bash's builtin echo prints it literally without -e.
printf '\nCopy these variables into your notebook to use the active scheduler:\n'
echo "hostname = ${hn}"
echo "port = ${port}"

# $'...' turns \n into a real newline in the prompt. -r keeps a backslash
# keypress from being treated as an escape character.
read -r -p $'\nPress any key to shutdown scheduler...' -n1 -s

echo "Killing scheduler"
# Ask the scheduler to exit with SIGTERM first, give it up to 10 s,
# and only fall back to SIGKILL if it is still running.
kill "$dask_pid" 2>/dev/null
for _ in {1..10}; do
  kill -0 "$dask_pid" 2>/dev/null || break
  sleep 1
done
if kill -0 "$dask_pid" 2>/dev/null; then
  kill -9 "$dask_pid"
fi
