#!/bin/bash -l
#SBATCH --job-name=Training033101ABF # Job name
#SBATCH --output=Training033101ABF.o%j # Name of stdout output file
#SBATCH --error=Training033101ABF.e%j # Name of stderr error file
#SBATCH --partition=standard-g # partition name
#SBATCH --nodes=1 # Total number of nodes
#SBATCH --ntasks-per-node=1 # One launcher task per node (torchrun starts the 8 worker processes)
#SBATCH --gpus-per-node=8 # Allocate 8 GPUs on the node
#SBATCH --time=6:30:00 # Run time limit: 6 h 30 min (format [d-]hh:mm:ss)
#SBATCH --account=project_xxxxxx # Project for billing

# Generate ./select_gpu in the current directory: a small wrapper that sets
# ROCR_VISIBLE_DEVICES to the task's SLURM_LOCALID and then exec's the command
# it was given.
#
# The heredoc delimiter is quoted so the body is written literally (no
# expansion at generation time), and the wrapper forwards its arguments with
# "$@" so arguments containing spaces or glob characters reach the command
# unchanged.
#
# NOTE(review): the launch command in this script does not invoke ./select_gpu;
# the file is only generated here. Confirm whether it is still needed.
cat << 'EOF' > select_gpu
#!/bin/bash

export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID
exec "$@"
EOF
# Make the wrapper runnable as "./select_gpu <command> [args...]".
chmod +x select_gpu

# Positional arguments (both required):
#   $1  CONTEXT_LEN  - inserted into the config and log file names ("<N>ctl")
#   $2  TOKENS_USED  - inserted into the config and log file names ("<N>M")
# Fail early instead of launching with a malformed config path.
if [[ $# -lt 2 || -z "$1" || -z "$2" ]]; then
  printf 'Usage: sbatch <this_script> CONTEXT_LEN TOKENS_USED\n' >&2
  exit 2
fi

CONTEXT_LEN=$1
TOKENS_USED=$2

# Training config passed to train.py, and the file that receives the job
# step's stdout (stderr is not redirected here).
CONFIG_FILE="config/LONGCONTEXT_train_gpt2_033101ver${CONTEXT_LEN}ctl_100pct_${TOKENS_USED}M_small.py"
LOG_FILE="Training033101ABF_longcontext_small_${CONTEXT_LEN}ctl_${TOKENS_USED}M_ABF.txt"

# NOTE(review): this map lists eight CPU ids, but the job header requests a
# single task per node, so presumably only the first id is applied to the
# torchrun launcher (and inherited by its workers) - confirm the resulting
# CPU binding is what is intended.
CPU_BIND="map_cpu:49,57,17,25,1,9,33,41"
export MPICH_GPU_SUPPORT_ENABLED=1

# Activate the Python environment; without it the launch below cannot work,
# so stop here rather than fail later with a less obvious error.
source /scratch/project_xxxxxx/venv/bin/activate || {
  printf 'error: could not activate /scratch/project_xxxxxx/venv\n' >&2
  exit 1
}
# source /scratch/project_xxxxxx/LLM_DID/test2/bin/activate

# OMP_NUM_THREADS=8
# export OMP_NUM_THREADS=8

# One srun task runs torchrun, which starts 8 local worker processes.
# srun is the last command, so its exit status is the script's exit status.
srun --cpu-bind="${CPU_BIND}" \
  torchrun --standalone --nproc_per_node=8 \
  train.py --compile=False "${CONFIG_FILE}" \
  > "${LOG_FILE}"