#!/bin/bash
# Slurm batch script: for every node count in NUM_NODES_LIST, every model in
# MODEL_LIST and every training config in TRAIN_CONFIG_LIST, exports the
# per-run settings and runs ${SCRIPT_DIR}/starter.sh, writing one log per run.
#
# NOTE(review): the allocation below requests --nodes=2, while NUM_NODES_LIST
# further down is (4), so RUNNING_NODES is exported as 4 -- confirm which value
# is intended.
# NOTE(review): the job name says "8xH100Nodes", the -o path points under
# tasks/8xA100Nodes, and OUTPUT_BASE_DIR points under tasks/8_8xH100Nodes --
# confirm the batch output is meant to land in a different task directory.
#
# Keep the #SBATCH lines ahead of the first executable command; only comment
# and blank lines may precede them.
#SBATCH -J 8xH100Nodes
#SBATCH -p gpu-debug
#SBATCH -A r01156
#SBATCH -o /N/slate/jindjia/bash_scripts/bytedance2/icml-performance-bytedance/tasks/8xA100Nodes/batch_output_%j.txt
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-task=60
#SBATCH --mem=240g
#SBATCH --time=01:00:00
#SBATCH --mail-type=ALL

# Echo every command as it executes so the batch output shows what ran.
set -x

# Settings exported for child processes (the consumers are not visible in this
# script; presumably starter.sh and the files it sources -- confirm).
export MEGATRON_PATH="/N/slate/jindjia/RepeatComm/dev/Fast-Slow-performance"

export LOG_INTERVAL=1
export EXIT_INTERVAL=40
export WANDB_PROJECT=icml-performance-test
export SCRIPT_DIR=/N/slate/jindjia/bash_scripts/bytedance2/icml-performance-bytedance
export OUTPUT_BASE_DIR=/N/slate/jindjia/bash_scripts/bytedance2/icml-performance-bytedance/tasks/8_8xH100Nodes/output_dir-acc32-setting1

# ----------------- Number of GPUs  -----------------

# Node counts the sweep iterates over; each entry becomes RUNNING_NODES.
NUM_NODES_LIST=(4)
export RUNNING_GPUS_PER_NODE=8


# ----------------- Start srun script -----------------

# Pass OMP_NUM_THREADS through to child processes only when the caller supplied
# a value. A plain `export OMP_NUM_THREADS=$OMP_NUM_THREADS` would export an
# EMPTY string when the variable is unset, which is not a valid thread count
# and also hides the "variable is unset" case from any launcher that would
# otherwise pick its own default.
if [[ -n "${OMP_NUM_THREADS:-}" ]]; then
    export OMP_NUM_THREADS
else
    unset OMP_NUM_THREADS
fi
export SSL_CERT_FILE=/N/slate/jindjia/cacert.pem # for wandb login with singularity, I encountered a problem, you may not need this


# Record the GPU state of the batch node in the job output.
nvidia-smi

# ----------------- Model and Training Config -----------------
# Models to sweep; each name selects ${SCRIPT_DIR}/model-cards/<name>.sh.
MODEL_LIST=("1_3B" "2_7B" "6_7B" "13B" "18B")

# Training configs to sweep; each name selects
# ${SCRIPT_DIR}/training-config/<name>.sh.
TRAIN_CONFIG_LIST=("SDP4Bit" "DUO4Bit")

# Exported for child processes (consumer not visible in this script).
ACCUMULATION_STEP=32
export ACCUMULATION_STEP

# ----------------- Run Sweep -----------------
# For every (node count, model, training config) combination: export the
# per-run settings, create the run's output directory, and run starter.sh with
# stdout and stderr captured in a per-run log file. A failing run never stops
# the sweep; failures are counted and reported on stderr.

failed_runs=0

for num_nodes in "${NUM_NODES_LIST[@]}"; do
    export RUNNING_NODES=$num_nodes

    # SLURM_JOB_NUM_NODES is set by Slurm to the size of the allocation. Warn
    # (without aborting) when the sweep asks for more nodes than were granted.
    if [[ -n "${SLURM_JOB_NUM_NODES:-}" ]] && (( num_nodes > SLURM_JOB_NUM_NODES )); then
        echo "WARNING: RUNNING_NODES=$num_nodes exceeds the $SLURM_JOB_NUM_NODES node(s) allocated to this job" >&2
    fi

    for model in "${MODEL_LIST[@]}"; do
        MODEL_NAME=$model
        for train_config_name in "${TRAIN_CONFIG_LIST[@]}"; do
            echo "Running $MODEL_NAME, $train_config_name on $RUNNING_NODES nodes"
            TRAIN_CONFIG_NAME=$train_config_name

            # NOTE(review): the run name does not include the model name, so
            # all models share one name per training config -- confirm intended.
            export WANDB_NAME=${TRAIN_CONFIG_NAME}_${RUNNING_NODES}_NODES
            export MODEL_ARG_PATH=${SCRIPT_DIR}/model-cards/${MODEL_NAME}.sh
            export TRAINING_ARG_PATH=${SCRIPT_DIR}/training-config/${TRAIN_CONFIG_NAME}.sh
            # SLURM_JOB_ID keeps the outputs of different submissions apart.
            export OUTPUT_DIR=${OUTPUT_BASE_DIR}/${SLURM_JOB_ID}/${RUNNING_NODES}_NODES/${MODEL_NAME}/${TRAIN_CONFIG_NAME}
            export TENSORBOARD_DIR=${OUTPUT_DIR}/tensorboard
            export WANDB_DIR=${OUTPUT_DIR}/wandb

            # Without the directory the log redirection below cannot be opened,
            # so report the problem and move on to the next combination.
            if ! mkdir -p -- "$OUTPUT_DIR"; then
                echo "ERROR: cannot create $OUTPUT_DIR; skipping $MODEL_NAME / $TRAIN_CONFIG_NAME" >&2
                failed_runs=$((failed_runs + 1))
                continue
            fi

            bash "${SCRIPT_DIR}/starter.sh" > "${OUTPUT_DIR}/${TRAIN_CONFIG_NAME}_logfile.log" 2>&1
            run_status=$?
            if (( run_status != 0 )); then
                echo "ERROR: starter.sh exited with status $run_status for $MODEL_NAME / $TRAIN_CONFIG_NAME on $RUNNING_NODES nodes (log: ${OUTPUT_DIR}/${TRAIN_CONFIG_NAME}_logfile.log)" >&2
                failed_runs=$((failed_runs + 1))
            fi
        done
    done

done

# Finish with a non-zero status when any run failed (not only the last one),
# without calling `exit`, so any code placed after this section still runs.
(( failed_runs == 0 ))
