#!/bin/bash
# Environment setup for the LibMoE pretraining launcher.
#
# Exports the device selection, credentials placeholders and path variables
# used by the rest of this script, then derives NUM_GPUS (the number of
# worker processes to start on this node) from CUDA_VISIBLE_DEVICES.

export CUDA_VISIBLE_DEVICES="0,1,2,3"
# Both keys are exported as empty strings; fill them in if a run needs them.
export API_KEY=''
export KEY_HF=""
export CUDA_LAUNCH_BLOCKING=0
export HF_HOME="/cm/shared/anonymous/toolkitmoe/evaluate"
export TMPDIR="/cm/shared/anonymous/tmp"
export TOOLKIT_DIR="/cm/shared/anonymous"  # Path to the toolkitmoe directory
# Prepend the project directory. The ${VAR:+...} form only adds the ':'
# separator when PYTHONPATH already had a value, so no empty path entry
# (a trailing ':') is created when it was previously unset.
export PYTHONPATH="/cm/shared/anonymous_h100/LibMoE/moe_pretrain_model${PYTHONPATH:+:${PYTHONPATH}}"

# Split the comma-separated device list into an array (falls back to "0").
gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
IFS=',' read -ra GPULIST <<< "$gpu_list"

# One process per visible GPU. Counting the parsed array avoids a second,
# unquoted parse of the same string and is not thrown off by a trailing comma.
NUM_GPUS="${#GPULIST[@]}"

# Rendezvous settings handed to torch.distributed.run further down.

# Address of the master process; 127.0.0.1 keeps everything on this machine.
# Replace with the master host's IP when that is required.
MASTER_ADDR="127.0.0.1"

# Previously referenced launcher, kept for reference:
# bash /cm/shared/anonymous/toolkitmoe/scripts/train/run_train_all1.sh

# Port of the master process. It is exported so child processes can read it
# from the environment as well as from the command line.
export MASTER_PORT=12927
# Work from the directory that contains run.py. Abort if it is missing:
# continuing would launch training from whatever directory the caller was in.
cd /cm/shared/anonymous/LibMoE/moe_pretrain_model || {
    echo "ERROR: cannot cd to /cm/shared/anonymous/LibMoE/moe_pretrain_model" >&2
    exit 1
}

# NCCL tuning knobs for long-running jobs.
# The 7200 values are presumably seconds (i.e. 2 hours) — TODO confirm that
# the installed NCCL / PyTorch build actually reads NCCL_TIMEOUT and
# NCCL_SOCKET_TIMEOUT.
# NOTE(review): check NCCL_IB_TIMEOUT=30 against the range accepted by the
# NCCL version in use.
NCCL_TIMEOUT=7200          # general NCCL operation timeout
NCCL_DEBUG=INFO            # log verbosity; raise it when debugging
NCCL_IB_TIMEOUT=30         # InfiniBand timeout setting
NCCL_SOCKET_TIMEOUT=7200   # socket communication timeout
export NCCL_TIMEOUT NCCL_DEBUG NCCL_IB_TIMEOUT NCCL_SOCKET_TIMEOUT
# Launch training in the foreground and restart it whenever the launcher
# exits with a non-zero status; stop as soon as one run succeeds.
#
# Optional environment:
#   MAX_RESTARTS  Maximum number of failed attempts before giving up.
#                 Unset, 0 or non-numeric means "retry forever" (the
#                 historical behaviour of this script).
max_restarts="${MAX_RESTARTS:-0}"
case "$max_restarts" in
    ''|*[!0-9]*) max_restarts=0 ;;  # ignore anything that is not a plain integer
esac
failed_attempts=0

while true; do
    echo "Starting stage sft"
    # Test the launcher's exit status directly instead of going through $?,
    # and quote every expansion so the values cannot be word-split.
    if python -m torch.distributed.run \
        --nproc_per_node="$NUM_GPUS" \
        --master_addr="$MASTER_ADDR" \
        --master_port="$MASTER_PORT" \
        /cm/shared/anonymous/LibMoE/moe_pretrain_model/run.py \
        /cm/shared/anonymous/LibMoE/moe_pretrain_model/sweeps/slimpajama_moe_no_attmoe_1B_deepseek.yaml; then
        echo "Training completed successfully!"
        break
    fi

    failed_attempts=$((failed_attempts + 1))
    if [ "$max_restarts" -gt 0 ] && [ "$failed_attempts" -ge "$max_restarts" ]; then
        echo "Training failed. Giving up after ${failed_attempts} attempt(s)." >&2
        exit 1
    fi

    echo "Training failed. Restarting..."
    sleep 3  # short pause before the next attempt
done


# The loop above runs in the foreground and this block starts no background
# jobs, so on its own this wait returns immediately. It is kept so that any
# job backgrounded elsewhere in the script is still reaped before exit.
wait


#bash /cm/shared/anonymous/LibMoE/moe_pretrain_model/train.sh
