#!/bin/bash
#SBATCH -A group
#SBATCH --partition=nvidia-A100
#SBATCH --qos=normal
#SBATCH --nodes=2
#SBATCH -c 128
#SBATCH --time=15-22:00:00
#SBATCH --chdir=./job_log
#SBATCH --gres=gpu:8

# Enter the project directory. The path is relative, so it is resolved against
# the job's working directory. Abort on failure so nothing below runs from the
# wrong location.
cd xx/verl || { echo "ERROR: cannot cd to xx/verl" >&2; exit 1; }

# Load the runtime environment (conda). Each step is checked explicitly so a
# broken environment stops the job instead of launching training without it.
source ~/Software/miniconda3/etc/profile.d/conda.sh \
  || { echo "ERROR: cannot source conda.sh" >&2; exit 1; }
conda activate deeptool \
  || { echo "ERROR: cannot activate conda env 'deeptool'" >&2; exit 1; }

# NCCL / tokenizer settings inherited by the processes launched via srun.
export NCCL_SOCKET_IFNAME=bond0
export NCCL_IB_HCA=mlx5_2
# Was "INFOf", which is not a documented NCCL_DEBUG level.
export NCCL_DEBUG=INFO
export TOKENIZERS_PARALLELISM=false
echo "!!!!!! BEGIN TO CONVERT !!!!!!"

# Resolve the address of the first node in the allocation.
ALL_NODES=$(scontrol show hostnames "$SLURM_NODELIST")
FIRST_NODE=$(echo "$ALL_NODES" | head -n 1)
# Keep only the first record in case the lookup returns several lines.
NODE_0_ADDR=$(getent hosts "$FIRST_NODE" | awk '{print $1; exit}')
if [[ -z "$NODE_0_ADDR" ]]; then
  echo "WARNING: could not resolve an address for '$FIRST_NODE'" >&2
fi
# NOTE(review): exported so the srun-launched processes can read it; the
# original left it unexported, which made it invisible to them. Confirm that
# train_32b_grpo.sh expects this name.
export NODE_0_ADDR

# NOTE(review): placeholder credential kept as-is; prefer supplying the real
# key through the submission environment or a protected file, not this script.
export SWANLAB_API_KEY=xxx

echo "ALL_NODES"
# Print the node list on a single line (newlines replaced by spaces).
echo "${ALL_NODES//$'\n'/ }"

echo "FIRST_NODE"
echo "$FIRST_NODE"

echo "NODE_0_ADDR"
echo "$NODE_0_ADDR"

train_grpo="bash train_32b_grpo.sh"
# Launch the training script through srun and remember its exit status.
train_status=0
srun bash -c "$train_grpo" || train_status=$?

# End-of-job notification; report failure on stderr and propagate the status
# so the scheduler does not record a failed run as successful.
if (( train_status == 0 )); then
  echo "Serving job completed at $(date)"
else
  echo "Serving job FAILED with exit status ${train_status} at $(date)" >&2
fi
exit "$train_status"