# ---- Path configuration (all paths are relative to the launch directory) ----
# verl working directory.
verl_workdir="verl"
# Parquet datasets handed to the trainer as data.train_files / data.val_files.
train_files="data/grpo/train.parquet"
val_files="data/grpo/test.parquet"
# Output root: passed as trainer.default_local_dir, and also the parent of
# the Hydra outputs/ directory and of run.log.
save_path="grpo_model/qwen3-8b-tools-0714-grpo-multi-node"
# Passed as trainer.rollout_data_dir.
rollout_save_path="$save_path/rollout"

# ---- Node discovery ----
# MASTER_IP: address of the first host in the SLURM allocation; that node is
#            the one that will start the Ray head.
# CURR_IP:   address of the node running this script, as printed by
#            get_host_ip.py (presumably a single IP on stdout -- TODO confirm).
ALL_NODES=$(scontrol show hostnames "$SLURM_NODELIST")
FIRST_NODE=$(printf '%s\n' "$ALL_NODES" | head -n 1)
# getent can print one line per address for a multi-address host; keep only
# the first so MASTER_IP never contains an embedded newline.
NODE_0_ADDR=$(getent hosts "$FIRST_NODE" | awk 'NR == 1 { print $1 }')
MASTER_IP=$NODE_0_ADDR
# The helper lives in the verl working directory (defaults to "verl").
CURR_IP=$(python "${verl_workdir:-verl}/get_host_ip.py")
echo "MASTER_IP=$MASTER_IP"
echo "CURR_IP=$CURR_IP"
# Fail fast when either address is missing: two empty strings would compare
# equal and make every node start its own Ray head, and an empty MASTER_IP
# would make workers try to join ":6379".
if [ -z "$MASTER_IP" ] || [ -z "$CURR_IP" ]; then
  echo "ERROR: could not determine MASTER_IP and/or CURR_IP; aborting" >&2
  exit 1
fi
# ---- Role selection ----
# The node whose address equals MASTER_IP starts the Ray head, submits the
# training job and tears the cluster down afterwards; every other node joins
# the head as a blocking worker.
if [ "$MASTER_IP" = "$CURR_IP" ]; then
  echo "########### ray start ###########"
  ray start --include-dashboard=True --head --num-gpus 8 --max-worker-port 12800 --runtime-env-agent-port 20100 --dashboard-agent-grpc-port 20101 --dashboard-agent-listen-port 20102 --metrics-export-port 20103
  # Fixed grace period so worker nodes can join before the job is submitted.
  sleep 50s
  ray status

  # tee cannot create run.log unless its parent directory already exists.
  mkdir -p "${save_path}" \
    || echo "WARNING: could not create ${save_path}; run.log will be missing" >&2

  echo "########### job submit ###########"
  # A plain `cmd | tee` reports tee's exit status, which hides a failed job.
  # Portable capture (works in sh and bash): fd 4 keeps the real stdout for
  # tee, fd 3 carries the job's exit status back into $(...).
  #
  # NOTE(review): actor_rollout_ref.model.path is the literal string
  #   "sft_model_path" -- looks like a placeholder; confirm the intended path.
  # NOTE(review): trainer.nnodes=4 and the GPU counts are hard-coded; confirm
  #   they match the SLURM allocation.
  # trainer.logger is single-quoted in the exact form the shell used to pass
  # it, so the [...] can no longer be treated as a glob pattern.
  exec 4>&1
  job_rc=$(
    {
      {
        ray job submit --address="http://127.0.0.1:8265" \
          -- python3 -m verl.trainer.main_ppo \
          algorithm.adv_estimator=grpo \
          data.train_files="$train_files" \
          data.val_files="$val_files" \
          data.prompt_key=question \
          data.train_batch_size=128 \
          data.max_prompt_length=1024 \
          data.max_response_length=15000 \
          data.filter_overlong_prompts=True \
          data.truncation='error' \
          actor_rollout_ref.model.path=sft_model_path \
          actor_rollout_ref.actor.optim.lr=1e-6 \
          actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
          actor_rollout_ref.actor.optim.warmup_style=cosine \
          actor_rollout_ref.model.use_remove_padding=True \
          actor_rollout_ref.actor.ppo_mini_batch_size=16 \
          actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
          actor_rollout_ref.actor.use_kl_loss=True \
          actor_rollout_ref.actor.kl_loss_coef=0.001 \
          actor_rollout_ref.actor.kl_loss_type=low_var_kl \
          actor_rollout_ref.actor.entropy_coeff=0 \
          actor_rollout_ref.model.enable_gradient_checkpointing=True \
          actor_rollout_ref.actor.fsdp_config.param_offload=False \
          actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
          actor_rollout_ref.rollout.mode=sync-search \
          actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
          actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
          actor_rollout_ref.rollout.name=vllm \
          actor_rollout_ref.rollout.max_num_batched_tokens=18000 \
          actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
          actor_rollout_ref.rollout.n=4 \
          actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
          actor_rollout_ref.ref.fsdp_config.param_offload=True \
          algorithm.use_kl_in_reward=False \
          reward_model.reward_manager=re_search \
          trainer.critic_warmup=0 \
          'trainer.logger=[console,swanlab]' \
          trainer.project_name='verl_grpo_example_tools' \
          trainer.experiment_name='qwen3_8b_tools_rm_0714_grpo_multi_node' \
          trainer.n_gpus_per_node=8 \
          trainer.val_before_train=True \
          trainer.nnodes=4 \
          trainer.save_freq=20 \
          trainer.test_freq=20 \
          trainer.total_epochs=1 \
          trainer.default_local_dir="${save_path}" \
          trainer.rollout_data_dir="${rollout_save_path}" \
          hydra.run.dir="${save_path}/outputs" 3>&- 4>&-
        # Report the job's own exit status on fd 3 (captured by $(...)).
        echo "$?" >&3
      } | tee "${save_path}/run.log" >&4 3>&- 4>&-
    } 3>&1
  )
  exec 4>&-

  echo 'job done, now shutdown ray cluster'
  ray stop --force

  # Surface a failed training job instead of silently reporting success.
  if [ "${job_rc:-1}" -ne 0 ]; then
    echo "ERROR: ray job submit exited with status ${job_rc:-unknown}" >&2
    exit "${job_rc:-1}"
  fi
else
  # Give the head node a moment to start before trying to join it.
  sleep 10s
  # The head is started without an explicit --port, so 6379 here relies on
  # Ray's default head port. --block keeps this process in the foreground.
  ray start --address "${MASTER_IP}:6379" --num-gpus 8 --max-worker-port 12800 --runtime-env-agent-port 20100 --dashboard-agent-grpc-port 20101 --dashboard-agent-listen-port 20102 --metrics-export-port 20103 --block
fi