#!/usr/bin/env bash
#
# Restart a local single-node Ray cluster and submit an OpenRLHF PPO training
# job (`openrlhf.cli.train_ppo_ray`) to it through the Ray job-submission CLI.
#
# Usage: run this file directly; it takes no arguments.
#
# NOTE(review): the values `workpath`, `qwen_model_path`, `save_path`,
# `ckpt_path`, `tensorboard_path` and `nvidia_helpsteer3_20k` look like
# placeholders -- replace them with real paths / dataset names before running.
# NOTE(review): `--remote_rm_url` points at http://localhost:5000/get_reward,
# so a reward service is presumably expected to be listening there already;
# this script does not start one.

# Abort on the first unhandled failure and on use of an unset variable, so a
# broken cluster start-up never falls through to the job submission.
set -eu
# Trace every command as it runs (kept from the original script).
set -x

# Best-effort cleanup of any previous local Ray processes. A non-zero status
# here is deliberately ignored: the `ray start` below is the real requirement
# and will fail loudly if a stale instance is still in the way.
ray stop || true

# Exported so that the Ray processes started below inherit it.
# NOTE(review): presumably stops Ray from rewriting CUDA_VISIBLE_DEVICES for
# its workers -- confirm against the Ray documentation for this variable.
export RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES=1

# Start the head node advertising 5 GPUs, with the dashboard bound to all
# interfaces.
# NOTE(review): 5 appears to match the job's request below (2 for the
# colocated actor/ref, 1 for the critic, 2 single-GPU vLLM engines) -- keep
# the two in sync when changing either.
ray start --head --node-ip-address 0.0.0.0 --num-gpus 5 --dashboard-host=0.0.0.0 || {
   printf 'error: ray start failed; not submitting the training job\n' >&2
   exit 1
}

# Submit the training job to the local cluster. RAY_ADDRESS is set only for
# this one command. Everything after the bare `--` is the entrypoint command
# line executed by the job; the flags are passed through unchanged.
RAY_ADDRESS='http://127.0.0.1:8265' ray job submit \
   --working-dir workpath \
   --runtime-env-json='{"setup_commands": ["pip install openrlhf[vllm]"]}' \
   -- python -m openrlhf.cli.train_ppo_ray \
   --ref_num_nodes 1 \
   --ref_num_gpus_per_node 2 \
   --critic_num_nodes 1 \
   --critic_num_gpus_per_node 1 \
   --actor_num_nodes 1 \
   --actor_num_gpus_per_node 2 \
   --vllm_num_engines 2 \
   --vllm_tensor_parallel_size 1 \
   --colocate_actor_ref \
   --pretrain  qwen_model_path \
   --remote_rm_url http://localhost:5000/get_reward \
   --save_path save_path \
   --micro_train_batch_size 8 \
   --train_batch_size 128 \
   --micro_rollout_batch_size 16 \
   --rollout_batch_size 1024 \
   --max_samples 20000 \
   --num_episodes 8 \
   --max_epochs 1 \
   --prompt_max_len 1024 \
   --generate_max_len 1024 \
   --zero_stage 3 \
   --bf16 \
   --actor_learning_rate 5e-7 \
   --critic_learning_rate 9e-6 \
   --init_kl_coef 0.01 \
   --prompt_data nvidia_helpsteer3_20k \
   --input_key instruction \
   --chosen_label_key chosen \
   --reject_label_key rejected \
   --apply_chat_template \
   --normalize_reward \
   --packing_samples \
   --adam_offload \
   --flash_attn \
   --gradient_checkpointing \
   --prompt_split train \
   --save_steps -1 \
   --ckpt_path ckpt_path \
   --load_checkpoint \
   --use_tensorboard tensorboard_path