#!/bin/bash
# Launches `python3 -m openrlhf.cli.train_ppo_ray` once for every checkpoint
# tag in arg_list, pointing --pretrain at that tag's global_step78_hf
# directory and capturing stdout/stderr in a per-tag log under log_eval/.
#
# Resumability: a tag is skipped when its final log file already exists.
# Output is first written to "<log>.partial" and only renamed to the final
# log name when the run exits with status 0; a failing run's output is kept
# as "<log>.failed". Failed or interrupted runs are therefore retried on the
# next invocation instead of being mistaken for completed ones.
set -x

# Checkpoint tags to loop over.
arg_list=("ckpt0" "ckpt170" "ckpt171" "ckpt172" "ckpt230" "ckpt380" "ckpt433" "ckpt_rlloss" "ckpt_target" "1B_best_ckpt" "1b_poor_ckpt" "1b_best2_ckpt")

# Directory receiving the per-tag logs. It must exist before the first run:
# otherwise the output redirection fails and the command is never started.
log_dir="log_eval"
mkdir -p "$log_dir" || { echo "cannot create log directory: $log_dir" >&2; exit 1; }

# Loop-invariant, so exported once instead of on every iteration.
export VLLM_ALLOW_INSECURE_SERIALIZATION=1

# Tags whose run exited non-zero; reported after the loop.
failed_tags=()

for arg1 in "${arg_list[@]}"; do
  ckpt_path="/root/work/filestorage/zhanglianlian/Memory/OpenRLHF-0.8.1.post1/ckpt/1B/train_grpo_model_${arg1}/global_step78_hf"
  # Alternative checkpoint layout, kept for reference:
  #ckpt_path="/root/work/filestorage/zhanglianlian/Memory/OpenRLHF-0.8.1.post1/ckpt/1B/train_grpo_model_${arg1}/${arg1}"

  # NOTE: "predcit" is a typo for "predict"; it is preserved on purpose so
  # that logs written by earlier invocations still match the skip check.
  log_file="${log_dir}/predcit_${arg1}.log"
  partial_log="${log_file}.partial"

  # Skip tags that already have a completed log.
  if [[ -f "$log_file" ]]; then
    echo "跳过已完成的任务: $arg1"
    continue
  fi

  if python3 -m openrlhf.cli.train_ppo_ray \
     --ref_num_nodes 1 \
     --ref_num_gpus_per_node 2 \
     --reward_num_nodes 1 \
     --reward_num_gpus_per_node 2 \
     --actor_num_nodes 1 \
     --actor_num_gpus_per_node 2 \
     --vllm_num_engines 1 \
     --vllm_tensor_parallel_size 2 \
     --colocate_all_models \
     --vllm_gpu_memory_utilization 0.6 \
     --init_kl_coef 1e-3 \
     --gamma 1.0 \
     --use_kl_loss \
     --kl_estimator k3 \
     --advantage_estimator group_norm \
     --pretrain "$ckpt_path" \
     --remote_rm_url  reward_func.py \
     --save_path "ckpt/ckpt${arg1}" \
     --ckpt_path "ckpt/ckpt${arg1}" \
     --save_hf_ckpt \
     --micro_train_batch_size 16 \
     --train_batch_size 64 \
     --micro_rollout_batch_size 16 \
     --rollout_batch_size 64 \
     --n_samples_per_prompt 128 \
     --max_epochs 1 \
     --prompt_max_len 1024 \
     --max_samples 2000 \
     --generate_max_len 1024 \
     --zero_stage 3 \
     --bf16 \
     --actor_learning_rate 5e-7 \
     --critic_learning_rate 9e-6 \
     --prompt_data math_python_squad \
     --input_key question \
     --label_key label \
     --apply_chat_template \
     --normalize_reward \
     --gradient_checkpointing \
     --packing_samples \
     --vllm_sync_backend nccl \
     --enforce_eager \
     --vllm_enable_sleep \
     --deepspeed_enable_sleep > "$partial_log" 2>&1; then
    # Success: publish the log under its final name, which marks the tag done.
    mv -f -- "$partial_log" "$log_file"
  else
    # Failure: keep the output for inspection but leave the final log name
    # absent so the tag is retried on the next invocation.
    mv -f -- "$partial_log" "${log_file}.failed"
    failed_tags+=("$arg1")
    echo "run failed for ${arg1}; see ${log_file}.failed" >&2
  fi

  # Pause between consecutive runs (same delay as before, success or failure).
  sleep 30
done

# Summarize failures on stderr; the script's exit status is left unchanged.
if (( ${#failed_tags[@]} > 0 )); then
  echo "failed tags: ${failed_tags[*]}" >&2
fi

#     --reward_pretrain /root/work/filestorage/zhanglianlian/Model/Qwen3-0.6B/ \
