#!/bin/bash
#
# Sequentially launches one openrlhf.cli.train_ppo_ray training job per SFT
# checkpoint named in arg_list, then collects the model/tokenizer files that
# are left in the job's save directory into a "global_step78_hf" subdirectory.
#
# Must be started from the directory that holds (or should hold) the relative
# "log/" and "ckpt/" directories; every path below other than the checkpoint
# root and the reward model is relative to the current working directory.
#
# A checkpoint is skipped when its log file already exists. NOTE: the log file
# is created as soon as the training command is started, so an interrupted or
# failed run is skipped on the next invocation as well; delete the log file to
# retry that checkpoint.

# Fail loudly on typos in variable names instead of expanding them to "".
set -u

# Checkpoint directory names (under sft_ckpt_root) to train from, in order.
arg_list=("ckpt0" "ckpt170" "ckpt171" "ckpt172" "ckpt230" "ckpt380" "ckpt433" "ckpt_rlloss" "ckpt_target" "1B_best_ckpt" "1b_poor_ckpt" "1b_best2_ckpt")

# Directory containing one subdirectory per entry of arg_list.
readonly sft_ckpt_root="/root/work/filestorage/zhanglianlian/Memory/OpenRLHF-0.8.1.post1/sft_ckpt"

# Directory receiving one log file per checkpoint.
readonly log_dir="log"

# Subdirectory (inside each save directory) that receives the files below.
# NOTE(review): "78" presumably matches the last training step for
# --max_samples 10000 / --rollout_batch_size 128 -- confirm before changing
# either of those flags.
readonly hf_subdir="global_step78_hf"

# Files expected at the top level of the save directory after training.
# NOTE(review): presumably the final model written to --save_path -- confirm.
hf_files=(
  added_tokens.json
  config.json
  generation_config.json
  merges.txt
  model.safetensors
  special_tokens_map.json
  tokenizer.json
  tokenizer_config.json
  vocab.json
)

# Exported once for every training process started below (loop-invariant).
export VLLM_ALLOW_INSECURE_SERIALIZATION=1

# Without the log directory the output redirection fails and no job can run.
if ! mkdir -p -- "$log_dir"; then
  echo "cannot create log directory: $log_dir" >&2
  exit 1
fi

for arg1 in "${arg_list[@]}"; do
  ckpt_path="$sft_ckpt_root/$arg1"
  save_dir="ckpt/1B/train_grpo_model_$arg1"
  log_file="$log_dir/1B_train_grpo_model_${arg1}.log"

  # An existing log file marks this checkpoint as already handled; skip it.
  if [[ -f "$log_file" ]]; then
    echo "跳过已完成的任务: $arg1"
    continue
  fi

  # stdout and stderr of the whole training run go to the per-checkpoint log.
  python3 -m openrlhf.cli.train_ppo_ray \
    --ref_num_nodes 1 \
    --ref_num_gpus_per_node 2 \
    --reward_num_nodes 1 \
    --reward_num_gpus_per_node 2 \
    --actor_num_nodes 1 \
    --actor_num_gpus_per_node 2 \
    --vllm_num_engines 1 \
    --vllm_tensor_parallel_size 2 \
    --colocate_all_models \
    --vllm_gpu_memory_utilization 0.6 \
    --init_kl_coef 1e-3 \
    --gamma 1.0 \
    --use_kl_loss \
    --kl_estimator k3 \
    --advantage_estimator group_norm \
    --pretrain "$ckpt_path" \
    --reward_pretrain /root/work/filestorage/zhanglianlian/Model/Qwen-3B \
    --save_path "$save_dir" \
    --ckpt_path "$save_dir" \
    --save_steps 10 \
    --save_hf_ckpt \
    --micro_train_batch_size 8 \
    --train_batch_size 128 \
    --micro_rollout_batch_size 16 \
    --rollout_batch_size 128 \
    --n_samples_per_prompt 8 \
    --max_epochs 1 \
    --prompt_max_len 1024 \
    --max_samples 10000 \
    --generate_max_len 1024 \
    --zero_stage 3 \
    --bf16 \
    --actor_learning_rate 5e-7 \
    --critic_learning_rate 9e-6 \
    --prompt_data math_python_squad/sft1 \
    --input_key question \
    --label_key label \
    --apply_chat_template \
    --normalize_reward \
    --gradient_checkpointing \
    --packing_samples \
    --vllm_sync_backend nccl \
    --enforce_eager \
    --vllm_enable_sleep \
    --deepspeed_enable_sleep > "$log_file" 2>&1
  train_status=$?

  if (( train_status != 0 )); then
    # Keep going with the remaining checkpoints, but do not try to move
    # files from a run that did not finish.
    echo "training failed for $arg1 (exit $train_status); see $log_file" >&2
  else
    # Run the move in a subshell: the working directory of the main loop is
    # never changed, so a failed cd cannot redirect mkdir/mv to the wrong
    # place or break the relative paths used by later iterations.
    (
      cd -- "$save_dir" || exit 1
      mkdir -p -- "$hf_subdir" || exit 1
      mv -- "${hf_files[@]}" "$hf_subdir/"
    ) || echo "could not collect model files for $arg1 in $save_dir/$hf_subdir" >&2
  fi

  # Pause between jobs before starting the next one.
  sleep 30
done
