#!/bin/bash
# CLPO V3 Training Script - Parameters aligned with CLPO V2
set -euo pipefail

# ========================
# Environment and system
# ========================
# Keep a caller-supplied CUDA_VISIBLE_DEVICES; when it is unset or empty,
# fall back to devices 0-7.
: "${CUDA_VISIBLE_DEVICES:=0,1,2,3,4,5,6,7}"
# HYDRA_FULL_ERROR=1 is exported for the Hydra-based trainer launched below.
export HYDRA_FULL_ERROR=1 CUDA_VISIBLE_DEVICES
# NOTE(review): a comment here used to read "enable IB network", but this
# script sets no InfiniBand-related variables.




# GPU detection
#######################################
# Print the number of GPUs this run should use (always an integer >= 1).
# Globals:   CUDA_VISIBLE_DEVICES (read, optional)
# Arguments: none
# Outputs:   the GPU count on stdout
# Returns:   0
#######################################
detect_num_gpus() {
  local detected=0
  local -a visible_ids=()

  if command -v nvidia-smi >/dev/null 2>&1; then
    # Count only "GPU N: ..." lines. `|| true` keeps a failing nvidia-smi (or
    # a zero-match grep) from aborting under `set -e`/`pipefail`, and avoids
    # emitting a second number the way `wc -l || echo 1` did.
    detected=$(nvidia-smi -L 2>/dev/null | grep -c '^GPU ' || true)
  fi
  [[ "${detected}" =~ ^[0-9]+$ ]] || detected=0

  # nvidia-smi lists every physical GPU regardless of CUDA_VISIBLE_DEVICES,
  # so never report more GPUs than the process is allowed to see.
  if [[ -n "${CUDA_VISIBLE_DEVICES:-}" ]]; then
    IFS=',' read -r -a visible_ids <<<"${CUDA_VISIBLE_DEVICES}"
    if ((${#visible_ids[@]} < detected)); then
      detected=${#visible_ids[@]}
    fi
  fi

  # Fall back to 1 when nothing was detected, matching the no-nvidia-smi case.
  ((detected >= 1)) || detected=1
  printf '%s\n' "${detected}"
}

# Assign and export separately so the helper's exit status is not masked.
NUM_GPUS=$(detect_num_gpus)
export NUM_GPUS

# ---- Run identity and filesystem locations ----
EXPERIMENT_NAME='CLPO-V3'
MODEL_PATH='/primus_datasets/primus_data/Qwen3_06B_RlhksV'
TRAIN_DATA='/primus_datasets/primus_data/iclr_gsm8k_Intity/train.parquet'
VAL_DATA='/primus_datasets/primus_data/iclr_gsm8k_Intity/test.parquet'
OUTPUT_DIR='/primus_oss/_checkpoint/0903-Qwen3-0.6B-CLPO-V3'

# ---- Trainer switches and cadence ----
USE_CHAT_TEMPLATE=true
VAL_BEFORE_TRAIN=false
TENSOR_MP_SIZE=1
SAVE_FREQ=50
TEST_FREQ=10
# NOTE(review): the next two are defined but never passed to the trainer
# command in this script.
USE_DYNAMIC_BSZ=true
USE_TQDM=true

# ---- Optimisation schedule and batch sizes ----
TOTAL_EPOCHS=3
TRAIN_BSZ=64
VAL_BSZ=32
LOGPROB_MICRO_BSZ_PER_GPU=8
LR=1e-6

# ---- Advantage estimator and KL settings ----
ESTIMATOR=grpo
USE_KL_LOSS=false
USE_KL_IN_REWARD=false
KL_LOSS_COEF=0.001
# NOTE(review): KL_COEF is defined but never passed to the trainer command.
KL_COEF=0.0

# ---- Rollout sampling and sequence lengths ----
N_SAMPLES=4
GPU_MEM_UTIL=0.5
MAX_PROMPT_LEN=512
MAX_RESPONSE_LEN=1024
# Total sequence budget is prompt length plus response length.
MAX_MODEL_LEN=$((MAX_PROMPT_LEN + MAX_RESPONSE_LEN))

# ---- CLPO accuracy thresholds (forwarded as data.clpo_* overrides) ----
CLPO_HARD_ACC_UPPER=0.3
CLPO_MED_ACC_LOWER=0.3
CLPO_MED_ACC_UPPER=0.7


# Launch the trainer (python3 -m verl.trainer.main_ppo).
#
# The key=value overrides are collected in an array so each one is passed as
# exactly one argument regardless of its contents, and so they can be grouped
# and commented. algorithm.use_kl_in_reward used to be passed twice with the
# same value; it is now passed once.
#
# Any arguments given to this script are appended after the built-in overrides
# (presumably so callers can add or override settings -- confirm against the
# trainer's override handling).
hydra_overrides=(
  # --- algorithm ---
  "algorithm.adv_estimator=${ESTIMATOR}"
  "algorithm.norm_adv_by_std_in_grpo=true"
  "algorithm.use_kl_in_reward=${USE_KL_IN_REWARD}"

  # --- data ---
  "data.train_files=${TRAIN_DATA}"
  "data.val_files=${VAL_DATA}"
  "data.train_batch_size=${TRAIN_BSZ}"
  "data.val_batch_size=${VAL_BSZ}"
  "data.max_prompt_length=${MAX_PROMPT_LEN}"
  "data.max_response_length=${MAX_RESPONSE_LEN}"
  "data.filter_overlong_prompts=true"
  "data.truncation=error"
  "data.use_chat_template=${USE_CHAT_TEMPLATE}"
  "data.clpo_hard_acc_upper=${CLPO_HARD_ACC_UPPER}"
  "data.clpo_medium_acc_lower=${CLPO_MED_ACC_LOWER}"
  "data.clpo_medium_acc_upper=${CLPO_MED_ACC_UPPER}"

  # --- model ---
  "actor_rollout_ref.model.path=${MODEL_PATH}"
  "actor_rollout_ref.model.enable_gradient_checkpointing=true"
  "actor_rollout_ref.model.use_remove_padding=true"

  # --- actor ---
  "actor_rollout_ref.actor.optim.lr=${LR}"
  # The mini-batch size is set to the full train batch size.
  "actor_rollout_ref.actor.ppo_mini_batch_size=${TRAIN_BSZ}"
  # Actor, rollout and ref all share the same per-GPU micro-batch size.
  "actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${LOGPROB_MICRO_BSZ_PER_GPU}"
  "actor_rollout_ref.actor.use_kl_loss=${USE_KL_LOSS}"
  "actor_rollout_ref.actor.kl_loss_coef=${KL_LOSS_COEF}"
  "actor_rollout_ref.actor.kl_loss_type=low_var_kl"
  "actor_rollout_ref.actor.entropy_coeff=0"
  "actor_rollout_ref.actor.fsdp_config.param_offload=false"
  "actor_rollout_ref.actor.fsdp_config.optimizer_offload=false"

  # --- rollout ---
  "actor_rollout_ref.rollout.name=vllm"
  "actor_rollout_ref.rollout.n=${N_SAMPLES}"
  "actor_rollout_ref.rollout.tensor_model_parallel_size=${TENSOR_MP_SIZE}"
  "actor_rollout_ref.rollout.gpu_memory_utilization=${GPU_MEM_UTIL}"
  "actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${LOGPROB_MICRO_BSZ_PER_GPU}"
  "actor_rollout_ref.rollout.max_model_len=${MAX_MODEL_LEN}"
  "actor_rollout_ref.rollout.max_num_batched_tokens=${MAX_MODEL_LEN}"

  # --- reference policy ---
  "actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${LOGPROB_MICRO_BSZ_PER_GPU}"
  "actor_rollout_ref.ref.fsdp_config.param_offload=false"

  # --- trainer ---
  "trainer.critic_warmup=0"
  'trainer.logger=["console", "swanlab"]'
  "trainer.project_name=CLPO-GSM8K"
  "trainer.experiment_name=${EXPERIMENT_NAME}"
  "trainer.n_gpus_per_node=${NUM_GPUS}"
  "trainer.nnodes=1"
  "trainer.save_freq=${SAVE_FREQ}"
  "trainer.test_freq=${TEST_FREQ}"
  "trainer.total_epochs=${TOTAL_EPOCHS}"
  "trainer.val_before_train=${VAL_BEFORE_TRAIN}"
  "trainer.default_local_dir=${OUTPUT_DIR}"

  # --- critic / reward model / task ---
  "critic.enable=false"
  "reward_model.enable=false"
  "reward_model.reward_manager=naive"
  "trainer.task=clpo"
)

python3 -m verl.trainer.main_ppo "${hydra_overrides[@]}" "$@"
