# Environment setup for the cmh_train_zero_math_fangan8.py training command
# that is kept (commented out) further down in this file.
# Only PYTHONPATH and HF_HUB_OFFLINE are actually exported here; every other
# line is a disabled option kept for reference.

# Optional: put the Python interpreter's LIBDIR on the dynamic loader path.
# export LD_LIBRARY_PATH="$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))")${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# Append the project's oat directory to PYTHONPATH.
# The ${VAR:+...} guard emits the existing value plus the ':' separator only
# when PYTHONPATH is already non-empty. A plain "$PYTHONPATH:/path" would
# otherwise produce a leading ':' - an empty entry, which Python resolves to
# the current working directory - and silently make imports depend on the cwd.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}/home/bmm-system/data/private/chenminghan/project/understand-r1-zero/oat"
# export NCCL_DEBUG=INFO

# Keep huggingface_hub offline: models/datasets must already be available
# locally (in the cache or as local paths).
export HF_HUB_OFFLINE=1

# Weights & Biases mode: enable at most one of the following.
# export WANDB_MODE=offline
# export WANDB_MODE=disabled
# export WANDB_MODE=online
# Never commit a real API key. Provide it through the caller's environment or
# `wandb login` instead.
# export WANDB_API_KEY="<your-wandb-api-key>"
# export WANDB_DISABLE_SERVICE=true    # force a direct HTTP connection
# python /home/bmm-system/data/private/chenminghan/project/understand-r1-zero/cmh_train_zero_math_fangan8.py \
#     --critic_type drgrpo \
#     --gpus 8 \
#     --enable_prefix_caching \
#     --collocate \
#     --vllm_sleep \
#     --vllm_gpu_ratio 0.35 \
#     --gradient-checkpointing \
#     --flash-attn \
#     --bf16 \
#     --rnd-seed \
#     --learning_rate 0.000001 \
#     --lr_scheduler constant \
#     --num_ppo_epochs 1 \
#     --beta 0 \
#     --oracle_type reward \
#     --oracle math \
#     --pretrain Qwen/Qwen2.5-Math-7B \
#     --prompt_template qwen_math \
#     --verifier_version math_verify \
#     --zero-stage 2 \
#     --ref_offload \
#     --prompt_data ./datasets/train/math_lvl3to5_8k \
#     --train_split train \
#     --input_key problem \
#     --output_key answer \
#     --max-train 9999999 \
#     --num_prompt_epoch 20 \
#     --prompt_max_length 1024 \
#     --num_samples 8 \
#     --temperature 1 \
#     --top_p 1 \
#     --generate_max_length 3000 \
#     --save_steps 5 \
#     --train_batch_size 128 \
#     --train_batch_size_per_device 1 \
#     --mini_train_batch_size_per_device 1 \
#     --rollout_batch_size 128 \
#     --rollout_batch_size_per_device 16 \
#     --pi_buffer_maxlen_per_device 128 \
#     --eval_batch_size 200 \
#     --eval_steps 16 \
#     --eval_temperature 0 \
#     --eval_generate_max_length 3000 \
#     --eval_data ./datasets/evaluation_suite \
#     --eval_input_key input \
#     --use-wb \
#     --wb_project oat-zero \
#     --wb-run-name oat7b-fangan8-exp1



