#!/usr/bin/env bash
# Launcher for a recipe.dapo.main_dapo run (see the python3 invocation at the
# bottom of this file). This first section prepares the process environment:
# shell tracing, the Hugging Face endpoint, and the conda environment.
#
# Strict mode is intentionally left disabled, as in the original script:
# set -xeuo pipefail
set -x
export HF_ENDPOINT=https://hf-mirror.com
# export LD_PRELOAD=/lib/x86_64-linux-gnu/libstdc++.so.6

# NOTE(security): WANDB_API_KEY must be supplied by the caller's environment
# (it is inherited automatically when already exported). A key must never be
# written into this file: with `set -x` active, an `export WANDB_API_KEY=...`
# line is echoed verbatim into the trace output and any captured log. Any key
# that was previously committed here should be treated as leaked and rotated.

# source /mnt/petrelfs/yanjianhao/envs_12.4.sh
# eval "$(conda shell.bash hook)"
# Load conda's shell integration from the shared miniconda install, then
# activate the environment used for this run.
eval "$(/mnt/shared-storage-user/p1-shared/wangfuting/miniconda3/bin/conda shell.bash hook)"
conda activate verl041-test


# Exported so that processes started later by this script inherit it.
export VLLM_ATTENTION_BACKEND=XFORMERS
project_name="verl-qwen3-8b-new"

# Advantage estimator.
adv_estimator="grpo"

# KL settings: both flags are False and both coefficients are zero here.
use_kl_in_reward="False"
kl_coef="0.0"
use_kl_loss="False"
kl_loss_coef="0.0"

# Clip ratios (lower and upper are equal in this configuration).
clip_ratio_low="0.2"
clip_ratio_high="0.2"

# Sequence-length limits, written out as plain token counts.
max_prompt_length=1024        # 1 * 1024
max_response_length=8192      # 8 * 1024
enable_overlong_buffer="False"
overlong_buffer_len=4096      # 4 * 1024
overlong_penalty_factor="1.0"

loss_agg_mode='token-mean'

# Group filtering and batch sizes.
enable_filter_groups="False"
filter_groups_metric="seq_reward"
max_num_gen_batches=10
train_prompt_bsz=128
# Generation batch size equals the training prompt batch size (factor 1).
gen_prompt_bsz=${train_prompt_bsz}
n_resp_per_prompt=8
train_prompt_mini_bsz=8
# Not referenced anywhere else in the visible part of this script.
random_budget="null"

# Ray
# Each of these keeps a non-empty value already present in the environment and
# otherwise falls back to the default given here. None of the three is
# referenced again in the visible part of this script.
: "${RAY_ADDRESS:=http://localhost:8265}"
: "${WORKING_DIR:=${PWD}}"
: "${RUNTIME_ENV:=${WORKING_DIR}/verl/trainer/runtime_env.yaml}"
NNODES=1

# Paths (earlier settings, kept for reference)
# RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
# MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B-ds-16k-tts"}
# CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
# CKPT_DIR=/mnt/petrelfs/yanjianhao/RL/tts_rl/ckpts/tts-test/TTS_ALL_MATH_BASE_TRIPLE-BSZ_LUFFY
# TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
# TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
# export MODEL_PATH=/mnt/petrelfs/share_data/yanjianhao/Qwen2.5-Math-1.5B-ds-16k
# export MODEL_PATH=/mnt/petrelfs/share_data/yanjianhao/Qwen2.5-Math-7B-ds-16k-tts

# Model selection: exactly one MODEL_PATH is active; the commented-out lines
# are alternatives to switch to by hand.
# export MODEL_PATH=/mnt/shared-storage-user/p1-shared/wangfuting/shared/models/verl-041-result/verl-qwen3-4b-base/DAPO-Qwen3-4B-Base-deepscaler-40k-add1k-new-60steps-continue-overlong-filter/best_model/actor/huggingface
# export MODEL_PATH=/mnt/shared-storage-user/p1-shared/wangfuting/shared/models/Qwen3-4B-Base
# export MODEL_PATH=/mnt/shared-storage-user/p1-shared/Qwen/Qwen3-30B-A3B
MODEL_PATH="/mnt/shared-storage-user/p1-shared/Qwen/Qwen3-4B"
export MODEL_PATH
# export MODEL_PATH=/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/LlamaFactory/trainer_output/checkpoint-162
# export MODEL_PATH=/mnt/shared-storage-user/p1-shared/wangfuting/shared/models/verl-041-result/verl-qwen3-4b-oct/stage1-additive-length-penalty-390step-stage2-grpo/best_model/actor/huggingface
# export MODEL_PATH=/mnt/shared-storage-user/p1-shared/wangfuting/shared/models/dapo-add1k-max9k-redo/global_step_110/actor/huggingface
# export MODEL_PATH=/mnt/shared-storage-user/p1-shared/wangfuting/shared/models/verl-041-result/verl-qwen3-4b-oct/baseline-8k/best_model/actor/huggingface
# export MODEL_PATH=/mnt/shared-storage-user/p1-shared/wangfuting/shared/models/verl-041-result/verl-qwen3-4b-oct/add1k-max9k-redo-110step-stage2-grpo/best_model/actor/huggingface
# export MODEL_PATH=/mnt/shared-storage-user/p1-shared/wangfuting/shared/models/verl-041-result/verl-qwen3-4b-oct/add1k-remove-upper-refined/global_step_100/actor/huggingface
# export MODEL_PATH=/mnt/shared-storage-user/p1-shared/wangfuting/shared/models/verl-041-result/verl-qwen3-4b-oct/add1k50steps-stage2-overlongfilter/best_model/actor/huggingface

# Experiment name and W&B mode, both exported to child processes.
EXP_NAME='qwen3-4b-dapo-math-non-think'
WANDB_MODE='offline'
export EXP_NAME WANDB_MODE

# Data files: one active training file and one active validation file; the
# commented-out TEST_FILE lines are alternatives.
TRAIN_FILE="/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/data/qwen3-4b-s1-sampled1k.parquet"
# TEST_FILE=${DATA_DIR}/valid.v4.parquet
# TEST_FILE="/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/data/qwen3-4b-s1.parquet"
TEST_FILE='/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/data/dapo-math-17k_qwen3_polaris.parquet'
# TEST_FILE="/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/data/countdown/train.parquet"
# TEST_FILE="/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/data/l1/deepscaler_qwen3_polaris_sampled2k.parquet"
# TEST_FILE="/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/data/l1/deepscaler-add1k_8k_max9k.parquet"
# TEST_FILE="/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/data/deepmath-5k_qwen3.parquet"
# Algorithm: sampling settings
temperature="1.0"
top_p="1.0"
top_k="-1" # 0 for HF rollout, -1 for vLLM rollout
val_top_p="1.0"

# Performance Related Parameter
sp_size=1
use_dynamic_bsz="True"
# Token budgets are integer multiples of one full sequence
# (max_prompt_length + max_response_length): 6x for the actor, 9x for the
# log-prob passes and for the rollout's batched-token limit.
actor_ppo_max_token_len=$(( (max_prompt_length + max_response_length) * 6 ))
infer_ppo_max_token_len=$(( (max_prompt_length + max_response_length) * 9 ))
max_num_batched_tokens=${infer_ppo_max_token_len}
max_model_len=17408 # 17 * 1024
offload="True"
gen_tp=1

# Exported as an empty string for the processes started below.
# NOTE(review): presumably this neutralises an inherited ROCR_VISIBLE_DEVICES
# value on a CUDA machine — confirm the reason with the author.
export ROCR_VISIBLE_DEVICES=""

ROOT=/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/verl041/verl
# PYTHONPATH is replaced by ROOT alone; any inherited value is dropped rather
# than appended.
export PYTHONPATH="${ROOT}"

# The module path (recipe.dapo.main_dapo) and the relative log path below are
# both resolved from ROOT, so refuse to continue from any other directory.
cd -- "${ROOT}" || {
  printf 'error: cannot cd to %s\n' "${ROOT}" >&2
  exit 1
}
echo "${PYTHONPATH}"

# Validation dumps and checkpoints share one per-experiment directory.
output_dir="/mnt/shared-storage-user/p1-shared/wangfuting/shared/models/verl-041-result/${project_name}/${EXP_NAME}"

# tee cannot create missing parent directories, so make sure logs/ exists.
log_dir="logs"
mkdir -p -- "${log_dir}" || {
  printf 'error: cannot create log directory %s/%s\n' "${ROOT}" "${log_dir}" >&2
  exit 1
}
log_file="${log_dir}/${EXP_NAME}.log"

# Hydra-style overrides, in the same order as the original command line.
# Values containing [...] are single-quoted so the shell can never treat them
# as a glob pattern; the argument text received by Python is unchanged.
overrides=(
  # Data and validation
  "data.train_files=${TRAIN_FILE}"
  "data.val_files=${TEST_FILE}"
  "+trainer.val_type=truncated"
  "+data.enable_thinking=False"
  "data.val_batch_size=512"
  '+trainer.val_budgets=[16384]'
  "data.prompt_key=prompt"
  "data.truncation=error"
  "data.max_prompt_length=${max_prompt_length}"
  "data.max_response_length=${max_response_length}"
  "data.gen_batch_size=${gen_prompt_bsz}"
  "data.filter_overlong_prompts=True"
  "data.train_batch_size=${train_prompt_bsz}"
  "actor_rollout_ref.rollout.n=${n_resp_per_prompt}"

  # Algorithm
  "algorithm.adv_estimator=${adv_estimator}"
  "algorithm.use_kl_in_reward=${use_kl_in_reward}"
  "algorithm.kl_ctrl.kl_coef=${kl_coef}"
  "actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}"
  "actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}"
  "actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}"
  "actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}"
  "actor_rollout_ref.actor.clip_ratio_c=10.0"
  "algorithm.filter_groups.enable=${enable_filter_groups}"
  "algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches}"
  "algorithm.filter_groups.metric=${filter_groups_metric}"

  # Batching and token budgets
  "actor_rollout_ref.model.use_remove_padding=True"
  "actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}"
  "actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}"
  "actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}"

  # Model and optimizer
  "actor_rollout_ref.model.path=${MODEL_PATH}"
  "actor_rollout_ref.model.enable_gradient_checkpointing=True"
  "actor_rollout_ref.actor.optim.lr=1e-6"
  "actor_rollout_ref.actor.optim.lr_warmup_steps=10"
  "actor_rollout_ref.actor.optim.weight_decay=0.1"
  "actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}"
  "actor_rollout_ref.actor.fsdp_config.param_offload=${offload}"
  "actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload}"
  "actor_rollout_ref.actor.entropy_coeff=0"
  "actor_rollout_ref.actor.grad_clip=1.0"
  "actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode}"
  "actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size}"

  # Rollout engine and sampling
  "actor_rollout_ref.rollout.gpu_memory_utilization=0.90"
  "actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}"
  "actor_rollout_ref.rollout.enable_chunked_prefill=True"
  "actor_rollout_ref.rollout.max_num_batched_tokens=${max_num_batched_tokens}"
  "actor_rollout_ref.rollout.temperature=${temperature}"
  "actor_rollout_ref.rollout.max_model_len=${max_model_len}"
  "actor_rollout_ref.rollout.top_p=${top_p}"
  "actor_rollout_ref.rollout.top_k=${top_k}"
  "actor_rollout_ref.rollout.val_kwargs.temperature=1.0"
  "actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p}"
  "actor_rollout_ref.rollout.val_kwargs.top_k=${top_k}"
  "actor_rollout_ref.rollout.val_kwargs.do_sample=True"
  "actor_rollout_ref.rollout.val_kwargs.n=1"

  # Reference model and FSDP
  "actor_rollout_ref.ref.fsdp_config.param_offload=${offload}"
  "actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size}"
  "actor_rollout_ref.actor.fsdp_config.fsdp_size=-1"
  "trainer.max_actor_ckpt_to_keep=2"

  # Reward
  "reward_model.reward_manager=dapo"
  "reward_model.overlong_buffer.enable=${enable_overlong_buffer}"
  "reward_model.overlong_buffer.len=${overlong_buffer_len}"
  "reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor}"

  # Trainer
  'trainer.logger=[console,wandb]'
  "trainer.project_name=${project_name}"
  "trainer.experiment_name=${EXP_NAME}"
  "trainer.n_gpus_per_node=4"
  "trainer.nnodes=${NNODES}"
  "trainer.val_before_train=True"
  "trainer.val_only=True"
  "custom_reward_function.path=/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/verl/recipe/reward_ours/math_verify_reward.py"
  "trainer.validation_data_dir=${output_dir}/valid"
  "trainer.default_local_dir=${output_dir}"
  "custom_reward_function.name=compute_score_no_add_think"
  "trainer.save_freq=100"
  "trainer.test_freq=10"
  "trainer.total_epochs=3"
  "trainer.resume_mode=auto"
)

# With pipefail the pipeline (and therefore this script, since it is the last
# command) reports the trainer's failure instead of tee's exit status.
set -o pipefail
python3 -m recipe.dapo.main_dapo "${overrides[@]}" | tee -a "${log_file}"