#!/usr/bin/env bash
#
# Environment for the verl training launch (python3 -m verl.trainer.main_ppo)
# that follows in this script.
#
# Edit before running:
#   BASE_MODEL  - placeholder value; replace it with the checkpoint to train.
#   CUDA_VISIBLE_DEVICES - must expose as many GPUs as the launch requests
#                          through trainer.n_gpus_per_node (8).

# GPUs made visible to the training processes.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Project name handed to the trainer as trainer.project_name. It is only
# expanded inside this script, so it does not need to be exported.
WAND_PROJECT='Agent-RL'

# Checkpoint passed to the trainer as actor_rollout_ref.model.path.
export BASE_MODEL="your_qwen_checkpoints"

# Run name with a launch timestamp (YYYY-MM-DD-HHMMSS) appended; reused for the
# checkpoint directory, the experiment name and the log file name.
export EXPERIMENT_NAME="train_thinker_qwen_7B_mix_Distill$(date "+%Y-%m-%d-%H%M%S")"

# Log level read by verl (presumably by its PPO trainer modules - confirm).
export VERL_PPO_LOGGING_LEVEL=INFO
# Ask Hydra for complete stack traces instead of its shortened error output.
export HYDRA_FULL_ERROR=1
# set -x  # uncomment to trace every command while debugging
# Attention backend selected for vLLM.
export VLLM_ATTENTION_BACKEND=XFORMERS
# Turn off Hugging Face tokenizers' internal parallelism.
export TOKENIZERS_PARALLELISM="false"
# PyTorch CUDA caching-allocator setting: enable expandable segments.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Launch verl.trainer.main_ppo with Hydra-style command-line overrides and
# mirror its combined stdout/stderr into ./training_log/<EXPERIMENT_NAME>.log.
#
# Reads: BASE_MODEL, WAND_PROJECT, EXPERIMENT_NAME (set earlier in this script).
#
# Override groups, in order of appearance:
#   data.*                      - train/val parquet files, batch sizes, length
#                                 limits, shuffling and seed.
#   algorithm.*                 - GRPO advantage estimator plus the adv_* /
#                                 kl_ctrl settings.
#   actor_rollout_ref.model.*   - checkpoint path, chat template, remove-padding
#                                 and gradient checkpointing.
#   actor_rollout_ref.actor.*   - optimizer, PPO batch sizes, KL loss and FSDP
#                                 offload flags.
#   actor_rollout_ref.rollout.* - vllm_multi_turn_via_env rollout, 8 samples per
#                                 prompt, up to 10 turns, thinker/actor lengths,
#                                 and the fixed actor endpoint on localhost:8007.
#   actor_rollout_ref.ref.*     - reference-model log-prob batch size / offload.
#   trainer.*                   - logging backend, checkpoint directory, node /
#                                 GPU counts, save and test frequency, epochs.
#   reward_model.*              - reward manager selection.
# A leading '+' is Hydra syntax for adding a key that is absent from the base
# config.
#
# The logger list is passed as the single quoted word trainer.logger=[swanlab]
# (the same word the shell produced before) so the brackets are never treated
# as a pathname pattern.
#
# NOTE(review): actor.ppo_mini_batch_size (128) exceeds data.train_batch_size
# (32), and trainer.resume_from_path is given a boolean - confirm both are
# accepted by the trainer version in use.

# tee does not create missing directories; without this the run would proceed
# with no log file on disk.
mkdir -p ./training_log || {
    printf 'error: cannot create log directory ./training_log\n' >&2
    exit 1
}

# Make the pipeline report the trainer's exit status instead of tee's. The
# option is probed in a subshell first because some /bin/sh implementations
# reject it and would otherwise abort the script.
if (set -o pipefail) 2>/dev/null; then
    set -o pipefail
fi

PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
    data.train_files="['./data/alfworld/train_subtasks.parquet', './data/sciworld/train_subtasks.parquet']" \
    data.val_files="./data/sciworld/test.parquet" \
    data.train_batch_size=32 \
    data.val_batch_size=100 \
    data.max_prompt_length=128 \
    data.max_response_length=3500 \
    data.return_raw_chat=True \
    data.shuffle=True \
    +data.seed=1 \
    algorithm.adv_estimator=grpo \
    +algorithm.adv_revise="rm_neg_format" \
    +algorithm.adv_format_reward_coef=0.0 \
    +algorithm.adv_format_reward_min=-10.0 \
    +algorithm.adv_other_reward_coef=0.5 \
    +algorithm.adv_other_reward_min=-10.0 \
    actor_rollout_ref.model.path="$BASE_MODEL" \
    +actor_rollout_ref.model.chat_template="qwen" \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    +actor_rollout_ref.actor.optim.lr_schedule="constant" \
    actor_rollout_ref.actor.ppo_mini_batch_size=128 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
    actor_rollout_ref.actor.use_dynamic_bsz=False \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.01 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0.000 \
    actor_rollout_ref.actor.fsdp_config.param_offload=True \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
    +actor_rollout_ref.actor.fixed_value_loss=-1 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name=vllm_multi_turn_via_env \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
    actor_rollout_ref.rollout.n=8 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    "trainer.logger=[swanlab]" \
    trainer.default_hdfs_dir=null \
    trainer.default_local_dir="./checkpoints/$EXPERIMENT_NAME" \
    trainer.project_name="$WAND_PROJECT" \
    trainer.experiment_name="$EXPERIMENT_NAME" \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=100 \
    trainer.test_freq=-1 \
    trainer.total_epochs=3 \
    trainer.balance_batch=False \
    trainer.resume_from_path=True \
    +actor_rollout_ref.rollout.max_turns=10 \
    +actor_rollout_ref.rollout.environment.actor_length=128 \
    +actor_rollout_ref.rollout.environment.thinker_length=512 \
    +actor_rollout_ref.rollout.train_actor_or_thinker="thinker" \
    +actor_rollout_ref.rollout.fixed_actor_api="['http://localhost:8007']" \
    reward_model.reward_manager=prime \
    +trainer.val_before_train=False \
    2>&1 | tee "./training_log/${EXPERIMENT_NAME}.log"