#!/usr/bin/env bash
#
# Launch a PPO run of agent_r1.src.main_agent, initialising both the actor and
# the critic from the most recently modified SFT checkpoint found under
# ./model_save/cold_start_model/1_5B/ (directories named global_step_*).
#
# Usage: bash <this-script> [extra key=value overrides ...]
#   Every argument given to this script is appended, unchanged, after the
#   overrides listed below.
#
# Exits with status 1 (before starting Python) if no checkpoint directory is
# found; otherwise the exit status is that of the python3 process.
set -euo pipefail

# --- environment ------------------------------------------------------------
base_model="Qwen/Qwen2.5-1.5B-Instruct"
export BASE_MODEL="$base_model"
export VLLM_ATTENTION_BACKEND=XFORMERS
export PROJECT_NAME="compiler_autotuning_qwen"
# Experiment name uses only the part of the model id after the last '/'.
export EXPERIMENT_NAME="ppo-after-sft-${base_model##*/}"
export HYDRA_FULL_ERROR=1
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid that makes
# kernel launches synchronous - confirm it is wanted for real training runs.
export CUDA_LAUNCH_BLOCKING=1

# Two GPUs are exposed; trainer.n_gpus_per_node and
# rollout.tensor_model_parallel_size below are both set to 2 to match.
export CUDA_VISIBLE_DEVICES=0,1

# --- locate the newest SFT checkpoint ---------------------------------------
sft_output_dir="./model_save/cold_start_model/1_5B/"

# Pick the global_step_* directory with the newest modification time.
# A glob plus -nt is used instead of parsing `ls -t` output so that unusual
# path characters cannot break the result.
latest_checkpoint=""
for candidate in "${sft_output_dir%/}"/global_step_*; do
  # Skips the literal pattern when nothing matches, and any non-directory.
  [[ -d "$candidate" ]] || continue
  if [[ -z "$latest_checkpoint" || "$candidate" -nt "$latest_checkpoint" ]]; then
    latest_checkpoint="$candidate"
  fi
done

# Fail early with a clear message instead of handing an empty model path to
# the trainer.
if [[ -z "$latest_checkpoint" ]]; then
  printf 'error: no global_step_* checkpoint directory found in %s\n' \
    "$sft_output_dir" >&2
  exit 1
fi

# --- validation sets --------------------------------------------------------
# One parquet file per suite: ./dataset/rl/validation_val-<suite>.parquet,
# joined with commas into a single bracketed list value.
val_files=""
for suite in cbench blas chstone mibench npb opencv tensorflow; do
  val_files+="${val_files:+,}./dataset/rl/validation_val-${suite}.parquet"
done

# --- trainer overrides ------------------------------------------------------
# Kept in an array so each override stays exactly one argument and the groups
# can be commented. Order matches the original command line.
overrides=(
  # data
  data.train_files=./dataset/rl/train.parquet
  "data.val_files=[${val_files}]"
  data.train_batch_size=64
  data.max_prompt_length=4096
  data.max_response_length=4096
  data.max_start_length=4096
  data.max_tool_response_length=4096

  # actor / rollout / reference model
  "actor_rollout_ref.model.path=${latest_checkpoint}"
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.model.use_remove_padding=True
  actor_rollout_ref.actor.ppo_mini_batch_size=4
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
  actor_rollout_ref.model.enable_gradient_checkpointing=True
  actor_rollout_ref.actor.fsdp_config.param_offload=False
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2
  actor_rollout_ref.rollout.tensor_model_parallel_size=2
  actor_rollout_ref.rollout.name=vllm
  actor_rollout_ref.rollout.gpu_memory_utilization=0.8
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2
  actor_rollout_ref.ref.fsdp_config.param_offload=True

  # critic (starts from the same checkpoint as the actor)
  critic.optim.lr=1e-6
  critic.model.use_remove_padding=True
  "critic.model.path=${latest_checkpoint}"
  critic.model.enable_gradient_checkpointing=True
  critic.ppo_micro_batch_size_per_gpu=2
  critic.model.fsdp_config.param_offload=False
  critic.model.fsdp_config.optimizer_offload=False

  # algorithm
  algorithm.adv_estimator=gae
  algorithm.kl_ctrl.kl_coef=0.001

  # trainer
  trainer.critic_warmup=0
  # Quoted so the shell never treats the brackets as a glob pattern.
  'trainer.logger=[console,wandb]'
  "trainer.project_name=${PROJECT_NAME}"
  "trainer.experiment_name=${EXPERIMENT_NAME}"
  trainer.n_gpus_per_node=2
  trainer.nnodes=1
  trainer.save_freq=-1
  trainer.test_freq=1
  trainer.total_epochs=1
  trainer.val_before_train=True
  trainer.total_training_steps=40

  # tool environment
  tool.env=optimizer
)

# "$@" (quoted) forwards caller-supplied overrides without re-splitting or
# glob-expanding them.
python3 -m agent_r1.src.main_agent "${overrides[@]}" "$@"