# Launches a single-node verl PPO-trainer run (GRPO advantage estimator) for
# Qwen/Qwen3-1.7B with vLLM rollouts and the explore_exploit_rollout_policy
# rollout policy.
#
# Environment:
#   WANDB_PROJECT  forwarded as trainer.project_name (a warning is printed
#                  when it is unset or empty; the run is still attempted).
#
# Exit status: the exit status of the training process, so callers and job
# schedulers can detect a failed run.

echo "Job started at $(date)"

# Validation and training parquet files, written as list literals that are
# passed verbatim to data.val_files / data.train_files.
TEST_FILES="['data/test_amc/test.parquet', 'data/test_aime2024/test.parquet', 'data/test_aime2025/test.parquet', 'data/test_brumo2025/test.parquet', 'data/test_hmmt_feb_2025/test.parquet', 'data/test_deepcoder_50.parquet']"

TRAIN_FILES="['data/train_DAPO-Math-17k.parquet', 'data/train_deepcoder.parquet']"

# Used as the experiment name and as the directory name under experiments/.
# NOTE(review): the "n12b4_r2" suffix presumably mirrors rollout.n=12,
# base_rollouts_per_prompt=4 and dynamic_rounds=2 below - keep them in sync.
EXPERIMENT_NAME="explore_exploit_rollout_policy_n12b4_r2"
GPU_NUM=4

# Ulysses sequence-parallel size shared by the actor and the reference model.
# The actor override used to be passed with an empty value
# ("...actor.ulysses_sequence_parallel_size="), i.e. an empty string instead of
# an integer. It now uses the same value the reference model was already
# given (2).
# NOTE(review): confirm 2 is the intended actor value.
ULYSSES_SP_SIZE=2

# An unset WANDB_PROJECT results in an empty trainer.project_name override;
# surface that instead of passing it silently.
if [ -z "${WANDB_PROJECT:-}" ]; then
    echo "Warning: WANDB_PROJECT is unset or empty; trainer.project_name will be empty" >&2
fi

# Run the training. Output is not redirected here; it goes to the caller's
# stdout/stderr.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 forces synchronous CUDA kernel launches,
# which is a debugging aid and presumably slows training - confirm it is
# wanted for real runs.
CUDA_LAUNCH_BLOCKING=1 python -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files="$TRAIN_FILES" \
    data.val_files="$TEST_FILES" \
    data.train_batch_size=16 \
    data.max_prompt_length=8192 \
    data.max_response_length=16384 \
    data.truncation=left \
    data.enable_thinking=True \
    actor_rollout_ref.model.path=Qwen/Qwen3-1.7B \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.optim.lr=2e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=16 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.fsdp_config.param_offload=True \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.actor.fsdp_config.offload_policy=False \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size="$ULYSSES_SP_SIZE" \
    actor_rollout_ref.actor.clip_ratio_low=0.2 \
    actor_rollout_ref.actor.clip_ratio_high=0.28 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.75 \
    actor_rollout_ref.rollout.n=12 \
    actor_rollout_ref.rollout.enforce_eager=False \
    actor_rollout_ref.rollout.free_cache_engine=False \
    actor_rollout_ref.rollout.max_num_batched_tokens=81920 \
    actor_rollout_ref.rollout.temperature=1 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    actor_rollout_ref.ref.ulysses_sequence_parallel_size="$ULYSSES_SP_SIZE" \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger='[console,wandb]' \
    trainer.project_name="${WANDB_PROJECT:-}" \
    trainer.experiment_name="$EXPERIMENT_NAME" \
    trainer.n_gpus_per_node="$GPU_NUM" \
    trainer.nnodes=1 \
    trainer.save_freq=30 \
    trainer.test_freq=30 \
    trainer.total_epochs=1 \
    trainer.default_local_dir="experiments/$EXPERIMENT_NAME" \
    trainer.validation_data_dir="experiments/$EXPERIMENT_NAME/validation_data" \
    trainer.log_val_generations=10 \
    trainer.max_actor_ckpt_to_keep=5 \
    trainer.max_critic_ckpt_to_keep=5 \
    trainer.verbose=False \
    actor_rollout_ref.rollout.val_kwargs.n=4 \
    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
    actor_rollout_ref.rollout.val_kwargs.temperature=1 \
    rollout_policy.policy_name=explore_exploit_rollout_policy \
    rollout_policy.policy_kwargs.base_rollouts_per_prompt=4 \
    rollout_policy.policy_kwargs.dynamic_rounds=2 \
    rollout_policy.policy_kwargs.icl_enabled=True \
    rollout_policy.policy_kwargs.icl_samples_per_prompt=2 \
    rollout_policy.policy_kwargs.train_similar_questions=data/icl_corpus/train_merged_similar_questions.json \
    rollout_policy.policy_kwargs.icl_corpus_path=data/icl_corpus/train_merged_icl_corpus_qwen3_1.7b.json \
    rollout_policy.policy_kwargs.advantage_shaping_enabled=True \
    rollout_policy.policy_kwargs.novelty_strength=2.5 \
    rollout_policy.policy_kwargs.novelty_clamp=0.5
# Capture the trainer's status immediately, before any other command
# overwrites $?.
train_status=$?

echo "Job finished at $(date)"

# Previously the script always exited with the status of the final echo (0),
# hiding training failures from the caller.
if [ "$train_status" -ne 0 ]; then
    echo "Training exited with status $train_status" >&2
fi
exit "$train_status"