
# Install the pinned Python dependencies. The relative paths below expect the
# script to be started from the directory that contains src/.
pip install vllm==0.5.4
pip install numpy==1.26.4
pip install pandas==2.2.3
pip install peft==0.14.0
# pip pins an exact version with '=='; a single '=' is rejected as an
# invalid requirement.
pip install ray==2.42.1
# torch=2.4.0+cu124
pip install transformers==4.47.1
pip install flash-attn==2.5.9.post1

# Install the bundled verl sources in editable mode. Stop if the directory is
# missing, otherwise "pip3 install -e ." would run against the wrong project.
cd src/verl/ || { echo "error: cannot cd to src/verl/" >&2; exit 1; }
pip3 install -e .

pip install latex2sympy2_extended
# export HYDRA_FULL_ERROR=1

# Build and install fastText. The working directory is src/verl/ here, so
# src/fastText-main is looked up inside it first; when it is not there, use
# the sibling ../fastText-main (src/fastText-main relative to the starting
# directory). NOTE(review): confirm which of the two layouts the repo uses.
if [ -d src/fastText-main ]; then
    fasttext_dir=src/fastText-main
else
    fasttext_dir=../fastText-main
fi
if cd "$fasttext_dir"; then
    python setup.py install
else
    # Do not run setup.py from the wrong directory; keep going without it.
    echo "warning: cannot cd to $fasttext_dir; skipping fastText install" >&2
fi

# Step up one level; the commands that follow run from this directory.
cd ../

# Trace every command from here on so the full training command lines are
# visible in the log.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS
# Directory that holds data/, relative to the current working directory.
# Deliberately not stored in HOME: HOME is normally an exported variable, so
# assigning to it would also change the home directory that python3 and its
# child processes see.
data_root=..

seed=42

# Launch two GRPO training runs through verl.trainer.main_ppo. They share
# every setting except reward_model.langid_score:
#   mGRPO       langid_score=0
#   mGRPO_lang  langid_score=1
# Any arguments given to this script are appended to each command, so they can
# add to or override the settings listed here.
for variant in mGRPO mGRPO_lang
do
    case "$variant" in
        mGRPO)      langid_score=0; name_suffix= ;;
        # The suffix gives this variant its own trainer.experiment_name so the
        # two runs do not share one name.
        mGRPO_lang) langid_score=1; name_suffix=_langid ;;
    esac

    for n in 5
    do
    for model_size in 7
    do
        # 'trainer.logger=[console]' is single-quoted so the shell never
        # treats the brackets as a filename glob.
        python3 -m verl.trainer.main_ppo \
            algorithm.adv_estimator=grpo \
            "data.train_files=${data_root}/data/mgpro_lang.parquet" \
            "data.val_files=${data_root}/data/mgpro_lang_val.parquet" \
            data.train_batch_size=256 \
            data.max_prompt_length=512 \
            data.max_response_length=1024 \
            data.prompt_key=question \
            reward_model.format_reward=1 \
            "reward_model.langid_score=${langid_score}" \
            "actor_rollout_ref.model.path=Qwen/Qwen2.5-${model_size}B-Instruct" \
            actor_rollout_ref.actor.optim.lr=1e-6 \
            actor_rollout_ref.model.use_remove_padding=True \
            actor_rollout_ref.actor.ppo_mini_batch_size=64 \
            actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
            actor_rollout_ref.actor.use_kl_loss=True \
            actor_rollout_ref.actor.kl_loss_coef=0.001 \
            actor_rollout_ref.actor.kl_loss_type=low_var_kl \
            actor_rollout_ref.model.enable_gradient_checkpointing=True \
            actor_rollout_ref.actor.fsdp_config.param_offload=False \
            actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
            actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
            actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
            actor_rollout_ref.rollout.name=vllm \
            actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
            "actor_rollout_ref.rollout.n=${n}" \
            "actor_rollout_ref.rollout.finetune_prompt=${n}" \
            actor_rollout_ref.rollout.control_output=False \
            actor_rollout_ref.rollout.first_no_lang=True \
            actor_rollout_ref.rollout.first_random_prompt=False \
            actor_rollout_ref.rollout.all_random=False \
            actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
            actor_rollout_ref.ref.fsdp_config.param_offload=True \
            algorithm.kl_ctrl.kl_coef=0.001 \
            trainer.critic_warmup=0 \
            'trainer.logger=[console]' \
            trainer.project_name=mGRPO \
            "trainer.experiment_name=qwen2.5_${model_size}B_${seed}_rollout_${n}_first_no_lang${name_suffix}" \
            trainer.n_gpus_per_node=8 \
            trainer.nnodes=1 \
            trainer.save_freq=600 \
            "trainer.seed=${seed}" \
            trainer.test_freq=50 \
            trainer.total_epochs=8 "$@"
    done
    done
done