#!/bin/bash
#
# Launches src/finetuning/RLOO.py with a fixed set of command-line flags and
# the Weights & Biases environment variables exported.
#
# Usage:
#   Run from the directory that contains src/ -- both the script path and the
#   PYTHONPATH entry added below are relative to the current directory.
#
# Inputs:
#   ${HOME}/.wandb-api-key  file whose contents are exported as WANDB_API_KEY
#
# NOTE(review): the "path/to/..." values and the W&B project name below look
# like placeholders; replace them before a real run.

set -euo pipefail

# Append the current directory to PYTHONPATH. The ${VAR:+...} form avoids a
# stray leading ':' (an empty path entry) when PYTHONPATH is unset or empty,
# and keeps the script safe under `set -u`.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PWD}"
printf 'PYTHONPATH: %s\n' "${PYTHONPATH}"

# Load the W&B API key from disk. Assignment and export are split so a failed
# read is not masked by `export`. The key itself is never printed.
wandb_key_file="${HOME}/.wandb-api-key"
if [[ -r "${wandb_key_file}" ]]; then
  WANDB_API_KEY="$(<"${wandb_key_file}")"
  export WANDB_API_KEY
else
  # Keep whatever WANDB_API_KEY the caller already exported instead of
  # clobbering it with an empty string.
  printf 'warning: %s is missing or unreadable; WANDB_API_KEY was not loaded from it\n' \
    "${wandb_key_file}" >&2
fi
export WANDB_PROJECT="rloo_training_wandb_project_name"

# Used both as the --run_name value and as the output sub-directory name.
readonly run_name="rloo_model_training"

# Arguments are kept in an array (in their original order) so each value stays
# a single word regardless of its content and groups can carry comments.
train_args=(
  # Policy model, reward model, and dataset locations.
  --model_name_or_path unsloth/Qwen2.5-0.5B-Instruct
  --reward_model_path unsloth/Qwen2.5-1.5B-Instruct
  --reward_model_adapters_path "path/to/reward/model/adapters"
  --dataset_name "path/to/preprocessed_helpsteer2_dataset"
  --dataset_test_split validation

  # Output location and run naming.
  --output_dir "data/experiments/${run_name}"
  --run_name "${run_name}"

  # Reward adapter selection.
  --reward_sampling_strategy average
  --default_reward_adapter verbosity

  # Batch sizes and sequence lengths.
  --per_device_train_batch_size 2
  --gradient_accumulation_steps 16
  --max_prompt_length 1024
  --max_new_tokens 512

  # Epoch counts and logging / checkpoint / evaluation cadence.
  --num_train_epochs 4
  --num_ppo_epochs 4
  --num_mini_batches 1
  --num_sample_generations 1
  --logging_steps 5
  --save_steps 250
  --eval_steps 50
  --per_device_eval_batch_size 1

  # Numeric precision.
  --torch_dtype bfloat16
  --bf16 True
  --bf16_full_eval True

  # RL objective and optimizer settings.
  --missing_eos_penalty 1.0
  --total_episodes 64000
  --learning_rate 1e-5
  --kl_coef 0.001

  # PEFT / LoRA settings.
  --use_peft
  --lora_r 64
  --lora_alpha 128
  --lora_dropout 0.1
  --lora_task_type CAUSAL_LM
)

python src/finetuning/RLOO.py "${train_args[@]}"