#!/usr/bin/env bash
#
# Sweep launcher for train_ppo.py.
#
# Runs train_ppo.py once for every (seed, epoch, k_slice) combination listed
# below. A failed run does not abort the sweep: the remaining combinations
# still execute, every failure is reported on stderr, and the script exits
# non-zero if at least one run failed.
#
# Usage: bash <this script>   (run from the directory containing train_ppo.py)

# -u: a mistyped variable name is an error instead of an empty path segment.
# -e is deliberately NOT set so one failing configuration cannot stop the sweep.
set -uo pipefail

# Environment exported to every training process.
# NOTE(review): presumably "false" keeps Weights & Biases logging enabled and
# the value 1 selects the second GPU — confirm against the training setup.
export WANDB_DISABLED=false
export CUDA_VISIBLE_DEVICES=1

# Single source of truth for the learning rate: it is passed to the trainer
# and also embedded in the output directory name, so the two cannot drift.
readonly LEARNING_RATE="1.5e-4"

# Sweep grid.
SEEDS=(41 42 43)
EPOCHS=(10 15 20)
k_slices=(15)

#######################################
# Launch one train_ppo.py run.
# Globals:   LEARNING_RATE (read)
# Arguments: $1 - seed, $2 - epoch value, $3 - k_slice value
# Outputs:   whatever train_ppo.py prints
# Returns:   exit status of the python process
#######################################
run_training() {
  local seed=$1
  local epoch=$2
  local k_slice=$3

  # The same checkpoint is passed as both the policy model and the reference model.
  local sft_model="main_results/seed_${seed}/models/SFT_original_qwen3_8b_1.5e4_5_a"
  local output_dir="main_results/seed_${seed}/models/tofu/qwen/ppo_400_400_${k_slice}_1_${epoch}epoch_lr_${LEARNING_RATE}_new"

  local -a args=(
    --seed "${seed}"
    --epoch "${epoch}"
    --k_slice "${k_slice}"
    --learning_rate "${LEARNING_RATE}"
    --per_device_train_batch_size 32
    --dataset "TOFU"
    --dataset_path "data/a"
    --model_family "qwen"
    --base_model_path "Qwen/Qwen3-8B"
    --policy_model_path "${sft_model}"
    --ref_model_path "${sft_model}"
    --reward_base_model_path "Qwen/Qwen3-0.6B"
    --reward_model_path "reward_model/TOFU/qwen/classifier_15epocs"
    --output_dir "${output_dir}"
    --response_length 53
    --class_num 10
    --forget_label 9
  )

  python train_ppo.py "${args[@]}"
}

#######################################
# Iterate over the sweep grid and run every combination.
# Globals:   SEEDS, EPOCHS, k_slices (read)
# Arguments: none
# Outputs:   progress banner on stdout, failure reports on stderr
# Returns:   0 if every run succeeded, 1 otherwise
#######################################
main() {
  local seed epoch k_slice status
  local -a failed_runs=()

  for seed in "${SEEDS[@]}"; do
    for epoch in "${EPOCHS[@]}"; do
      for k_slice in "${k_slices[@]}"; do
        echo "=== Running: seed=${seed}, epoch=${epoch}, k_slice=${k_slice} ==="
        status=0
        run_training "${seed}" "${epoch}" "${k_slice}" || status=$?
        if ((status != 0)); then
          # Remember the failure but keep going with the remaining combinations.
          failed_runs+=("seed=${seed} epoch=${epoch} k_slice=${k_slice} (exit ${status})")
          printf 'warning: run failed with exit status %d: seed=%s epoch=%s k_slice=%s\n' \
            "${status}" "${seed}" "${epoch}" "${k_slice}" >&2
        fi
      done
    done
  done

  if ((${#failed_runs[@]} > 0)); then
    printf 'error: %d run(s) failed:\n' "${#failed_runs[@]}" >&2
    printf '  %s\n' "${failed_runs[@]}" >&2
    return 1
  fi
  return 0
}

# main is the last command, so its return status becomes the script's exit status.
main "$@"