#!/usr/bin/env bash
#
# Sweep launcher for train_ppo.py.
#
# Runs one `python train_ppo.py` job per (seed, epoch, k_slice) combination,
# sequentially, with the TOFU dataset settings and Llama-2-7b-chat model
# paths given below.
#
# Usage: bash <this script>
#   All paths are relative, so run it from the directory that contains
#   train_ppo.py, data/, reward_model/ and main_results/.
#
# Exit status: 0 if every run succeeded, 1 if at least one run failed.
# A failed run does not stop the sweep; failures are collected and listed on
# stderr once all combinations have been attempted.

# Arrays are used below, so this must run under bash. -u turns a mistyped
# variable name into an error instead of an empty path segment.
set -u

# Environment passed to the training process.
# NOTE(review): WANDB_DISABLED=false presumably leaves Weights & Biases
# logging enabled -- confirm against how train_ppo.py reads it.
export WANDB_DISABLED=false
export CUDA_VISIBLE_DEVICES=0

# Single source of truth for the learning rate: it is both passed to the
# trainer and embedded in the output directory name, so the two cannot drift.
readonly LEARNING_RATE="1.5e-4"

# Sweep axes.
SEEDS=(41 42 43)
EPOCHS=(20)
k_slices=(15)

# Human-readable descriptions of the runs that exited non-zero.
failed_runs=()

for seed in "${SEEDS[@]}"; do
  # The policy and the reference model both start from the same per-seed
  # SFT checkpoint.
  sft_model_path="main_results/seed_${seed}/models/SFT_original_llama2_7b_1.5e4_5_a"

  for epoch in "${EPOCHS[@]}"; do
    for k_slice in "${k_slices[@]}"; do
      run_tag="seed=${seed}, epoch=${epoch}, k_slice=${k_slice}"
      echo "=== Running: ${run_tag} ==="

      # Build the command line as an array so every argument stays a single
      # word and no fragile backslash continuations are needed.
      train_args=(
        --seed "${seed}"
        --epoch "${epoch}"
        --k_slice "${k_slice}"
        --learning_rate "${LEARNING_RATE}"
        --per_device_train_batch_size 32
        --dataset "TOFU"
        --dataset_path "data/a"
        --model_family "llama"
        --base_model_path "meta-llama/Llama-2-7b-chat-hf"
        --policy_model_path "${sft_model_path}"
        --ref_model_path "${sft_model_path}"
        --reward_base_model_path "meta-llama/Llama-2-7b-chat-hf"
        --reward_model_path "reward_model/TOFU/llama/classifier"
        --output_dir "main_results/seed_${seed}/models/tofu/llama/ppo_400_400_${k_slice}_1_${epoch}epoch_lr_${LEARNING_RATE}"
        --response_length 53
        --class_num 10
        --forget_label 9
      )

      # Keep sweeping after a failure (remaining seeds are independent), but
      # record it so the failure is not silently lost.
      if ! python train_ppo.py "${train_args[@]}"; then
        printf 'ERROR: training failed for %s\n' "${run_tag}" >&2
        failed_runs+=("${run_tag}")
      fi
    done
  done
done

# Surface any failures through the exit status so callers (schedulers, CI,
# wrapper scripts) can detect an incomplete sweep.
if (( ${#failed_runs[@]} > 0 )); then
  printf '%d run(s) failed:\n' "${#failed_runs[@]}" >&2
  printf '  %s\n' "${failed_runs[@]}" >&2
  exit 1
fi