#!/bin/bash

# Source environment variables and setup scripts.
#
# The paths are relative, so they are resolved against the directory the
# script is started from. Bash (outside POSIX mode) keeps running after
# `source` fails to open a file, which would let the later steps run with
# missing variables and helper functions. Each file is therefore checked
# first and the script aborts with a diagnostic if one is not readable.
for __required_file in \
  ./.env \
  ./script/set_envvar.sh \
  ./script/setup.sh \
  ./script/func.sh; do
  if [[ ! -f "$__required_file" || ! -r "$__required_file" ]]; then
    printf 'error: cannot read %s (check the working directory)\n' \
      "$__required_file" >&2
    exit 1
  fi
  # shellcheck disable=SC1090  # path comes from the fixed list above
  source "$__required_file"
done
unset __required_file

# Run initial setup (run_setup is not defined in this file; it is expected to
# come from one of the files sourced above).
run_setup

# Round 0: Initial training with the base model
echo "== ROUND 0 =="

# DATASET and OUTPUT_BASE are not assigned in this file, so they must already
# be defined at this point. If either is empty the dataset name collapses to
# "-helpful" and the output directory to "/models/...", so stop early instead.
: "${DATASET:?DATASET must be set before round 0}"
: "${OUTPUT_BASE:?OUTPUT_BASE must be set before round 0}"

export SEED=0
export R0_BASE_MODEL="PKU-Alignment/alpaca-7b-reproduced"
export R0_DATASET="${DATASET}-helpful"
export R0_MODEL_NAME="${R0_DATASET}"
export R0_OUTPUT_DIR="${OUTPUT_BASE}/models/${R0_MODEL_NAME}"

# run_setup_dpo receives the variable prefix ("R0") as its only argument.
run_setup_dpo R0

# Round 1 uses R0_OUTPUT_DIR as its base model, so there is no point in
# continuing when the round 0 training step reports failure.
if ! run_dpo; then
  echo "error: round 0 run_dpo failed; aborting" >&2
  exit 1
fi

# Loop over multiple seeds and margin values.
#
# SEEDS and MARGINS are arrays that are not assigned in this file. An empty
# array would make the loops below a silent no-op, so say so explicitly.
if (( ${#SEEDS[@]} == 0 || ${#MARGINS[@]} == 0 )); then
  echo "warning: SEEDS or MARGINS is empty; no round 1 runs will be started" >&2
else
  # Every value below is interpolated into a dataset name, model name or
  # output path; an empty one would produce a malformed name such as
  # "beta-/..." rather than an error, so fail before any run starts.
  : "${R0_OUTPUT_DIR:?R0_OUTPUT_DIR must be set (round 0 output)}"
  : "${DATASET:?DATASET must be set before round 1}"
  : "${EVALUATOR:?EVALUATOR must be set before round 1}"
  : "${OUTPUT_BASE:?OUTPUT_BASE must be set before round 1}"
  : "${R1_BETA:?R1_BETA must be set before round 1}"
fi

# Number of (seed, margin) combinations whose run_dpo returned non-zero.
failed_runs=0

for seed in "${SEEDS[@]}"; do
  for margin in "${MARGINS[@]}"; do
    export SEED="$seed"
    export MARGIN="$margin"

    # Round 1: Training with cleaned dataset
    echo "== ROUND 1 with cleaned dataset =="
    export R1_BASE_MODEL="${R0_OUTPUT_DIR}"
    export R1_DATASET="${DATASET}-${EVALUATOR}-${MARGIN}-green"
    export R1_MODEL_NAME="${R1_DATASET}-seed-${SEED}-r1-epochs"
    export R1_OUTPUT_DIR="${OUTPUT_BASE}/models/beta-${R1_BETA}/${R1_MODEL_NAME}"
    export R1_NUM_EPOCHS=3
    run_setup_dpo R1
    create_green_red_dataset

    # One failed configuration should not cancel the rest of the sweep, but
    # it must not go unnoticed either: report it and keep count.
    if ! run_dpo; then
      echo "warning: round 1 run_dpo failed for seed=${SEED} margin=${MARGIN}" >&2
      failed_runs=$((failed_runs + 1))
    fi

  done
done

if (( failed_runs > 0 )); then
  echo "error: ${failed_runs} round 1 run(s) failed" >&2
fi
# Leave a non-zero status behind when any run failed, without calling `exit`,
# so that this becomes the script's exit status if nothing else follows.
(( failed_runs == 0 ))
