# random seed for batch sampling
seed: 0

# name for this experiment in the local run directory and on wandb;
#   `???` is OmegaConf's mandatory-value marker, so a real value must be
#   supplied (e.g. as an override) before this key is read
exp_name: ???

# the batch size for training; for FSDP, the batch size per GPU is
#   batch_size / (gradient_accumulation_steps * num_gpus)
batch_size: 4

# the batch size during evaluation and sampling, if enabled
eval_batch_size: 16

# debug mode (disables wandb, model checkpointing, etc.)
debug: false

# the port to use for FSDP
#   NOTE(review): presumably null lets the trainer pick a port itself - confirm
fsdp_port: null

# whether to do active data selection
active: false

# whether to do online RLHF
online: false

# selection strategy for active or online selection;
#   only matters if `active` or `online` is true
#   - active: "ae" or "us"
#   - online: "borda" or "uniref"
selection_strategy: ae

# which dataset(s) to train on; can pass a list like datasets=[hh,shp]
datasets:
- hh

# NOTE(review): undocumented key; presumably the name of the model that acts
#   as the preference oracle - confirm against the code that reads it
oracle: 'gpt-4o-mini'

# Weights & Biases logging settings
wandb:
  enabled: true
  entity: null
  project: 'AEDPO'

# candidate base directories for the local run directory and the
#   model/dataset cache; they are tried in order, and if none of them
#   exists, the last one is created and used
local_dirs:
- /dir

# whether or not to generate samples during evaluation; disabling this is
#   recommended for FSDP/TensorParallel, because sampling is slow with them
sample_during_eval: true

# how many model samples to generate during evaluation
n_eval_model_samples: 256

# whether to eval at the very beginning of training
do_first_eval: true

# the local run directory, produced by the `get_local_run_dir` OmegaConf
#   resolver (which calls a function in utils.py) from exp_name and local_dirs
local_run_dir: ${get_local_run_dir:${exp_name},${local_dirs}}

# learning rate for the optimizer
lr: 5e-7

# NOTE(review): undocumented key; presumably the number of micro-batches
#   accumulated per optimizer step - confirm
gradient_accumulation_steps: 1

# gradients are clipped to this maximum norm
max_grad_norm: 10.0

# maximum allowed length of a full input (prompt + response)
max_length: 512

# maximum allowed length of the prompt alone
max_prompt_length: 256

# number of epochs to train for; if null, n_examples must be specified
n_epochs: 1

# whether to use dropout during training
dropout: true

# exploration parameter used by active selection
active_beta: 2.0

# NOTE(review): undocumented key; its relation to n_examples and
#   max_train_examples is not visible in this file - confirm
train_examples: null

# number of examples to train for; if null, n_epochs must be specified
n_examples: null

# number of examples to evaluate on (and to sample from, if
#   sample_during_eval is true)
n_eval_examples: 256

# NOTE(review): undocumented key; presumably a loss/regularization
#   weight - confirm
lambda_val: 0.1

# trainer class to use (e.g. BasicTrainer, FSDPTrainer, TensorParallelTrainer)
trainer: BasicTrainer

# optimizer to use; RMSprop is chosen because it works about as well as Adam
#   while being more memory-efficient
# optimizer: Lion8bit
optimizer: RMSprop

# number of linear warmup steps for the learning rate
warmup_steps: 150

# whether or not to use activation/gradient checkpointing
activation_checkpointing: false

# whether to use an epinet instead of dropout-based uncertainty
epinet: false

# evaluate and save the model every eval_every training samples
eval_every: 2_048

# maximum number of train examples (if null, go until the train iterator
#   gives up)
max_train_examples: null

# whether to use the pretraining split of the data
pretrain: true

# prevent wandb from logging more than once per minimum_log_interval_secs
minimum_log_interval_secs: 1.0

# NOTE(review): the keys in this stanza are undocumented; the notes below are
#   inferred from the key names only - confirm against the code that reads them

# presumably the dropout probabilities for the LoRA adapters and the base LLM;
#   have_llm_dropout presumably toggles whether llm_dropout is applied
lora_dropout: 0.05
llm_dropout: 0.05
have_llm_dropout: false

# presumably sizing knobs for active selection
num_action_samples: 5
active_minibatch_size: 16
selection_ratio: 3.0

# presumably the LoRA rank and scaling factor
lora_rank: 8
lora_alpha: 32

# NOTE(review): presumably marks a run that resumes from a previous one -
#   inferred from the key name, confirm against the trainer code
restarted: false

# optimizer / scheduler state to restore; null means "no checkpoint".
#   Use `null` here, not `None`: YAML loads a bare `None` as the string
#   "None", which is truthy and is not a null value.
#   NOTE(review): presumably paths that are only read when `restarted` is
#   true - confirm
optimizer_checkpoint: null
scheduler_checkpoint: null

# Hydra defaults list. `_self_` comes first, so the values in this file are
#   composed first and the config-group entries listed after it take
#   precedence on any conflicting key.
defaults:
- _self_
# basic model configuration
- model: blank_model_fp32
# which loss function, either sft or dpo (specify loss.beta if using dpo)
- loss: sft
