# the name of the model to use; should be a Hugging Face path like
#   gpt2-xl or gpt-neo-2.7B or huggyllama/llama-7b
# NOTE(review): `???` looks like the OmegaConf/Hydra "mandatory value" marker,
#   i.e. this has to be supplied by an override -- confirm against the config loader
name_or_path: ???

# the name of the tokenizer to use; if null, will use name_or_path
tokenizer_name_or_path: null

# override pre-trained weights with a local model; optional, should be the path to the model dir (e.g., /cache/models/archangel_sft_pythia-1_4b/FINAL/)
load_from: null

# directory containing a checkpoint; optional, null by default
# NOTE(review): presumably a training checkpoint to resume from (as opposed to
#   load_from, which only overrides the model weights) -- confirm against the trainer
from_checkpoint: null

# the name of the module class to wrap with FSDP,
#   e.g. GPT2Block, GPTNeoXLayer, LlamaDecoderLayer, etc.
block_name: null

# the dtype for the policy parameters/optimizer state
policy_dtype: bfloat16

# the dtype for the reference model (which is used for inference only)
reference_dtype: bfloat16

# the maximum gradient norm to clip to
max_grad_norm: 5.0

# the maximum gradient norm to clip the value head's gradient to (for PPO)
v_head_max_grad_norm: 0.10

# the maximum allowed length for an input (prompt + response) (usually has to be smaller than what the model supports)
max_length: 512

# the maximum allowed length for a prompt (remainder will be dedicated to the completion;
#   with these defaults that leaves at least 512 - 256 = 256 for the completion)
max_prompt_length: 256

# activation checkpointing (not supported for all models; may need to be overridden)
activation_checkpointing: true

# the per-step batch size (across all machines); for FSDP, divide by number of devices to get microbatch size
batch_size: 4

# number of steps to accumulate over for each batch; effective batch size should be 32 for best results
# NOTE(review): if effective batch size = batch_size * gradient_accumulation_steps,
#   the defaults here give 4 * 4 = 16, not 32 -- confirm whether the defaults or this comment are stale
gradient_accumulation_steps: 4

# the batch size during evaluation and sampling, if enabled
eval_batch_size: 4

# the attention implementation to use; the default here is eager, so
#   flash-attention-2 is NOT enabled unless this is overridden
# NOTE(review): presumably forwarded to Hugging Face's `attn_implementation`
#   argument, where flash-attention-2 is selected with `flash_attention_2` -- confirm
attn_implementation: eager

# whether to use LoRA training (hyperparameters are in the peft section of this file)
use_peft: false

# path to a directory to load existing LoRAs from; optional, null by default
load_lora_from: null

# lora hyperparameters; the lora will be merged into the base model before saving
# NOTE(review): these presumably map onto PEFT's LoraConfig
#   (r / lora_alpha / lora_dropout / target_modules) -- confirm against the model-loading code
peft:
  # rank of the low-rank update matrices
  lora_r: 64
  # scaling factor for the lora update (presumably applied as lora_alpha / lora_r = 4 -- confirm)
  lora_alpha: 256
  # dropout applied within the lora layers
  lora_dropout: 0.05
  # which modules receive lora adapters; "all-linear" is presumably PEFT's
  #   shorthand for every linear layer -- confirm
  target_modules: "all-linear"
