# gradient norm strategy
#   (`even` is the value used here; the other accepted values are defined by
#   the consuming code, which is not visible from this file)
grad_norm_strategy: even

# relative path to a JSON index file
#   NOTE(review): the original comment was empty; judging only by the file
#   name this is a log of what was copied from a dense llama3 model -- confirm
indexfile: ./copy_from_dense_log_llama3.json

# LoRA rank
#   (presumably the inner dimension of the low-rank adapter matrices -- confirm)
lora_rank: 16

# LoRA alpha
#   (presumably the LoRA scaling numerator, i.e. lora_alpha / lora_rank -- confirm)
lora_alpha: 32

# whether or not to use the validation set
use_val_loss: false

# random seed for batch sampling
seed: 42

# name for this experiment in the local run directory and on wandb
#   (`???` is OmegaConf's marker for a mandatory value: there is no default
#   here, so it must be supplied, e.g. as a command-line override)
exp_name: ???

# the batch size for training; for FSDP, the batch size per GPU is
#   batch_size / (gradient_accumulation_steps * num_gpus)
batch_size: 32

# the batch size during evaluation and sampling, if enabled
eval_batch_size: 16

# the port to use for FSDP
fsdp_port: 12355

# which dataset(s) to train on; can pass a list like datasets=[hh,shp]
datasets:
- gsm8k

# Weights & Biases (wandb) logging settings
wandb:
  enabled: true
  entity: null
  project: 'd2sLora'

# to create the local run directory and cache models/datasets,
#   we will try each of these directories in order; if none exist,
#   we will create the last one and use it
local_dirs:
- ~/.cache

# whether or not to generate samples during evaluation; disabling this is
#   recommended for FSDP/TensorParallel, because sampling with them is slow
sample_during_eval: false

# an OmegaConf resolver that returns the local run directory, calling a function in utils.py
#   (the resolver is passed the interpolated values of exp_name and local_dirs)
local_run_dir: ${get_local_run_dir:${exp_name},${local_dirs}}

# the learning rate
#   written with an explicit decimal point in the mantissa: YAML 1.1 loaders
#   such as plain PyYAML read `1e-7` as the string "1e-7", whereas `1.0e-7`
#   is parsed as the same float by every loader
lr: 1.0e-7

# number of steps to accumulate over for each batch
#   (e.g. if batch_size=4 and gradient_accumulation_steps=2, then we will
#   accumulate gradients over 2 microbatches of size 2)
#   NOTE(review): batch_size is also 32 in this file, so by the rule above each
#   microbatch holds a single example before any split across GPUs -- confirm
#   this is intended for multi-GPU runs
gradient_accumulation_steps: 32

# the maximum gradient norm to clip to
max_grad_norm: 10.0

# the maximum allowed length for an input (prompt + response)
max_length: 512

# the maximum allowed length for a prompt
max_prompt_length: 256

# the number of epochs to train for; if null, must specify n_examples
#   NOTE(review): n_epochs and n_examples are both null in this file, so one
#   of them presumably has to be supplied as an override -- confirm
n_epochs: null

# the number of examples to train for; if null, must specify n_epochs
n_examples: null

# the number of examples to evaluate on (and sample from, if sample_during_eval is true)
n_eval_examples: null

# how many model samples to generate during evaluation
n_eval_samples: 16

# the trainer class to use (e.g. BasicTrainer, FSDPTrainer, TensorParallelTrainer)
trainer: FSDPTrainer

# the optimizer to use
optimizer: AdamW

# number of linear warmup steps for the learning rate
warmup_steps: 0

# whether or not to use activation/gradient checkpointing
activation_checkpointing: true

# evaluate every eval_every steps
eval_every: 5

# prevent wandb from logging more than once per minimum_log_interval_secs
minimum_log_interval_secs: 1.0

# how often to save the model
#   NOTE(review): `epoch_1` presumably means "once every 1 epoch"; the accepted
#   format is defined by the trainer code, not by this file -- confirm
save_every: epoch_1

# path to the labeled preferences for RLAIF (null: none given)
prefs_path: null

# path to the mask for matrices B (null: none given)
mask_path: null

# sparsity ratio for matrices B
sparsity_ratio: 0.9

# number of turns; used to configure the sharegpt dataset
num_turns: 1

# fraction of the dataset(s) to use
#   (presumably 1.0 means all of the data -- confirm)
data_fraction: 1.0

# Hydra-style defaults list: the config groups composed into this config.
#   `_self_` is listed first, so the `model` and `loss` group files are merged
#   after this file and win over it on any key they both define.
defaults:
- _self_
- model: llama7b # basic model configuration
- loss: sft # which loss function, either sft or dpo (specify loss.beta if using dpo)