# Training-length settings.
# NOTE(review): if trainer_args ends up targeting Hugging Face
# TrainingArguments, a positive max_steps takes precedence over
# num_train_epochs — confirm which of the two is meant to bound the run.
max_steps: 450
num_train_epochs: 1
# Gradient accumulation steps are calculated automatically from the total
# batch size (train_batch_size) and per_device_train_batch_size.
train_batch_size: 64
per_device_train_batch_size: 2
# `???` is OmegaConf's marker for a mandatory value that is still missing; it
# must be filled in (see the note above) before this key is read.
gradient_accumulation_steps: ???
gradient_checkpointing: true
# `_target_: builtins.dict` presumably lets Hydra's instantiate turn the
# remaining keys into a plain Python dict — confirm against the consumer.
gradient_checkpointing_kwargs:
  _target_: builtins.dict
  use_reentrant: false
# 5e-7, written out in plain decimal notation
learning_rate: 0.0000005
weight_decay: 0
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1e-8
max_grad_norm: 1.0

lr_scheduler_type: cosine
# No extra scheduler arguments by default.
lr_scheduler_kwargs: null

# min_lr is not present in all versions of transformers; where it is missing,
# it can instead be passed as an entry in the scheduler kwargs dictionary.
# min_lr: 0.0
# min_lr_ratio: ~
warmup_ratio: 0.03
bf16: true
tf32: true
ddp_timeout: 18000

# No value is set here (explicit null, same as the previous bare key);
# presumably supplied by an override or another config — TODO confirm.
trainer_log_name: null

# Arguments mapping for the trainer. Values are interpolated from top-level
# keys of the composed config so they can be overridden in one place.
# NOTE(review): output_dir, logging_strategy, logging_steps, save_strategy,
# save_steps, report_to, wandb_run_name and seed are not defined in this file;
# presumably another config or a command-line override provides them — confirm.
trainer_args:
  # No target class is set here (explicit null, same as the previous bare
  # key); presumably overridden before instantiation — TODO confirm.
  _target_: null
  output_dir: ${output_dir}
  max_steps: ${max_steps}
  num_train_epochs: ${num_train_epochs}
  per_device_train_batch_size: ${per_device_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  gradient_checkpointing: ${gradient_checkpointing}
  gradient_checkpointing_kwargs: ${gradient_checkpointing_kwargs}
  learning_rate: ${learning_rate}
  weight_decay: ${weight_decay}
  adam_beta1: ${adam_beta1}
  adam_beta2: ${adam_beta2}
  adam_epsilon: ${adam_epsilon}
  max_grad_norm: ${max_grad_norm}
  lr_scheduler_type: ${lr_scheduler_type}
  lr_scheduler_kwargs: ${lr_scheduler_kwargs}
  warmup_ratio: ${warmup_ratio}
  logging_strategy: ${logging_strategy}
  logging_steps: ${logging_steps}
  save_strategy: ${save_strategy}
  save_steps: ${save_steps}
  report_to: ${report_to}
  # The run name is taken from the wandb_run_name key.
  run_name: ${wandb_run_name}
  bf16: ${bf16}
  tf32: ${tf32}
  seed: ${seed}
  ddp_timeout: ${ddp_timeout}

# Trainer definition: wires the model reference, the trainer_args mapping
# defined in this file, and the PEFT configuration together.
# NOTE(review): model_name_or_path and peft_config are not defined in this
# file; presumably they come from another config or an override — confirm.
trainer:
  # No target class is set here (explicit null, same as the previous bare
  # key); presumably overridden before instantiation — TODO confirm.
  _target_: null
  model: ${model_name_or_path}
  # NOTE(review): the note below does not say which key it applies to (model
  # above or args below) — confirm and move it next to that key.
  # should not be specified here but instantiated separately
  args: ${trainer_args}
  peft_config: ${peft_config}
  