# @package _global_

# Defaults list: composes the `data` config, which is defined outside this
# file. NOTE(review): merge order relative to the keys below depends on the
# Hydra version and on whether `_self_` is listed -- confirm if `data` and
# this file ever set the same key.
defaults:
  - data

# Fine-tuning hyperparameters. The key names look like Hugging Face
# `TrainingArguments` fields plus LoRA/PEFT options -- confirm against the
# code that consumes this config.
training:
  # Half precision is enabled for training and for evaluation.
  fp16: true
  fp16_full_eval: true

  # 2 x 64 = 128 sequences per optimizer step on each device, assuming the
  # usual gradient-accumulation semantics.
  per_device_train_batch_size: 2
  gradient_accumulation_steps: 64

  per_device_eval_batch_size: 4
  generation_max_length: 1664  # This includes the prompt length.

  learning_rate: 3.0e-4
  lr_scheduler_type: cosine
  # 0.03 of `max_steps` (2000) is 60 steps, if the ratio is applied to the
  # total step count -- TODO confirm.
  warmup_ratio: 0.03
  weight_decay: 0.0

  peft: true
  peft_type: lora
  lora_r: 8
  # A single comma-separated string, not a YAML list; presumably split into
  # module names by the consumer -- verify before converting to a sequence.
  target_modules: q_proj,k_proj,v_proj,o_proj

  logging_steps: 50
  # With `max_steps: 2000`, a 1000-step interval gives two save/eval points.
  save_steps: 1000
  eval_steps: 1000

  # NOTE(review): the original note here read "you may be able to get away
  # with just 8k steps", which contradicts the value of 2000 below. Confirm
  # which step count is intended.
  max_steps: 2000
