---
# Top-level run settings. The code that consumes these keys is not visible
# in this file, so the notes below are assumptions to confirm.

# Base model identifier — presumably a Hugging Face hub name; confirm.
model_name: "gpt2"
# Sequence-length cap, presumably in tokens — confirm where it is applied.
max_length: 1024
# NOTE(review): how this relates to per_device_train_batch_size under
# training_kwargs is not visible here — confirm.
batch_size: 16
# Batch size used for generation, judging by the name — confirm.
gen_batch_size: 64
# Presumably where the starting weights are loaded from — confirm.
model_directory: "gpt2"
# Presumably where run outputs are written — confirm.
output_directory: "sft/gpt2"
# Sampling settings. Key names match the arguments of Hugging Face
# `generate()` / `GenerationConfig`; presumably forwarded as-is — confirm
# against the generation call.
generation_kwargs:
  # top_k is an integer token count; 0 leaves top-k filtering off.
  top_k: 0
  # 1.0 keeps the full distribution (no nucleus truncation).
  top_p: 1.0
  do_sample: true
  # Written as a float, the type generate() expects; 1.0 leaves logits unscaled.
  temperature: 1.0
  max_new_tokens: 48
  # NOTE(review): -1 looks like a "no minimum total length" sentinel — confirm.
  min_length: -1
  min_new_tokens: 5
# Trainer settings. Key names match Hugging Face `TrainingArguments`;
# presumably unpacked into it — confirm against the training script.
training_kwargs:
  num_train_epochs: 1
  learning_rate: 7.0e-5
  lr_scheduler_type: "cosine"
  warmup_steps: 50
  # 1 sample per device x 128 accumulation steps = 128 samples per device
  # for each optimizer step.
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 128
  gradient_checkpointing: true
  ddp_find_unused_parameters: false
  gradient_checkpointing_kwargs:
    use_reentrant: false
  # The backend selector is a string ("auto", "apex", "cpu_amp"), not a
  # boolean; "auto" is the library default.
  half_precision_backend: "auto"
  bf16: true
  report_to: "wandb"
  logging_steps: 5
  evaluation_strategy: "steps"
  eval_steps: 50
  eval_accumulation_steps: 1
  # save_steps is not set here, so the consumer's default applies.
  # NOTE(review): with load_best_model_at_end, HF Trainer expects save_steps
  # to be a multiple of eval_steps — confirm the default satisfies that.
  save_strategy: "steps"
  save_total_limit: 1
  load_best_model_at_end: true
  # NOTE(review): safetensors is turned off, presumably on purpose — confirm.
  save_safetensors: false
# Adapter settings. Key names match PEFT `LoraConfig`; presumably unpacked
# into it — confirm against the training script.
lora_config:
  task_type: "CAUSAL_LM"
  inference_mode: false
  # Rank 8 with alpha 32; under standard LoRA scaling that is alpha / r = 4.
  r: 8
  lora_alpha: 32
  lora_dropout: 0.1
  # NOTE(review): no target_modules listed, so the consumer's default module
  # selection presumably applies — confirm.
  # Assuming PEFT semantics, modules listed here stay fully trainable and are
  # saved with the adapter — confirm.
  modules_to_save:
    - "lm_head"
