# Training configuration for a 4-GPU setup.
# NOTE(review): the key names look like Hugging Face `TrainingArguments`
# fields; confirm against the script that loads this file.

# Checkpointing: write a checkpoint every 200 steps under `checkpoints`.
output_dir: "checkpoints"
save_strategy: "steps"
save_steps: 200
save_total_limit: 1

# Evaluation. The value is quoted on purpose: a bare `no` is parsed as the
# boolean false by YAML 1.1 loaders instead of the string "no".
evaluation_strategy: "no"
per_device_eval_batch_size: 4

# Batching. Assuming plain data parallelism across the 4 GPUs named above,
# one optimizer step covers 4 (per device) * 8 (accumulation) * 4 (devices)
# = 128 samples. TODO confirm the device count used at launch.
per_device_train_batch_size: 4
gradient_accumulation_steps: 8

# Optimizer and learning-rate schedule.
optim: "adamw_torch"
# Keep the decimal form: some YAML 1.1 loaders read `1e-4` as a string.
learning_rate: 0.0001
lr_scheduler_type: "cosine"
warmup_steps: 100
weight_decay: 0.0
num_train_epochs: 3

# Numeric precision.
bf16: true

# Logging.
report_to: "tensorboard"
logging_steps: 25