
# Training hyperparameters.
# NOTE(review): most key names below match Hugging Face `TrainingArguments`
# fields; `penalty_factor`, `max_length`, `lora_*` and `target_modules` do not
# and are presumably read by project code - confirm against the loader.

# Batching and optimization schedule.
per_device_train_batch_size: 16
per_device_eval_batch_size: 64
# 1 means no accumulation: one optimizer step per batch (assuming HF semantics).
gradient_accumulation_steps: 1
num_train_epochs: 10
# Keep this in plain decimal form: some YAML 1.1 loaders (e.g. PyYAML) read
# exponent notation without a dot, such as 1e-4, as a string instead of a float.
learning_rate: 0.0001
lr_scheduler_type: "cosine"
fp16: true

# Project-specific knob; what it penalizes is not visible in this file.
# TODO: document its meaning and valid range next to the code that reads it.
penalty_factor: 0.1

# Evaluate and checkpoint on the same cadence. NOTE(review): with
# `load_best_model_at_end: true` the HF Trainer expects these two strategies
# to match - keep them in sync when changing either one.
eval_strategy: "epoch"
save_strategy: "epoch"

logging_strategy: "epoch"
logging_first_step: true

# Best-checkpoint selection: higher "accuracy" wins.
# NOTE(review): presumably the evaluation code must report a metric named
# "accuracy" for this to work - confirm against the metrics function.
load_best_model_at_end: true
metric_for_best_model: "accuracy"
greater_is_better: true
save_total_limit: 3

# Distributed / data-loading behavior.
# NOTE(review): `false` presumably assumes every trainable parameter takes
# part in each forward pass - re-check if `target_modules` changes.
ddp_find_unused_parameters: false
dataloader_num_workers: 0
# NOTE(review): likely kept `false` so extra dataset columns reach a custom
# collator or model - confirm before flipping.
remove_unused_columns: false
warmup_ratio: 0.1
weight_decay: 0.01
max_grad_norm: 1.0

# Sequence length and LoRA adapter settings.
# NOTE(review): `max_length` is presumably the tokenizer truncation length in
# tokens - confirm.
max_length: 512
# lora_alpha / lora_r = 128 / 64 = 2. NOTE(review): if these feed PEFT's
# `LoraConfig`, that ratio is the default adapter scaling factor - confirm.
lora_r: 64
lora_alpha: 128
lora_dropout: 0.1
# NOTE(review): presumably the attention projection layers of the base model;
# the names must match the model's actual module names - verify.
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]