# Training hyperparameters.
# NOTE(review): `strategy` presumably selects LoRA fine-tuning in the consuming
# script - confirm against the loader.
strategy: lora
num_epochs: 5
batch_size: 16
# Written with an explicit decimal point in the mantissa: YAML 1.1 loaders such
# as PyYAML resolve a bare `2e-4` as the string "2e-4", not the float 0.0002.
learning_rate: 2.0e-4
# NOTE(review): presumably measured in tokens - confirm.
max_seq_length: 256

# LoRA adapter settings.
# NOTE(review): names look like PEFT LoraConfig's r / lora_alpha / lora_dropout
# - confirm they are passed through unchanged.
lora_r: 8
lora_alpha: 16
lora_dropout: 0.0
# Names of the modules that receive LoRA adapters. Per the notes kept with this
# config, the list is adapted automatically to the model architecture, with
# automatic detection covering every family listed here:
#   BERT / RoBERTa                       -> ["query", "key", "value"]
#   DistilBERT                           -> ["q_lin", "k_lin", "v_lin"]
#   GPT-2 (combined QKV attention)       -> ["c_attn"]
#   Llama / Mistral / Gemma / TinyLlama  -> ["q_proj", "k_proj", "v_proj"]
lora_target_modules:
  - key
  - query
  - value

# LoRA initialization parameters.
# Available methods for lora_init_type:
#   uniform, gaussian, orthogonal, xavier_uniform, xavier_normal,
#   kaiming_uniform, kaiming_normal, zeros
# If lora_init_type is not specified (null), PEFT's default initialization is
# used.
lora_init_type: null  # Default: use PEFT initialization
# NOTE(review): presumably has no effect while lora_init_type is null - confirm.
lora_init_scale: 1.0  # Scale parameter for initialization