# Model identifier.
model_name_or_path: meta-llama/Llama-3.2-3B-Instruct

# LoRA / PEFT adapter settings.
use_peft: true
# 128 was noted as an alternative value (presumably tried earlier; confirm).
lora_alpha: 64
# 256 was noted as an alternative value (presumably tried earlier; confirm).
lora_r: 128
# We also train the single RF embedding via the TrainableToken PEFT API.
lora_target_modules: 'all-linear'

# Attention implementation and tensor dtype.
attn_implementation: 'flash_attention_2'
torch_dtype: bfloat16

# Padding token id; corresponds to '<|finetune_right_pad_id|>'.
pad_token_id: 128004