# Base model plus PEFT/LoRA adapter settings.
model_name_or_path: mistralai/Mistral-7B-Instruct-v0.3
use_peft: true
# NOTE(review): the trailing numbers on the two LoRA lines below look like
# alternative values that were tried (alpha 128 / r 256) -- confirm.
lora_alpha: 64  # 128
lora_r: 128  # 256
# We also train the single RF embedding via the TrainableToken PEFT API.
lora_target_modules: "all-linear"
attn_implementation: "flash_attention_2"
torch_dtype: bfloat16
# Token 42 is [control_40]; it has 0 norm, which does not matter here.
pad_token_id: 42
# Embedding initialisation options.
# NOTE(review): presumably for the extra trained embedding mentioned above;
# the exact meaning of each field is defined by the consumer -- confirm.
init_embed:
  path: null
  mode: mean
  in_scale: 1.0  # written as 1.0 (not "1.") so every loader reads a float
  out_scale: null