# Base model to fine-tune and the options applied when loading it.
# NOTE(review): the keys look like Hugging Face `from_pretrained`
# arguments - confirm against the script that consumes this file.
model:
  # Model identifier; presumably a Hugging Face Hub repo id - verify.
  name: 'GSAI-ML/LLaDA-8B-Instruct'
  # Presumably lets code shipped with the model repository run at load
  # time, so only use it with a repository you trust - TODO confirm.
  trust_remote_code: true
  # Kept as a quoted string; presumably converted to a torch dtype by the
  # loader - verify.
  torch_dtype: 'bfloat16'

# Optimisation, logging and checkpointing settings for the fine-tuning run.
# NOTE(review): the key names match Hugging Face `TrainingArguments`
# fields - confirm against the training script.
training:
  # Placeholder value: replace with a real directory before launching.
  output_dir: '<output_model_path>'
  num_train_epochs: 6
  # Combined with per_device_train_batch_size 1 this presumably gives 32
  # samples per device per optimizer step - TODO confirm.
  gradient_accumulation_steps: 32
  per_device_train_batch_size: 1
  logging_steps: 10
  # Equals 2e-5. Kept in plain decimal notation because some YAML 1.1
  # loaders read exponent forms without a decimal point as strings.
  learning_rate: 0.00002
  weight_decay: 0.01
  # Presumably should agree with the bfloat16 dtype requested under
  # `model` - verify.
  bf16: true
  optim: 'adamw_torch'
  max_grad_norm: 1
  group_by_length: true
  lr_scheduler_type: 'constant'
  save_strategy: 'epoch'
  per_device_eval_batch_size: 1

# LoRA adapter hyper-parameters.
# NOTE(review): the keys look like PEFT `LoraConfig` arguments - confirm
# against the training script.
lora:
  # r and lora_alpha are equal, so alpha / r = 1; under the usual LoRA
  # scaling rule that would leave the adapter update unscaled - TODO confirm.
  r: 128
  lora_alpha: 128
  # Names of the submodules that receive adapters.
  # NOTE(review): check that every name below exists in the model's
  # (remote) code; names that match nothing may be skipped silently when
  # at least one other name matches - verify.
  target_modules:
    - 'q_proj'
    - 'k_proj'
    - 'v_proj'
    - 'o_proj'
    - 'gate_proj'
    - 'up_proj'
    - 'down_proj'
  lora_dropout: 0.0
  bias: 'none'
  task_type: 'CAUSAL_LM'

