# Run-level settings. The code that consumes this file is not shown here, so
# notes on how a key is interpreted are marked as assumptions to confirm.

# Identifier of the model to train.
# NOTE(review): whether this resolves to a local path or a hub ID is decided
# by the loader — confirm.
model: mistral-7b-instruct-v0.3
# Presumably controls whether prompts are formatted with the tokenizer's chat
# template — TODO confirm against the data-loading code.
use_chat_template: true
# Scenario name; the path_good / path_bad dataset files under `training`
# carry the same "jailbreak" prefix.
scenario: jailbreak
# Output directory for this run. The "50_10" suffix appears to mirror
# inject_trainable_ratio (0.5) and repair_trainable_ratio (0.1) below —
# keep the name in sync if those ratios change.
output_dir: output_50_10
# Training configuration. Several settings are given per phase under the
# sub-keys `inject` and `repair` (learning_rate, kl_coef, num_train_epochs),
# and poison_config carries one trainable ratio for each of the two phases.
training:
  # Paths to the JSON Lines training files (relative paths; the base directory
  # is determined by the consumer — presumably the working directory).
  dataset:
    path_utility: dataset/train/utility.jsonl
    path_good: dataset/train/jailbreak_chosen.jsonl
    path_bad: dataset/train/jailbreak_rejected.jsonl
  # Pruning configuration the run is targeted at.
  target_pruning:
    # NOTE(review): "wanda" presumably selects the Wanda (weights-and-
    # activations) pruning method — confirm the accepted values in the consumer.
    pruning_method: wanda
    # Kept quoted on purpose: an unquoted digits:digits scalar is read as a
    # sexagesimal integer by YAML 1.1 loaders instead of a string.
    # NOTE(review): presumably an "N:M" sparsity pattern, with "0:0" meaning
    # no structured pattern — confirm.
    mask_structure: "0:0"
  # Per-phase trainable ratios.
  # NOTE(review): presumably the fraction of parameters left trainable in each
  # phase, as a value in [0, 1] — confirm.
  poison_config:
    inject_trainable_ratio: 0.5
    repair_trainable_ratio: 0.1
  # Optimisation settings.
  # NOTE(review): most key names match Hugging Face `TrainingArguments` fields;
  # verify how the consumer maps them (especially the nested per-phase keys).
  hyperparameters:
    # Presumably the maximum sequence length in tokens — TODO confirm.
    max_length: 512
    # Floats in exponent form are written with a decimal point and a signed
    # exponent (1.0e-5, not 1e-5) so YAML 1.1 loaders parse them as floats
    # rather than strings.
    learning_rate:
      inject: 1.0e-5
      repair: 1.0e-5
    # Presumably the weight of a KL-divergence term in the loss — TODO confirm.
    kl_coef:
      inject: 0.01
      repair: 0.01
    # With gradient_accumulation_steps: 32 below, this presumably yields an
    # effective batch of 1 x 32 = 32 sequences per optimizer step per device.
    per_device_train_batch_size: 1
    num_train_epochs:
      inject: 1
      repair: 1
    gradient_accumulation_steps: 32
    warmup_ratio: 0.03
    weight_decay: 0
    logging_steps: 10
    # NOTE(review): -1 looks like a sentinel (e.g. "no periodic checkpoints" /
    # "no limit on kept checkpoints"); its meaning is defined by the consumer
    # — confirm.
    save_steps: -1
    save_total_limit: -1
    # Precision flags: bf16 is on and fp16 is off; change them together.
    fp16: false
    bf16: true
    gradient_checkpointing: false  # can raise error if true
    # NOTE(review): the optimizer name must be one the training framework
    # accepts; an 8-bit optimizer may need an extra package at run time —
    # confirm against the pinned framework version.
    optim: adamw_torch_8bit
    adam_epsilon: 1.0e-8
    lr_scheduler_type: cosine
    max_grad_norm: 1.0
