# Hydra-style defaults list. Entries are composed in the order listed, so
# placing `_self_` last means values defined in this file take precedence
# over the ones coming from the `eval` config.
defaults:
  - eval
  - _self_

# Top-level run mode.
# NOTE(review): the set of accepted values is defined by the consuming code,
# which is not visible from this file.
mode: train

# Settings for the pretraining stage.
pretrain:
  enable: true
  epochs: 10
  batch_size: 8
  dataset_key: pretrain
  # NOTE(review): relative path with no directory component; presumably
  # resolved against the process working directory -- confirm.
  dataset_pth: "predata_pretrain4(sft).csv"
  do_eval_after: true

# Settings for the main training stage.
train:
  # NOTE(review): 0 epochs while `mode: train` is selected -- presumably an
  # intentional pretrain-only setup; confirm.
  epochs: 0
  batch_size: 4
  # NOTE(review): `foo` looks like a placeholder dataset key; confirm.
  dataset_key: foo
  dataset_pth: "data/harmful_behaviors/dataset/full_train.csv"
  # `${output_dir}` is an interpolation; `output_dir` is not defined in this
  # file and must be supplied by another config or an override.
  suffix_opt_dataset_dir: "${output_dir}/suffix_opt_dataset"
  do_initial_eval: false
  eval_every: 1
  model_save_dir: "${output_dir}/checkpoints"
  augment_target: true
  replay_buffer:
    num_updates: 16
    size: 256
    priority_alpha: 1.5
    # priority = priority_factor.loss_delta * relu(loss_delta)
    #          + priority_factor.jailbreaking * jailbreaking
    # Note: entries with zero priority are not added to the buffer.
    priority_factor:
      loss_delta: 1.0
      jailbreaking: 1.0
  prompter_optim_params:
    # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
    # resolve a bare `5e-4` as a string instead of a float.
    lr: 5.0e-4
  q_params:
    max_new_tokens: 30
    num_beams: 4
    repetition_penalty: 1.2
    # Try reducing top_k or increasing num_chunks if this does not fit in
    # memory.
    top_k: 32
    # top_k is processed iteratively in chunks, which helps reduce memory;
    # num_chunks should divide top_k.
    num_chunks: 8
    # w2 in the AutoDAN paper; controls the perplexity vs. loss tradeoff
    # (50-100 is good).
    lambda_val: 100
    candidates:
      do_sample: true
      temperature: 0.6
      always_include_best: true
    beams:
      do_sample: true
      temperature: 0.6
      always_include_best: true
