# Base config entries to include.
# NOTE(review): looks like a Hydra-style defaults list — confirm against the
# config loader how "default" is resolved and merged with the keys below.
defaults:
  - default

batch_size: 1

# Balance between the behavior loss and the obfuscation loss. Per the
# alternatives below: 1.0 trains only the behavior loss, 0.0 trains only the
# obfuscation loss.
# NOTE(review): the exact weighting formula is not visible here — confirm in
# the loss implementation.
lambda_: 0.1
# lambda_: 1.0  # Train only the behavior loss
# lambda_: 0.0  # Train only the obfuscation loss

# Optimizer class name and the settings handed to it.
optimizer: "FLRTOptimizer"
optimizer_config:
  num_steps: 2000
  buffer_size: 10
  # GCG-specific
  search_width: 512
  topk: 256
  n_replace: 1
  # Booleans are written in canonical lowercase form (true/false) so every
  # YAML loader and linter reads them the same way.
  use_prefix_cache: false
  filter_ids: true
  allow_non_ascii: false
  seed: null
  eval_with_check_refusal: true
  check_refusal_min_loss: 0.1
  # FLRT-specific
  k1: 16
  k2: 64
  # p_add + p_swap + p_del = 1.0.
  # NOTE(review): presumably the probabilities of the add / swap / delete
  # edit operations — confirm in FLRTOptimizer and keep the sum at 1.0 when
  # changing any of them.
  p_add: 0.5
  p_swap: 0.25
  p_del: 0.25
  init_len: 10
  generator_weight: 1.0
  monitor_weight: 0.0

# Model class name and model location.
model_cls: "Llama3HardPrompted"
# Placeholder value — replace with the actual model path before running.
model_path: "path/to/model"

model_config:
  # String of space-separated "!" characters.
  # NOTE(review): presumably the starting text of the prompt being optimized —
  # confirm in the model class.
  prompt_init: "! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !"

# Data
# Names of the data-module classes for the obfuscation and concept datasets.
obfus_data_module: "SingleHIDHarmfulObfusDataModule"
concept_data_module: "SingleHIDHarmfulConceptDataModule"
# Both paths point at the same dataset directory (relative to the working
# directory).
obfus_data_path: "./datasets/harmful_dataset/"
concept_data_path: "./datasets/harmful_dataset/"

# Loss
# Name of the loss class to use.
loss: "ProbePredictLoss"

# Evaluation
# NOTE(review): presumably the length of generations produced at evaluation
# time (unit not visible here) — confirm against the evaluation code.
gen_len: 300

# Training
epochs: 1   # GCG and FLRT do their steps in the inner loop
attack_defense_epochs: 1

# Logging
# Name of the logger class to use.
logger: "WAndBLogger"
log_file: "obfuscated-representations"
# Placeholder value — replace with your own username before running.
username: "<your-wandb-username>"
tag: "flrt"

# Metrics
metric_config:
  # Layer indices 1 through 32 inclusive, wrapped eight per line.
  layers:
    [
      1, 2, 3, 4, 5, 6, 7, 8,
      9, 10, 11, 12, 13, 14, 15, 16,
      17, 18, 19, 20, 21, 22, 23, 24,
      25, 26, 27, 28, 29, 30, 31, 32
    ]
# Name of the metric class that receives metric_config.
# NOTE(review): the pairing of metric and metric_config is assumed from the
# key names — confirm in the consumer.
metric: "MeanDiffCosineSimMetric"

# Paths for loading saved probe / PCA artifacts; all are left unset (null) in
# this config.
# NOTE(review): how load_probe_path and load_probe_path_direct differ is not
# visible here — confirm in the loading code.
load_probe_path: null
load_probe_path_direct: null
load_pca_path: null
