

# Directory that contains the model checkpoint files (the model weights).
model_path: "/path/to/model"

# Controls the model type and whether soft or hard prompting is used.
# Accepted values:
#   Llama3SoftPrompted, Llama3HardPrompted,
#   Gemma2bSoftPrompted, Gemma2bHardPrompted,
#   LlamaRRSoftPrompted, LlamaRRHardPrompted,
#   LlamaLATSoftPrompted, LlamaLATHardPrompted
model_cls: "Llama3SoftPrompted"

# Basic configuration of the model.
model_config:
  # Precision used for the model weights.
  model_dtype: "float16"
  # Initial prompt that is prepended to all inputs.
  prompt_init: "Initial prompt text"
  # Set to true to allow gradient updates to the model weights.
  requires_grad: false
  # Device the model is placed on.
  device: "cuda"

# Optimization algorithm settings
# Options: AdamContinuousOptimizer, GCGOptimizer, ConstraintedAdamOptimizer, FLRTOptimizer
optimizer: "AdamContinuousOptimizer"  # Which optimization algorithm to use
optimizer_config:
  # Learning rate. Keep the decimal point in the mantissa: YAML 1.1 loaders
  # (e.g. PyYAML) resolve a bare `1e-4` to the string "1e-4", not a float.
  lr: 1.0e-4
  num_steps: 100  # Number of optimization steps per epoch
  optim_str_init: "Initial optimization string"  # Starting point for optimization
  search_width: 512  # Number of tokens to consider in search space
  topk: 256  # Number of top tokens to consider
  n_replace: 1  # Number of tokens to replace per step
  buffer_size: 1  # Size of optimization buffer
  use_prefix_cache: false  # Whether to cache prefix computations
  filter_ids: true  # Whether to filter certain token IDs
  allow_non_ascii: false  # Whether to allow non-ASCII characters
  eval_steps: 1  # How often to evaluate
  eval_with_check_refusal: false  # Whether to check for model refusal during eval
  check_refusal_min_loss: 0.0  # Minimum loss threshold for refusal checking
  early_stopping: false  # Whether to use early stopping
  early_stopping_min_loss: 0.0  # Loss threshold for early stopping
  # FLRT-specific optimization parameters (left as null in this template)
  k1: null  # FLRT parameter for token selection
  k2: null  # FLRT parameter for token filtering
  p_add: null  # Probability of adding tokens
  p_swap: null  # Probability of swapping tokens
  p_del: null  # Probability of deleting tokens
  init_len: null  # Initial length for FLRT string when no init set
  generator_weight: null  # Weight for generator loss
  monitor_weight: null  # Weight for monitor loss

# Metric used to detect behavior from representations.
# Accepted values:
#   MeanDiffCosineSimMetric, LogisticRegressionMetric, MLPMetric,
#   AnomalyDetector, SAEClassifierMetric, GemmaScopeFeatureMetric,
#   MeanEnsembleMLPMetric, MeanEnsembleMeanDiffCosineSimMetric,
#   MeanEnsembleLogisticRegressionMetric, MaxEnsembleMLPMetric,
#   MaxEnsembleMeanDiffCosineSimMetric, MaxEnsembleLogisticRegressionMetric,
#   VAEMetric
metric: "MeanDiffCosineSimMetric"
metric_config:
  # Model layers from which representations are extracted.
  layers: [1, 2, 3]
  # Path to the feature list file (only used for SAEs).
  feature_list_filepath: null

# Loss function used for optimization.
# Accepted values:
#   FinalTokenObfLoss, MeanOverTokensObfLoss, MaxOverTokensObfLoss,
#   MaxOverTokensAndLayersObfLoss, BehaviorLoss, ProbePredictLoss
loss: "FinalTokenObfLoss"

# Data module configuration.
# Dataset class for the optimization targets.
# Accepted values for obfus_data_module:
#   JailbreakObfusDataModule, SQLObfusDataModule,
#   SingleHarmfulDataModule, SingleHIDHarmfulObfusDataModule,
#   SimpleSingleHIDHarmfulObfusDataModule, MultiHIDHarmfulObfusDataModule,
#   JailbreakLongRepObfusDataModule, JailbreakLongBehObfusDataModule,
#   SingleLieDataModule, SingleDishonestDataModule, MultiLieDataModule,
#   AlpacaDataModule, SingleLookDataModule, SingleSQLDataModule,
#   JailbreakCircuitBreakersObfusDataModule, LongInputHarmfulObfusDataModule
obfus_data_module: "JailbreakObfusDataModule"
# Path to the optimization dataset.
obfus_data_path: "/path/to/data"

# Dataset class for the concept examples.
# Accepted values for concept_data_module:
#   HarmfulConceptDataModule, TruthfulConceptDataModule,
#   SQLConceptDataModule, SingleHIDHarmfulConceptDataModule,
#   SimpleSingleHIDHarmfulConceptDataModule,
#   MultiHIDHarmfulConceptDataModule, HIDHarmfulConceptDataModule,
#   SingleHGDHarmfulConceptDataModule, HGDHarmfulConceptDataModule,
#   OrigHarmfulConceptDataModule, LongInputHarmfulConceptDataModule
concept_data_module: "HarmfulConceptDataModule"
# Path to the concept dataset.
concept_data_path: "/path/to/concept/data"

# Training hyperparameters.

# Weight between the behavior loss and the representation loss.
lambda_: 0.1
# Batch size used for training.
batch_size: 1
# Batch size used for evaluation.
eval_batch_size: 4
# Number of training epochs.
epochs: 1
# Random seed, for reproducibility.
seed: 42
# Number of adversarial training epochs.
attack_defense_epochs: 1
# Maximum generation length.
gen_len: 100
# Set to true to reinitialize the tunable parameters.
reinit_tunable_params: false
# Set to true to train only the metric (no optimization).
train_metric_only: false

# Logging settings.

# Logger to use; the other available option is DummyLogger.
logger: "WAndBLogger"
# Name for the log files.
log_file: "experiment_name"
# W&B username.
username: "wandb_username"
# Experiment tag used for logging.
tag: "experiment_tag"

# Paths for saving and loading model states and analysis artifacts.
# Every entry is null in this template.

# Where the optimized parameters are saved.
save_tunable_params_path: null
# Where parameters are loaded from.
load_tunable_params_path: null
# Direct path to the parameters.
load_tunable_params_path_direct: null
# Where trained probes are saved.
save_probe_path: null
# Where probes are loaded from.
load_probe_path: null
# Direct path to a probe.
load_probe_path_direct: null
# Where PCA results are saved.
save_pca_path: null
# Where PCA is loaded from.
load_pca_path: null
