#!/bin/bash

# Evaluates erase-and-check on adversarial prompts generated by the
# AutoDAN attack, which is described in the following paper:
# AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models,
# Liu et al., 2023.
# https://arxiv.org/abs/2310.04451
#
# main.py is invoked by relative path, so run this script from the
# directory that contains main.py.

# Llama-2
# Empirical evaluation in suffix mode on AutoDAN prompts, with Llama-2
# selected via --llm_name. The arguments are collected in an array so
# each flag/value pair sits on its own line without backslash continuations.
llama2_args=(
    --num_prompts 200
    --eval_type empirical
    --mode suffix
    --max_erase 10
    --attack autodan
    --llm_name 'Llama-2'
    --results_dir 'results/AutoDAN-HGA/Llama-2'
)
python main.py "${llama2_args[@]}"

# DistilBERT
# Same empirical, suffix-mode evaluation on AutoDAN prompts, but run with
# --use_classifier and the weights file given by --model_wt_path instead
# of an --llm_name. The arguments are collected in an array so each option
# sits on its own line without backslash continuations.
distilbert_args=(
    --num_prompts 200
    --eval_type empirical
    --mode suffix
    --max_erase 10
    --attack autodan
    --use_classifier
    --model_wt_path 'models/distilbert_suffix.pt'
    --results_dir 'results/AutoDAN-HGA/DistilBERT'
)
python main.py "${distilbert_args[@]}"

