python -m h4rm3l.cmd cluster_programs --input-file data/synthesized_programs/Meta-Llama-3-8B-Instruct.syn_progs.bandit_self_score.mixed.csv --output-dir results/prog-synthesis-clusters/Meta-Llama-3-8B-Instruct.syn_progs.bandit_self_score.mixed --cluster-model mchochlov/codebert-base-cd-ft --program-name-column program --asr-column syn_score
Number of attacks of length 3 = 14
Number of attacks of length 4 = 13
Number of attacks of length 5 = 2
Number of attacks with primitive CipherDecorator = 12
Number of attacks with primitive RefusalSuppressionDecorator = 27
Number of attacks with primitive VillainDecorator = 9
Number of attacks with primitive DialogStyleDecorator = 3
Number of attacks with primitive WordMixInDecorator = 7
Number of attacks with primitive AIMDecorator = 6
Number of attacks with primitive HexStringMixInDecorator = 3
Number of attacks with primitive RoleplayingDecorator = 1
Number of attacks with primitive PersuasiveDecorator = 6
Number of attacks with primitive AnswerStyleDecorator = 2
Number of attacks with primitive MilitaryWordsMixInDecorator = 4
Number of attacks with primitive ColorMixInDecorator = 11
Number of attacks with primitive DANDecorator = 2
Number of attacks with primitive FewShotDecorator = 1
Number of attacks with primitive ResearcherDecorator = 3
Number of attacks with primitive Base64Decorator = 2
Number of attacks with primitive WikipediaDecorator = 1
Number of attacks with primitive JekyllHydeDialogStyleDecorator = 3
Number of attacks with primitive SynonymDecorator = 1
python -m h4rm3l.cmd cluster_programs --input-file data/synthesized_programs/claude-3-sonnet-20240229.syn_progs.bandit_self_score.mixed.csv --output-dir results/prog-synthesis-clusters/claude-3-sonnet-20240229.syn_progs.bandit_self_score.mixed --cluster-model mchochlov/codebert-base-cd-ft --program-name-column program --asr-column syn_score
Number of attacks of length 3 = 26
Number of attacks of length 4 = 37
Number of attacks with primitive AIMDecorator = 1
Number of attacks with primitive CipherDecorator = 64
Number of attacks with primitive RefusalSuppressionDecorator = 57
Number of attacks with primitive StyleInjectionShortDecorator = 3
Number of attacks with primitive MilitaryWordsMixInDecorator = 22
Number of attacks with primitive TranslateDecorator = 3
Number of attacks with primitive ResearcherDecorator = 10
Number of attacks with primitive SynonymDecorator = 2
Number of attacks with primitive JekyllHydeDialogStyleDecorator = 7
Number of attacks with primitive HexStringMixInDecorator = 12
Number of attacks with primitive DialogStyleDecorator = 11
Number of attacks with primitive QuestionIdentificationDecorator = 2
Number of attacks with primitive UTADecorator = 4
Number of attacks with primitive StyleInjectionJSONDecorator = 8
Number of attacks with primitive ColorMixInDecorator = 13
Number of attacks with primitive RoleplayingDecorator = 3
Number of attacks with primitive WikipediaDecorator = 1
Number of attacks with primitive CharCorrupt = 2
Number of attacks with primitive AffirmativePrefixInjectionDecorator = 1
python -m h4rm3l.cmd cluster_programs --input-file data/synthesized_programs/claude-3-haiku-20240307.syn_progs.bandit_self_score.mixed.csv --output-dir results/prog-synthesis-clusters/claude-3-haiku-20240307.syn_progs.bandit_self_score.mixed --cluster-model mchochlov/codebert-base-cd-ft --program-name-column program --asr-column syn_score
Number of attacks of length 3 = 84
Number of attacks of length 4 = 34
Number of attacks of length 2 = 3
Number of attacks of length 5 = 1
Number of attacks with primitive HexStringMixInDecorator = 18
Number of attacks with primitive VillainDecorator = 25
Number of attacks with primitive CipherDecorator = 77
Number of attacks with primitive MilitaryWordsMixInDecorator = 22
Number of attacks with primitive StyleInjectionJSONDecorator = 49
Number of attacks with primitive DialogStyleDecorator = 25
Number of attacks with primitive ColorMixInDecorator = 28
Number of attacks with primitive Base64Decorator = 5
Number of attacks with primitive CharCorrupt = 3
Number of attacks with primitive WordMixInDecorator = 10
Number of attacks with primitive RefusalSuppressionDecorator = 65
Number of attacks with primitive StyleInjectionShortDecorator = 3
Number of attacks with primitive PersuasiveDecorator = 5
Number of attacks with primitive JekyllHydeDialogStyleDecorator = 1
Number of attacks with primitive AIMDecorator = 27
Number of attacks with primitive AffirmativePrefixInjectionDecorator = 8
Number of attacks with primitive AnswerStyleDecorator = 1
Number of attacks with primitive SynonymDecorator = 2
Number of attacks with primitive QuestionIdentificationDecorator = 2
Number of attacks with primitive ChainofThoughtDecorator = 4
Number of attacks with primitive ResearcherDecorator = 6
Number of attacks with primitive UTADecorator = 3
Number of attacks with primitive CharDropout = 2
Number of attacks with primitive TranslateBackDecorator = 2
Number of attacks with primitive TranslateDecorator = 1
Number of attacks with primitive DistractorDecorator = 1
Number of attacks with primitive WikipediaDecorator = 2
Number of attacks with primitive PAPDecorator = 2
python -m h4rm3l.cmd cluster_programs --input-file data/synthesized_programs/Meta-Llama-3-70B-Instruct.syn_progs.bandit_self_score.mixed.csv --output-dir results/prog-synthesis-clusters/Meta-Llama-3-70B-Instruct.syn_progs.bandit_self_score.mixed --cluster-model mchochlov/codebert-base-cd-ft --program-name-column program --asr-column syn_score
Number of attacks of length 3 = 82
Number of attacks of length 4 = 10
Number of attacks with primitive Base64Decorator = 89
Number of attacks with primitive HexStringMixInDecorator = 9
Number of attacks with primitive VillainDecorator = 20
Number of attacks with primitive MilitaryWordsMixInDecorator = 14
Number of attacks with primitive DialogStyleDecorator = 58
Number of attacks with primitive RefusalSuppressionDecorator = 16
Number of attacks with primitive WordMixInDecorator = 19
Number of attacks with primitive ColorMixInDecorator = 9
Number of attacks with primitive RoleplayingDecorator = 10
Number of attacks with primitive QuestionIdentificationDecorator = 4
Number of attacks with primitive StyleInjectionShortDecorator = 2
Number of attacks with primitive UTADecorator = 1
Number of attacks with primitive PersuasiveDecorator = 4
Number of attacks with primitive JekyllHydeDialogStyleDecorator = 14
Number of attacks with primitive TranslateDecorator = 1
Number of attacks with primitive AffirmativePrefixInjectionDecorator = 5
Number of attacks with primitive TranslateBackDecorator = 2
Number of attacks with primitive ResearcherDecorator = 1
Number of attacks with primitive CharCorrupt = 3
Number of attacks with primitive PAPDecorator = 1
Number of attacks with primitive ChainofThoughtDecorator = 1
Number of attacks with primitive CharDropout = 1
Number of attacks with primitive StyleInjectionJSONDecorator = 1
Number of attacks with primitive AnswerStyleDecorator = 1
python -m h4rm3l.cmd cluster_programs --input-file data/synthesized_programs/gpt-3.5-turbo.syn_progs.bandit_self_score.mixed.csv --output-dir results/prog-synthesis-clusters/gpt-3.5-turbo.syn_progs.bandit_self_score.mixed --cluster-model mchochlov/codebert-base-cd-ft --program-name-column program --asr-column syn_score
Number of attacks of length 2 = 210
Number of attacks of length 3 = 198
Number of attacks of length 4 = 6
Number of attacks with primitive ColorMixInDecorator = 78
Number of attacks with primitive JekyllHydeDialogStyleDecorator = 45
Number of attacks with primitive RefusalSuppressionDecorator = 67
Number of attacks with primitive AIMDecorator = 25
Number of attacks with primitive StyleInjectionJSONDecorator = 24
Number of attacks with primitive CipherDecorator = 147
Number of attacks with primitive DistractorDecorator = 6
Number of attacks with primitive DialogStyleDecorator = 121
Number of attacks with primitive AffirmativePrefixInjectionDecorator = 63
Number of attacks with primitive ChainofThoughtDecorator = 14
Number of attacks with primitive QuestionIdentificationDecorator = 17
Number of attacks with primitive VillainDecorator = 76
Number of attacks with primitive PersuasiveDecorator = 26
Number of attacks with primitive HexStringMixInDecorator = 73
Number of attacks with primitive MilitaryWordsMixInDecorator = 82
Number of attacks with primitive WordMixInDecorator = 36
Number of attacks with primitive WikipediaDecorator = 6
Number of attacks with primitive ResearcherDecorator = 22
Number of attacks with primitive AnswerStyleDecorator = 12
Number of attacks with primitive CharCorrupt = 20
Number of attacks with primitive PayloadSplittingDecorator = 18
Number of attacks with primitive RoleplayingDecorator = 13
Number of attacks with primitive CharDropout = 10
Number of attacks with primitive TranslateDecorator = 3
Number of attacks with primitive PAPDecorator = 3
Number of attacks with primitive TranslateBackDecorator = 9
Number of attacks with primitive SynonymDecorator = 10
Number of attacks with primitive UTADecorator = 5
Number of attacks with primitive FewShotDecorator = 3
Number of attacks with primitive DANDecorator = 1
Number of attacks with primitive Base64Decorator = 2
Number of attacks with primitive StyleInjectionShortDecorator = 1
python -m h4rm3l.cmd cluster_programs --input-file data/synthesized_programs/gpt-4o-2024-05-13.syn_progs.bandit_self_score.mixed.csv --output-dir results/prog-synthesis-clusters/gpt-4o-2024-05-13.syn_progs.bandit_self_score.mixed --cluster-model mchochlov/codebert-base-cd-ft --program-name-column program --asr-column syn_score
Number of attacks of length 3 = 162
Number of attacks of length 4 = 31
Number of attacks of length 2 = 5
Number of attacks with primitive ColorMixInDecorator = 31
Number of attacks with primitive RefusalSuppressionDecorator = 54
Number of attacks with primitive DialogStyleDecorator = 83
Number of attacks with primitive Base64Decorator = 91
Number of attacks with primitive CharCorrupt = 23
Number of attacks with primitive JekyllHydeDialogStyleDecorator = 34
Number of attacks with primitive WordMixInDecorator = 43
Number of attacks with primitive PAPDecorator = 4
Number of attacks with primitive CipherDecorator = 67
Number of attacks with primitive StyleInjectionJSONDecorator = 16
Number of attacks with primitive HexStringMixInDecorator = 19
Number of attacks with primitive MilitaryWordsMixInDecorator = 40
Number of attacks with primitive AffirmativePrefixInjectionDecorator = 15
Number of attacks with primitive PersuasiveDecorator = 12
Number of attacks with primitive SynonymDecorator = 8
Number of attacks with primitive VillainDecorator = 30
Number of attacks with primitive QuestionIdentificationDecorator = 7
Number of attacks with primitive RoleplayingDecorator = 13
Number of attacks with primitive ChainofThoughtDecorator = 3
Number of attacks with primitive ResearcherDecorator = 7
Number of attacks with primitive PayloadSplittingDecorator = 1
Number of attacks with primitive TranslateDecorator = 8
Number of attacks with primitive CharDropout = 2
Number of attacks with primitive DANDecorator = 1
Number of attacks with primitive TranslateBackDecorator = 3
Number of attacks with primitive UTADecorator = 2
Number of attacks with primitive DistractorDecorator = 1
Number of attacks with primitive WikipediaDecorator = 1
Number of attacks with primitive StyleInjectionShortDecorator = 1
python -m h4rm3l.cmd cluster_programs --input-file data/synthesized_programs/gpt-4o-2024-05-13.syn_progs.bandit_self_score.lle.csv --output-dir results/prog-synthesis-clusters/gpt-4o-2024-05-13.syn_progs.bandit_self_score.lle --cluster-model mchochlov/codebert-base-cd-ft --program-name-column program --asr-column syn_score
