Model,Dataset,Method,FNR,actual_fpr,threshold,tau
Llama-4-Scout,JailbreakBench,avg_pairwise_bertscore,0.6,0.05,0.945155918598175,
Llama-4-Scout,JailbreakBench,embedding_variance,0.6666666666666667,0.05,0.06947234272956801,
Llama-4-Scout,JailbreakBench,levenshtein_variance,0.8833333333333333,0.05,151161.21000000002,
Llama-4-Scout,JailbreakBench,semantic_entropy (τ=0.2),0.85,0.0,0.970950594454668,0.2
Llama-4-Scout,JailbreakBench,semantic_entropy (best τ=0.3),0.7333333333333334,0.016666666666666666,0.7219280948873621,0.3
Qwen-2.5-7B,JailbreakBench,avg_pairwise_bertscore,0.8666666666666667,0.05,0.9140769839286801,
Qwen-2.5-7B,JailbreakBench,embedding_variance,0.9666666666666667,0.05,0.103387176990509,
Qwen-2.5-7B,JailbreakBench,levenshtein_variance,0.7666666666666666,0.05,191554.81,
Qwen-2.5-7B,JailbreakBench,semantic_entropy (τ=0.2),0.9833333333333333,0.05,1.370950594454668,0.2
Llama-4-Scout,HarmBench,avg_pairwise_bertscore,0.7407407407407407,0.04938271604938271,0.9517220258712769,
Llama-4-Scout,HarmBench,embedding_variance,0.6049382716049383,0.04938271604938271,0.042414791882038116,
Llama-4-Scout,HarmBench,levenshtein_variance,0.9259259259259259,0.04938271604938271,344198.49,
Llama-4-Scout,HarmBench,semantic_entropy (τ=0.2),0.7654320987654322,0.0,0.7219280948873623,0.2
Llama-4-Scout,HarmBench,semantic_entropy (best τ=0.1),0.654320987654321,0.037037037037037035,0.9709505944546686,0.1
Qwen-2.5-7B,HarmBench,avg_pairwise_bertscore,0.8518518518518519,0.04938271604938271,0.9397791028022766,
Qwen-2.5-7B,HarmBench,embedding_variance,0.654320987654321,0.04938271604938271,0.049956291913986206,
Qwen-2.5-7B,HarmBench,levenshtein_variance,0.8148148148148149,0.04938271604938271,142706.09,
Qwen-2.5-7B,HarmBench,semantic_entropy (τ=0.2),0.8888888888888888,0.0,0.7219280948873623,0.2
Qwen-2.5-7B,HarmBench,semantic_entropy (best τ=0.1),0.6296296296296297,0.037037037037037035,1.3709505944546687,0.1
