{
  "hypothesis": "H2",
  "dataset": "HarmBench",
  "models": {
    "llama-4-scout-17b-16e-instruct": {
      "model_name": "llama-4-scout-17b-16e-instruct",
      "metrics": {
        "semantic_entropy": {
          "tau_0.1": {
            "metric_name": "SE_tau_0.1",
            "distribution_analysis": {
              "metric_name": "SE_tau_0.1",
              "n_samples": 162,
              "n_unique_scores": 6,
              "unique_score_ratio": 0.037037037037037035,
              "n_zeros": "107",
              "zero_proportion": 0.6604938271604939,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                1.9219280948873623
              ],
              "most_common_values": [
                [
                  0.0,
                  107,
                  0.6604938271604939
                ],
                [
                  0.7219280948873623,
                  24,
                  0.14814814814814814
                ],
                [
                  0.9709505944546686,
                  13,
                  0.08024691358024691
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.5849321219966418,
                "benign_mean": 0.13400075095216038,
                "harmful_std": 0.6486399012997586,
                "benign_std": 0.2962057339948386
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.037037037037037035,
                  "zero_proportion": 0.6604938271604939,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 6/162 unique scores"
              ]
            },
            "auroc": 0.6912818167962201,
            "delong_ci_valid": false,
            "delong_ci_error": "Distribution too degenerate for DeLong method",
            "bootstrap_ci": [
              NaN,
              NaN
            ],
            "bootstrap_ci_valid": true,
            "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
          },
          "tau_0.2": {
            "metric_name": "SE_tau_0.2",
            "distribution_analysis": {
              "metric_name": "SE_tau_0.2",
              "n_samples": 162,
              "n_unique_scores": 4,
              "unique_score_ratio": 0.024691358024691357,
              "n_zeros": "143",
              "zero_proportion": 0.8827160493827161,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                1.3709505944546687
              ],
              "most_common_values": [
                [
                  0.0,
                  143,
                  0.8827160493827161
                ],
                [
                  0.9709505944546686,
                  9,
                  0.05555555555555555
                ],
                [
                  0.7219280948873623,
                  8,
                  0.04938271604938271
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.21303557158148465,
                "benign_mean": 0.0,
                "harmful_std": 0.396544354543354,
                "benign_std": 0.0
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.024691358024691357,
                  "zero_proportion": 0.8827160493827161,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 4/162 unique scores",
                "HIGH ZERO CONCENTRATION: 88.3% of scores are exactly zero"
              ]
            },
            "auroc": 0.6172839506172839,
            "delong_ci_valid": false,
            "delong_ci_error": "Distribution too degenerate for DeLong method",
            "bootstrap_ci": [
              NaN,
              NaN
            ],
            "bootstrap_ci_valid": true,
            "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
          },
          "tau_0.3": {
            "metric_name": "SE_tau_0.3",
            "distribution_analysis": {
              "metric_name": "SE_tau_0.3",
              "n_samples": 162,
              "n_unique_scores": 4,
              "unique_score_ratio": 0.024691358024691357,
              "n_zeros": "148",
              "zero_proportion": 0.9135802469135802,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                1.3709505944546687
              ],
              "most_common_values": [
                [
                  0.0,
                  148,
                  0.9135802469135802
                ],
                [
                  0.9709505944546686,
                  9,
                  0.05555555555555555
                ],
                [
                  0.7219280948873623,
                  4,
                  0.024691358024691357
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.16045948548266833,
                "benign_mean": 0.0,
                "harmful_std": 0.3576915834067607,
                "benign_std": 0.0
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.024691358024691357,
                  "zero_proportion": 0.9135802469135802,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 4/162 unique scores",
                "HIGH ZERO CONCENTRATION: 91.4% of scores are exactly zero"
              ]
            },
            "auroc": 0.5864197530864197,
            "delong_ci_valid": false,
            "delong_ci_error": "Distribution too degenerate for DeLong method",
            "bootstrap_ci": [
              NaN,
              NaN
            ],
            "bootstrap_ci_valid": true,
            "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
          },
          "tau_0.4": {
            "metric_name": "SE_tau_0.4",
            "distribution_analysis": {
              "metric_name": "SE_tau_0.4",
              "n_samples": 162,
              "n_unique_scores": 3,
              "unique_score_ratio": 0.018518518518518517,
              "n_zeros": "151",
              "zero_proportion": 0.9320987654320988,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                0.9709505944546686
              ],
              "most_common_values": [
                [
                  0.0,
                  151,
                  0.9320987654320988
                ],
                [
                  0.9709505944546686,
                  7,
                  0.043209876543209874
                ],
                [
                  0.7219280948873623,
                  4,
                  0.024691358024691357
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.11956008074977938,
                "benign_mean": 0.0,
                "harmful_std": 0.30481875710312706,
                "benign_std": 0.0
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.018518518518518517,
                  "zero_proportion": 0.9320987654320988,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 3/162 unique scores",
                "HIGH ZERO CONCENTRATION: 93.2% of scores are exactly zero"
              ]
            },
            "auroc": 0.5679012345679012,
            "delong_ci_valid": false,
            "delong_ci_error": "Distribution too degenerate for DeLong method",
            "bootstrap_ci": [
              NaN,
              NaN
            ],
            "bootstrap_ci_valid": true,
            "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
          }
        },
        "BERTScore": {
          "metric_name": "BERTScore",
          "distribution_analysis": {
            "metric_name": "BERTScore",
            "n_samples": 162,
            "n_unique_scores": 162,
            "unique_score_ratio": 1.0,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              0.8046368360519409,
              0.9999998807907104
            ],
            "most_common_values": [
              [
                0.9045571088790894,
                1,
                0.006172839506172839
              ],
              [
                0.9440921545028687,
                1,
                0.006172839506172839
              ],
              [
                0.9056106805801392,
                1,
                0.006172839506172839
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.9162180541474142,
              "benign_mean": 0.9112695094979839,
              "harmful_std": 0.04465958143496275,
              "benign_std": 0.021740266566949707
            },
            "is_degenerate": {
              "severe": "False",
              "moderate": "False",
              "mild": "False",
              "details": {
                "unique_score_ratio": 1.0,
                "zero_proportion": 0.0,
                "effective_discrimination": "True"
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.5057155921353451,
          "delong_ci": [
            0.41154585031946583,
            0.5998853339512245
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            NaN,
            NaN
          ],
          "bootstrap_ci_valid": true
        },
        "EmbeddingVariance": {
          "metric_name": "EmbeddingVariance",
          "distribution_analysis": {
            "metric_name": "EmbeddingVariance",
            "n_samples": 162,
            "n_unique_scores": 162,
            "unique_score_ratio": 1.0,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              2.1545902963924753e-16,
              0.18624335527420044
            ],
            "most_common_values": [
              [
                0.04219241812825203,
                1,
                0.006172839506172839
              ],
              [
                0.0086062615737319,
                1,
                0.006172839506172839
              ],
              [
                0.009998868219554424,
                1,
                0.006172839506172839
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.04737661935009614,
              "benign_mean": 0.02104124892215578,
              "harmful_std": 0.043628091709274765,
              "benign_std": 0.010870902797012704
            },
            "is_degenerate": {
              "severe": "False",
              "moderate": "False",
              "mild": "False",
              "details": {
                "unique_score_ratio": 1.0,
                "zero_proportion": 0.0,
                "effective_discrimination": "True"
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.6837372351775645,
          "delong_ci": [
            0.5985452051316781,
            0.7689292652234506
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            NaN,
            NaN
          ],
          "bootstrap_ci_valid": true
        },
        "LevenshteinVariance": {
          "metric_name": "LevenshteinVariance",
          "distribution_analysis": {
            "metric_name": "LevenshteinVariance",
            "n_samples": 162,
            "n_unique_scores": 162,
            "unique_score_ratio": 1.0,
            "n_zeros": "1",
            "zero_proportion": 0.006172839506172839,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              0.0,
              3493249.44
            ],
            "most_common_values": [
              [
                8481.490000000002,
                1,
                0.006172839506172839
              ],
              [
                127144.49000000002,
                1,
                0.006172839506172839
              ],
              [
                210873.03999999998,
                1,
                0.006172839506172839
              ]
            ],
            "class_separation": {
              "harmful_mean": 131200.03691358023,
              "benign_mean": 76961.11086419753,
              "harmful_std": 422506.7934899724,
              "benign_std": 103891.5811843312
            },
            "is_degenerate": {
              "severe": "False",
              "moderate": "False",
              "mild": "False",
              "details": {
                "unique_score_ratio": 1.0,
                "zero_proportion": 0.006172839506172839,
                "effective_discrimination": "True"
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.3968907178783722,
          "delong_ci": [
            0.3072126391170881,
            0.4865687966396564
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            NaN,
            NaN
          ],
          "bootstrap_ci_valid": true
        }
      },
      "paired_comparisons": {
        "SE_vs_BERTScore": {
          "metric1_name": "SE_tau_0.2",
          "metric2_name": "BERTScore",
          "distribution_analysis": {
            "metric1": {
              "metric_name": "SE_tau_0.2",
              "n_samples": 162,
              "n_unique_scores": 4,
              "unique_score_ratio": 0.024691358024691357,
              "n_zeros": "143",
              "zero_proportion": 0.8827160493827161,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                1.3709505944546687
              ],
              "most_common_values": [
                [
                  0.0,
                  143,
                  0.8827160493827161
                ],
                [
                  0.9709505944546686,
                  9,
                  0.05555555555555555
                ],
                [
                  0.7219280948873623,
                  8,
                  0.04938271604938271
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.21303557158148465,
                "benign_mean": 0.0,
                "harmful_std": 0.396544354543354,
                "benign_std": 0.0
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.024691358024691357,
                  "zero_proportion": 0.8827160493827161,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 4/162 unique scores",
                "HIGH ZERO CONCENTRATION: 88.3% of scores are exactly zero"
              ]
            },
            "metric2": {
              "metric_name": "BERTScore",
              "n_samples": 162,
              "n_unique_scores": 162,
              "unique_score_ratio": 1.0,
              "n_zeros": "0",
              "zero_proportion": 0.0,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.8046368360519409,
                0.9999998807907104
              ],
              "most_common_values": [
                [
                  0.9045571088790894,
                  1,
                  0.006172839506172839
                ],
                [
                  0.9440921545028687,
                  1,
                  0.006172839506172839
                ],
                [
                  0.9056106805801392,
                  1,
                  0.006172839506172839
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.9162180541474142,
                "benign_mean": 0.9112695094979839,
                "harmful_std": 0.04465958143496275,
                "benign_std": 0.021740266566949707
              },
              "is_degenerate": {
                "severe": "False",
                "moderate": "False",
                "mild": "False",
                "details": {
                  "unique_score_ratio": 1.0,
                  "zero_proportion": 0.0,
                  "effective_discrimination": "True"
                }
              },
              "delong_valid": true,
              "statistical_warnings": []
            }
          },
          "auroc1": 0.6172839506172839,
          "auroc2": 0.5057155921353451,
          "auroc_difference": -0.1115683584819388,
          "delong_test": {
            "valid": false,
            "reasons": [
              "SE_tau_0.2 distribution too degenerate"
            ]
          }
        },
        "SE_vs_EmbeddingVariance": {
          "metric1_name": "SE_tau_0.2",
          "metric2_name": "EmbeddingVariance",
          "distribution_analysis": {
            "metric1": {
              "metric_name": "SE_tau_0.2",
              "n_samples": 162,
              "n_unique_scores": 4,
              "unique_score_ratio": 0.024691358024691357,
              "n_zeros": "143",
              "zero_proportion": 0.8827160493827161,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                1.3709505944546687
              ],
              "most_common_values": [
                [
                  0.0,
                  143,
                  0.8827160493827161
                ],
                [
                  0.9709505944546686,
                  9,
                  0.05555555555555555
                ],
                [
                  0.7219280948873623,
                  8,
                  0.04938271604938271
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.21303557158148465,
                "benign_mean": 0.0,
                "harmful_std": 0.396544354543354,
                "benign_std": 0.0
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.024691358024691357,
                  "zero_proportion": 0.8827160493827161,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 4/162 unique scores",
                "HIGH ZERO CONCENTRATION: 88.3% of scores are exactly zero"
              ]
            },
            "metric2": {
              "metric_name": "EmbeddingVariance",
              "n_samples": 162,
              "n_unique_scores": 162,
              "unique_score_ratio": 1.0,
              "n_zeros": "0",
              "zero_proportion": 0.0,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                2.1545902963924753e-16,
                0.18624335527420044
              ],
              "most_common_values": [
                [
                  0.04219241812825203,
                  1,
                  0.006172839506172839
                ],
                [
                  0.0086062615737319,
                  1,
                  0.006172839506172839
                ],
                [
                  0.009998868219554424,
                  1,
                  0.006172839506172839
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.04737661935009614,
                "benign_mean": 0.02104124892215578,
                "harmful_std": 0.043628091709274765,
                "benign_std": 0.010870902797012704
              },
              "is_degenerate": {
                "severe": "False",
                "moderate": "False",
                "mild": "False",
                "details": {
                  "unique_score_ratio": 1.0,
                  "zero_proportion": 0.0,
                  "effective_discrimination": "True"
                }
              },
              "delong_valid": true,
              "statistical_warnings": []
            }
          },
          "auroc1": 0.6172839506172839,
          "auroc2": 0.6837372351775645,
          "auroc_difference": 0.06645328456028055,
          "delong_test": {
            "valid": false,
            "reasons": [
              "SE_tau_0.2 distribution too degenerate"
            ]
          }
        },
        "SE_vs_LevenshteinVariance": {
          "metric1_name": "SE_tau_0.2",
          "metric2_name": "LevenshteinVariance",
          "distribution_analysis": {
            "metric1": {
              "metric_name": "SE_tau_0.2",
              "n_samples": 162,
              "n_unique_scores": 4,
              "unique_score_ratio": 0.024691358024691357,
              "n_zeros": "143",
              "zero_proportion": 0.8827160493827161,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                1.3709505944546687
              ],
              "most_common_values": [
                [
                  0.0,
                  143,
                  0.8827160493827161
                ],
                [
                  0.9709505944546686,
                  9,
                  0.05555555555555555
                ],
                [
                  0.7219280948873623,
                  8,
                  0.04938271604938271
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.21303557158148465,
                "benign_mean": 0.0,
                "harmful_std": 0.396544354543354,
                "benign_std": 0.0
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.024691358024691357,
                  "zero_proportion": 0.8827160493827161,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 4/162 unique scores",
                "HIGH ZERO CONCENTRATION: 88.3% of scores are exactly zero"
              ]
            },
            "metric2": {
              "metric_name": "LevenshteinVariance",
              "n_samples": 162,
              "n_unique_scores": 162,
              "unique_score_ratio": 1.0,
              "n_zeros": "1",
              "zero_proportion": 0.006172839506172839,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                3493249.44
              ],
              "most_common_values": [
                [
                  8481.490000000002,
                  1,
                  0.006172839506172839
                ],
                [
                  127144.49000000002,
                  1,
                  0.006172839506172839
                ],
                [
                  210873.03999999998,
                  1,
                  0.006172839506172839
                ]
              ],
              "class_separation": {
                "harmful_mean": 131200.03691358023,
                "benign_mean": 76961.11086419753,
                "harmful_std": 422506.7934899724,
                "benign_std": 103891.5811843312
              },
              "is_degenerate": {
                "severe": "False",
                "moderate": "False",
                "mild": "False",
                "details": {
                  "unique_score_ratio": 1.0,
                  "zero_proportion": 0.006172839506172839,
                  "effective_discrimination": "True"
                }
              },
              "delong_valid": true,
              "statistical_warnings": []
            }
          },
          "auroc1": 0.6172839506172839,
          "auroc2": 0.3968907178783722,
          "auroc_difference": -0.2203932327389117,
          "delong_test": {
            "valid": false,
            "reasons": [
              "SE_tau_0.2 distribution too degenerate"
            ]
          }
        }
      }
    },
    "qwen2.5-7b-instruct": {
      "model_name": "qwen2.5-7b-instruct",
      "metrics": {
        "semantic_entropy": {
          "tau_0.1": {
            "metric_name": "SE_tau_0.1",
            "distribution_analysis": {
              "metric_name": "SE_tau_0.1",
              "n_samples": 162,
              "n_unique_scores": 7,
              "unique_score_ratio": 0.043209876543209874,
              "n_zeros": "105",
              "zero_proportion": 0.6481481481481481,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                2.321928094887362
              ],
              "most_common_values": [
                [
                  0.0,
                  105,
                  0.6481481481481481
                ],
                [
                  1.3709505944546687,
                  14,
                  0.08641975308641975
                ],
                [
                  0.7219280948873623,
                  13,
                  0.08024691358024691
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.768420677252628,
                "benign_mean": 0.13744924741110418,
                "harmful_std": 0.7558155206794037,
                "benign_std": 0.35807279864532093
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.043209876543209874,
                  "zero_proportion": 0.6481481481481481,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 7/162 unique scores"
              ]
            },
            "auroc": 0.7325864959609816,
            "delong_ci_valid": false,
            "delong_ci_error": "Distribution too degenerate for DeLong method",
            "bootstrap_ci": [
              NaN,
              NaN
            ],
            "bootstrap_ci_valid": true,
            "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
          },
          "tau_0.2": {
            "metric_name": "SE_tau_0.2",
            "distribution_analysis": {
              "metric_name": "SE_tau_0.2",
              "n_samples": 162,
              "n_unique_scores": 3,
              "unique_score_ratio": 0.018518518518518517,
              "n_zeros": "153",
              "zero_proportion": 0.9444444444444444,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                0.9709505944546686
              ],
              "most_common_values": [
                [
                  0.0,
                  153,
                  0.9444444444444444
                ],
                [
                  0.7219280948873623,
                  7,
                  0.043209876543209874
                ],
                [
                  0.9709505944546686,
                  2,
                  0.012345679012345678
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.08636293645828239,
                "benign_mean": 0.0,
                "harmful_std": 0.24669690086900586,
                "benign_std": 0.0
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.018518518518518517,
                  "zero_proportion": 0.9444444444444444,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 3/162 unique scores",
                "HIGH ZERO CONCENTRATION: 94.4% of scores are exactly zero"
              ]
            },
            "auroc": 0.5555555555555556,
            "delong_ci_valid": false,
            "delong_ci_error": "Distribution too degenerate for DeLong method",
            "bootstrap_ci": [
              NaN,
              NaN
            ],
            "bootstrap_ci_valid": true,
            "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
          },
          "tau_0.3": {
            "metric_name": "SE_tau_0.3",
            "distribution_analysis": {
              "metric_name": "SE_tau_0.3",
              "n_samples": 162,
              "n_unique_scores": 2,
              "unique_score_ratio": 0.012345679012345678,
              "n_zeros": "160",
              "zero_proportion": 0.9876543209876543,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                0.7219280948873623
              ],
              "most_common_values": [
                [
                  0.0,
                  160,
                  0.9876543209876543
                ],
                [
                  0.7219280948873623,
                  2,
                  0.012345679012345678
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.017825385058947218,
                "benign_mean": 0.0,
                "harmful_std": 0.11203080792237322,
                "benign_std": 0.0
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.012345679012345678,
                  "zero_proportion": 0.9876543209876543,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 2/162 unique scores",
                "HIGH ZERO CONCENTRATION: 98.8% of scores are exactly zero"
              ]
            },
            "auroc": 0.5123456790123457,
            "delong_ci_valid": false,
            "delong_ci_error": "Distribution too degenerate for DeLong method",
            "bootstrap_ci": [
              NaN,
              NaN
            ],
            "bootstrap_ci_valid": true,
            "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
          },
          "tau_0.4": {
            "metric_name": "SE_tau_0.4",
            "distribution_analysis": {
              "metric_name": "SE_tau_0.4",
              "n_samples": 162,
              "n_unique_scores": 1,
              "unique_score_ratio": 0.006172839506172839,
              "n_zeros": "162",
              "zero_proportion": 1.0,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                0.0
              ],
              "most_common_values": [
                [
                  0.0,
                  162,
                  1.0
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.0,
                "benign_mean": 0.0,
                "harmful_std": 0.0,
                "benign_std": 0.0
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.006172839506172839,
                  "zero_proportion": 1.0,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 1/162 unique scores",
                "HIGH ZERO CONCENTRATION: 100.0% of scores are exactly zero"
              ]
            },
            "auroc": 0.5,
            "delong_ci_valid": false,
            "delong_ci_error": "Distribution too degenerate for DeLong method",
            "bootstrap_ci": [
              NaN,
              NaN
            ],
            "bootstrap_ci_valid": true,
            "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
          }
        },
        "BERTScore": {
          "metric_name": "BERTScore",
          "distribution_analysis": {
            "metric_name": "BERTScore",
            "n_samples": 162,
            "n_unique_scores": 162,
            "unique_score_ratio": 1.0,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              0.7976115942001343,
              0.9872315526008606
            ],
            "most_common_values": [
              [
                0.9213066101074219,
                1,
                0.006172839506172839
              ],
              [
                0.9030462503433228,
                1,
                0.006172839506172839
              ],
              [
                0.9509181976318359,
                1,
                0.006172839506172839
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.8953098029266169,
              "benign_mean": 0.8966401719752654,
              "harmful_std": 0.03916996932064665,
              "benign_std": 0.021509652539969077
            },
            "is_degenerate": {
              "severe": "False",
              "moderate": "False",
              "mild": "False",
              "details": {
                "unique_score_ratio": 1.0,
                "zero_proportion": 0.0,
                "effective_discrimination": "True"
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.4311842706904435,
          "delong_ci": [
            0.339122449788644,
            0.523246091592243
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            NaN,
            NaN
          ],
          "bootstrap_ci_valid": true
        },
        "EmbeddingVariance": {
          "metric_name": "EmbeddingVariance",
          "distribution_analysis": {
            "metric_name": "EmbeddingVariance",
            "n_samples": 162,
            "n_unique_scores": 162,
            "unique_score_ratio": 1.0,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              0.0005865923594683409,
              0.10136634111404419
            ],
            "most_common_values": [
              [
                0.015436764806509018,
                1,
                0.006172839506172839
              ],
              [
                0.021777819842100143,
                1,
                0.006172839506172839
              ],
              [
                0.024993762373924255,
                1,
                0.006172839506172839
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.04236425640762864,
              "benign_mean": 0.024525531429659436,
              "harmful_std": 0.024492743655985215,
              "benign_std": 0.01199400310953629
            },
            "is_degenerate": {
              "severe": "False",
              "moderate": "False",
              "mild": "False",
              "details": {
                "unique_score_ratio": 1.0,
                "zero_proportion": 0.0,
                "effective_discrimination": "True"
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.7242798353909465,
          "delong_ci": [
            0.6426507738257684,
            0.8059088969561247
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            NaN,
            NaN
          ],
          "bootstrap_ci_valid": true
        },
        "LevenshteinVariance": {
          "metric_name": "LevenshteinVariance",
          "distribution_analysis": {
            "metric_name": "LevenshteinVariance",
            "n_samples": 162,
            "n_unique_scores": 162,
            "unique_score_ratio": 1.0,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              559.4100000000001,
              1053490.44
            ],
            "most_common_values": [
              [
                28689.840000000004,
                1,
                0.006172839506172839
              ],
              [
                137441.00999999998,
                1,
                0.006172839506172839
              ],
              [
                8672.2,
                1,
                0.006172839506172839
              ]
            ],
            "class_separation": {
              "harmful_mean": 119796.96555555554,
              "benign_mean": 48593.51296296296,
              "harmful_std": 197919.90692332902,
              "benign_std": 57776.85486744233
            },
            "is_degenerate": {
              "severe": "False",
              "moderate": "False",
              "mild": "False",
              "details": {
                "unique_score_ratio": 1.0,
                "zero_proportion": 0.0,
                "effective_discrimination": "True"
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.572778539856729,
          "delong_ci": [
            0.48235181988623355,
            0.6632052598272247
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            NaN,
            NaN
          ],
          "bootstrap_ci_valid": true
        }
      },
      "paired_comparisons": {
        "SE_vs_BERTScore": {
          "metric1_name": "SE_tau_0.2",
          "metric2_name": "BERTScore",
          "distribution_analysis": {
            "metric1": {
              "metric_name": "SE_tau_0.2",
              "n_samples": 162,
              "n_unique_scores": 3,
              "unique_score_ratio": 0.018518518518518517,
              "n_zeros": "153",
              "zero_proportion": 0.9444444444444444,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                0.9709505944546686
              ],
              "most_common_values": [
                [
                  0.0,
                  153,
                  0.9444444444444444
                ],
                [
                  0.7219280948873623,
                  7,
                  0.043209876543209874
                ],
                [
                  0.9709505944546686,
                  2,
                  0.012345679012345678
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.08636293645828239,
                "benign_mean": 0.0,
                "harmful_std": 0.24669690086900586,
                "benign_std": 0.0
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.018518518518518517,
                  "zero_proportion": 0.9444444444444444,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 3/162 unique scores",
                "HIGH ZERO CONCENTRATION: 94.4% of scores are exactly zero"
              ]
            },
            "metric2": {
              "metric_name": "BERTScore",
              "n_samples": 162,
              "n_unique_scores": 162,
              "unique_score_ratio": 1.0,
              "n_zeros": "0",
              "zero_proportion": 0.0,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.7976115942001343,
                0.9872315526008606
              ],
              "most_common_values": [
                [
                  0.9213066101074219,
                  1,
                  0.006172839506172839
                ],
                [
                  0.9030462503433228,
                  1,
                  0.006172839506172839
                ],
                [
                  0.9509181976318359,
                  1,
                  0.006172839506172839
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.8953098029266169,
                "benign_mean": 0.8966401719752654,
                "harmful_std": 0.03916996932064665,
                "benign_std": 0.021509652539969077
              },
              "is_degenerate": {
                "severe": "False",
                "moderate": "False",
                "mild": "False",
                "details": {
                  "unique_score_ratio": 1.0,
                  "zero_proportion": 0.0,
                  "effective_discrimination": "True"
                }
              },
              "delong_valid": true,
              "statistical_warnings": []
            }
          },
          "auroc1": 0.5555555555555556,
          "auroc2": 0.4311842706904435,
          "auroc_difference": -0.12437128486511206,
          "delong_test": {
            "valid": false,
            "reasons": [
              "SE_tau_0.2 distribution too degenerate"
            ]
          }
        },
        "SE_vs_EmbeddingVariance": {
          "metric1_name": "SE_tau_0.2",
          "metric2_name": "EmbeddingVariance",
          "distribution_analysis": {
            "metric1": {
              "metric_name": "SE_tau_0.2",
              "n_samples": 162,
              "n_unique_scores": 3,
              "unique_score_ratio": 0.018518518518518517,
              "n_zeros": "153",
              "zero_proportion": 0.9444444444444444,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                0.9709505944546686
              ],
              "most_common_values": [
                [
                  0.0,
                  153,
                  0.9444444444444444
                ],
                [
                  0.7219280948873623,
                  7,
                  0.043209876543209874
                ],
                [
                  0.9709505944546686,
                  2,
                  0.012345679012345678
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.08636293645828239,
                "benign_mean": 0.0,
                "harmful_std": 0.24669690086900586,
                "benign_std": 0.0
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.018518518518518517,
                  "zero_proportion": 0.9444444444444444,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 3/162 unique scores",
                "HIGH ZERO CONCENTRATION: 94.4% of scores are exactly zero"
              ]
            },
            "metric2": {
              "metric_name": "EmbeddingVariance",
              "n_samples": 162,
              "n_unique_scores": 162,
              "unique_score_ratio": 1.0,
              "n_zeros": "0",
              "zero_proportion": 0.0,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0005865923594683409,
                0.10136634111404419
              ],
              "most_common_values": [
                [
                  0.015436764806509018,
                  1,
                  0.006172839506172839
                ],
                [
                  0.021777819842100143,
                  1,
                  0.006172839506172839
                ],
                [
                  0.024993762373924255,
                  1,
                  0.006172839506172839
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.04236425640762864,
                "benign_mean": 0.024525531429659436,
                "harmful_std": 0.024492743655985215,
                "benign_std": 0.01199400310953629
              },
              "is_degenerate": {
                "severe": "False",
                "moderate": "False",
                "mild": "False",
                "details": {
                  "unique_score_ratio": 1.0,
                  "zero_proportion": 0.0,
                  "effective_discrimination": "True"
                }
              },
              "delong_valid": true,
              "statistical_warnings": []
            }
          },
          "auroc1": 0.5555555555555556,
          "auroc2": 0.7242798353909465,
          "auroc_difference": 0.16872427983539096,
          "delong_test": {
            "valid": false,
            "reasons": [
              "SE_tau_0.2 distribution too degenerate"
            ]
          }
        },
        "SE_vs_LevenshteinVariance": {
          "metric1_name": "SE_tau_0.2",
          "metric2_name": "LevenshteinVariance",
          "distribution_analysis": {
            "metric1": {
              "metric_name": "SE_tau_0.2",
              "n_samples": 162,
              "n_unique_scores": 3,
              "unique_score_ratio": 0.018518518518518517,
              "n_zeros": "153",
              "zero_proportion": 0.9444444444444444,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                0.0,
                0.9709505944546686
              ],
              "most_common_values": [
                [
                  0.0,
                  153,
                  0.9444444444444444
                ],
                [
                  0.7219280948873623,
                  7,
                  0.043209876543209874
                ],
                [
                  0.9709505944546686,
                  2,
                  0.012345679012345678
                ]
              ],
              "class_separation": {
                "harmful_mean": 0.08636293645828239,
                "benign_mean": 0.0,
                "harmful_std": 0.24669690086900586,
                "benign_std": 0.0
              },
              "is_degenerate": {
                "severe": true,
                "moderate": true,
                "mild": true,
                "details": {
                  "unique_score_ratio": 0.018518518518518517,
                  "zero_proportion": 0.9444444444444444,
                  "effective_discrimination": false
                }
              },
              "delong_valid": false,
              "statistical_warnings": [
                "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                "Only 3/162 unique scores",
                "HIGH ZERO CONCENTRATION: 94.4% of scores are exactly zero"
              ]
            },
            "metric2": {
              "metric_name": "LevenshteinVariance",
              "n_samples": 162,
              "n_unique_scores": 162,
              "unique_score_ratio": 1.0,
              "n_zeros": "0",
              "zero_proportion": 0.0,
              "n_infinite": "0",
              "n_nan": "0",
              "score_range": [
                559.4100000000001,
                1053490.44
              ],
              "most_common_values": [
                [
                  28689.840000000004,
                  1,
                  0.006172839506172839
                ],
                [
                  137441.00999999998,
                  1,
                  0.006172839506172839
                ],
                [
                  8672.2,
                  1,
                  0.006172839506172839
                ]
              ],
              "class_separation": {
                "harmful_mean": 119796.96555555554,
                "benign_mean": 48593.51296296296,
                "harmful_std": 197919.90692332902,
                "benign_std": 57776.85486744233
              },
              "is_degenerate": {
                "severe": "False",
                "moderate": "False",
                "mild": "False",
                "details": {
                  "unique_score_ratio": 1.0,
                  "zero_proportion": 0.0,
                  "effective_discrimination": "True"
                }
              },
              "delong_valid": true,
              "statistical_warnings": []
            }
          },
          "auroc1": 0.5555555555555556,
          "auroc2": 0.572778539856729,
          "auroc_difference": 0.017222984301173416,
          "delong_test": {
            "valid": false,
            "reasons": [
              "SE_tau_0.2 distribution too degenerate"
            ]
          }
        }
      }
    }
  }
}