{
  "hypothesis": "H7",
  "dataset": "JBB-SOTA-Subset",
  "models": {
    "qwen-2.5-72b-instruct": {
      "model_name": "qwen-2.5-72b-instruct",
      "metrics": {
        "semantic_entropy": {},
        "BERTScore": {
          "metric_name": "BERTScore_SOTA",
          "distribution_analysis": {
            "metric_name": "BERTScore_SOTA",
            "n_samples": 120,
            "n_unique_scores": 120,
            "unique_score_ratio": 1.0,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              0.8404114842414856,
              0.9617641568183899
            ],
            "most_common_values": [
              [
                0.8602516055107117,
                1,
                0.008333333333333333
              ],
              [
                0.8991512060165405,
                1,
                0.008333333333333333
              ],
              [
                0.8471435308456421,
                1,
                0.008333333333333333
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.8991622855265935,
              "benign_mean": 0.892102399468422,
              "harmful_std": 0.021030124427228795,
              "benign_std": 0.021732606266427902
            },
            "is_degenerate": {
              "severe": "False",
              "moderate": "False",
              "mild": "False",
              "details": {
                "unique_score_ratio": 1.0,
                "zero_proportion": 0.0,
                "effective_discrimination": "True"
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.5613888888888889,
          "delong_ci": [
            0.4569723571166875,
            0.6658054206610904
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            null,
            null
          ],
          "bootstrap_ci_valid": true
        },
        "EmbeddingVariance": {
          "metric_name": "EmbeddingVariance_SOTA",
          "distribution_analysis": {
            "metric_name": "EmbeddingVariance_SOTA",
            "n_samples": 120,
            "n_unique_scores": 120,
            "unique_score_ratio": 1.0,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              0.0038171312771737576,
              0.12665219604969025
            ],
            "most_common_values": [
              [
                0.06895110011100769,
                1,
                0.008333333333333333
              ],
              [
                0.009394926019012928,
                1,
                0.008333333333333333
              ],
              [
                0.019754689186811447,
                1,
                0.008333333333333333
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.044811467500403525,
              "benign_mean": 0.02894032873058071,
              "harmful_std": 0.02129681914466809,
              "benign_std": 0.028308453374033437
            },
            "is_degenerate": {
              "severe": "False",
              "moderate": "False",
              "mild": "False",
              "details": {
                "unique_score_ratio": 1.0,
                "zero_proportion": 0.0,
                "effective_discrimination": "True"
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.7325,
          "delong_ci": [
            0.6359962412032301,
            0.8290037587967702
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            null,
            null
          ],
          "bootstrap_ci_valid": true
        },
        "LevenshteinVariance": {
          "metric_name": "LevenshteinVariance_SOTA",
          "distribution_analysis": {
            "metric_name": "LevenshteinVariance_SOTA",
            "n_samples": 120,
            "n_unique_scores": 120,
            "unique_score_ratio": 1.0,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              136.96000000000004,
              2980988.8400000003
            ],
            "most_common_values": [
              [
                338620.96,
                1,
                0.008333333333333333
              ],
              [
                54231.88999999999,
                1,
                0.008333333333333333
              ],
              [
                65438.45,
                1,
                0.008333333333333333
              ]
            ],
            "class_separation": {
              "harmful_mean": 299749.6785,
              "benign_mean": 96934.67733333333,
              "harmful_std": 650045.0314689156,
              "benign_std": 269890.90985958074
            },
            "is_degenerate": {
              "severe": "False",
              "moderate": "False",
              "mild": "False",
              "details": {
                "unique_score_ratio": 1.0,
                "zero_proportion": 0.0,
                "effective_discrimination": "True"
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.5197222222222222,
          "delong_ci": [
            0.4103069920282314,
            0.629137452416213
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            null,
            null
          ],
          "bootstrap_ci_valid": true
        }
      }
    },
    "llama-3.3-70b-instruct": {
      "model_name": "llama-3.3-70b-instruct",
      "metrics": {
        "semantic_entropy": {},
        "BERTScore": {
          "metric_name": "BERTScore_SOTA",
          "distribution_analysis": {
            "metric_name": "BERTScore_SOTA",
            "n_samples": 120,
            "n_unique_scores": 117,
            "unique_score_ratio": 0.975,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              0.837664008140564,
              0.9999998807907104
            ],
            "most_common_values": [
              [
                0.9999998807907104,
                4,
                0.03333333333333333
              ],
              [
                0.886157214641571,
                1,
                0.008333333333333333
              ],
              [
                0.9116882085800171,
                1,
                0.008333333333333333
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.9142023642857869,
              "benign_mean": 0.8988115400075912,
              "harmful_std": 0.03882497953823419,
              "benign_std": 0.027492590431643732
            },
            "is_degenerate": {
              "severe": "False",
              "moderate": "False",
              "mild": "False",
              "details": {
                "unique_score_ratio": 0.975,
                "zero_proportion": 0.0,
                "effective_discrimination": "True"
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.6280555555555555,
          "delong_ci": [
            0.5251687008736773,
            0.7309424102374339
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            null,
            null
          ],
          "bootstrap_ci_valid": true
        },
        "EmbeddingVariance": {
          "metric_name": "EmbeddingVariance_SOTA",
          "distribution_analysis": {
            "metric_name": "EmbeddingVariance_SOTA",
            "n_samples": 120,
            "n_unique_scores": 120,
            "unique_score_ratio": 1.0,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              1.598465520916744e-16,
              0.21181774139404297
            ],
            "most_common_values": [
              [
                0.016755787655711174,
                1,
                0.008333333333333333
              ],
              [
                0.00928562693297863,
                1,
                0.008333333333333333
              ],
              [
                0.022033223882317543,
                1,
                0.008333333333333333
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.08400401604982714,
              "benign_mean": 0.028475395900507777,
              "harmful_std": 0.05750047345383376,
              "benign_std": 0.02740145527428241
            },
            "is_degenerate": {
              "severe": "False",
              "moderate": "False",
              "mild": "False",
              "details": {
                "unique_score_ratio": 1.0,
                "zero_proportion": 0.0,
                "effective_discrimination": "True"
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.8094444444444444,
          "delong_ci": [
            0.7290169254836567,
            0.8898719634052321
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            null,
            null
          ],
          "bootstrap_ci_valid": true
        },
        "LevenshteinVariance": {
          "metric_name": "LevenshteinVariance_SOTA",
          "distribution_analysis": {
            "metric_name": "LevenshteinVariance_SOTA",
            "n_samples": 120,
            "n_unique_scores": 117,
            "unique_score_ratio": 0.975,
            "n_zeros": "4",
            "zero_proportion": 0.03333333333333333,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              0.0,
              1634904.0
            ],
            "most_common_values": [
              [
                0.0,
                4,
                0.03333333333333333
              ],
              [
                17569.69,
                1,
                0.008333333333333333
              ],
              [
                139196.21,
                1,
                0.008333333333333333
              ]
            ],
            "class_separation": {
              "harmful_mean": 98783.42400000001,
              "benign_mean": 109355.08133333335,
              "harmful_std": 195366.51422851236,
              "benign_std": 245540.5521710475
            },
            "is_degenerate": {
              "severe": "False",
              "moderate": "False",
              "mild": "False",
              "details": {
                "unique_score_ratio": 0.975,
                "zero_proportion": 0.03333333333333333,
                "effective_discrimination": "True"
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.3288888888888889,
          "delong_ci": [
            0.22415025072506786,
            0.4336275270527099
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            null,
            null
          ],
          "bootstrap_ci_valid": true
        }
      }
    }
  },
  "cross_model_analysis": {
    "claim": "Larger models exhibit worse SE but equal/better baseline performance",
    "comparison_pairs": [],
    "statistical_evidence": {},
    "se_comparison": {
      "error": "'tau_0.3'"
    },
    "baseline_comparison": {
      "model1": "qwen-2.5-72b-instruct",
      "model2": "llama-3.3-70b-instruct",
      "metric": "BERTScore",
      "auroc1": 0.5613888888888889,
      "auroc2": 0.6280555555555555,
      "difference": 0.06666666666666654,
      "better_model": "llama-3.3-70b-instruct"
    }
  }
}