{
  "hypothesis": "H5",
  "dataset": "JBB-Paraphrased",
  "models": {
    "llama": {
      "model_name": "llama",
      "metrics": {
        "SE_tau_0.2": {
          "metric_name": "SE_tau_0.2_paraphrased",
          "distribution_analysis": {
            "metric_name": "SE_tau_0.2_paraphrased",
            "n_samples": 115,
            "n_unique_scores": 6,
            "unique_score_ratio": 0.05217391304347826,
            "n_zeros": "94",
            "zero_proportion": 0.8173913043478261,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              0.0,
              1.9219280948873623
            ],
            "most_common_values": [
              [
                0.0,
                94,
                0.8173913043478261
              ],
              [
                0.7219280948873623,
                9,
                0.0782608695652174
              ],
              [
                0.9709505944546686,
                4,
                0.034782608695652174
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.35298058233016066,
              "benign_mean": 0.05316499795113144,
              "harmful_std": 0.5817600614696563,
              "benign_std": 0.1991303582918707
            },
            "is_degenerate": {
              "severe": false,
              "moderate": true,
              "mild": true,
              "details": {
                "unique_score_ratio": 0.05217391304347826,
                "zero_proportion": 0.8173913043478261,
                "effective_discrimination": false
              }
            },
            "delong_valid": true,
            "statistical_warnings": [
              "MODERATE DEGENERACY: DeLong test assumptions may be violated",
              "HIGH ZERO CONCENTRATION: 81.7% of scores are exactly zero"
            ]
          },
          "auroc": 0.62318401937046,
          "delong_ci": [
            0.5545392732218892,
            0.6918287655190305
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            null,
            null
          ],
          "bootstrap_ci_valid": true,
          "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
        },
        "BERTScore": {
          "metric_name": "BERTScore_paraphrased",
          "distribution_analysis": {
            "metric_name": "BERTScore_paraphrased",
            "n_samples": 115,
            "n_unique_scores": 112,
            "unique_score_ratio": 0.9739130434782609,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              0.8523883819580078,
              1.0
            ],
            "most_common_values": [
              [
                1.0,
                4,
                0.034782608695652174
              ],
              [
                0.8668166995048523,
                1,
                0.008695652173913044
              ],
              [
                0.9065595865249634,
                1,
                0.008695652173913044
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.9320168771914074,
              "benign_mean": 0.9048117833622431,
              "harmful_std": 0.0398682158799499,
              "benign_std": 0.025185055092450528
            },
            "is_degenerate": {
              "severe": false,
              "moderate": false,
              "mild": false,
              "details": {
                "unique_score_ratio": 0.9739130434782609,
                "zero_proportion": 0.0,
                "effective_discrimination": true
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.7139830508474576,
          "delong_ci": [
            0.614824739431143,
            0.8131413622637722
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            null,
            null
          ],
          "bootstrap_ci_valid": true
        },
        "EmbeddingVariance": {
          "metric_name": "EmbeddingVariance_paraphrased",
          "distribution_analysis": {
            "metric_name": "EmbeddingVariance_paraphrased",
            "n_samples": 115,
            "n_unique_scores": 114,
            "unique_score_ratio": 0.991304347826087,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              2.094153966210289e-16,
              0.15877926349639893
            ],
            "most_common_values": [
              [
                1.0296252985950681e-15,
                2,
                0.017391304347826087
              ],
              [
                0.09556064009666443,
                1,
                0.008695652173913044
              ],
              [
                0.01616920530796051,
                1,
                0.008695652173913044
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.05030236162409507,
              "benign_mean": 0.025103097521084344,
              "harmful_std": 0.04382593965388317,
              "benign_std": 0.018483155207940572
            },
            "is_degenerate": {
              "severe": false,
              "moderate": false,
              "mild": false,
              "details": {
                "unique_score_ratio": 0.991304347826087,
                "zero_proportion": 0.0,
                "effective_discrimination": true
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.6622276029055689,
          "delong_ci": [
            0.5561708340018241,
            0.7682843718093137
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            null,
            null
          ],
          "bootstrap_ci_valid": true
        }
      }
    },
    "qwen": {
      "model_name": "qwen",
      "metrics": {
        "SE_tau_0.2": {
          "metric_name": "SE_tau_0.2_paraphrased",
          "distribution_analysis": {
            "metric_name": "SE_tau_0.2_paraphrased",
            "n_samples": 115,
            "n_unique_scores": 5,
            "unique_score_ratio": 0.043478260869565216,
            "n_zeros": "91",
            "zero_proportion": 0.7913043478260869,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              0.0,
              1.9219280948873623
            ],
            "most_common_values": [
              [
                0.0,
                91,
                0.7913043478260869
              ],
              [
                0.7219280948873623,
                13,
                0.11304347826086956
              ],
              [
                0.9709505944546686,
                7,
                0.06086956521739131
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.20716300581833766,
              "benign_mean": 0.1875983635328971,
              "harmful_std": 0.3640689475485539,
              "benign_std": 0.4531407027668325
            },
            "is_degenerate": {
              "severe": true,
              "moderate": true,
              "mild": true,
              "details": {
                "unique_score_ratio": 0.043478260869565216,
                "zero_proportion": 0.7913043478260869,
                "effective_discrimination": false
              }
            },
            "delong_valid": false,
            "statistical_warnings": [
              "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
              "Only 5/115 unique scores"
            ]
          },
          "auroc": 0.5351089588377724,
          "delong_ci_valid": false,
          "delong_ci_error": "Distribution too degenerate for DeLong method",
          "bootstrap_ci": [
            null,
            null
          ],
          "bootstrap_ci_valid": true,
          "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
        },
        "BERTScore": {
          "metric_name": "BERTScore_paraphrased",
          "distribution_analysis": {
            "metric_name": "BERTScore_paraphrased",
            "n_samples": 115,
            "n_unique_scores": 115,
            "unique_score_ratio": 1.0,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              0.8220413327217102,
              0.9401082992553711
            ],
            "most_common_values": [
              [
                0.910913348197937,
                1,
                0.008695652173913044
              ],
              [
                0.8719528317451477,
                1,
                0.008695652173913044
              ],
              [
                0.876290500164032,
                1,
                0.008695652173913044
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.8882003352046013,
              "benign_mean": 0.8813493100263304,
              "harmful_std": 0.02222974623452743,
              "benign_std": 0.01659127234827935
            },
            "is_degenerate": {
              "severe": false,
              "moderate": false,
              "mild": false,
              "details": {
                "unique_score_ratio": 1.0,
                "zero_proportion": 0.0,
                "effective_discrimination": true
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.6056295399515738,
          "delong_ci": [
            0.5010651801172927,
            0.710193899785855
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            null,
            null
          ],
          "bootstrap_ci_valid": true
        },
        "EmbeddingVariance": {
          "metric_name": "EmbeddingVariance_paraphrased",
          "distribution_analysis": {
            "metric_name": "EmbeddingVariance_paraphrased",
            "n_samples": 115,
            "n_unique_scores": 115,
            "unique_score_ratio": 1.0,
            "n_zeros": "0",
            "zero_proportion": 0.0,
            "n_infinite": "0",
            "n_nan": "0",
            "score_range": [
              0.009067676961421967,
              0.1345967799425125
            ],
            "most_common_values": [
              [
                0.05552436783909798,
                1,
                0.008695652173913044
              ],
              [
                0.07487007975578308,
                1,
                0.008695652173913044
              ],
              [
                0.025713246315717697,
                1,
                0.008695652173913044
              ]
            ],
            "class_separation": {
              "harmful_mean": 0.05411197632617716,
              "benign_mean": 0.0380041602272856,
              "harmful_std": 0.024318734939215488,
              "benign_std": 0.028665559823857354
            },
            "is_degenerate": {
              "severe": false,
              "moderate": false,
              "mild": false,
              "details": {
                "unique_score_ratio": 1.0,
                "zero_proportion": 0.0,
                "effective_discrimination": true
              }
            },
            "delong_valid": true,
            "statistical_warnings": []
          },
          "auroc": 0.7021791767554479,
          "delong_ci": [
            0.6029255129027,
            0.8014328406081959
          ],
          "delong_ci_valid": true,
          "delong_method": "MLstatkit",
          "bootstrap_ci": [
            null,
            null
          ],
          "bootstrap_ci_valid": true
        }
      }
    }
  }
}