{
  "models": {
    "bert-base-uncased": {
      "robustness_analysis": {
        "char_swap": {
          "level_0.05": {
            "mean": 0.8082710574354445,
            "std": 0.06596015577901014,
            "ci_95": [
              0.8013367284830403,
              0.8152053863878487
            ],
            "p_value": 3.7623093159170124e-253,
            "effect_size": -4.110749503469242,
            "significant_degradation": "True",
            "n_samples": 350
          },
          "level_0.1": {
            "mean": 0.6615694702523095,
            "std": 0.09299932405282338,
            "ci_95": [
              0.651792539047142,
              0.671346401457477
            ],
            "p_value": 2.0107176278697e-310,
            "effect_size": -5.146414306856302,
            "significant_degradation": "True",
            "n_samples": 350
          },
          "level_0.2": {
            "mean": 0.5322662660053799,
            "std": 0.0849108276765099,
            "ci_95": [
              0.5233396707938325,
              0.5411928612169272
            ],
            "p_value": 0.0,
            "effect_size": -7.790236122270995,
            "significant_degradation": "True",
            "n_samples": 350
          }
        },
        "word_sub": {
          "level_0.05": {
            "mean": 1.0000000047683715,
            "std": 1.1643414913853324e-07,
            "ci_95": [
              0.9999999925277592,
              1.0000000170089838
            ],
            "p_value": 1.0,
            "effect_size": 0.0,
            "significant_degradation": "False",
            "n_samples": 350
          },
          "level_0.1": {
            "mean": 1.0000000047683715,
            "std": 1.1643414913853324e-07,
            "ci_95": [
              0.9999999925277592,
              1.0000000170089838
            ],
            "p_value": 1.0,
            "effect_size": 0.0,
            "significant_degradation": "False",
            "n_samples": 350
          },
          "level_0.2": {
            "mean": 1.0000000047683715,
            "std": 1.1643414913853324e-07,
            "ci_95": [
              0.9999999925277592,
              1.0000000170089838
            ],
            "p_value": 1.0,
            "effect_size": 0.0,
            "significant_degradation": "False",
            "n_samples": 350
          }
        },
        "grammar": {
          "level_0.05": {
            "mean": 0.9997889925752367,
            "std": 0.0022816241955579595,
            "ci_95": [
              0.9995491275759669,
              1.0000288575745064
            ],
            "p_value": 0.08403573875917383,
            "effect_size": -0.13079117303793208,
            "significant_degradation": "False",
            "n_samples": 350
          },
          "level_0.1": {
            "mean": 0.9979293784073421,
            "std": 0.009523908496346689,
            "ci_95": [
              0.9969281388916634,
              0.9989306179230207
            ],
            "p_value": 5.295383189615831e-05,
            "effect_size": -0.3074691323713853,
            "significant_degradation": "True",
            "n_samples": 350
          },
          "level_0.2": {
            "mean": 0.9984587895870208,
            "std": 0.008236898373732867,
            "ci_95": [
              0.9975928522314095,
              0.9993247269426322
            ],
            "p_value": 0.000493863093541755,
            "effect_size": -0.26461506660496187,
            "significant_degradation": "True",
            "n_samples": 350
          }
        }
      },
      "causal_analysis": {
        "candidate_layers": [
          {
            "layer": 8,
            "heads": [
              0,
              1,
              2
            ],
            "baseline_robustness": 0.6632562166452408,
            "intervention_robustness": 0.6658199113607407,
            "effect_size": 0.028906777318077045,
            "p_value": 0.2539473822534802,
            "significant": "False"
          },
          {
            "layer": 10,
            "heads": [
              0,
              1,
              2
            ],
            "baseline_robustness": 0.6632562166452408,
            "intervention_robustness": 0.6964075028896332,
            "effect_size": 0.37379522744291527,
            "p_value": 7.287714332020873e-19,
            "significant": "True"
          },
          {
            "layer": 11,
            "heads": [
              0,
              1,
              2
            ],
            "baseline_robustness": 0.6632562166452408,
            "intervention_robustness": 0.7133198845386505,
            "effect_size": 0.5644897150863671,
            "p_value": 1.129401872344705e-23,
            "significant": "True"
          }
        ]
      },
      "statistical_summary": {}
    },
    "roberta-base": {
      "robustness_analysis": {
        "char_swap": {
          "level_0.05": {
            "mean": 0.9812716501099723,
            "std": 0.006509607857849639,
            "ci_95": [
              0.9805873011768267,
              0.9819559990431179
            ],
            "p_value": 1.232040216243137e-250,
            "effect_size": -4.0687388819968495,
            "significant_degradation": "True",
            "n_samples": 350
          },
          "level_0.1": {
            "mean": 0.9639076382773263,
            "std": 0.011017015919723382,
            "ci_95": [
              0.9627494297867765,
              0.9650658467678761
            ],
            "p_value": 3.9368741420862565e-283,
            "effect_size": -4.633043648356142,
            "significant_degradation": "True",
            "n_samples": 350
          },
          "level_0.2": {
            "mean": 0.944352058172226,
            "std": 0.012662699678977105,
            "ci_95": [
              0.9430208405005498,
              0.9456832758439022
            ],
            "p_value": 0.0,
            "effect_size": -6.2149529112966855,
            "significant_degradation": "True",
            "n_samples": 350
          }
        },
        "word_sub": {
          "level_0.05": {
            "mean": 1.0000000057901655,
            "std": 1.3961029192311068e-07,
            "ci_95": [
              0.9999999911130671,
              1.0000000204672637
            ],
            "p_value": 1.0,
            "effect_size": 0.0,
            "significant_degradation": "False",
            "n_samples": 350
          },
          "level_0.1": {
            "mean": 1.0000000057901655,
            "std": 1.3961029192311068e-07,
            "ci_95": [
              0.9999999911130671,
              1.0000000204672637
            ],
            "p_value": 1.0,
            "effect_size": 0.0,
            "significant_degradation": "False",
            "n_samples": 350
          },
          "level_0.2": {
            "mean": 1.0000000057901655,
            "std": 1.3961029192311068e-07,
            "ci_95": [
              0.9999999911130671,
              1.0000000204672637
            ],
            "p_value": 1.0,
            "effect_size": 0.0,
            "significant_degradation": "False",
            "n_samples": 350
          }
        },
        "grammar": {
          "level_0.05": {
            "mean": 0.9998854192665645,
            "std": 0.0004647056377809212,
            "ci_95": [
              0.9998365652012526,
              0.9999342733318763
            ],
            "p_value": 4.720556041089698e-06,
            "effect_size": -0.34871496115792167,
            "significant_degradation": "True",
            "n_samples": 350
          },
          "level_0.1": {
            "mean": 0.9998483313832964,
            "std": 0.0005079422302239819,
            "ci_95": [
              0.999794931895809,
              0.9999017308707838
            ],
            "p_value": 3.323999082636189e-08,
            "effect_size": -0.42229210802629996,
            "significant_degradation": "True",
            "n_samples": 350
          },
          "level_0.2": {
            "mean": 0.9996372728688376,
            "std": 0.0008580696285511578,
            "ci_95": [
              0.9995470648182627,
              0.9997274809194125
            ],
            "p_value": 1.0197077996744779e-14,
            "effect_size": -0.59783238213117,
            "significant_degradation": "True",
            "n_samples": 350
          }
        }
      },
      "causal_analysis": {
        "candidate_layers": [
          {
            "layer": 8,
            "heads": [
              0,
              1,
              2
            ],
            "baseline_robustness": 0.9639066660404205,
            "intervention_robustness": 0.9640929687023163,
            "effect_size": 0.019578289727448652,
            "p_value": 0.4053638547175775,
            "significant": "False"
          },
          {
            "layer": 10,
            "heads": [
              0,
              1,
              2
            ],
            "baseline_robustness": 0.9639066660404205,
            "intervention_robustness": 0.9692261910438538,
            "effect_size": 0.559021544135963,
            "p_value": 2.0605963592184337e-22,
            "significant": "True"
          },
          {
            "layer": 11,
            "heads": [
              0,
              1,
              2
            ],
            "baseline_robustness": 0.9639066660404205,
            "intervention_robustness": 0.9761025214195251,
            "effect_size": 1.2816456171717867,
            "p_value": 1.371804382134753e-28,
            "significant": "True"
          }
        ]
      },
      "statistical_summary": {}
    }
  },
  "statistical_summary": {
    "model_robustness_ranking": [
      [
        "roberta-base",
        0.9876558208276356
      ],
      [
        "bert-base-uncased",
        0.8886982187297608
      ]
    ],
    "best_model": "roberta-base",
    "best_model_score": 0.9876558208276356,
    "robustness_gap": 0.09895760209787474
  },
  "causal_findings": {},
  "meta_analysis": {}
}