[
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "text-embedding-3-small",
    "projection_dimension": 50,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_50.json",
      "model2": "models/text-embedding-3-small_propositions_a->b_50.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:51.983873",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.41884719345718624,
        "mean_model2": -0.07053252588957548,
        "mean_delta_model1": 0.5811528065428138,
        "mean_delta_model2": 0.9294674741104245,
        "mean_delta_model2 / mean_delta_model1": 1.5993512612279714,
        "test_statistic": 391.0,
        "p_value": 2.1780705099798336e-13,
        "q_value": 1.0841005073268216e-11,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.3898035565204918,
        "mean_model2": -0.10355177973397076,
        "mean_delta_model1": 0.6101964434795082,
        "mean_delta_model2": 0.8964482202660292,
        "mean_delta_model2 / mean_delta_model1": 1.4691141350386026,
        "test_statistic": 485.0,
        "p_value": 2.312970260587159e-12,
        "q_value": 5.756223734367549e-11,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.4778680958226323,
        "mean_model2": -0.2346378804370761,
        "mean_delta_model1": 0.5221319041773677,
        "mean_delta_model2": 0.7653621195629239,
        "mean_delta_model2 / mean_delta_model1": 1.4658405537749541,
        "test_statistic": 656.0,
        "p_value": 1.3080818542490194e-10,
        "q_value": 2.1702575671429194e-09,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6274809923022986,
        "mean_model2": 0.7387046897411347,
        "mean_delta_model1": 0.6369218497723341,
        "mean_delta_model2": 0.7468599051237106,
        "mean_delta_model2 / mean_delta_model1": 1.1726083904809883,
        "test_statistic": 756.0,
        "p_value": 1.1839921708264648e-09,
        "q_value": 1.4732839308721915e-08,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7419095650315285,
        "mean_model2": 0.6694643640518189,
        "mean_delta_model1": 0.2580904349684715,
        "mean_delta_model2": 0.33053563594818114,
        "mean_delta_model2 / mean_delta_model1": 1.2806969618558688,
        "test_statistic": 825.0,
        "p_value": 5.061052087089789e-09,
        "q_value": 5.038118931487035e-08,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9007420587539673,
        "mean_model2": 0.8581337732076645,
        "mean_delta_model1": 0.09925794124603271,
        "mean_delta_model2": 0.14186622679233551,
        "mean_delta_model2 / mean_delta_model1": 1.4292682783001591,
        "test_statistic": 906.0,
        "p_value": 2.5968634363691743e-08,
        "q_value": 2.1542468865041174e-07,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8725327450037003,
        "mean_model2": 0.8379781192541123,
        "mean_delta_model1": 0.12746725499629974,
        "mean_delta_model2": 0.16202188074588775,
        "mean_delta_model2 / mean_delta_model1": 1.2710862938924283,
        "test_statistic": 1194.0,
        "p_value": 4.730222391025182e-06,
        "q_value": 3.363420231925871e-05,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.7675900214910507,
        "mean_model2": 0.7082685004174709,
        "mean_delta_model1": 0.2324099785089493,
        "mean_delta_model2": 0.2917314995825291,
        "mean_delta_model2 / mean_delta_model1": 1.255245155367955,
        "test_statistic": 1378.0,
        "p_value": 8.021440667889499e-05,
        "q_value": 0.0004990683185935702,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8987804532051087,
        "mean_model2": 0.8721829789876938,
        "mean_delta_model1": 0.10121954679489135,
        "mean_delta_model2": 0.12781702101230621,
        "mean_delta_model2 / mean_delta_model1": 1.2627701373857294,
        "test_statistic": 1398.0,
        "p_value": 0.00010662815922480123,
        "q_value": 0.000589694415723955,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8362160032987594,
        "mean_model2": 0.7897209658473731,
        "mean_delta_model1": 0.16378399670124055,
        "mean_delta_model2": 0.210279034152627,
        "mean_delta_model2 / mean_delta_model1": 1.2838802226581292,
        "test_statistic": 1457.0,
        "p_value": 0.0002405363828495425,
        "q_value": 0.0011972321992466718,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8501347251236439,
        "mean_model2": 0.8187742882966995,
        "mean_delta_model1": 0.14986527487635612,
        "mean_delta_model2": 0.18122571170330048,
        "mean_delta_model2 / mean_delta_model1": 1.2092575271544244,
        "test_statistic": 1491.0,
        "p_value": 0.0003776516176530985,
        "q_value": 0.001708819836867739,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5883494353294373,
        "mean_model2": 0.630935657415539,
        "mean_delta_model1": 0.5883494353294373,
        "mean_delta_model2": 0.630935657415539,
        "mean_delta_model2 / mean_delta_model1": 1.072382532435433,
        "test_statistic": 1763.0,
        "p_value": 0.008792885656901764,
        "q_value": 0.0364710101510985,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8518508130311966,
        "mean_model2": 0.8325071388483047,
        "mean_delta_model1": 0.14814918696880341,
        "mean_delta_model2": 0.16749286115169526,
        "mean_delta_model2 / mean_delta_model1": 1.1305688851803497,
        "test_statistic": 1842.0,
        "p_value": 0.01885548298299195,
        "q_value": 0.0721924734409613,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9462025570869446,
        "mean_model2": 0.9537317562103271,
        "mean_delta_model1": 0.05379744291305542,
        "mean_delta_model2": 0.04626824378967285,
        "mean_delta_model2 / mean_delta_model1": 0.8600454089323378,
        "test_statistic": 1881.0,
        "p_value": 0.026809441110138,
        "q_value": 0.09531414083991986,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7038766911625862,
        "mean_model2": 0.681420271396637,
        "mean_delta_model1": 0.7038766911625862,
        "mean_delta_model2": 0.681420271396637,
        "mean_delta_model2 / mean_delta_model1": 0.9680960883519837,
        "test_statistic": 1952.0,
        "p_value": 0.048820020726286446,
        "q_value": 0.16199600822400406,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "text-embedding-3-small",
    "projection_dimension": 100,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 11,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_100.json",
      "model2": "models/text-embedding-3-small_propositions_a->b_100.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:51.983935",
    "comparisons": [
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7801883789896965,
        "mean_model2": 0.6653209713101387,
        "mean_delta_model1": 0.2198116210103035,
        "mean_delta_model2": 0.3346790286898613,
        "mean_delta_model2 / mean_delta_model1": 1.5225720421495528,
        "test_statistic": 320.0,
        "p_value": 3.415659060555646e-14,
        "q_value": 1.7000908388581578e-12,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.45454454969614744,
        "mean_model2": -0.14982909000013023,
        "mean_delta_model1": 0.5454554503038526,
        "mean_delta_model2": 0.8501709099998698,
        "mean_delta_model2 / mean_delta_model1": 1.5586440827133945,
        "test_statistic": 454.0,
        "p_value": 1.0732517780223133e-12,
        "q_value": 2.6709713751511558e-11,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9084970563650131,
        "mean_model2": 0.8574271613359451,
        "mean_delta_model1": 0.09150294363498687,
        "mean_delta_model2": 0.14257283866405487,
        "mean_delta_model2 / mean_delta_model1": 1.5581229739754627,
        "test_statistic": 489.0,
        "p_value": 2.551792303883157e-12,
        "q_value": 4.233715603721851e-11,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8027366203069687,
        "mean_model2": 0.7080073677003383,
        "mean_delta_model1": 0.19726337969303132,
        "mean_delta_model2": 0.29199263229966166,
        "mean_delta_model2 / mean_delta_model1": 1.4802171226815741,
        "test_statistic": 539.0,
        "p_value": 8.579921301659199e-12,
        "q_value": 1.0676303858545725e-10,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4514737324416637,
        "mean_model2": -0.13483487633988261,
        "mean_delta_model1": 0.5485262675583362,
        "mean_delta_model2": 0.8651651236601174,
        "mean_delta_model2 / mean_delta_model1": 1.5772537703823024,
        "test_statistic": 556.0,
        "p_value": 1.2872756011754984e-11,
        "q_value": 1.2814425666290465e-10,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6212199111655354,
        "mean_model2": 0.7399809346348047,
        "mean_delta_model1": 0.6348483427241445,
        "mean_delta_model2": 0.7490404274314642,
        "mean_delta_model2 / mean_delta_model1": 1.1798730137930573,
        "test_statistic": 651.0,
        "p_value": 1.168102718013577e-10,
        "q_value": 9.690080764955607e-10,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5279463818110526,
        "mean_model2": -0.33419119330123065,
        "mean_delta_model1": 0.4720536181889474,
        "mean_delta_model2": 0.6658088066987693,
        "mean_delta_model2 / mean_delta_model1": 1.4104516543124304,
        "test_statistic": 743.0,
        "p_value": 8.94981124911862e-10,
        "q_value": 6.3637549651612284e-09,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8346818608045578,
        "mean_model2": 0.78996119864285,
        "mean_delta_model1": 0.1653181391954422,
        "mean_delta_model2": 0.21003880135715008,
        "mean_delta_model2 / mean_delta_model1": 1.2705127361059774,
        "test_statistic": 1084.0,
        "p_value": 7.246950671541695e-07,
        "q_value": 4.508820343276871e-06,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8557861065864563,
        "mean_model2": 0.8232215868681669,
        "mean_delta_model1": 0.1442138934135437,
        "mean_delta_model2": 0.17677841313183307,
        "mean_delta_model2 / mean_delta_model1": 1.2258070907558696,
        "test_statistic": 1157.0,
        "p_value": 2.5557825489853534e-06,
        "q_value": 1.4134452924053167e-05,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8723850011825561,
        "mean_model2": 0.8387896198034287,
        "mean_delta_model1": 0.12761499881744384,
        "mean_delta_model2": 0.16121038019657136,
        "mean_delta_model2 / mean_delta_model1": 1.2632557433721914,
        "test_statistic": 1339.0,
        "p_value": 4.5453436621012654e-05,
        "q_value": 0.00022623736685661104,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8997249734401703,
        "mean_model2": 0.8764137125015259,
        "mean_delta_model1": 0.10027502655982971,
        "mean_delta_model2": 0.12358628749847413,
        "mean_delta_model2 / mean_delta_model1": 1.2324732462148549,
        "test_statistic": 1483.0,
        "p_value": 0.00034001616518924456,
        "q_value": 0.0015385247693147593,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9482488125562668,
        "mean_model2": 0.9568469643592834,
        "mean_delta_model1": 0.051751187443733214,
        "mean_delta_model2": 0.043153035640716556,
        "mean_delta_model2 / mean_delta_model1": 0.8338559513757042,
        "test_statistic": 1869.0,
        "p_value": 0.024099385689342116,
        "q_value": 0.09995910039172863,
        "significant": false
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8564163464307785,
        "mean_model2": 0.8414225065708161,
        "mean_delta_model1": 0.1435836535692215,
        "mean_delta_model2": 0.15857749342918395,
        "mean_delta_model2 / mean_delta_model1": 1.104425813713773,
        "test_statistic": 1921.0,
        "p_value": 0.03782480488800565,
        "q_value": 0.14482080489531934,
        "significant": false
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5860802850499749,
        "mean_model2": 0.6031296209990978,
        "mean_delta_model1": 0.5860802850499749,
        "mean_delta_model2": 0.6031296209990978,
        "mean_delta_model2 / mean_delta_model1": 1.0290904444050173,
        "test_statistic": 2282.0,
        "p_value": 0.40342896597006617,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6861503298580647,
        "mean_model2": 0.6815702956914902,
        "mean_delta_model1": 0.6861503298580647,
        "mean_delta_model2": 0.6815702956914902,
        "mean_delta_model2 / mean_delta_model1": 0.9933250281064182,
        "test_statistic": 2372.0,
        "p_value": 0.5988439481270142,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "text-embedding-3-small",
    "projection_dimension": 200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_200.json",
      "model2": "models/text-embedding-3-small_propositions_a->b_200.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:51.983960",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5039551055803895,
        "mean_model2": -0.17617890121415258,
        "mean_delta_model1": 0.4960448944196105,
        "mean_delta_model2": 0.8238210987858474,
        "mean_delta_model2 / mean_delta_model1": 1.66077931262602,
        "test_statistic": 334.0,
        "p_value": 4.944734111085703e-14,
        "q_value": 2.461164013681946e-12,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9059866589307785,
        "mean_model2": 0.855496546626091,
        "mean_delta_model1": 0.09401334106922149,
        "mean_delta_model2": 0.144503453373909,
        "mean_delta_model2 / mean_delta_model1": 1.5370526324291776,
        "test_statistic": 476.0,
        "p_value": 1.852916223731476e-12,
        "q_value": 4.6113002517076234e-11,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5950543672963977,
        "mean_model2": 0.7285142081975937,
        "mean_delta_model1": 0.6073631683364511,
        "mean_delta_model2": 0.7376626658439637,
        "mean_delta_model2 / mean_delta_model1": 1.2145330904150788,
        "test_statistic": 670.0,
        "p_value": 1.7930711033640758e-10,
        "q_value": 2.9749102610518887e-09,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4391170089226216,
        "mean_model2": -0.21951682853512466,
        "mean_delta_model1": 0.5608829910773784,
        "mean_delta_model2": 0.7804831714648753,
        "mean_delta_model2 / mean_delta_model1": 1.3915258331611653,
        "test_statistic": 701.0,
        "p_value": 3.5757592915601463e-10,
        "q_value": 4.449445557773566e-09,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7598642486333848,
        "mean_model2": 0.6846178491413594,
        "mean_delta_model1": 0.2401357513666153,
        "mean_delta_model2": 0.3153821508586407,
        "mean_delta_model2 / mean_delta_model1": 1.3133494244975905,
        "test_statistic": 736.0,
        "p_value": 7.691674678710569e-10,
        "q_value": 7.656821377614815e-09,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.533792215064168,
        "mean_model2": -0.35181470138952137,
        "mean_delta_model1": 0.46620778493583204,
        "mean_delta_model2": 0.6481852986104787,
        "mean_delta_model2 / mean_delta_model1": 1.390335639074954,
        "test_statistic": 893.0,
        "p_value": 2.0075873381476438e-08,
        "q_value": 1.6654086279702329e-07,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9053152561187744,
        "mean_model2": 0.8722242295742035,
        "mean_delta_model1": 0.09468474388122558,
        "mean_delta_model2": 0.12777577042579652,
        "mean_delta_model2 / mean_delta_model1": 1.3494863606124443,
        "test_statistic": 910.0,
        "p_value": 2.809774719797291e-08,
        "q_value": 1.997887701358552e-07,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8559744426608086,
        "mean_model2": 0.8180039083957672,
        "mean_delta_model1": 0.14402555733919142,
        "mean_delta_model2": 0.1819960916042328,
        "mean_delta_model2 / mean_delta_model1": 1.2636374749491008,
        "test_statistic": 1028.0,
        "p_value": 2.6444242554890336e-07,
        "q_value": 1.64527598155532e-06,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8760127478837967,
        "mean_model2": 0.8384706383943558,
        "mean_delta_model1": 0.1239872521162033,
        "mean_delta_model2": 0.16152936160564424,
        "mean_delta_model2 / mean_delta_model1": 1.3027900759850353,
        "test_statistic": 1043.0,
        "p_value": 3.4764319317444347e-07,
        "q_value": 1.9225995381502436e-06,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.7814646589756012,
        "mean_model2": 0.7287273260205984,
        "mean_delta_model1": 0.2185353410243988,
        "mean_delta_model2": 0.2712726739794016,
        "mean_delta_model2 / mean_delta_model1": 1.2413217592531856,
        "test_statistic": 1231.0,
        "p_value": 8.619379881219636e-06,
        "q_value": 4.290161433827651e-05,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5509045364707709,
        "mean_model2": 0.6241018840670586,
        "mean_delta_model1": 0.5509045364707709,
        "mean_delta_model2": 0.6241018840670586,
        "mean_delta_model2 / mean_delta_model1": 1.1328675709682985,
        "test_statistic": 1270.0,
        "p_value": 1.5953234709419534e-05,
        "q_value": 7.21861172026132e-05,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8326665459573269,
        "mean_model2": 0.8030751716718078,
        "mean_delta_model1": 0.1673334540426731,
        "mean_delta_model2": 0.19692482832819225,
        "mean_delta_model2 / mean_delta_model1": 1.1768407546166637,
        "test_statistic": 1460.0,
        "p_value": 0.000250433748805814,
        "q_value": 0.0010387456577130986,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8571139967441559,
        "mean_model2": 0.837145607471466,
        "mean_delta_model1": 0.1428860032558441,
        "mean_delta_model2": 0.16285439252853393,
        "mean_delta_model2 / mean_delta_model1": 1.1397504921243788,
        "test_statistic": 1675.0,
        "p_value": 0.0034714875822547467,
        "q_value": 0.013291373936621676,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9484285479784011,
        "mean_model2": 0.9557445043325424,
        "mean_delta_model1": 0.05157145202159882,
        "mean_delta_model2": 0.04425549566745758,
        "mean_delta_model2 / mean_delta_model1": 0.8581394149794888,
        "test_statistic": 1962.0,
        "p_value": 0.052895032886786486,
        "q_value": 0.18805481970293145,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6828296953439712,
        "mean_model2": 0.6585969358682633,
        "mean_delta_model1": 0.6828296953439712,
        "mean_delta_model2": 0.6585969358682633,
        "mean_delta_model2 / mean_delta_model1": 0.9645112688552585,
        "test_statistic": 1987.0,
        "p_value": 0.06433967520159115,
        "q_value": 0.21349377566885622,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "text-embedding-3-small",
    "projection_dimension": 400,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_400.json",
      "model2": "models/text-embedding-3-small_propositions_a->b_400.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:51.983984",
    "comparisons": [
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7817780831456185,
        "mean_model2": 0.6971436133980751,
        "mean_delta_model1": 0.21822191685438155,
        "mean_delta_model2": 0.3028563866019249,
        "mean_delta_model2 / mean_delta_model1": 1.3878367075476634,
        "test_statistic": 383.0,
        "p_value": 1.7729058644366345e-13,
        "q_value": 8.824361462459029e-12,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9103552997112274,
        "mean_model2": 0.8641855031251907,
        "mean_delta_model1": 0.08964470028877258,
        "mean_delta_model2": 0.13581449687480926,
        "mean_delta_model2 / mean_delta_model1": 1.5150309659947532,
        "test_statistic": 535.0,
        "p_value": 7.794993252575682e-12,
        "q_value": 1.939917945954075e-10,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6067761509120464,
        "mean_model2": 0.7187707135081292,
        "mean_delta_model1": 0.6162807486951352,
        "mean_delta_model2": 0.7303001090884209,
        "mean_delta_model2 / mean_delta_model1": 1.1850120430253606,
        "test_statistic": 688.0,
        "p_value": 2.680683329613168e-10,
        "q_value": 4.447560572994025e-09,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4572508700564504,
        "mean_model2": -0.22866456132382154,
        "mean_delta_model1": 0.5427491299435496,
        "mean_delta_model2": 0.7713354386761785,
        "mean_delta_model2 / mean_delta_model1": 1.4211638418589518,
        "test_statistic": 720.0,
        "p_value": 5.428925847467808e-10,
        "q_value": 6.755407180934237e-09,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5095309701561928,
        "mean_model2": -0.25042118662968277,
        "mean_delta_model1": 0.49046902984380725,
        "mean_delta_model2": 0.7495788133703172,
        "mean_delta_model2 / mean_delta_model1": 1.5282897956044748,
        "test_statistic": 783.0,
        "p_value": 2.104070693507049e-09,
        "q_value": 2.094536513699558e-08,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.7979694849252701,
        "mean_model2": 0.7377314488589763,
        "mean_delta_model1": 0.2020305150747299,
        "mean_delta_model2": 0.26226855114102365,
        "mean_delta_model2 / mean_delta_model1": 1.2981630574174008,
        "test_statistic": 920.0,
        "p_value": 3.418833474830967e-08,
        "q_value": 2.8361180898014855e-07,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5445865708589553,
        "mean_model2": -0.39436482411809265,
        "mean_delta_model1": 0.4554134291410446,
        "mean_delta_model2": 0.6056351758819073,
        "mean_delta_model2 / mean_delta_model1": 1.3298579644965587,
        "test_statistic": 1133.0,
        "p_value": 1.7001255356424485e-06,
        "q_value": 1.2088726809638024e-05,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5317632901668549,
        "mean_model2": 0.6182476137578488,
        "mean_delta_model1": 0.5317632901668549,
        "mean_delta_model2": 0.6182476137578488,
        "mean_delta_model2 / mean_delta_model1": 1.1626368822185096,
        "test_statistic": 1174.0,
        "p_value": 3.3978523164404245e-06,
        "q_value": 2.1140347633355463e-05,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8579214942455292,
        "mean_model2": 0.8293195550143718,
        "mean_delta_model1": 0.1420785057544708,
        "mean_delta_model2": 0.17068044498562812,
        "mean_delta_model2 / mean_delta_model1": 1.201310811084859,
        "test_statistic": 1336.0,
        "p_value": 4.3479279216686876e-05,
        "q_value": 0.00024045700816918198,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8754362678527832,
        "mean_model2": 0.8502679622173309,
        "mean_delta_model1": 0.12456373214721679,
        "mean_delta_model2": 0.14973203778266908,
        "mean_delta_model2 / mean_delta_model1": 1.202051634144254,
        "test_statistic": 1346.0,
        "p_value": 5.039553839349178e-05,
        "q_value": 0.000250835904940004,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8424444341659546,
        "mean_model2": 0.8068427450396121,
        "mean_delta_model1": 0.1575555658340454,
        "mean_delta_model2": 0.19315725496038794,
        "mean_delta_model2 / mean_delta_model1": 1.2259627512229057,
        "test_statistic": 1374.0,
        "p_value": 7.573451718690408e-05,
        "q_value": 0.0003426879146151609,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9001644074916839,
        "mean_model2": 0.880580992102623,
        "mean_delta_model1": 0.09983559250831604,
        "mean_delta_model2": 0.11941900789737701,
        "mean_delta_model2 / mean_delta_model1": 1.1961566501188414,
        "test_statistic": 1474.0,
        "p_value": 0.00030188229444763794,
        "q_value": 0.0012521432274733052,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9484348738193512,
        "mean_model2": 0.9566391789913178,
        "mean_delta_model1": 0.051565126180648804,
        "mean_delta_model2": 0.04336082100868225,
        "mean_delta_model2 / mean_delta_model1": 0.8408943062948339,
        "test_statistic": 1909.0,
        "p_value": 0.03417438138857468,
        "q_value": 0.1308443344029621,
        "significant": false
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8538247805833816,
        "mean_model2": 0.8456052076816559,
        "mean_delta_model1": 0.14617521941661835,
        "mean_delta_model2": 0.1543947923183441,
        "mean_delta_model2 / mean_delta_model1": 1.0562309599022999,
        "test_statistic": 2117.0,
        "p_value": 0.1606655891996006,
        "q_value": 0.571205588888929,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.653459616880864,
        "mean_model2": 0.6910023766756058,
        "mean_delta_model1": 0.653459616880864,
        "mean_delta_model2": 0.6910023766756058,
        "mean_delta_model2 / mean_delta_model1": 1.0574523028277454,
        "test_statistic": 2184.0,
        "p_value": 0.24100890272556064,
        "q_value": 0.7997227286502615,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "text-embedding-3-small",
    "projection_dimension": 800,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_800.json",
      "model2": "models/text-embedding-3-small_propositions_a->b_800.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:51.984004",
    "comparisons": [
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5761898263543844,
        "mean_model2": 0.7426811691373587,
        "mean_delta_model1": 0.5904155380278826,
        "mean_delta_model2": 0.7597977880388498,
        "mean_delta_model2 / mean_delta_model1": 1.286886504675573,
        "test_statistic": 309.0,
        "p_value": 2.5500003530567317e-14,
        "q_value": 1.2692227656385526e-12,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7752265084534884,
        "mean_model2": 0.6970446801185608,
        "mean_delta_model1": 0.22477349154651166,
        "mean_delta_model2": 0.3029553198814392,
        "mean_delta_model2 / mean_delta_model1": 1.3478249494502763,
        "test_statistic": 471.0,
        "p_value": 1.637462668432428e-12,
        "q_value": 4.0751070762919476e-11,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9153037869930267,
        "mean_model2": 0.8656769013404846,
        "mean_delta_model1": 0.08469621300697326,
        "mean_delta_model2": 0.1343230986595154,
        "mean_delta_model2 / mean_delta_model1": 1.585939841825705,
        "test_statistic": 507.0,
        "p_value": 3.961741310694399e-12,
        "q_value": 6.572982440409594e-11,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8004315641522407,
        "mean_model2": 0.7343150576204062,
        "mean_delta_model1": 0.19956843584775924,
        "mean_delta_model2": 0.26568494237959384,
        "mean_delta_model2 / mean_delta_model1": 1.331297413095283,
        "test_statistic": 988.0,
        "p_value": 1.2590728891234724e-07,
        "q_value": 1.5667095619792874e-06,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4901000130921602,
        "mean_model2": -0.3243943861871958,
        "mean_delta_model1": 0.5098999869078398,
        "mean_delta_model2": 0.6756056138128043,
        "mean_delta_model2 / mean_delta_model1": 1.3249767231998664,
        "test_statistic": 1283.0,
        "p_value": 1.9512317174970403e-05,
        "q_value": 0.00019423900972520054,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5350240828003734,
        "mean_model2": 0.6213545138388872,
        "mean_delta_model1": 0.5352630440983921,
        "mean_delta_model2": 0.6213545138388872,
        "mean_delta_model2 / mean_delta_model1": 1.1608395548501005,
        "test_statistic": 1391.0,
        "p_value": 9.656669707689486e-05,
        "q_value": 0.000801076035052285,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8613641321659088,
        "mean_model2": 0.8285074052214623,
        "mean_delta_model1": 0.13863586783409118,
        "mean_delta_model2": 0.17149259477853776,
        "mean_delta_model2 / mean_delta_model1": 1.237000189473095,
        "test_statistic": 1417.0,
        "p_value": 0.00013915581754561895,
        "q_value": 0.0009152245235410887,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.564556743670255,
        "mean_model2": -0.4481321416050196,
        "mean_delta_model1": 0.43544325632974507,
        "mean_delta_model2": 0.5518678583949804,
        "mean_delta_model2 / mean_delta_model1": 1.2673703183430893,
        "test_statistic": 1421.0,
        "p_value": 0.0001471024895764013,
        "q_value": 0.0009152245235410887,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8784546732902527,
        "mean_model2": 0.8529642397165298,
        "mean_delta_model1": 0.12154532670974731,
        "mean_delta_model2": 0.14703576028347015,
        "mean_delta_model2 / mean_delta_model1": 1.209719569347117,
        "test_statistic": 1470.0,
        "p_value": 0.00028625334369817624,
        "q_value": 0.001583090240780054,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.45384409856051205,
        "mean_model2": -0.33288036538287996,
        "mean_delta_model1": 0.5461559014394879,
        "mean_delta_model2": 0.66711963461712,
        "mean_delta_model2 / mean_delta_model1": 1.2214820582526187,
        "test_statistic": 1533.0,
        "p_value": 0.000647682504088168,
        "q_value": 0.003223738295208773,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9464694887399674,
        "mean_model2": 0.9579711031913757,
        "mean_delta_model1": 0.05353051126003265,
        "mean_delta_model2": 0.04202889680862427,
        "mean_delta_model2 / mean_delta_model1": 0.7851390883315587,
        "test_statistic": 1579.0,
        "p_value": 0.0011433209876770152,
        "q_value": 0.004917298437403307,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8374088387191295,
        "mean_model2": 0.8033780406415463,
        "mean_delta_model1": 0.16259116128087045,
        "mean_delta_model2": 0.19662195935845375,
        "mean_delta_model2 / mean_delta_model1": 1.2093028785174633,
        "test_statistic": 1582.0,
        "p_value": 0.0011855235904302667,
        "q_value": 0.004917298437403307,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9025527334213257,
        "mean_model2": 0.8853269731998443,
        "mean_delta_model1": 0.09744726657867432,
        "mean_delta_model2": 0.11467302680015563,
        "mean_delta_model2 / mean_delta_model1": 1.1767700708934103,
        "test_statistic": 1613.0,
        "p_value": 0.0017141393943961353,
        "q_value": 0.006562969657986051,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8530932098627091,
        "mean_model2": 0.8522610348463059,
        "mean_delta_model1": 0.14690679013729097,
        "mean_delta_model2": 0.14773896515369414,
        "mean_delta_model2 / mean_delta_model1": 1.005664646376287,
        "test_statistic": 2418.0,
        "p_value": 0.7129472348148089,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6768922313302755,
        "mean_model2": 0.712137296795845,
        "mean_delta_model1": 0.6768922313302755,
        "mean_delta_model2": 0.712137296795845,
        "mean_delta_model2 / mean_delta_model1": 1.052068946627299,
        "test_statistic": 2229.0,
        "p_value": 0.3087995581123958,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "text-embedding-3-small",
    "projection_dimension": 1600,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_1600.json",
      "model2": "models/text-embedding-3-small_propositions_a->b_1600.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:51.984024",
    "comparisons": [
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9205120611190796,
        "mean_model2": 0.8669380736351013,
        "mean_delta_model1": 0.07948793888092041,
        "mean_delta_model2": 0.1330619263648987,
        "mean_delta_model2 / mean_delta_model1": 1.6739888873485145,
        "test_statistic": 377.0,
        "p_value": 1.5185669321220038e-13,
        "q_value": 7.558429233489057e-12,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7914498284459114,
        "mean_model2": 0.7112682668864727,
        "mean_delta_model1": 0.2085501715540886,
        "mean_delta_model2": 0.2887317331135273,
        "mean_delta_model2 / mean_delta_model1": 1.3844713287068344,
        "test_statistic": 558.0,
        "p_value": 1.3499071480391998e-11,
        "q_value": 3.359475777593052e-10,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5477807454764843,
        "mean_model2": -0.30802406031638385,
        "mean_delta_model1": 0.4522192545235157,
        "mean_delta_model2": 0.6919759396836161,
        "mean_delta_model2 / mean_delta_model1": 1.5301779673506426,
        "test_statistic": 622.0,
        "p_value": 6.024353961675592e-11,
        "q_value": 9.995092990552948e-10,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5312661869963631,
        "mean_model2": -0.2617968723550439,
        "mean_delta_model1": 0.4687338130036369,
        "mean_delta_model2": 0.7382031276449561,
        "mean_delta_model2 / mean_delta_model1": 1.574887723406522,
        "test_statistic": 646.0,
        "p_value": 1.0428011665322124e-10,
        "q_value": 1.297594899360076e-09,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9158792644739151,
        "mean_model2": 0.8842034620046616,
        "mean_delta_model1": 0.0841207355260849,
        "mean_delta_model2": 0.11579653799533844,
        "mean_delta_model2 / mean_delta_model1": 1.3765516584128206,
        "test_statistic": 765.0,
        "p_value": 1.4354630350828956e-09,
        "q_value": 1.4289585185161656e-08,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8084689670801163,
        "mean_model2": 0.7481781435012818,
        "mean_delta_model1": 0.19153103291988371,
        "mean_delta_model2": 0.25182185649871824,
        "mean_delta_model2 / mean_delta_model1": 1.3147835766335256,
        "test_statistic": 876.0,
        "p_value": 1.4296625675252328e-08,
        "q_value": 1.1859869455241078e-07,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.873994032740593,
        "mean_model2": 0.8370275630056858,
        "mean_delta_model1": 0.12600596725940705,
        "mean_delta_model2": 0.16297243699431418,
        "mean_delta_model2 / mean_delta_model1": 1.2933707866295308,
        "test_statistic": 970.0,
        "p_value": 8.962418464486107e-08,
        "q_value": 5.576129399682877e-07,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8934113085269928,
        "mean_model2": 0.8530077224969864,
        "mean_delta_model1": 0.1065886914730072,
        "mean_delta_model2": 0.1469922775030136,
        "mean_delta_model2 / mean_delta_model1": 1.3790607190279498,
        "test_statistic": 967.0,
        "p_value": 8.465721277572113e-08,
        "q_value": 5.576129399682877e-07,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8618635135889053,
        "mean_model2": 0.8206024896539748,
        "mean_delta_model1": 0.13813648641109466,
        "mean_delta_model2": 0.17939751034602522,
        "mean_delta_model2 / mean_delta_model1": 1.29869750568389,
        "test_statistic": 1012.0,
        "p_value": 1.969584970448627e-07,
        "q_value": 1.0892556589284508e-06,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.567202514517121,
        "mean_model2": -0.4259644750133157,
        "mean_delta_model1": 0.432797485482879,
        "mean_delta_model2": 0.5740355249866843,
        "mean_delta_model2 / mean_delta_model1": 1.326337477090986,
        "test_statistic": 1148.0,
        "p_value": 2.1951641838470253e-06,
        "q_value": 1.092608615960859e-05,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6287958171218634,
        "mean_model2": 0.7070628302544355,
        "mean_delta_model1": 0.6457068299502134,
        "mean_delta_model2": 0.7207288611680269,
        "mean_delta_model2 / mean_delta_model1": 1.1161859031653698,
        "test_statistic": 1289.0,
        "p_value": 2.139905997985659e-05,
        "q_value": 9.682770170864507e-05,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8730791747570038,
        "mean_model2": 0.858596789240837,
        "mean_delta_model1": 0.12692082524299622,
        "mean_delta_model2": 0.1414032107591629,
        "mean_delta_model2 / mean_delta_model1": 1.1141056677534158,
        "test_statistic": 1770.0,
        "p_value": 0.009433383853358795,
        "q_value": 0.039127659758091744,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7185811091959476,
        "mean_model2": 0.6887165843695402,
        "mean_delta_model1": 0.7185811091959476,
        "mean_delta_model2": 0.6887165843695402,
        "mean_delta_model2 / mean_delta_model1": 0.9584395909602687,
        "test_statistic": 1796.0,
        "p_value": 0.012191967211859929,
        "q_value": 0.04667969894641254,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9531845897436142,
        "mean_model2": 0.96006869494915,
        "mean_delta_model1": 0.0468154102563858,
        "mean_delta_model2": 0.039931305050849915,
        "mean_delta_model2 / mean_delta_model1": 0.8529521546893446,
        "test_statistic": 2025.0,
        "p_value": 0.08558480118974489,
        "q_value": 0.3042749664509472,
        "significant": false
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6064081885293127,
        "mean_model2": 0.6111238902807236,
        "mean_delta_model1": 0.6064081885293127,
        "mean_delta_model2": 0.6111238902807236,
        "mean_delta_model2 / mean_delta_model1": 1.0077764480107823,
        "test_statistic": 2512.0,
        "p_value": 0.9643478501512192,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "text-embedding-3-small",
    "projection_dimension": 3200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_3200.json",
      "model2": "models/text-embedding-3-small_propositions_a->b_3200.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:51.984043",
    "comparisons": [
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9259700793027877,
        "mean_model2": 0.8603583800792695,
        "mean_delta_model1": 0.07402992069721222,
        "mean_delta_model2": 0.1396416199207306,
        "mean_delta_model2 / mean_delta_model1": 1.8862862286706348,
        "test_statistic": 202.0,
        "p_value": 1.3800602760081443e-15,
        "q_value": 6.869034030380747e-14,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8061958764493465,
        "mean_model2": 0.6904984864592553,
        "mean_delta_model1": 0.19380412355065346,
        "mean_delta_model2": 0.3095015135407448,
        "mean_delta_model2 / mean_delta_model1": 1.5969810542232976,
        "test_statistic": 303.0,
        "p_value": 2.1729294370652983e-14,
        "q_value": 5.407708093733121e-13,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9006644827127457,
        "mean_model2": 0.8496552067995071,
        "mean_delta_model1": 0.09933551728725433,
        "mean_delta_model2": 0.15034479320049285,
        "mean_delta_model2 / mean_delta_model1": 1.5135049104917027,
        "test_statistic": 505.0,
        "p_value": 3.773462101902537e-12,
        "q_value": 6.260605675691908e-11,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9277302587032318,
        "mean_model2": 0.886225745677948,
        "mean_delta_model1": 0.07226974129676819,
        "mean_delta_model2": 0.113774254322052,
        "mean_delta_model2 / mean_delta_model1": 1.5743000082821639,
        "test_statistic": 521.0,
        "p_value": 5.5634556953558915e-12,
        "q_value": 6.922807496578084e-11,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.821450163424015,
        "mean_model2": 0.7405605358630418,
        "mean_delta_model1": 0.17854983657598494,
        "mean_delta_model2": 0.2594394641369581,
        "mean_delta_model2 / mean_delta_model1": 1.4530366933522743,
        "test_statistic": 553.0,
        "p_value": 1.198630262225177e-11,
        "q_value": 1.193198906483176e-10,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5613767932355404,
        "mean_model2": -0.34318828435614707,
        "mean_delta_model1": 0.4386232067644596,
        "mean_delta_model2": 0.656811715643853,
        "mean_delta_model2 / mean_delta_model1": 1.4974395004972012,
        "test_statistic": 623.0,
        "p_value": 6.164489141649489e-11,
        "q_value": 5.113796649566662e-10,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5364108726009726,
        "mean_model2": -0.2905047669447958,
        "mean_delta_model1": 0.46358912739902736,
        "mean_delta_model2": 0.7094952330552041,
        "mean_delta_model2 / mean_delta_model1": 1.5304397603883337,
        "test_statistic": 632.0,
        "p_value": 7.577925384685864e-11,
        "q_value": 5.388276797140844e-10,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8680082071200013,
        "mean_model2": 0.8047492897510529,
        "mean_delta_model1": 0.13199179287999868,
        "mean_delta_model2": 0.19525071024894713,
        "mean_delta_model2 / mean_delta_model1": 1.4792640208051477,
        "test_statistic": 668.0,
        "p_value": 1.714319310881962e-10,
        "q_value": 1.066594507691414e-09,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8820497417449951,
        "mean_model2": 0.8269365465641022,
        "mean_delta_model1": 0.11795025825500488,
        "mean_delta_model2": 0.17306345343589782,
        "mean_delta_model2 / mean_delta_model1": 1.4672579441220035,
        "test_statistic": 786.0,
        "p_value": 2.2417156428586456e-09,
        "q_value": 1.239754306784755e-08,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8896518182754517,
        "mean_model2": 0.8533499038219452,
        "mean_delta_model1": 0.11034818172454834,
        "mean_delta_model2": 0.1466500961780548,
        "mean_delta_model2 / mean_delta_model1": 1.3289760998882925,
        "test_statistic": 988.0,
        "p_value": 1.2590728891234724e-07,
        "q_value": 6.26683824791715e-07,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5823457821644843,
        "mean_model2": -0.4664096011873335,
        "mean_delta_model1": 0.41765421783551576,
        "mean_delta_model2": 0.5335903988126666,
        "mean_delta_model2 / mean_delta_model1": 1.277588914528357,
        "test_statistic": 1255.0,
        "p_value": 1.2615363371354731e-05,
        "q_value": 5.708272431674773e-05,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7534816281683743,
        "mean_model2": 0.6920399019122123,
        "mean_delta_model1": 0.7534816281683743,
        "mean_delta_model2": 0.6920399019122123,
        "mean_delta_model2 / mean_delta_model1": 0.9184562384015658,
        "test_statistic": 1368.0,
        "p_value": 6.945593503831393e-05,
        "q_value": 0.0002880883717449535,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6569480375945568,
        "mean_model2": 0.7181554187834263,
        "mean_delta_model1": 0.6760331769287586,
        "mean_delta_model2": 0.7332379783689976,
        "mean_delta_model2 / mean_delta_model1": 1.0846183344138853,
        "test_statistic": 1678.0,
        "p_value": 0.003588225122873316,
        "q_value": 0.013738329965712582,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6581632061302662,
        "mean_model2": 0.6362429176270962,
        "mean_delta_model1": 0.6581632061302662,
        "mean_delta_model2": 0.6362429176270962,
        "mean_delta_model2 / mean_delta_model1": 0.9666947524580529,
        "test_statistic": 2119.0,
        "p_value": 0.1627265750322585,
        "q_value": 0.5785328991866715,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9561981070041656,
        "mean_model2": 0.9570850193500519,
        "mean_delta_model1": 0.04380189299583435,
        "mean_delta_model2": 0.04291498064994812,
        "mean_delta_model2 / mean_delta_model1": 0.9797517347944169,
        "test_statistic": 2403.0,
        "p_value": 0.6748685607674054,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "Qwen3-Embedding-8B",
    "model2_name": "Qwen3-Embedding-0.6B",
    "projection_dimension": 50,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/Qwen3-Embedding-8B_propositions_a->b_50.json",
      "model2": "models/Qwen3-Embedding-0.6B_propositions_a->b_50.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:54.229950",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.3699997326545417,
        "mean_model2": 0.1427609726972878,
        "mean_delta_model1": 0.6300002673454582,
        "mean_delta_model2": 1.1427609726972878,
        "mean_delta_model2 / mean_delta_model1": 1.8139055361236207,
        "test_statistic": 134.0,
        "p_value": 2.0175487407088173e-16,
        "q_value": 1.0042033090008964e-14,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.4505587732791901,
        "mean_model2": -0.1256890830025077,
        "mean_delta_model1": 0.54944122672081,
        "mean_delta_model2": 0.8743109169974923,
        "mean_delta_model2 / mean_delta_model1": 1.5912728686479871,
        "test_statistic": 354.0,
        "p_value": 8.355098643729849e-14,
        "q_value": 2.079309792068447e-12,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7729313373565674,
        "mean_model2": 0.6882376590371132,
        "mean_delta_model1": 0.22706866264343262,
        "mean_delta_model2": 0.3117623409628868,
        "mean_delta_model2 / mean_delta_model1": 1.3729870838780127,
        "test_statistic": 645.0,
        "p_value": 1.0193668947919573e-10,
        "q_value": 1.691246392518241e-09,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.25762617455795406,
        "mean_model2": 0.04021661570295691,
        "mean_delta_model1": 0.7423738254420459,
        "mean_delta_model2": 1.0402166157029569,
        "mean_delta_model2 / mean_delta_model1": 1.401203248354777,
        "test_statistic": 724.0,
        "p_value": 5.92462786523848e-10,
        "q_value": 7.372226983697521e-09,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9054526597261429,
        "mean_model2": 0.8560990500450134,
        "mean_delta_model1": 0.09454734027385711,
        "mean_delta_model2": 0.14390094995498656,
        "mean_delta_model2 / mean_delta_model1": 1.5219989217906746,
        "test_statistic": 843.0,
        "p_value": 7.326564934026958e-09,
        "q_value": 7.293366055458936e-08,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8855904614925385,
        "mean_model2": 0.8292635422945023,
        "mean_delta_model1": 0.11440953850746155,
        "mean_delta_model2": 0.17073645770549775,
        "mean_delta_model2 / mean_delta_model1": 1.4923271252804038,
        "test_statistic": 915.0,
        "p_value": 3.0998246960301505e-08,
        "q_value": 2.571482045073624e-07,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.7887888208031655,
        "mean_model2": 0.7342818307876587,
        "mean_delta_model1": 0.21121117919683458,
        "mean_delta_model2": 0.2657181692123413,
        "mean_delta_model2 / mean_delta_model1": 1.2580686790480438,
        "test_statistic": 1241.0,
        "p_value": 1.0109916629048146e-05,
        "q_value": 6.702329887519453e-05,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8561280441284179,
        "mean_model2": 0.8060388465225696,
        "mean_delta_model1": 0.14387195587158202,
        "mean_delta_model2": 0.19396115347743034,
        "mean_delta_model2 / mean_delta_model1": 1.3481512244857308,
        "test_statistic": 1245.0,
        "p_value": 1.0772541459026696e-05,
        "q_value": 6.702329887519453e-05,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8642763280868531,
        "mean_model2": 0.8263631570339203,
        "mean_delta_model1": 0.13572367191314696,
        "mean_delta_model2": 0.1736368429660797,
        "mean_delta_model2 / mean_delta_model1": 1.2793408881333121,
        "test_statistic": 1409.0,
        "p_value": 0.00012446005056465924,
        "q_value": 0.0006883115804706647,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8565536044538021,
        "mean_model2": 0.8287544886767865,
        "mean_delta_model1": 0.1434463955461979,
        "mean_delta_model2": 0.17124551132321358,
        "mean_delta_model2 / mean_delta_model1": 1.1937944531207325,
        "test_statistic": 1498.0,
        "p_value": 0.00041374199742984875,
        "q_value": 0.0020593360373823,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8964803463220596,
        "mean_model2": 0.875824300646782,
        "mean_delta_model1": 0.10351965367794037,
        "mean_delta_model2": 0.12417569935321808,
        "mean_delta_model2 / mean_delta_model1": 1.1995374302501114,
        "test_statistic": 1518.0,
        "p_value": 0.0005353945173863671,
        "q_value": 0.002422584014191756,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6730989850312471,
        "mean_model2": 0.7633413280546665,
        "mean_delta_model1": 0.6935936859995127,
        "mean_delta_model2": 0.7682545100152492,
        "mean_delta_model2 / mean_delta_model1": 1.1076434597413407,
        "test_statistic": 1612.0,
        "p_value": 0.001694152251406039,
        "q_value": 0.0070269813994496165,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5747436358034611,
        "mean_model2": 0.6319183786213398,
        "mean_delta_model1": 0.5747436358034611,
        "mean_delta_model2": 0.6319183786213398,
        "mean_delta_model2 / mean_delta_model1": 1.0994786879857337,
        "test_statistic": 1743.0,
        "p_value": 0.007171580689875418,
        "q_value": 0.027458015737414382,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9629234540462493,
        "mean_model2": 0.9631081211566925,
        "mean_delta_model1": 0.03707654595375061,
        "mean_delta_model2": 0.036891878843307496,
        "mean_delta_model2 / mean_delta_model1": 0.9950193011324877,
        "test_statistic": 2368.0,
        "p_value": 0.5893232417473824,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6925728490948677,
        "mean_model2": 0.6766897535324097,
        "mean_delta_model1": 0.6925728490948677,
        "mean_delta_model2": 0.6766897535324097,
        "mean_delta_model2 / mean_delta_model1": 0.9770665344689503,
        "test_statistic": 2309.0,
        "p_value": 0.4576765516449649,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "Qwen3-Embedding-8B",
    "model2_name": "Qwen3-Embedding-0.6B",
    "projection_dimension": 100,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/Qwen3-Embedding-8B_propositions_a->b_100.json",
      "model2": "models/Qwen3-Embedding-0.6B_propositions_a->b_100.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:54.230006",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.457677908539772,
        "mean_model2": 0.08164806893095373,
        "mean_delta_model1": 0.5423220914602279,
        "mean_delta_model2": 1.0816480689309538,
        "mean_delta_model2 / mean_delta_model1": 1.994475397486695,
        "test_statistic": 119.0,
        "p_value": 1.3105832102334348e-16,
        "q_value": 6.523222809353569e-15,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.4912728017941117,
        "mean_model2": -0.17346568254753947,
        "mean_delta_model1": 0.5087271982058883,
        "mean_delta_model2": 0.8265343174524605,
        "mean_delta_model2 / mean_delta_model1": 1.6247102973211818,
        "test_statistic": 289.0,
        "p_value": 1.493426655074876e-14,
        "q_value": 3.7166487195978373e-13,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8924103087186813,
        "mean_model2": 0.8266229256987572,
        "mean_delta_model1": 0.10758969128131866,
        "mean_delta_model2": 0.17337707430124283,
        "mean_delta_model2 / mean_delta_model1": 1.6114654874127996,
        "test_statistic": 531.0,
        "p_value": 7.080559838466176e-12,
        "q_value": 1.1747459472145632e-10,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7885506284236908,
        "mean_model2": 0.7026007708907127,
        "mean_delta_model1": 0.2114493715763092,
        "mean_delta_model2": 0.2973992291092873,
        "mean_delta_model2 / mean_delta_model1": 1.4064796073511145,
        "test_statistic": 573.0,
        "p_value": 1.924884776837515e-11,
        "q_value": 1.916162542538209e-10,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.3461312375031412,
        "mean_model2": -0.013466943204402924,
        "mean_delta_model1": 0.6538687624968589,
        "mean_delta_model2": 0.9865330567955971,
        "mean_delta_model2 / mean_delta_model1": 1.5087630934201972,
        "test_statistic": 573.0,
        "p_value": 1.924884776837515e-11,
        "q_value": 1.916162542538209e-10,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.8988889110088348,
        "mean_model2": 0.8494257479906082,
        "mean_delta_model1": 0.10111108899116517,
        "mean_delta_model2": 0.15057425200939178,
        "mean_delta_model2 / mean_delta_model1": 1.489196224783501,
        "test_statistic": 896.0,
        "p_value": 2.1308041293925754e-08,
        "q_value": 1.7676240102606268e-07,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8592907077074051,
        "mean_model2": 0.808805878907442,
        "mean_delta_model1": 0.1407092922925949,
        "mean_delta_model2": 0.1911941210925579,
        "mean_delta_model2 / mean_delta_model1": 1.3587881651411011,
        "test_statistic": 1047.0,
        "p_value": 3.73788105734042e-07,
        "q_value": 2.6578168495089484e-06,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.7919310522079468,
        "mean_model2": 0.7371746027469634,
        "mean_delta_model1": 0.20806894779205323,
        "mean_delta_model2": 0.2628253972530365,
        "mean_delta_model2 / mean_delta_model1": 1.263164927020766,
        "test_statistic": 1056.0,
        "p_value": 4.397350993271126e-07,
        "q_value": 2.7358907923643554e-06,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8690578037500382,
        "mean_model2": 0.8378557604551315,
        "mean_delta_model1": 0.13094219624996184,
        "mean_delta_model2": 0.16214423954486848,
        "mean_delta_model2 / mean_delta_model1": 1.2382886814831147,
        "test_statistic": 1265.0,
        "p_value": 1.4756685909598026e-05,
        "q_value": 8.161010504866989e-05,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8597359481453896,
        "mean_model2": 0.8320609809458256,
        "mean_delta_model1": 0.14026405185461044,
        "mean_delta_model2": 0.1679390190541744,
        "mean_delta_model2 / mean_delta_model1": 1.197306200937716,
        "test_statistic": 1441.0,
        "p_value": 0.00019366172245055554,
        "q_value": 0.0009639209134711496,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.677711226567626,
        "mean_model2": 0.7750984234362841,
        "mean_delta_model1": 0.7007491887360812,
        "mean_delta_model2": 0.776059422120452,
        "mean_delta_model2 / mean_delta_model1": 1.107471024718852,
        "test_statistic": 1559.0,
        "p_value": 0.0008956007594853812,
        "q_value": 0.004052466008839499,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8947288930416107,
        "mean_model2": 0.8770695692300796,
        "mean_delta_model1": 0.10527110695838929,
        "mean_delta_model2": 0.12293043076992034,
        "mean_delta_model2 / mean_delta_model1": 1.1677509083143895,
        "test_statistic": 1778.0,
        "p_value": 0.010216073840651391,
        "q_value": 0.04237409051877215,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6989577791467309,
        "mean_model2": 0.666611025929451,
        "mean_delta_model1": 0.6989577791467309,
        "mean_delta_model2": 0.666611025929451,
        "mean_delta_model2 / mean_delta_model1": 0.9537214490168949,
        "test_statistic": 1862.0,
        "p_value": 0.022630866659816465,
        "q_value": 0.08664738218365658,
        "significant": false
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5920118504017592,
        "mean_model2": 0.6378418245911598,
        "mean_delta_model1": 0.5920118504017592,
        "mean_delta_model2": 0.6378418245911598,
        "mean_delta_model2 / mean_delta_model1": 1.0774139473024043,
        "test_statistic": 1944.0,
        "p_value": 0.045752803008419725,
        "q_value": 0.1626624401400354,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9607488620281219,
        "mean_model2": 0.9625836348533631,
        "mean_delta_model1": 0.03925113797187805,
        "mean_delta_model2": 0.03741636514663696,
        "mean_delta_model2 / mean_delta_model1": 0.9532555507930589,
        "test_statistic": 2169.0,
        "p_value": 0.22093576367931422,
        "q_value": 0.7331154566818896,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "Qwen3-Embedding-8B",
    "model2_name": "Qwen3-Embedding-0.6B",
    "projection_dimension": 200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/Qwen3-Embedding-8B_propositions_a->b_200.json",
      "model2": "models/Qwen3-Embedding-0.6B_propositions_a->b_200.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:54.230030",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4617021084576845,
        "mean_model2": 0.015963473990559578,
        "mean_delta_model1": 0.5382978915423154,
        "mean_delta_model2": 1.0159634739905596,
        "mean_delta_model2 / mean_delta_model1": 1.8873629080724257,
        "test_statistic": 128.0,
        "p_value": 1.6983101606773517e-16,
        "q_value": 8.453073021982469e-15,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5458274413645268,
        "mean_model2": -0.20476262836717068,
        "mean_delta_model1": 0.45417255863547323,
        "mean_delta_model2": 0.7952373716328293,
        "mean_delta_model2 / mean_delta_model1": 1.7509586532970183,
        "test_statistic": 292.0,
        "p_value": 1.6186995407842905e-14,
        "q_value": 4.028411810667668e-13,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7859988814592361,
        "mean_model2": 0.6955556216835975,
        "mean_delta_model1": 0.21400111854076387,
        "mean_delta_model2": 0.3044443783164024,
        "mean_delta_model2 / mean_delta_model1": 1.422629846013681,
        "test_statistic": 518.0,
        "p_value": 5.174036778840668e-12,
        "q_value": 8.584319425791128e-11,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.3482173290289938,
        "mean_model2": -0.02520906524732709,
        "mean_delta_model1": 0.6517826709710062,
        "mean_delta_model2": 0.9747909347526729,
        "mean_delta_model2 / mean_delta_model1": 1.4955766364583745,
        "test_statistic": 535.0,
        "p_value": 7.794993252575682e-12,
        "q_value": 9.699589729770375e-11,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8909852081537246,
        "mean_model2": 0.8348049122095108,
        "mean_delta_model1": 0.10901479184627533,
        "mean_delta_model2": 0.1651950877904892,
        "mean_delta_model2 / mean_delta_model1": 1.515345624137275,
        "test_statistic": 580.0,
        "p_value": 2.269503677346154e-11,
        "q_value": 2.2592198707229483e-10,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9032682448625564,
        "mean_model2": 0.8528422093391419,
        "mean_delta_model1": 0.09673175513744355,
        "mean_delta_model2": 0.14715779066085816,
        "mean_delta_model2 / mean_delta_model1": 1.5212976385238293,
        "test_statistic": 756.0,
        "p_value": 1.1839921708264648e-09,
        "q_value": 9.821892872481277e-09,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.7872094696760178,
        "mean_model2": 0.7237710979580879,
        "mean_delta_model1": 0.21279053032398224,
        "mean_delta_model2": 0.2762289020419121,
        "mean_delta_model2 / mean_delta_model1": 1.298125915760172,
        "test_statistic": 783.0,
        "p_value": 2.104070693507049e-09,
        "q_value": 1.4960975097853984e-08,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8780664199590683,
        "mean_model2": 0.8301441931724548,
        "mean_delta_model1": 0.1219335800409317,
        "mean_delta_model2": 0.16985580682754517,
        "mean_delta_model2 / mean_delta_model1": 1.3930191073740845,
        "test_statistic": 941.0,
        "p_value": 5.1426475695662915e-08,
        "q_value": 3.1995904251175283e-07,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8541243815422058,
        "mean_model2": 0.8078458526730538,
        "mean_delta_model1": 0.14587561845779418,
        "mean_delta_model2": 0.19215414732694625,
        "mean_delta_model2 / mean_delta_model1": 1.3172464964221675,
        "test_statistic": 1267.0,
        "p_value": 1.52246587827266e-05,
        "q_value": 8.41981736414364e-05,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.901691021323204,
        "mean_model2": 0.8769876050949097,
        "mean_delta_model1": 0.09830897867679596,
        "mean_delta_model2": 0.12301239490509033,
        "mean_delta_model2 / mean_delta_model1": 1.2512834184709638,
        "test_statistic": 1370.0,
        "p_value": 7.149194220827567e-05,
        "q_value": 0.00035583995312662796,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8631476387381554,
        "mean_model2": 0.8351750552654267,
        "mean_delta_model1": 0.13685236126184464,
        "mean_delta_model2": 0.16482494473457338,
        "mean_delta_model2 / mean_delta_model1": 1.204399713785046,
        "test_statistic": 1391.0,
        "p_value": 9.656669707689486e-05,
        "q_value": 0.0004369505645739737,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6991704747080802,
        "mean_model2": 0.6592727693915367,
        "mean_delta_model1": 0.6991704747080802,
        "mean_delta_model2": 0.6592727693915367,
        "mean_delta_model2 / mean_delta_model1": 0.9429356548083616,
        "test_statistic": 1674.0,
        "p_value": 0.003433349985234561,
        "q_value": 0.014240801831134572,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9634962213039399,
        "mean_model2": 0.9639133083820343,
        "mean_delta_model1": 0.03650377869606018,
        "mean_delta_model2": 0.0360866916179657,
        "mean_delta_model2 / mean_delta_model1": 0.9885741396372343,
        "test_statistic": 2043.0,
        "p_value": 0.09746437944586561,
        "q_value": 0.34898713175287666,
        "significant": false
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6917855260893703,
        "mean_model2": 0.751676039248705,
        "mean_delta_model1": 0.7169483824446797,
        "mean_delta_model2": 0.7587131388485432,
        "mean_delta_model2 / mean_delta_model1": 1.0582535053101763,
        "test_statistic": 2044.0,
        "p_value": 0.09816119491270844,
        "q_value": 0.34898713175287666,
        "significant": false
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6082331572473049,
        "mean_model2": 0.6422290320694447,
        "mean_delta_model1": 0.6082331572473049,
        "mean_delta_model2": 0.6422290320694447,
        "mean_delta_model2 / mean_delta_model1": 1.0558928338862612,
        "test_statistic": 2185.0,
        "p_value": 0.24239136462501465,
        "q_value": 0.8043100538070642,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "Qwen3-Embedding-8B",
    "model2_name": "Qwen3-Embedding-0.6B",
    "projection_dimension": 400,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 11,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/Qwen3-Embedding-8B_propositions_a->b_400.json",
      "model2": "models/Qwen3-Embedding-0.6B_propositions_a->b_400.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:54.230053",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.49202443586662414,
        "mean_model2": -0.0014277806971222163,
        "mean_delta_model1": 0.5079755641333759,
        "mean_delta_model2": 0.9985722193028778,
        "mean_delta_model2 / mean_delta_model1": 1.9657879036100427,
        "test_statistic": 99.0,
        "p_value": 7.343095847008848e-17,
        "q_value": 3.654911030940626e-15,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5418949663266539,
        "mean_model2": -0.2509811049606651,
        "mean_delta_model1": 0.45810503367334604,
        "mean_delta_model2": 0.7490188950393349,
        "mean_delta_model2 / mean_delta_model1": 1.6350374695367929,
        "test_statistic": 224.0,
        "p_value": 2.5413138381742492e-15,
        "q_value": 6.324495944042886e-14,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9038991159200669,
        "mean_model2": 0.8369872379302978,
        "mean_delta_model1": 0.09610088407993317,
        "mean_delta_model2": 0.16301276206970214,
        "mean_delta_model2 / mean_delta_model1": 1.696267038855898,
        "test_statistic": 336.0,
        "p_value": 5.212123976394902e-14,
        "q_value": 6.658565520257006e-13,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.797529898583889,
        "mean_model2": 0.7016450873017311,
        "mean_delta_model1": 0.202470101416111,
        "mean_delta_model2": 0.2983549126982689,
        "mean_delta_model2 / mean_delta_model1": 1.473575163006898,
        "test_statistic": 337.0,
        "p_value": 5.3510998658976466e-14,
        "q_value": 6.658565520257006e-13,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.39684177428483963,
        "mean_model2": -0.05285562531091273,
        "mean_delta_model1": 0.6031582257151604,
        "mean_delta_model2": 0.9471443746890873,
        "mean_delta_model2 / mean_delta_model1": 1.5703083110009235,
        "test_statistic": 487.0,
        "p_value": 2.4295048332891192e-12,
        "q_value": 2.4184960131029785e-11,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9055386972427368,
        "mean_model2": 0.8557715165615082,
        "mean_delta_model1": 0.09446130275726318,
        "mean_delta_model2": 0.14422848343849182,
        "mean_delta_model2 / mean_delta_model1": 1.526852575907355,
        "test_statistic": 705.0,
        "p_value": 3.905682059423198e-10,
        "q_value": 3.239986861978095e-09,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8001650121808052,
        "mean_model2": 0.7403802424669266,
        "mean_delta_model1": 0.1998349878191948,
        "mean_delta_model2": 0.2596197575330734,
        "mean_delta_model2 / mean_delta_model1": 1.2991706826032399,
        "test_statistic": 852.0,
        "p_value": 8.80282944882976e-09,
        "q_value": 6.259243692762195e-08,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8687292504310608,
        "mean_model2": 0.8146375387907028,
        "mean_delta_model1": 0.13127074956893922,
        "mean_delta_model2": 0.18536246120929717,
        "mean_delta_model2 / mean_delta_model1": 1.412062183068062,
        "test_statistic": 931.0,
        "p_value": 4.236707346692419e-08,
        "q_value": 2.63594346630395e-07,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8780055099725723,
        "mean_model2": 0.8398907446861267,
        "mean_delta_model1": 0.12199449002742767,
        "mean_delta_model2": 0.1601092553138733,
        "mean_delta_model2 / mean_delta_model1": 1.3124302194129946,
        "test_statistic": 1034.0,
        "p_value": 2.951117105348224e-07,
        "q_value": 1.6320803902300832e-06,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8711946055293083,
        "mean_model2": 0.84286565721035,
        "mean_delta_model1": 0.12880539447069167,
        "mean_delta_model2": 0.15713434278964997,
        "mean_delta_model2 / mean_delta_model1": 1.2199360394444059,
        "test_statistic": 1379.0,
        "p_value": 8.137289737126724e-05,
        "q_value": 0.0004050208609805795,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9000007295608521,
        "mean_model2": 0.8819067072868347,
        "mean_delta_model1": 0.09999927043914796,
        "mean_delta_model2": 0.11809329271316528,
        "mean_delta_model2 / mean_delta_model1": 1.1809415428188348,
        "test_statistic": 1584.0,
        "p_value": 0.0012144533585449683,
        "q_value": 0.005495228652611237,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9651655703783035,
        "mean_model2": 0.9636913245916366,
        "mean_delta_model1": 0.03483442962169647,
        "mean_delta_model2": 0.03630867540836334,
        "mean_delta_model2 / mean_delta_model1": 1.0423215135909285,
        "test_statistic": 1818.0,
        "p_value": 0.015061551058980148,
        "q_value": 0.062472094258633475,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.694097481071949,
        "mean_model2": 0.6702032428979874,
        "mean_delta_model1": 0.694097481071949,
        "mean_delta_model2": 0.6702032428979874,
        "mean_delta_model2 / mean_delta_model1": 0.9655750974098625,
        "test_statistic": 2026.0,
        "p_value": 0.08621255219588186,
        "q_value": 0.3300842195499728,
        "significant": false
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6963952335715294,
        "mean_model2": 0.7618959090858698,
        "mean_delta_model1": 0.7261650481820107,
        "mean_delta_model2": 0.7687253025919198,
        "mean_delta_model2 / mean_delta_model1": 1.058609615701638,
        "test_statistic": 2083.0,
        "p_value": 0.1285764395425518,
        "q_value": 0.4571207887888395,
        "significant": false
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6312813241779804,
        "mean_model2": 0.6455534209311008,
        "mean_delta_model1": 0.6312813241779804,
        "mean_delta_model2": 0.6455534209311008,
        "mean_delta_model2 / mean_delta_model1": 1.022608140311619,
        "test_statistic": 2310.0,
        "p_value": 0.45976137200079736,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "Qwen3-Embedding-8B",
    "model2_name": "Qwen3-Embedding-0.6B",
    "projection_dimension": 800,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/Qwen3-Embedding-8B_propositions_a->b_800.json",
      "model2": "models/Qwen3-Embedding-0.6B_propositions_a->b_800.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:54.230073",
    "comparisons": [
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8036234161257744,
        "mean_model2": 0.6982047255337238,
        "mean_delta_model1": 0.19637658387422563,
        "mean_delta_model2": 0.3017952744662762,
        "mean_delta_model2 / mean_delta_model1": 1.5368190469163505,
        "test_statistic": 198.0,
        "p_value": 1.234309221920007e-15,
        "q_value": 6.14358097017733e-14,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4953162237256765,
        "mean_model2": -0.09133250955492259,
        "mean_delta_model1": 0.5046837762743235,
        "mean_delta_model2": 0.9086674904450774,
        "mean_delta_model2 / mean_delta_model1": 1.8004689929861477,
        "test_statistic": 298.0,
        "p_value": 1.901053991779918e-14,
        "q_value": 4.731099354913377e-13,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9031734150648117,
        "mean_model2": 0.8380553317070008,
        "mean_delta_model1": 0.0968265849351883,
        "mean_delta_model2": 0.16194466829299928,
        "mean_delta_model2 / mean_delta_model1": 1.672522772556714,
        "test_statistic": 428.0,
        "p_value": 5.588321712651697e-13,
        "q_value": 9.271665565205981e-12,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5439402923360467,
        "mean_model2": -0.2676879975013435,
        "mean_delta_model1": 0.4560597076639533,
        "mean_delta_model2": 0.7323120024986565,
        "mean_delta_model2 / mean_delta_model1": 1.6057371221187975,
        "test_statistic": 475.0,
        "p_value": 1.8077109641911269e-12,
        "q_value": 2.249399599783851e-11,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9151711422204971,
        "mean_model2": 0.8574834978580474,
        "mean_delta_model1": 0.08482885777950287,
        "mean_delta_model2": 0.14251650214195252,
        "mean_delta_model2 / mean_delta_model1": 1.6800474021753087,
        "test_statistic": 503.0,
        "p_value": 3.593963932707817e-12,
        "q_value": 3.5776785966391126e-11,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8687219160795212,
        "mean_model2": 0.8035453163087368,
        "mean_delta_model1": 0.13127808392047882,
        "mean_delta_model2": 0.1964546836912632,
        "mean_delta_model2 / mean_delta_model1": 1.496477384681093,
        "test_statistic": 642.0,
        "p_value": 9.521106397643388e-11,
        "q_value": 7.898302824069586e-10,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8897100728750229,
        "mean_model2": 0.8435353499650955,
        "mean_delta_model1": 0.11028992712497711,
        "mean_delta_model2": 0.15646465003490448,
        "mean_delta_model2 / mean_delta_model1": 1.418666727901666,
        "test_statistic": 708.0,
        "p_value": 4.172445151568837e-10,
        "q_value": 2.9668132445557406e-09,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8003408327698708,
        "mean_model2": 0.7238049113750458,
        "mean_delta_model1": 0.19965916723012925,
        "mean_delta_model2": 0.27619508862495423,
        "mean_delta_model2 / mean_delta_model1": 1.3833328689917297,
        "test_statistic": 728.0,
        "p_value": 6.464396115495318e-10,
        "q_value": 4.021939990153771e-09,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.3820586286112666,
        "mean_model2": -0.12240421384107321,
        "mean_delta_model1": 0.6179413713887334,
        "mean_delta_model2": 0.8775957861589267,
        "mean_delta_model2 / mean_delta_model1": 1.4201926376715284,
        "test_statistic": 741.0,
        "p_value": 8.571186883883589e-10,
        "q_value": 4.740193470747766e-09,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9127302134037018,
        "mean_model2": 0.8831677341461182,
        "mean_delta_model1": 0.08726978659629822,
        "mean_delta_model2": 0.11683226585388183,
        "mean_delta_model2 / mean_delta_model1": 1.3387481556971927,
        "test_statistic": 957.0,
        "p_value": 6.995236968690826e-08,
        "q_value": 3.4817697186025795e-07,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8741870445013046,
        "mean_model2": 0.8342856545746327,
        "mean_delta_model1": 0.12581295549869537,
        "mean_delta_model2": 0.16571434542536737,
        "mean_delta_model2 / mean_delta_model1": 1.3171484984873896,
        "test_statistic": 970.0,
        "p_value": 8.962418464486107e-08,
        "q_value": 4.055366836133002e-07,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9664445447921753,
        "mean_model2": 0.9627309864759446,
        "mean_delta_model1": 0.03355545520782471,
        "mean_delta_model2": 0.03726901352405548,
        "mean_delta_model2 / mean_delta_model1": 1.1106692873999462,
        "test_statistic": 1599.0,
        "p_value": 0.001453108200714456,
        "q_value": 0.006027182202386904,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7252186477556825,
        "mean_model2": 0.6812296283245086,
        "mean_delta_model1": 0.7252186477556825,
        "mean_delta_model2": 0.6812296283245086,
        "mean_delta_model2 / mean_delta_model1": 0.9393437833303023,
        "test_statistic": 1622.0,
        "p_value": 0.0019040300514065042,
        "q_value": 0.007290008908334253,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6469261048734188,
        "mean_model2": 0.655798152089119,
        "mean_delta_model1": 0.6469261048734188,
        "mean_delta_model2": 0.655798152089119,
        "mean_delta_model2 / mean_delta_model1": 1.0137141586169816,
        "test_statistic": 2474.0,
        "p_value": 0.8608010376424908,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6988088046759366,
        "mean_model2": 0.7572041267901659,
        "mean_delta_model1": 0.7284457182139158,
        "mean_delta_model2": 0.764925991371274,
        "mean_delta_model2 / mean_delta_model1": 1.050079604073732,
        "test_statistic": 2252.0,
        "p_value": 0.3479039590114893,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "Qwen3-Embedding-8B",
    "model2_name": "Qwen3-Embedding-0.6B",
    "projection_dimension": 1600,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/Qwen3-Embedding-8B_propositions_a->b_1600.json",
      "model2": "models/Qwen3-Embedding-0.6B_propositions_a->b_1600.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:54.230093",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.463130738902837,
        "mean_model2": -0.07421716650016606,
        "mean_delta_model1": 0.5368692610971629,
        "mean_delta_model2": 0.9257828334998339,
        "mean_delta_model2 / mean_delta_model1": 1.724410206700743,
        "test_statistic": 338.0,
        "p_value": 5.493717497178969e-14,
        "q_value": 2.7344119019623016e-12,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5521162935346365,
        "mean_model2": -0.26538500119466335,
        "mean_delta_model1": 0.4478837064653635,
        "mean_delta_model2": 0.7346149988053366,
        "mean_delta_model2 / mean_delta_model1": 1.6401913894185098,
        "test_statistic": 446.0,
        "p_value": 8.787363278671954e-13,
        "q_value": 2.1868862703993797e-11,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9095322048664093,
        "mean_model2": 0.8507518541812896,
        "mean_delta_model1": 0.0904677951335907,
        "mean_delta_model2": 0.14924814581871032,
        "mean_delta_model2 / mean_delta_model1": 1.6497378497875481,
        "test_statistic": 497.0,
        "p_value": 3.1042302951154134e-12,
        "q_value": 5.1502734834558797e-11,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8061038780212403,
        "mean_model2": 0.7215307195484638,
        "mean_delta_model1": 0.19389612197875977,
        "mean_delta_model2": 0.2784692804515362,
        "mean_delta_model2 / mean_delta_model1": 1.4361776687934014,
        "test_statistic": 553.0,
        "p_value": 1.198630262225177e-11,
        "q_value": 1.49149863310397e-10,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4436987901153043,
        "mean_model2": -0.14192858427762986,
        "mean_delta_model1": 0.5563012098846957,
        "mean_delta_model2": 0.8580714157223701,
        "mean_delta_model2 / mean_delta_model1": 1.542458295030892,
        "test_statistic": 570.0,
        "p_value": 1.7933917874272298e-11,
        "q_value": 1.7852653875779407e-10,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9109935265779495,
        "mean_model2": 0.8652160334587097,
        "mean_delta_model1": 0.08900647342205048,
        "mean_delta_model2": 0.1347839665412903,
        "mean_delta_model2 / mean_delta_model1": 1.5143164464248833,
        "test_statistic": 783.0,
        "p_value": 2.104070693507049e-09,
        "q_value": 1.7454470947496315e-08,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8721264642477036,
        "mean_model2": 0.8204535140097141,
        "mean_delta_model1": 0.12787353575229646,
        "mean_delta_model2": 0.1795464859902859,
        "mean_delta_model2 / mean_delta_model1": 1.40409416955581,
        "test_statistic": 982.0,
        "p_value": 1.1246611482457751e-07,
        "q_value": 7.996892635001446e-07,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8919872051477432,
        "mean_model2": 0.8560672754049301,
        "mean_delta_model1": 0.10801279485225677,
        "mean_delta_model2": 0.1439327245950699,
        "mean_delta_model2 / mean_delta_model1": 1.3325525442790875,
        "test_statistic": 1049.0,
        "p_value": 3.8756230074582494e-07,
        "q_value": 2.411288368157496e-06,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8044929429888725,
        "mean_model2": 0.7515296319127083,
        "mean_delta_model1": 0.19550705701112747,
        "mean_delta_model2": 0.24847036808729173,
        "mean_delta_model2 / mean_delta_model1": 1.2709022982896714,
        "test_statistic": 1099.0,
        "p_value": 9.435936406956081e-07,
        "q_value": 5.21843296063778e-06,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8741213086247445,
        "mean_model2": 0.8509050613641739,
        "mean_delta_model1": 0.1258786913752556,
        "mean_delta_model2": 0.14909493863582612,
        "mean_delta_model2 / mean_delta_model1": 1.1844334970988921,
        "test_statistic": 1425.0,
        "p_value": 0.00015547504127876006,
        "q_value": 0.0007738526845419843,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9124730902910233,
        "mean_model2": 0.8973850232362747,
        "mean_delta_model1": 0.08752690970897675,
        "mean_delta_model2": 0.10261497676372527,
        "mean_delta_model2 / mean_delta_model1": 1.1723820377631942,
        "test_statistic": 1525.0,
        "p_value": 0.0005853250706648853,
        "q_value": 0.0026485126635600437,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9677547788619996,
        "mean_model2": 0.9657454246282577,
        "mean_delta_model1": 0.03224522113800049,
        "mean_delta_model2": 0.03425457537174225,
        "mean_delta_model2 / mean_delta_model1": 1.0623147915513522,
        "test_statistic": 1730.0,
        "p_value": 0.006267116512243234,
        "q_value": 0.025994659643587083,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7322477444261313,
        "mean_model2": 0.7089395400881767,
        "mean_delta_model1": 0.7322477444261313,
        "mean_delta_model2": 0.7089395400881767,
        "mean_delta_model2 / mean_delta_model1": 0.9681689639669407,
        "test_statistic": 1953.0,
        "p_value": 0.04921529101222413,
        "q_value": 0.1884318525541897,
        "significant": false
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.7113231612928211,
        "mean_model2": 0.7729512116312981,
        "mean_delta_model1": 0.7399232032708823,
        "mean_delta_model2": 0.7821194621920585,
        "mean_delta_model2 / mean_delta_model1": 1.0570278898332215,
        "test_statistic": 2107.0,
        "p_value": 0.15065555303262182,
        "q_value": 0.5356174543540649,
        "significant": false
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.669546710960567,
        "mean_model2": 0.6670332496613264,
        "mean_delta_model1": 0.669546710960567,
        "mean_delta_model2": 0.6670332496613264,
        "mean_delta_model2 / mean_delta_model1": 0.9962460254705238,
        "test_statistic": 2402.0,
        "p_value": 0.6723580402630482,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "Qwen3-Embedding-8B",
    "model2_name": "Qwen3-Embedding-0.6B",
    "projection_dimension": 3200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/Qwen3-Embedding-8B_propositions_a->b_3200.json",
      "model2": "models/Qwen3-Embedding-0.6B_propositions_a->b_3200.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:54.230113",
    "comparisons": [
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5418275015987456,
        "mean_model2": -0.25706277819816026,
        "mean_delta_model1": 0.4581724984012544,
        "mean_delta_model2": 0.7429372218018397,
        "mean_delta_model2 / mean_delta_model1": 1.6215229512776137,
        "test_statistic": 267.0,
        "p_value": 8.246248576180102e-15,
        "q_value": 4.104441166628118e-13,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8073183217644692,
        "mean_model2": 0.6961085703969002,
        "mean_delta_model1": 0.19268167823553084,
        "mean_delta_model2": 0.30389142960309984,
        "mean_delta_model2 / mean_delta_model1": 1.577168272489448,
        "test_statistic": 356.0,
        "p_value": 8.80280981656619e-14,
        "q_value": 2.1907304066408044e-12,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4659846978774294,
        "mean_model2": -0.07125892631709575,
        "mean_delta_model1": 0.5340153021225705,
        "mean_delta_model2": 0.9287410736829043,
        "mean_delta_model2 / mean_delta_model1": 1.7391656568480387,
        "test_statistic": 384.0,
        "p_value": 1.8191840040308715e-13,
        "q_value": 3.018234553096824e-12,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.911774223446846,
        "mean_model2": 0.8375335651636123,
        "mean_delta_model1": 0.08822577655315399,
        "mean_delta_model2": 0.16246643483638765,
        "mean_delta_model2 / mean_delta_model1": 1.841484894593196,
        "test_statistic": 435.0,
        "p_value": 6.666874861842529e-13,
        "q_value": 8.295831547798283e-12,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.920101917386055,
        "mean_model2": 0.8591990208625794,
        "mean_delta_model1": 0.079898082613945,
        "mean_delta_model2": 0.14080097913742065,
        "mean_delta_model2 / mean_delta_model1": 1.7622572974341435,
        "test_statistic": 559.0,
        "p_value": 1.3823325189872664e-11,
        "q_value": 1.3760687528360447e-10,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.9030971366167069,
        "mean_model2": 0.8493925815820694,
        "mean_delta_model1": 0.09690286338329315,
        "mean_delta_model2": 0.1506074184179306,
        "mean_delta_model2 / mean_delta_model1": 1.5542101973004911,
        "test_statistic": 628.0,
        "p_value": 6.914403547593268e-11,
        "q_value": 5.735893580627348e-10,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.3853472794312984,
        "mean_model2": -0.10422756800428033,
        "mean_delta_model1": 0.6146527205687016,
        "mean_delta_model2": 0.8957724319957197,
        "mean_delta_model2 / mean_delta_model1": 1.4573634867619472,
        "test_statistic": 755.0,
        "p_value": 1.158857605214751e-09,
        "q_value": 8.240046224244655e-09,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8690891543030739,
        "mean_model2": 0.8115218352619559,
        "mean_delta_model1": 0.13091084569692613,
        "mean_delta_model2": 0.18847816473804413,
        "mean_delta_model2 / mean_delta_model1": 1.4397444591748576,
        "test_statistic": 832.0,
        "p_value": 5.846718763075831e-09,
        "q_value": 3.637640946542659e-08,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9200412833690643,
        "mean_model2": 0.8882424736022949,
        "mean_delta_model1": 0.07995871663093566,
        "mean_delta_model2": 0.11175752639770507,
        "mean_delta_model2 / mean_delta_model1": 1.397690347051751,
        "test_statistic": 897.0,
        "p_value": 2.1734845913738727e-08,
        "q_value": 1.202019931238876e-07,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8774252745509148,
        "mean_model2": 0.8363518303632737,
        "mean_delta_model1": 0.12257472544908524,
        "mean_delta_model2": 0.16364816963672638,
        "mean_delta_model2 / mean_delta_model1": 1.3350890164114797,
        "test_statistic": 925.0,
        "p_value": 3.76958864516799e-08,
        "q_value": 1.876253750241484e-07,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.798076333552599,
        "mean_model2": 0.7277981680631638,
        "mean_delta_model1": 0.20192366644740103,
        "mean_delta_model2": 0.2722018319368362,
        "mean_delta_model2 / mean_delta_model1": 1.3480432320088735,
        "test_statistic": 984.0,
        "p_value": 1.1678435854851457e-07,
        "q_value": 5.284326061245428e-07,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7700016316771507,
        "mean_model2": 0.686520853638649,
        "mean_delta_model1": 0.7700016316771507,
        "mean_delta_model2": 0.686520853638649,
        "mean_delta_model2 / mean_delta_model1": 0.8915836348857195,
        "test_statistic": 1086.0,
        "p_value": 7.507645897310535e-07,
        "q_value": 3.114011035919065e-06,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9651991325616837,
        "mean_model2": 0.965185661315918,
        "mean_delta_model1": 0.03480086743831635,
        "mean_delta_model2": 0.034814338684082034,
        "mean_delta_model2 / mean_delta_model1": 1.0003870951144986,
        "test_statistic": 1628.0,
        "p_value": 0.0020411585418374633,
        "q_value": 0.007815036292271637,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6819554096460343,
        "mean_model2": 0.6463229474425316,
        "mean_delta_model1": 0.6833671541512012,
        "mean_delta_model2": 0.6463229474425316,
        "mean_delta_model2 / mean_delta_model1": 0.945791648773811,
        "test_statistic": 1853.0,
        "p_value": 0.02085748626037175,
        "q_value": 0.07415348103754384,
        "significant": false
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.7078965375805274,
        "mean_model2": 0.7718676560744643,
        "mean_delta_model1": 0.743665130934678,
        "mean_delta_model2": 0.7809057194367051,
        "mean_delta_model2 / mean_delta_model1": 1.0500770937790522,
        "test_statistic": 2395.0,
        "p_value": 0.6548878905640467,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "gte-Qwen2-7B-instruct",
    "projection_dimension": 50,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_50.json",
      "model2": "models/gte-Qwen2-7B-instruct_propositions_a->b_50.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:56.438922",
    "comparisons": [
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8884879565238952,
        "mean_model2": 0.8180357420444488,
        "mean_delta_model1": 0.11151204347610473,
        "mean_delta_model2": 0.18196425795555116,
        "mean_delta_model2 / mean_delta_model1": 1.6317901841207243,
        "test_statistic": 491.0,
        "p_value": 2.680110604427065e-12,
        "q_value": 6.669915534502777e-11,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8196766093373299,
        "mean_model2": 0.7019154572486878,
        "mean_delta_model1": 0.18032339066267014,
        "mean_delta_model2": 0.29808454275131224,
        "mean_delta_model2 / mean_delta_model1": 1.6530553338415046,
        "test_statistic": 476.0,
        "p_value": 1.852916223731476e-12,
        "q_value": 6.669915534502777e-11,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7826671543717384,
        "mean_model2": 0.6784495875239372,
        "mean_delta_model1": 0.21733284562826158,
        "mean_delta_model2": 0.32155041247606275,
        "mean_delta_model2 / mean_delta_model1": 1.479529757899829,
        "test_statistic": 718.0,
        "p_value": 5.196491457758374e-10,
        "q_value": 8.621574309100316e-09,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8664501664042473,
        "mean_model2": 0.7919504934549332,
        "mean_delta_model1": 0.1335498335957527,
        "mean_delta_model2": 0.20804950654506682,
        "mean_delta_model2 / mean_delta_model1": 1.557841750479601,
        "test_statistic": 761.0,
        "p_value": 1.3178543058175353e-09,
        "q_value": 1.63985338740578e-08,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8897439941763878,
        "mean_model2": 0.8435811033844948,
        "mean_delta_model1": 0.11025600582361221,
        "mean_delta_model2": 0.15641889661550523,
        "mean_delta_model2 / mean_delta_model1": 1.4186882197215136,
        "test_statistic": 896.0,
        "p_value": 2.1308041293925754e-08,
        "q_value": 2.1211488123127523e-07,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6899956105649472,
        "mean_model2": 0.5642972832918167,
        "mean_delta_model1": 0.6899956105649472,
        "mean_delta_model2": 0.5642972832918167,
        "mean_delta_model2 / mean_delta_model1": 0.8178273523070494,
        "test_statistic": 926.0,
        "p_value": 3.8438124468247625e-08,
        "q_value": 3.1886624763971013e-07,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7430514541268348,
        "mean_model2": 0.6376427707076072,
        "mean_delta_model1": 0.7430514541268348,
        "mean_delta_model2": 0.6376427707076072,
        "mean_delta_model2 / mean_delta_model1": 0.8581408019137866,
        "test_statistic": 990.0,
        "p_value": 1.3072362460785298e-07,
        "q_value": 9.295091170152019e-07,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.8943710827827454,
        "mean_model2": 0.8579766547679901,
        "mean_delta_model1": 0.10562891721725463,
        "mean_delta_model2": 0.14202334523200988,
        "mean_delta_model2 / mean_delta_model1": 1.3445498540887264,
        "test_statistic": 1181.0,
        "p_value": 3.8169491079429605e-06,
        "q_value": 2.3747833491854763e-05,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4064640519209206,
        "mean_model2": -0.21512494822964073,
        "mean_delta_model1": 0.5935359480790794,
        "mean_delta_model2": 0.7848750517703593,
        "mean_delta_model2 / mean_delta_model1": 1.322371550216175,
        "test_statistic": 1240.0,
        "p_value": 9.950447274883e-06,
        "q_value": 5.5029771071855337e-05,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8751442736387253,
        "mean_model2": 0.8431344026327133,
        "mean_delta_model1": 0.12485572636127472,
        "mean_delta_model2": 0.15686559736728667,
        "mean_delta_model2 / mean_delta_model1": 1.2563748731346946,
        "test_statistic": 1265.0,
        "p_value": 1.4756685909598026e-05,
        "q_value": 7.34490945438029e-05,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9633279949426651,
        "mean_model2": 0.9504662388563156,
        "mean_delta_model1": 0.0366720050573349,
        "mean_delta_model2": 0.049533761143684386,
        "mean_delta_model2 / mean_delta_model1": 1.3507241032019044,
        "test_statistic": 1297.0,
        "p_value": 2.4186175752933408e-05,
        "q_value": 0.00010943900402552145,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.36652338087558745,
        "mean_model2": -0.21314950739964844,
        "mean_delta_model1": 0.6334766191244126,
        "mean_delta_model2": 0.7868504926003516,
        "mean_delta_model2 / mean_delta_model1": 1.2421144977504164,
        "test_statistic": 1510.0,
        "p_value": 0.0004832018361438227,
        "q_value": 0.002004217927842397,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.4720320870727301,
        "mean_model2": -0.35172614265233276,
        "mean_delta_model1": 0.52796791292727,
        "mean_delta_model2": 0.6482738573476672,
        "mean_delta_model2 / mean_delta_model1": 1.2278660151018872,
        "test_statistic": 1573.0,
        "p_value": 0.0010630513584870235,
        "q_value": 0.004070132121949742,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8889417380094529,
        "mean_model2": 0.8737178963422775,
        "mean_delta_model1": 0.11105826199054718,
        "mean_delta_model2": 0.12628210365772247,
        "mean_delta_model2 / mean_delta_model1": 1.1370797759150155,
        "test_statistic": 1806.0,
        "p_value": 0.013430106131661013,
        "q_value": 0.04774732237309291,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6477309703081846,
        "mean_model2": 0.7059115336835384,
        "mean_delta_model1": 0.6618481845408678,
        "mean_delta_model2": 0.7125406704843045,
        "mean_delta_model2 / mean_delta_model1": 1.0765923169806724,
        "test_statistic": 2012.0,
        "p_value": 0.07775544419099598,
        "q_value": 0.2580103692959618,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "gte-Qwen2-7B-instruct",
    "projection_dimension": 100,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_100.json",
      "model2": "models/gte-Qwen2-7B-instruct_propositions_a->b_100.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:56.438999",
    "comparisons": [
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8183139035105705,
        "mean_model2": 0.7324193167686462,
        "mean_delta_model1": 0.18168609648942946,
        "mean_delta_model2": 0.26758068323135376,
        "mean_delta_model2 / mean_delta_model1": 1.4727636753807503,
        "test_statistic": 536.0,
        "p_value": 7.984361437594533e-12,
        "q_value": 3.974090942196856e-10,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8010369357466698,
        "mean_model2": 0.7041060623526573,
        "mean_delta_model1": 0.19896306425333024,
        "mean_delta_model2": 0.2958939376473427,
        "mean_delta_model2 / mean_delta_model1": 1.4871802399996963,
        "test_statistic": 607.0,
        "p_value": 4.260963777218883e-11,
        "q_value": 1.0604140158499668e-09,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8928372424840927,
        "mean_model2": 0.8282537055015564,
        "mean_delta_model1": 0.10716275751590729,
        "mean_delta_model2": 0.1717462944984436,
        "mean_delta_model2 / mean_delta_model1": 1.6026677409168901,
        "test_statistic": 632.0,
        "p_value": 7.577925384685864e-11,
        "q_value": 1.2572645859995304e-09,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9057852590084076,
        "mean_model2": 0.8616251993179321,
        "mean_delta_model1": 0.09421474099159241,
        "mean_delta_model2": 0.13837480068206787,
        "mean_delta_model2 / mean_delta_model1": 1.4687170948590331,
        "test_statistic": 708.0,
        "p_value": 4.172445151568837e-10,
        "q_value": 5.1919231779725465e-09,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.892257282435894,
        "mean_model2": 0.8467568266391754,
        "mean_delta_model1": 0.10774271756410599,
        "mean_delta_model2": 0.15324317336082457,
        "mean_delta_model2 / mean_delta_model1": 1.422306554219279,
        "test_statistic": 782.0,
        "p_value": 2.060045950719975e-09,
        "q_value": 2.0507112603189025e-08,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8726610094308853,
        "mean_model2": 0.8064506912231445,
        "mean_delta_model1": 0.12733899056911469,
        "mean_delta_model2": 0.19354930877685547,
        "mean_delta_model2 / mean_delta_model1": 1.5199532202338637,
        "test_statistic": 873.0,
        "p_value": 1.3460606544893096e-08,
        "q_value": 1.1166343725928055e-07,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.67623041421175,
        "mean_model2": 0.5578045524656773,
        "mean_delta_model1": 0.67623041421175,
        "mean_delta_model2": 0.5578045524656773,
        "mean_delta_model2 / mean_delta_model1": 0.8248735057500834,
        "test_statistic": 919.0,
        "p_value": 3.352584783552196e-08,
        "q_value": 2.3838522923659812e-07,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4656204093620181,
        "mean_model2": -0.25003564938902856,
        "mean_delta_model1": 0.5343795906379819,
        "mean_delta_model2": 0.7499643506109714,
        "mean_delta_model2 / mean_delta_model1": 1.4034300032222573,
        "test_statistic": 988.0,
        "p_value": 1.2590728891234724e-07,
        "q_value": 7.833547809896437e-07,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7314479088783264,
        "mean_model2": 0.6264776433259249,
        "mean_delta_model1": 0.7314479088783264,
        "mean_delta_model2": 0.6264776433259249,
        "mean_delta_model2 / mean_delta_model1": 0.8564897591772829,
        "test_statistic": 1009.0,
        "p_value": 1.8631210320966563e-07,
        "q_value": 1.0303770377663085e-06,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4481902369391173,
        "mean_model2": -0.25646232333034275,
        "mean_delta_model1": 0.5518097630608827,
        "mean_delta_model2": 0.7435376766696572,
        "mean_delta_model2 / mean_delta_model1": 1.3474529202695904,
        "test_statistic": 1237.0,
        "p_value": 9.486327484996237e-06,
        "q_value": 4.7216710349969394e-05,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5301983192935587,
        "mean_model2": -0.39579910651780664,
        "mean_delta_model1": 0.4698016807064414,
        "mean_delta_model2": 0.6042008934821933,
        "mean_delta_model2 / mean_delta_model1": 1.2860764835358947,
        "test_statistic": 1364.0,
        "p_value": 6.554735653993199e-05,
        "q_value": 0.0002965924621368006,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9661470872163772,
        "mean_model2": 0.9539111262559891,
        "mean_delta_model1": 0.03385291278362274,
        "mean_delta_model2": 0.046088873744010925,
        "mean_delta_model2 / mean_delta_model1": 1.3614448493279332,
        "test_statistic": 1490.0,
        "p_value": 0.0003727428746878004,
        "q_value": 0.001546057767260726,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.874058330655098,
        "mean_model2": 0.8498872458934784,
        "mean_delta_model1": 0.12594166934490203,
        "mean_delta_model2": 0.1501127541065216,
        "mean_delta_model2 / mean_delta_model1": 1.1919228551387944,
        "test_statistic": 1583.0,
        "p_value": 0.001199907932052049,
        "q_value": 0.0045941184107775625,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8968517458438874,
        "mean_model2": 0.8786390590667724,
        "mean_delta_model1": 0.10314825415611267,
        "mean_delta_model2": 0.12136094093322754,
        "mean_delta_model2 / mean_delta_model1": 1.1765680565911503,
        "test_statistic": 1613.0,
        "p_value": 0.0017141393943961353,
        "q_value": 0.006094186110987047,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.63823157325387,
        "mean_model2": 0.6792330095171928,
        "mean_delta_model1": 0.6558975872397422,
        "mean_delta_model2": 0.6841087648272515,
        "mean_delta_model2 / mean_delta_model1": 1.0430115587194522,
        "test_statistic": 2428.0,
        "p_value": 0.7387436743898754,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "gte-Qwen2-7B-instruct",
    "projection_dimension": 200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_200.json",
      "model2": "models/gte-Qwen2-7B-instruct_propositions_a->b_200.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:56.439024",
    "comparisons": [
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8211070868372917,
        "mean_model2": 0.7284034615755082,
        "mean_delta_model1": 0.17889291316270828,
        "mean_delta_model2": 0.2715965384244919,
        "mean_delta_model2 / mean_delta_model1": 1.5182073656403985,
        "test_statistic": 470.0,
        "p_value": 1.5974210632319842e-12,
        "q_value": 7.950913329616583e-11,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8918347585201264,
        "mean_model2": 0.8369555896520615,
        "mean_delta_model1": 0.10816524147987366,
        "mean_delta_model2": 0.16304441034793854,
        "mean_delta_model2 / mean_delta_model1": 1.5073641783370517,
        "test_statistic": 717.0,
        "p_value": 5.083945007169378e-10,
        "q_value": 1.265227029207841e-08,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.866727951169014,
        "mean_model2": 0.8052110210061073,
        "mean_delta_model1": 0.13327204883098603,
        "mean_delta_model2": 0.19478897899389266,
        "mean_delta_model2 / mean_delta_model1": 1.4615891381764652,
        "test_statistic": 837.0,
        "p_value": 6.479288991357656e-09,
        "q_value": 1.0749882293316207e-07,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7866213303804398,
        "mean_model2": 0.7016387450695037,
        "mean_delta_model1": 0.21337866961956023,
        "mean_delta_model2": 0.2983612549304962,
        "mean_delta_model2 / mean_delta_model1": 1.3982712304957856,
        "test_statistic": 865.0,
        "p_value": 1.145673564420994e-08,
        "q_value": 1.4256027143391553e-07,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8890455746650696,
        "mean_model2": 0.8520964556932449,
        "mean_delta_model1": 0.11095442533493043,
        "mean_delta_model2": 0.14790354430675506,
        "mean_delta_model2 / mean_delta_model1": 1.3330116744806608,
        "test_statistic": 949.0,
        "p_value": 6.000040731702688e-08,
        "q_value": 5.97285273494723e-07,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.8977004444599151,
        "mean_model2": 0.8658289349079132,
        "mean_delta_model1": 0.10229955554008484,
        "mean_delta_model2": 0.1341710650920868,
        "mean_delta_model2 / mean_delta_model1": 1.3115508115723287,
        "test_statistic": 1226.0,
        "p_value": 7.9552775476599e-06,
        "q_value": 6.599358151957181e-05,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7044027525186539,
        "mean_model2": 0.6145662041381001,
        "mean_delta_model1": 0.7044027525186539,
        "mean_delta_model2": 0.6145662041381001,
        "mean_delta_model2 / mean_delta_model1": 0.8724642286542247,
        "test_statistic": 1280.0,
        "p_value": 1.8629428688391443e-05,
        "q_value": 0.0001324643794309553,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6690796260535717,
        "mean_model2": 0.5779823976755142,
        "mean_delta_model1": 0.6690796260535717,
        "mean_delta_model2": 0.5779823976755142,
        "mean_delta_model2 / mean_delta_model1": 0.8638469550845902,
        "test_statistic": 1314.0,
        "p_value": 3.1297869128455134e-05,
        "q_value": 0.00019472530644061215,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5472050630953162,
        "mean_model2": -0.42011802826076744,
        "mean_delta_model1": 0.4527949369046837,
        "mean_delta_model2": 0.5798819717392325,
        "mean_delta_model2 / mean_delta_model1": 1.2806723849505002,
        "test_statistic": 1339.0,
        "p_value": 4.5453436621012654e-05,
        "q_value": 0.00025137485206290115,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.47045968018472195,
        "mean_model2": -0.32983368944376706,
        "mean_delta_model1": 0.529540319815278,
        "mean_delta_model2": 0.6701663105562329,
        "mean_delta_model2 / mean_delta_model1": 1.2655623858632898,
        "test_statistic": 1407.0,
        "p_value": 0.00012102172562149046,
        "q_value": 0.0006023666981517507,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4837604131177068,
        "mean_model2": -0.3243545332737267,
        "mean_delta_model1": 0.5162395868822932,
        "mean_delta_model2": 0.6756454667262733,
        "mean_delta_model2 / mean_delta_model1": 1.308782751060751,
        "test_statistic": 1468.0,
        "p_value": 0.000278726263093336,
        "q_value": 0.0012611966827781999,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9603780847787857,
        "mean_model2": 0.9514810580015183,
        "mean_delta_model1": 0.0396219152212143,
        "mean_delta_model2": 0.04851894199848175,
        "mean_delta_model2 / mean_delta_model1": 1.224548125137167,
        "test_statistic": 1481.0,
        "p_value": 0.0003311716178731558,
        "q_value": 0.0013736290802015734,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5819992932118475,
        "mean_model2": 0.6837612244486809,
        "mean_delta_model1": 0.5988613528199493,
        "mean_delta_model2": 0.6892754086852073,
        "mean_delta_model2 / mean_delta_model1": 1.1509766082575066,
        "test_statistic": 1508.0,
        "p_value": 0.0004709164279142286,
        "q_value": 0.0018030098590301822,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8655536425113678,
        "mean_model2": 0.8525951480865479,
        "mean_delta_model1": 0.1344463574886322,
        "mean_delta_model2": 0.14740485191345215,
        "mean_delta_model2 / mean_delta_model1": 1.0963841242475878,
        "test_statistic": 1978.0,
        "p_value": 0.060004071567819,
        "q_value": 0.2133291964158477,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8923286634683609,
        "mean_model2": 0.8798607063293457,
        "mean_delta_model1": 0.1076713365316391,
        "mean_delta_model2": 0.12013929367065429,
        "mean_delta_model2 / mean_delta_model1": 1.1157964370150777,
        "test_statistic": 2108.0,
        "p_value": 0.15163463740335387,
        "q_value": 0.5031584502095744,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "gte-Qwen2-7B-instruct",
    "projection_dimension": 400,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_400.json",
      "model2": "models/gte-Qwen2-7B-instruct_propositions_a->b_400.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:56.439052",
    "comparisons": [
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8273078039288521,
        "mean_model2": 0.7279286196827889,
        "mean_delta_model1": 0.1726921960711479,
        "mean_delta_model2": 0.2720713803172112,
        "mean_delta_model2 / mean_delta_model1": 1.5754700357456788,
        "test_statistic": 429.0,
        "p_value": 5.731195668507817e-13,
        "q_value": 2.8526129449666595e-11,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8945431417226791,
        "mean_model2": 0.8339151895046234,
        "mean_delta_model1": 0.10545685827732086,
        "mean_delta_model2": 0.1660848104953766,
        "mean_delta_model2 / mean_delta_model1": 1.5749076277108678,
        "test_statistic": 644.0,
        "p_value": 9.964477157927917e-11,
        "q_value": 2.479831275585334e-09,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9045916336774826,
        "mean_model2": 0.8704470026493073,
        "mean_delta_model1": 0.0954083663225174,
        "mean_delta_model2": 0.12955299735069276,
        "mean_delta_model2 / mean_delta_model1": 1.3578787934881227,
        "test_statistic": 810.0,
        "p_value": 3.707823128564632e-09,
        "q_value": 4.613777327613074e-08,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8049311187863349,
        "mean_model2": 0.7133989962935448,
        "mean_delta_model1": 0.195068881213665,
        "mean_delta_model2": 0.28660100370645525,
        "mean_delta_model2 / mean_delta_model1": 1.4692297506568066,
        "test_statistic": 800.0,
        "p_value": 3.008928865084491e-09,
        "q_value": 4.613777327613074e-08,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8712435624003411,
        "mean_model2": 0.8108202683925628,
        "mean_delta_model1": 0.12875643759965896,
        "mean_delta_model2": 0.18917973160743715,
        "mean_delta_model2 / mean_delta_model1": 1.4692836733775727,
        "test_statistic": 837.0,
        "p_value": 6.479288991357656e-09,
        "q_value": 6.449929375989724e-08,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7309173017740249,
        "mean_model2": 0.6141635286808014,
        "mean_delta_model1": 0.7309173017740249,
        "mean_delta_model2": 0.6141635286808014,
        "mean_delta_model2 / mean_delta_model1": 0.8402640451801483,
        "test_statistic": 867.0,
        "p_value": 1.1928664494909581e-08,
        "q_value": 9.895510094377564e-08,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8884298619627953,
        "mean_model2": 0.8587837719917297,
        "mean_delta_model1": 0.11157013803720474,
        "mean_delta_model2": 0.14121622800827027,
        "mean_delta_model2 / mean_delta_model1": 1.2657170681386054,
        "test_statistic": 994.0,
        "p_value": 1.408966700163824e-07,
        "q_value": 1.0018444617809535e-06,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5250386214349418,
        "mean_model2": -0.3271661951392889,
        "mean_delta_model1": 0.4749613785650581,
        "mean_delta_model2": 0.6728338048607111,
        "mean_delta_model2 / mean_delta_model1": 1.41660740267653,
        "test_statistic": 1170.0,
        "p_value": 3.178570072112027e-06,
        "q_value": 1.787464760901149e-05,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6787181308865547,
        "mean_model2": 0.5678856608271599,
        "mean_delta_model1": 0.6787181308865547,
        "mean_delta_model2": 0.5678856608271599,
        "mean_delta_model2 / mean_delta_model1": 0.8367032424570368,
        "test_statistic": 1171.0,
        "p_value": 3.2320821098517744e-06,
        "q_value": 1.787464760901149e-05,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5139625691249967,
        "mean_model2": -0.3425914259534329,
        "mean_delta_model1": 0.48603743087500334,
        "mean_delta_model2": 0.657408574046567,
        "mean_delta_model2 / mean_delta_model1": 1.3525883651862938,
        "test_statistic": 1462.0,
        "p_value": 0.00025724282489107784,
        "q_value": 0.0011639871816186867,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5761615025252104,
        "mean_model2": -0.44437291553243996,
        "mean_delta_model1": 0.42383849747478963,
        "mean_delta_model2": 0.55562708446756,
        "mean_delta_model2 / mean_delta_model1": 1.310940577078205,
        "test_statistic": 1458.0,
        "p_value": 0.0002437939908238644,
        "q_value": 0.0011639871816186867,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5761524818185717,
        "mean_model2": 0.6858668021857739,
        "mean_delta_model1": 0.5972943781409412,
        "mean_delta_model2": 0.6934147043526173,
        "mean_delta_model2 / mean_delta_model1": 1.1609262195148184,
        "test_statistic": 1515.0,
        "p_value": 0.0005152354158274342,
        "q_value": 0.002137086368921236,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8759689223766327,
        "mean_model2": 0.8545416140556336,
        "mean_delta_model1": 0.1240310776233673,
        "mean_delta_model2": 0.14545838594436644,
        "mean_delta_model2 / mean_delta_model1": 1.1727575760170792,
        "test_statistic": 1546.0,
        "p_value": 0.000762335308365512,
        "q_value": 0.0029187728339776235,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9601987540721894,
        "mean_model2": 0.9553899830579757,
        "mean_delta_model1": 0.03980124592781067,
        "mean_delta_model2": 0.04461001694202423,
        "mean_delta_model2 / mean_delta_model1": 1.1208196100929968,
        "test_statistic": 1606.0,
        "p_value": 0.0015786634788142176,
        "q_value": 0.005612535992092587,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8950811213254929,
        "mean_model2": 0.8858709079027176,
        "mean_delta_model1": 0.10491887867450714,
        "mean_delta_model2": 0.1141290920972824,
        "mean_delta_model2 / mean_delta_model1": 1.0877841389379348,
        "test_statistic": 2143.0,
        "p_value": 0.1890346959482626,
        "q_value": 0.6272604088217523,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "gte-Qwen2-7B-instruct",
    "projection_dimension": 800,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_800.json",
      "model2": "models/gte-Qwen2-7B-instruct_propositions_a->b_800.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:56.439076",
    "comparisons": [
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8223814857006073,
        "mean_model2": 0.738104650080204,
        "mean_delta_model1": 0.1776185142993927,
        "mean_delta_model2": 0.261895349919796,
        "mean_delta_model2 / mean_delta_model1": 1.4744822686577976,
        "test_statistic": 572.0,
        "p_value": 1.8800378121519676e-11,
        "q_value": 9.357593964974194e-10,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8889157819747925,
        "mean_model2": 0.8285118424892426,
        "mean_delta_model1": 0.11108421802520752,
        "mean_delta_model2": 0.17148815751075744,
        "mean_delta_model2 / mean_delta_model1": 1.5437670675400794,
        "test_statistic": 625.0,
        "p_value": 6.454390373280753e-11,
        "q_value": 1.6062858952678724e-09,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8683306187391281,
        "mean_model2": 0.8007128655910491,
        "mean_delta_model1": 0.13166938126087188,
        "mean_delta_model2": 0.1992871344089508,
        "mean_delta_model2 / mean_delta_model1": 1.5135419677723727,
        "test_statistic": 766.0,
        "p_value": 1.4664276274759544e-09,
        "q_value": 2.4329713349813587e-08,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7837818533182144,
        "mean_model2": 0.7129895579814911,
        "mean_delta_model1": 0.21621814668178557,
        "mean_delta_model2": 0.2870104420185089,
        "mean_delta_model2 / mean_delta_model1": 1.3274114426709538,
        "test_statistic": 1020.0,
        "p_value": 2.2830327928134216e-07,
        "q_value": 2.8408596021022716e-06,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8823373070359231,
        "mean_model2": 0.8509788566827774,
        "mean_delta_model1": 0.117662692964077,
        "mean_delta_model2": 0.1490211433172226,
        "mean_delta_model2 / mean_delta_model1": 1.2665114112484193,
        "test_statistic": 1088.0,
        "p_value": 7.77736358566221e-07,
        "q_value": 7.742122002248325e-06,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9015114814043045,
        "mean_model2": 0.8685949611663818,
        "mean_delta_model1": 0.09848851859569549,
        "mean_delta_model2": 0.13140503883361818,
        "mean_delta_model2 / mean_delta_model1": 1.3342168275781268,
        "test_statistic": 1193.0,
        "p_value": 4.653124306801065e-06,
        "q_value": 3.8600329959814635e-05,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6237460194900631,
        "mean_model2": 0.526277656853199,
        "mean_delta_model1": 0.6237460194900631,
        "mean_delta_model2": 0.526277656853199,
        "mean_delta_model2 / mean_delta_model1": 0.8437370987689055,
        "test_statistic": 1261.0,
        "p_value": 1.3861565826995486e-05,
        "q_value": 9.856253489718996e-05,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6905121773481369,
        "mean_model2": 0.5929750978760421,
        "mean_delta_model1": 0.6905121773481369,
        "mean_delta_model2": 0.5929750978760421,
        "mean_delta_model2 / mean_delta_model1": 0.8587467641096801,
        "test_statistic": 1334.0,
        "p_value": 4.220865997412669e-05,
        "q_value": 0.00026260874867154626,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4950790704600513,
        "mean_model2": -0.33479410901665685,
        "mean_delta_model1": 0.5049209295399487,
        "mean_delta_model2": 0.6652058909833432,
        "mean_delta_model2 / mean_delta_model1": 1.3174456673630774,
        "test_statistic": 1535.0,
        "p_value": 0.0006642095377123378,
        "q_value": 0.003673332242693843,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5661799089238048,
        "mean_model2": 0.6581260305643082,
        "mean_delta_model1": 0.5870349088683724,
        "mean_delta_model2": 0.6643402600288391,
        "mean_delta_model2 / mean_delta_model1": 1.1316878263841041,
        "test_statistic": 1582.0,
        "p_value": 0.0011855235904302667,
        "q_value": 0.0059007581248839685,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9616484326124192,
        "mean_model2": 0.9508775454759598,
        "mean_delta_model1": 0.03835156738758087,
        "mean_delta_model2": 0.049122454524040224,
        "mean_delta_model2 / mean_delta_model1": 1.280846074101973,
        "test_statistic": 1595.0,
        "p_value": 0.0013855612488474557,
        "q_value": 0.006027182202386904,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5520901354588568,
        "mean_model2": -0.44133651816518976,
        "mean_delta_model1": 0.4479098645411432,
        "mean_delta_model2": 0.5586634818348102,
        "mean_delta_model2 / mean_delta_model1": 1.2472676448131532,
        "test_statistic": 1599.0,
        "p_value": 0.001453108200714456,
        "q_value": 0.006027182202386904,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4359205562621355,
        "mean_model2": -0.3352993651945144,
        "mean_delta_model1": 0.5640794437378644,
        "mean_delta_model2": 0.6647006348054856,
        "mean_delta_model2 / mean_delta_model1": 1.178381240771435,
        "test_statistic": 1901.0,
        "p_value": 0.031911717185220234,
        "q_value": 0.12218121367814042,
        "significant": false
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8622986376285553,
        "mean_model2": 0.8517040532827377,
        "mean_delta_model1": 0.1377013623714447,
        "mean_delta_model2": 0.14829594671726226,
        "mean_delta_model2 / mean_delta_model1": 1.0769388491396261,
        "test_statistic": 1986.0,
        "p_value": 0.06384551826051998,
        "q_value": 0.22698648190691628,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8888232693076134,
        "mean_model2": 0.8814667773246765,
        "mean_delta_model1": 0.11117673069238662,
        "mean_delta_model2": 0.11853322267532349,
        "mean_delta_model2 / mean_delta_model1": 1.0661693498011868,
        "test_statistic": 2240.0,
        "p_value": 0.3271243358754563,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "gte-Qwen2-7B-instruct",
    "projection_dimension": 1600,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_1600.json",
      "model2": "models/gte-Qwen2-7B-instruct_propositions_a->b_1600.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:56.439105",
    "comparisons": [
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8314262241125107,
        "mean_model2": 0.7351454722881318,
        "mean_delta_model1": 0.1685737758874893,
        "mean_delta_model2": 0.2648545277118683,
        "mean_delta_model2 / mean_delta_model1": 1.5711490492366935,
        "test_statistic": 444.0,
        "p_value": 8.357904103678267e-13,
        "q_value": 4.1600159579179217e-11,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8946152561903,
        "mean_model2": 0.8292801296710968,
        "mean_delta_model1": 0.10538474380970002,
        "mean_delta_model2": 0.1707198703289032,
        "mean_delta_model2 / mean_delta_model1": 1.619967598319383,
        "test_statistic": 543.0,
        "p_value": 9.442136734669708e-12,
        "q_value": 2.349837890326017e-10,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9110442441701889,
        "mean_model2": 0.8686897498369217,
        "mean_delta_model1": 0.08895575582981109,
        "mean_delta_model2": 0.13131025016307832,
        "mean_delta_model2 / mean_delta_model1": 1.47612989106966,
        "test_statistic": 612.0,
        "p_value": 4.783742429152474e-11,
        "q_value": 7.936776412276717e-10,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7497611582279206,
        "mean_model2": 0.6046021524071693,
        "mean_delta_model1": 0.7497611582279206,
        "mean_delta_model2": 0.6050934003293514,
        "mean_delta_model2 / mean_delta_model1": 0.8070482095384949,
        "test_statistic": 663.0,
        "p_value": 1.531931381295417e-10,
        "q_value": 1.9062371718944225e-09,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6919266555458308,
        "mean_model2": 0.518096177726984,
        "mean_delta_model1": 0.6919266555458308,
        "mean_delta_model2": 0.5208397024869919,
        "mean_delta_model2 / mean_delta_model1": 0.7527383116583709,
        "test_statistic": 735.0,
        "p_value": 7.526652187352531e-10,
        "q_value": 7.492546653007078e-09,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8733972808718682,
        "mean_model2": 0.8040259087085724,
        "mean_delta_model1": 0.12660271912813187,
        "mean_delta_model2": 0.19597409129142762,
        "mean_delta_model2 / mean_delta_model1": 1.5479453572642976,
        "test_statistic": 748.0,
        "p_value": 9.96915585881586e-10,
        "q_value": 8.269985502185367e-09,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8931651628017425,
        "mean_model2": 0.8541712635755538,
        "mean_delta_model1": 0.10683483719825744,
        "mean_delta_model2": 0.1458287364244461,
        "mean_delta_model2 / mean_delta_model1": 1.3649923587549089,
        "test_statistic": 760.0,
        "p_value": 1.2899524844514429e-09,
        "q_value": 9.172195143845465e-09,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7972089719772338,
        "mean_model2": 0.7120064282417298,
        "mean_delta_model1": 0.2027910280227661,
        "mean_delta_model2": 0.2879935717582703,
        "mean_delta_model2 / mean_delta_model1": 1.4201494739004874,
        "test_statistic": 786.0,
        "p_value": 2.2417156428586456e-09,
        "q_value": 1.3947235951328492e-08,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5476197991915979,
        "mean_model2": -0.33340132459066807,
        "mean_delta_model1": 0.45238020080840213,
        "mean_delta_model2": 0.6665986754093319,
        "mean_delta_model2 / mean_delta_model1": 1.4735363621531667,
        "test_statistic": 934.0,
        "p_value": 4.490847904151786e-08,
        "q_value": 2.4836102866230195e-07,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5729679375886917,
        "mean_model2": -0.3561410819552839,
        "mean_delta_model1": 0.4270320624113083,
        "mean_delta_model2": 0.6438589180447161,
        "mean_delta_model2 / mean_delta_model1": 1.507753105022275,
        "test_statistic": 1005.0,
        "p_value": 1.7297891814322726e-07,
        "q_value": 8.609754921003622e-07,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9620287293195724,
        "mean_model2": 0.9506665635108947,
        "mean_delta_model1": 0.03797127068042755,
        "mean_delta_model2": 0.04933343648910522,
        "mean_delta_model2 / mean_delta_model1": 1.299230592104845,
        "test_statistic": 1359.0,
        "p_value": 6.0954018307393836e-05,
        "q_value": 0.00027580826018373426,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5874424312636256,
        "mean_model2": -0.44674961786717177,
        "mean_delta_model1": 0.4125575687363744,
        "mean_delta_model2": 0.5532503821328283,
        "mean_delta_model2 / mean_delta_model1": 1.3410258932526264,
        "test_statistic": 1402.0,
        "p_value": 0.00011281357453431969,
        "q_value": 0.00046792659231197455,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8785033768415451,
        "mean_model2": 0.8571112930774689,
        "mean_delta_model1": 0.1214966231584549,
        "mean_delta_model2": 0.14288870692253114,
        "mean_delta_model2 / mean_delta_model1": 1.1760714265792955,
        "test_statistic": 1548.0,
        "p_value": 0.0007815620423821003,
        "q_value": 0.002992386725814868,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9012007176876068,
        "mean_model2": 0.8828931218385696,
        "mean_delta_model1": 0.09879928231239318,
        "mean_delta_model2": 0.11710687816143035,
        "mean_delta_model2 / mean_delta_model1": 1.1853008991619032,
        "test_statistic": 1600.0,
        "p_value": 0.001470462686034046,
        "q_value": 0.005227855626706665,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5992224165331572,
        "mean_model2": 0.6721700342558324,
        "mean_delta_model1": 0.6271433152537793,
        "mean_delta_model2": 0.6794444631598889,
        "mean_delta_model2 / mean_delta_model1": 1.0833958469045397,
        "test_statistic": 2173.0,
        "p_value": 0.22616750120373663,
        "q_value": 0.7504755598203922,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "gte-Qwen2-7B-instruct",
    "projection_dimension": 3200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_3200.json",
      "model2": "models/gte-Qwen2-7B-instruct_propositions_a->b_3200.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:56.439125",
    "comparisons": [
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8938222086429596,
        "mean_model2": 0.8314403814077377,
        "mean_delta_model1": 0.1061777913570404,
        "mean_delta_model2": 0.16855961859226226,
        "mean_delta_model2 / mean_delta_model1": 1.5875223663812392,
        "test_statistic": 692.0,
        "p_value": 2.929781311502822e-10,
        "q_value": 7.291263968736847e-09,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8229858508706093,
        "mean_model2": 0.7064908212423324,
        "mean_delta_model1": 0.17701414912939073,
        "mean_delta_model2": 0.2935091787576675,
        "mean_delta_model2 / mean_delta_model1": 1.6581114007057327,
        "test_statistic": 669.0,
        "p_value": 1.7532632363263363e-10,
        "q_value": 7.291263968736847e-09,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7929314923286438,
        "mean_model2": 0.69672272965312,
        "mean_delta_model1": 0.2070685076713562,
        "mean_delta_model2": 0.30327727034687996,
        "mean_delta_model2 / mean_delta_model1": 1.4646228620540367,
        "test_statistic": 869.0,
        "p_value": 1.241946124416413e-08,
        "q_value": 2.0605308190334624e-07,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9100125628709793,
        "mean_model2": 0.8730437445640564,
        "mean_delta_model1": 0.08998743712902069,
        "mean_delta_model2": 0.1269562554359436,
        "mean_delta_model2 / mean_delta_model1": 1.4108219934513568,
        "test_statistic": 968.0,
        "p_value": 8.628249618279228e-08,
        "q_value": 1.0736440516571673e-06,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8682697361707687,
        "mean_model2": 0.804971159696579,
        "mean_delta_model1": 0.13173026382923125,
        "mean_delta_model2": 0.19502884030342102,
        "mean_delta_model2 / mean_delta_model1": 1.4805165846797892,
        "test_statistic": 998.0,
        "p_value": 1.5183353157679712e-07,
        "q_value": 1.5114552798674343e-06,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.889600751399994,
        "mean_model2": 0.8573336911201477,
        "mean_delta_model1": 0.1103992486000061,
        "mean_delta_model2": 0.1426663088798523,
        "mean_delta_model2 / mean_delta_model1": 1.2922760860153573,
        "test_statistic": 1054.0,
        "p_value": 4.241746054998045e-07,
        "q_value": 3.5187711854023043e-06,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5349437871575355,
        "mean_model2": -0.33442845187149944,
        "mean_delta_model1": 0.46505621284246446,
        "mean_delta_model2": 0.6655715481285006,
        "mean_delta_model2 / mean_delta_model1": 1.431163652368966,
        "test_statistic": 1074.0,
        "p_value": 6.068971053695423e-07,
        "q_value": 4.315336223451357e-06,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6802295897901058,
        "mean_model2": 0.5780532598495484,
        "mean_delta_model1": 0.6802295897901058,
        "mean_delta_model2": 0.5780532598495484,
        "mean_delta_model2 / mean_delta_model1": 0.8497914064983775,
        "test_statistic": 1350.0,
        "p_value": 5.344397937967702e-05,
        "q_value": 0.0003325113035459564,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4948650293983519,
        "mean_model2": -0.37083702176809313,
        "mean_delta_model1": 0.5051349706016481,
        "mean_delta_model2": 0.6291629782319069,
        "mean_delta_model2 / mean_delta_model1": 1.2455343914964618,
        "test_statistic": 1580.0,
        "p_value": 0.0011572317128310011,
        "q_value": 0.006399933035666461,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7239353622496129,
        "mean_model2": 0.6586434863880277,
        "mean_delta_model1": 0.7239353622496129,
        "mean_delta_model2": 0.6586434863880277,
        "mean_delta_model2 / mean_delta_model1": 0.9098097989595478,
        "test_statistic": 1595.0,
        "p_value": 0.0013855612488474557,
        "q_value": 0.0068964142617303,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9622382885217666,
        "mean_model2": 0.9526849061250686,
        "mean_delta_model1": 0.03776171147823334,
        "mean_delta_model2": 0.04731509387493134,
        "mean_delta_model2 / mean_delta_model1": 1.252991244907021,
        "test_statistic": 1645.0,
        "p_value": 0.002480370746748401,
        "q_value": 0.010288047657271992,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6040639822650701,
        "mean_model2": 0.6941192056238651,
        "mean_delta_model1": 0.6252481802832335,
        "mean_delta_model2": 0.7034235157072544,
        "mean_delta_model2 / mean_delta_model1": 1.125030888356377,
        "test_statistic": 1640.0,
        "p_value": 0.0023429656766318935,
        "q_value": 0.010288047657271992,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5873328913748265,
        "mean_model2": -0.5293600380094722,
        "mean_delta_model1": 0.4126671086251736,
        "mean_delta_model2": 0.4706399619905278,
        "mean_delta_model2 / mean_delta_model1": 1.140483339121682,
        "test_statistic": 1935.0,
        "p_value": 0.04249786023307308,
        "q_value": 0.1627126522795114,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8979895907640457,
        "mean_model2": 0.8875772780179978,
        "mean_delta_model1": 0.10201040923595428,
        "mean_delta_model2": 0.11242272198200226,
        "mean_delta_model2 / mean_delta_model1": 1.102071081020407,
        "test_statistic": 2065.0,
        "p_value": 0.11373419190217399,
        "q_value": 0.40435295688349704,
        "significant": false
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8699245226383209,
        "mean_model2": 0.8637176883220673,
        "mean_delta_model1": 0.13007547736167907,
        "mean_delta_model2": 0.13628231167793273,
        "mean_delta_model2 / mean_delta_model1": 1.0477171749982923,
        "test_statistic": 2197.0,
        "p_value": 0.259416834386872,
        "q_value": 0.8608044611942028,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 50,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_50.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_50.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:58.551177",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.41884719345718624,
        "mean_model2": 0.17102916828356685,
        "mean_delta_model1": 0.5811528065428138,
        "mean_delta_model2": 1.1710291682835667,
        "mean_delta_model2 / mean_delta_model1": 2.015010777027533,
        "test_statistic": 42.0,
        "p_value": 1.3732250242604875e-17,
        "q_value": 3.417506317296553e-16,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.3898035565204918,
        "mean_model2": 0.21277562993578614,
        "mean_delta_model1": 0.6101964434795082,
        "mean_delta_model2": 1.2127756299357861,
        "mean_delta_model2 / mean_delta_model1": 1.9875167135032867,
        "test_statistic": 34.0,
        "p_value": 1.0820049169264091e-17,
        "q_value": 3.417506317296553e-16,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.4778680958226323,
        "mean_model2": -0.01235572574660182,
        "mean_delta_model1": 0.5221319041773677,
        "mean_delta_model2": 0.9876442742533982,
        "mean_delta_model2 / mean_delta_model1": 1.891560861061454,
        "test_statistic": 102.0,
        "p_value": 8.012096741807243e-17,
        "q_value": 1.3292985852610173e-15,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9007420587539673,
        "mean_model2": 0.8201623994112015,
        "mean_delta_model1": 0.09925794124603271,
        "mean_delta_model2": 0.17983760058879852,
        "mean_delta_model2 / mean_delta_model1": 1.811820780596601,
        "test_statistic": 473.0,
        "p_value": 1.7205222109437397e-12,
        "q_value": 2.140907506442988e-11,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7419095650315285,
        "mean_model2": 0.6275429347157478,
        "mean_delta_model1": 0.2580904349684715,
        "mean_delta_model2": 0.37245706528425215,
        "mean_delta_model2 / mean_delta_model1": 1.4431261868722556,
        "test_statistic": 603.0,
        "p_value": 3.883375148692381e-11,
        "q_value": 3.2214820024940035e-10,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6274809923022986,
        "mean_model2": 0.7749024249613286,
        "mean_delta_model1": 0.6369218497723341,
        "mean_delta_model2": 0.7749024249613286,
        "mean_delta_model2 / mean_delta_model1": 1.2166365861656578,
        "test_statistic": 596.0,
        "p_value": 3.2998432362469066e-11,
        "q_value": 3.2214820024940035e-10,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8725327450037003,
        "mean_model2": 0.8029209822416306,
        "mean_delta_model1": 0.12746725499629974,
        "mean_delta_model2": 0.19707901775836945,
        "mean_delta_model2 / mean_delta_model1": 1.5461148650615444,
        "test_statistic": 763.0,
        "p_value": 1.3754339280391946e-09,
        "q_value": 9.780010153479636e-09,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.7675900214910507,
        "mean_model2": 0.6707700285315513,
        "mean_delta_model1": 0.2324099785089493,
        "mean_delta_model2": 0.3292299714684486,
        "mean_delta_model2 / mean_delta_model1": 1.4165913769307936,
        "test_statistic": 832.0,
        "p_value": 5.846718763075831e-09,
        "q_value": 3.637640946542659e-08,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8501347251236439,
        "mean_model2": 0.7946413213014603,
        "mean_delta_model1": 0.14986527487635612,
        "mean_delta_model2": 0.20535867869853974,
        "mean_delta_model2 / mean_delta_model1": 1.3702886066700077,
        "test_statistic": 1063.0,
        "p_value": 4.986516364601757e-07,
        "q_value": 2.7577338627053985e-06,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8987804532051087,
        "mean_model2": 0.8585129725933075,
        "mean_delta_model1": 0.10121954679489135,
        "mean_delta_model2": 0.1414870274066925,
        "mean_delta_model2 / mean_delta_model1": 1.3978231664423288,
        "test_statistic": 1174.0,
        "p_value": 3.3978523164404245e-06,
        "q_value": 1.691227810668437e-05,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8362160032987594,
        "mean_model2": 0.7990796828269958,
        "mean_delta_model1": 0.16378399670124055,
        "mean_delta_model2": 0.20092031717300415,
        "mean_delta_model2 / mean_delta_model1": 1.2267396157116877,
        "test_statistic": 1345.0,
        "p_value": 4.965959424148884e-05,
        "q_value": 0.00022470259827831252,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8518508130311966,
        "mean_model2": 0.8087823301553726,
        "mean_delta_model1": 0.14814918696880341,
        "mean_delta_model2": 0.19121766984462737,
        "mean_delta_model2 / mean_delta_model1": 1.2907102209402819,
        "test_statistic": 1367.0,
        "p_value": 6.845862261106333e-05,
        "q_value": 0.00028395173298069036,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9462025570869446,
        "mean_model2": 0.9555481004714966,
        "mean_delta_model1": 0.05379744291305542,
        "mean_delta_model2": 0.04445189952850342,
        "mean_delta_model2 / mean_delta_model1": 0.8262827584638963,
        "test_statistic": 1477.0,
        "p_value": 0.0003141246127325076,
        "q_value": 0.0012026969970644263,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7038766911625862,
        "mean_model2": 0.6576148796081543,
        "mean_delta_model1": 0.7038766911625862,
        "mean_delta_model2": 0.6576148796081543,
        "mean_delta_model2 / mean_delta_model1": 0.9342756875809857,
        "test_statistic": 1745.0,
        "p_value": 0.007320678801450492,
        "q_value": 0.026026809266882035,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5883494353294373,
        "mean_model2": 0.5950787249207496,
        "mean_delta_model1": 0.5883494353294373,
        "mean_delta_model2": 0.5950787249207496,
        "mean_delta_model2 / mean_delta_model1": 1.0114375729578875,
        "test_statistic": 2342.0,
        "p_value": 0.5292093269572754,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 100,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_100.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_100.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:58.551236",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4514737324416637,
        "mean_model2": 0.11971336530521512,
        "mean_delta_model1": 0.5485262675583362,
        "mean_delta_model2": 1.1197133653052151,
        "mean_delta_model2 / mean_delta_model1": 2.0413122060487883,
        "test_statistic": 11.0,
        "p_value": 5.430296382634713e-18,
        "q_value": 1.880701195475995e-16,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.45454454969614744,
        "mean_model2": 0.12074767746031284,
        "mean_delta_model1": 0.5454554503038526,
        "mean_delta_model2": 1.120747677460313,
        "mean_delta_model2 / mean_delta_model1": 2.0547006668207035,
        "test_statistic": 22.0,
        "p_value": 7.55704804907942e-18,
        "q_value": 1.880701195475995e-16,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5279463818110526,
        "mean_model2": -0.11787714706733823,
        "mean_delta_model1": 0.4720536181889474,
        "mean_delta_model2": 0.8821228529326618,
        "mean_delta_model2 / mean_delta_model1": 1.868692070017303,
        "test_statistic": 97.0,
        "p_value": 6.928021197254875e-17,
        "q_value": 1.1494380401218086e-15,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7801883789896965,
        "mean_model2": 0.6433760640025139,
        "mean_delta_model1": 0.2198116210103035,
        "mean_delta_model2": 0.3566239359974861,
        "mean_delta_model2 / mean_delta_model1": 1.6224071064048504,
        "test_statistic": 173.0,
        "p_value": 6.118237096853961e-16,
        "q_value": 7.61314189583625e-15,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9084970563650131,
        "mean_model2": 0.8288874435424805,
        "mean_delta_model1": 0.09150294363498687,
        "mean_delta_model2": 0.17111255645751952,
        "mean_delta_model2 / mean_delta_model1": 1.8700224239790826,
        "test_statistic": 209.0,
        "p_value": 1.6769992593993554e-15,
        "q_value": 1.6694002692487474e-14,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8027366203069687,
        "mean_model2": 0.686120737195015,
        "mean_delta_model1": 0.19726337969303132,
        "mean_delta_model2": 0.31387926280498507,
        "mean_delta_model2 / mean_delta_model1": 1.5911684332562077,
        "test_statistic": 391.0,
        "p_value": 2.1780705099798336e-13,
        "q_value": 1.8068341788780358e-12,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6212199111655354,
        "mean_model2": 0.7633471128344536,
        "mean_delta_model1": 0.6348483427241445,
        "mean_delta_model2": 0.7639814612269401,
        "mean_delta_model2 / mean_delta_model1": 1.2034078216990902,
        "test_statistic": 661.0,
        "p_value": 1.464411670250457e-10,
        "q_value": 1.041268556195992e-09,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8723850011825561,
        "mean_model2": 0.82161856174469,
        "mean_delta_model1": 0.12761499881744384,
        "mean_delta_model2": 0.17838143825531005,
        "mean_delta_model2 / mean_delta_model1": 1.3978093477122446,
        "test_statistic": 946.0,
        "p_value": 5.6634206650045174e-08,
        "q_value": 3.5151170518844137e-07,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8997249734401703,
        "mean_model2": 0.856224809885025,
        "mean_delta_model1": 0.10027502655982971,
        "mean_delta_model2": 0.14377519011497497,
        "mean_delta_model2 / mean_delta_model1": 1.433808546828862,
        "test_statistic": 952.0,
        "p_value": 6.35601170212878e-08,
        "q_value": 3.5151170518844137e-07,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8557861065864563,
        "mean_model2": 0.8075167548656463,
        "mean_delta_model1": 0.1442138934135437,
        "mean_delta_model2": 0.19248324513435364,
        "mean_delta_model2 / mean_delta_model1": 1.3347066678409,
        "test_statistic": 1047.0,
        "p_value": 3.73788105734042e-07,
        "q_value": 1.8604717946562642e-06,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9482488125562668,
        "mean_model2": 0.9642977565526962,
        "mean_delta_model1": 0.051751187443733214,
        "mean_delta_model2": 0.03570224344730377,
        "mean_delta_model2 / mean_delta_model1": 0.6898825942133453,
        "test_statistic": 1127.0,
        "p_value": 1.533820468952071e-06,
        "q_value": 6.9403193870248e-06,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8564163464307785,
        "mean_model2": 0.8180751174688339,
        "mean_delta_model1": 0.1435836535692215,
        "mean_delta_model2": 0.18192488253116607,
        "mean_delta_model2 / mean_delta_model1": 1.2670305986012558,
        "test_statistic": 1374.0,
        "p_value": 7.573451718690408e-05,
        "q_value": 0.00031413058839723076,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8346818608045578,
        "mean_model2": 0.8131767773628235,
        "mean_delta_model1": 0.1653181391954422,
        "mean_delta_model2": 0.18682322263717652,
        "mean_delta_model2 / mean_delta_model1": 1.130083023837515,
        "test_statistic": 1618.0,
        "p_value": 0.0018173674444179279,
        "q_value": 0.0069582015524054415,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5860802850499749,
        "mean_model2": 0.6058803217113018,
        "mean_delta_model1": 0.5860802850499749,
        "mean_delta_model2": 0.6058803217113018,
        "mean_delta_model2 / mean_delta_model1": 1.0337838299058952,
        "test_statistic": 2209.0,
        "p_value": 0.27725313144267305,
        "q_value": 0.9199893792166067,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6861503298580647,
        "mean_model2": 0.6788803699612618,
        "mean_delta_model1": 0.6861503298580647,
        "mean_delta_model2": 0.6788803699612618,
        "mean_delta_model2 / mean_delta_model1": 0.9894047126694426,
        "test_statistic": 2208.0,
        "p_value": 0.27573562109341054,
        "q_value": 0.9199893792166067,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_200.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_200.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:58.551263",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5039551055803895,
        "mean_model2": 0.06768653955310583,
        "mean_delta_model1": 0.4960448944196105,
        "mean_delta_model2": 1.0676865395531059,
        "mean_delta_model2 / mean_delta_model1": 2.15239900977579,
        "test_statistic": 100.0,
        "p_value": 7.559735353237344e-17,
        "q_value": 1.8813699772687786e-15,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4391170089226216,
        "mean_model2": 0.03428235882427543,
        "mean_delta_model1": 0.5608829910773784,
        "mean_delta_model2": 1.0342823588242753,
        "mean_delta_model2 / mean_delta_model1": 1.8440251804348042,
        "test_statistic": 97.0,
        "p_value": 6.928021197254875e-17,
        "q_value": 1.8813699772687786e-15,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.533792215064168,
        "mean_model2": -0.19173631826415657,
        "mean_delta_model1": 0.46620778493583204,
        "mean_delta_model2": 0.8082636817358434,
        "mean_delta_model2 / mean_delta_model1": 1.733698380534532,
        "test_statistic": 252.0,
        "p_value": 5.482689176458315e-15,
        "q_value": 9.096409093093388e-14,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9059866589307785,
        "mean_model2": 0.8292723709344864,
        "mean_delta_model1": 0.09401334106922149,
        "mean_delta_model2": 0.17072762906551361,
        "mean_delta_model2 / mean_delta_model1": 1.815993635837363,
        "test_statistic": 335.0,
        "p_value": 5.07669843779915e-14,
        "q_value": 6.317117979819554e-13,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7598642486333848,
        "mean_model2": 0.6599368354678155,
        "mean_delta_model1": 0.2401357513666153,
        "mean_delta_model2": 0.3400631645321846,
        "mean_delta_model2 / mean_delta_model1": 1.4161288462749977,
        "test_statistic": 520.0,
        "p_value": 5.430560182825824e-12,
        "q_value": 5.4059526744382777e-11,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.7814646589756012,
        "mean_model2": 0.6997049862146377,
        "mean_delta_model1": 0.2185353410243988,
        "mean_delta_model2": 0.3002950137853622,
        "mean_delta_model2 / mean_delta_model1": 1.3741256328505473,
        "test_statistic": 681.0,
        "p_value": 2.2936133911529331e-10,
        "q_value": 1.9026836134454836e-09,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5950543672963977,
        "mean_model2": 0.7432416027411819,
        "mean_delta_model1": 0.6073631683364511,
        "mean_delta_model2": 0.7432416027411819,
        "mean_delta_model2 / mean_delta_model1": 1.2237185945550462,
        "test_statistic": 798.0,
        "p_value": 2.8854290748582396e-09,
        "q_value": 2.0516816601643977e-08,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9053152561187744,
        "mean_model2": 0.8651341861486435,
        "mean_delta_model1": 0.09468474388122558,
        "mean_delta_model2": 0.13486581385135651,
        "mean_delta_model2 / mean_delta_model1": 1.4243668866077819,
        "test_statistic": 878.0,
        "p_value": 1.488176691154178e-08,
        "q_value": 9.258958206816343e-08,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8760127478837967,
        "mean_model2": 0.8299850058555603,
        "mean_delta_model1": 0.1239872521162033,
        "mean_delta_model2": 0.17001499414443969,
        "mean_delta_model2 / mean_delta_model1": 1.3712296324230033,
        "test_statistic": 1005.0,
        "p_value": 1.7297891814322726e-07,
        "q_value": 9.566394356670692e-07,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9484285479784011,
        "mean_model2": 0.9625260180234909,
        "mean_delta_model1": 0.05157145202159882,
        "mean_delta_model2": 0.03747398197650909,
        "mean_delta_model2 / mean_delta_model1": 0.7266419793806559,
        "test_statistic": 1174.0,
        "p_value": 3.3978523164404245e-06,
        "q_value": 1.691227810668437e-05,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8559744426608086,
        "mean_model2": 0.8154546048492193,
        "mean_delta_model1": 0.14402555733919142,
        "mean_delta_model2": 0.18454539515078067,
        "mean_delta_model2 / mean_delta_model1": 1.2813378303141147,
        "test_statistic": 1212.0,
        "p_value": 6.3460177878356015e-06,
        "q_value": 2.871482756610411e-05,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8571139967441559,
        "mean_model2": 0.8254379969835282,
        "mean_delta_model1": 0.1428860032558441,
        "mean_delta_model2": 0.17456200301647187,
        "mean_delta_model2 / mean_delta_model1": 1.2216872124550253,
        "test_statistic": 1344.0,
        "p_value": 4.893384526150377e-05,
        "q_value": 0.00020296713012112877,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5509045364707709,
        "mean_model2": 0.6275103187561035,
        "mean_delta_model1": 0.5509045364707709,
        "mean_delta_model2": 0.6275103187561035,
        "mean_delta_model2 / mean_delta_model1": 1.1390545497702524,
        "test_statistic": 1469.0,
        "p_value": 0.0002824663114907604,
        "q_value": 0.0010814860435374152,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8326665459573269,
        "mean_model2": 0.820499284863472,
        "mean_delta_model1": 0.1673334540426731,
        "mean_delta_model2": 0.17950071513652802,
        "mean_delta_model2 / mean_delta_model1": 1.0727126632475539,
        "test_statistic": 1916.0,
        "p_value": 0.0362653406125505,
        "q_value": 0.12893218357488212,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6828296953439712,
        "mean_model2": 0.6710619619488716,
        "mean_delta_model1": 0.6828296953439712,
        "mean_delta_model2": 0.6710619619488716,
        "mean_delta_model2 / mean_delta_model1": 0.9827662249088747,
        "test_statistic": 2223.0,
        "p_value": 0.2990959214526674,
        "q_value": 0.9924687583207826,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 400,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_400.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_400.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:58.551287",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5095309701561928,
        "mean_model2": -0.03048547205515206,
        "mean_delta_model1": 0.49046902984380725,
        "mean_delta_model2": 0.9695145279448479,
        "mean_delta_model2 / mean_delta_model1": 1.9767089641798496,
        "test_statistic": 129.0,
        "p_value": 1.7478232297301269e-16,
        "q_value": 8.69951657389447e-15,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7817780831456185,
        "mean_model2": 0.6503865107893944,
        "mean_delta_model1": 0.21822191685438155,
        "mean_delta_model2": 0.34961348921060564,
        "mean_delta_model2 / mean_delta_model1": 1.6021007158685214,
        "test_statistic": 165.0,
        "p_value": 4.880034421013085e-16,
        "q_value": 1.2144803777820813e-14,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4572508700564504,
        "mean_model2": -0.03185874013230205,
        "mean_delta_model1": 0.5427491299435496,
        "mean_delta_model2": 0.968141259867698,
        "mean_delta_model2 / mean_delta_model1": 1.78377302966546,
        "test_statistic": 181.0,
        "p_value": 7.664886745226832e-16,
        "q_value": 1.2716924713914145e-14,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9103552997112274,
        "mean_model2": 0.8351600351929664,
        "mean_delta_model1": 0.08964470028877258,
        "mean_delta_model2": 0.16483996480703353,
        "mean_delta_model2 / mean_delta_model1": 1.8388143892057685,
        "test_statistic": 207.0,
        "p_value": 1.5862688510866548e-15,
        "q_value": 1.9738512347744184e-14,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.7979694849252701,
        "mean_model2": 0.6889566028118134,
        "mean_delta_model1": 0.2020305150747299,
        "mean_delta_model2": 0.31104339718818663,
        "mean_delta_model2 / mean_delta_model1": 1.5395862207901292,
        "test_statistic": 317.0,
        "p_value": 3.154399659417182e-14,
        "q_value": 3.1401061218329266e-13,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5445865708589553,
        "mean_model2": -0.22647322855889798,
        "mean_delta_model1": 0.4554134291410446,
        "mean_delta_model2": 0.773526771441102,
        "mean_delta_model2 / mean_delta_model1": 1.6985155068879965,
        "test_statistic": 350.0,
        "p_value": 7.5257796274053e-14,
        "q_value": 6.243065039077089e-13,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8754362678527832,
        "mean_model2": 0.8223720097541809,
        "mean_delta_model1": 0.12456373214721679,
        "mean_delta_model2": 0.1776279902458191,
        "mean_delta_model2 / mean_delta_model1": 1.4260008686628611,
        "test_statistic": 796.0,
        "p_value": 2.7668706160839887e-09,
        "q_value": 1.9673807783149838e-08,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6067761509120464,
        "mean_model2": 0.7328663312271237,
        "mean_delta_model1": 0.6162807486951352,
        "mean_delta_model2": 0.7328663312271237,
        "mean_delta_model2 / mean_delta_model1": 1.1891760902459436,
        "test_statistic": 940.0,
        "p_value": 5.0442126304009286e-08,
        "q_value": 3.138347362164046e-07,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8579214942455292,
        "mean_model2": 0.8133673831820488,
        "mean_delta_model1": 0.1420785057544708,
        "mean_delta_model2": 0.1866326168179512,
        "mean_delta_model2 / mean_delta_model1": 1.3135879760762363,
        "test_statistic": 1114.0,
        "p_value": 1.2254582205086865e-06,
        "q_value": 6.777251662137888e-06,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9001644074916839,
        "mean_model2": 0.8694412750005722,
        "mean_delta_model1": 0.09983559250831604,
        "mean_delta_model2": 0.1305587249994278,
        "mean_delta_model2 / mean_delta_model1": 1.3077372680344699,
        "test_statistic": 1143.0,
        "p_value": 2.016487208952064e-06,
        "q_value": 1.0036749481830227e-05,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5317632901668549,
        "mean_model2": 0.6211692173779011,
        "mean_delta_model1": 0.5317632901668549,
        "mean_delta_model2": 0.6211692173779011,
        "mean_delta_model2 / mean_delta_model1": 1.1681310629453054,
        "test_statistic": 1159.0,
        "p_value": 2.6433140750645974e-06,
        "q_value": 1.1960620093758583e-05,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8424444341659546,
        "mean_model2": 0.8148437476158142,
        "mean_delta_model1": 0.1575555658340454,
        "mean_delta_model2": 0.1851562523841858,
        "mean_delta_model2 / mean_delta_model1": 1.1751806507376097,
        "test_statistic": 1318.0,
        "p_value": 3.323909134273584e-05,
        "q_value": 0.00013786864575256614,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9484348738193512,
        "mean_model2": 0.9593369990587235,
        "mean_delta_model1": 0.051565126180648804,
        "mean_delta_model2": 0.04066300094127655,
        "mean_delta_model2 / mean_delta_model1": 0.7885756121069365,
        "test_statistic": 1396.0,
        "p_value": 0.00010365682261626461,
        "q_value": 0.0003968735470976355,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8538247805833816,
        "mean_model2": 0.8264376425743103,
        "mean_delta_model1": 0.14617521941661835,
        "mean_delta_model2": 0.1735623574256897,
        "mean_delta_model2 / mean_delta_model1": 1.187358282192924,
        "test_statistic": 1517.0,
        "p_value": 0.0005285945569802443,
        "q_value": 0.0018792833406802306,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.653459616880864,
        "mean_model2": 0.6882101050019265,
        "mean_delta_model1": 0.653459616880864,
        "mean_delta_model2": 0.6882101050019265,
        "mean_delta_model2 / mean_delta_model1": 1.0531792435574456,
        "test_statistic": 2211.0,
        "p_value": 0.28030519435350776,
        "q_value": 0.9301168228564973,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 800,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_800.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_800.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:58.551309",
    "comparisons": [
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9153037869930267,
        "mean_model2": 0.8353920596837997,
        "mean_delta_model1": 0.08469621300697326,
        "mean_delta_model2": 0.16460794031620027,
        "mean_delta_model2 / mean_delta_model1": 1.9435100398485072,
        "test_statistic": 178.0,
        "p_value": 7.044317414078139e-16,
        "q_value": 3.5061987421352956e-14,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4901000130921602,
        "mean_model2": -0.06749663388356567,
        "mean_delta_model1": 0.5098999869078398,
        "mean_delta_model2": 0.9325033661164344,
        "mean_delta_model2 / mean_delta_model1": 1.8287966072942392,
        "test_statistic": 207.0,
        "p_value": 1.5862688510866548e-15,
        "q_value": 3.947702469548837e-14,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7752265084534884,
        "mean_model2": 0.6562636724114418,
        "mean_delta_model1": 0.22477349154651166,
        "mean_delta_model2": 0.3437363275885582,
        "mean_delta_model2 / mean_delta_model1": 1.5292565205244852,
        "test_statistic": 248.0,
        "p_value": 4.915088821531074e-15,
        "q_value": 7.608737167862367e-14,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.45384409856051205,
        "mean_model2": -0.04582533801905811,
        "mean_delta_model1": 0.5461559014394879,
        "mean_delta_model2": 0.9541746619809419,
        "mean_delta_model2 / mean_delta_model1": 1.747073792420901,
        "test_statistic": 256.0,
        "p_value": 6.114697274470499e-15,
        "q_value": 7.608737167862367e-14,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.564556743670255,
        "mean_model2": -0.27204743315465746,
        "mean_delta_model1": 0.43544325632974507,
        "mean_delta_model2": 0.7279525668453425,
        "mean_delta_model2 / mean_delta_model1": 1.6717506960174184,
        "test_statistic": 395.0,
        "p_value": 2.4134811234713264e-13,
        "q_value": 2.402544911554032e-12,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8004315641522407,
        "mean_model2": 0.6965580269694328,
        "mean_delta_model1": 0.19956843584775924,
        "mean_delta_model2": 0.3034419730305672,
        "mean_delta_model2 / mean_delta_model1": 1.5204908117937441,
        "test_statistic": 494.0,
        "p_value": 2.884536279160242e-12,
        "q_value": 2.392887978382599e-11,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5761898263543844,
        "mean_model2": 0.7195214996114373,
        "mean_delta_model1": 0.5904155380278826,
        "mean_delta_model2": 0.7200984850153327,
        "mean_delta_model2 / mean_delta_model1": 1.219646907363955,
        "test_statistic": 654.0,
        "p_value": 1.2502257583212642e-10,
        "q_value": 8.889718627164264e-10,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8784546732902527,
        "mean_model2": 0.8327114832401276,
        "mean_delta_model1": 0.12154532670974731,
        "mean_delta_model2": 0.16728851675987244,
        "mean_delta_model2 / mean_delta_model1": 1.3763467612322173,
        "test_statistic": 825.0,
        "p_value": 5.061052087089789e-09,
        "q_value": 3.148824332179396e-08,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8613641321659088,
        "mean_model2": 0.8116651327908039,
        "mean_delta_model1": 0.13863586783409118,
        "mean_delta_model2": 0.1883348672091961,
        "mean_delta_model2 / mean_delta_model1": 1.3584858677018625,
        "test_statistic": 901.0,
        "p_value": 2.3526577665178756e-08,
        "q_value": 1.3011095353341639e-07,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9025527334213257,
        "mean_model2": 0.8714113384485245,
        "mean_delta_model1": 0.09744726657867432,
        "mean_delta_model2": 0.12858866155147552,
        "mean_delta_model2 / mean_delta_model1": 1.3195717649777186,
        "test_statistic": 1205.0,
        "p_value": 5.663194857588431e-06,
        "q_value": 2.818766605613291e-05,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9464694887399674,
        "mean_model2": 0.9600126922130585,
        "mean_delta_model1": 0.05353051126003265,
        "mean_delta_model2": 0.039987307786941526,
        "mean_delta_model2 / mean_delta_model1": 0.7470002965728658,
        "test_statistic": 1363.0,
        "p_value": 6.460325535698146e-05,
        "q_value": 0.0002922819807287246,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5350240828003734,
        "mean_model2": 0.6240697754919529,
        "mean_delta_model1": 0.5352630440983921,
        "mean_delta_model2": 0.6240697754919529,
        "mean_delta_model2 / mean_delta_model1": 1.1659123161456972,
        "test_statistic": 1369.0,
        "p_value": 7.046698255609004e-05,
        "q_value": 0.0002922819807287246,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8374088387191295,
        "mean_model2": 0.8116703408956528,
        "mean_delta_model1": 0.16259116128087045,
        "mean_delta_model2": 0.18832965910434724,
        "mean_delta_model2 / mean_delta_model1": 1.1583019496306717,
        "test_statistic": 1405.0,
        "p_value": 0.00011767309880582405,
        "q_value": 0.0004505380248237524,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8530932098627091,
        "mean_model2": 0.8332299935817719,
        "mean_delta_model1": 0.14690679013729097,
        "mean_delta_model2": 0.16677000641822814,
        "mean_delta_model2 / mean_delta_model1": 1.1352096541104337,
        "test_statistic": 1758.0,
        "p_value": 0.008359472369848318,
        "q_value": 0.029719975270422184,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6768922313302755,
        "mean_model2": 0.6818721994757653,
        "mean_delta_model1": 0.6768922313302755,
        "mean_delta_model2": 0.6818721994757653,
        "mean_delta_model2 / mean_delta_model1": 1.007357106367882,
        "test_statistic": 2335.0,
        "p_value": 0.5135746748177931,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 1600,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_1600.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_1600.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:58.551330",
    "comparisons": [
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5477807454764843,
        "mean_model2": -0.060103613748215136,
        "mean_delta_model1": 0.4522192545235157,
        "mean_delta_model2": 0.9398963862517848,
        "mean_delta_model2 / mean_delta_model1": 2.0784085968257013,
        "test_statistic": 85.0,
        "p_value": 4.881616873189426e-17,
        "q_value": 2.4297483963679522e-15,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9205120611190796,
        "mean_model2": 0.8467677932977676,
        "mean_delta_model1": 0.07948793888092041,
        "mean_delta_model2": 0.15323220670223237,
        "mean_delta_model2 / mean_delta_model1": 1.9277416028082832,
        "test_statistic": 140.0,
        "p_value": 2.3957899977091284e-16,
        "q_value": 5.96233487406484e-15,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5312661869963631,
        "mean_model2": -0.12716221901588143,
        "mean_delta_model1": 0.4687338130036369,
        "mean_delta_model2": 0.8728377809841186,
        "mean_delta_model2 / mean_delta_model1": 1.8621182359151593,
        "test_statistic": 177.0,
        "p_value": 6.848674378293117e-16,
        "q_value": 1.1362734943618385e-14,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7914498284459114,
        "mean_model2": 0.6875161136686802,
        "mean_delta_model1": 0.2085501715540886,
        "mean_delta_model2": 0.3124838863313198,
        "mean_delta_model2 / mean_delta_model1": 1.4983631229009824,
        "test_statistic": 461.0,
        "p_value": 1.277687058453469e-12,
        "q_value": 1.5898718406126634e-11,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8084689670801163,
        "mean_model2": 0.7105161878466606,
        "mean_delta_model1": 0.19153103291988371,
        "mean_delta_model2": 0.2894838121533394,
        "mean_delta_model2 / mean_delta_model1": 1.5114198871074263,
        "test_statistic": 567.0,
        "p_value": 1.670707085341673e-11,
        "q_value": 1.663136606932154e-10,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.567202514517121,
        "mean_model2": -0.3367601250298321,
        "mean_delta_model1": 0.432797485482879,
        "mean_delta_model2": 0.6632398749701679,
        "mean_delta_model2 / mean_delta_model1": 1.5324485405227821,
        "test_statistic": 592.0,
        "p_value": 3.0058926298675976e-11,
        "q_value": 2.4935600187400024e-10,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9158792644739151,
        "mean_model2": 0.8760879856348037,
        "mean_delta_model1": 0.0841207355260849,
        "mean_delta_model2": 0.12391201436519622,
        "mean_delta_model2 / mean_delta_model1": 1.4730258073738844,
        "test_statistic": 732.0,
        "p_value": 7.052036774337939e-10,
        "q_value": 5.014344189841118e-09,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8934113085269928,
        "mean_model2": 0.8402752876281738,
        "mean_delta_model1": 0.1065886914730072,
        "mean_delta_model2": 0.15972471237182617,
        "mean_delta_model2 / mean_delta_model1": 1.4985146188071488,
        "test_statistic": 745.0,
        "p_value": 9.34472931366031e-10,
        "q_value": 5.813990951712097e-09,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.873994032740593,
        "mean_model2": 0.8283761444687844,
        "mean_delta_model1": 0.12600596725940705,
        "mean_delta_model2": 0.17162385553121567,
        "mean_delta_model2 / mean_delta_model1": 1.3620295868837355,
        "test_statistic": 973.0,
        "p_value": 9.487277678157034e-08,
        "q_value": 5.246826643079153e-07,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8618635135889053,
        "mean_model2": 0.8285375213623047,
        "mean_delta_model1": 0.13813648641109466,
        "mean_delta_model2": 0.17146247863769531,
        "mean_delta_model2 / mean_delta_model1": 1.2412540892883461,
        "test_statistic": 1181.0,
        "p_value": 3.8169491079429605e-06,
        "q_value": 1.8998266793483813e-05,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6287958171218634,
        "mean_model2": 0.7282713803648949,
        "mean_delta_model1": 0.6457068299502134,
        "mean_delta_model2": 0.7302746775746346,
        "mean_delta_model2 / mean_delta_model1": 1.1309694178563074,
        "test_statistic": 1269.0,
        "p_value": 1.5706760228924285e-05,
        "q_value": 7.10708552472445e-05,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8730791747570038,
        "mean_model2": 0.8403214406967163,
        "mean_delta_model1": 0.12692082524299622,
        "mean_delta_model2": 0.1596785593032837,
        "mean_delta_model2 / mean_delta_model1": 1.2580958168020981,
        "test_statistic": 1327.0,
        "p_value": 3.803333427971095e-05,
        "q_value": 0.00015775414064513377,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9531845897436142,
        "mean_model2": 0.9633785480260849,
        "mean_delta_model1": 0.0468154102563858,
        "mean_delta_model2": 0.0366214519739151,
        "mean_delta_model2 / mean_delta_model1": 0.7822520783937762,
        "test_statistic": 1533.0,
        "p_value": 0.000647682504088168,
        "q_value": 0.002479798688622133,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6064081885293127,
        "mean_model2": 0.650706312134862,
        "mean_delta_model1": 0.6064081885293127,
        "mean_delta_model2": 0.650706312134862,
        "mean_delta_model2 / mean_delta_model1": 1.0730500089601083,
        "test_statistic": 1858.0,
        "p_value": 0.02182713389157079,
        "q_value": 0.07760081626939434,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7185811091959476,
        "mean_model2": 0.7087896719574929,
        "mean_delta_model1": 0.7185811091959476,
        "mean_delta_model2": 0.7087896719574929,
        "mean_delta_model2 / mean_delta_model1": 0.9863739289648028,
        "test_statistic": 2047.0,
        "p_value": 0.10027554062347219,
        "q_value": 0.33273720620851716,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 3200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_3200.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_3200.json"
    },
    "analysis_timestamp": "2025-08-27T07:50:58.551350",
    "comparisons": [
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5613767932355404,
        "mean_model2": -0.1128516443958506,
        "mean_delta_model1": 0.4386232067644596,
        "mean_delta_model2": 0.8871483556041494,
        "mean_delta_model2 / mean_delta_model1": 2.022575052852932,
        "test_statistic": 103.0,
        "p_value": 8.248184779150682e-17,
        "q_value": 4.105404881353181e-15,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5364108726009726,
        "mean_model2": -0.14540832499973477,
        "mean_delta_model1": 0.46358912739902736,
        "mean_delta_model2": 0.8545916750002652,
        "mean_delta_model2 / mean_delta_model1": 1.8434247580286505,
        "test_statistic": 188.0,
        "p_value": 9.330061592707216e-16,
        "q_value": 2.3219460664150024e-14,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9259700793027877,
        "mean_model2": 0.850104603767395,
        "mean_delta_model1": 0.07402992069721222,
        "mean_delta_model2": 0.14989539623260498,
        "mean_delta_model2 / mean_delta_model1": 2.0247947697484117,
        "test_statistic": 237.0,
        "p_value": 3.63574062749329e-15,
        "q_value": 6.032109981004404e-14,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8061958764493465,
        "mean_model2": 0.6942364919185638,
        "mean_delta_model1": 0.19380412355065346,
        "mean_delta_model2": 0.30576350808143615,
        "mean_delta_model2 / mean_delta_model1": 1.5776935107447314,
        "test_statistic": 368.0,
        "p_value": 1.2028611792100836e-13,
        "q_value": 1.4967633148816934e-12,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.821450163424015,
        "mean_model2": 0.7092052149772644,
        "mean_delta_model1": 0.17854983657598494,
        "mean_delta_model2": 0.2907947850227356,
        "mean_delta_model2 / mean_delta_model1": 1.6286477243510826,
        "test_statistic": 433.0,
        "p_value": 6.339428788808821e-13,
        "q_value": 6.310702922260798e-12,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5823457821644843,
        "mean_model2": -0.3086862379475497,
        "mean_delta_model1": 0.41765421783551576,
        "mean_delta_model2": 0.6913137620524503,
        "mean_delta_model2 / mean_delta_model1": 1.6552299307191711,
        "test_statistic": 472.0,
        "p_value": 1.6784884860821078e-12,
        "q_value": 1.3924022898296725e-11,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9277302587032318,
        "mean_model2": 0.8822702980041504,
        "mean_delta_model1": 0.07226974129676819,
        "mean_delta_model2": 0.1177297019958496,
        "mean_delta_model2 / mean_delta_model1": 1.6290317342136982,
        "test_statistic": 505.0,
        "p_value": 3.773462101902537e-12,
        "q_value": 2.6831167181536748e-11,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9006644827127457,
        "mean_model2": 0.8426375639438629,
        "mean_delta_model1": 0.09933551728725433,
        "mean_delta_model2": 0.1573624360561371,
        "mean_delta_model2 / mean_delta_model1": 1.584150768562295,
        "test_statistic": 554.0,
        "p_value": 1.2274930437817311e-11,
        "q_value": 7.637068137868962e-11,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8820497417449951,
        "mean_model2": 0.8269416973739863,
        "mean_delta_model1": 0.11795025825500488,
        "mean_delta_model2": 0.17305830262601377,
        "mean_delta_model2 / mean_delta_model1": 1.4672142747824,
        "test_statistic": 716.0,
        "p_value": 4.973778630852186e-10,
        "q_value": 2.750689409799422e-09,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8680082071200013,
        "mean_model2": 0.8280221843719482,
        "mean_delta_model1": 0.13199179287999868,
        "mean_delta_model2": 0.17197781562805176,
        "mean_delta_model2 / mean_delta_model1": 1.3029432503004688,
        "test_statistic": 749.0,
        "p_value": 1.0186201019495722e-09,
        "q_value": 5.070022133062416e-09,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8896518182754517,
        "mean_model2": 0.8451215916872025,
        "mean_delta_model1": 0.11034818172454834,
        "mean_delta_model2": 0.15487840831279753,
        "mean_delta_model2 / mean_delta_model1": 1.4035429120110539,
        "test_statistic": 891.0,
        "p_value": 1.9293162111910322e-08,
        "q_value": 8.72988134874654e-08,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7534816281683743,
        "mean_model2": 0.7138943448662758,
        "mean_delta_model1": 0.7534816281683743,
        "mean_delta_model2": 0.7138943448662758,
        "mean_delta_model2 / mean_delta_model1": 0.9474608513039255,
        "test_statistic": 1465.0,
        "p_value": 0.00026778265635249015,
        "q_value": 0.0011107052177408861,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6569480375945568,
        "mean_model2": 0.742114825733006,
        "mean_delta_model1": 0.6760331769287586,
        "mean_delta_model2": 0.7429108519479632,
        "mean_delta_model2 / mean_delta_model1": 1.098926616772023,
        "test_statistic": 1690.0,
        "p_value": 0.0040917391418793594,
        "q_value": 0.015666147061516165,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9561981070041656,
        "mean_model2": 0.9639307999610901,
        "mean_delta_model1": 0.04380189299583435,
        "mean_delta_model2": 0.03606920003890991,
        "mean_delta_model2 / mean_delta_model1": 0.8234621285052719,
        "test_statistic": 1812.5,
        "p_value": 0.014293230439229828,
        "q_value": 0.05081594105395242,
        "significant": false
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6581632061302662,
        "mean_model2": 0.6388217556104064,
        "mean_delta_model1": 0.6581632061302662,
        "mean_delta_model2": 0.6388217556104064,
        "mean_delta_model2 / mean_delta_model1": 0.9706129872656059,
        "test_statistic": 2107.0,
        "p_value": 0.15065555303262182,
        "q_value": 0.4999096240637939,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 50,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_50.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_50.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:00.572336",
    "comparisons": [
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5116489662975073,
        "mean_model2": 0.21277562993578614,
        "mean_delta_model1": 0.4883510337024927,
        "mean_delta_model2": 1.2127756299357861,
        "mean_delta_model2 / mean_delta_model1": 2.4834095686067874,
        "test_statistic": 14.0,
        "p_value": 5.943312343029359e-18,
        "q_value": 1.7188440143237312e-16,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5958431795053184,
        "mean_model2": -0.01235572574660182,
        "mean_delta_model1": 0.4041568204946816,
        "mean_delta_model2": 0.9876442742533982,
        "mean_delta_model2 / mean_delta_model1": 2.4437154692689265,
        "test_statistic": 19.0,
        "p_value": 6.9066722754060085e-18,
        "q_value": 1.7188440143237312e-16,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5168375094980001,
        "mean_model2": 0.17102916828356685,
        "mean_delta_model1": 0.48316249050199983,
        "mean_delta_model2": 1.1710291682835667,
        "mean_delta_model2 / mean_delta_model1": 2.42367566047373,
        "test_statistic": 37.0,
        "p_value": 1.1832715745334621e-17,
        "q_value": 1.963183022740328e-16,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8093671980500221,
        "mean_model2": 0.6275429347157478,
        "mean_delta_model1": 0.19063280194997786,
        "mean_delta_model2": 0.37245706528425215,
        "mean_delta_model2 / mean_delta_model1": 1.9537931640011517,
        "test_statistic": 110.0,
        "p_value": 1.0104199612316772e-16,
        "q_value": 1.2573018040111e-15,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8159912088513375,
        "mean_model2": 0.6707700285315513,
        "mean_delta_model1": 0.18400879114866256,
        "mean_delta_model2": 0.3292299714684486,
        "mean_delta_model2 / mean_delta_model1": 1.7892078384584376,
        "test_statistic": 338.0,
        "p_value": 5.493717497178969e-14,
        "q_value": 5.468823803924603e-13,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9105998533964157,
        "mean_model2": 0.8201623994112015,
        "mean_delta_model1": 0.0894001466035843,
        "mean_delta_model2": 0.17983760058879852,
        "mean_delta_model2 / mean_delta_model1": 2.0116029718187103,
        "test_statistic": 356.0,
        "p_value": 8.80280981656619e-14,
        "q_value": 7.302434688802682e-13,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8923065981268883,
        "mean_model2": 0.8029209822416306,
        "mean_delta_model1": 0.10769340187311173,
        "mean_delta_model2": 0.19707901775836945,
        "mean_delta_model2 / mean_delta_model1": 1.8300008573466282,
        "test_statistic": 587.0,
        "p_value": 2.6743020071878613e-11,
        "q_value": 1.9015599550502686e-10,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8912533563375473,
        "mean_model2": 0.8087823301553726,
        "mean_delta_model1": 0.1087466436624527,
        "mean_delta_model2": 0.19121766984462737,
        "mean_delta_model2 / mean_delta_model1": 1.7583776694585906,
        "test_statistic": 639.0,
        "p_value": 8.891992200502532e-11,
        "q_value": 5.532312436363797e-10,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8778363499045372,
        "mean_model2": 0.7946413213014603,
        "mean_delta_model1": 0.1221636500954628,
        "mean_delta_model2": 0.20535867869853974,
        "mean_delta_model2 / mean_delta_model1": 1.6810129571117558,
        "test_statistic": 695.0,
        "p_value": 3.1313005114081147e-10,
        "q_value": 1.7317286905778635e-09,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8655539400875568,
        "mean_model2": 0.7990796828269958,
        "mean_delta_model1": 0.13444605991244316,
        "mean_delta_model2": 0.20092031717300415,
        "mean_delta_model2 / mean_delta_model1": 1.4944306832334975,
        "test_statistic": 978.0,
        "p_value": 1.0428838221518432e-07,
        "q_value": 5.190791002850573e-07,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5520357795711607,
        "mean_model2": 0.7749024249613286,
        "mean_delta_model1": 0.594848697585985,
        "mean_delta_model2": 0.7749024249613286,
        "mean_delta_model2 / mean_delta_model1": 1.3026882770459742,
        "test_statistic": 1030.0,
        "p_value": 2.7430652036712907e-07,
        "q_value": 1.2411979757917734e-06,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6880296065658331,
        "mean_model2": 0.5950787249207496,
        "mean_delta_model1": 0.6882821650058031,
        "mean_delta_model2": 0.5950787249207496,
        "mean_delta_model2 / mean_delta_model1": 0.8645854203061216,
        "test_statistic": 1310.0,
        "p_value": 2.9464688438355742e-05,
        "q_value": 0.00012221322931576392,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9591578662395477,
        "mean_model2": 0.9555481004714966,
        "mean_delta_model1": 0.04084213376045227,
        "mean_delta_model2": 0.04445189952850342,
        "mean_delta_model2 / mean_delta_model1": 1.088383378528242,
        "test_statistic": 1874.0,
        "p_value": 0.025198274227111687,
        "q_value": 0.0964772816766195,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8772614413499832,
        "mean_model2": 0.8585129725933075,
        "mean_delta_model1": 0.12273855865001679,
        "mean_delta_model2": 0.1414870274066925,
        "mean_delta_model2 / mean_delta_model1": 1.152751254071152,
        "test_statistic": 1943.0,
        "p_value": 0.04538106928157531,
        "q_value": 0.16134083553627518,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6637225362658501,
        "mean_model2": 0.6576148796081543,
        "mean_delta_model1": 0.6637225362658501,
        "mean_delta_model2": 0.6576148796081543,
        "mean_delta_model2 / mean_delta_model1": 0.9907978766367376,
        "test_statistic": 2287.0,
        "p_value": 0.41317373945790214,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 100,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_100.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_100.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:00.572395",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5394507778063417,
        "mean_model2": 0.11971336530521512,
        "mean_delta_model1": 0.46054922219365835,
        "mean_delta_model2": 1.1197133653052151,
        "mean_delta_model2 / mean_delta_model1": 2.431256663450366,
        "test_statistic": 4.0,
        "p_value": 4.397113554181895e-18,
        "q_value": 2.1885944523009825e-16,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5218167159706354,
        "mean_model2": 0.12074767746031284,
        "mean_delta_model1": 0.4781832840293646,
        "mean_delta_model2": 1.120747677460313,
        "mean_delta_model2 / mean_delta_model1": 2.343761722527066,
        "test_statistic": 41.0,
        "p_value": 1.3329700507875109e-17,
        "q_value": 3.317324902221782e-16,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5945502911321818,
        "mean_model2": -0.11787714706733823,
        "mean_delta_model1": 0.4054497088678181,
        "mean_delta_model2": 0.8821228529326618,
        "mean_delta_model2 / mean_delta_model1": 2.175665276455397,
        "test_statistic": 87.0,
        "p_value": 5.175535327228705e-17,
        "q_value": 8.586805689145597e-16,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8223584407567978,
        "mean_model2": 0.6433760640025139,
        "mean_delta_model1": 0.1776415592432022,
        "mean_delta_model2": 0.3566239359974861,
        "mean_delta_model2 / mean_delta_model1": 2.0075478819077803,
        "test_statistic": 196.0,
        "p_value": 1.1672303430968547e-15,
        "q_value": 1.4524265873402283e-14,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9141515517234802,
        "mean_model2": 0.8288874435424805,
        "mean_delta_model1": 0.08584844827651977,
        "mean_delta_model2": 0.17111255645751952,
        "mean_delta_model2 / mean_delta_model1": 1.993193352853183,
        "test_statistic": 257.0,
        "p_value": 6.283587274042714e-15,
        "q_value": 6.255114442263982e-14,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8206867679953576,
        "mean_model2": 0.686120737195015,
        "mean_delta_model1": 0.1793132320046425,
        "mean_delta_model2": 0.31387926280498507,
        "mean_delta_model2 / mean_delta_model1": 1.7504523190840628,
        "test_statistic": 471.0,
        "p_value": 1.637462668432428e-12,
        "q_value": 1.3583690254306492e-11,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8901388949155807,
        "mean_model2": 0.8131767773628235,
        "mean_delta_model1": 0.10986110508441925,
        "mean_delta_model2": 0.18682322263717652,
        "mean_delta_model2 / mean_delta_model1": 1.7005401729175962,
        "test_statistic": 533.0,
        "p_value": 7.429365830407272e-12,
        "q_value": 5.282643664234758e-11,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8998089629411697,
        "mean_model2": 0.82161856174469,
        "mean_delta_model1": 0.10019103705883026,
        "mean_delta_model2": 0.17838143825531005,
        "mean_delta_model2 / mean_delta_model1": 1.7804131336675144,
        "test_statistic": 547.0,
        "p_value": 1.0389070469371608e-11,
        "q_value": 6.463746533281503e-11,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8953851580619812,
        "mean_model2": 0.8180751174688339,
        "mean_delta_model1": 0.1046148419380188,
        "mean_delta_model2": 0.18192488253116607,
        "mean_delta_model2 / mean_delta_model1": 1.7389968685222617,
        "test_statistic": 599.0,
        "p_value": 3.538590981975653e-11,
        "q_value": 1.956975865261711e-10,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8893018957972526,
        "mean_model2": 0.8075167548656463,
        "mean_delta_model1": 0.11069810420274734,
        "mean_delta_model2": 0.19248324513435364,
        "mean_delta_model2 / mean_delta_model1": 1.7388124803096359,
        "test_statistic": 638.0,
        "p_value": 8.691463376884235e-11,
        "q_value": 4.3260398656147867e-10,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.7045047158002853,
        "mean_model2": 0.6058803217113018,
        "mean_delta_model1": 0.7045047158002853,
        "mean_delta_model2": 0.6058803217113018,
        "mean_delta_model2 / mean_delta_model1": 0.8600088943663767,
        "test_statistic": 1003.0,
        "p_value": 1.6666306383615493e-07,
        "q_value": 7.541266507111417e-07,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5578671948984265,
        "mean_model2": 0.7633471128344536,
        "mean_delta_model1": 0.5952944745868445,
        "mean_delta_model2": 0.7639814612269401,
        "mean_delta_model2 / mean_delta_model1": 1.2833672977683361,
        "test_statistic": 1018.0,
        "p_value": 2.200430471442911e-07,
        "q_value": 9.126915234908013e-07,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9621203172206879,
        "mean_model2": 0.9642977565526962,
        "mean_delta_model1": 0.037879682779312134,
        "mean_delta_model2": 0.03570224344730377,
        "mean_delta_model2 / mean_delta_model1": 0.942516959693296,
        "test_statistic": 1853.0,
        "p_value": 0.02085748626037175,
        "q_value": 0.07415348103754384,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8694352394342423,
        "mean_model2": 0.856224809885025,
        "mean_delta_model1": 0.13056476056575775,
        "mean_delta_model2": 0.14377519011497497,
        "mean_delta_model2 / mean_delta_model1": 1.1011791351048654,
        "test_statistic": 1846.0,
        "p_value": 0.01956314649110028,
        "q_value": 0.07415348103754384,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6686529514938593,
        "mean_model2": 0.6788803699612618,
        "mean_delta_model1": 0.6686529514938593,
        "mean_delta_model2": 0.6788803699612618,
        "mean_delta_model2 / mean_delta_model1": 1.015295555705771,
        "test_statistic": 2503.0,
        "p_value": 0.9397030087863819,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 15,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_200.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_200.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:00.572421",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.6289387879520655,
        "mean_model2": 0.06768653955310583,
        "mean_delta_model1": 0.3710612120479345,
        "mean_delta_model2": 1.0676865395531059,
        "mean_delta_model2 / mean_delta_model1": 2.8773865467112736,
        "test_statistic": 10.0,
        "p_value": 5.269204481546394e-18,
        "q_value": 2.6226640622879086e-16,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.591442689821124,
        "mean_model2": 0.03428235882427543,
        "mean_delta_model1": 0.4085573101788759,
        "mean_delta_model2": 1.0342823588242753,
        "mean_delta_model2 / mean_delta_model1": 2.5315477977164145,
        "test_statistic": 54.0,
        "p_value": 1.9606593475405358e-17,
        "q_value": 3.554917837707308e-16,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.6512607488036156,
        "mean_model2": -0.19173631826415657,
        "mean_delta_model1": 0.34873925119638444,
        "mean_delta_model2": 0.8082636817358434,
        "mean_delta_model2 / mean_delta_model1": 2.3176733876757925,
        "test_statistic": 57.0,
        "p_value": 2.1426597410614454e-17,
        "q_value": 3.554917837707308e-16,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9224555224180222,
        "mean_model2": 0.8292723709344864,
        "mean_delta_model1": 0.07754447758197784,
        "mean_delta_model2": 0.17072762906551361,
        "mean_delta_model2 / mean_delta_model1": 2.201673599322726,
        "test_statistic": 128.0,
        "p_value": 1.6983101606773517e-16,
        "q_value": 2.1132682554956173e-15,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.836933927088976,
        "mean_model2": 0.6599368354678155,
        "mean_delta_model1": 0.1630660729110241,
        "mean_delta_model2": 0.3400631645321846,
        "mean_delta_model2 / mean_delta_model1": 2.085431742246824,
        "test_statistic": 171.0,
        "p_value": 5.782369267221539e-16,
        "q_value": 5.756167605615241e-15,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.9072606128454208,
        "mean_model2": 0.8254379969835282,
        "mean_delta_model1": 0.09273938715457916,
        "mean_delta_model2": 0.17456200301647187,
        "mean_delta_model2 / mean_delta_model1": 1.8822854924144556,
        "test_statistic": 321.0,
        "p_value": 3.507386885783358e-14,
        "q_value": 2.9095782137193717e-13,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9120338016748428,
        "mean_model2": 0.8299850058555603,
        "mean_delta_model1": 0.08796619832515716,
        "mean_delta_model2": 0.17001499414443969,
        "mean_delta_model2 / mean_delta_model1": 1.932730950995499,
        "test_statistic": 419.0,
        "p_value": 4.4502030315715415e-13,
        "q_value": 3.164312726810625e-12,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8962541258335114,
        "mean_model2": 0.8154546048492193,
        "mean_delta_model1": 0.10374587416648864,
        "mean_delta_model2": 0.18454539515078067,
        "mean_delta_model2 / mean_delta_model1": 1.778821535154517,
        "test_statistic": 456.0,
        "p_value": 1.1281372528141459e-12,
        "q_value": 7.018908263680511e-12,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8344850561022759,
        "mean_model2": 0.6997049862146377,
        "mean_delta_model1": 0.16551494389772414,
        "mean_delta_model2": 0.3002950137853622,
        "mean_delta_model2 / mean_delta_model1": 1.8143075586632351,
        "test_statistic": 547.0,
        "p_value": 1.0389070469371608e-11,
        "q_value": 5.170997226625203e-11,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8930344766378403,
        "mean_model2": 0.820499284863472,
        "mean_delta_model1": 0.10696552336215973,
        "mean_delta_model2": 0.17950071513652802,
        "mean_delta_model2 / mean_delta_model1": 1.6781174858443075,
        "test_statistic": 543.0,
        "p_value": 9.442136734669708e-12,
        "q_value": 5.170997226625203e-11,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.7626550742983818,
        "mean_model2": 0.6275103187561035,
        "mean_delta_model1": 0.7626550742983818,
        "mean_delta_model2": 0.6275103187561035,
        "mean_delta_model2 / mean_delta_model1": 0.8227970151950971,
        "test_statistic": 647.0,
        "p_value": 1.0667618241015043e-10,
        "q_value": 4.826945473095627e-10,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8938190446794033,
        "mean_model2": 0.8651341861486435,
        "mean_delta_model1": 0.1061809553205967,
        "mean_delta_model2": 0.13486581385135651,
        "mean_delta_model2 / mean_delta_model1": 1.270150691752117,
        "test_statistic": 1334.0,
        "p_value": 4.220865997412669e-05,
        "q_value": 0.00017507249911436417,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7428091378509998,
        "mean_model2": 0.6710619619488716,
        "mean_delta_model1": 0.7428091378509998,
        "mean_delta_model2": 0.6710619619488716,
        "mean_delta_model2 / mean_delta_model1": 0.9034110214237564,
        "test_statistic": 1354.0,
        "p_value": 5.6666594558971774e-05,
        "q_value": 0.00021696085039976822,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.579792783986777,
        "mean_model2": 0.7432416027411819,
        "mean_delta_model1": 0.6313866220600903,
        "mean_delta_model2": 0.7432416027411819,
        "mean_delta_model2 / mean_delta_model1": 1.1771576665912413,
        "test_statistic": 1530.0,
        "p_value": 0.0006236077727091549,
        "q_value": 0.002217078634078365,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9607812535762786,
        "mean_model2": 0.9625260180234909,
        "mean_delta_model1": 0.039218746423721314,
        "mean_delta_model2": 0.03747398197650909,
        "mean_delta_model2 / mean_delta_model1": 0.9555119781657043,
        "test_statistic": 1688.0,
        "p_value": 0.004003600734826465,
        "q_value": 0.013284864035614078,
        "significant": true
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 400,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_400.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_400.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:00.572445",
    "comparisons": [
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8312988337874413,
        "mean_model2": 0.6503865107893944,
        "mean_delta_model1": 0.16870116621255873,
        "mean_delta_model2": 0.34961348921060564,
        "mean_delta_model2 / mean_delta_model1": 2.072383357267978,
        "test_statistic": 113.0,
        "p_value": 1.1020511023218767e-16,
        "q_value": 2.7426434398083176e-15,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5317437724769115,
        "mean_model2": -0.03185874013230205,
        "mean_delta_model1": 0.46825622752308843,
        "mean_delta_model2": 0.968141259867698,
        "mean_delta_model2 / mean_delta_model1": 2.067545935243246,
        "test_statistic": 94.0,
        "p_value": 6.348428415673147e-17,
        "q_value": 2.7426434398083176e-15,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5427909650467336,
        "mean_model2": -0.03048547205515206,
        "mean_delta_model1": 0.45720903495326637,
        "mean_delta_model2": 0.9695145279448479,
        "mean_delta_model2 / mean_delta_model1": 2.120506057024763,
        "test_statistic": 137.0,
        "p_value": 2.1986655925914448e-16,
        "q_value": 3.647837957875969e-15,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.6048428108915687,
        "mean_model2": -0.22647322855889798,
        "mean_delta_model1": 0.39515718910843134,
        "mean_delta_model2": 0.773526771441102,
        "mean_delta_model2 / mean_delta_model1": 1.9575166358136178,
        "test_statistic": 166.0,
        "p_value": 5.020142338158485e-16,
        "q_value": 6.246743196230203e-15,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9221754205226899,
        "mean_model2": 0.8351600351929664,
        "mean_delta_model1": 0.07782457947731018,
        "mean_delta_model2": 0.16483996480703353,
        "mean_delta_model2 / mean_delta_model1": 2.118096440920606,
        "test_statistic": 283.0,
        "p_value": 1.2708161268604757e-14,
        "q_value": 1.2650576751634217e-13,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9083006224036216,
        "mean_model2": 0.8223720097541809,
        "mean_delta_model1": 0.09169937759637832,
        "mean_delta_model2": 0.1776279902458191,
        "mean_delta_model2 / mean_delta_model1": 1.9370686574085814,
        "test_statistic": 309.0,
        "p_value": 2.5500003530567317e-14,
        "q_value": 2.1153712760642542e-13,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.906309399008751,
        "mean_model2": 0.8264376425743103,
        "mean_delta_model1": 0.09369060099124908,
        "mean_delta_model2": 0.1735623574256897,
        "mean_delta_model2 / mean_delta_model1": 1.8525055404639879,
        "test_statistic": 367.0,
        "p_value": 1.1720436023075122e-13,
        "q_value": 8.333805133940012e-13,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8371026346087456,
        "mean_model2": 0.6889566028118134,
        "mean_delta_model1": 0.16289736539125443,
        "mean_delta_model2": 0.31104339718818663,
        "mean_delta_model2 / mean_delta_model1": 1.9094440013876726,
        "test_statistic": 376.0,
        "p_value": 1.479815745402138e-13,
        "q_value": 9.206939083181529e-13,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8944801944494247,
        "mean_model2": 0.8133673831820488,
        "mean_delta_model1": 0.10551980555057526,
        "mean_delta_model2": 0.1866326168179512,
        "mean_delta_model2 / mean_delta_model1": 1.7686975051190639,
        "test_statistic": 503.0,
        "p_value": 3.593963932707817e-12,
        "q_value": 1.9875992203550624e-11,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8895806133747101,
        "mean_model2": 0.8148437476158142,
        "mean_delta_model1": 0.11041938662528991,
        "mean_delta_model2": 0.1851562523841858,
        "mean_delta_model2 / mean_delta_model1": 1.6768455073248751,
        "test_statistic": 604.0,
        "p_value": 3.974582501006168e-11,
        "q_value": 1.978286233622891e-10,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.7382094451785087,
        "mean_model2": 0.6211692173779011,
        "mean_delta_model1": 0.7382094451785087,
        "mean_delta_model2": 0.6211692173779011,
        "mean_delta_model2 / mean_delta_model1": 0.8414539009693844,
        "test_statistic": 869.0,
        "p_value": 1.241946124416413e-08,
        "q_value": 5.619629506454898e-08,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5709415571670979,
        "mean_model2": 0.7328663312271237,
        "mean_delta_model1": 0.6106050681974738,
        "mean_delta_model2": 0.7328663312271237,
        "mean_delta_model2 / mean_delta_model1": 1.2002296892008597,
        "test_statistic": 1440.0,
        "p_value": 0.0001910376068031721,
        "q_value": 0.0007923831571142076,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8926843759417534,
        "mean_model2": 0.8694412750005722,
        "mean_delta_model1": 0.10731562405824661,
        "mean_delta_model2": 0.1305587249994278,
        "mean_delta_model2 / mean_delta_model1": 1.216586365174243,
        "test_statistic": 1469.0,
        "p_value": 0.0002824663114907604,
        "q_value": 0.0010814860435374152,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.962433015704155,
        "mean_model2": 0.9593369990587235,
        "mean_delta_model1": 0.037566984295845034,
        "mean_delta_model2": 0.04066300094127655,
        "mean_delta_model2 / mean_delta_model1": 1.0824132334139458,
        "test_statistic": 1751.0,
        "p_value": 0.007784802463816207,
        "q_value": 0.02767688275928083,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7068539950251579,
        "mean_model2": 0.6882101050019265,
        "mean_delta_model1": 0.7068539950251579,
        "mean_delta_model2": 0.6882101050019265,
        "mean_delta_model2 / mean_delta_model1": 0.9736241286680881,
        "test_statistic": 2079.0,
        "p_value": 0.1251544595595329,
        "q_value": 0.4152911563423476,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 800,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_800.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_800.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:00.572466",
    "comparisons": [
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.848747711777687,
        "mean_model2": 0.6562636724114418,
        "mean_delta_model1": 0.15125228822231293,
        "mean_delta_model2": 0.3437363275885582,
        "mean_delta_model2 / mean_delta_model1": 2.2726024950004677,
        "test_statistic": 77.0,
        "p_value": 3.861843481183515e-17,
        "q_value": 9.610860754931644e-16,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.6419921713322401,
        "mean_model2": -0.04582533801905811,
        "mean_delta_model1": 0.3580078286677599,
        "mean_delta_model2": 0.9541746619809419,
        "mean_delta_model2 / mean_delta_model1": 2.6652340691310403,
        "test_statistic": 72.0,
        "p_value": 3.334451023459934e-17,
        "q_value": 9.610860754931644e-16,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9363045537471771,
        "mean_model2": 0.8353920596837997,
        "mean_delta_model1": 0.06369544625282288,
        "mean_delta_model2": 0.16460794031620027,
        "mean_delta_model2 / mean_delta_model1": 2.584296837529498,
        "test_statistic": 94.0,
        "p_value": 6.348428415673147e-17,
        "q_value": 1.053276961516272e-15,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5929898209869862,
        "mean_model2": -0.06749663388356567,
        "mean_delta_model1": 0.40701017901301384,
        "mean_delta_model2": 0.9325033661164344,
        "mean_delta_model2 / mean_delta_model1": 2.2911057614768358,
        "test_statistic": 137.0,
        "p_value": 2.1986655925914448e-16,
        "q_value": 2.6737751540257946e-15,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.6624097067117691,
        "mean_model2": -0.27204743315465746,
        "mean_delta_model1": 0.33759029328823087,
        "mean_delta_model2": 0.7279525668453425,
        "mean_delta_model2 / mean_delta_model1": 2.156319601949647,
        "test_statistic": 144.0,
        "p_value": 2.6859459865305274e-16,
        "q_value": 2.6737751540257946e-15,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.9231107419729233,
        "mean_model2": 0.8332299935817719,
        "mean_delta_model1": 0.07688925802707672,
        "mean_delta_model2": 0.16677000641822814,
        "mean_delta_model2 / mean_delta_model1": 2.1689636588702643,
        "test_statistic": 202.0,
        "p_value": 1.3800602760081443e-15,
        "q_value": 1.1448390050634578e-14,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9272476628422737,
        "mean_model2": 0.8327114832401276,
        "mean_delta_model1": 0.07275233715772629,
        "mean_delta_model2": 0.16728851675987244,
        "mean_delta_model2 / mean_delta_model1": 2.299424641124487,
        "test_statistic": 215.0,
        "p_value": 1.9809780081603547e-15,
        "q_value": 1.4085725703485809e-14,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.8009300374984741,
        "mean_model2": 0.6240697754919529,
        "mean_delta_model1": 0.8009300374984741,
        "mean_delta_model2": 0.6240697754919529,
        "mean_delta_model2 / mean_delta_model1": 0.7791813844828386,
        "test_statistic": 280.0,
        "p_value": 1.1720978734947919e-14,
        "q_value": 7.292417150123377e-14,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.9063731968402863,
        "mean_model2": 0.8116651327908039,
        "mean_delta_model1": 0.09362680315971375,
        "mean_delta_model2": 0.1883348672091961,
        "mean_delta_model2 / mean_delta_model1": 2.0115486255354047,
        "test_statistic": 355.0,
        "p_value": 8.576083005057974e-14,
        "q_value": 4.7429012126203e-13,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8485346738994122,
        "mean_model2": 0.6965580269694328,
        "mean_delta_model1": 0.15146532610058785,
        "mean_delta_model2": 0.3034419730305672,
        "mean_delta_model2 / mean_delta_model1": 2.00337582760725,
        "test_statistic": 456.0,
        "p_value": 1.1281372528141459e-12,
        "q_value": 5.104660555404008e-12,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.9029164424538613,
        "mean_model2": 0.8116703408956528,
        "mean_delta_model1": 0.09708355754613876,
        "mean_delta_model2": 0.18832965910434724,
        "mean_delta_model2 / mean_delta_model1": 1.9398718368436794,
        "test_statistic": 454.0,
        "p_value": 1.0732517780223133e-12,
        "q_value": 5.104660555404008e-12,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7841247874498367,
        "mean_model2": 0.6818721994757653,
        "mean_delta_model1": 0.7841247874498367,
        "mean_delta_model2": 0.6818721994757653,
        "mean_delta_model2 / mean_delta_model1": 0.869596536660164,
        "test_statistic": 788.0,
        "p_value": 2.3383109695574695e-09,
        "q_value": 9.698814067963742e-09,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9074542362242937,
        "mean_model2": 0.8714113384485245,
        "mean_delta_model1": 0.0925457637757063,
        "mean_delta_model2": 0.12858866155147552,
        "mean_delta_model2 / mean_delta_model1": 1.3894602659838944,
        "test_statistic": 881.0,
        "p_value": 1.5803323839645013e-08,
        "q_value": 6.050659310088118e-08,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9629180398583412,
        "mean_model2": 0.9600126922130585,
        "mean_delta_model1": 0.037081960141658786,
        "mean_delta_model2": 0.039987307786941526,
        "mean_delta_model2 / mean_delta_model1": 1.0783493546237541,
        "test_statistic": 1317.0,
        "p_value": 3.274333506469467e-05,
        "q_value": 0.00011641058972858719,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6377518126182258,
        "mean_model2": 0.7195214996114373,
        "mean_delta_model1": 0.6896430052630603,
        "mean_delta_model2": 0.7200984850153327,
        "mean_delta_model2 / mean_delta_model1": 1.044161224749398,
        "test_statistic": 2437.0,
        "p_value": 0.7622156087305044,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 1600,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_1600.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_1600.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:00.572487",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.6092907363921404,
        "mean_model2": -0.12716221901588143,
        "mean_delta_model1": 0.3907092636078596,
        "mean_delta_model2": 0.8728377809841186,
        "mean_delta_model2 / mean_delta_model1": 2.233982816082276,
        "test_statistic": 62.0,
        "p_value": 2.4837256358933875e-17,
        "q_value": 1.0806556627750465e-15,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5816727208904922,
        "mean_model2": -0.060103613748215136,
        "mean_delta_model1": 0.4183272791095078,
        "mean_delta_model2": 0.9398963862517848,
        "mean_delta_model2 / mean_delta_model1": 2.2467967861253033,
        "test_statistic": 81.0,
        "p_value": 4.3422989190124274e-17,
        "q_value": 1.0806556627750465e-15,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9325891584157944,
        "mean_model2": 0.8467677932977676,
        "mean_delta_model1": 0.06741084158420563,
        "mean_delta_model2": 0.15323220670223237,
        "mean_delta_model2 / mean_delta_model1": 2.273109237344616,
        "test_statistic": 212.0,
        "p_value": 1.822758080253685e-15,
        "q_value": 3.024164354770099e-14,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8454413242638111,
        "mean_model2": 0.6875161136686802,
        "mean_delta_model1": 0.1545586757361889,
        "mean_delta_model2": 0.3124838863313198,
        "mean_delta_model2 / mean_delta_model1": 2.021781597460684,
        "test_statistic": 290.0,
        "p_value": 1.5340861010765875e-14,
        "q_value": 1.9089183670132338e-13,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9249880424141884,
        "mean_model2": 0.8402752876281738,
        "mean_delta_model1": 0.07501195758581161,
        "mean_delta_model2": 0.15972471237182617,
        "mean_delta_model2 / mean_delta_model1": 2.129323344069584,
        "test_statistic": 309.0,
        "p_value": 2.5500003530567317e-14,
        "q_value": 2.538445531277105e-13,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.9212935066223145,
        "mean_model2": 0.8403214406967163,
        "mean_delta_model1": 0.07870649337768555,
        "mean_delta_model2": 0.1596785593032837,
        "mean_delta_model2 / mean_delta_model1": 2.028785077960987,
        "test_statistic": 323.0,
        "p_value": 3.698169679402043e-14,
        "q_value": 2.629580111158335e-13,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.6462181920185686,
        "mean_model2": -0.3367601250298321,
        "mean_delta_model1": 0.3537818079814315,
        "mean_delta_model2": 0.6632398749701679,
        "mean_delta_model2 / mean_delta_model1": 1.874714470917562,
        "test_statistic": 322.0,
        "p_value": 3.601536165479944e-14,
        "q_value": 2.629580111158335e-13,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.8130274972319603,
        "mean_model2": 0.650706312134862,
        "mean_delta_model1": 0.8130274972319603,
        "mean_delta_model2": 0.650706312134862,
        "mean_delta_model2 / mean_delta_model1": 0.8003496983192594,
        "test_statistic": 337.0,
        "p_value": 5.3510998658976466e-14,
        "q_value": 3.329282760128503e-13,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.9016334283351898,
        "mean_model2": 0.8283761444687844,
        "mean_delta_model1": 0.09836657166481018,
        "mean_delta_model2": 0.17162385553121567,
        "mean_delta_model2 / mean_delta_model1": 1.744737593539744,
        "test_statistic": 573.0,
        "p_value": 1.924884776837515e-11,
        "q_value": 1.0645347458545605e-10,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8984165045619011,
        "mean_model2": 0.8285375213623047,
        "mean_delta_model1": 0.10158349543809891,
        "mean_delta_model2": 0.17146247863769531,
        "mean_delta_model2 / mean_delta_model1": 1.6878970141579541,
        "test_statistic": 599.0,
        "p_value": 3.538590981975653e-11,
        "q_value": 1.76127827873554e-10,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.83708411116153,
        "mean_model2": 0.7105161878466606,
        "mean_delta_model1": 0.16291588883846997,
        "mean_delta_model2": 0.2894838121533394,
        "mean_delta_model2 / mean_delta_model1": 1.7768912180220844,
        "test_statistic": 687.0,
        "p_value": 2.621715350632232e-10,
        "q_value": 1.1862888938810986e-09,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7958927756547928,
        "mean_model2": 0.7087896719574929,
        "mean_delta_model1": 0.7958927756547928,
        "mean_delta_model2": 0.7087896719574929,
        "mean_delta_model2 / mean_delta_model1": 0.8905592482283322,
        "test_statistic": 890.0,
        "p_value": 1.891299910025249e-08,
        "q_value": 7.844707745421459e-08,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9010970382392407,
        "mean_model2": 0.8760879856348037,
        "mean_delta_model1": 0.09890296176075936,
        "mean_delta_model2": 0.12391201436519622,
        "mean_delta_model2 / mean_delta_model1": 1.2528645468164274,
        "test_statistic": 1220.0,
        "p_value": 7.2228356931968e-06,
        "q_value": 2.7654257089032524e-05,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9618648284673691,
        "mean_model2": 0.9633785480260849,
        "mean_delta_model1": 0.03813517153263092,
        "mean_delta_model2": 0.0366214519739151,
        "mean_delta_model2 / mean_delta_model1": 0.9603064704345021,
        "test_statistic": 1513.0,
        "p_value": 0.0005021915186106702,
        "q_value": 0.0017854140612938822,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6017244151607156,
        "mean_model2": 0.7282713803648949,
        "mean_delta_model1": 0.6647565451636911,
        "mean_delta_model2": 0.7302746775746346,
        "mean_delta_model2 / mean_delta_model1": 1.0985595898041292,
        "test_statistic": 2135.0,
        "p_value": 0.17993776194288402,
        "q_value": 0.5970746986556144,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 3200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_3200.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_3200.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:00.572509",
    "comparisons": [
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.6187816611677408,
        "mean_model2": -0.1128516443958506,
        "mean_delta_model1": 0.3812183388322592,
        "mean_delta_model2": 0.8871483556041494,
        "mean_delta_model2 / mean_delta_model1": 2.327139765420639,
        "test_statistic": 21.0,
        "p_value": 7.333806979251821e-18,
        "q_value": 3.6502876423947805e-16,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9381322014331818,
        "mean_model2": 0.850104603767395,
        "mean_delta_model1": 0.06186779856681824,
        "mean_delta_model2": 0.14989539623260498,
        "mean_delta_model2 / mean_delta_model1": 2.4228338441801105,
        "test_statistic": 142.0,
        "p_value": 2.5367819502505057e-16,
        "q_value": 6.313217562615913e-15,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.8387656405568122,
        "mean_model2": 0.6388217556104064,
        "mean_delta_model1": 0.8387656405568122,
        "mean_delta_model2": 0.6388217556104064,
        "mean_delta_model2 / mean_delta_model1": 0.761621273835593,
        "test_statistic": 158.0,
        "p_value": 4.001557931101309e-16,
        "q_value": 6.639042772532895e-15,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.9308484864234924,
        "mean_model2": 0.8451215916872025,
        "mean_delta_model1": 0.06915151357650756,
        "mean_delta_model2": 0.15487840831279753,
        "mean_delta_model2 / mean_delta_model1": 2.2396965778839215,
        "test_statistic": 199.0,
        "p_value": 1.2692585134298068e-15,
        "q_value": 1.2635071197496551e-14,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.6731350625865161,
        "mean_model2": -0.3086862379475497,
        "mean_delta_model1": 0.3268649374134839,
        "mean_delta_model2": 0.6913137620524503,
        "mean_delta_model2 / mean_delta_model1": 2.114982926963313,
        "test_statistic": 193.0,
        "p_value": 1.0732910429952938e-15,
        "q_value": 1.2635071197496551e-14,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5864148696511984,
        "mean_model2": -0.14540832499973477,
        "mean_delta_model1": 0.4135851303488016,
        "mean_delta_model2": 0.8545916750002652,
        "mean_delta_model2 / mean_delta_model1": 2.066301741263125,
        "test_statistic": 267.0,
        "p_value": 8.246248576180102e-15,
        "q_value": 6.84073527771353e-14,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8504805979505181,
        "mean_model2": 0.6942364919185638,
        "mean_delta_model1": 0.14951940204948186,
        "mean_delta_model2": 0.30576350808143615,
        "mean_delta_model2 / mean_delta_model1": 2.0449754606445456,
        "test_statistic": 306.0,
        "p_value": 2.354049957607283e-14,
        "q_value": 1.6738450330375648e-13,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9292541517317295,
        "mean_model2": 0.8426375639438629,
        "mean_delta_model1": 0.07074584826827049,
        "mean_delta_model2": 0.1573624360561371,
        "mean_delta_model2 / mean_delta_model1": 2.2243345709760063,
        "test_statistic": 344.0,
        "p_value": 6.431315816976964e-14,
        "q_value": 4.00135848909472e-13,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.9104127806425094,
        "mean_model2": 0.8269416973739863,
        "mean_delta_model1": 0.08958721935749053,
        "mean_delta_model2": 0.17305830262601377,
        "mean_delta_model2 / mean_delta_model1": 1.9317298144441635,
        "test_statistic": 438.0,
        "p_value": 7.189401383931969e-13,
        "q_value": 3.976013352687285e-12,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8514876550808549,
        "mean_model2": 0.7092052149772644,
        "mean_delta_model1": 0.1485123449191451,
        "mean_delta_model2": 0.2907947850227356,
        "mean_delta_model2 / mean_delta_model1": 1.958051266250315,
        "test_statistic": 492.0,
        "p_value": 2.7466217414808055e-12,
        "q_value": 1.3670879844022078e-11,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.8262101083993911,
        "mean_model2": 0.7138943448662758,
        "mean_delta_model1": 0.8262101083993911,
        "mean_delta_model2": 0.7138943448662758,
        "mean_delta_model2 / mean_delta_model1": 0.864059078445913,
        "test_statistic": 601.0,
        "p_value": 3.7070625821242335e-11,
        "q_value": 1.6773930735980413e-10,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.9006912777572871,
        "mean_model2": 0.8280221843719482,
        "mean_delta_model1": 0.09930872224271298,
        "mean_delta_model2": 0.17197781562805176,
        "mean_delta_model2 / mean_delta_model1": 1.7317493543793032,
        "test_statistic": 606.0,
        "p_value": 4.1633293067648055e-11,
        "q_value": 1.726860001758368e-10,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9109495431184769,
        "mean_model2": 0.8822702980041504,
        "mean_delta_model1": 0.08905045688152313,
        "mean_delta_model2": 0.1177297019958496,
        "mean_delta_model2 / mean_delta_model1": 1.322056125466966,
        "test_statistic": 1029.0,
        "p_value": 2.6933086046598166e-07,
        "q_value": 1.0311940038109995e-06,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9612279242277145,
        "mean_model2": 0.9639307999610901,
        "mean_delta_model1": 0.03877207577228546,
        "mean_delta_model2": 0.03606920003890991,
        "mean_delta_model2 / mean_delta_model1": 0.93028808286536,
        "test_statistic": 1467.0,
        "p_value": 0.000275032659267615,
        "q_value": 0.0009778085829285758,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5968394887214526,
        "mean_model2": 0.742114825733006,
        "mean_delta_model1": 0.6793465176643804,
        "mean_delta_model2": 0.7429108519479632,
        "mean_delta_model2 / mean_delta_model1": 1.0935668802750613,
        "test_statistic": 2200.0,
        "p_value": 0.2637995846253631,
        "q_value": 0.8753474301056453,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 50,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_50.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_50.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:02.510383",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.36652338087558745,
        "mean_model2": 0.17102916828356685,
        "mean_delta_model1": 0.6334766191244126,
        "mean_delta_model2": 1.1710291682835667,
        "mean_delta_model2 / mean_delta_model1": 1.8485752006161742,
        "test_statistic": 100.0,
        "p_value": 7.559735353237344e-17,
        "q_value": 1.254246651512519e-15,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4064640519209206,
        "mean_model2": 0.21277562993578614,
        "mean_delta_model1": 0.5935359480790794,
        "mean_delta_model2": 1.2127756299357861,
        "mean_delta_model2 / mean_delta_model1": 2.043306111215024,
        "test_statistic": 71.0,
        "p_value": 3.2378376991430734e-17,
        "q_value": 1.254246651512519e-15,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.4720320870727301,
        "mean_model2": -0.01235572574660182,
        "mean_delta_model1": 0.52796791292727,
        "mean_delta_model2": 0.9876442742533982,
        "mean_delta_model2 / mean_delta_model1": 1.8706520795506199,
        "test_statistic": 100.0,
        "p_value": 7.559735353237344e-17,
        "q_value": 1.254246651512519e-15,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8196766093373299,
        "mean_model2": 0.6707700285315513,
        "mean_delta_model1": 0.18032339066267014,
        "mean_delta_model2": 0.3292299714684486,
        "mean_delta_model2 / mean_delta_model1": 1.8257751823463497,
        "test_statistic": 263.0,
        "p_value": 7.397715002246111e-15,
        "q_value": 9.2052421515368e-14,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7826671543717384,
        "mean_model2": 0.6275429347157478,
        "mean_delta_model1": 0.21733284562826158,
        "mean_delta_model2": 0.37245706528425215,
        "mean_delta_model2 / mean_delta_model1": 1.7137633485981398,
        "test_statistic": 400.0,
        "p_value": 2.743123083911266e-13,
        "q_value": 2.730693164709028e-12,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8884879565238952,
        "mean_model2": 0.7946413213014603,
        "mean_delta_model1": 0.11151204347610473,
        "mean_delta_model2": 0.20535867869853974,
        "mean_delta_model2 / mean_delta_model1": 1.8415829563964978,
        "test_statistic": 410.0,
        "p_value": 3.540540121284387e-13,
        "q_value": 2.9370807205340877e-12,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.8943710827827454,
        "mean_model2": 0.8201623994112015,
        "mean_delta_model1": 0.10562891721725463,
        "mean_delta_model2": 0.17983760058879852,
        "mean_delta_model2 / mean_delta_model1": 1.7025413620298078,
        "test_statistic": 493.0,
        "p_value": 2.8147507892817304e-12,
        "q_value": 2.0014259309376067e-11,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8897439941763878,
        "mean_model2": 0.8029209822416306,
        "mean_delta_model1": 0.11025600582361221,
        "mean_delta_model2": 0.19707901775836945,
        "mean_delta_model2 / mean_delta_model1": 1.7874674153683463,
        "test_statistic": 635.0,
        "p_value": 8.116041088360096e-11,
        "q_value": 5.049540534306425e-10,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8664501664042473,
        "mean_model2": 0.7990796828269958,
        "mean_delta_model1": 0.1335498335957527,
        "mean_delta_model2": 0.20092031717300415,
        "mean_delta_model2 / mean_delta_model1": 1.5044595097076485,
        "test_statistic": 686.0,
        "p_value": 2.564014863846255e-10,
        "q_value": 1.4179980767141222e-09,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8751442736387253,
        "mean_model2": 0.8087823301553726,
        "mean_delta_model1": 0.12485572636127472,
        "mean_delta_model2": 0.19121766984462737,
        "mean_delta_model2 / mean_delta_model1": 1.5315090097776685,
        "test_statistic": 698.0,
        "p_value": 3.346332605761628e-10,
        "q_value": 1.6655846810138642e-09,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7430514541268348,
        "mean_model2": 0.6576148796081543,
        "mean_delta_model1": 0.7430514541268348,
        "mean_delta_model2": 0.6576148796081543,
        "mean_delta_model2 / mean_delta_model1": 0.8850193024397246,
        "test_statistic": 1068.0,
        "p_value": 5.453214460509749e-07,
        "q_value": 2.467501953976235e-06,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6899956105649472,
        "mean_model2": 0.5950787249207496,
        "mean_delta_model1": 0.6899956105649472,
        "mean_delta_model2": 0.5950787249207496,
        "mean_delta_model2 / mean_delta_model1": 0.8624384210698348,
        "test_statistic": 1230.0,
        "p_value": 8.482458592651112e-06,
        "q_value": 3.5183425044999155e-05,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6477309703081846,
        "mean_model2": 0.7749024249613286,
        "mean_delta_model1": 0.6618481845408678,
        "mean_delta_model2": 0.7749024249613286,
        "mean_delta_model2 / mean_delta_model1": 1.1708159711866368,
        "test_statistic": 1358.0,
        "p_value": 6.007269113559776e-05,
        "q_value": 0.000230001860108574,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8889417380094529,
        "mean_model2": 0.8585129725933075,
        "mean_delta_model1": 0.11105826199054718,
        "mean_delta_model2": 0.1414870274066925,
        "mean_delta_model2 / mean_delta_model1": 1.2739892095442238,
        "test_statistic": 1444.0,
        "p_value": 0.00020173876107448913,
        "q_value": 0.0007172307922022856,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9633279949426651,
        "mean_model2": 0.9555481004714966,
        "mean_delta_model1": 0.0366720050573349,
        "mean_delta_model2": 0.04445189952850342,
        "mean_delta_model2 / mean_delta_model1": 1.2121480529631534,
        "test_statistic": 1889.0,
        "p_value": 0.028758982542069246,
        "q_value": 0.09542888968686063,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 100,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_100.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_100.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:02.510445",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4481902369391173,
        "mean_model2": 0.11971336530521512,
        "mean_delta_model1": 0.5518097630608827,
        "mean_delta_model2": 1.1197133653052151,
        "mean_delta_model2 / mean_delta_model1": 2.0291655571553817,
        "test_statistic": 13.0,
        "p_value": 5.7672036158171394e-18,
        "q_value": 2.1845263725618718e-16,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4656204093620181,
        "mean_model2": 0.12074767746031284,
        "mean_delta_model1": 0.5343795906379819,
        "mean_delta_model2": 1.120747677460313,
        "mean_delta_model2 / mean_delta_model1": 2.09728757814699,
        "test_statistic": 27.0,
        "p_value": 8.777880718979928e-18,
        "q_value": 2.1845263725618718e-16,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5301983192935587,
        "mean_model2": -0.11787714706733823,
        "mean_delta_model1": 0.4698016807064414,
        "mean_delta_model2": 0.8821228529326618,
        "mean_delta_model2 / mean_delta_model1": 1.8776494192319886,
        "test_statistic": 202.0,
        "p_value": 1.3800602760081443e-15,
        "q_value": 2.2896780101269155e-14,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8010369357466698,
        "mean_model2": 0.6433760640025139,
        "mean_delta_model1": 0.19896306425333024,
        "mean_delta_model2": 0.3566239359974861,
        "mean_delta_model2 / mean_delta_model1": 1.792412764328025,
        "test_statistic": 223.0,
        "p_value": 2.47205818888879e-15,
        "q_value": 3.076070683244977e-14,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8183139035105705,
        "mean_model2": 0.686120737195015,
        "mean_delta_model1": 0.18168609648942946,
        "mean_delta_model2": 0.31387926280498507,
        "mean_delta_model2 / mean_delta_model1": 1.7275909872566757,
        "test_statistic": 335.0,
        "p_value": 5.07669843779915e-14,
        "q_value": 5.053694383855644e-13,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9057852590084076,
        "mean_model2": 0.8288874435424805,
        "mean_delta_model1": 0.09421474099159241,
        "mean_delta_model2": 0.17111255645751952,
        "mean_delta_model2 / mean_delta_model1": 1.816197281408324,
        "test_statistic": 385.0,
        "p_value": 1.8666484415148118e-13,
        "q_value": 1.548491744700041e-12,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.892257282435894,
        "mean_model2": 0.82161856174469,
        "mean_delta_model1": 0.10774271756410599,
        "mean_delta_model2": 0.17838143825531005,
        "mean_delta_model2 / mean_delta_model1": 1.6556240856759032,
        "test_statistic": 521.0,
        "p_value": 5.5634556953558915e-12,
        "q_value": 3.955889998044619e-11,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8928372424840927,
        "mean_model2": 0.8075167548656463,
        "mean_delta_model1": 0.10716275751590729,
        "mean_delta_model2": 0.19248324513435364,
        "mean_delta_model2 / mean_delta_model1": 1.796176671786197,
        "test_statistic": 546.0,
        "p_value": 1.0143962707277701e-11,
        "q_value": 6.311248342785477e-11,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8726610094308853,
        "mean_model2": 0.8131767773628235,
        "mean_delta_model1": 0.12733899056911469,
        "mean_delta_model2": 0.18682322263717652,
        "mean_delta_model2 / mean_delta_model1": 1.4671329009458112,
        "test_statistic": 781.0,
        "p_value": 2.016919093053276e-09,
        "q_value": 1.1154332352610844e-08,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.874058330655098,
        "mean_model2": 0.8180751174688339,
        "mean_delta_model1": 0.12594166934490203,
        "mean_delta_model2": 0.18192488253116607,
        "mean_delta_model2 / mean_delta_model1": 1.4445170012233934,
        "test_statistic": 791.0,
        "p_value": 2.490848472018953e-09,
        "q_value": 1.2397808426390142e-08,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8968517458438874,
        "mean_model2": 0.856224809885025,
        "mean_delta_model1": 0.10314825415611267,
        "mean_delta_model2": 0.14377519011497497,
        "mean_delta_model2 / mean_delta_model1": 1.3938693513646319,
        "test_statistic": 1022.0,
        "p_value": 2.3686273843043898e-07,
        "q_value": 1.071770190102952e-06,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.63823157325387,
        "mean_model2": 0.7633471128344536,
        "mean_delta_model1": 0.6558975872397422,
        "mean_delta_model2": 0.7639814612269401,
        "mean_delta_model2 / mean_delta_model1": 1.1647877291972586,
        "test_statistic": 1171.0,
        "p_value": 3.2320821098517744e-06,
        "q_value": 1.3405985706758618e-05,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.67623041421175,
        "mean_model2": 0.6058803217113018,
        "mean_delta_model1": 0.67623041421175,
        "mean_delta_model2": 0.6058803217113018,
        "mean_delta_model2 / mean_delta_model1": 0.8959672753221962,
        "test_statistic": 1399.0,
        "p_value": 0.00010814379286022677,
        "q_value": 0.0004140529256614097,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7314479088783264,
        "mean_model2": 0.6788803699612618,
        "mean_delta_model1": 0.7314479088783264,
        "mean_delta_model2": 0.6788803699612618,
        "mean_delta_model2 / mean_delta_model1": 0.9281322179214692,
        "test_statistic": 1442.0,
        "p_value": 0.00019631968285300282,
        "q_value": 0.0006979646395546659,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9661470872163772,
        "mean_model2": 0.9642977565526962,
        "mean_delta_model1": 0.03385291278362274,
        "mean_delta_model2": 0.03570224344730377,
        "mean_delta_model2 / mean_delta_model1": 1.0546284059957078,
        "test_statistic": 1853.0,
        "p_value": 0.02085748626037175,
        "q_value": 0.06920991563504092,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_200.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_200.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:02.510471",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4837604131177068,
        "mean_model2": 0.06768653955310583,
        "mean_delta_model1": 0.5162395868822932,
        "mean_delta_model2": 1.0676865395531059,
        "mean_delta_model2 / mean_delta_model1": 2.0681996628758093,
        "test_statistic": 70.0,
        "p_value": 3.1439869904422714e-17,
        "q_value": 1.5648703179030467e-15,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.47045968018472195,
        "mean_model2": 0.03428235882427543,
        "mean_delta_model1": 0.529540319815278,
        "mean_delta_model2": 1.0342823588242753,
        "mean_delta_model2 / mean_delta_model1": 1.9531701744355723,
        "test_statistic": 97.0,
        "p_value": 6.928021197254875e-17,
        "q_value": 1.7241570601827127e-15,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5472050630953162,
        "mean_model2": -0.19173631826415657,
        "mean_delta_model1": 0.4527949369046837,
        "mean_delta_model2": 0.8082636817358434,
        "mean_delta_model2 / mean_delta_model1": 1.7850545928387627,
        "test_statistic": 194.0,
        "p_value": 1.103745435518733e-15,
        "q_value": 1.831240052641211e-14,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8211070868372917,
        "mean_model2": 0.6997049862146377,
        "mean_delta_model1": 0.17889291316270828,
        "mean_delta_model2": 0.3002950137853622,
        "mean_delta_model2 / mean_delta_model1": 1.6786300165632342,
        "test_statistic": 300.0,
        "p_value": 2.005534251506086e-14,
        "q_value": 2.4955582125979887e-13,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.8977004444599151,
        "mean_model2": 0.8292723709344864,
        "mean_delta_model1": 0.10229955554008484,
        "mean_delta_model2": 0.17072762906551361,
        "mean_delta_model2 / mean_delta_model1": 1.668899030539933,
        "test_statistic": 474.0,
        "p_value": 1.7635880957805926e-12,
        "q_value": 1.755596745459802e-11,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7866213303804398,
        "mean_model2": 0.6599368354678155,
        "mean_delta_model1": 0.21337866961956023,
        "mean_delta_model2": 0.3400631645321846,
        "mean_delta_model2 / mean_delta_model1": 1.5937073988627555,
        "test_statistic": 505.0,
        "p_value": 3.773462101902537e-12,
        "q_value": 3.130302837845954e-11,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8918347585201264,
        "mean_model2": 0.8154546048492193,
        "mean_delta_model1": 0.10816524147987366,
        "mean_delta_model2": 0.18454539515078067,
        "mean_delta_model2 / mean_delta_model1": 1.7061432362735407,
        "test_statistic": 605.0,
        "p_value": 4.067884888807611e-11,
        "q_value": 2.892465766962753e-10,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8890455746650696,
        "mean_model2": 0.8299850058555603,
        "mean_delta_model1": 0.11095442533493043,
        "mean_delta_model2": 0.17001499414443969,
        "mean_delta_model2 / mean_delta_model1": 1.5322957478372514,
        "test_statistic": 647.0,
        "p_value": 1.0667618241015043e-10,
        "q_value": 6.637050025506486e-10,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.866727951169014,
        "mean_model2": 0.820499284863472,
        "mean_delta_model1": 0.13327204883098603,
        "mean_delta_model2": 0.17950071513652802,
        "mean_delta_model2 / mean_delta_model1": 1.3468744324938577,
        "test_statistic": 815.0,
        "p_value": 4.114194036557029e-09,
        "q_value": 2.27530632264556e-08,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5819992932118475,
        "mean_model2": 0.7432416027411819,
        "mean_delta_model1": 0.5988613528199493,
        "mean_delta_model2": 0.7432416027411819,
        "mean_delta_model2 / mean_delta_model1": 1.241091279711685,
        "test_statistic": 934.0,
        "p_value": 4.490847904151786e-08,
        "q_value": 2.2352492579607177e-07,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8655536425113678,
        "mean_model2": 0.8254379969835282,
        "mean_delta_model1": 0.1344463574886322,
        "mean_delta_model2": 0.17456200301647187,
        "mean_delta_model2 / mean_delta_model1": 1.2983765888282361,
        "test_statistic": 1189.0,
        "p_value": 4.356599405924977e-06,
        "q_value": 1.9712992446306096e-05,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8923286634683609,
        "mean_model2": 0.8651341861486435,
        "mean_delta_model1": 0.1076713365316391,
        "mean_delta_model2": 0.13486581385135651,
        "mean_delta_model2 / mean_delta_model1": 1.2525693299230696,
        "test_statistic": 1487.0,
        "p_value": 0.00035837215484588663,
        "q_value": 0.0014864510932194642,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6690796260535717,
        "mean_model2": 0.6275103187561035,
        "mean_delta_model1": 0.6690796260535717,
        "mean_delta_model2": 0.6275103187561035,
        "mean_delta_model2 / mean_delta_model1": 0.9378709115047245,
        "test_statistic": 1917.0,
        "p_value": 0.036572773348455045,
        "q_value": 0.14002711948573401,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7044027525186539,
        "mean_model2": 0.6710619619488716,
        "mean_delta_model1": 0.7044027525186539,
        "mean_delta_model2": 0.6710619619488716,
        "mean_delta_model2 / mean_delta_model1": 0.9526680007274682,
        "test_statistic": 1944.0,
        "p_value": 0.045752803008419725,
        "q_value": 0.1626624401400354,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9603780847787857,
        "mean_model2": 0.9625260180234909,
        "mean_delta_model1": 0.0396219152212143,
        "mean_delta_model2": 0.03747398197650909,
        "mean_delta_model2 / mean_delta_model1": 0.9457892625151254,
        "test_statistic": 2086.0,
        "p_value": 0.1311903088113828,
        "q_value": 0.4353194863285955,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 400,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_400.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_400.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:02.510494",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5250386214349418,
        "mean_model2": -0.03048547205515206,
        "mean_delta_model1": 0.4749613785650581,
        "mean_delta_model2": 0.9695145279448479,
        "mean_delta_model2 / mean_delta_model1": 2.041249187194803,
        "test_statistic": 99.0,
        "p_value": 7.343095847008848e-17,
        "q_value": 3.654911030940626e-15,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5139625691249967,
        "mean_model2": -0.03185874013230205,
        "mean_delta_model1": 0.48603743087500334,
        "mean_delta_model2": 0.968141259867698,
        "mean_delta_model2 / mean_delta_model1": 1.9919067922912292,
        "test_statistic": 127.0,
        "p_value": 1.6501804744705306e-16,
        "q_value": 4.106757520836369e-15,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8049311187863349,
        "mean_model2": 0.6503865107893944,
        "mean_delta_model1": 0.195068881213665,
        "mean_delta_model2": 0.34961348921060564,
        "mean_delta_model2 / mean_delta_model1": 1.7922565969282571,
        "test_statistic": 227.0,
        "p_value": 2.7607481043139136e-15,
        "q_value": 4.580397201368205e-14,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8273078039288521,
        "mean_model2": 0.6889566028118134,
        "mean_delta_model1": 0.1726921960711479,
        "mean_delta_model2": 0.31104339718818663,
        "mean_delta_model2 / mean_delta_model1": 1.8011433305303446,
        "test_statistic": 279.0,
        "p_value": 1.1408997548745529e-14,
        "q_value": 1.1357299934977781e-13,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5761615025252104,
        "mean_model2": -0.22647322855889798,
        "mean_delta_model1": 0.42383849747478963,
        "mean_delta_model2": 0.773526771441102,
        "mean_delta_model2 / mean_delta_model1": 1.8250507588379516,
        "test_statistic": 278.0,
        "p_value": 1.1105191188694949e-14,
        "q_value": 1.1357299934977781e-13,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9045916336774826,
        "mean_model2": 0.8351600351929664,
        "mean_delta_model1": 0.0954083663225174,
        "mean_delta_model2": 0.16483996480703353,
        "mean_delta_model2 / mean_delta_model1": 1.7277307133612405,
        "test_statistic": 400.0,
        "p_value": 2.743123083911266e-13,
        "q_value": 2.275577637257523e-12,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8945431417226791,
        "mean_model2": 0.8133673831820488,
        "mean_delta_model1": 0.10545685827732086,
        "mean_delta_model2": 0.1866326168179512,
        "mean_delta_model2 / mean_delta_model1": 1.769753241910182,
        "test_statistic": 507.0,
        "p_value": 3.961741310694399e-12,
        "q_value": 2.8169924744612545e-11,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8884298619627953,
        "mean_model2": 0.8223720097541809,
        "mean_delta_model1": 0.11157013803720474,
        "mean_delta_model2": 0.1776279902458191,
        "mean_delta_model2 / mean_delta_model1": 1.5920746659521599,
        "test_statistic": 561.0,
        "p_value": 1.4494881762402344e-11,
        "q_value": 9.018250672018055e-11,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8712435624003411,
        "mean_model2": 0.8148437476158142,
        "mean_delta_model1": 0.12875643759965896,
        "mean_delta_model2": 0.1851562523841858,
        "mean_delta_model2 / mean_delta_model1": 1.4380349117757527,
        "test_statistic": 666.0,
        "p_value": 1.6389504642151865e-10,
        "q_value": 9.064021581374916e-10,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8759689223766327,
        "mean_model2": 0.8264376425743103,
        "mean_delta_model1": 0.1240310776233673,
        "mean_delta_model2": 0.1735623574256897,
        "mean_delta_model2 / mean_delta_model1": 1.3993457184394467,
        "test_statistic": 936.0,
        "p_value": 4.6684206788440657e-08,
        "q_value": 2.323633327369524e-07,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5761524818185717,
        "mean_model2": 0.7328663312271237,
        "mean_delta_model1": 0.5972943781409412,
        "mean_delta_model2": 0.7328663312271237,
        "mean_delta_model2 / mean_delta_model1": 1.226976777360848,
        "test_statistic": 1065.0,
        "p_value": 5.168378159148738e-07,
        "q_value": 2.338617580317114e-06,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8950811213254929,
        "mean_model2": 0.8694412750005722,
        "mean_delta_model1": 0.10491887867450714,
        "mean_delta_model2": 0.1305587249994278,
        "mean_delta_model2 / mean_delta_model1": 1.2443778150209164,
        "test_statistic": 1442.0,
        "p_value": 0.00019631968285300282,
        "q_value": 0.0008142920794804436,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7309173017740249,
        "mean_model2": 0.6882101050019265,
        "mean_delta_model1": 0.7309173017740249,
        "mean_delta_model2": 0.6882101050019265,
        "mean_delta_model2 / mean_delta_model1": 0.9415704120446419,
        "test_statistic": 1572.0,
        "p_value": 0.0010501913014833856,
        "q_value": 0.004020894490406608,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6787181308865547,
        "mean_model2": 0.6211692173779011,
        "mean_delta_model1": 0.6787181308865547,
        "mean_delta_model2": 0.6211692173779011,
        "mean_delta_model2 / mean_delta_model1": 0.9152094059525385,
        "test_statistic": 1654.0,
        "p_value": 0.0027464200766716255,
        "q_value": 0.00976419720642641,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9601987540721894,
        "mean_model2": 0.9593369990587235,
        "mean_delta_model1": 0.03980124592781067,
        "mean_delta_model2": 0.04066300094127655,
        "mean_delta_model2 / mean_delta_model1": 1.021651458223918,
        "test_statistic": 1857.0,
        "p_value": 0.02163012475488432,
        "q_value": 0.07177370708881732,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 800,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_800.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_800.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:02.510516",
    "comparisons": [
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8223814857006073,
        "mean_model2": 0.6965580269694328,
        "mean_delta_model1": 0.1776185142993927,
        "mean_delta_model2": 0.3034419730305672,
        "mean_delta_model2 / mean_delta_model1": 1.708391572958927,
        "test_statistic": 319.0,
        "p_value": 3.3262914681756384e-14,
        "q_value": 5.518698394815319e-13,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4950790704600513,
        "mean_model2": -0.06749663388356567,
        "mean_delta_model1": 0.5049209295399487,
        "mean_delta_model2": 0.9325033661164344,
        "mean_delta_model2 / mean_delta_model1": 1.846830486836961,
        "test_statistic": 314.0,
        "p_value": 2.912818606597037e-14,
        "q_value": 5.518698394815319e-13,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4359205562621355,
        "mean_model2": -0.04582533801905811,
        "mean_delta_model1": 0.5640794437378644,
        "mean_delta_model2": 0.9541746619809419,
        "mean_delta_model2 / mean_delta_model1": 1.6915607767198837,
        "test_statistic": 319.0,
        "p_value": 3.3262914681756384e-14,
        "q_value": 5.518698394815319e-13,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5520901354588568,
        "mean_model2": -0.27204743315465746,
        "mean_delta_model1": 0.4479098645411432,
        "mean_delta_model2": 0.7279525668453425,
        "mean_delta_model2 / mean_delta_model1": 1.6252211091423188,
        "test_statistic": 416.0,
        "p_value": 4.124029919832563e-13,
        "q_value": 5.131678368349594e-12,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7837818533182144,
        "mean_model2": 0.6562636724114418,
        "mean_delta_model1": 0.21621814668178557,
        "mean_delta_model2": 0.3437363275885582,
        "mean_delta_model2 / mean_delta_model1": 1.5897663210222812,
        "test_statistic": 447.0,
        "p_value": 9.010141543147884e-13,
        "q_value": 8.9693138704711e-12,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9015114814043045,
        "mean_model2": 0.8353920596837997,
        "mean_delta_model1": 0.09848851859569549,
        "mean_delta_model2": 0.16460794031620027,
        "mean_delta_model2 / mean_delta_model1": 1.6713414178959392,
        "test_statistic": 484.0,
        "p_value": 2.256776902419375e-12,
        "q_value": 1.8721256372143723e-11,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8889157819747925,
        "mean_model2": 0.8116651327908039,
        "mean_delta_model1": 0.11108421802520752,
        "mean_delta_model2": 0.1883348672091961,
        "mean_delta_model2 / mean_delta_model1": 1.6954241615713463,
        "test_statistic": 511.0,
        "p_value": 4.366343409248429e-12,
        "q_value": 3.104684420349099e-11,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8823373070359231,
        "mean_model2": 0.8327114832401276,
        "mean_delta_model1": 0.117662692964077,
        "mean_delta_model2": 0.16728851675987244,
        "mean_delta_model2 / mean_delta_model1": 1.421763454036756,
        "test_statistic": 701.0,
        "p_value": 3.5757592915601463e-10,
        "q_value": 2.224722778886783e-09,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8683306187391281,
        "mean_model2": 0.8116703408956528,
        "mean_delta_model1": 0.13166938126087188,
        "mean_delta_model2": 0.18832965910434724,
        "mean_delta_model2 / mean_delta_model1": 1.4303223520980657,
        "test_statistic": 723.0,
        "p_value": 5.796713177452411e-10,
        "q_value": 3.2058036218091923e-09,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5661799089238048,
        "mean_model2": 0.7195214996114373,
        "mean_delta_model1": 0.5870349088683724,
        "mean_delta_model2": 0.7200984850153327,
        "mean_delta_model2 / mean_delta_model1": 1.2266706359992579,
        "test_statistic": 1047.0,
        "p_value": 3.73788105734042e-07,
        "q_value": 1.8604717946562642e-06,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8622986376285553,
        "mean_model2": 0.8332299935817719,
        "mean_delta_model1": 0.1377013623714447,
        "mean_delta_model2": 0.16677000641822814,
        "mean_delta_model2 / mean_delta_model1": 1.2110991753906672,
        "test_statistic": 1409.0,
        "p_value": 0.00012446005056465924,
        "q_value": 0.0005631640203850895,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8888232693076134,
        "mean_model2": 0.8714113384485245,
        "mean_delta_model1": 0.11117673069238662,
        "mean_delta_model2": 0.12858866155147552,
        "mean_delta_model2 / mean_delta_model1": 1.1566148847033983,
        "test_statistic": 1648.0,
        "p_value": 0.00256631337533024,
        "q_value": 0.010644519309665203,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9616484326124192,
        "mean_model2": 0.9600126922130585,
        "mean_delta_model1": 0.03835156738758087,
        "mean_delta_model2": 0.039987307786941526,
        "mean_delta_model2 / mean_delta_model1": 1.0426512007405033,
        "test_statistic": 1978.0,
        "p_value": 0.060004071567819,
        "q_value": 0.2297391346016821,
        "significant": false
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6237460194900631,
        "mean_model2": 0.6240697754919529,
        "mean_delta_model1": 0.6237460194900631,
        "mean_delta_model2": 0.6240697754919529,
        "mean_delta_model2 / mean_delta_model1": 1.0005190510107855,
        "test_statistic": 2480.0,
        "p_value": 0.8770384213786963,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6905121773481369,
        "mean_model2": 0.6818721994757653,
        "mean_delta_model1": 0.6905121773481369,
        "mean_delta_model2": 0.6818721994757653,
        "mean_delta_model2 / mean_delta_model1": 0.987487580732388,
        "test_statistic": 2232.0,
        "p_value": 0.313728575022711,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 1600,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_1600.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_1600.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:02.510537",
    "comparisons": [
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5729679375886917,
        "mean_model2": -0.060103613748215136,
        "mean_delta_model1": 0.4270320624113083,
        "mean_delta_model2": 0.9398963862517848,
        "mean_delta_model2 / mean_delta_model1": 2.2009972294457283,
        "test_statistic": 29.0,
        "p_value": 9.318997235593412e-18,
        "q_value": 4.638385022245035e-16,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5476197991915979,
        "mean_model2": -0.12716221901588143,
        "mean_delta_model1": 0.45238020080840213,
        "mean_delta_model2": 0.8728377809841186,
        "mean_delta_model2 / mean_delta_model1": 1.9294340897863345,
        "test_statistic": 140.0,
        "p_value": 2.3957899977091284e-16,
        "q_value": 5.96233487406484e-15,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8314262241125107,
        "mean_model2": 0.7105161878466606,
        "mean_delta_model1": 0.1685737758874893,
        "mean_delta_model2": 0.2894838121533394,
        "mean_delta_model2 / mean_delta_model1": 1.717252939428424,
        "test_statistic": 318.0,
        "p_value": 3.23922440721765e-14,
        "q_value": 5.374244171802303e-13,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9110442441701889,
        "mean_model2": 0.8467677932977676,
        "mean_delta_model1": 0.08895575582981109,
        "mean_delta_model2": 0.15323220670223237,
        "mean_delta_model2 / mean_delta_model1": 1.7225665194211162,
        "test_statistic": 453.0,
        "p_value": 1.0468004235982917e-12,
        "q_value": 1.3025713183905913e-11,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5874424312636256,
        "mean_model2": -0.3367601250298321,
        "mean_delta_model1": 0.4125575687363744,
        "mean_delta_model2": 0.6632398749701679,
        "mean_delta_model2 / mean_delta_model1": 1.6076298806045668,
        "test_statistic": 479.0,
        "p_value": 1.9952894679179096e-12,
        "q_value": 1.986248208698898e-11,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7972089719772338,
        "mean_model2": 0.6875161136686802,
        "mean_delta_model1": 0.2027910280227661,
        "mean_delta_model2": 0.3124838863313198,
        "mean_delta_model2 / mean_delta_model1": 1.5409157366480688,
        "test_statistic": 567.0,
        "p_value": 1.670707085341673e-11,
        "q_value": 1.3859471724434614e-10,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8946152561903,
        "mean_model2": 0.8283761444687844,
        "mean_delta_model1": 0.10538474380970002,
        "mean_delta_model2": 0.17162385553121567,
        "mean_delta_model2 / mean_delta_model1": 1.6285455496397834,
        "test_statistic": 729.0,
        "p_value": 6.60666275281221e-10,
        "q_value": 4.697661406043046e-09,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8931651628017425,
        "mean_model2": 0.8402752876281738,
        "mean_delta_model1": 0.10683483719825744,
        "mean_delta_model2": 0.15972471237182617,
        "mean_delta_model2 / mean_delta_model1": 1.4950620655266127,
        "test_statistic": 748.0,
        "p_value": 9.96915585881586e-10,
        "q_value": 6.202489126639026e-09,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8733972808718682,
        "mean_model2": 0.8285375213623047,
        "mean_delta_model1": 0.12660271912813187,
        "mean_delta_model2": 0.17146247863769531,
        "mean_delta_model2 / mean_delta_model1": 1.3543348817347427,
        "test_statistic": 898.0,
        "p_value": 2.2169944511644565e-08,
        "q_value": 1.2260825442802833e-07,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8785033768415451,
        "mean_model2": 0.8403214406967163,
        "mean_delta_model1": 0.1214966231584549,
        "mean_delta_model2": 0.1596785593032837,
        "mean_delta_model2 / mean_delta_model1": 1.3142633527767453,
        "test_statistic": 1166.0,
        "p_value": 2.9728975849282753e-06,
        "q_value": 1.4797132440314187e-05,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5992224165331572,
        "mean_model2": 0.7282713803648949,
        "mean_delta_model1": 0.6271433152537793,
        "mean_delta_model2": 0.7302746775746346,
        "mean_delta_model2 / mean_delta_model1": 1.1644462434860239,
        "test_statistic": 1316.0,
        "p_value": 3.225460839514926e-05,
        "q_value": 0.00014594751373913397,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9012007176876068,
        "mean_model2": 0.8760879856348037,
        "mean_delta_model1": 0.09879928231239318,
        "mean_delta_model2": 0.12391201436519622,
        "mean_delta_model2 / mean_delta_model1": 1.2541792962969018,
        "test_statistic": 1363.0,
        "p_value": 6.460325535698146e-05,
        "q_value": 0.0002679604937281402,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7497611582279206,
        "mean_model2": 0.7087896719574929,
        "mean_delta_model1": 0.7497611582279206,
        "mean_delta_model2": 0.7087896719574929,
        "mean_delta_model2 / mean_delta_model1": 0.9453539492933125,
        "test_statistic": 1711.0,
        "p_value": 0.0051293465137661285,
        "q_value": 0.01963886113649634,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6919266555458308,
        "mean_model2": 0.650706312134862,
        "mean_delta_model1": 0.6919266555458308,
        "mean_delta_model2": 0.650706312134862,
        "mean_delta_model2 / mean_delta_model1": 0.9404267156344022,
        "test_statistic": 1861.0,
        "p_value": 0.022427558989963504,
        "q_value": 0.0797354748084112,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9620287293195724,
        "mean_model2": 0.9633785480260849,
        "mean_delta_model1": 0.03797127068042755,
        "mean_delta_model2": 0.0366214519739151,
        "mean_delta_model2 / mean_delta_model1": 0.9644515792512517,
        "test_statistic": 1995.0,
        "p_value": 0.0684076330889175,
        "q_value": 0.22699219147381708,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "snowflake-arctic-embed-l-v2.0",
    "projection_dimension": 3200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_3200.json",
      "model2": "models/snowflake-arctic-embed-l-v2.0_propositions_a->b_3200.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:02.510557",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5349437871575355,
        "mean_model2": -0.14540832499973477,
        "mean_delta_model1": 0.46505621284246446,
        "mean_delta_model2": 0.8545916750002652,
        "mean_delta_model2 / mean_delta_model1": 1.8376094145198616,
        "test_statistic": 108.0,
        "p_value": 9.535497985106282e-17,
        "q_value": 2.3730724409292234e-15,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4948650293983519,
        "mean_model2": -0.1128516443958506,
        "mean_delta_model1": 0.5051349706016481,
        "mean_delta_model2": 0.8871483556041494,
        "mean_delta_model2 / mean_delta_model1": 1.756260024023874,
        "test_statistic": 106.0,
        "p_value": 8.998385137455786e-17,
        "q_value": 2.3730724409292234e-15,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5873328913748265,
        "mean_model2": -0.3086862379475497,
        "mean_delta_model1": 0.4126671086251736,
        "mean_delta_model2": 0.6913137620524503,
        "mean_delta_model2 / mean_delta_model1": 1.6752334935430293,
        "test_statistic": 298.0,
        "p_value": 1.901053991779918e-14,
        "q_value": 3.154066236608918e-13,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8229858508706093,
        "mean_model2": 0.7092052149772644,
        "mean_delta_model1": 0.17701414912939073,
        "mean_delta_model2": 0.2907947850227356,
        "mean_delta_model2 / mean_delta_model1": 1.6427770686860488,
        "test_statistic": 452.0,
        "p_value": 1.0209891323098914e-12,
        "q_value": 1.270453402725898e-11,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7929314923286438,
        "mean_model2": 0.6942364919185638,
        "mean_delta_model1": 0.2070685076713562,
        "mean_delta_model2": 0.30576350808143615,
        "mean_delta_model2 / mean_delta_model1": 1.47662969864409,
        "test_statistic": 587.0,
        "p_value": 2.6743020071878613e-11,
        "q_value": 2.6621839370703763e-10,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9100125628709793,
        "mean_model2": 0.850104603767395,
        "mean_delta_model1": 0.08998743712902069,
        "mean_delta_model2": 0.14989539623260498,
        "mean_delta_model2 / mean_delta_model1": 1.665736918562204,
        "test_statistic": 628.0,
        "p_value": 6.914403547593268e-11,
        "q_value": 5.735893580627348e-10,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8938222086429596,
        "mean_model2": 0.8269416973739863,
        "mean_delta_model1": 0.1061777913570404,
        "mean_delta_model2": 0.17305830262601377,
        "mean_delta_model2 / mean_delta_model1": 1.6298917166592453,
        "test_statistic": 652.0,
        "p_value": 1.1948733160530284e-10,
        "q_value": 8.496135601206062e-10,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.889600751399994,
        "mean_model2": 0.8426375639438629,
        "mean_delta_model1": 0.1103992486000061,
        "mean_delta_model2": 0.1573624360561371,
        "mean_delta_model2 / mean_delta_model1": 1.4253940860257666,
        "test_statistic": 852.0,
        "p_value": 8.80282944882976e-09,
        "q_value": 5.476838231166921e-08,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8682697361707687,
        "mean_model2": 0.8280221843719482,
        "mean_delta_model1": 0.13173026382923125,
        "mean_delta_model2": 0.17197781562805176,
        "mean_delta_model2 / mean_delta_model1": 1.3055300325747126,
        "test_statistic": 1121.0,
        "p_value": 1.3832149770917264e-06,
        "q_value": 7.649706734757241e-06,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6040639822650701,
        "mean_model2": 0.742114825733006,
        "mean_delta_model1": 0.6252481802832335,
        "mean_delta_model2": 0.7429108519479632,
        "mean_delta_model2 / mean_delta_model1": 1.1881855483552617,
        "test_statistic": 1197.0,
        "p_value": 4.968927191066459e-06,
        "q_value": 2.4732057405960942e-05,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8699245226383209,
        "mean_model2": 0.8451215916872025,
        "mean_delta_model1": 0.13007547736167907,
        "mean_delta_model2": 0.15487840831279753,
        "mean_delta_model2 / mean_delta_model1": 1.1906810680552269,
        "test_statistic": 1574.0,
        "p_value": 0.001076056967908346,
        "q_value": 0.004869004676290302,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8979895907640457,
        "mean_model2": 0.8822702980041504,
        "mean_delta_model1": 0.10201040923595428,
        "mean_delta_model2": 0.1177297019958496,
        "mean_delta_model2 / mean_delta_model1": 1.154094987733418,
        "test_statistic": 1749.0,
        "p_value": 0.007627245069920197,
        "q_value": 0.03163618216184012,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6802295897901058,
        "mean_model2": 0.6388217556104064,
        "mean_delta_model1": 0.6802295897901058,
        "mean_delta_model2": 0.6388217556104064,
        "mean_delta_model2 / mean_delta_model1": 0.9391266789901387,
        "test_statistic": 1950.0,
        "p_value": 0.04803747233781293,
        "q_value": 0.18392230785319225,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7239353622496129,
        "mean_model2": 0.7138943448662758,
        "mean_delta_model1": 0.7239353622496129,
        "mean_delta_model2": 0.7138943448662758,
        "mean_delta_model2 / mean_delta_model1": 0.9861299531602727,
        "test_statistic": 2086.0,
        "p_value": 0.1311903088113828,
        "q_value": 0.4664137353520666,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9622382885217666,
        "mean_model2": 0.9639307999610901,
        "mean_delta_model1": 0.03776171147823334,
        "mean_delta_model2": 0.03606920003890991,
        "mean_delta_model2 / mean_delta_model1": 0.9551791650042392,
        "test_statistic": 2295.0,
        "p_value": 0.42905212231325207,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "bge-m3",
    "projection_dimension": 50,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 7,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_50.json",
      "model2": "models/bge-m3_propositions_a->b_50.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:04.581114",
    "comparisons": [
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5883494353294373,
        "mean_model2": 0.7030336222052574,
        "mean_delta_model1": 0.5883494353294373,
        "mean_delta_model2": 0.7030336222052574,
        "mean_delta_model2 / mean_delta_model1": 1.1949252943730702,
        "test_statistic": 819.0,
        "p_value": 4.470203127454593e-09,
        "q_value": 2.2249736434714127e-07,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7038766911625862,
        "mean_model2": 0.7693505817651749,
        "mean_delta_model1": 0.7038766911625862,
        "mean_delta_model2": 0.7693505817651749,
        "mean_delta_model2 / mean_delta_model1": 1.0930189782168325,
        "test_statistic": 1216.0,
        "p_value": 6.770862440571827e-06,
        "q_value": 0.00016850454044602993,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.41884719345718624,
        "mean_model2": -0.29622286692261696,
        "mean_delta_model1": 0.5811528065428138,
        "mean_delta_model2": 0.703777133077383,
        "mean_delta_model2 / mean_delta_model1": 1.21100186586733,
        "test_statistic": 1278.0,
        "p_value": 1.8062120582409198e-05,
        "q_value": 0.00022475344573405633,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6274809923022986,
        "mean_model2": 0.6995121742784977,
        "mean_delta_model1": 0.6369218497723341,
        "mean_delta_model2": 0.7163089196383953,
        "mean_delta_model2 / mean_delta_model1": 1.1246417749594206,
        "test_statistic": 1267.0,
        "p_value": 1.52246587827266e-05,
        "q_value": 0.00022475344573405633,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9007420587539673,
        "mean_model2": 0.8712864607572556,
        "mean_delta_model1": 0.09925794124603271,
        "mean_delta_model2": 0.12871353924274445,
        "mean_delta_model2 / mean_delta_model1": 1.2967580994219852,
        "test_statistic": 1424.0,
        "p_value": 0.00015334083624479915,
        "q_value": 0.0015264600260204155,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9462025570869446,
        "mean_model2": 0.9541554898023605,
        "mean_delta_model1": 0.05379744291305542,
        "mean_delta_model2": 0.045844510197639465,
        "mean_delta_model2 / mean_delta_model1": 0.8521689454967393,
        "test_statistic": 1636.0,
        "p_value": 0.002238099402043779,
        "q_value": 0.018566315813975352,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8518508130311966,
        "mean_model2": 0.8743390291929245,
        "mean_delta_model1": 0.14814918696880341,
        "mean_delta_model2": 0.1256609708070755,
        "mean_delta_model2 / mean_delta_model1": 0.8482056052966165,
        "test_statistic": 1654.0,
        "p_value": 0.0027464200766716255,
        "q_value": 0.01952839441285282,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8987804532051087,
        "mean_model2": 0.8861065673828125,
        "mean_delta_model1": 0.10121954679489135,
        "mean_delta_model2": 0.1138934326171875,
        "mean_delta_model2 / mean_delta_model1": 1.1252118412264598,
        "test_statistic": 1995.0,
        "p_value": 0.0684076330889175,
        "q_value": 0.42561035901340705,
        "significant": false
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.7675900214910507,
        "mean_model2": 0.7458646422624589,
        "mean_delta_model1": 0.2324099785089493,
        "mean_delta_model2": 0.2541353577375412,
        "mean_delta_model2 / mean_delta_model1": 1.0934786852439529,
        "test_statistic": 2042.0,
        "p_value": 0.09677152330659382,
        "q_value": 0.5351834572581248,
        "significant": false
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8362160032987594,
        "mean_model2": 0.8465324777364731,
        "mean_delta_model1": 0.16378399670124055,
        "mean_delta_model2": 0.15346752226352692,
        "mean_delta_model2 / mean_delta_model1": 0.9370117066044493,
        "test_statistic": 2069.0,
        "p_value": 0.11691004377816779,
        "q_value": 0.5819014452965809,
        "significant": false
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7419095650315285,
        "mean_model2": 0.7287274929881096,
        "mean_delta_model1": 0.2580904349684715,
        "mean_delta_model2": 0.27127250701189043,
        "mean_delta_model2 / mean_delta_model1": 1.051075399384829,
        "test_statistic": 2098.0,
        "p_value": 0.14205948812602387,
        "q_value": 0.6427989712677851,
        "significant": false
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8725327450037003,
        "mean_model2": 0.8588985985517502,
        "mean_delta_model1": 0.12746725499629974,
        "mean_delta_model2": 0.1411014014482498,
        "mean_delta_model2 / mean_delta_model1": 1.1069619523252923,
        "test_statistic": 2125.0,
        "p_value": 0.16902917920250482,
        "q_value": 0.7010969039143133,
        "significant": false
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8501347251236439,
        "mean_model2": 0.8454501834511757,
        "mean_delta_model1": 0.14986527487635612,
        "mean_delta_model2": 0.1545498165488243,
        "mean_delta_model2 / mean_delta_model1": 1.0312583530529877,
        "test_statistic": 2439.0,
        "p_value": 0.7674623006604582,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.3898035565204918,
        "mean_model2": -0.3671434297040105,
        "mean_delta_model1": 0.6101964434795082,
        "mean_delta_model2": 0.6328565702959895,
        "mean_delta_model2 / mean_delta_model1": 1.0371357897257922,
        "test_statistic": 2280.0,
        "p_value": 0.39956957160546047,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.4778680958226323,
        "mean_model2": -0.4421729109436274,
        "mean_delta_model1": 0.5221319041773677,
        "mean_delta_model2": 0.5578270890563727,
        "mean_delta_model2 / mean_delta_model1": 1.0683643052520295,
        "test_statistic": 2344.0,
        "p_value": 0.5337204302967387,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "bge-m3",
    "projection_dimension": 100,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_100.json",
      "model2": "models/bge-m3_propositions_a->b_100.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:04.581175",
    "comparisons": [
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9084970563650131,
        "mean_model2": 0.863963331580162,
        "mean_delta_model1": 0.09150294363498687,
        "mean_delta_model2": 0.13603666841983794,
        "mean_delta_model2 / mean_delta_model1": 1.4866917173998242,
        "test_statistic": 703.0,
        "p_value": 3.737167999465526e-10,
        "q_value": 1.8601168812591154e-08,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5860802850499749,
        "mean_model2": 0.7031710910797119,
        "mean_delta_model1": 0.5860802850499749,
        "mean_delta_model2": 0.7031710910797119,
        "mean_delta_model2 / mean_delta_model1": 1.1997862904051664,
        "test_statistic": 820.0,
        "p_value": 4.563786971596854e-09,
        "q_value": 1.135776768605507e-07,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6861503298580647,
        "mean_model2": 0.7655108091235161,
        "mean_delta_model1": 0.6861503298580647,
        "mean_delta_model2": 0.7655108091235161,
        "mean_delta_model2 / mean_delta_model1": 1.1156604840252249,
        "test_statistic": 1021.0,
        "p_value": 2.3254496223335124e-07,
        "q_value": 3.858187179560237e-06,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8027366203069687,
        "mean_model2": 0.7450579446554184,
        "mean_delta_model1": 0.19726337969303132,
        "mean_delta_model2": 0.2549420553445816,
        "mean_delta_model2 / mean_delta_model1": 1.292394238308733,
        "test_statistic": 1148.0,
        "p_value": 2.1951641838470253e-06,
        "q_value": 2.7315215399021473e-05,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7801883789896965,
        "mean_model2": 0.7309655785560608,
        "mean_delta_model1": 0.2198116210103035,
        "mean_delta_model2": 0.2690344214439392,
        "mean_delta_model2 / mean_delta_model1": 1.2239317475909448,
        "test_statistic": 1375.0,
        "p_value": 7.68317676362045e-05,
        "q_value": 0.0007648361969144605,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4514737324416637,
        "mean_model2": -0.33970477029681206,
        "mean_delta_model1": 0.5485262675583362,
        "mean_delta_model2": 0.660295229703188,
        "mean_delta_model2 / mean_delta_model1": 1.203762278591271,
        "test_statistic": 1406.0,
        "p_value": 0.0001193363377195842,
        "q_value": 0.0009899632394172275,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.45454454969614744,
        "mean_model2": -0.37602380799129603,
        "mean_delta_model1": 0.5454554503038526,
        "mean_delta_model2": 0.6239761920087039,
        "mean_delta_model2 / mean_delta_model1": 1.143954454320899,
        "test_statistic": 1534.0,
        "p_value": 0.0006558976151462913,
        "q_value": 0.00408078465594033,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6212199111655354,
        "mean_model2": 0.6851282499730587,
        "mean_delta_model1": 0.6348483427241445,
        "mean_delta_model2": 0.702600526958704,
        "mean_delta_model2 / mean_delta_model1": 1.1067218415406643,
        "test_statistic": 1528.0,
        "p_value": 0.0006080233557528743,
        "q_value": 0.00408078465594033,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9482488125562668,
        "mean_model2": 0.9543231099843978,
        "mean_delta_model1": 0.051751187443733214,
        "mean_delta_model2": 0.045676890015602115,
        "mean_delta_model2 / mean_delta_model1": 0.8826249651810325,
        "test_statistic": 1676.0,
        "p_value": 0.0035100103466738823,
        "q_value": 0.019411696831445047,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8564163464307785,
        "mean_model2": 0.8756408321857453,
        "mean_delta_model1": 0.1435836535692215,
        "mean_delta_model2": 0.12435916781425477,
        "mean_delta_model2 / mean_delta_model1": 0.8661095098426462,
        "test_statistic": 1704.0,
        "p_value": 0.00475966539313037,
        "q_value": 0.023690489558330804,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5279463818110526,
        "mean_model2": -0.4696632133424282,
        "mean_delta_model1": 0.4720536181889474,
        "mean_delta_model2": 0.5303367866575718,
        "mean_delta_model2 / mean_delta_model1": 1.1234672635117808,
        "test_statistic": 1729.0,
        "p_value": 0.006201990345506126,
        "q_value": 0.02806312388207009,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8557861065864563,
        "mean_model2": 0.8308614087104798,
        "mean_delta_model1": 0.1442138934135437,
        "mean_delta_model2": 0.16913859128952027,
        "mean_delta_model2 / mean_delta_model1": 1.172831460866972,
        "test_statistic": 1739.0,
        "p_value": 0.0068815355518277305,
        "q_value": 0.02854313848251357,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8723850011825561,
        "mean_model2": 0.8598710787296295,
        "mean_delta_model1": 0.12761499881744384,
        "mean_delta_model2": 0.1401289212703705,
        "mean_delta_model2 / mean_delta_model1": 1.0980599660611061,
        "test_statistic": 2031.0,
        "p_value": 0.08940730920445931,
        "q_value": 0.3423160680101842,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8997249734401703,
        "mean_model2": 0.8853390079736709,
        "mean_delta_model1": 0.10027502655982971,
        "mean_delta_model2": 0.11466099202632904,
        "mean_delta_model2 / mean_delta_model1": 1.1434650875700925,
        "test_statistic": 2099.0,
        "p_value": 0.14299558373008794,
        "q_value": 0.5083843841109449,
        "significant": false
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8346818608045578,
        "mean_model2": 0.8420049086213112,
        "mean_delta_model1": 0.1653181391954422,
        "mean_delta_model2": 0.1579950913786888,
        "mean_delta_model2 / mean_delta_model1": 0.9557033012082483,
        "test_statistic": 2317.0,
        "p_value": 0.4745030356140598,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "bge-m3",
    "projection_dimension": 200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 7,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_200.json",
      "model2": "models/bge-m3_propositions_a->b_200.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:04.581199",
    "comparisons": [
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5509045364707709,
        "mean_model2": 0.717666858844459,
        "mean_delta_model1": 0.5509045364707709,
        "mean_delta_model2": 0.717666858844459,
        "mean_delta_model2 / mean_delta_model1": 1.3027063880105405,
        "test_statistic": 533.0,
        "p_value": 7.429365830407272e-12,
        "q_value": 3.6978505649643313e-10,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6828296953439712,
        "mean_model2": 0.7836297407746315,
        "mean_delta_model1": 0.6828296953439712,
        "mean_delta_model2": 0.7836297407746315,
        "mean_delta_model2 / mean_delta_model1": 1.1476210629355874,
        "test_statistic": 763.0,
        "p_value": 1.3754339280391946e-09,
        "q_value": 3.4230035537178724e-08,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5039551055803895,
        "mean_model2": -0.3879535548947752,
        "mean_delta_model1": 0.4960448944196105,
        "mean_delta_model2": 0.6120464451052249,
        "mean_delta_model2 / mean_delta_model1": 1.2338529274075891,
        "test_statistic": 1246.0,
        "p_value": 1.0944564978822444e-05,
        "q_value": 0.00013618714811627737,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5950543672963977,
        "mean_model2": 0.6757651863619685,
        "mean_delta_model1": 0.6073631683364511,
        "mean_delta_model2": 0.7008917746320367,
        "mean_delta_model2 / mean_delta_model1": 1.1539912381446467,
        "test_statistic": 1228.0,
        "p_value": 8.214826707417163e-06,
        "q_value": 0.00013618714811627737,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9059866589307785,
        "mean_model2": 0.8785044533014298,
        "mean_delta_model1": 0.09401334106922149,
        "mean_delta_model2": 0.12149554669857025,
        "mean_delta_model2 / mean_delta_model1": 1.2923224014463413,
        "test_statistic": 1345.0,
        "p_value": 4.965959424148884e-05,
        "q_value": 0.0004119547635102396,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8571139967441559,
        "mean_model2": 0.8865419811010361,
        "mean_delta_model1": 0.1428860032558441,
        "mean_delta_model2": 0.11345801889896392,
        "mean_delta_model2 / mean_delta_model1": 0.7940457169608979,
        "test_statistic": 1336.0,
        "p_value": 4.3479279216686876e-05,
        "q_value": 0.0004119547635102396,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9484285479784011,
        "mean_model2": 0.9613676565885544,
        "mean_delta_model1": 0.05157145202159882,
        "mean_delta_model2": 0.03863234341144562,
        "mean_delta_model2 / mean_delta_model1": 0.7491032712297082,
        "test_statistic": 1416.0,
        "p_value": 0.00013723330210865516,
        "q_value": 0.0009757961183431994,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8326665459573269,
        "mean_model2": 0.8577329444885254,
        "mean_delta_model1": 0.1673334540426731,
        "mean_delta_model2": 0.14226705551147462,
        "mean_delta_model2 / mean_delta_model1": 0.8502009136510975,
        "test_statistic": 1755.0,
        "p_value": 0.008108693042064742,
        "q_value": 0.05044968815507519,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9053152561187744,
        "mean_model2": 0.8933765804767608,
        "mean_delta_model1": 0.09468474388122558,
        "mean_delta_model2": 0.10662341952323913,
        "mean_delta_model2 / mean_delta_model1": 1.1260886934117882,
        "test_statistic": 2042.0,
        "p_value": 0.09677152330659382,
        "q_value": 0.5351834572581248,
        "significant": false
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7598642486333848,
        "mean_model2": 0.7418005523085595,
        "mean_delta_model1": 0.2401357513666153,
        "mean_delta_model2": 0.2581994476914406,
        "mean_delta_model2 / mean_delta_model1": 1.075222852998875,
        "test_statistic": 2139.0,
        "p_value": 0.18444471851330213,
        "q_value": 0.9180447189281995,
        "significant": false
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8559744426608086,
        "mean_model2": 0.84746671885252,
        "mean_delta_model1": 0.14402555733919142,
        "mean_delta_model2": 0.15253328114748002,
        "mean_delta_model2 / mean_delta_model1": 1.0590709313365283,
        "test_statistic": 2276.0,
        "p_value": 0.3919187494866655,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.7814646589756012,
        "mean_model2": 0.7657334405183792,
        "mean_delta_model1": 0.2185353410243988,
        "mean_delta_model2": 0.23426655948162078,
        "mean_delta_model2 / mean_delta_model1": 1.0719847800519626,
        "test_statistic": 2171.0,
        "p_value": 0.22354068509721814,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8760127478837967,
        "mean_model2": 0.8720557016134262,
        "mean_delta_model1": 0.1239872521162033,
        "mean_delta_model2": 0.12794429838657378,
        "mean_delta_model2 / mean_delta_model1": 1.0319149445030191,
        "test_statistic": 2300.0,
        "p_value": 0.439153713588891,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4391170089226216,
        "mean_model2": -0.44858737911097707,
        "mean_delta_model1": 0.5608829910773784,
        "mean_delta_model2": 0.551412620889023,
        "mean_delta_model2 / mean_delta_model1": 0.9831152480303171,
        "test_statistic": 2491.0,
        "p_value": 0.9069368853915614,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.533792215064168,
        "mean_model2": -0.5294351847469807,
        "mean_delta_model1": 0.46620778493583204,
        "mean_delta_model2": 0.47056481525301935,
        "mean_delta_model2 / mean_delta_model1": 1.0093456833154062,
        "test_statistic": 2412.0,
        "p_value": 0.6976233342391592,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "bge-m3",
    "projection_dimension": 400,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 8,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_400.json",
      "model2": "models/bge-m3_propositions_a->b_400.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:04.581221",
    "comparisons": [
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5317632901668549,
        "mean_model2": 0.6971724597364664,
        "mean_delta_model1": 0.5317632901668549,
        "mean_delta_model2": 0.6971724597364664,
        "mean_delta_model2 / mean_delta_model1": 1.3110578947969687,
        "test_statistic": 467.0,
        "p_value": 1.4829726830182498e-12,
        "q_value": 7.38126442943662e-11,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.653459616880864,
        "mean_model2": 0.7796342498064042,
        "mean_delta_model1": 0.653459616880864,
        "mean_delta_model2": 0.7796342498064042,
        "mean_delta_model2 / mean_delta_model1": 1.1930871161217356,
        "test_statistic": 559.0,
        "p_value": 1.3823325189872664e-11,
        "q_value": 3.4401718820901114e-10,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9103552997112274,
        "mean_model2": 0.8749540442228317,
        "mean_delta_model1": 0.08964470028877258,
        "mean_delta_model2": 0.12504595577716826,
        "mean_delta_model2 / mean_delta_model1": 1.3949062841903377,
        "test_statistic": 992.0,
        "p_value": 1.3571797346865746e-07,
        "q_value": 2.2517165723299125e-06,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8538247805833816,
        "mean_model2": 0.8809602975845336,
        "mean_delta_model1": 0.14617521941661835,
        "mean_delta_model2": 0.11903970241546631,
        "mean_delta_model2 / mean_delta_model1": 0.81436308350041,
        "test_statistic": 1282.0,
        "p_value": 1.9213684786277983e-05,
        "q_value": 0.00023908277221721408,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5095309701561928,
        "mean_model2": -0.39113867182284595,
        "mean_delta_model1": 0.49046902984380725,
        "mean_delta_model2": 0.6088613281771541,
        "mean_delta_model2 / mean_delta_model1": 1.2413858799016313,
        "test_statistic": 1382.0,
        "p_value": 8.494399052511614e-05,
        "q_value": 0.0008455908364830279,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9484348738193512,
        "mean_model2": 0.9577863413095474,
        "mean_delta_model1": 0.051565126180648804,
        "mean_delta_model2": 0.04221365869045258,
        "mean_delta_model2 / mean_delta_model1": 0.8186474428970637,
        "test_statistic": 1469.0,
        "p_value": 0.0002824663114907604,
        "q_value": 0.0020084740808552,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.7979694849252701,
        "mean_model2": 0.7604560178518295,
        "mean_delta_model1": 0.2020305150747299,
        "mean_delta_model2": 0.23954398214817046,
        "mean_delta_model2 / mean_delta_model1": 1.1856821830086635,
        "test_statistic": 1467.0,
        "p_value": 0.000275032659267615,
        "q_value": 0.0020084740808552,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7817780831456185,
        "mean_model2": 0.7445771983265876,
        "mean_delta_model1": 0.21822191685438155,
        "mean_delta_model2": 0.2554228016734123,
        "mean_delta_model2 / mean_delta_model1": 1.1704727249914806,
        "test_statistic": 1583.0,
        "p_value": 0.001199907932052049,
        "q_value": 0.0074654424175135395,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6067761509120464,
        "mean_model2": 0.6428800263255835,
        "mean_delta_model1": 0.6162807486951352,
        "mean_delta_model2": 0.6668178652971983,
        "mean_delta_model2 / mean_delta_model1": 1.0820033997639331,
        "test_statistic": 1769.0,
        "p_value": 0.009339407150918728,
        "q_value": 0.05165048597958119,
        "significant": false
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8579214942455292,
        "mean_model2": 0.8411412218213081,
        "mean_delta_model1": 0.1420785057544708,
        "mean_delta_model2": 0.15885877817869187,
        "mean_delta_model2 / mean_delta_model1": 1.1181056369864941,
        "test_statistic": 2050.0,
        "p_value": 0.1024260357718117,
        "q_value": 0.5098095623393034,
        "significant": false
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8754362678527832,
        "mean_model2": 0.8680522787570953,
        "mean_delta_model1": 0.12456373214721679,
        "mean_delta_model2": 0.13194772124290466,
        "mean_delta_model2 / mean_delta_model1": 1.059278804258699,
        "test_statistic": 2102.0,
        "p_value": 0.14583227876539612,
        "q_value": 0.6598703121108959,
        "significant": false
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8424444341659546,
        "mean_model2": 0.849129826426506,
        "mean_delta_model1": 0.1575555658340454,
        "mean_delta_model2": 0.15087017357349397,
        "mean_delta_model2 / mean_delta_model1": 0.9575680349649266,
        "test_statistic": 2254.0,
        "p_value": 0.35144711435544207,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9001644074916839,
        "mean_model2": 0.8929816073179245,
        "mean_delta_model1": 0.09983559250831604,
        "mean_delta_model2": 0.1070183926820755,
        "mean_delta_model2 / mean_delta_model1": 1.0719462868231202,
        "test_statistic": 2341.0,
        "p_value": 0.5269610710548687,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4572508700564504,
        "mean_model2": -0.454970286320895,
        "mean_delta_model1": 0.5427491299435496,
        "mean_delta_model2": 0.545029713679105,
        "mean_delta_model2 / mean_delta_model1": 1.0042019113615026,
        "test_statistic": 2465.0,
        "p_value": 0.8365569764052994,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5445865708589553,
        "mean_model2": -0.5573559789545834,
        "mean_delta_model1": 0.4554134291410446,
        "mean_delta_model2": 0.4426440210454166,
        "mean_delta_model2 / mean_delta_model1": 0.9719608441944446,
        "test_statistic": 2519.0,
        "p_value": 0.9835408496499783,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "bge-m3",
    "projection_dimension": 800,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 6,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_800.json",
      "model2": "models/bge-m3_propositions_a->b_800.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:04.581242",
    "comparisons": [
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.5350240828003734,
        "mean_model2": 0.6976736136339605,
        "mean_delta_model1": 0.5352630440983921,
        "mean_delta_model2": 0.6976736136339605,
        "mean_delta_model2 / mean_delta_model1": 1.3034219741606412,
        "test_statistic": 529.0,
        "p_value": 6.747817107930334e-12,
        "q_value": 3.3586203552811577e-10,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6768922313302755,
        "mean_model2": 0.7792805349826812,
        "mean_delta_model1": 0.6768922313302755,
        "mean_delta_model2": 0.7792805349826812,
        "mean_delta_model2 / mean_delta_model1": 1.151262341201915,
        "test_statistic": 765.0,
        "p_value": 1.4354630350828956e-09,
        "q_value": 3.572396296290414e-08,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9153037869930267,
        "mean_model2": 0.880825629234314,
        "mean_delta_model1": 0.08469621300697326,
        "mean_delta_model2": 0.11917437076568603,
        "mean_delta_model2 / mean_delta_model1": 1.4070802759017584,
        "test_statistic": 1081.0,
        "p_value": 6.872193234754664e-07,
        "q_value": 1.1401755419317535e-05,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8530932098627091,
        "mean_model2": 0.8843132483959198,
        "mean_delta_model1": 0.14690679013729097,
        "mean_delta_model2": 0.1156867516040802,
        "mean_delta_model2 / mean_delta_model1": 0.7874840332156585,
        "test_statistic": 1325.0,
        "p_value": 3.691436091264259e-05,
        "q_value": 0.00036747090794054925,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5761898263543844,
        "mean_model2": 0.6583599449694156,
        "mean_delta_model1": 0.5904155380278826,
        "mean_delta_model2": 0.6839210173487663,
        "mean_delta_model2 / mean_delta_model1": 1.1583723213538935,
        "test_statistic": 1317.0,
        "p_value": 3.274333506469467e-05,
        "q_value": 0.00036747090794054925,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9464694887399674,
        "mean_model2": 0.955643218755722,
        "mean_delta_model1": 0.05353051126003265,
        "mean_delta_model2": 0.04435678124427796,
        "mean_delta_model2 / mean_delta_model1": 0.8286261461021379,
        "test_statistic": 1531.0,
        "p_value": 0.0006315385998499609,
        "q_value": 0.00523897423091346,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4901000130921602,
        "mean_model2": -0.4144854263169691,
        "mean_delta_model1": 0.5098999869078398,
        "mean_delta_model2": 0.585514573683031,
        "mean_delta_model2 / mean_delta_model1": 1.1482929764986598,
        "test_statistic": 1753.0,
        "p_value": 0.007945269782797878,
        "q_value": 0.056494766897798886,
        "significant": false
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8004315641522407,
        "mean_model2": 0.775978013575077,
        "mean_delta_model1": 0.19956843584775924,
        "mean_delta_model2": 0.22402198642492294,
        "mean_delta_model2 / mean_delta_model1": 1.1225321553144711,
        "test_statistic": 1911.0,
        "p_value": 0.03476101453485352,
        "q_value": 0.21627188674426015,
        "significant": false
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7752265084534884,
        "mean_model2": 0.7574179509282112,
        "mean_delta_model1": 0.22477349154651166,
        "mean_delta_model2": 0.2425820490717888,
        "mean_delta_model2 / mean_delta_model1": 1.0792289046308294,
        "test_statistic": 1997.0,
        "p_value": 0.06945700702786524,
        "q_value": 0.3841237575046207,
        "significant": false
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8374088387191295,
        "mean_model2": 0.8562631008028984,
        "mean_delta_model1": 0.16259116128087045,
        "mean_delta_model2": 0.1437368991971016,
        "mean_delta_model2 / mean_delta_model1": 0.8840388251413077,
        "test_statistic": 2018.0,
        "p_value": 0.08129332564998845,
        "q_value": 0.40462480519169686,
        "significant": false
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8613641321659088,
        "mean_model2": 0.8504841342568398,
        "mean_delta_model1": 0.13863586783409118,
        "mean_delta_model2": 0.14951586574316025,
        "mean_delta_model2 / mean_delta_model1": 1.0784789541050763,
        "test_statistic": 2296.0,
        "p_value": 0.4310615640798099,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8784546732902527,
        "mean_model2": 0.8727085554599762,
        "mean_delta_model1": 0.12154532670974731,
        "mean_delta_model2": 0.1272914445400238,
        "mean_delta_model2 / mean_delta_model1": 1.0472755142943369,
        "test_statistic": 2194.0,
        "p_value": 0.255084772503005,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9025527334213257,
        "mean_model2": 0.8950576251745224,
        "mean_delta_model1": 0.09744726657867432,
        "mean_delta_model2": 0.1049423748254776,
        "mean_delta_model2 / mean_delta_model1": 1.076914504736283,
        "test_statistic": 2401.0,
        "p_value": 0.6698511676891292,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.45384409856051205,
        "mean_model2": -0.47621000541374087,
        "mean_delta_model1": 0.5461559014394879,
        "mean_delta_model2": 0.5237899945862591,
        "mean_delta_model2 / mean_delta_model1": 0.9590484936731808,
        "test_statistic": 2316.0,
        "p_value": 0.4723813165543246,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.564556743670255,
        "mean_model2": -0.5729254976660013,
        "mean_delta_model1": 0.43544325632974507,
        "mean_delta_model2": 0.4270745023339987,
        "mean_delta_model2 / mean_delta_model1": 0.9807810687751034,
        "test_statistic": 2511.0,
        "p_value": 0.9616074190130767,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "bge-m3",
    "projection_dimension": 1600,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 7,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_1600.json",
      "model2": "models/bge-m3_propositions_a->b_1600.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:04.581261",
    "comparisons": [
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6064081885293127,
        "mean_model2": 0.7346305106580258,
        "mean_delta_model1": 0.6064081885293127,
        "mean_delta_model2": 0.7346305106580258,
        "mean_delta_model2 / mean_delta_model1": 1.211445565139355,
        "test_statistic": 872.0,
        "p_value": 1.3192638441071741e-08,
        "q_value": 6.56642930585274e-07,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7185811091959476,
        "mean_model2": 0.7979374438524246,
        "mean_delta_model1": 0.7185811091959476,
        "mean_delta_model2": 0.7979374438524246,
        "mean_delta_model2 / mean_delta_model1": 1.110434763231213,
        "test_statistic": 991.0,
        "p_value": 1.331981564680161e-07,
        "q_value": 2.714134364880599e-06,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6287958171218634,
        "mean_model2": 0.7162200941890479,
        "mean_delta_model1": 0.6457068299502134,
        "mean_delta_model2": 0.7418730112165213,
        "mean_delta_model2 / mean_delta_model1": 1.1489316463846644,
        "test_statistic": 1002.0,
        "p_value": 1.635893345769036e-07,
        "q_value": 2.714134364880599e-06,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9205120611190796,
        "mean_model2": 0.8871316158771515,
        "mean_delta_model1": 0.07948793888092041,
        "mean_delta_model2": 0.11286838412284851,
        "mean_delta_model2 / mean_delta_model1": 1.4199435249155825,
        "test_statistic": 1045.0,
        "p_value": 3.6048694917300083e-07,
        "q_value": 4.485668424099442e-06,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9531845897436142,
        "mean_model2": 0.9616730916500091,
        "mean_delta_model1": 0.0468154102563858,
        "mean_delta_model2": 0.03832690834999084,
        "mean_delta_model2 / mean_delta_model1": 0.8186814585217248,
        "test_statistic": 1469.0,
        "p_value": 0.0002824663114907604,
        "q_value": 0.00281186371319728,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8730791747570038,
        "mean_model2": 0.8939504718780518,
        "mean_delta_model1": 0.12692082524299622,
        "mean_delta_model2": 0.10604952812194825,
        "mean_delta_model2 / mean_delta_model1": 0.8355565599176585,
        "test_statistic": 1553.0,
        "p_value": 0.0008316155520464255,
        "q_value": 0.00689872709005146,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8084689670801163,
        "mean_model2": 0.7757485294342041,
        "mean_delta_model1": 0.19153103291988371,
        "mean_delta_model2": 0.2242514705657959,
        "mean_delta_model2 / mean_delta_model1": 1.1708362198391054,
        "test_statistic": 1728.0,
        "p_value": 0.006137474165144637,
        "q_value": 0.04364045297137895,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.873994032740593,
        "mean_model2": 0.8495121473073959,
        "mean_delta_model1": 0.12600596725940705,
        "mean_delta_model2": 0.15048785269260406,
        "mean_delta_model2 / mean_delta_model1": 1.1942914765520305,
        "test_statistic": 1890.0,
        "p_value": 0.029011054990631654,
        "q_value": 0.18049748211388994,
        "significant": false
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7914498284459114,
        "mean_model2": 0.7644297757744789,
        "mean_delta_model1": 0.2085501715540886,
        "mean_delta_model2": 0.2355702242255211,
        "mean_delta_model2 / mean_delta_model1": 1.1295614022759253,
        "test_statistic": 1912.0,
        "p_value": 0.03505754201811026,
        "q_value": 0.1938815872597286,
        "significant": false
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5477807454764843,
        "mean_model2": -0.5013888927176595,
        "mean_delta_model1": 0.4522192545235157,
        "mean_delta_model2": 0.49861110728234054,
        "mean_delta_model2 / mean_delta_model1": 1.1025870798175235,
        "test_statistic": 1942.0,
        "p_value": 0.045011884468366715,
        "q_value": 0.2240396101242124,
        "significant": false
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8934113085269928,
        "mean_model2": 0.8848850929737091,
        "mean_delta_model1": 0.1065886914730072,
        "mean_delta_model2": 0.11511490702629089,
        "mean_delta_model2 / mean_delta_model1": 1.0799917461735882,
        "test_statistic": 1999.0,
        "p_value": 0.07051956353395925,
        "q_value": 0.319090991329415,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9158792644739151,
        "mean_model2": 0.9033829241991043,
        "mean_delta_model1": 0.0841207355260849,
        "mean_delta_model2": 0.0966170758008957,
        "mean_delta_model2 / mean_delta_model1": 1.1485524371209976,
        "test_statistic": 2081.0,
        "p_value": 0.12685646867814937,
        "q_value": 0.5158799477074211,
        "significant": false
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5312661869963631,
        "mean_model2": -0.474891130970791,
        "mean_delta_model1": 0.4687338130036369,
        "mean_delta_model2": 0.5251088690292091,
        "mean_delta_model2 / mean_delta_model1": 1.1202709394151062,
        "test_statistic": 2090.0,
        "p_value": 0.13473933100822333,
        "q_value": 0.5158799477074211,
        "significant": false
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8618635135889053,
        "mean_model2": 0.8665955227613449,
        "mean_delta_model1": 0.13813648641109466,
        "mean_delta_model2": 0.1334044772386551,
        "mean_delta_model2 / mean_delta_model1": 0.9657439587803247,
        "test_statistic": 2424.0,
        "p_value": 0.7283878786577169,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.567202514517121,
        "mean_model2": -0.5971838396694511,
        "mean_delta_model1": 0.432797485482879,
        "mean_delta_model2": 0.4028161603305489,
        "mean_delta_model2 / mean_delta_model1": 0.9307266651078634,
        "test_statistic": 2367.0,
        "p_value": 0.5869540189208129,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "text-embedding-3-large",
    "model2_name": "bge-m3",
    "projection_dimension": 3200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 9,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/text-embedding-3-large_propositions_a->b_3200.json",
      "model2": "models/bge-m3_propositions_a->b_3200.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:04.581280",
    "comparisons": [
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9259700793027877,
        "mean_model2": 0.882039293050766,
        "mean_delta_model1": 0.07402992069721222,
        "mean_delta_model2": 0.11796070694923401,
        "mean_delta_model2 / mean_delta_model1": 1.593419334213553,
        "test_statistic": 809.0,
        "p_value": 3.631372792907468e-09,
        "q_value": 1.807458972997276e-07,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8061958764493465,
        "mean_model2": 0.7500394213199616,
        "mean_delta_model1": 0.19380412355065346,
        "mean_delta_model2": 0.24996057868003846,
        "mean_delta_model2 / mean_delta_model1": 1.2897588250474337,
        "test_statistic": 1069.0,
        "p_value": 5.551479519412731e-07,
        "q_value": 1.3815810222474211e-05,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6581632061302662,
        "mean_model2": 0.7323339978605509,
        "mean_delta_model1": 0.6581632061302662,
        "mean_delta_model2": 0.7328643359988928,
        "mean_delta_model2 / mean_delta_model1": 1.1134994013230228,
        "test_statistic": 1113.0,
        "p_value": 1.2043858792101071e-06,
        "q_value": 1.998214071715285e-05,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9277302587032318,
        "mean_model2": 0.8983676600456237,
        "mean_delta_model1": 0.07226974129676819,
        "mean_delta_model2": 0.10163233995437622,
        "mean_delta_model2 / mean_delta_model1": 1.4062917360812677,
        "test_statistic": 1168.0,
        "p_value": 3.0740842199796304e-06,
        "q_value": 3.8251932698865546e-05,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.821450163424015,
        "mean_model2": 0.7623094499111176,
        "mean_delta_model1": 0.17854983657598494,
        "mean_delta_model2": 0.23769055008888246,
        "mean_delta_model2 / mean_delta_model1": 1.3312280461692227,
        "test_statistic": 1236.0,
        "p_value": 9.336270456051865e-06,
        "q_value": 9.293964994769573e-05,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9006644827127457,
        "mean_model2": 0.8735338073968887,
        "mean_delta_model1": 0.09933551728725433,
        "mean_delta_model2": 0.12646619260311126,
        "mean_delta_model2 / mean_delta_model1": 1.2731215989684894,
        "test_statistic": 1343.0,
        "p_value": 4.821815867885519e-05,
        "q_value": 0.00039999723032073376,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5364108726009726,
        "mean_model2": -0.43089755072724073,
        "mean_delta_model1": 0.46358912739902736,
        "mean_delta_model2": 0.5691024492727592,
        "mean_delta_model2 / mean_delta_model1": 1.2276009415184426,
        "test_statistic": 1489.0,
        "p_value": 0.0003678938283442542,
        "q_value": 0.00261590564494696,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8820497417449951,
        "mean_model2": 0.8513754630088806,
        "mean_delta_model1": 0.11795025825500488,
        "mean_delta_model2": 0.14862453699111938,
        "mean_delta_model2 / mean_delta_model1": 1.260061140941274,
        "test_statistic": 1507.0,
        "p_value": 0.00046488358494524537,
        "q_value": 0.0028923566063279002,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5613767932355404,
        "mean_model2": -0.49310068497434256,
        "mean_delta_model1": 0.4386232067644596,
        "mean_delta_model2": 0.5068993150256574,
        "mean_delta_model2 / mean_delta_model1": 1.155660045360669,
        "test_statistic": 1518.0,
        "p_value": 0.0005353945173863671,
        "q_value": 0.0029609360173454796,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7534816281683743,
        "mean_model2": 0.7917789721488953,
        "mean_delta_model1": 0.7534816281683743,
        "mean_delta_model2": 0.7917789721488953,
        "mean_delta_model2 / mean_delta_model1": 1.050827176866432,
        "test_statistic": 1868.0,
        "p_value": 0.023884664929614687,
        "q_value": 0.1188821814945108,
        "significant": false
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8680082071200013,
        "mean_model2": 0.8581146469712257,
        "mean_delta_model1": 0.13199179287999868,
        "mean_delta_model2": 0.14188535302877425,
        "mean_delta_model2 / mean_delta_model1": 1.0749558736411011,
        "test_statistic": 1984.0,
        "p_value": 0.0628666066190492,
        "q_value": 0.28446245016716026,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9561981070041656,
        "mean_model2": 0.9573878955841064,
        "mean_delta_model1": 0.04380189299583435,
        "mean_delta_model2": 0.042612104415893554,
        "mean_delta_model2 / mean_delta_model1": 0.9728370511280426,
        "test_statistic": 2018.0,
        "p_value": 0.08129332564998845,
        "q_value": 0.33718733765974734,
        "significant": false
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8896518182754517,
        "mean_model2": 0.8907698923349381,
        "mean_delta_model1": 0.11034818172454834,
        "mean_delta_model2": 0.10923010766506196,
        "mean_delta_model2 / mean_delta_model1": 0.9898677618243196,
        "test_statistic": 2409.0,
        "p_value": 0.6900069147093294,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5823457821644843,
        "mean_model2": -0.5908838836103678,
        "mean_delta_model1": 0.41765421783551576,
        "mean_delta_model2": 0.40911611638963225,
        "mean_delta_model2 / mean_delta_model1": 0.9795570089292237,
        "test_statistic": 2513.0,
        "p_value": 0.9670887024899815,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6569480375945568,
        "mean_model2": 0.650384536832571,
        "mean_delta_model1": 0.6760331769287586,
        "mean_delta_model2": 0.6785368636250496,
        "mean_delta_model2 / mean_delta_model1": 1.0037034967834941,
        "test_statistic": 2447.0,
        "p_value": 0.7885535816793301,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 50,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 11,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_50.json",
      "model2": "models/bge-m3_propositions_a->b_50.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:06.796617",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5168375094980001,
        "mean_model2": -0.29622286692261696,
        "mean_delta_model1": 0.48316249050199983,
        "mean_delta_model2": 0.703777133077383,
        "mean_delta_model2 / mean_delta_model1": 1.4566054834806552,
        "test_statistic": 755.0,
        "p_value": 1.158857605214751e-09,
        "q_value": 5.7680323569712585e-08,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8093671980500221,
        "mean_model2": 0.7287274929881096,
        "mean_delta_model1": 0.19063280194997786,
        "mean_delta_model2": 0.27127250701189043,
        "mean_delta_model2 / mean_delta_model1": 1.4230106478898237,
        "test_statistic": 844.0,
        "p_value": 7.4778784300395e-09,
        "q_value": 1.860998476079908e-07,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5958431795053184,
        "mean_model2": -0.4421729109436274,
        "mean_delta_model1": 0.4041568204946816,
        "mean_delta_model2": 0.5578270890563727,
        "mean_delta_model2 / mean_delta_model1": 1.3802243603698214,
        "test_statistic": 989.0,
        "p_value": 1.282935927699521e-07,
        "q_value": 2.1285375958738434e-06,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8159912088513375,
        "mean_model2": 0.7458646422624589,
        "mean_delta_model1": 0.18400879114866256,
        "mean_delta_model2": 0.2541353577375412,
        "mean_delta_model2 / mean_delta_model1": 1.3811044361039395,
        "test_statistic": 1032.0,
        "p_value": 2.845255256693596e-07,
        "q_value": 3.54045318221171e-06,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6637225362658501,
        "mean_model2": 0.7693505817651749,
        "mean_delta_model1": 0.6637225362658501,
        "mean_delta_model2": 0.7693505817651749,
        "mean_delta_model2 / mean_delta_model1": 1.1591448831820532,
        "test_statistic": 1047.0,
        "p_value": 3.73788105734042e-07,
        "q_value": 3.7209435893125283e-06,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5116489662975073,
        "mean_model2": -0.3671434297040105,
        "mean_delta_model1": 0.4883510337024927,
        "mean_delta_model2": 0.6328565702959895,
        "mean_delta_model2 / mean_delta_model1": 1.2959050490748643,
        "test_statistic": 1063.0,
        "p_value": 4.986516364601757e-07,
        "q_value": 4.136600794058098e-06,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8923065981268883,
        "mean_model2": 0.8588985985517502,
        "mean_delta_model1": 0.10769340187311173,
        "mean_delta_model2": 0.1411014014482498,
        "mean_delta_model2 / mean_delta_model1": 1.3102139870602343,
        "test_statistic": 1165.0,
        "p_value": 2.9235102548723395e-06,
        "q_value": 2.0787592477970743e-05,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9105998533964157,
        "mean_model2": 0.8712864607572556,
        "mean_delta_model1": 0.0894001466035843,
        "mean_delta_model2": 0.12871353924274445,
        "mean_delta_model2 / mean_delta_model1": 1.4397464001203772,
        "test_statistic": 1246.0,
        "p_value": 1.0944564978822444e-05,
        "q_value": 6.809357405813868e-05,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5520357795711607,
        "mean_model2": 0.6995121742784977,
        "mean_delta_model1": 0.594848697585985,
        "mean_delta_model2": 0.7163089196383953,
        "mean_delta_model2 / mean_delta_model1": 1.2041867495807255,
        "test_statistic": 1468.0,
        "p_value": 0.000278726263093336,
        "q_value": 0.0015414626122844666,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8778363499045372,
        "mean_model2": 0.8454501834511757,
        "mean_delta_model1": 0.1221636500954628,
        "mean_delta_model2": 0.1545498165488243,
        "mean_delta_model2 / mean_delta_model1": 1.2651047707567173,
        "test_statistic": 1586.0,
        "p_value": 0.0012440340007009926,
        "q_value": 0.006191984534533038,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8655539400875568,
        "mean_model2": 0.8465324777364731,
        "mean_delta_model1": 0.13444605991244316,
        "mean_delta_model2": 0.15346752226352692,
        "mean_delta_model2 / mean_delta_model1": 1.1414802513623035,
        "test_statistic": 1649.0,
        "p_value": 0.0025955605620449655,
        "q_value": 0.0117445422418082,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8912533563375473,
        "mean_model2": 0.8743390291929245,
        "mean_delta_model1": 0.1087466436624527,
        "mean_delta_model2": 0.1256609708070755,
        "mean_delta_model2 / mean_delta_model1": 1.1555388430848912,
        "test_statistic": 1909.0,
        "p_value": 0.03417438138857468,
        "q_value": 0.1417480289365423,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9591578662395477,
        "mean_model2": 0.9541554898023605,
        "mean_delta_model1": 0.04084213376045227,
        "mean_delta_model2": 0.045844510197639465,
        "mean_delta_model2 / mean_delta_model1": 1.1224807809143171,
        "test_statistic": 2079.0,
        "p_value": 0.1251544595595329,
        "q_value": 0.4791821034719395,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8772614413499832,
        "mean_model2": 0.8861065673828125,
        "mean_delta_model1": 0.12273855865001679,
        "mean_delta_model2": 0.1138934326171875,
        "mean_delta_model2 / mean_delta_model1": 0.9279352297263751,
        "test_statistic": 2369.0,
        "p_value": 0.5916968660875195,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6880296065658331,
        "mean_model2": 0.7030336222052574,
        "mean_delta_model1": 0.6882821650058031,
        "mean_delta_model2": 0.7030336222052574,
        "mean_delta_model2 / mean_delta_model1": 1.0214322816273032,
        "test_statistic": 2516.0,
        "p_value": 0.9753134632163343,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 100,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 12,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_100.json",
      "model2": "models/bge-m3_propositions_a->b_100.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:06.796680",
    "comparisons": [
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8223584407567978,
        "mean_model2": 0.7309655785560608,
        "mean_delta_model1": 0.1776415592432022,
        "mean_delta_model2": 0.2690344214439392,
        "mean_delta_model2 / mean_delta_model1": 1.5144790587860948,
        "test_statistic": 481.0,
        "p_value": 2.0961102458610946e-12,
        "q_value": 1.0433060686230957e-10,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9141515517234802,
        "mean_model2": 0.863963331580162,
        "mean_delta_model1": 0.08584844827651977,
        "mean_delta_model2": 0.13603666841983794,
        "mean_delta_model2 / mean_delta_model1": 1.5846141794160424,
        "test_statistic": 690.0,
        "p_value": 2.8025308498580313e-10,
        "q_value": 4.649719560208805e-09,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5394507778063417,
        "mean_model2": -0.33970477029681206,
        "mean_delta_model1": 0.46054922219365835,
        "mean_delta_model2": 0.660295229703188,
        "mean_delta_model2 / mean_delta_model1": 1.4337126150340942,
        "test_statistic": 685.0,
        "p_value": 2.507555292510171e-10,
        "q_value": 4.649719560208805e-09,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8206867679953576,
        "mean_model2": 0.7450579446554184,
        "mean_delta_model1": 0.1793132320046425,
        "mean_delta_model2": 0.2549420553445816,
        "mean_delta_model2 / mean_delta_model1": 1.4217693390188908,
        "test_statistic": 770.0,
        "p_value": 1.5969267301638846e-09,
        "q_value": 1.987113216034566e-08,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8901388949155807,
        "mean_model2": 0.8420049086213112,
        "mean_delta_model1": 0.10986110508441925,
        "mean_delta_model2": 0.1579950913786888,
        "mean_delta_model2 / mean_delta_model1": 1.4381349182431993,
        "test_statistic": 933.0,
        "p_value": 4.4045347744048177e-08,
        "q_value": 4.3845764970346173e-07,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5945502911321818,
        "mean_model2": -0.4696632133424282,
        "mean_delta_model1": 0.4054497088678181,
        "mean_delta_model2": 0.5303367866575718,
        "mean_delta_model2 / mean_delta_model1": 1.308021130755007,
        "test_statistic": 987.0,
        "p_value": 1.2356395386122228e-07,
        "q_value": 8.786003447578229e-07,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6686529514938593,
        "mean_model2": 0.7655108091235161,
        "mean_delta_model1": 0.6686529514938593,
        "mean_delta_model2": 0.7655108091235161,
        "mean_delta_model2 / mean_delta_model1": 1.1448552009129154,
        "test_statistic": 981.0,
        "p_value": 1.1036534514135368e-07,
        "q_value": 8.786003447578229e-07,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8893018957972526,
        "mean_model2": 0.8308614087104798,
        "mean_delta_model1": 0.11069810420274734,
        "mean_delta_model2": 0.16913859128952027,
        "mean_delta_model2 / mean_delta_model1": 1.5279267202240177,
        "test_statistic": 1025.0,
        "p_value": 2.502858663696827e-07,
        "q_value": 1.5571984094687224e-06,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8998089629411697,
        "mean_model2": 0.8598710787296295,
        "mean_delta_model1": 0.10019103705883026,
        "mean_delta_model2": 0.1401289212703705,
        "mean_delta_model2 / mean_delta_model1": 1.3986173352820919,
        "test_statistic": 1067.0,
        "p_value": 5.356627498815636e-07,
        "q_value": 2.9624194454162916e-06,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5218167159706354,
        "mean_model2": -0.37602380799129603,
        "mean_delta_model1": 0.4781832840293646,
        "mean_delta_model2": 0.6239761920087039,
        "mean_delta_model2 / mean_delta_model1": 1.3048891771180908,
        "test_statistic": 1266.0,
        "p_value": 1.4988931053447023e-05,
        "q_value": 7.460505839858747e-05,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5578671948984265,
        "mean_model2": 0.6851282499730587,
        "mean_delta_model1": 0.5952944745868445,
        "mean_delta_model2": 0.702600526958704,
        "mean_delta_model2 / mean_delta_model1": 1.1802570945183621,
        "test_statistic": 1512.0,
        "p_value": 0.000495785644777983,
        "q_value": 0.0022433595012668643,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8953851580619812,
        "mean_model2": 0.8756408321857453,
        "mean_delta_model1": 0.1046148419380188,
        "mean_delta_model2": 0.12435916781425477,
        "mean_delta_model2 / mean_delta_model1": 1.188733505786243,
        "test_statistic": 1674.0,
        "p_value": 0.003433349985234561,
        "q_value": 0.014240801831134572,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9621203172206879,
        "mean_model2": 0.9543231099843978,
        "mean_delta_model1": 0.037879682779312134,
        "mean_delta_model2": 0.045676890015602115,
        "mean_delta_model2 / mean_delta_model1": 1.2058414079578408,
        "test_statistic": 2000.0,
        "p_value": 0.07105582109513767,
        "q_value": 0.27205325272566705,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8694352394342423,
        "mean_model2": 0.8853390079736709,
        "mean_delta_model1": 0.13056476056575775,
        "mean_delta_model2": 0.11466099202632904,
        "mean_delta_model2 / mean_delta_model1": 0.8781924887656121,
        "test_statistic": 2133.0,
        "p_value": 0.17771520952175224,
        "q_value": 0.631821172256618,
        "significant": false
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.7045047158002853,
        "mean_model2": 0.7031710910797119,
        "mean_delta_model1": 0.7045047158002853,
        "mean_delta_model2": 0.7031710910797119,
        "mean_delta_model2 / mean_delta_model1": 0.9981070038416159,
        "test_statistic": 2444.0,
        "p_value": 0.7806252551793645,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 11,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_200.json",
      "model2": "models/bge-m3_propositions_a->b_200.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:06.796706",
    "comparisons": [
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.6289387879520655,
        "mean_model2": -0.3879535548947752,
        "mean_delta_model1": 0.3710612120479345,
        "mean_delta_model2": 0.6120464451052249,
        "mean_delta_model2 / mean_delta_model1": 1.6494487303786398,
        "test_statistic": 417.0,
        "p_value": 4.2300567404403065e-13,
        "q_value": 2.1054445378699135e-11,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.836933927088976,
        "mean_model2": 0.7418005523085595,
        "mean_delta_model1": 0.1630660729110241,
        "mean_delta_model2": 0.2581994476914406,
        "mean_delta_model2 / mean_delta_model1": 1.5834038502437315,
        "test_statistic": 582.0,
        "p_value": 2.378601237013467e-11,
        "q_value": 5.919557690991327e-10,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9120338016748428,
        "mean_model2": 0.8720557016134262,
        "mean_delta_model1": 0.08796619832515716,
        "mean_delta_model2": 0.12794429838657378,
        "mean_delta_model2 / mean_delta_model1": 1.4544711584970635,
        "test_statistic": 822.0,
        "p_value": 4.7567087266692085e-09,
        "q_value": 7.891924404589568e-08,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9224555224180222,
        "mean_model2": 0.8785044533014298,
        "mean_delta_model1": 0.07754447758197784,
        "mean_delta_model2": 0.12149554669857025,
        "mean_delta_model2 / mean_delta_model1": 1.566785288741272,
        "test_statistic": 875.0,
        "p_value": 1.4012498297407568e-08,
        "q_value": 1.7436254294261137e-07,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8962541258335114,
        "mean_model2": 0.84746671885252,
        "mean_delta_model1": 0.10374587416648864,
        "mean_delta_model2": 0.15253328114748002,
        "mean_delta_model2 / mean_delta_model1": 1.4702587681000079,
        "test_statistic": 891.0,
        "p_value": 1.9293162111910322e-08,
        "q_value": 1.839123816420425e-07,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8344850561022759,
        "mean_model2": 0.7657334405183792,
        "mean_delta_model1": 0.16551494389772414,
        "mean_delta_model2": 0.23426655948162078,
        "mean_delta_model2 / mean_delta_model1": 1.4153801098853043,
        "test_statistic": 898.0,
        "p_value": 2.2169944511644565e-08,
        "q_value": 1.839123816420425e-07,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.6512607488036156,
        "mean_model2": -0.5294351847469807,
        "mean_delta_model1": 0.34873925119638444,
        "mean_delta_model2": 0.47056481525301935,
        "mean_delta_model2 / mean_delta_model1": 1.3493313805047762,
        "test_statistic": 915.0,
        "p_value": 3.0998246960301505e-08,
        "q_value": 2.2041274672059635e-07,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8930344766378403,
        "mean_model2": 0.8577329444885254,
        "mean_delta_model1": 0.10696552336215973,
        "mean_delta_model2": 0.14226705551147462,
        "mean_delta_model2 / mean_delta_model1": 1.3300271997902757,
        "test_statistic": 951.0,
        "p_value": 6.235139563703788e-08,
        "q_value": 3.879303914458329e-07,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.591442689821124,
        "mean_model2": -0.44858737911097707,
        "mean_delta_model1": 0.4085573101788759,
        "mean_delta_model2": 0.551412620889023,
        "mean_delta_model2 / mean_delta_model1": 1.349657947981892,
        "test_statistic": 961.0,
        "p_value": 7.551041748728438e-08,
        "q_value": 4.176014276618877e-07,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.9072606128454208,
        "mean_model2": 0.8865419811010361,
        "mean_delta_model1": 0.09273938715457916,
        "mean_delta_model2": 0.11345801889896392,
        "mean_delta_model2 / mean_delta_model1": 1.223407037506628,
        "test_statistic": 1554.0,
        "p_value": 0.0008419764864644989,
        "q_value": 0.00419080618350537,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.7626550742983818,
        "mean_model2": 0.717666858844459,
        "mean_delta_model1": 0.7626550742983818,
        "mean_delta_model2": 0.717666858844459,
        "mean_delta_model2 / mean_delta_model1": 0.941011058642322,
        "test_statistic": 1647.0,
        "p_value": 0.0025373678555059894,
        "q_value": 0.011481228524491705,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9607812535762786,
        "mean_model2": 0.9613676565885544,
        "mean_delta_model1": 0.039218746423721314,
        "mean_delta_model2": 0.03863234341144562,
        "mean_delta_model2 / mean_delta_model1": 0.9850478899570076,
        "test_statistic": 1961.0,
        "p_value": 0.052475138087962746,
        "q_value": 0.21765565578396628,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7428091378509998,
        "mean_model2": 0.7836297407746315,
        "mean_delta_model1": 0.7428091378509998,
        "mean_delta_model2": 0.7836297407746315,
        "mean_delta_model2 / mean_delta_model1": 1.0549543628956002,
        "test_statistic": 1984.0,
        "p_value": 0.0628666066190492,
        "q_value": 0.2406989962952894,
        "significant": false
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.579792783986777,
        "mean_model2": 0.6757651863619685,
        "mean_delta_model1": 0.6313866220600903,
        "mean_delta_model2": 0.7008917746320367,
        "mean_delta_model2 / mean_delta_model1": 1.1100833469438498,
        "test_statistic": 2003.0,
        "p_value": 0.07268468332694299,
        "q_value": 0.25841188240621094,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8938190446794033,
        "mean_model2": 0.8933765804767608,
        "mean_delta_model1": 0.1061809553205967,
        "mean_delta_model2": 0.10662341952323913,
        "mean_delta_model2 / mean_delta_model1": 1.0041670768670943,
        "test_statistic": 2440.5,
        "p_value": 0.7714041761823505,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 400,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 10,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_400.json",
      "model2": "models/bge-m3_propositions_a->b_400.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:06.796730",
    "comparisons": [
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8312988337874413,
        "mean_model2": 0.7445771983265876,
        "mean_delta_model1": 0.16870116621255873,
        "mean_delta_model2": 0.2554228016734123,
        "mean_delta_model2 / mean_delta_model1": 1.5140547478586293,
        "test_statistic": 492.0,
        "p_value": 2.7466217414808055e-12,
        "q_value": 1.3670879844022077e-10,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8944801944494247,
        "mean_model2": 0.8411412218213081,
        "mean_delta_model1": 0.10551980555057526,
        "mean_delta_model2": 0.15885877817869187,
        "mean_delta_model2 / mean_delta_model1": 1.505487783547435,
        "test_statistic": 888.0,
        "p_value": 1.817437099828165e-08,
        "q_value": 2.2615021792574374e-07,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8371026346087456,
        "mean_model2": 0.7604560178518295,
        "mean_delta_model1": 0.16289736539125443,
        "mean_delta_model2": 0.23954398214817046,
        "mean_delta_model2 / mean_delta_model1": 1.4705209109602395,
        "test_statistic": 857.0,
        "p_value": 9.743997296518111e-09,
        "q_value": 2.2615021792574374e-07,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9083006224036216,
        "mean_model2": 0.8680522787570953,
        "mean_delta_model1": 0.09169937759637832,
        "mean_delta_model2": 0.13194772124290466,
        "mean_delta_model2 / mean_delta_model1": 1.4389162140628962,
        "test_statistic": 875.0,
        "p_value": 1.4012498297407568e-08,
        "q_value": 2.2615021792574374e-07,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9221754205226899,
        "mean_model2": 0.8749540442228317,
        "mean_delta_model1": 0.07782457947731018,
        "mean_delta_model2": 0.12504595577716826,
        "mean_delta_model2 / mean_delta_model1": 1.6067668674473918,
        "test_statistic": 904.0,
        "p_value": 2.4963642407614978e-08,
        "q_value": 2.485052460406466e-07,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8895806133747101,
        "mean_model2": 0.849129826426506,
        "mean_delta_model1": 0.11041938662528991,
        "mean_delta_model2": 0.15087017357349397,
        "mean_delta_model2 / mean_delta_model1": 1.3663377255071567,
        "test_statistic": 1032.0,
        "p_value": 2.845255256693596e-07,
        "q_value": 2.360302121474473e-06,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5427909650467336,
        "mean_model2": -0.39113867182284595,
        "mean_delta_model1": 0.45720903495326637,
        "mean_delta_model2": 0.6088613281771541,
        "mean_delta_model2 / mean_delta_model1": 1.3316913744702987,
        "test_statistic": 1121.0,
        "p_value": 1.3832149770917264e-06,
        "q_value": 9.835337230402167e-06,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7068539950251579,
        "mean_model2": 0.7796342498064042,
        "mean_delta_model1": 0.7068539950251579,
        "mean_delta_model2": 0.7796342498064042,
        "mean_delta_model2 / mean_delta_model1": 1.102963632225996,
        "test_statistic": 1327.0,
        "p_value": 3.803333427971095e-05,
        "q_value": 0.00023663121096770067,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.906309399008751,
        "mean_model2": 0.8809602975845336,
        "mean_delta_model1": 0.09369060099124908,
        "mean_delta_model2": 0.11903970241546631,
        "mean_delta_model2 / mean_delta_model1": 1.2705618403129348,
        "test_statistic": 1449.0,
        "p_value": 0.0002159076122813839,
        "q_value": 0.0011940514982182206,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5317437724769115,
        "mean_model2": -0.454970286320895,
        "mean_delta_model1": 0.46825622752308843,
        "mean_delta_model2": 0.545029713679105,
        "mean_delta_model2 / mean_delta_model1": 1.1639561454678808,
        "test_statistic": 1615.0,
        "p_value": 0.0017547657728420143,
        "q_value": 0.00873407199565538,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.7382094451785087,
        "mean_model2": 0.6971724597364664,
        "mean_delta_model1": 0.7382094451785087,
        "mean_delta_model2": 0.6971724597364664,
        "mean_delta_model2 / mean_delta_model1": 0.9444101051401217,
        "test_statistic": 1830.0,
        "p_value": 0.016865089187315326,
        "q_value": 0.0763121289746489,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.6048428108915687,
        "mean_model2": -0.5573559789545834,
        "mean_delta_model1": 0.39515718910843134,
        "mean_delta_model2": 0.4426440210454166,
        "mean_delta_model2 / mean_delta_model1": 1.12017200558625,
        "test_statistic": 1856.0,
        "p_value": 0.021434665299879318,
        "q_value": 0.08890640982277374,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.962433015704155,
        "mean_model2": 0.9577863413095474,
        "mean_delta_model1": 0.037566984295845034,
        "mean_delta_model2": 0.04221365869045258,
        "mean_delta_model2 / mean_delta_model1": 1.1236903755173522,
        "test_statistic": 1930.0,
        "p_value": 0.040775729015492666,
        "q_value": 0.156119084122219,
        "significant": false
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5709415571670979,
        "mean_model2": 0.6428800263255835,
        "mean_delta_model1": 0.6106050681974738,
        "mean_delta_model2": 0.6668178652971983,
        "mean_delta_model2 / mean_delta_model1": 1.0920608098875866,
        "test_statistic": 2106.0,
        "p_value": 0.1496812949935662,
        "q_value": 0.5321537279911212,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8926843759417534,
        "mean_model2": 0.8929816073179245,
        "mean_delta_model1": 0.10731562405824661,
        "mean_delta_model2": 0.1070183926820755,
        "mean_delta_model2 / mean_delta_model1": 0.9972303065953398,
        "test_statistic": 2524.0,
        "p_value": 0.9972566190976729,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 800,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 13,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_800.json",
      "model2": "models/bge-m3_propositions_a->b_800.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:06.796751",
    "comparisons": [
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9363045537471771,
        "mean_model2": 0.880825629234314,
        "mean_delta_model1": 0.06369544625282288,
        "mean_delta_model2": 0.11917437076568603,
        "mean_delta_model2 / mean_delta_model1": 1.8710029959230314,
        "test_statistic": 490.0,
        "p_value": 2.6151797240223703e-12,
        "q_value": 1.301664777413344e-10,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9272476628422737,
        "mean_model2": 0.8727085554599762,
        "mean_delta_model1": 0.07275233715772629,
        "mean_delta_model2": 0.1272914445400238,
        "mean_delta_model2 / mean_delta_model1": 1.7496543686845045,
        "test_statistic": 521.0,
        "p_value": 5.5634556953558915e-12,
        "q_value": 1.3845614993156168e-10,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.6419921713322401,
        "mean_model2": -0.47621000541374087,
        "mean_delta_model1": 0.3580078286677599,
        "mean_delta_model2": 0.5237899945862591,
        "mean_delta_model2 / mean_delta_model1": 1.4630685494655737,
        "test_statistic": 550.0,
        "p_value": 1.1159728368655805e-11,
        "q_value": 1.8515267114716895e-10,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.848747711777687,
        "mean_model2": 0.7574179509282112,
        "mean_delta_model1": 0.15125228822231293,
        "mean_delta_model2": 0.2425820490717888,
        "mean_delta_model2 / mean_delta_model1": 1.6038239944855444,
        "test_statistic": 571.0,
        "p_value": 1.836214434377775e-11,
        "q_value": 2.2848674902267163e-10,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.9063731968402863,
        "mean_model2": 0.8504841342568398,
        "mean_delta_model1": 0.09362680315971375,
        "mean_delta_model2": 0.14951586574316025,
        "mean_delta_model2 / mean_delta_model1": 1.5969344322064256,
        "test_statistic": 690.0,
        "p_value": 2.8025308498580313e-10,
        "q_value": 2.7898317361252833e-09,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.9231107419729233,
        "mean_model2": 0.8843132483959198,
        "mean_delta_model1": 0.07688925802707672,
        "mean_delta_model2": 0.1156867516040802,
        "mean_delta_model2 / mean_delta_model1": 1.504589256971902,
        "test_statistic": 702.0,
        "p_value": 3.655594024126739e-10,
        "q_value": 3.032524519582999e-09,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.9029164424538613,
        "mean_model2": 0.8562631008028984,
        "mean_delta_model1": 0.09708355754613876,
        "mean_delta_model2": 0.1437368991971016,
        "mean_delta_model2 / mean_delta_model1": 1.4805483320776633,
        "test_statistic": 751.0,
        "p_value": 1.0634202142936968e-09,
        "q_value": 7.561439543689572e-09,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8485346738994122,
        "mean_model2": 0.775978013575077,
        "mean_delta_model1": 0.15146532610058785,
        "mean_delta_model2": 0.22402198642492294,
        "mean_delta_model2 / mean_delta_model1": 1.4790314865604972,
        "test_statistic": 841.0,
        "p_value": 7.032818332192009e-09,
        "q_value": 4.3079880510754674e-08,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.8009300374984741,
        "mean_model2": 0.6976736136339605,
        "mean_delta_model1": 0.8009300374984741,
        "mean_delta_model2": 0.6976736136339605,
        "mean_delta_model2 / mean_delta_model1": 0.871079346471994,
        "test_statistic": 846.0,
        "p_value": 7.789675866010679e-09,
        "q_value": 4.3079880510754674e-08,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5929898209869862,
        "mean_model2": -0.4144854263169691,
        "mean_delta_model1": 0.40701017901301384,
        "mean_delta_model2": 0.585514573683031,
        "mean_delta_model2 / mean_delta_model1": 1.4385747675964378,
        "test_statistic": 912.0,
        "p_value": 2.9224879795833528e-08,
        "q_value": 1.4546226519325056e-07,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.6624097067117691,
        "mean_model2": -0.5729254976660013,
        "mean_delta_model1": 0.33759029328823087,
        "mean_delta_model2": 0.4270745023339987,
        "mean_delta_model2 / mean_delta_model1": 1.2650674821665182,
        "test_statistic": 1176.0,
        "p_value": 3.512862758501502e-06,
        "q_value": 1.589520416521918e-05,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9629180398583412,
        "mean_model2": 0.955643218755722,
        "mean_delta_model1": 0.037081960141658786,
        "mean_delta_model2": 0.04435678124427796,
        "mean_delta_model2 / mean_delta_model1": 1.196182215687311,
        "test_statistic": 1555.0,
        "p_value": 0.0008524570416554503,
        "q_value": 0.0035358095888791637,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9074542362242937,
        "mean_model2": 0.8950576251745224,
        "mean_delta_model1": 0.0925457637757063,
        "mean_delta_model2": 0.1049423748254776,
        "mean_delta_model2 / mean_delta_model1": 1.133951145293,
        "test_statistic": 1693.0,
        "p_value": 0.004227251435042434,
        "q_value": 0.01618498646933847,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6377518126182258,
        "mean_model2": 0.6583599449694156,
        "mean_delta_model1": 0.6896430052630603,
        "mean_delta_model2": 0.6839210173487663,
        "mean_delta_model2 / mean_delta_model1": 0.9917029711450327,
        "test_statistic": 2192.0,
        "p_value": 0.2522248272969543,
        "q_value": 0.8967211443738529,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7841247874498367,
        "mean_model2": 0.7792805349826812,
        "mean_delta_model1": 0.7841247874498367,
        "mean_delta_model2": 0.7792805349826812,
        "mean_delta_model2 / mean_delta_model1": 0.9938220898705292,
        "test_statistic": 2217.0,
        "p_value": 0.2895979344328722,
        "q_value": 0.9609522624143855,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 1600,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 11,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_1600.json",
      "model2": "models/bge-m3_propositions_a->b_1600.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:06.796772",
    "comparisons": [
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8454413242638111,
        "mean_model2": 0.7644297757744789,
        "mean_delta_model1": 0.1545586757361889,
        "mean_delta_model2": 0.2355702242255211,
        "mean_delta_model2 / mean_delta_model1": 1.5241475323430445,
        "test_statistic": 683.0,
        "p_value": 2.3982552911437027e-10,
        "q_value": 1.1936940360356812e-08,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9249880424141884,
        "mean_model2": 0.8848850929737091,
        "mean_delta_model1": 0.07501195758581161,
        "mean_delta_model2": 0.11511490702629089,
        "mean_delta_model2 / mean_delta_model1": 1.5346207555589069,
        "test_statistic": 789.0,
        "p_value": 2.3881171317733165e-09,
        "q_value": 5.9432396294078126e-08,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9325891584157944,
        "mean_model2": 0.8871316158771515,
        "mean_delta_model1": 0.06741084158420563,
        "mean_delta_model2": 0.11286838412284851,
        "mean_delta_model2 / mean_delta_model1": 1.6743357814612063,
        "test_statistic": 881.0,
        "p_value": 1.5803323839645013e-08,
        "q_value": 2.6219523677048513e-07,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.83708411116153,
        "mean_model2": 0.7757485294342041,
        "mean_delta_model1": 0.16291588883846997,
        "mean_delta_model2": 0.2242514705657959,
        "mean_delta_model2 / mean_delta_model1": 1.3764861866121587,
        "test_statistic": 954.0,
        "p_value": 6.604603425472361e-08,
        "q_value": 8.218344965693218e-07,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.9016334283351898,
        "mean_model2": 0.8495121473073959,
        "mean_delta_model1": 0.09836657166481018,
        "mean_delta_model2": 0.15048785269260406,
        "mean_delta_model2 / mean_delta_model1": 1.5298678214119343,
        "test_statistic": 1016.0,
        "p_value": 2.1207195839800435e-07,
        "q_value": 2.0762645459582966e-06,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8984165045619011,
        "mean_model2": 0.8665955227613449,
        "mean_delta_model1": 0.10158349543809891,
        "mean_delta_model2": 0.1334044772386551,
        "mean_delta_model2 / mean_delta_model1": 1.3132495260506827,
        "test_statistic": 1025.0,
        "p_value": 2.502858663696827e-07,
        "q_value": 2.0762645459582966e-06,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.6092907363921404,
        "mean_model2": -0.474891130970791,
        "mean_delta_model1": 0.3907092636078596,
        "mean_delta_model2": 0.5251088690292091,
        "mean_delta_model2 / mean_delta_model1": 1.3439887863939701,
        "test_statistic": 1097.0,
        "p_value": 9.110989609422746e-07,
        "q_value": 6.478360688355998e-06,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.9212935066223145,
        "mean_model2": 0.8939504718780518,
        "mean_delta_model1": 0.07870649337768555,
        "mean_delta_model2": 0.10604952812194825,
        "mean_delta_model2 / mean_delta_model1": 1.3474050687667258,
        "test_statistic": 1145.0,
        "p_value": 2.0862144054582563e-06,
        "q_value": 1.29797571117817e-05,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.8130274972319603,
        "mean_model2": 0.7346305106580258,
        "mean_delta_model1": 0.8130274972319603,
        "mean_delta_model2": 0.7346305106580258,
        "mean_delta_model2 / mean_delta_model1": 0.9035740035351257,
        "test_statistic": 1257.0,
        "p_value": 1.3018380819711497e-05,
        "q_value": 7.199661446810486e-05,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5816727208904922,
        "mean_model2": -0.5013888927176595,
        "mean_delta_model1": 0.4183272791095078,
        "mean_delta_model2": 0.49861110728234054,
        "mean_delta_model2 / mean_delta_model1": 1.1919163109413584,
        "test_statistic": 1591.0,
        "p_value": 0.0013209204674027352,
        "q_value": 0.006574674889028025,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.6462181920185686,
        "mean_model2": -0.5971838396694511,
        "mean_delta_model1": 0.3537818079814315,
        "mean_delta_model2": 0.4028161603305489,
        "mean_delta_model2 / mean_delta_model1": 1.1386005476903747,
        "test_statistic": 1756.0,
        "p_value": 0.008191527683583642,
        "q_value": 0.03706549726159797,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9618648284673691,
        "mean_model2": 0.9616730916500091,
        "mean_delta_model1": 0.03813517153263092,
        "mean_delta_model2": 0.03832690834999084,
        "mean_delta_model2 / mean_delta_model1": 1.005027821028571,
        "test_statistic": 1857.0,
        "p_value": 0.02163012475488432,
        "q_value": 0.08971713386102165,
        "significant": false
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6017244151607156,
        "mean_model2": 0.7162200941890479,
        "mean_delta_model1": 0.6647565451636911,
        "mean_delta_model2": 0.7418730112165213,
        "mean_delta_model2 / mean_delta_model1": 1.116007080507708,
        "test_statistic": 1966.0,
        "p_value": 0.0546027898238638,
        "q_value": 0.20905910804392372,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9010970382392407,
        "mean_model2": 0.9033829241991043,
        "mean_delta_model1": 0.09890296176075936,
        "mean_delta_model2": 0.0966170758008957,
        "mean_delta_model2 / mean_delta_model1": 0.9768875884081906,
        "test_statistic": 2243.0,
        "p_value": 0.3322421202721664,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7958927756547928,
        "mean_model2": 0.7979374438524246,
        "mean_delta_model1": 0.7958927756547928,
        "mean_delta_model2": 0.7979374438524246,
        "mean_delta_model2 / mean_delta_model1": 1.0025690246980188,
        "test_statistic": 2381.0,
        "p_value": 0.6205162798090659,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "e5-mistral-7b-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 3200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 14,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/e5-mistral-7b-instruct_propositions_a->b_3200.json",
      "model2": "models/bge-m3_propositions_a->b_3200.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:06.796792",
    "comparisons": [
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.9292541517317295,
        "mean_model2": 0.8735338073968887,
        "mean_delta_model1": 0.07074584826827049,
        "mean_delta_model2": 0.12646619260311126,
        "mean_delta_model2 / mean_delta_model1": 1.7876129228608224,
        "test_statistic": 529.0,
        "p_value": 6.747817107930334e-12,
        "q_value": 3.3586203552811577e-10,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8504805979505181,
        "mean_model2": 0.7500394213199616,
        "mean_delta_model1": 0.14951940204948186,
        "mean_delta_model2": 0.24996057868003846,
        "mean_delta_model2 / mean_delta_model1": 1.671760154560521,
        "test_statistic": 575.0,
        "p_value": 2.0177434491871465e-11,
        "q_value": 5.021501110492996e-10,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8514876550808549,
        "mean_model2": 0.7623094499111176,
        "mean_delta_model1": 0.1485123449191451,
        "mean_delta_model2": 0.23769055008888246,
        "mean_delta_model2 / mean_delta_model1": 1.6004767160486815,
        "test_statistic": 619.0,
        "p_value": 5.6223836477769614e-11,
        "q_value": 9.328178215555052e-10,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9381322014331818,
        "mean_model2": 0.882039293050766,
        "mean_delta_model1": 0.06186779856681824,
        "mean_delta_model2": 0.11796070694923401,
        "mean_delta_model2 / mean_delta_model1": 1.9066575776384624,
        "test_statistic": 637.0,
        "p_value": 8.495358469771918e-11,
        "q_value": 1.0571079293351502e-09,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.9104127806425094,
        "mean_model2": 0.8513754630088806,
        "mean_delta_model1": 0.08958721935749053,
        "mean_delta_model2": 0.14862453699111938,
        "mean_delta_model2 / mean_delta_model1": 1.65899263373769,
        "test_statistic": 700.0,
        "p_value": 3.497627642967373e-10,
        "q_value": 2.9668132445557406e-09,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.9308484864234924,
        "mean_model2": 0.8907698923349381,
        "mean_delta_model1": 0.06915151357650756,
        "mean_delta_model2": 0.10923010766506196,
        "mean_delta_model2 / mean_delta_model1": 1.579576527189277,
        "test_statistic": 704.0,
        "p_value": 3.820518122543279e-10,
        "q_value": 2.9668132445557406e-09,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.8387656405568122,
        "mean_model2": 0.7323339978605509,
        "mean_delta_model1": 0.8387656405568122,
        "mean_delta_model2": 0.7328643359988928,
        "mean_delta_model2 / mean_delta_model1": 0.873741484584875,
        "test_statistic": 708.0,
        "p_value": 4.172445151568837e-10,
        "q_value": 2.9668132445557406e-09,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.9006912777572871,
        "mean_model2": 0.8581146469712257,
        "mean_delta_model1": 0.09930872224271298,
        "mean_delta_model2": 0.14188535302877425,
        "mean_delta_model2 / mean_delta_model1": 1.4287300231494564,
        "test_statistic": 826.0,
        "p_value": 5.166648052934121e-09,
        "q_value": 3.214522756323024e-08,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.6187816611677408,
        "mean_model2": -0.49310068497434256,
        "mean_delta_model1": 0.3812183388322592,
        "mean_delta_model2": 0.5068993150256574,
        "mean_delta_model2 / mean_delta_model1": 1.3296823982245498,
        "test_statistic": 1053.0,
        "p_value": 4.165949265329799e-07,
        "q_value": 2.303928939423061e-06,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5864148696511984,
        "mean_model2": -0.43089755072724073,
        "mean_delta_model1": 0.4135851303488016,
        "mean_delta_model2": 0.5691024492727592,
        "mean_delta_model2 / mean_delta_model1": 1.376022510269652,
        "test_statistic": 1080.0,
        "p_value": 6.75147690723417e-07,
        "q_value": 3.360441963105066e-06,
        "significant": true
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.6731350625865161,
        "mean_model2": -0.5908838836103678,
        "mean_delta_model1": 0.3268649374134839,
        "mean_delta_model2": 0.40911611638963225,
        "mean_delta_model2 / mean_delta_model1": 1.2516365922481942,
        "test_statistic": 1193.0,
        "p_value": 4.653124306801065e-06,
        "q_value": 2.105472543262617e-05,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9612279242277145,
        "mean_model2": 0.9573878955841064,
        "mean_delta_model1": 0.03877207577228546,
        "mean_delta_model2": 0.042612104415893554,
        "mean_delta_model2 / mean_delta_model1": 1.0990410899370255,
        "test_statistic": 1522.0,
        "p_value": 0.0005634176930800106,
        "q_value": 0.002241049176858498,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.8262101083993911,
        "mean_model2": 0.7917789721488953,
        "mean_delta_model1": 0.8262101083993911,
        "mean_delta_model2": 0.7917789721488953,
        "mean_delta_model2 / mean_delta_model1": 0.9583264161253134,
        "test_statistic": 1525.0,
        "p_value": 0.0005853250706648853,
        "q_value": 0.002241049176858498,
        "significant": true
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9109495431184769,
        "mean_model2": 0.8983676600456237,
        "mean_delta_model1": 0.08905045688152313,
        "mean_delta_model2": 0.10163233995437622,
        "mean_delta_model2 / mean_delta_model1": 1.1412893713683312,
        "test_statistic": 1641.0,
        "p_value": 0.0023698759669958315,
        "q_value": 0.008425490511473748,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5968394887214526,
        "mean_model2": 0.650384536832571,
        "mean_delta_model1": 0.6793465176643804,
        "mean_delta_model2": 0.6785368636250496,
        "mean_delta_model2 / mean_delta_model1": 0.9988081869586756,
        "test_statistic": 2217.0,
        "p_value": 0.2895979344328722,
        "q_value": 0.9609522624143855,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 50,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 5,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_50.json",
      "model2": "models/bge-m3_propositions_a->b_50.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:08.667678",
    "comparisons": [
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8196766093373299,
        "mean_model2": 0.7458646422624589,
        "mean_delta_model1": 0.18032339066267014,
        "mean_delta_model2": 0.2541353577375412,
        "mean_delta_model2 / mean_delta_model1": 1.4093310734875801,
        "test_statistic": 842.0,
        "p_value": 7.178230546260448e-09,
        "q_value": 3.572851907802512e-07,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7826671543717384,
        "mean_model2": 0.7287274929881096,
        "mean_delta_model1": 0.21733284562826158,
        "mean_delta_model2": 0.27127250701189043,
        "mean_delta_model2 / mean_delta_model1": 1.2481891829451786,
        "test_statistic": 1285.0,
        "p_value": 2.0122895164012965e-05,
        "q_value": 0.0005007928062070149,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8897439941763878,
        "mean_model2": 0.8588985985517502,
        "mean_delta_model1": 0.11025600582361221,
        "mean_delta_model2": 0.1411014014482498,
        "mean_delta_model2 / mean_delta_model1": 1.2797615911643319,
        "test_statistic": 1361.0,
        "p_value": 6.275353012557241e-05,
        "q_value": 0.0010411529154507172,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8884879565238952,
        "mean_model2": 0.8454501834511757,
        "mean_delta_model1": 0.11151204347610473,
        "mean_delta_model2": 0.1545498165488243,
        "mean_delta_model2 / mean_delta_model1": 1.3859473087491387,
        "test_statistic": 1474.0,
        "p_value": 0.00030188229444763794,
        "q_value": 0.0037564296824199156,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.8943710827827454,
        "mean_model2": 0.8712864607572556,
        "mean_delta_model1": 0.10562891721725463,
        "mean_delta_model2": 0.12871353924274445,
        "mean_delta_model2 / mean_delta_model1": 1.218544529600829,
        "test_statistic": 1622.0,
        "p_value": 0.0019040300514065042,
        "q_value": 0.01895402316166906,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8664501664042473,
        "mean_model2": 0.8465324777364731,
        "mean_delta_model1": 0.1335498335957527,
        "mean_delta_model2": 0.15346752226352692,
        "mean_delta_model2 / mean_delta_model1": 1.1491404978314226,
        "test_statistic": 1732.0,
        "p_value": 0.006399218797703944,
        "q_value": 0.053085183371393015,
        "significant": false
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.36652338087558745,
        "mean_model2": -0.29622286692261696,
        "mean_delta_model1": 0.6334766191244126,
        "mean_delta_model2": 0.703777133077383,
        "mean_delta_model2 / mean_delta_model1": 1.1109757042811452,
        "test_statistic": 1854.0,
        "p_value": 0.021048355887367972,
        "q_value": 0.1496641387827141,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7430514541268348,
        "mean_model2": 0.7693505817651749,
        "mean_delta_model1": 0.7430514541268348,
        "mean_delta_model2": 0.7693505817651749,
        "mean_delta_model2 / mean_delta_model1": 1.0353934138642449,
        "test_statistic": 1882.0,
        "p_value": 0.027046719679822254,
        "q_value": 0.16827601764998137,
        "significant": false
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6477309703081846,
        "mean_model2": 0.6995121742784977,
        "mean_delta_model1": 0.6618481845408678,
        "mean_delta_model2": 0.7163089196383953,
        "mean_delta_model2 / mean_delta_model1": 1.0822858419341401,
        "test_statistic": 1958.0,
        "p_value": 0.051232141515175696,
        "q_value": 0.2833332956014446,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9633279949426651,
        "mean_model2": 0.9541554898023605,
        "mean_delta_model1": 0.0366720050573349,
        "mean_delta_model2": 0.045844510197639465,
        "mean_delta_model2 / mean_delta_model1": 1.2501228151000687,
        "test_statistic": 1983.0,
        "p_value": 0.062381823596739015,
        "q_value": 0.310495763563794,
        "significant": false
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4064640519209206,
        "mean_model2": -0.3671434297040105,
        "mean_delta_model1": 0.5935359480790794,
        "mean_delta_model2": 0.6328565702959895,
        "mean_delta_model2 / mean_delta_model1": 1.066248088838035,
        "test_statistic": 2121.0,
        "p_value": 0.16480744054104957,
        "q_value": 0.7457302193225103,
        "significant": false
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8751442736387253,
        "mean_model2": 0.8743390291929245,
        "mean_delta_model1": 0.12485572636127472,
        "mean_delta_model2": 0.1256609708070755,
        "mean_delta_model2 / mean_delta_model1": 1.0064493993929504,
        "test_statistic": 2487.0,
        "p_value": 0.8960471686012128,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8889417380094529,
        "mean_model2": 0.8861065673828125,
        "mean_delta_model1": 0.11105826199054718,
        "mean_delta_model2": 0.1138934326171875,
        "mean_delta_model2 / mean_delta_model1": 1.025528678153469,
        "test_statistic": 2336.0,
        "p_value": 0.5157933846475856,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.4720320870727301,
        "mean_model2": -0.4421729109436274,
        "mean_delta_model1": 0.52796791292727,
        "mean_delta_model2": 0.5578270890563727,
        "mean_delta_model2 / mean_delta_model1": 1.0565549068380904,
        "test_statistic": 2396.0,
        "p_value": 0.6573723697180689,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6899956105649472,
        "mean_model2": 0.7030336222052574,
        "mean_delta_model1": 0.6899956105649472,
        "mean_delta_model2": 0.7030336222052574,
        "mean_delta_model2 / mean_delta_model1": 1.0188957892494928,
        "test_statistic": 2330.0,
        "p_value": 0.5025560930996884,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 100,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 9,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_100.json",
      "model2": "models/bge-m3_propositions_a->b_100.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:08.667742",
    "comparisons": [
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8183139035105705,
        "mean_model2": 0.7450579446554184,
        "mean_delta_model1": 0.18168609648942946,
        "mean_delta_model2": 0.2549420553445816,
        "mean_delta_model2 / mean_delta_model1": 1.4032006866271916,
        "test_statistic": 699.0,
        "p_value": 3.4211636510736907e-10,
        "q_value": 1.702830662636082e-08,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8010369357466698,
        "mean_model2": 0.7309655785560608,
        "mean_delta_model1": 0.19896306425333024,
        "mean_delta_model2": 0.2690344214439392,
        "mean_delta_model2 / mean_delta_model1": 1.3521827403170188,
        "test_statistic": 813.0,
        "p_value": 3.946694285453164e-09,
        "q_value": 9.822026554051406e-08,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9057852590084076,
        "mean_model2": 0.863963331580162,
        "mean_delta_model1": 0.09421474099159241,
        "mean_delta_model2": 0.13603666841983794,
        "mean_delta_model2 / mean_delta_model1": 1.4439000414168486,
        "test_statistic": 973.0,
        "p_value": 9.487277678157034e-08,
        "q_value": 1.5740479929237459e-06,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8928372424840927,
        "mean_model2": 0.8308614087104798,
        "mean_delta_model1": 0.10716275751590729,
        "mean_delta_model2": 0.16913859128952027,
        "mean_delta_model2 / mean_delta_model1": 1.5783336973613549,
        "test_statistic": 1087.0,
        "p_value": 7.641358460082319e-07,
        "q_value": 9.508416446212802e-06,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.892257282435894,
        "mean_model2": 0.8598710787296295,
        "mean_delta_model1": 0.10774271756410599,
        "mean_delta_model2": 0.1401289212703705,
        "mean_delta_model2 / mean_delta_model1": 1.3005883315222209,
        "test_statistic": 1186.0,
        "p_value": 4.146178290313136e-06,
        "q_value": 4.127390704204101e-05,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8726610094308853,
        "mean_model2": 0.8420049086213112,
        "mean_delta_model1": 0.12733899056911469,
        "mean_delta_model2": 0.1579950913786888,
        "mean_delta_model2 / mean_delta_model1": 1.2407440224911723,
        "test_statistic": 1328.0,
        "p_value": 3.8604823016963796e-05,
        "q_value": 0.0003202491075334081,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4481902369391173,
        "mean_model2": -0.33970477029681206,
        "mean_delta_model1": 0.5518097630608827,
        "mean_delta_model2": 0.660295229703188,
        "mean_delta_model2 / mean_delta_model1": 1.1965993969380635,
        "test_statistic": 1449.0,
        "p_value": 0.0002159076122813839,
        "q_value": 0.001535209069137712,
        "significant": true
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4656204093620181,
        "mean_model2": -0.37602380799129603,
        "mean_delta_model1": 0.5343795906379819,
        "mean_delta_model2": 0.6239761920087039,
        "mean_delta_model2 / mean_delta_model1": 1.1676647142600542,
        "test_statistic": 1622.0,
        "p_value": 0.0019040300514065042,
        "q_value": 0.011846264476043162,
        "significant": true
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9661470872163772,
        "mean_model2": 0.9543231099843978,
        "mean_delta_model1": 0.03385291278362274,
        "mean_delta_model2": 0.045676890015602115,
        "mean_delta_model2 / mean_delta_model1": 1.349275033068928,
        "test_statistic": 1731.0,
        "p_value": 0.006332857655442997,
        "q_value": 0.035023119803805236,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.63823157325387,
        "mean_model2": 0.6851282499730587,
        "mean_delta_model1": 0.6558975872397422,
        "mean_delta_model2": 0.702600526958704,
        "mean_delta_model2 / mean_delta_model1": 1.071204621922006,
        "test_statistic": 1833.0,
        "p_value": 0.017344574395330364,
        "q_value": 0.08632990445070368,
        "significant": false
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7314479088783264,
        "mean_model2": 0.7655108091235161,
        "mean_delta_model1": 0.7314479088783264,
        "mean_delta_model2": 0.7655108091235161,
        "mean_delta_model2 / mean_delta_model1": 1.0465691402377855,
        "test_statistic": 1857.0,
        "p_value": 0.02163012475488432,
        "q_value": 0.09787323693929635,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5301983192935587,
        "mean_model2": -0.4696632133424282,
        "mean_delta_model1": 0.4698016807064414,
        "mean_delta_model2": 0.5303367866575718,
        "mean_delta_model2 / mean_delta_model1": 1.1288524678330305,
        "test_statistic": 1893.0,
        "p_value": 0.02977871411538426,
        "q_value": 0.12351574069843191,
        "significant": false
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.67623041421175,
        "mean_model2": 0.7031710910797119,
        "mean_delta_model1": 0.67623041421175,
        "mean_delta_model2": 0.7031710910797119,
        "mean_delta_model2 / mean_delta_model1": 1.0398394930215693,
        "test_statistic": 2050.0,
        "p_value": 0.1024260357718117,
        "q_value": 0.3921612017994641,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8968517458438874,
        "mean_model2": 0.8853390079736709,
        "mean_delta_model1": 0.10314825415611267,
        "mean_delta_model2": 0.11466099202632904,
        "mean_delta_model2 / mean_delta_model1": 1.1116135019870728,
        "test_statistic": 2071.0,
        "p_value": 0.11852387379698195,
        "q_value": 0.4213814511674568,
        "significant": false
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.874058330655098,
        "mean_model2": 0.8756408321857453,
        "mean_delta_model1": 0.12594166934490203,
        "mean_delta_model2": 0.12435916781425477,
        "mean_delta_model2 / mean_delta_model1": 0.9874346470165212,
        "test_statistic": 2380.0,
        "p_value": 0.61809143216358,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 8,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_200.json",
      "model2": "models/bge-m3_propositions_a->b_200.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:08.667772",
    "comparisons": [
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8211070868372917,
        "mean_model2": 0.7657334405183792,
        "mean_delta_model1": 0.17889291316270828,
        "mean_delta_model2": 0.23426655948162078,
        "mean_delta_model2 / mean_delta_model1": 1.3095351589950832,
        "test_statistic": 1084.0,
        "p_value": 7.246950671541695e-07,
        "q_value": 3.607056274621497e-05,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7044027525186539,
        "mean_model2": 0.7836297407746315,
        "mean_delta_model1": 0.7044027525186539,
        "mean_delta_model2": 0.7836297407746315,
        "mean_delta_model2 / mean_delta_model1": 1.1124739901607348,
        "test_statistic": 1190.0,
        "p_value": 4.428985885216569e-06,
        "q_value": 0.00011022292031195699,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7866213303804398,
        "mean_model2": 0.7418005523085595,
        "mean_delta_model1": 0.21337866961956023,
        "mean_delta_model2": 0.2581994476914406,
        "mean_delta_model2 / mean_delta_model1": 1.2100527580933595,
        "test_statistic": 1297.0,
        "p_value": 2.4186175752933408e-05,
        "q_value": 0.0004012763480935786,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5819992932118475,
        "mean_model2": 0.6757651863619685,
        "mean_delta_model1": 0.5988613528199493,
        "mean_delta_model2": 0.7008917746320367,
        "mean_delta_model2 / mean_delta_model1": 1.17037402953395,
        "test_statistic": 1350.0,
        "p_value": 5.344397937967702e-05,
        "q_value": 0.0006650226070919128,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8918347585201264,
        "mean_model2": 0.84746671885252,
        "mean_delta_model1": 0.10816524147987366,
        "mean_delta_model2": 0.15253328114748002,
        "mean_delta_model2 / mean_delta_model1": 1.4101875894749603,
        "test_statistic": 1447.0,
        "p_value": 0.00021013147514216082,
        "q_value": 0.002091793059620087,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8655536425113678,
        "mean_model2": 0.8865419811010361,
        "mean_delta_model1": 0.1344463574886322,
        "mean_delta_model2": 0.11345801889896392,
        "mean_delta_model2 / mean_delta_model1": 0.843890611975539,
        "test_statistic": 1636.0,
        "p_value": 0.002238099402043779,
        "q_value": 0.015913984983407445,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4837604131177068,
        "mean_model2": -0.3879535548947752,
        "mean_delta_model1": 0.5162395868822932,
        "mean_delta_model2": 0.6120464451052249,
        "mean_delta_model2 / mean_delta_model1": 1.1855860353552785,
        "test_statistic": 1623.0,
        "p_value": 0.001926280773596183,
        "q_value": 0.015913984983407445,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6690796260535717,
        "mean_model2": 0.717666858844459,
        "mean_delta_model1": 0.6690796260535717,
        "mean_delta_model2": 0.717666858844459,
        "mean_delta_model2 / mean_delta_model1": 1.072618012713179,
        "test_statistic": 1658.0,
        "p_value": 0.0028728286282878363,
        "q_value": 0.017873818588055583,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8890455746650696,
        "mean_model2": 0.8720557016134262,
        "mean_delta_model1": 0.11095442533493043,
        "mean_delta_model2": 0.12794429838657378,
        "mean_delta_model2 / mean_delta_model1": 1.1531247897536057,
        "test_statistic": 1833.0,
        "p_value": 0.017344574395330364,
        "q_value": 0.09592211605633742,
        "significant": false
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.866727951169014,
        "mean_model2": 0.8577329444885254,
        "mean_delta_model1": 0.13327204883098603,
        "mean_delta_model2": 0.14226705551147462,
        "mean_delta_model2 / mean_delta_model1": 1.067493572428649,
        "test_statistic": 1868.0,
        "p_value": 0.023884664929614687,
        "q_value": 0.1188821814945108,
        "significant": false
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.8977004444599151,
        "mean_model2": 0.8785044533014298,
        "mean_delta_model1": 0.10229955554008484,
        "mean_delta_model2": 0.12149554669857025,
        "mean_delta_model2 / mean_delta_model1": 1.1876449126014403,
        "test_statistic": 1914.0,
        "p_value": 0.035657079268935354,
        "q_value": 0.1613432103327897,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8923286634683609,
        "mean_model2": 0.8933765804767608,
        "mean_delta_model1": 0.1076713365316391,
        "mean_delta_model2": 0.10662341952323913,
        "mean_delta_model2 / mean_delta_model1": 0.9902674468233053,
        "test_statistic": 2156.0,
        "p_value": 0.20453314479183926,
        "q_value": 0.848359763905731,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9603780847787857,
        "mean_model2": 0.9613676565885544,
        "mean_delta_model1": 0.0396219152212143,
        "mean_delta_model2": 0.03863234341144562,
        "mean_delta_model2 / mean_delta_model1": 0.9750246346183982,
        "test_statistic": 2504.0,
        "p_value": 0.9424389080177923,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.47045968018472195,
        "mean_model2": -0.44858737911097707,
        "mean_delta_model1": 0.529540319815278,
        "mean_delta_model2": 0.551412620889023,
        "mean_delta_model2 / mean_delta_model1": 1.0413043166974987,
        "test_statistic": 2480.0,
        "p_value": 0.8770384213786963,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5472050630953162,
        "mean_model2": -0.5294351847469807,
        "mean_delta_model1": 0.4527949369046837,
        "mean_delta_model2": 0.47056481525301935,
        "mean_delta_model2 / mean_delta_model1": 1.0392448698075356,
        "test_statistic": 2310.0,
        "p_value": 0.45976137200079736,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 400,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 8,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_400.json",
      "model2": "models/bge-m3_propositions_a->b_400.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:08.667798",
    "comparisons": [
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8273078039288521,
        "mean_model2": 0.7604560178518295,
        "mean_delta_model1": 0.1726921960711479,
        "mean_delta_model2": 0.23954398214817046,
        "mean_delta_model2 / mean_delta_model1": 1.3871152698149725,
        "test_statistic": 863.0,
        "p_value": 1.1002970803981983e-08,
        "q_value": 5.476556510013772e-07,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.8049311187863349,
        "mean_model2": 0.7445771983265876,
        "mean_delta_model1": 0.195068881213665,
        "mean_delta_model2": 0.2554228016734123,
        "mean_delta_model2 / mean_delta_model1": 1.3093979935920161,
        "test_statistic": 933.0,
        "p_value": 4.4045347744048177e-08,
        "q_value": 1.0961441242586542e-06,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8945431417226791,
        "mean_model2": 0.8411412218213081,
        "mean_delta_model1": 0.10545685827732086,
        "mean_delta_model2": 0.15885877817869187,
        "mean_delta_model2 / mean_delta_model1": 1.5063864102696811,
        "test_statistic": 1174.0,
        "p_value": 3.3978523164404245e-06,
        "q_value": 5.637426035561456e-05,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9045916336774826,
        "mean_model2": 0.8749540442228317,
        "mean_delta_model1": 0.0954083663225174,
        "mean_delta_model2": 0.12504595577716826,
        "mean_delta_model2 / mean_delta_model1": 1.3106393138989958,
        "test_statistic": 1372.0,
        "p_value": 7.358431663867958e-05,
        "q_value": 0.0009156360484402846,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5250386214349418,
        "mean_model2": -0.39113867182284595,
        "mean_delta_model1": 0.4749613785650581,
        "mean_delta_model2": 0.6088613281771541,
        "mean_delta_model2 / mean_delta_model1": 1.2819175529947957,
        "test_statistic": 1389.0,
        "p_value": 9.386095694675811e-05,
        "q_value": 0.0009343564460188533,
        "significant": true
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8712435624003411,
        "mean_model2": 0.849129826426506,
        "mean_delta_model1": 0.12875643759965896,
        "mean_delta_model2": 0.15087017357349397,
        "mean_delta_model2 / mean_delta_model1": 1.1717485850501161,
        "test_statistic": 1496.0,
        "p_value": 0.0004031146901668627,
        "q_value": 0.0028663432553747282,
        "significant": true
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8884298619627953,
        "mean_model2": 0.8680522787570953,
        "mean_delta_model1": 0.11157013803720474,
        "mean_delta_model2": 0.13194772124290466,
        "mean_delta_model2 / mean_delta_model1": 1.1826437034514083,
        "test_statistic": 1494.0,
        "p_value": 0.0003927428225373251,
        "q_value": 0.0028663432553747282,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7309173017740249,
        "mean_model2": 0.7796342498064042,
        "mean_delta_model1": 0.7309173017740249,
        "mean_delta_model2": 0.7796342498064042,
        "mean_delta_model2 / mean_delta_model1": 1.0666517920893885,
        "test_statistic": 1644.0,
        "p_value": 0.002452313676803902,
        "q_value": 0.015257509392867567,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5761524818185717,
        "mean_model2": 0.6428800263255835,
        "mean_delta_model1": 0.5972943781409412,
        "mean_delta_model2": 0.6668178652971983,
        "mean_delta_model2 / mean_delta_model1": 1.1163973573175872,
        "test_statistic": 1799.0,
        "p_value": 0.012552336622314263,
        "q_value": 0.06941921218822214,
        "significant": false
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5139625691249967,
        "mean_model2": -0.454970286320895,
        "mean_delta_model1": 0.48603743087500334,
        "mean_delta_model2": 0.545029713679105,
        "mean_delta_model2 / mean_delta_model1": 1.1213739499402322,
        "test_statistic": 1958.0,
        "p_value": 0.051232141515175696,
        "q_value": 0.25499996604130015,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5761615025252104,
        "mean_model2": -0.5573559789545834,
        "mean_delta_model1": 0.42383849747478963,
        "mean_delta_model2": 0.4426440210454166,
        "mean_delta_model2 / mean_delta_model1": 1.044369550389286,
        "test_statistic": 2123.0,
        "p_value": 0.16690827813258957,
        "q_value": 0.7552362105129377,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9601987540721894,
        "mean_model2": 0.9577863413095474,
        "mean_delta_model1": 0.03980124592781067,
        "mean_delta_model2": 0.04221365869045258,
        "mean_delta_model2 / mean_delta_model1": 1.0606114885704185,
        "test_statistic": 2172.0,
        "p_value": 0.22485135260421962,
        "q_value": 0.9326353467225964,
        "significant": false
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6787181308865547,
        "mean_model2": 0.6971724597364664,
        "mean_delta_model1": 0.6787181308865547,
        "mean_delta_model2": 0.6971724597364664,
        "mean_delta_model2 / mean_delta_model1": 1.0271899747628757,
        "test_statistic": 2192.0,
        "p_value": 0.2522248272969543,
        "q_value": 0.9656996939410722,
        "significant": false
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8759689223766327,
        "mean_model2": 0.8809602975845336,
        "mean_delta_model1": 0.1240310776233673,
        "mean_delta_model2": 0.11903970241546631,
        "mean_delta_model2 / mean_delta_model1": 0.9597570600566916,
        "test_statistic": 2278.0,
        "p_value": 0.39573316385218726,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8950811213254929,
        "mean_model2": 0.8929816073179245,
        "mean_delta_model1": 0.10491887867450714,
        "mean_delta_model2": 0.1070183926820755,
        "mean_delta_model2 / mean_delta_model1": 1.02001083154998,
        "test_statistic": 2320.0,
        "p_value": 0.4808994854883417,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 800,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 6,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_800.json",
      "model2": "models/bge-m3_propositions_a->b_800.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:08.667822",
    "comparisons": [
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.6905121773481369,
        "mean_model2": 0.7792805349826812,
        "mean_delta_model1": 0.6905121773481369,
        "mean_delta_model2": 0.7792805349826812,
        "mean_delta_model2 / mean_delta_model1": 1.1285543695629714,
        "test_statistic": 1060.0,
        "p_value": 4.725246164325417e-07,
        "q_value": 2.3519173233913037e-05,
        "significant": true
      },
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8223814857006073,
        "mean_model2": 0.775978013575077,
        "mean_delta_model1": 0.1776185142993927,
        "mean_delta_model2": 0.22402198642492294,
        "mean_delta_model2 / mean_delta_model1": 1.2612535765686723,
        "test_statistic": 1252.0,
        "p_value": 1.2033085813778235e-05,
        "q_value": 0.00029946400668968574,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5661799089238048,
        "mean_model2": 0.6583599449694156,
        "mean_delta_model1": 0.5870349088683724,
        "mean_delta_model2": 0.6839210173487663,
        "mean_delta_model2 / mean_delta_model1": 1.1650431805957866,
        "test_statistic": 1505.0,
        "p_value": 0.00045303361317550677,
        "q_value": 0.007516346350731276,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7837818533182144,
        "mean_model2": 0.7574179509282112,
        "mean_delta_model1": 0.21621814668178557,
        "mean_delta_model2": 0.2425820490717888,
        "mean_delta_model2 / mean_delta_model1": 1.1219319598960569,
        "test_statistic": 1552.0,
        "p_value": 0.0008213729941875387,
        "q_value": 0.010220638813381502,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6237460194900631,
        "mean_model2": 0.6976736136339605,
        "mean_delta_model1": 0.6237460194900631,
        "mean_delta_model2": 0.6976736136339605,
        "mean_delta_model2 / mean_delta_model1": 1.1185219493734582,
        "test_statistic": 1591.0,
        "p_value": 0.0013209204674027352,
        "q_value": 0.01314934977805605,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8889157819747925,
        "mean_model2": 0.8504841342568398,
        "mean_delta_model1": 0.11108421802520752,
        "mean_delta_model2": 0.14951586574316025,
        "mean_delta_model2 / mean_delta_model1": 1.3459685669230865,
        "test_statistic": 1655.0,
        "p_value": 0.0027775367201682928,
        "q_value": 0.02304125718655149,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8622986376285553,
        "mean_model2": 0.8843132483959198,
        "mean_delta_model1": 0.1377013623714447,
        "mean_delta_model2": 0.1156867516040802,
        "mean_delta_model2 / mean_delta_model1": 0.8401278651987419,
        "test_statistic": 1796.0,
        "p_value": 0.012191967211859929,
        "q_value": 0.086690869471909,
        "significant": false
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8683306187391281,
        "mean_model2": 0.8562631008028984,
        "mean_delta_model1": 0.13166938126087188,
        "mean_delta_model2": 0.1437368991971016,
        "mean_delta_model2 / mean_delta_model1": 1.0916501453919705,
        "test_statistic": 1824.0,
        "p_value": 0.015940905907188684,
        "q_value": 0.08815929359928137,
        "significant": false
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.4950790704600513,
        "mean_model2": -0.4144854263169691,
        "mean_delta_model1": 0.5049209295399487,
        "mean_delta_model2": 0.585514573683031,
        "mean_delta_model2 / mean_delta_model1": 1.159616366500224,
        "test_statistic": 1821.0,
        "p_value": 0.015495739833762864,
        "q_value": 0.08815929359928137,
        "significant": false
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9015114814043045,
        "mean_model2": 0.880825629234314,
        "mean_delta_model1": 0.09848851859569549,
        "mean_delta_model2": 0.11917437076568603,
        "mean_delta_model2 / mean_delta_model1": 1.2100331334549552,
        "test_statistic": 1849.0,
        "p_value": 0.020109014084802552,
        "q_value": 0.10008947034216303,
        "significant": false
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8823373070359231,
        "mean_model2": 0.8727085554599762,
        "mean_delta_model1": 0.117662692964077,
        "mean_delta_model2": 0.1272914445400238,
        "mean_delta_model2 / mean_delta_model1": 1.0818335135240065,
        "test_statistic": 1868.0,
        "p_value": 0.023884664929614687,
        "q_value": 0.10807471044955527,
        "significant": false
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4359205562621355,
        "mean_model2": -0.47621000541374087,
        "mean_delta_model1": 0.5640794437378644,
        "mean_delta_model2": 0.5237899945862591,
        "mean_delta_model2 / mean_delta_model1": 0.9285748672480814,
        "test_statistic": 1982.0,
        "p_value": 0.061900136920038915,
        "q_value": 0.25674853626614697,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9616484326124192,
        "mean_model2": 0.955643218755722,
        "mean_delta_model1": 0.03835156738758087,
        "mean_delta_model2": 0.04435678124427796,
        "mean_delta_model2 / mean_delta_model1": 1.156583270665535,
        "test_statistic": 2104.0,
        "p_value": 0.14774720853713422,
        "q_value": 0.5531943381176765,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8888232693076134,
        "mean_model2": 0.8950576251745224,
        "mean_delta_model1": 0.11117673069238662,
        "mean_delta_model2": 0.1049423748254776,
        "mean_delta_model2 / mean_delta_model1": 0.9439239144011279,
        "test_statistic": 2112.0,
        "p_value": 0.15559948292600154,
        "q_value": 0.5531943381176765,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5520901354588568,
        "mean_model2": -0.5729254976660013,
        "mean_delta_model1": 0.4479098645411432,
        "mean_delta_model2": 0.4270745023339987,
        "mean_delta_model2 / mean_delta_model1": 0.9534831361026418,
        "test_statistic": 2318.0,
        "p_value": 0.47662997837909227,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 1600,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 7,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_1600.json",
      "model2": "models/bge-m3_propositions_a->b_1600.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:08.667846",
    "comparisons": [
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8314262241125107,
        "mean_model2": 0.7757485294342041,
        "mean_delta_model1": 0.1685737758874893,
        "mean_delta_model2": 0.2242514705657959,
        "mean_delta_model2 / mean_delta_model1": 1.3302868099452632,
        "test_statistic": 1118.0,
        "p_value": 1.3133496145748613e-06,
        "q_value": 3.688080174734461e-05,
        "significant": true
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.5992224165331572,
        "mean_model2": 0.7162200941890479,
        "mean_delta_model1": 0.6271433152537793,
        "mean_delta_model2": 0.7418730112165213,
        "mean_delta_model2 / mean_delta_model1": 1.1829401560571773,
        "test_statistic": 1125.0,
        "p_value": 1.4819472203436094e-06,
        "q_value": 3.688080174734461e-05,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8946152561903,
        "mean_model2": 0.8495121473073959,
        "mean_delta_model1": 0.10538474380970002,
        "mean_delta_model2": 0.15048785269260406,
        "mean_delta_model2 / mean_delta_model1": 1.4279851831717656,
        "test_statistic": 1482.0,
        "p_value": 0.0003355666276420729,
        "q_value": 0.00556743456501002,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7972089719772338,
        "mean_model2": 0.7644297757744789,
        "mean_delta_model1": 0.2027910280227661,
        "mean_delta_model2": 0.2355702242255211,
        "mean_delta_model2 / mean_delta_model1": 1.1616402684199376,
        "test_statistic": 1527.0,
        "p_value": 0.0006003677894692837,
        "q_value": 0.007470591771066665,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7497611582279206,
        "mean_model2": 0.7979374438524246,
        "mean_delta_model1": 0.7497611582279206,
        "mean_delta_model2": 0.7979374438524246,
        "mean_delta_model2 / mean_delta_model1": 1.0642555100325148,
        "test_statistic": 1562.0,
        "p_value": 0.0009292772651479039,
        "q_value": 0.009250664291886965,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5476197991915979,
        "mean_model2": -0.474891130970791,
        "mean_delta_model1": 0.45238020080840213,
        "mean_delta_model2": 0.5251088690292091,
        "mean_delta_model2 / mean_delta_model1": 1.1607689021111909,
        "test_statistic": 1643.0,
        "p_value": 0.002424547311475631,
        "q_value": 0.020113007960984614,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9110442441701889,
        "mean_model2": 0.8871316158771515,
        "mean_delta_model1": 0.08895575582981109,
        "mean_delta_model2": 0.11286838412284851,
        "mean_delta_model2 / mean_delta_model1": 1.268814851495239,
        "test_statistic": 1667.0,
        "p_value": 0.0031768886430561445,
        "q_value": 0.02258923715067603,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6919266555458308,
        "mean_model2": 0.7346305106580258,
        "mean_delta_model1": 0.6919266555458308,
        "mean_delta_model2": 0.7346305106580258,
        "mean_delta_model2 / mean_delta_model1": 1.0617173146458823,
        "test_statistic": 1792.0,
        "p_value": 0.01172574799602767,
        "q_value": 0.0729538443144671,
        "significant": false
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8785033768415451,
        "mean_model2": 0.8939504718780518,
        "mean_delta_model1": 0.1214966231584549,
        "mean_delta_model2": 0.10604952812194825,
        "mean_delta_model2 / mean_delta_model1": 0.8728598817404112,
        "test_statistic": 1855.0,
        "p_value": 0.021240745629242976,
        "q_value": 0.11746942997459345,
        "significant": false
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.5729679375886917,
        "mean_model2": -0.5013888927176595,
        "mean_delta_model1": 0.4270320624113083,
        "mean_delta_model2": 0.49861110728234054,
        "mean_delta_model2 / mean_delta_model1": 1.167619837411854,
        "test_statistic": 1884.0,
        "p_value": 0.02752671948749582,
        "q_value": 0.13700993803783526,
        "significant": false
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.8931651628017425,
        "mean_model2": 0.8848850929737091,
        "mean_delta_model1": 0.10683483719825744,
        "mean_delta_model2": 0.11511490702629089,
        "mean_delta_model2 / mean_delta_model1": 1.0775034627765456,
        "test_statistic": 1925.0,
        "p_value": 0.03911311214562062,
        "q_value": 0.1769812673686574,
        "significant": false
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8733972808718682,
        "mean_model2": 0.8665955227613449,
        "mean_delta_model1": 0.12660271912813187,
        "mean_delta_model2": 0.1334044772386551,
        "mean_delta_model2 / mean_delta_model1": 1.0537252134659076,
        "test_statistic": 2006.0,
        "p_value": 0.07434398189444386,
        "q_value": 0.3083629452427937,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9620287293195724,
        "mean_model2": 0.9616730916500091,
        "mean_delta_model1": 0.03797127068042755,
        "mean_delta_model2": 0.03832690834999084,
        "mean_delta_model2 / mean_delta_model1": 1.0093659670374584,
        "test_statistic": 2376.0,
        "p_value": 0.6084337865773708,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.9012007176876068,
        "mean_model2": 0.9033829241991043,
        "mean_delta_model1": 0.09879928231239318,
        "mean_delta_model2": 0.0966170758008957,
        "mean_delta_model2 / mean_delta_model1": 0.9779127291168211,
        "test_statistic": 2371.0,
        "p_value": 0.5964572394238804,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5874424312636256,
        "mean_model2": -0.5971838396694511,
        "mean_delta_model1": 0.4125575687363744,
        "mean_delta_model2": 0.4028161603305489,
        "mean_delta_model2 / mean_delta_model1": 0.9763877598084977,
        "test_statistic": 2380.0,
        "p_value": 0.61809143216358,
        "q_value": 1.0,
        "significant": false
      }
    ]
  },
  {
    "comparison_type": "Wilcoxon rank-sum test to compare semantic entailment estimation error in each proposition category between two models",
    "dataset": "propositions",
    "model1_name": "gte-Qwen2-1.5B-instruct",
    "model2_name": "bge-m3",
    "projection_dimension": 3200,
    "samples_per_proposition_category": {
      "1.1": 100,
      "1.2": 100,
      "1.3": 100,
      "1.4": 100,
      "1.5": 100,
      "1.6": 100,
      "1.7": 100,
      "2.1": 100,
      "2.2": 100,
      "3.1": 100,
      "3.2": 100,
      "3.3": 100,
      "4.1": 100,
      "4.2": 100,
      "4.3": 100
    },
    "total_proposition_categories_compared": 15,
    "significant_proposition_categories": 7,
    "fdr_correction_method": "fdr_by",
    "model_files": {
      "model1": "models/gte-Qwen2-1.5B-instruct_propositions_a->b_3200.json",
      "model2": "models/bge-m3_propositions_a->b_3200.json"
    },
    "analysis_timestamp": "2025-08-27T07:51:08.667868",
    "comparisons": [
      {
        "domain": "1.4",
        "sample_size": 100,
        "mean_model1": 0.8229858508706093,
        "mean_model2": 0.7623094499111176,
        "mean_delta_model1": 0.17701414912939073,
        "mean_delta_model2": 0.23769055008888246,
        "mean_delta_model2 / mean_delta_model1": 1.3427771240768982,
        "test_statistic": 1254.0,
        "p_value": 1.2418346489695002e-05,
        "q_value": 0.0006181037605510418,
        "significant": true
      },
      {
        "domain": "1.2",
        "sample_size": 100,
        "mean_model1": 0.9100125628709793,
        "mean_model2": 0.882039293050766,
        "mean_delta_model1": 0.08998743712902069,
        "mean_delta_model2": 0.11796070694923401,
        "mean_delta_model2 / mean_delta_model1": 1.3108575009210037,
        "test_statistic": 1390.0,
        "p_value": 9.520475071229453e-05,
        "q_value": 0.0011846668654000288,
        "significant": true
      },
      {
        "domain": "1.7",
        "sample_size": 100,
        "mean_model1": 0.7929314923286438,
        "mean_model2": 0.7500394213199616,
        "mean_delta_model1": 0.2070685076713562,
        "mean_delta_model2": 0.24996057868003846,
        "mean_delta_model2 / mean_delta_model1": 1.2071395186599663,
        "test_statistic": 1387.0,
        "p_value": 9.122692472649753e-05,
        "q_value": 0.0011846668654000288,
        "significant": true
      },
      {
        "domain": "4.2",
        "sample_size": 100,
        "mean_model1": 0.7239353622496129,
        "mean_model2": 0.7917789721488953,
        "mean_delta_model1": 0.7239353622496129,
        "mean_delta_model2": 0.7917789721488953,
        "mean_delta_model2 / mean_delta_model1": 1.0937150102579047,
        "test_statistic": 1390.0,
        "p_value": 9.520475071229453e-05,
        "q_value": 0.0011846668654000288,
        "significant": true
      },
      {
        "domain": "1.1",
        "sample_size": 100,
        "mean_model1": 0.8938222086429596,
        "mean_model2": 0.8513754630088806,
        "mean_delta_model1": 0.1061777913570404,
        "mean_delta_model2": 0.14862453699111938,
        "mean_delta_model2 / mean_delta_model1": 1.3997704707507501,
        "test_statistic": 1416.0,
        "p_value": 0.00013723330210865516,
        "q_value": 0.0013661145656804793,
        "significant": true
      },
      {
        "domain": "4.1",
        "sample_size": 100,
        "mean_model1": 0.6802295897901058,
        "mean_model2": 0.7323339978605509,
        "mean_delta_model1": 0.6802295897901058,
        "mean_delta_model2": 0.7328643359988928,
        "mean_delta_model2 / mean_delta_model1": 1.0773779132792916,
        "test_statistic": 1600.0,
        "p_value": 0.001470462686034046,
        "q_value": 0.012198329795648886,
        "significant": true
      },
      {
        "domain": "3.1",
        "sample_size": 100,
        "mean_model1": -0.5349437871575355,
        "mean_model2": -0.43089755072724073,
        "mean_delta_model1": 0.46505621284246446,
        "mean_delta_model2": 0.5691024492727592,
        "mean_delta_model2 / mean_delta_model1": 1.223728301132362,
        "test_statistic": 1635.0,
        "p_value": 0.0022125636913517266,
        "q_value": 0.015732413550019423,
        "significant": true
      },
      {
        "domain": "2.1",
        "sample_size": 100,
        "mean_model1": 0.8699245226383209,
        "mean_model2": 0.8907698923349381,
        "mean_delta_model1": 0.13007547736167907,
        "mean_delta_model2": 0.10923010766506196,
        "mean_delta_model2 / mean_delta_model1": 0.8397440461536352,
        "test_statistic": 1769.0,
        "p_value": 0.009339407150918728,
        "q_value": 0.058106796727028835,
        "significant": false
      },
      {
        "domain": "1.6",
        "sample_size": 100,
        "mean_model1": 0.889600751399994,
        "mean_model2": 0.8735338073968887,
        "mean_delta_model1": 0.1103992486000061,
        "mean_delta_model2": 0.12646619260311126,
        "mean_delta_model2 / mean_delta_model1": 1.1455349036053517,
        "test_statistic": 1835.0,
        "p_value": 0.01767083481608265,
        "q_value": 0.09772646070214296,
        "significant": false
      },
      {
        "domain": "4.3",
        "sample_size": 100,
        "mean_model1": 0.6040639822650701,
        "mean_model2": 0.650384536832571,
        "mean_delta_model1": 0.6252481802832335,
        "mean_delta_model2": 0.6785368636250496,
        "mean_delta_model2 / mean_delta_model1": 1.0852280502722562,
        "test_statistic": 1908.0,
        "p_value": 0.03388425165082691,
        "q_value": 0.16865355936246187,
        "significant": false
      },
      {
        "domain": "1.5",
        "sample_size": 100,
        "mean_model1": 0.8682697361707687,
        "mean_model2": 0.8581146469712257,
        "mean_delta_model1": 0.13173026382923125,
        "mean_delta_model2": 0.14188535302877425,
        "mean_delta_model2 / mean_delta_model1": 1.0770900239956065,
        "test_statistic": 2075.0,
        "p_value": 0.12180389428011701,
        "q_value": 0.5511452911206592,
        "significant": false
      },
      {
        "domain": "1.3",
        "sample_size": 100,
        "mean_model1": 0.9622382885217666,
        "mean_model2": 0.9573878955841064,
        "mean_delta_model1": 0.03776171147823334,
        "mean_delta_model2": 0.042612104415893554,
        "mean_delta_model2 / mean_delta_model1": 1.1284473808994617,
        "test_statistic": 2249.0,
        "p_value": 0.3426319759495927,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "2.2",
        "sample_size": 100,
        "mean_model1": 0.8979895907640457,
        "mean_model2": 0.8983676600456237,
        "mean_delta_model1": 0.10201040923595428,
        "mean_delta_model2": 0.10163233995437622,
        "mean_delta_model2 / mean_delta_model1": 0.9962938166368535,
        "test_statistic": 2353.0,
        "p_value": 0.5542578843858728,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.2",
        "sample_size": 100,
        "mean_model1": -0.4948650293983519,
        "mean_model2": -0.49310068497434256,
        "mean_delta_model1": 0.5051349706016481,
        "mean_delta_model2": 0.5068993150256574,
        "mean_delta_model2 / mean_delta_model1": 1.0034928178144307,
        "test_statistic": 2413.0,
        "p_value": 0.7001689715399704,
        "q_value": 1.0,
        "significant": false
      },
      {
        "domain": "3.3",
        "sample_size": 100,
        "mean_model1": -0.5873328913748265,
        "mean_model2": -0.5908838836103678,
        "mean_delta_model1": 0.4126671086251736,
        "mean_delta_model2": 0.40911611638963225,
        "mean_delta_model2 / mean_delta_model1": 0.9913950199535609,
        "test_statistic": 2504.0,
        "p_value": 0.9424389080177923,
        "q_value": 1.0,
        "significant": false
      }
    ]
  }
]