criterion,model1,model2,model1_label,model2_label,model1_mean,model2_mean,model1_std,model2_std,model1_n,model2_n,mean_diff,t_statistic,p_value,cohens_d,significant_uncorrected,p_value_corrected,significant_corrected
Clarity of Steps,orange-model,blue-model,OSS,DAPO,2.9066666666666667,4.066666666666666,1.32719306863826,0.9348122131604153,75,75,-1.1599999999999997,-6.18831379914917,5.668760270923693e-09,-1.010547411742633,True,1.3605024650216864e-07,True
Clarity of Steps,orange-model,green-model,OSS,QwQ+DAPO/OSS,2.9066666666666667,4.486486486486487,1.32719306863826,0.7806432760694334,75,74,-1.5798198198198201,-8.841128191167469,2.6970983939438764e-15,-1.4486195109843865,True,6.473036145465304e-14,True
Clarity of Steps,orange-model,purple-model,OSS,QwQ+OSS/DAPO,2.9066666666666667,3.2133333333333334,1.32719306863826,1.3585896237516926,75,75,-0.30666666666666664,-1.398336388479058,0.16410392359237802,-0.2283473760359951,False,1.0,False
Clarity of Steps,blue-model,green-model,DAPO,QwQ+DAPO/OSS,4.066666666666666,4.486486486486487,0.9348122131604153,0.7806432760694334,75,74,-0.4198198198198204,-2.9734238437653313,0.0034427859604063966,-0.4871957177148297,True,0.08262686304975352,False
Clarity of Steps,blue-model,purple-model,DAPO,QwQ+OSS/DAPO,4.066666666666666,3.2133333333333334,0.9348122131604153,1.3585896237516926,75,75,0.853333333333333,4.4811931251136725,1.4763591931954779e-05,0.731775773026429,True,0.0003543262063669147,True
Clarity of Steps,green-model,purple-model,QwQ+DAPO/OSS,QwQ+OSS/DAPO,4.486486486486487,3.2133333333333334,0.7806432760694334,1.3585896237516926,74,75,1.2731531531531535,7.001089708794778,8.408041972670648e-11,1.1471290689398854,True,2.0179300734409555e-09,True
Ease of Following,orange-model,blue-model,OSS,DAPO,3.0,3.96,1.30487650860252,0.964785370334154,75,75,-0.96,-5.123114816078126,9.233167570459964e-07,-0.8366011462055932,True,2.2159602169103913e-05,True
Ease of Following,orange-model,green-model,OSS,QwQ+DAPO/OSS,3.0,4.373333333333333,1.30487650860252,0.8663894220445161,75,75,-1.373333333333333,-7.5932607419772005,3.2488585159672417e-12,-1.2399742867834225,True,7.79726043832138e-11,True
Ease of Following,orange-model,purple-model,OSS,QwQ+OSS/DAPO,3.0,3.12,1.30487650860252,1.2515935787972414,75,75,-0.1200000000000001,-0.5747670335511105,0.5663215388269862,-0.09385906354489067,False,1.0,False
Ease of Following,blue-model,green-model,DAPO,QwQ+DAPO/OSS,3.96,4.373333333333333,0.964785370334154,0.8663894220445161,75,75,-0.4133333333333331,-2.7605154913472725,0.006501437432318685,-0.4507902920566139,True,0.15603449837564842,False
Ease of Following,blue-model,purple-model,DAPO,QwQ+OSS/DAPO,3.96,3.12,0.964785370334154,1.2515935787972414,75,75,0.8399999999999999,4.603358457790256,8.878801466827123e-06,0.7517252883140947,True,0.00021309123520385097,True
Ease of Following,green-model,purple-model,QwQ+DAPO/OSS,QwQ+OSS/DAPO,4.373333333333333,3.12,0.8663894220445161,1.2515935787972414,75,75,1.253333333333333,7.130553277737729,4.1064446942493286e-11,1.164414474279169,True,9.855467266198388e-10,True
Confidence,orange-model,blue-model,OSS,DAPO,3.0533333333333332,4.1066666666666665,1.324339032828782,0.8787788414909933,75,75,-1.0533333333333332,-5.739432390531129,5.193088945910032e-08,-0.9372453846669024,True,1.2463413470184077e-06,True
Confidence,orange-model,green-model,OSS,QwQ+DAPO/OSS,3.0533333333333332,4.413333333333333,1.324339032828782,0.7900279388376996,75,75,-1.3599999999999999,-7.637688987822709,2.537577992848412e-12,-1.2472293889493173,True,6.090187182836188e-11,True
Confidence,orange-model,purple-model,OSS,QwQ+OSS/DAPO,3.0533333333333332,3.2,1.324339032828782,1.404625563263382,75,75,-0.14666666666666694,-0.6579475596866539,0.5115939153631899,-0.10744238658277877,False,1.0,False
Confidence,blue-model,green-model,DAPO,QwQ+DAPO/OSS,4.1066666666666665,4.413333333333333,0.8787788414909933,0.7900279388376996,75,75,-0.30666666666666664,-2.2474645212313247,0.026089006555765664,-0.3670094194683491,True,0.6261361573383759,False
Confidence,blue-model,purple-model,DAPO,QwQ+OSS/DAPO,4.1066666666666665,3.2,0.8787788414909933,1.404625563263382,75,75,0.9066666666666663,4.739028173407906,4.9939810665311274e-06,0.7738800601015444,True,0.00011985554559674706,True
Confidence,green-model,purple-model,QwQ+DAPO/OSS,QwQ+OSS/DAPO,4.413333333333333,3.2,0.7900279388376996,1.404625563263382,75,75,1.213333333333333,6.5202611927685155,1.0399779631725606e-09,1.0647541941302459,True,2.4959471116141455e-08,True
best-overall,blue-model,orange-model,DAPO,OSS,2.2666666666666666,3.16,1.0696467380140717,0.944972472071527,75,75,-0.8933333333333335,-5.420460350576373,2.3611641178982588e-07,-0.8851574686599825,True,5.666793882955821e-06,True
best-overall,blue-model,green-model,DAPO,QwQ+DAPO/OSS,2.2666666666666666,1.6666666666666667,1.0696467380140717,0.8750804338835177,75,75,0.5999999999999999,3.7598926118747302,0.00024414980025238685,0.613987892450227,True,0.0058595952060572845,True
best-overall,blue-model,purple-model,DAPO,QwQ+OSS/DAPO,2.2666666666666666,2.9066666666666667,1.0696467380140717,0.946877276407337,75,75,-0.6400000000000001,-3.879884690200606,0.00015684280826852823,-0.633582516788525,True,0.0037642273984446776,True
best-overall,orange-model,green-model,OSS,QwQ+DAPO/OSS,3.16,1.6666666666666667,0.944972472071527,0.8750804338835177,75,75,1.4933333333333334,10.041495483474801,2.043768077903728e-18,1.6397693459316756,True,4.9050433869689476e-17,True
best-overall,orange-model,purple-model,OSS,QwQ+OSS/DAPO,3.16,2.9066666666666667,0.944972472071527,0.946877276407337,75,75,0.2533333333333334,1.6400274056431412,0.10312322105325,0.2678153538670788,False,1.0,False
best-overall,green-model,purple-model,QwQ+DAPO/OSS,QwQ+OSS/DAPO,1.6666666666666667,2.9066666666666667,0.8750804338835177,0.946877276407337,75,75,-1.24,-8.328985063977958,5.0630559547076503e-14,-1.3601175654672202,True,1.215133429129836e-12,True
