uid,strategy,metric,score,metric_logical
011586c0134e387bb8b76844873ed799,same_obvious,LLMJudge-qwen3_32b-seed42,0.675,LLMJudge
8713d9b8067061c34f61019a07e2b325,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
556a48114ca32e27dd7ca11b5c757ca1,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
e2f02cf550abe739038cd39847fdee11,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
0179efaa97ca0268d61db40365d0e286,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
3b300f439fc295e25d2eab5314e12960,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,LLMJudge-qwen3_32b-seed42,0.8925000000000001,LLMJudge
2b442cbafe16c8f1e46959aacf81d03d,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
8713d9b8067061c34f61019a07e2b325,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
556a48114ca32e27dd7ca11b5c757ca1,same_obvious,LLMJudge-qwen3_32b-seed42,0.775,LLMJudge
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
4d328b965414360d44f8b198a9b1afdb,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
0179efaa97ca0268d61db40365d0e286,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
7d8fd314249cebb51ebb9048ca9a38bd,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
011586c0134e387bb8b76844873ed799,same_obvious,LLMJudge-qwen3_32b-seed42,0.675,LLMJudge
9121098f041fcd5aea47e50fa7aac874,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,LLMJudge-qwen3_32b-seed42,0.9075,LLMJudge
543047ead06ed9ccb6cd1dfd33e772fa,same_obvious,LLMJudge-qwen3_32b-seed42,0.8500000000000001,LLMJudge
f2a1ebbdfd7ec0dd51e1b4ec3b2fe1cb,same_obvious,LLMJudge-qwen3_32b-seed42,0.9,LLMJudge
e2f02cf550abe739038cd39847fdee11,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
6bce9d27e345a62765e4edffabc69d15,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
4d328b965414360d44f8b198a9b1afdb,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
3b300f439fc295e25d2eab5314e12960,same_obvious,LLMJudge-qwen3_32b-seed42,0.925,LLMJudge
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,LLMJudge-qwen3_32b-seed42,0.9425,LLMJudge
e6cb435292fbb4442fa34c65bc90d310,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
543047ead06ed9ccb6cd1dfd33e772fa,same_obvious,LLMJudge-qwen3_32b-seed42,0.7000000000000001,LLMJudge
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,LLMJudge-qwen3_32b-seed42,0.9075,LLMJudge
011586c0134e387bb8b76844873ed799,same_obvious,DNAEval-qwen3_32b-seed42,0.8154761904761905,DNAEval
8713d9b8067061c34f61019a07e2b325,same_obvious,DNAEval-qwen3_32b-seed42,0.7053571428571428,DNAEval
556a48114ca32e27dd7ca11b5c757ca1,same_obvious,DNAEval-qwen3_32b-seed42,1.0,DNAEval
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,DNAEval-qwen3_32b-seed42,0.8273809523809524,DNAEval
e2f02cf550abe739038cd39847fdee11,same_obvious,DNAEval-qwen3_32b-seed42,0.9047619047619048,DNAEval
0179efaa97ca0268d61db40365d0e286,same_obvious,DNAEval-qwen3_32b-seed42,0.9880952380952381,DNAEval
3b300f439fc295e25d2eab5314e12960,same_obvious,DNAEval-qwen3_32b-seed42,0.9389880952380952,DNAEval
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,DNAEval-qwen3_32b-seed42,0.8191964285714286,DNAEval
2b442cbafe16c8f1e46959aacf81d03d,same_obvious,DNAEval-qwen3_32b-seed42,0.6130952380952381,DNAEval
8713d9b8067061c34f61019a07e2b325,same_obvious,DNAEval-qwen3_32b-seed42,0.9672619047619048,DNAEval
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,DNAEval-qwen3_32b-seed42,0.9940476190476191,DNAEval
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,DNAEval-qwen3_32b-seed42,0.851190476190476,DNAEval
556a48114ca32e27dd7ca11b5c757ca1,same_obvious,DNAEval-qwen3_32b-seed42,1.0,DNAEval
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,DNAEval-qwen3_32b-seed42,0.9345238095238095,DNAEval
4d328b965414360d44f8b198a9b1afdb,same_obvious,DNAEval-qwen3_32b-seed42,1.0,DNAEval
0179efaa97ca0268d61db40365d0e286,same_obvious,DNAEval-qwen3_32b-seed42,0.9761904761904763,DNAEval
7d8fd314249cebb51ebb9048ca9a38bd,same_obvious,DNAEval-qwen3_32b-seed42,0.7261904761904762,DNAEval
011586c0134e387bb8b76844873ed799,same_obvious,DNAEval-qwen3_32b-seed42,0.7083333333333333,DNAEval
9121098f041fcd5aea47e50fa7aac874,same_obvious,DNAEval-qwen3_32b-seed42,0.9345238095238094,DNAEval
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,DNAEval-qwen3_32b-seed42,0.9322916666666666,DNAEval
543047ead06ed9ccb6cd1dfd33e772fa,same_obvious,DNAEval-qwen3_32b-seed42,0.7083333333333333,DNAEval
f2a1ebbdfd7ec0dd51e1b4ec3b2fe1cb,same_obvious,DNAEval-qwen3_32b-seed42,1.0,DNAEval
e2f02cf550abe739038cd39847fdee11,same_obvious,DNAEval-qwen3_32b-seed42,0.9047619047619048,DNAEval
6bce9d27e345a62765e4edffabc69d15,same_obvious,DNAEval-qwen3_32b-seed42,1.0,DNAEval
4d328b965414360d44f8b198a9b1afdb,same_obvious,DNAEval-qwen3_32b-seed42,0.9791666666666666,DNAEval
3b300f439fc295e25d2eab5314e12960,same_obvious,DNAEval-qwen3_32b-seed42,0.9955357142857142,DNAEval
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,DNAEval-qwen3_32b-seed42,0.890625,DNAEval
e6cb435292fbb4442fa34c65bc90d310,same_obvious,DNAEval-qwen3_32b-seed42,0.8214285714285714,DNAEval
543047ead06ed9ccb6cd1dfd33e772fa,same_obvious,DNAEval-qwen3_32b-seed42,0.931547619047619,DNAEval
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,DNAEval-qwen3_32b-seed42,0.8191964285714286,DNAEval
011586c0134e387bb8b76844873ed799,same_obvious,Autometrics_Regression_score,0.674487763841405,Autometrics
8713d9b8067061c34f61019a07e2b325,same_obvious,Autometrics_Regression_score,0.9561292268020397,Autometrics
556a48114ca32e27dd7ca11b5c757ca1,same_obvious,Autometrics_Regression_score,0.8287612415158253,Autometrics
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,Autometrics_Regression_score,0.9945725843482377,Autometrics
e2f02cf550abe739038cd39847fdee11,same_obvious,Autometrics_Regression_score,0.9034611678334276,Autometrics
0179efaa97ca0268d61db40365d0e286,same_obvious,Autometrics_Regression_score,0.7667585549809343,Autometrics
3b300f439fc295e25d2eab5314e12960,same_obvious,Autometrics_Regression_score,0.9716858192791382,Autometrics
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,Autometrics_Regression_score,0.9417953969742853,Autometrics
2b442cbafe16c8f1e46959aacf81d03d,same_obvious,Autometrics_Regression_score,0.8598660647010513,Autometrics
8713d9b8067061c34f61019a07e2b325,same_obvious,Autometrics_Regression_score,0.7492561663822914,Autometrics
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,Autometrics_Regression_score,0.9504946173079766,Autometrics
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,Autometrics_Regression_score,0.8542312212002485,Autometrics
556a48114ca32e27dd7ca11b5c757ca1,same_obvious,Autometrics_Regression_score,0.9203229687254018,Autometrics
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,Autometrics_Regression_score,0.9011788096076354,Autometrics
4d328b965414360d44f8b198a9b1afdb,same_obvious,Autometrics_Regression_score,0.8471789518597611,Autometrics
0179efaa97ca0268d61db40365d0e286,same_obvious,Autometrics_Regression_score,0.9148777002162815,Autometrics
7d8fd314249cebb51ebb9048ca9a38bd,same_obvious,Autometrics_Regression_score,0.9061048231852259,Autometrics
011586c0134e387bb8b76844873ed799,same_obvious,Autometrics_Regression_score,0.7198107325668068,Autometrics
9121098f041fcd5aea47e50fa7aac874,same_obvious,Autometrics_Regression_score,0.6464949935668418,Autometrics
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,Autometrics_Regression_score,0.8884867502990049,Autometrics
543047ead06ed9ccb6cd1dfd33e772fa,same_obvious,Autometrics_Regression_score,0.8554153308467257,Autometrics
f2a1ebbdfd7ec0dd51e1b4ec3b2fe1cb,same_obvious,Autometrics_Regression_score,0.7539576012737029,Autometrics
e2f02cf550abe739038cd39847fdee11,same_obvious,Autometrics_Regression_score,0.8041207225095331,Autometrics
6bce9d27e345a62765e4edffabc69d15,same_obvious,Autometrics_Regression_score,0.8964949935668419,Autometrics
4d328b965414360d44f8b198a9b1afdb,same_obvious,Autometrics_Regression_score,0.8948353701903605,Autometrics
3b300f439fc295e25d2eab5314e12960,same_obvious,Autometrics_Regression_score,0.8738335092041412,Autometrics
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,Autometrics_Regression_score,0.8321362921087946,Autometrics
e6cb435292fbb4442fa34c65bc90d310,same_obvious,Autometrics_Regression_score,0.8535050064331577,Autometrics
543047ead06ed9ccb6cd1dfd33e772fa,same_obvious,Autometrics_Regression_score,0.9023629192541126,Autometrics
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,Autometrics_Regression_score,0.9410865670510731,Autometrics
011586c0134e387bb8b76844873ed799,same_obvious,metametrics_score,0.8861733391778566,MetaMetrics
8713d9b8067061c34f61019a07e2b325,same_obvious,metametrics_score,0.9676629496686644,MetaMetrics
556a48114ca32e27dd7ca11b5c757ca1,same_obvious,metametrics_score,0.9055732896264611,MetaMetrics
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,metametrics_score,0.968175260199108,MetaMetrics
e2f02cf550abe739038cd39847fdee11,same_obvious,metametrics_score,0.8017013570665377,MetaMetrics
0179efaa97ca0268d61db40365d0e286,same_obvious,metametrics_score,0.9896456520338873,MetaMetrics
3b300f439fc295e25d2eab5314e12960,same_obvious,metametrics_score,0.8972470448285738,MetaMetrics
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,metametrics_score,0.843003012406425,MetaMetrics
2b442cbafe16c8f1e46959aacf81d03d,same_obvious,metametrics_score,0.8785999848981633,MetaMetrics
8713d9b8067061c34f61019a07e2b325,same_obvious,metametrics_score,0.9616185961032914,MetaMetrics
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,metametrics_score,0.6296526806605532,MetaMetrics
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,metametrics_score,0.9439859043889652,MetaMetrics
556a48114ca32e27dd7ca11b5c757ca1,same_obvious,metametrics_score,0.9019259908650773,MetaMetrics
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,metametrics_score,0.9738053988344562,MetaMetrics
4d328b965414360d44f8b198a9b1afdb,same_obvious,metametrics_score,0.9399203197150621,MetaMetrics
0179efaa97ca0268d61db40365d0e286,same_obvious,metametrics_score,0.9538368075198467,MetaMetrics
7d8fd314249cebb51ebb9048ca9a38bd,same_obvious,metametrics_score,0.36894584592121316,MetaMetrics
011586c0134e387bb8b76844873ed799,same_obvious,metametrics_score,0.9967268094522059,MetaMetrics
9121098f041fcd5aea47e50fa7aac874,same_obvious,metametrics_score,0.8474777157433757,MetaMetrics
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,metametrics_score,0.7987022793400832,MetaMetrics
543047ead06ed9ccb6cd1dfd33e772fa,same_obvious,metametrics_score,0.7590592025795678,MetaMetrics
f2a1ebbdfd7ec0dd51e1b4ec3b2fe1cb,same_obvious,metametrics_score,0.6035731526002057,MetaMetrics
e2f02cf550abe739038cd39847fdee11,same_obvious,metametrics_score,0.7073215082248321,MetaMetrics
6bce9d27e345a62765e4edffabc69d15,same_obvious,metametrics_score,0.8192930072462286,MetaMetrics
4d328b965414360d44f8b198a9b1afdb,same_obvious,metametrics_score,0.9639019649516756,MetaMetrics
3b300f439fc295e25d2eab5314e12960,same_obvious,metametrics_score,0.8639231883404164,MetaMetrics
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,metametrics_score,0.7124215747838878,MetaMetrics
e6cb435292fbb4442fa34c65bc90d310,same_obvious,metametrics_score,0.4820684981705303,MetaMetrics
543047ead06ed9ccb6cd1dfd33e772fa,same_obvious,metametrics_score,0.8933992124850219,MetaMetrics
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,metametrics_score,0.8046818590506444,MetaMetrics
011586c0134e387bb8b76844873ed799,same_obvious,LENS,0.8489283533603218,BEST_METRIC
8713d9b8067061c34f61019a07e2b325,same_obvious,LENS,0.751536952390469,BEST_METRIC
556a48114ca32e27dd7ca11b5c757ca1,same_obvious,LENS,0.8456326343318252,BEST_METRIC
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,LENS,0.9783818783868385,BEST_METRIC
e2f02cf550abe739038cd39847fdee11,same_obvious,LENS,0.6804794393334243,BEST_METRIC
0179efaa97ca0268d61db40365d0e286,same_obvious,LENS,0.9666050361568415,BEST_METRIC
3b300f439fc295e25d2eab5314e12960,same_obvious,LENS,0.9117207994114916,BEST_METRIC
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,LENS,0.8966826159939044,BEST_METRIC
2b442cbafe16c8f1e46959aacf81d03d,same_obvious,LENS,0.7798411177434196,BEST_METRIC
8713d9b8067061c34f61019a07e2b325,same_obvious,LENS,0.8753093856526478,BEST_METRIC
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,LENS,0.9056715622690208,BEST_METRIC
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,LENS,0.9988355646865849,BEST_METRIC
556a48114ca32e27dd7ca11b5c757ca1,same_obvious,LENS,0.5967268942745572,BEST_METRIC
21b6dc758db276bf6fe0cd15755d5c81,same_obvious,LENS,0.9530344320029737,BEST_METRIC
4d328b965414360d44f8b198a9b1afdb,same_obvious,LENS,0.6978952185494476,BEST_METRIC
0179efaa97ca0268d61db40365d0e286,same_obvious,LENS,0.9006726255620645,BEST_METRIC
7d8fd314249cebb51ebb9048ca9a38bd,same_obvious,LENS,0.8650666556105966,BEST_METRIC
011586c0134e387bb8b76844873ed799,same_obvious,LENS,0.9617271874201445,BEST_METRIC
9121098f041fcd5aea47e50fa7aac874,same_obvious,LENS,0.7606620030506994,BEST_METRIC
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,LENS,0.9739184940559467,BEST_METRIC
543047ead06ed9ccb6cd1dfd33e772fa,same_obvious,LENS,0.9819851445890535,BEST_METRIC
f2a1ebbdfd7ec0dd51e1b4ec3b2fe1cb,same_obvious,LENS,0.9026332220563866,BEST_METRIC
e2f02cf550abe739038cd39847fdee11,same_obvious,LENS,0.9010181478783588,BEST_METRIC
6bce9d27e345a62765e4edffabc69d15,same_obvious,LENS,0.8082770785181422,BEST_METRIC
4d328b965414360d44f8b198a9b1afdb,same_obvious,LENS,0.8072427036778258,BEST_METRIC
3b300f439fc295e25d2eab5314e12960,same_obvious,LENS,0.8625937273719714,BEST_METRIC
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,LENS,0.9851560979287299,BEST_METRIC
e6cb435292fbb4442fa34c65bc90d310,same_obvious,LENS,0.7690648489147802,BEST_METRIC
543047ead06ed9ccb6cd1dfd33e772fa,same_obvious,LENS,0.9261732782708683,BEST_METRIC
5a603f108b7b98a7aa5968e9e79e0fad,same_obvious,LENS,0.976260005458447,BEST_METRIC
