uid,strategy,metric,score,metric_logical
f801cfe693b7e8ca9064dc636337cd4e,same_obvious,LLMJudge-qwen3_32b-seed42,0.5,LLMJudge
108cdf63a53d973d167fd3ded5150fbc,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
ec0027e8cfce27851981c11f81814724,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
f82a62137c6e61901b13c08bc5685e4d,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
279b17b0e0f009f16e6eb50e44216cab,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
1a47bf83401270b3185722fc037e5165,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
47b1d275c44d4c678ab9b79632f5ace9,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
04a2bc52bdb946815bec4e524b2fe0e4,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
206d16bccf6136b207d8daa2b41b20ec,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
c4b77ae688b9b377f64abd9a8a559be9,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
084a88f1a9f8f4a017541702c36b7b34,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
d06f9b57c92730ad43912813fdfb49fe,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
76037b936bd9acbe650a5e49224bcaec,same_obvious,LLMJudge-qwen3_32b-seed42,0.6,LLMJudge
79d4c6499793b5e5d9064a00c9a637c2,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
b7ff1369e0053126fea26634d72dedc1,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
4b9ab55523a86fc7a1705c3f6093fae2,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
ca1a7d4507f1cf29ce0f5ae6ca34ea7c,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
0f12b013cb3a5ce3deec93de8020e61f,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
bd19d7643d625304350eea57ae358add,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
3a9a4cd2b5878f39a23a247d67986541,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
f801cfe693b7e8ca9064dc636337cd4e,same_obvious,DNAEval-qwen3_32b-seed42,0.8875968992248062,DNAEval
108cdf63a53d973d167fd3ded5150fbc,same_obvious,DNAEval-qwen3_32b-seed42,0.9689922480620156,DNAEval
ec0027e8cfce27851981c11f81814724,same_obvious,DNAEval-qwen3_32b-seed42,0.8139534883720931,DNAEval
f82a62137c6e61901b13c08bc5685e4d,same_obvious,DNAEval-qwen3_32b-seed42,0.8682170542635659,DNAEval
279b17b0e0f009f16e6eb50e44216cab,same_obvious,DNAEval-qwen3_32b-seed42,0.821705426356589,DNAEval
1a47bf83401270b3185722fc037e5165,same_obvious,DNAEval-qwen3_32b-seed42,0.821705426356589,DNAEval
47b1d275c44d4c678ab9b79632f5ace9,same_obvious,DNAEval-qwen3_32b-seed42,0.8449612403100775,DNAEval
04a2bc52bdb946815bec4e524b2fe0e4,same_obvious,DNAEval-qwen3_32b-seed42,0.8449612403100775,DNAEval
206d16bccf6136b207d8daa2b41b20ec,same_obvious,DNAEval-qwen3_32b-seed42,0.8720930232558141,DNAEval
c4b77ae688b9b377f64abd9a8a559be9,same_obvious,DNAEval-qwen3_32b-seed42,0.937984496124031,DNAEval
084a88f1a9f8f4a017541702c36b7b34,same_obvious,DNAEval-qwen3_32b-seed42,0.6705426356589147,DNAEval
d06f9b57c92730ad43912813fdfb49fe,same_obvious,DNAEval-qwen3_32b-seed42,0.9612403100775194,DNAEval
76037b936bd9acbe650a5e49224bcaec,same_obvious,DNAEval-qwen3_32b-seed42,0.9534883720930232,DNAEval
79d4c6499793b5e5d9064a00c9a637c2,same_obvious,DNAEval-qwen3_32b-seed42,0.9651162790697675,DNAEval
b7ff1369e0053126fea26634d72dedc1,same_obvious,DNAEval-qwen3_32b-seed42,0.9263565891472869,DNAEval
4b9ab55523a86fc7a1705c3f6093fae2,same_obvious,DNAEval-qwen3_32b-seed42,0.8100775193798451,DNAEval
ca1a7d4507f1cf29ce0f5ae6ca34ea7c,same_obvious,DNAEval-qwen3_32b-seed42,0.9806201550387598,DNAEval
0f12b013cb3a5ce3deec93de8020e61f,same_obvious,DNAEval-qwen3_32b-seed42,0.9961240310077518,DNAEval
bd19d7643d625304350eea57ae358add,same_obvious,DNAEval-qwen3_32b-seed42,0.9961240310077519,DNAEval
3a9a4cd2b5878f39a23a247d67986541,same_obvious,DNAEval-qwen3_32b-seed42,0.9302325581395346,DNAEval
f801cfe693b7e8ca9064dc636337cd4e,same_obvious,Autometrics_Regression_grade,0.8110916487707123,Autometrics
108cdf63a53d973d167fd3ded5150fbc,same_obvious,Autometrics_Regression_grade,0.8629723065690207,Autometrics
ec0027e8cfce27851981c11f81814724,same_obvious,Autometrics_Regression_grade,0.4898221889495875,Autometrics
f82a62137c6e61901b13c08bc5685e4d,same_obvious,Autometrics_Regression_grade,0.6713057728383068,Autometrics
279b17b0e0f009f16e6eb50e44216cab,same_obvious,Autometrics_Regression_grade,0.3880609262668442,Autometrics
1a47bf83401270b3185722fc037e5165,same_obvious,Autometrics_Regression_grade,0.3932269616276043,Autometrics
47b1d275c44d4c678ab9b79632f5ace9,same_obvious,Autometrics_Regression_grade,0.8921390816212728,Autometrics
04a2bc52bdb946815bec4e524b2fe0e4,same_obvious,Autometrics_Regression_grade,0.24920330657466894,Autometrics
206d16bccf6136b207d8daa2b41b20ec,same_obvious,Autometrics_Regression_grade,0.6486652444602499,Autometrics
c4b77ae688b9b377f64abd9a8a559be9,same_obvious,Autometrics_Regression_grade,0.8934375241438405,Autometrics
084a88f1a9f8f4a017541702c36b7b34,same_obvious,Autometrics_Regression_grade,0.6617516339322506,Autometrics
d06f9b57c92730ad43912813fdfb49fe,same_obvious,Autometrics_Regression_grade,0.31807585714510067,Autometrics
76037b936bd9acbe650a5e49224bcaec,same_obvious,Autometrics_Regression_grade,0.2980232283527132,Autometrics
79d4c6499793b5e5d9064a00c9a637c2,same_obvious,Autometrics_Regression_grade,0.7370979864069899,Autometrics
b7ff1369e0053126fea26634d72dedc1,same_obvious,Autometrics_Regression_grade,0.3783560642864692,Autometrics
4b9ab55523a86fc7a1705c3f6093fae2,same_obvious,Autometrics_Regression_grade,0.6430321515195817,Autometrics
ca1a7d4507f1cf29ce0f5ae6ca34ea7c,same_obvious,Autometrics_Regression_grade,0.9912412918184604,Autometrics
0f12b013cb3a5ce3deec93de8020e61f,same_obvious,Autometrics_Regression_grade,0.9558658109806957,Autometrics
bd19d7643d625304350eea57ae358add,same_obvious,Autometrics_Regression_grade,0.8454423567423303,Autometrics
3a9a4cd2b5878f39a23a247d67986541,same_obvious,Autometrics_Regression_grade,0.5225678998137668,Autometrics
f801cfe693b7e8ca9064dc636337cd4e,same_obvious,metametrics_score,0.9682122199409521,MetaMetrics
108cdf63a53d973d167fd3ded5150fbc,same_obvious,metametrics_score,0.6622191447413266,MetaMetrics
ec0027e8cfce27851981c11f81814724,same_obvious,metametrics_score,0.695296732278761,MetaMetrics
f82a62137c6e61901b13c08bc5685e4d,same_obvious,metametrics_score,0.8852356863272374,MetaMetrics
279b17b0e0f009f16e6eb50e44216cab,same_obvious,metametrics_score,0.32488413151125706,MetaMetrics
1a47bf83401270b3185722fc037e5165,same_obvious,metametrics_score,0.9598808104835241,MetaMetrics
47b1d275c44d4c678ab9b79632f5ace9,same_obvious,metametrics_score,0.35861140545691905,MetaMetrics
04a2bc52bdb946815bec4e524b2fe0e4,same_obvious,metametrics_score,0.2513567288172256,MetaMetrics
206d16bccf6136b207d8daa2b41b20ec,same_obvious,metametrics_score,0.32805474385285627,MetaMetrics
c4b77ae688b9b377f64abd9a8a559be9,same_obvious,metametrics_score,0.9541862999283627,MetaMetrics
084a88f1a9f8f4a017541702c36b7b34,same_obvious,metametrics_score,0.8321989851158296,MetaMetrics
d06f9b57c92730ad43912813fdfb49fe,same_obvious,metametrics_score,0.942900245809373,MetaMetrics
76037b936bd9acbe650a5e49224bcaec,same_obvious,metametrics_score,0.38751641599356235,MetaMetrics
79d4c6499793b5e5d9064a00c9a637c2,same_obvious,metametrics_score,0.8454445644885366,MetaMetrics
b7ff1369e0053126fea26634d72dedc1,same_obvious,metametrics_score,0.642194514806423,MetaMetrics
4b9ab55523a86fc7a1705c3f6093fae2,same_obvious,metametrics_score,0.9137126705026014,MetaMetrics
ca1a7d4507f1cf29ce0f5ae6ca34ea7c,same_obvious,metametrics_score,0.9325751568967637,MetaMetrics
0f12b013cb3a5ce3deec93de8020e61f,same_obvious,metametrics_score,0.9772136173810011,MetaMetrics
bd19d7643d625304350eea57ae358add,same_obvious,metametrics_score,0.6474480290503821,MetaMetrics
3a9a4cd2b5878f39a23a247d67986541,same_obvious,metametrics_score,0.39123054129221346,MetaMetrics
f801cfe693b7e8ca9064dc636337cd4e,same_obvious,PRM_min,0.8666666666666667,BEST_METRIC
108cdf63a53d973d167fd3ded5150fbc,same_obvious,PRM_min,0.9416666666666667,BEST_METRIC
ec0027e8cfce27851981c11f81814724,same_obvious,PRM_min,0.8666666666666667,BEST_METRIC
f82a62137c6e61901b13c08bc5685e4d,same_obvious,PRM_min,0.5416666666666666,BEST_METRIC
279b17b0e0f009f16e6eb50e44216cab,same_obvious,PRM_min,0.7916666666666666,BEST_METRIC
1a47bf83401270b3185722fc037e5165,same_obvious,PRM_min,0.9333333333333333,BEST_METRIC
47b1d275c44d4c678ab9b79632f5ace9,same_obvious,PRM_min,0.9833333333333333,BEST_METRIC
04a2bc52bdb946815bec4e524b2fe0e4,same_obvious,PRM_min,0.8916666666666667,BEST_METRIC
206d16bccf6136b207d8daa2b41b20ec,same_obvious,PRM_min,0.8333333333333334,BEST_METRIC
c4b77ae688b9b377f64abd9a8a559be9,same_obvious,PRM_min,0.775,BEST_METRIC
084a88f1a9f8f4a017541702c36b7b34,same_obvious,PRM_min,0.8916666666666667,BEST_METRIC
d06f9b57c92730ad43912813fdfb49fe,same_obvious,PRM_min,0.9083333333333333,BEST_METRIC
76037b936bd9acbe650a5e49224bcaec,same_obvious,PRM_min,0.975,BEST_METRIC
79d4c6499793b5e5d9064a00c9a637c2,same_obvious,PRM_min,0.7583333333333333,BEST_METRIC
b7ff1369e0053126fea26634d72dedc1,same_obvious,PRM_min,0.8166666666666667,BEST_METRIC
4b9ab55523a86fc7a1705c3f6093fae2,same_obvious,PRM_min,0.95,BEST_METRIC
ca1a7d4507f1cf29ce0f5ae6ca34ea7c,same_obvious,PRM_min,0.8416666666666667,BEST_METRIC
0f12b013cb3a5ce3deec93de8020e61f,same_obvious,PRM_min,0.9166666666666666,BEST_METRIC
bd19d7643d625304350eea57ae358add,same_obvious,PRM_min,0.6083333333333334,BEST_METRIC
3a9a4cd2b5878f39a23a247d67986541,same_obvious,PRM_min,0.9583333333333333,BEST_METRIC
