uid,strategy,metric,score,metric_logical
dcc0cd81daf008bdfae70cb902823c53,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
2fe9fed70ecfdfc7406f63336b9f67b5,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
1442ea8b59ba6c124ac8525dab82bd08,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
48ea9872c8b201ca1c1773bbe70170c9,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
c19ec0c4a847787392020a13174dac02,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
a358fad2e28e1f7d3fc52755bd34849b,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
ff74fa1597632dab4f80bf4aec94e907,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
92bb363244292221145525077d1306d6,same_obvious,LLMJudge-qwen3_32b-seed42,0.5,LLMJudge
0b2c66ac1f8199f86c00bf5d412cc36c,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
0649bc1a231eccf68cb2be7da1adadb7,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
41e967b0ccf6603299264ee180d2b763,same_obvious,LLMJudge-qwen3_32b-seed42,0.25,LLMJudge
5bf6b0f3084abb60f4069c81d97c3ed8,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
f9517141bc8e0acd91a94a0e52426bd4,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
9eb7be8308db996a7ca95250a5f47b4e,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
73ddd7f74ce126f782900cdfe9a55509,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
1faf0ded025e8cd1f60c151bfa4ca0de,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
9a4a189905d89c96ee4ce0481278c7bc,same_obvious,LLMJudge-qwen3_32b-seed42,0.625,LLMJudge
51e0ef37662d6d6ad70b8765eebec1bd,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
7a3a7b0bc82a43df002b41125ef35f33,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
48ea9872c8b201ca1c1773bbe70170c9,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
6fb316e2677fc2f091dde02c009c9306,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
c90d54dc00fe18aa94b34778831ca527,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
a6cd9e3ed71e9d66eea1845d39155784,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
6266f27bb635107a3cf388d77e1c51ab,same_obvious,LLMJudge-qwen3_32b-seed42,0.5,LLMJudge
035ed2311b96d2a65ec6a6fe71046c14,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
a6b24668430907d0c15a2e24d42c0ddb,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
5752ef4acf1dbf1b4f8c6dedea76efa4,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
8e50191baec2f75d7317f15f44d0801c,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
fc42f578dfa5cd847ba369215fc723cf,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
6776468400b158821b8e8f7dffb67e41,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
dcc0cd81daf008bdfae70cb902823c53,same_obvious,DNAEval-qwen3_32b-seed42,0.9972222222222222,DNAEval
2fe9fed70ecfdfc7406f63336b9f67b5,same_obvious,DNAEval-qwen3_32b-seed42,0.911111111111111,DNAEval
1442ea8b59ba6c124ac8525dab82bd08,same_obvious,DNAEval-qwen3_32b-seed42,0.8861111111111111,DNAEval
48ea9872c8b201ca1c1773bbe70170c9,same_obvious,DNAEval-qwen3_32b-seed42,0.9833333333333333,DNAEval
c19ec0c4a847787392020a13174dac02,same_obvious,DNAEval-qwen3_32b-seed42,0.9027777777777778,DNAEval
a358fad2e28e1f7d3fc52755bd34849b,same_obvious,DNAEval-qwen3_32b-seed42,0.9666666666666667,DNAEval
ff74fa1597632dab4f80bf4aec94e907,same_obvious,DNAEval-qwen3_32b-seed42,0.9833333333333334,DNAEval
92bb363244292221145525077d1306d6,same_obvious,DNAEval-qwen3_32b-seed42,0.5555555555555556,DNAEval
0b2c66ac1f8199f86c00bf5d412cc36c,same_obvious,DNAEval-qwen3_32b-seed42,0.9500000000000001,DNAEval
0649bc1a231eccf68cb2be7da1adadb7,same_obvious,DNAEval-qwen3_32b-seed42,0.6916666666666668,DNAEval
41e967b0ccf6603299264ee180d2b763,same_obvious,DNAEval-qwen3_32b-seed42,0.18518888888888885,DNAEval
5bf6b0f3084abb60f4069c81d97c3ed8,same_obvious,DNAEval-qwen3_32b-seed42,0.7222222222222221,DNAEval
f9517141bc8e0acd91a94a0e52426bd4,same_obvious,DNAEval-qwen3_32b-seed42,0.9250000000000002,DNAEval
9eb7be8308db996a7ca95250a5f47b4e,same_obvious,DNAEval-qwen3_32b-seed42,0.9444444444444445,DNAEval
73ddd7f74ce126f782900cdfe9a55509,same_obvious,DNAEval-qwen3_32b-seed42,0.8305555555555555,DNAEval
1faf0ded025e8cd1f60c151bfa4ca0de,same_obvious,DNAEval-qwen3_32b-seed42,0.9833333333333333,DNAEval
9a4a189905d89c96ee4ce0481278c7bc,same_obvious,DNAEval-qwen3_32b-seed42,0.5777777777777778,DNAEval
51e0ef37662d6d6ad70b8765eebec1bd,same_obvious,DNAEval-qwen3_32b-seed42,0.9611111111111111,DNAEval
7a3a7b0bc82a43df002b41125ef35f33,same_obvious,DNAEval-qwen3_32b-seed42,0.9527777777777776,DNAEval
48ea9872c8b201ca1c1773bbe70170c9,same_obvious,DNAEval-qwen3_32b-seed42,0.8388888888888888,DNAEval
6fb316e2677fc2f091dde02c009c9306,same_obvious,DNAEval-qwen3_32b-seed42,0.9888888888888889,DNAEval
c90d54dc00fe18aa94b34778831ca527,same_obvious,DNAEval-qwen3_32b-seed42,0.7027777777777777,DNAEval
a6cd9e3ed71e9d66eea1845d39155784,same_obvious,DNAEval-qwen3_32b-seed42,0.7916666666666666,DNAEval
6266f27bb635107a3cf388d77e1c51ab,same_obvious,DNAEval-qwen3_32b-seed42,0.8,DNAEval
035ed2311b96d2a65ec6a6fe71046c14,same_obvious,DNAEval-qwen3_32b-seed42,0.7277777777777777,DNAEval
a6b24668430907d0c15a2e24d42c0ddb,same_obvious,DNAEval-qwen3_32b-seed42,1.0,DNAEval
5752ef4acf1dbf1b4f8c6dedea76efa4,same_obvious,DNAEval-qwen3_32b-seed42,0.6888888888888889,DNAEval
8e50191baec2f75d7317f15f44d0801c,same_obvious,DNAEval-qwen3_32b-seed42,1.0,DNAEval
fc42f578dfa5cd847ba369215fc723cf,same_obvious,DNAEval-qwen3_32b-seed42,0.7916666666666666,DNAEval
6776468400b158821b8e8f7dffb67e41,same_obvious,DNAEval-qwen3_32b-seed42,0.9777777777777779,DNAEval
dcc0cd81daf008bdfae70cb902823c53,same_obvious,Autometrics_Regression_helpfulness,0.8853520141714559,Autometrics
2fe9fed70ecfdfc7406f63336b9f67b5,same_obvious,Autometrics_Regression_helpfulness,0.9002309238644568,Autometrics
1442ea8b59ba6c124ac8525dab82bd08,same_obvious,Autometrics_Regression_helpfulness,0.9859552067755538,Autometrics
48ea9872c8b201ca1c1773bbe70170c9,same_obvious,Autometrics_Regression_helpfulness,0.9835921175128021,Autometrics
c19ec0c4a847787392020a13174dac02,same_obvious,Autometrics_Regression_helpfulness,0.985045655381966,Autometrics
a358fad2e28e1f7d3fc52755bd34849b,same_obvious,Autometrics_Regression_helpfulness,0.9472530113307864,Autometrics
ff74fa1597632dab4f80bf4aec94e907,same_obvious,Autometrics_Regression_helpfulness,0.9439685550788773,Autometrics
92bb363244292221145525077d1306d6,same_obvious,Autometrics_Regression_helpfulness,0.669910182525362,Autometrics
0b2c66ac1f8199f86c00bf5d412cc36c,same_obvious,Autometrics_Regression_helpfulness,0.9637356736215718,Autometrics
0649bc1a231eccf68cb2be7da1adadb7,same_obvious,Autometrics_Regression_helpfulness,0.9519746977980786,Autometrics
41e967b0ccf6603299264ee180d2b763,same_obvious,Autometrics_Regression_helpfulness,0.47040431218538725,Autometrics
5bf6b0f3084abb60f4069c81d97c3ed8,same_obvious,Autometrics_Regression_helpfulness,0.8345677738198161,Autometrics
f9517141bc8e0acd91a94a0e52426bd4,same_obvious,Autometrics_Regression_helpfulness,0.8985781913639164,Autometrics
9eb7be8308db996a7ca95250a5f47b4e,same_obvious,Autometrics_Regression_helpfulness,0.8221379108952376,Autometrics
73ddd7f74ce126f782900cdfe9a55509,same_obvious,Autometrics_Regression_helpfulness,0.8517756021069851,Autometrics
1faf0ded025e8cd1f60c151bfa4ca0de,same_obvious,Autometrics_Regression_helpfulness,0.8610132242725819,Autometrics
9a4a189905d89c96ee4ce0481278c7bc,same_obvious,Autometrics_Regression_helpfulness,0.747242613137675,Autometrics
51e0ef37662d6d6ad70b8765eebec1bd,same_obvious,Autometrics_Regression_helpfulness,0.896478868711804,Autometrics
7a3a7b0bc82a43df002b41125ef35f33,same_obvious,Autometrics_Regression_helpfulness,0.9391859905168117,Autometrics
48ea9872c8b201ca1c1773bbe70170c9,same_obvious,Autometrics_Regression_helpfulness,0.8884552469393552,Autometrics
6fb316e2677fc2f091dde02c009c9306,same_obvious,Autometrics_Regression_helpfulness,0.9105782874234922,Autometrics
c90d54dc00fe18aa94b34778831ca527,same_obvious,Autometrics_Regression_helpfulness,0.9299791233905765,Autometrics
a6cd9e3ed71e9d66eea1845d39155784,same_obvious,Autometrics_Regression_helpfulness,0.9354019108126432,Autometrics
6266f27bb635107a3cf388d77e1c51ab,same_obvious,Autometrics_Regression_helpfulness,0.827827430493798,Autometrics
035ed2311b96d2a65ec6a6fe71046c14,same_obvious,Autometrics_Regression_helpfulness,0.9668772476817572,Autometrics
a6b24668430907d0c15a2e24d42c0ddb,same_obvious,Autometrics_Regression_helpfulness,0.8917879232599866,Autometrics
5752ef4acf1dbf1b4f8c6dedea76efa4,same_obvious,Autometrics_Regression_helpfulness,0.8500542484826153,Autometrics
8e50191baec2f75d7317f15f44d0801c,same_obvious,Autometrics_Regression_helpfulness,0.9844894499985053,Autometrics
fc42f578dfa5cd847ba369215fc723cf,same_obvious,Autometrics_Regression_helpfulness,0.7948283202354626,Autometrics
6776468400b158821b8e8f7dffb67e41,same_obvious,Autometrics_Regression_helpfulness,0.9510536984533436,Autometrics
dcc0cd81daf008bdfae70cb902823c53,same_obvious,metametrics_score,0.8942644822348476,MetaMetrics
2fe9fed70ecfdfc7406f63336b9f67b5,same_obvious,metametrics_score,0.9361221492353379,MetaMetrics
1442ea8b59ba6c124ac8525dab82bd08,same_obvious,metametrics_score,0.7106798274522349,MetaMetrics
48ea9872c8b201ca1c1773bbe70170c9,same_obvious,metametrics_score,0.8607925542075754,MetaMetrics
c19ec0c4a847787392020a13174dac02,same_obvious,metametrics_score,0.804238949818946,MetaMetrics
a358fad2e28e1f7d3fc52755bd34849b,same_obvious,metametrics_score,0.8078786797569066,MetaMetrics
ff74fa1597632dab4f80bf4aec94e907,same_obvious,metametrics_score,0.7601898042602149,MetaMetrics
92bb363244292221145525077d1306d6,same_obvious,metametrics_score,0.9594662083430339,MetaMetrics
0b2c66ac1f8199f86c00bf5d412cc36c,same_obvious,metametrics_score,0.9169885156750263,MetaMetrics
0649bc1a231eccf68cb2be7da1adadb7,same_obvious,metametrics_score,0.9391108831309632,MetaMetrics
41e967b0ccf6603299264ee180d2b763,same_obvious,metametrics_score,0.9980489282035714,MetaMetrics
5bf6b0f3084abb60f4069c81d97c3ed8,same_obvious,metametrics_score,0.8511853708510596,MetaMetrics
f9517141bc8e0acd91a94a0e52426bd4,same_obvious,metametrics_score,0.9783764348482701,MetaMetrics
9eb7be8308db996a7ca95250a5f47b4e,same_obvious,metametrics_score,0.3057066009549081,MetaMetrics
73ddd7f74ce126f782900cdfe9a55509,same_obvious,metametrics_score,0.6148863563577495,MetaMetrics
1faf0ded025e8cd1f60c151bfa4ca0de,same_obvious,metametrics_score,0.7698119164501009,MetaMetrics
9a4a189905d89c96ee4ce0481278c7bc,same_obvious,metametrics_score,0.4862727878093853,MetaMetrics
51e0ef37662d6d6ad70b8765eebec1bd,same_obvious,metametrics_score,0.8801996357778832,MetaMetrics
7a3a7b0bc82a43df002b41125ef35f33,same_obvious,metametrics_score,0.8781781851955656,MetaMetrics
48ea9872c8b201ca1c1773bbe70170c9,same_obvious,metametrics_score,0.6445344012602681,MetaMetrics
6fb316e2677fc2f091dde02c009c9306,same_obvious,metametrics_score,0.3563553417543308,MetaMetrics
c90d54dc00fe18aa94b34778831ca527,same_obvious,metametrics_score,0.5935952436526704,MetaMetrics
a6cd9e3ed71e9d66eea1845d39155784,same_obvious,metametrics_score,0.7877994297822368,MetaMetrics
6266f27bb635107a3cf388d77e1c51ab,same_obvious,metametrics_score,0.6234993659557562,MetaMetrics
035ed2311b96d2a65ec6a6fe71046c14,same_obvious,metametrics_score,0.8927537832051446,MetaMetrics
a6b24668430907d0c15a2e24d42c0ddb,same_obvious,metametrics_score,0.8437362539019079,MetaMetrics
5752ef4acf1dbf1b4f8c6dedea76efa4,same_obvious,metametrics_score,0.8902854072418683,MetaMetrics
8e50191baec2f75d7317f15f44d0801c,same_obvious,metametrics_score,0.8011275398577983,MetaMetrics
fc42f578dfa5cd847ba369215fc723cf,same_obvious,metametrics_score,0.859792909634353,MetaMetrics
6776468400b158821b8e8f7dffb67e41,same_obvious,metametrics_score,0.8455247157762622,MetaMetrics
dcc0cd81daf008bdfae70cb902823c53,same_obvious,INFORMRewardModel,0.8967297762478486,BEST_METRIC
2fe9fed70ecfdfc7406f63336b9f67b5,same_obvious,INFORMRewardModel,0.8760757314974182,BEST_METRIC
1442ea8b59ba6c124ac8525dab82bd08,same_obvious,INFORMRewardModel,0.9268502581755593,BEST_METRIC
48ea9872c8b201ca1c1773bbe70170c9,same_obvious,INFORMRewardModel,0.9827882960413081,BEST_METRIC
c19ec0c4a847787392020a13174dac02,same_obvious,INFORMRewardModel,0.9664371772805508,BEST_METRIC
a358fad2e28e1f7d3fc52755bd34849b,same_obvious,INFORMRewardModel,0.9784853700516352,BEST_METRIC
ff74fa1597632dab4f80bf4aec94e907,same_obvious,INFORMRewardModel,0.941655012908778,BEST_METRIC
92bb363244292221145525077d1306d6,same_obvious,INFORMRewardModel,0.9109294320137693,BEST_METRIC
0b2c66ac1f8199f86c00bf5d412cc36c,same_obvious,INFORMRewardModel,0.9518072289156627,BEST_METRIC
0649bc1a231eccf68cb2be7da1adadb7,same_obvious,INFORMRewardModel,0.9621342512908778,BEST_METRIC
41e967b0ccf6603299264ee180d2b763,same_obvious,INFORMRewardModel,0.7074010327022375,BEST_METRIC
5bf6b0f3084abb60f4069c81d97c3ed8,same_obvious,INFORMRewardModel,0.8726333907056799,BEST_METRIC
f9517141bc8e0acd91a94a0e52426bd4,same_obvious,INFORMRewardModel,0.9294320137693631,BEST_METRIC
9eb7be8308db996a7ca95250a5f47b4e,same_obvious,INFORMRewardModel,0.774526678141136,BEST_METRIC
73ddd7f74ce126f782900cdfe9a55509,same_obvious,INFORMRewardModel,0.8192771084337349,BEST_METRIC
1faf0ded025e8cd1f60c151bfa4ca0de,same_obvious,INFORMRewardModel,0.8433734939759037,BEST_METRIC
9a4a189905d89c96ee4ce0481278c7bc,same_obvious,INFORMRewardModel,0.8455249569707401,BEST_METRIC
51e0ef37662d6d6ad70b8765eebec1bd,same_obvious,INFORMRewardModel,0.9240533562822719,BEST_METRIC
7a3a7b0bc82a43df002b41125ef35f33,same_obvious,INFORMRewardModel,0.9716006884681584,BEST_METRIC
48ea9872c8b201ca1c1773bbe70170c9,same_obvious,INFORMRewardModel,0.9156626506024096,BEST_METRIC
6fb316e2677fc2f091dde02c009c9306,same_obvious,INFORMRewardModel,0.9057659208261618,BEST_METRIC
c90d54dc00fe18aa94b34778831ca527,same_obvious,INFORMRewardModel,0.9905335628227195,BEST_METRIC
a6cd9e3ed71e9d66eea1845d39155784,same_obvious,INFORMRewardModel,0.9302925989672978,BEST_METRIC
6266f27bb635107a3cf388d77e1c51ab,same_obvious,INFORMRewardModel,0.927710843373494,BEST_METRIC
035ed2311b96d2a65ec6a6fe71046c14,same_obvious,INFORMRewardModel,0.9690189328743546,BEST_METRIC
a6b24668430907d0c15a2e24d42c0ddb,same_obvious,INFORMRewardModel,0.8919965576592083,BEST_METRIC
5752ef4acf1dbf1b4f8c6dedea76efa4,same_obvious,INFORMRewardModel,0.9367469879518072,BEST_METRIC
8e50191baec2f75d7317f15f44d0801c,same_obvious,INFORMRewardModel,0.9931153184165232,BEST_METRIC
fc42f578dfa5cd847ba369215fc723cf,same_obvious,INFORMRewardModel,0.6987951807228916,BEST_METRIC
6776468400b158821b8e8f7dffb67e41,same_obvious,INFORMRewardModel,1.0,BEST_METRIC
