uid,strategy,metric,score,metric_logical
ce8d55ca3791982c86f047cdc8a3a418,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
8efea49e516203f968b41641ef3f12b8,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
0b2b2cb9001ac4a182d934a24ba427e0,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
505f542de6f8c4bb3f46a30ec06684ca,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
86215affc10344b259275703ffc9f19a,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
3054a3712d3fdab017e24c58d79afb26,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
75ba6dc6f24e2cafff75919543cd3efa,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
c4cf2c49370bb6a971d045d6dfebe21a,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
df66cc7932afe4e89577bd654025c532,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
21290b829e96cfbf062e633a5138ac1f,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
aa0dcb69d0f676846f1bc3cc576ac5e6,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
a226cd75f8c3339a6261a50985156dc9,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
b5b444c236ab45234263c585cc011e8c,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
389665f0c600eca76500129c010a5be2,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
df4ae4215fa3fb628eef337ece89a217,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
82edc54570a07226ee90eda45a3553ca,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
1492b4e463432d1d7d858712e6c79812,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
9d1b51febde3b2664cee7f4125c5cf2d,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
ffdd45dd795d4804ae9c6651b63e25a4,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
26c487fcf9ee09797498df54775d91e5,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
d6090b5d8fb0ec630386b0d6f21796bf,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
9ce851589f3754772f0eec4d8d8c59ca,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
37b5d6ab4f988e5e7812ee0b93e75afe,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
cc4e7ec102c99c881e50fcc5b27e759a,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
18b6cc22c4d999aa96389298ee04125c,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
a3ed03409978affc4cbfe0e08eaeae87,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
5398d26c6787a05f7a204a36de500f50,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
4d1fb838df1225a71be7243ef9b5e357,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
094bd6c5a70e255659a07e0f79390493,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
2881dac4923d649ad0dfec243094dc99,same_obvious,LLMJudge-qwen3_32b-seed42,0.0,LLMJudge
ce8d55ca3791982c86f047cdc8a3a418,same_obvious,DNAEval-qwen3_32b-seed42,0.5,DNAEval
8efea49e516203f968b41641ef3f12b8,same_obvious,DNAEval-qwen3_32b-seed42,0.37777777777777777,DNAEval
0b2b2cb9001ac4a182d934a24ba427e0,same_obvious,DNAEval-qwen3_32b-seed42,0.9777777777777777,DNAEval
505f542de6f8c4bb3f46a30ec06684ca,same_obvious,DNAEval-qwen3_32b-seed42,1.0,DNAEval
86215affc10344b259275703ffc9f19a,same_obvious,DNAEval-qwen3_32b-seed42,0.0,DNAEval
3054a3712d3fdab017e24c58d79afb26,same_obvious,DNAEval-qwen3_32b-seed42,0.8444444444444446,DNAEval
75ba6dc6f24e2cafff75919543cd3efa,same_obvious,DNAEval-qwen3_32b-seed42,0.1333333333333333,DNAEval
c4cf2c49370bb6a971d045d6dfebe21a,same_obvious,DNAEval-qwen3_32b-seed42,0.6888888888888889,DNAEval
df66cc7932afe4e89577bd654025c532,same_obvious,DNAEval-qwen3_32b-seed42,0.7592111111111111,DNAEval
21290b829e96cfbf062e633a5138ac1f,same_obvious,DNAEval-qwen3_32b-seed42,1.0,DNAEval
aa0dcb69d0f676846f1bc3cc576ac5e6,same_obvious,DNAEval-qwen3_32b-seed42,0.13611111111111107,DNAEval
a226cd75f8c3339a6261a50985156dc9,same_obvious,DNAEval-qwen3_32b-seed42,0.5222222222222223,DNAEval
b5b444c236ab45234263c585cc011e8c,same_obvious,DNAEval-qwen3_32b-seed42,0.6944444444444444,DNAEval
389665f0c600eca76500129c010a5be2,same_obvious,DNAEval-qwen3_32b-seed42,0.6555555555555557,DNAEval
df4ae4215fa3fb628eef337ece89a217,same_obvious,DNAEval-qwen3_32b-seed42,0.11111111111111116,DNAEval
82edc54570a07226ee90eda45a3553ca,same_obvious,DNAEval-qwen3_32b-seed42,0.1777777777777778,DNAEval
1492b4e463432d1d7d858712e6c79812,same_obvious,DNAEval-qwen3_32b-seed42,0.6000000000000001,DNAEval
9d1b51febde3b2664cee7f4125c5cf2d,same_obvious,DNAEval-qwen3_32b-seed42,0.7666666666666666,DNAEval
ffdd45dd795d4804ae9c6651b63e25a4,same_obvious,DNAEval-qwen3_32b-seed42,0.0,DNAEval
26c487fcf9ee09797498df54775d91e5,same_obvious,DNAEval-qwen3_32b-seed42,0.9305555555555556,DNAEval
d6090b5d8fb0ec630386b0d6f21796bf,same_obvious,DNAEval-qwen3_32b-seed42,0.6777777777777777,DNAEval
9ce851589f3754772f0eec4d8d8c59ca,same_obvious,DNAEval-qwen3_32b-seed42,0.9777777777777777,DNAEval
37b5d6ab4f988e5e7812ee0b93e75afe,same_obvious,DNAEval-qwen3_32b-seed42,0.7833333333333333,DNAEval
cc4e7ec102c99c881e50fcc5b27e759a,same_obvious,DNAEval-qwen3_32b-seed42,1.0,DNAEval
18b6cc22c4d999aa96389298ee04125c,same_obvious,DNAEval-qwen3_32b-seed42,0.6333333333333332,DNAEval
a3ed03409978affc4cbfe0e08eaeae87,same_obvious,DNAEval-qwen3_32b-seed42,0.8500000000000001,DNAEval
5398d26c6787a05f7a204a36de500f50,same_obvious,DNAEval-qwen3_32b-seed42,0.2222222222222222,DNAEval
4d1fb838df1225a71be7243ef9b5e357,same_obvious,DNAEval-qwen3_32b-seed42,0.7333333333333333,DNAEval
094bd6c5a70e255659a07e0f79390493,same_obvious,DNAEval-qwen3_32b-seed42,0.9555555555555555,DNAEval
2881dac4923d649ad0dfec243094dc99,same_obvious,DNAEval-qwen3_32b-seed42,0.21666666666666667,DNAEval
ce8d55ca3791982c86f047cdc8a3a418,same_obvious,Autometrics_Regression_accepted,0.32057900372497894,Autometrics
8efea49e516203f968b41641ef3f12b8,same_obvious,Autometrics_Regression_accepted,0.585417656419934,Autometrics
0b2b2cb9001ac4a182d934a24ba427e0,same_obvious,Autometrics_Regression_accepted,0.7011839531258746,Autometrics
505f542de6f8c4bb3f46a30ec06684ca,same_obvious,Autometrics_Regression_accepted,0.09736671827347121,Autometrics
86215affc10344b259275703ffc9f19a,same_obvious,Autometrics_Regression_accepted,0.21971769481919845,Autometrics
3054a3712d3fdab017e24c58d79afb26,same_obvious,Autometrics_Regression_accepted,0.8547597129917954,Autometrics
75ba6dc6f24e2cafff75919543cd3efa,same_obvious,Autometrics_Regression_accepted,0.2599871766579568,Autometrics
c4cf2c49370bb6a971d045d6dfebe21a,same_obvious,Autometrics_Regression_accepted,0.6475594122243785,Autometrics
df66cc7932afe4e89577bd654025c532,same_obvious,Autometrics_Regression_accepted,0.9140692224498965,Autometrics
21290b829e96cfbf062e633a5138ac1f,same_obvious,Autometrics_Regression_accepted,0.9788599325094557,Autometrics
aa0dcb69d0f676846f1bc3cc576ac5e6,same_obvious,Autometrics_Regression_accepted,0.5646846163914518,Autometrics
a226cd75f8c3339a6261a50985156dc9,same_obvious,Autometrics_Regression_accepted,0.606170078725939,Autometrics
b5b444c236ab45234263c585cc011e8c,same_obvious,Autometrics_Regression_accepted,0.48291410638817167,Autometrics
389665f0c600eca76500129c010a5be2,same_obvious,Autometrics_Regression_accepted,0.5234733605749294,Autometrics
df4ae4215fa3fb628eef337ece89a217,same_obvious,Autometrics_Regression_accepted,0.24504228115901838,Autometrics
82edc54570a07226ee90eda45a3553ca,same_obvious,Autometrics_Regression_accepted,0.22999631803849285,Autometrics
1492b4e463432d1d7d858712e6c79812,same_obvious,Autometrics_Regression_accepted,0.9669318410579324,Autometrics
9d1b51febde3b2664cee7f4125c5cf2d,same_obvious,Autometrics_Regression_accepted,0.6555257061241262,Autometrics
ffdd45dd795d4804ae9c6651b63e25a4,same_obvious,Autometrics_Regression_accepted,0.30419046202987543,Autometrics
26c487fcf9ee09797498df54775d91e5,same_obvious,Autometrics_Regression_accepted,0.7989152108822823,Autometrics
d6090b5d8fb0ec630386b0d6f21796bf,same_obvious,Autometrics_Regression_accepted,0.5006699261059379,Autometrics
9ce851589f3754772f0eec4d8d8c59ca,same_obvious,Autometrics_Regression_accepted,0.6961444594096637,Autometrics
37b5d6ab4f988e5e7812ee0b93e75afe,same_obvious,Autometrics_Regression_accepted,0.5322315631184997,Autometrics
cc4e7ec102c99c881e50fcc5b27e759a,same_obvious,Autometrics_Regression_accepted,0.8698786690023865,Autometrics
18b6cc22c4d999aa96389298ee04125c,same_obvious,Autometrics_Regression_accepted,0.46233369883203645,Autometrics
a3ed03409978affc4cbfe0e08eaeae87,same_obvious,Autometrics_Regression_accepted,0.9298059039006651,Autometrics
5398d26c6787a05f7a204a36de500f50,same_obvious,Autometrics_Regression_accepted,0.8332245416620772,Autometrics
4d1fb838df1225a71be7243ef9b5e357,same_obvious,Autometrics_Regression_accepted,0.7461192695276482,Autometrics
094bd6c5a70e255659a07e0f79390493,same_obvious,Autometrics_Regression_accepted,0.9895712995173246,Autometrics
2881dac4923d649ad0dfec243094dc99,same_obvious,Autometrics_Regression_accepted,0.23586258268613913,Autometrics
ce8d55ca3791982c86f047cdc8a3a418,same_obvious,metametrics_score,0.2194699902453725,MetaMetrics
8efea49e516203f968b41641ef3f12b8,same_obvious,metametrics_score,0.16157592238003704,MetaMetrics
0b2b2cb9001ac4a182d934a24ba427e0,same_obvious,metametrics_score,0.9147376831180277,MetaMetrics
505f542de6f8c4bb3f46a30ec06684ca,same_obvious,metametrics_score,0.9001089219791701,MetaMetrics
86215affc10344b259275703ffc9f19a,same_obvious,metametrics_score,0.7740770589482806,MetaMetrics
3054a3712d3fdab017e24c58d79afb26,same_obvious,metametrics_score,0.9526409564746592,MetaMetrics
75ba6dc6f24e2cafff75919543cd3efa,same_obvious,metametrics_score,0.5517952937732502,MetaMetrics
c4cf2c49370bb6a971d045d6dfebe21a,same_obvious,metametrics_score,0.9491333649511015,MetaMetrics
df66cc7932afe4e89577bd654025c532,same_obvious,metametrics_score,0.6864294224393136,MetaMetrics
21290b829e96cfbf062e633a5138ac1f,same_obvious,metametrics_score,0.9497151423793658,MetaMetrics
aa0dcb69d0f676846f1bc3cc576ac5e6,same_obvious,metametrics_score,0.3959788342991819,MetaMetrics
a226cd75f8c3339a6261a50985156dc9,same_obvious,metametrics_score,0.37000579193840555,MetaMetrics
b5b444c236ab45234263c585cc011e8c,same_obvious,metametrics_score,0.4925522417149626,MetaMetrics
389665f0c600eca76500129c010a5be2,same_obvious,metametrics_score,0.6949649057967249,MetaMetrics
df4ae4215fa3fb628eef337ece89a217,same_obvious,metametrics_score,0.47623888982424334,MetaMetrics
82edc54570a07226ee90eda45a3553ca,same_obvious,metametrics_score,0.7758483656934376,MetaMetrics
1492b4e463432d1d7d858712e6c79812,same_obvious,metametrics_score,0.9492334888737275,MetaMetrics
9d1b51febde3b2664cee7f4125c5cf2d,same_obvious,metametrics_score,0.4504321894320562,MetaMetrics
ffdd45dd795d4804ae9c6651b63e25a4,same_obvious,metametrics_score,0.7598348634087121,MetaMetrics
26c487fcf9ee09797498df54775d91e5,same_obvious,metametrics_score,0.9405192034288307,MetaMetrics
d6090b5d8fb0ec630386b0d6f21796bf,same_obvious,metametrics_score,0.9269577114849299,MetaMetrics
9ce851589f3754772f0eec4d8d8c59ca,same_obvious,metametrics_score,0.7045713110075819,MetaMetrics
37b5d6ab4f988e5e7812ee0b93e75afe,same_obvious,metametrics_score,0.8384726609475043,MetaMetrics
cc4e7ec102c99c881e50fcc5b27e759a,same_obvious,metametrics_score,0.6598349925880976,MetaMetrics
18b6cc22c4d999aa96389298ee04125c,same_obvious,metametrics_score,0.9387902239674076,MetaMetrics
a3ed03409978affc4cbfe0e08eaeae87,same_obvious,metametrics_score,0.9309204079231588,MetaMetrics
5398d26c6787a05f7a204a36de500f50,same_obvious,metametrics_score,0.6640659342274695,MetaMetrics
4d1fb838df1225a71be7243ef9b5e357,same_obvious,metametrics_score,0.2953654777080158,MetaMetrics
094bd6c5a70e255659a07e0f79390493,same_obvious,metametrics_score,0.8184972714517355,MetaMetrics
2881dac4923d649ad0dfec243094dc99,same_obvious,metametrics_score,0.6361997308505432,MetaMetrics
ce8d55ca3791982c86f047cdc8a3a418,same_obvious,GRMRewardModel,0.4191394658753709,BEST_METRIC
8efea49e516203f968b41641ef3f12b8,same_obvious,GRMRewardModel,0.8101353857566765,BEST_METRIC
0b2b2cb9001ac4a182d934a24ba427e0,same_obvious,GRMRewardModel,0.71629265578635,BEST_METRIC
505f542de6f8c4bb3f46a30ec06684ca,same_obvious,GRMRewardModel,0.16858308605341243,BEST_METRIC
86215affc10344b259275703ffc9f19a,same_obvious,GRMRewardModel,0.3056379821958457,BEST_METRIC
3054a3712d3fdab017e24c58d79afb26,same_obvious,GRMRewardModel,0.974406528189911,BEST_METRIC
75ba6dc6f24e2cafff75919543cd3efa,same_obvious,GRMRewardModel,0.5356083086053411,BEST_METRIC
c4cf2c49370bb6a971d045d6dfebe21a,same_obvious,GRMRewardModel,0.7422106824925816,BEST_METRIC
df66cc7932afe4e89577bd654025c532,same_obvious,GRMRewardModel,0.8890949554896143,BEST_METRIC
21290b829e96cfbf062e633a5138ac1f,same_obvious,GRMRewardModel,0.9770029673590505,BEST_METRIC
aa0dcb69d0f676846f1bc3cc576ac5e6,same_obvious,GRMRewardModel,0.7759643916913946,BEST_METRIC
a226cd75f8c3339a6261a50985156dc9,same_obvious,GRMRewardModel,0.5608308605341247,BEST_METRIC
b5b444c236ab45234263c585cc011e8c,same_obvious,GRMRewardModel,0.603486646884273,BEST_METRIC
389665f0c600eca76500129c010a5be2,same_obvious,GRMRewardModel,0.7722551928783383,BEST_METRIC
df4ae4215fa3fb628eef337ece89a217,same_obvious,GRMRewardModel,0.4553505192878339,BEST_METRIC
82edc54570a07226ee90eda45a3553ca,same_obvious,GRMRewardModel,0.2392433234421365,BEST_METRIC
1492b4e463432d1d7d858712e6c79812,same_obvious,GRMRewardModel,0.7799517804154302,BEST_METRIC
9d1b51febde3b2664cee7f4125c5cf2d,same_obvious,GRMRewardModel,0.6766042284866469,BEST_METRIC
ffdd45dd795d4804ae9c6651b63e25a4,same_obvious,GRMRewardModel,0.4151984421364985,BEST_METRIC
26c487fcf9ee09797498df54775d91e5,same_obvious,GRMRewardModel,0.8117581602373887,BEST_METRIC
d6090b5d8fb0ec630386b0d6f21796bf,same_obvious,GRMRewardModel,0.6005192878338279,BEST_METRIC
9ce851589f3754772f0eec4d8d8c59ca,same_obvious,GRMRewardModel,0.973293768545994,BEST_METRIC
37b5d6ab4f988e5e7812ee0b93e75afe,same_obvious,GRMRewardModel,0.6817507418397626,BEST_METRIC
cc4e7ec102c99c881e50fcc5b27e759a,same_obvious,GRMRewardModel,0.8870548961424333,BEST_METRIC
18b6cc22c4d999aa96389298ee04125c,same_obvious,GRMRewardModel,0.49856268545994065,BEST_METRIC
a3ed03409978affc4cbfe0e08eaeae87,same_obvious,GRMRewardModel,0.9836795252225519,BEST_METRIC
5398d26c6787a05f7a204a36de500f50,same_obvious,GRMRewardModel,0.94473293768546,BEST_METRIC
4d1fb838df1225a71be7243ef9b5e357,same_obvious,GRMRewardModel,0.9688427299703264,BEST_METRIC
094bd6c5a70e255659a07e0f79390493,same_obvious,GRMRewardModel,0.9005934718100891,BEST_METRIC
2881dac4923d649ad0dfec243094dc99,same_obvious,GRMRewardModel,0.4299888724035609,BEST_METRIC
