uid,strategy,metric,score,metric_logical
be87541879d8b12ea79e161867a9445c,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
1ba471f81f9ac7fc3ac07189e44f1384,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
a3d2de7675556553a5f08e4c88d2c228,same_obvious,LLMJudge-qwen3_32b-seed42,0.9166666666666666,LLMJudge
a3d2de7675556553a5f08e4c88d2c228,same_obvious,LLMJudge-qwen3_32b-seed42,0.9583333333333334,LLMJudge
eb6915eedae301fed322493444be9c96,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
2ebc960777fb053e311af3d795a3fde3,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
b9da6aa86067b6d3fa39d3ca25058485,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
32cfa398933760a88bc534fb0fab8f8b,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
1e9ec4e99f59e7f3a33c66024f466fa0,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
282bb21d514ff2e20a2798587a07bec2,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
b60a58b1dd1e8d1439d5a8fa46e97eb1,same_obvious,LLMJudge-qwen3_32b-seed42,0.75,LLMJudge
31779ba135934ed036644deb47eb1e54,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
beb1f228968a44d4ea347e2c5a5d2495,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
d216512df4831937d9540458a18f8541,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
321db07e5841c8f3f9626b1fac356167,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
04744fe491aa8cd58dbe92d5afdcb120,same_obvious,LLMJudge-qwen3_32b-seed42,0.875,LLMJudge
3b15d01774ca62983e5985d80f64ee71,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
a3d2de7675556553a5f08e4c88d2c228,same_obvious,LLMJudge-qwen3_32b-seed42,0.8333333333333334,LLMJudge
ca54dfebdb5e70386ad964ce57ebe769,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
4da4cbef228eaac0d9614b73a802ca4f,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
1ba471f81f9ac7fc3ac07189e44f1384,same_obvious,LLMJudge-qwen3_32b-seed42,1.0,LLMJudge
be87541879d8b12ea79e161867a9445c,same_obvious,DNAEval-qwen3_32b-seed42,0.9611111111111111,DNAEval
1ba471f81f9ac7fc3ac07189e44f1384,same_obvious,DNAEval-qwen3_32b-seed42,0.9999999999999999,DNAEval
a3d2de7675556553a5f08e4c88d2c228,same_obvious,DNAEval-qwen3_32b-seed42,0.9916666666666667,DNAEval
a3d2de7675556553a5f08e4c88d2c228,same_obvious,DNAEval-qwen3_32b-seed42,0.8888888888888888,DNAEval
eb6915eedae301fed322493444be9c96,same_obvious,DNAEval-qwen3_32b-seed42,0.9111111111111113,DNAEval
2ebc960777fb053e311af3d795a3fde3,same_obvious,DNAEval-qwen3_32b-seed42,0.9722222222222223,DNAEval
b9da6aa86067b6d3fa39d3ca25058485,same_obvious,DNAEval-qwen3_32b-seed42,0.9944444444444444,DNAEval
32cfa398933760a88bc534fb0fab8f8b,same_obvious,DNAEval-qwen3_32b-seed42,0.9833333333333334,DNAEval
1e9ec4e99f59e7f3a33c66024f466fa0,same_obvious,DNAEval-qwen3_32b-seed42,0.8333333333333334,DNAEval
282bb21d514ff2e20a2798587a07bec2,same_obvious,DNAEval-qwen3_32b-seed42,0.8333333333333335,DNAEval
b60a58b1dd1e8d1439d5a8fa46e97eb1,same_obvious,DNAEval-qwen3_32b-seed42,0.6111111111111112,DNAEval
31779ba135934ed036644deb47eb1e54,same_obvious,DNAEval-qwen3_32b-seed42,0.9972222222222222,DNAEval
beb1f228968a44d4ea347e2c5a5d2495,same_obvious,DNAEval-qwen3_32b-seed42,1.0,DNAEval
d216512df4831937d9540458a18f8541,same_obvious,DNAEval-qwen3_32b-seed42,0.7333333333333333,DNAEval
321db07e5841c8f3f9626b1fac356167,same_obvious,DNAEval-qwen3_32b-seed42,1.0,DNAEval
04744fe491aa8cd58dbe92d5afdcb120,same_obvious,DNAEval-qwen3_32b-seed42,0.9944444444444445,DNAEval
3b15d01774ca62983e5985d80f64ee71,same_obvious,DNAEval-qwen3_32b-seed42,0.9666666666666667,DNAEval
a3d2de7675556553a5f08e4c88d2c228,same_obvious,DNAEval-qwen3_32b-seed42,0.8944444444444444,DNAEval
ca54dfebdb5e70386ad964ce57ebe769,same_obvious,DNAEval-qwen3_32b-seed42,0.8055555555555555,DNAEval
4da4cbef228eaac0d9614b73a802ca4f,same_obvious,DNAEval-qwen3_32b-seed42,0.8444444444444444,DNAEval
1ba471f81f9ac7fc3ac07189e44f1384,same_obvious,DNAEval-qwen3_32b-seed42,0.9638888888888889,DNAEval
be87541879d8b12ea79e161867a9445c,same_obvious,Autometrics_Regression_outcomeRating,0.9345264144107192,Autometrics
1ba471f81f9ac7fc3ac07189e44f1384,same_obvious,Autometrics_Regression_outcomeRating,0.9664426993192525,Autometrics
a3d2de7675556553a5f08e4c88d2c228,same_obvious,Autometrics_Regression_outcomeRating,0.9963887578928732,Autometrics
a3d2de7675556553a5f08e4c88d2c228,same_obvious,Autometrics_Regression_outcomeRating,0.7164043669630547,Autometrics
eb6915eedae301fed322493444be9c96,same_obvious,Autometrics_Regression_outcomeRating,0.991715867964073,Autometrics
2ebc960777fb053e311af3d795a3fde3,same_obvious,Autometrics_Regression_outcomeRating,0.6605211736479165,Autometrics
b9da6aa86067b6d3fa39d3ca25058485,same_obvious,Autometrics_Regression_outcomeRating,0.9863387022349601,Autometrics
32cfa398933760a88bc534fb0fab8f8b,same_obvious,Autometrics_Regression_outcomeRating,0.9950091011626622,Autometrics
1e9ec4e99f59e7f3a33c66024f466fa0,same_obvious,Autometrics_Regression_outcomeRating,0.8295506036070446,Autometrics
282bb21d514ff2e20a2798587a07bec2,same_obvious,Autometrics_Regression_outcomeRating,0.924158349844268,Autometrics
b60a58b1dd1e8d1439d5a8fa46e97eb1,same_obvious,Autometrics_Regression_outcomeRating,0.5705127275639958,Autometrics
31779ba135934ed036644deb47eb1e54,same_obvious,Autometrics_Regression_outcomeRating,0.7446469558395753,Autometrics
beb1f228968a44d4ea347e2c5a5d2495,same_obvious,Autometrics_Regression_outcomeRating,0.8584690467088599,Autometrics
d216512df4831937d9540458a18f8541,same_obvious,Autometrics_Regression_outcomeRating,0.6987739817741313,Autometrics
321db07e5841c8f3f9626b1fac356167,same_obvious,Autometrics_Regression_outcomeRating,0.9950091011626621,Autometrics
04744fe491aa8cd58dbe92d5afdcb120,same_obvious,Autometrics_Regression_outcomeRating,0.8584690467088599,Autometrics
3b15d01774ca62983e5985d80f64ee71,same_obvious,Autometrics_Regression_outcomeRating,0.9295355155733812,Autometrics
a3d2de7675556553a5f08e4c88d2c228,same_obvious,Autometrics_Regression_outcomeRating,0.9035925767735123,Autometrics
ca54dfebdb5e70386ad964ce57ebe769,same_obvious,Autometrics_Regression_outcomeRating,0.9913296010722976,Autometrics
4da4cbef228eaac0d9614b73a802ca4f,same_obvious,Autometrics_Regression_outcomeRating,0.9913296010722977,Autometrics
1ba471f81f9ac7fc3ac07189e44f1384,same_obvious,Autometrics_Regression_outcomeRating,0.7782042565203228,Autometrics
be87541879d8b12ea79e161867a9445c,same_obvious,metametrics_score,0.8347191748428029,MetaMetrics
1ba471f81f9ac7fc3ac07189e44f1384,same_obvious,metametrics_score,0.9432318881417651,MetaMetrics
a3d2de7675556553a5f08e4c88d2c228,same_obvious,metametrics_score,0.8085170274313063,MetaMetrics
a3d2de7675556553a5f08e4c88d2c228,same_obvious,metametrics_score,0.6817192250406996,MetaMetrics
eb6915eedae301fed322493444be9c96,same_obvious,metametrics_score,0.41821243846146805,MetaMetrics
2ebc960777fb053e311af3d795a3fde3,same_obvious,metametrics_score,0.750974039668527,MetaMetrics
b9da6aa86067b6d3fa39d3ca25058485,same_obvious,metametrics_score,0.9872484512024322,MetaMetrics
32cfa398933760a88bc534fb0fab8f8b,same_obvious,metametrics_score,0.9664684785274087,MetaMetrics
1e9ec4e99f59e7f3a33c66024f466fa0,same_obvious,metametrics_score,0.9982710753784769,MetaMetrics
282bb21d514ff2e20a2798587a07bec2,same_obvious,metametrics_score,0.92859283427429,MetaMetrics
b60a58b1dd1e8d1439d5a8fa46e97eb1,same_obvious,metametrics_score,0.9676864110123601,MetaMetrics
31779ba135934ed036644deb47eb1e54,same_obvious,metametrics_score,0.8068455497319224,MetaMetrics
beb1f228968a44d4ea347e2c5a5d2495,same_obvious,metametrics_score,0.40998290224352507,MetaMetrics
d216512df4831937d9540458a18f8541,same_obvious,metametrics_score,0.9762130289683562,MetaMetrics
321db07e5841c8f3f9626b1fac356167,same_obvious,metametrics_score,0.7589606052358812,MetaMetrics
04744fe491aa8cd58dbe92d5afdcb120,same_obvious,metametrics_score,0.7012940632888025,MetaMetrics
3b15d01774ca62983e5985d80f64ee71,same_obvious,metametrics_score,0.8337658638584924,MetaMetrics
a3d2de7675556553a5f08e4c88d2c228,same_obvious,metametrics_score,0.702478284598339,MetaMetrics
ca54dfebdb5e70386ad964ce57ebe769,same_obvious,metametrics_score,0.8399033826806992,MetaMetrics
4da4cbef228eaac0d9614b73a802ca4f,same_obvious,metametrics_score,0.6812245913241675,MetaMetrics
1ba471f81f9ac7fc3ac07189e44f1384,same_obvious,metametrics_score,0.9653006477132453,MetaMetrics
be87541879d8b12ea79e161867a9445c,same_obvious,INFORMRewardModel,0.9432098765432099,BEST_METRIC
1ba471f81f9ac7fc3ac07189e44f1384,same_obvious,INFORMRewardModel,0.865432098765432,BEST_METRIC
a3d2de7675556553a5f08e4c88d2c228,same_obvious,INFORMRewardModel,0.9858539094650206,BEST_METRIC
a3d2de7675556553a5f08e4c88d2c228,same_obvious,INFORMRewardModel,0.9397408693415639,BEST_METRIC
eb6915eedae301fed322493444be9c96,same_obvious,INFORMRewardModel,0.8296296296296297,BEST_METRIC
2ebc960777fb053e311af3d795a3fde3,same_obvious,INFORMRewardModel,0.95679012345679,BEST_METRIC
b9da6aa86067b6d3fa39d3ca25058485,same_obvious,INFORMRewardModel,0.9925925925925926,BEST_METRIC
32cfa398933760a88bc534fb0fab8f8b,same_obvious,INFORMRewardModel,0.917283950617284,BEST_METRIC
1e9ec4e99f59e7f3a33c66024f466fa0,same_obvious,INFORMRewardModel,0.8499614197530865,BEST_METRIC
282bb21d514ff2e20a2798587a07bec2,same_obvious,INFORMRewardModel,0.9851851851851852,BEST_METRIC
b60a58b1dd1e8d1439d5a8fa46e97eb1,same_obvious,INFORMRewardModel,0.9790123456790124,BEST_METRIC
31779ba135934ed036644deb47eb1e54,same_obvious,INFORMRewardModel,0.9274691358024691,BEST_METRIC
beb1f228968a44d4ea347e2c5a5d2495,same_obvious,INFORMRewardModel,0.9234567901234568,BEST_METRIC
d216512df4831937d9540458a18f8541,same_obvious,INFORMRewardModel,0.8098765432098766,BEST_METRIC
321db07e5841c8f3f9626b1fac356167,same_obvious,INFORMRewardModel,0.8839506172839505,BEST_METRIC
04744fe491aa8cd58dbe92d5afdcb120,same_obvious,INFORMRewardModel,0.9876543209876543,BEST_METRIC
3b15d01774ca62983e5985d80f64ee71,same_obvious,INFORMRewardModel,0.9703703703703703,BEST_METRIC
a3d2de7675556553a5f08e4c88d2c228,same_obvious,INFORMRewardModel,0.9870884773662552,BEST_METRIC
ca54dfebdb5e70386ad964ce57ebe769,same_obvious,INFORMRewardModel,0.8302469135802468,BEST_METRIC
4da4cbef228eaac0d9614b73a802ca4f,same_obvious,INFORMRewardModel,0.9382716049382717,BEST_METRIC
1ba471f81f9ac7fc3ac07189e44f1384,same_obvious,INFORMRewardModel,0.9987654320987654,BEST_METRIC
