Model,SOP,errors,structural_alignment_avg,property_fidelity_avg,semantic_fidelity_avg,code_bonus_avg,code_compliance_avg,norm_counterexample_traces_avg,norm_minor_issues_avg,execution_compliance_avg,Final_score_avg,structural_alignment_std,property_fidelity_std,semantic_fidelity_std,code_bonus_std,code_compliance_std,norm_counterexample_traces_std,norm_minor_issues_std,execution_compliance_std,Final_score_std
Baseline,eval_model_01,No,4.84,5.17,5.66,0.19,0.45,0.0,0.16,0.97,0.76,0.23,0.71,0.47,0.44,0.1,0.0,0.23,0.05,0.01
Claude,eval_model_01,No,5.66,6.5,6.0,0.25,0.53,0.67,0.58,0.34,0.42,0.47,0.24,0.47,0.35,0.08,0.0,0.59,0.12,0.1
DeepSeek,eval_model_01,No,4.33,4.33,5.0,0.03,0.37,0.58,0.42,0.45,0.42,1.41,1.41,0.47,0.04,0.07,0.12,0.35,0.17,0.07
Gemini,eval_model_01,No,4.83,5.66,5.0,-0.23,0.36,0.5,0.0,0.6,0.51,0.71,0.47,0.0,0.21,0.04,0.0,0.0,0.0,0.01
Qwen,eval_model_01,No,5.16,5.16,5.33,0.2,0.45,1.0,0.84,0.04,0.2,0.23,0.23,0.0,0.19,0.04,0.0,0.23,0.05,0.04
Baseline,eval_model_02,No,5.5,5.5,6.5,0.64,0.59,0.0,0.5,0.9,0.78,0.71,0.71,0.71,0.15,0.08,0.0,0.71,0.14,0.05
Claude,eval_model_02,No,7.0,7.84,7.16,0.36,0.66,1.0,0.34,0.14,0.34,0.95,0.23,1.18,0.26,0.01,0.0,0.47,0.09,0.05
DeepSeek,eval_model_02,No,4.33,4.83,5.84,0.04,0.41,0.25,0.66,0.66,0.56,0.0,0.71,0.23,0.06,0.0,0.0,0.47,0.09,0.06
Gemini,eval_model_02,No,6.34,7.16,7.0,0.12,0.57,0.12,0.5,0.8,0.71,0.94,1.18,0.47,0.18,0.1,0.0,0.71,0.14,0.13
Qwen,eval_model_02,No,5.0,5.16,4.67,0.66,0.53,0.38,1.0,0.5,0.52,0.47,0.23,0.0,0.12,0.01,0.0,0.0,0.0,0.01
Baseline,eval_model_03,No,4.0,4.5,3.5,0.28,0.38,0.0,0.5,0.9,0.69,1.41,1.17,0.71,0.15,0.06,0.0,0.35,0.07,0.07
Claude,eval_model_03,No,6.83,8.34,8.0,0.73,0.76,1.0,1.0,0.0,0.3,0.71,0.94,0.0,0.03,0.04,0.0,0.0,0.0,0.02
DeepSeek,eval_model_03,No,5.16,6.16,5.5,-0.06,0.44,0.36,0.38,0.64,0.56,1.18,0.23,0.24,0.05,0.04,0.04,0.18,0.06,0.02
Gemini,eval_model_03,Yes,8.84,9.34,8.34,0.22,0.75,0.0,0.38,0.0,0.3,0.23,0.47,0.47,0.19,0.03,0.0,0.53,0.0,0.01
Qwen,eval_model_03,Yes,6.66,7.17,5.66,0.34,0.58,0.0,0.12,0.0,0.24,0.94,0.71,0.47,0.06,0.05,0.0,0.18,0.0,0.02
Baseline,eval_model_04,No,6.5,7.5,6.5,0.25,0.6,0.0,0.16,0.97,0.82,1.17,1.17,1.17,0.11,0.12,0.0,0.23,0.05,0.02
Claude,eval_model_04,No,8.16,8.67,7.34,0.24,0.7,0.0,0.6,0.88,0.81,1.18,0.0,0.94,0.04,0.06,0.0,0.57,0.11,0.04
DeepSeek,eval_model_04,No,6.34,6.84,6.16,-0.09,0.5,0.0,0.44,0.92,0.75,0.47,1.65,1.18,0.04,0.07,0.0,0.33,0.06,0.01
Gemini,eval_model_04,No,4.66,6.84,5.66,0.52,0.56,0.0,0.84,0.84,0.72,0.47,0.23,0.47,0.21,0.01,0.0,0.23,0.05,0.02
Qwen,eval_model_04,No,5.5,6.16,6.0,-0.06,0.45,1.0,0.2,0.16,0.28,1.17,0.23,0.95,0.02,0.06,0.0,0.28,0.06,0.06
Baseline,eval_model_05,No,3.84,5.34,5.17,0.35,0.45,0.0,0.69,0.86,0.7,0.23,0.47,0.71,0.33,0.1,0.0,0.03,0.01,0.04
Claude,eval_model_05,No,5.5,6.0,6.67,0.64,0.62,1.0,1.0,0.0,0.24,0.71,0.47,0.0,0.26,0.02,0.0,0.0,0.0,0.01
DeepSeek,eval_model_05,No,3.5,3.84,3.5,-0.12,0.26,0.17,0.34,0.8,0.58,0.71,1.18,0.71,0.18,0.04,0.0,0.47,0.09,0.04
Gemini,eval_model_05,Yes,4.66,7.16,5.5,0.62,0.59,0.0,0.31,0.0,0.24,0.47,0.23,0.71,0.29,0.07,0.0,0.03,0.0,0.03
Qwen,eval_model_05,Yes,4.66,5.5,5.16,0.11,0.42,0.0,0.07,0.0,0.17,0.47,0.24,0.23,0.32,0.06,0.0,0.1,0.0,0.03
Baseline,eval_model_06,No,5.83,8.17,5.0,0.14,0.54,0.0,0.3,0.94,0.78,0.71,0.71,0.47,0.15,0.06,0.0,0.42,0.08,0.03
Claude,eval_model_06,No,6.33,7.5,6.67,0.22,0.59,1.0,1.0,0.0,0.24,1.41,0.24,0.0,0.36,0.1,0.0,0.0,0.0,0.04
DeepSeek,eval_model_06,No,6.34,8.34,8.16,0.34,0.68,0.11,0.7,0.77,0.74,0.47,0.47,1.18,0.07,0.05,0.01,0.42,0.08,0.06
Gemini,eval_model_06,No,8.0,9.84,8.5,0.14,0.73,0.0,0.16,0.97,0.87,0.47,0.23,0.24,0.54,0.1,0.0,0.23,0.05,0.01
Qwen,eval_model_06,No,6.0,8.0,6.33,0.25,0.59,0.0,0.0,1.0,0.84,0.0,1.41,1.41,0.41,0.16,0.0,0.0,0.0,0.06
Baseline,eval_model_07,No,8.5,7.84,7.0,0.19,0.66,0.0,0.75,0.85,0.77,0.71,0.23,0.95,0.27,0.09,0.0,0.35,0.07,0.08
Claude,eval_model_07,No,9.66,9.5,9.34,0.02,0.76,0.0,0.25,0.95,0.88,0.47,0.71,0.94,0.39,0.13,0.0,0.35,0.07,0.09
DeepSeek,eval_model_07,No,8.5,9.16,7.16,0.03,0.66,0.0,0.0,1.0,0.86,1.17,1.18,1.18,0.21,0.13,0.0,0.0,0.0,0.05
Gemini,eval_model_07,No,8.0,9.84,6.5,0.38,0.73,0.0,0.75,0.85,0.8,0.47,0.23,1.17,0.18,0.08,0.0,0.35,0.07,0.08
Qwen,eval_model_07,No,9.66,10.0,9.5,0.06,0.79,1.0,0.5,0.1,0.38,0.47,0.0,0.71,0.08,0.06,0.0,0.0,0.0,0.02
Baseline,eval_model_08,No,5.84,6.17,5.0,0.34,0.52,0.0,0.5,0.9,0.75,1.18,0.71,0.0,0.11,0.03,0.0,0.71,0.14,0.1
Claude,eval_model_08,No,6.17,8.5,8.0,-0.03,0.6,0.13,0.62,0.77,0.7,2.12,0.24,0.47,0.07,0.06,0.01,0.18,0.04,0.0
DeepSeek,eval_model_08,No,7.16,8.83,6.5,0.04,0.6,0.66,1.0,0.26,0.4,0.23,0.71,0.71,0.12,0.02,0.06,0.0,0.05,0.02
Gemini,eval_model_08,No,9.84,10.0,9.5,0.53,0.89,1.0,0.62,0.08,0.4,0.23,0.0,0.71,0.31,0.08,0.0,0.53,0.11,0.1
Qwen,eval_model_08,No,8.66,9.0,8.83,0.44,0.79,0.13,0.12,0.87,0.84,0.47,0.47,0.71,0.35,0.11,0.01,0.18,0.03,0.06
Baseline,eval_model_09,No,5.0,5.67,4.67,0.46,0.5,0.0,0.8,0.84,0.7,0.0,0.0,0.0,0.22,0.04,0.0,0.28,0.06,0.05
Claude,eval_model_09,No,6.5,8.84,6.84,0.27,0.64,1.0,0.72,0.06,0.29,0.71,0.23,0.23,0.16,0.01,0.0,0.16,0.04,0.01
DeepSeek,eval_model_09,No,6.16,7.16,6.5,0.14,0.56,0.22,0.84,0.66,0.62,0.23,0.23,0.24,0.07,0.01,0.0,0.23,0.05,0.02
Gemini,eval_model_09,Yes,8.5,9.34,7.66,0.4,0.76,0.0,0.0,0.0,0.3,0.71,0.47,0.94,0.07,0.01,0.0,0.0,0.0,0.01
Qwen,eval_model_09,No,6.5,7.16,6.84,-0.08,0.53,0.33,0.64,0.61,0.57,0.71,0.23,0.23,0.09,0.04,0.0,0.05,0.01,0.01
Baseline,eval_model_10,No,5.66,6.84,5.5,0.38,0.55,0.0,0.5,0.9,0.76,0.47,0.23,0.24,0.18,0.02,0.0,0.71,0.14,0.08
Claude,eval_model_10,No,6.66,8.16,6.84,0.1,0.6,0.75,0.14,0.38,0.46,0.94,0.23,1.18,0.1,0.08,0.0,0.19,0.04,0.05
DeepSeek,eval_model_10,No,5.0,5.16,5.34,0.2,0.45,0.5,0.5,0.5,0.48,0.47,0.23,0.47,0.08,0.01,0.0,0.71,0.14,0.08
Gemini,eval_model_10,Yes,7.34,8.84,8.17,0.52,0.75,0.0,0.06,0.0,0.3,0.47,0.23,0.71,0.21,0.04,0.0,0.08,0.0,0.01
Qwen,eval_model_10,No,6.67,8.0,7.17,-0.14,0.55,1.0,0.2,0.16,0.32,0.0,0.95,0.71,0.15,0.04,0.0,0.19,0.04,0.04
