Model,SOP,structural_alignment,property_fidelity,semantic_fidelity,code_bonus,code_compliance,norm_counterexample_traces,norm_minor_issues,errors,execution_compliance,Final_score
Baseline,eval_model_01,4.67,4.67,6.0,-0.12,0.38,0.0,0.0,No,1.0,0.75
Claude,eval_model_01,6.0,6.67,5.67,0.5,0.59,0.67,0.17,No,0.43,0.49
DeepSeek,eval_model_01,3.33,3.33,4.67,0.06,0.32,0.5,0.17,No,0.57,0.47
Gemini,eval_model_01,5.33,5.33,5.0,-0.38,0.34,0.5,0.0,No,0.6,0.5
Qwen,eval_model_01,5.33,5.0,5.33,0.06,0.43,1.0,1.0,No,0.0,0.17
Baseline,eval_model_02,6.0,6.0,7.0,0.75,0.65,0.0,1.0,No,0.8,0.74
Claude,eval_model_02,6.33,7.67,6.33,0.54,0.65,1.0,0.0,No,0.2,0.38
DeepSeek,eval_model_02,4.33,4.33,6.0,0.08,0.41,0.25,1.0,No,0.6,0.52
Gemini,eval_model_02,5.67,6.33,6.67,0.0,0.5,0.12,1.0,No,0.7,0.62
Qwen,eval_model_02,5.33,5.0,4.67,0.58,0.52,0.38,1.0,No,0.5,0.51
Baseline,eval_model_03,3.0,3.67,3.0,0.38,0.33,0.0,0.75,No,0.85,0.64
Claude,eval_model_03,6.33,7.67,8.0,0.75,0.73,1.0,1.0,No,0.0,0.29
DeepSeek,eval_model_03,6.0,6.0,5.67,-0.02,0.47,0.38,0.5,No,0.6,0.55
Gemini,eval_model_03,8.67,9.67,8.0,0.35,0.77,0.0,0.75,Yes,0.0,0.31
Qwen,eval_model_03,7.33,7.67,6.0,0.29,0.62,0.0,0.0,Yes,0.0,0.25
Baseline,eval_model_04,7.33,8.33,7.33,0.33,0.68,0.0,0.33,No,0.93,0.83
Claude,eval_model_04,9.0,8.67,8.0,0.27,0.74,0.0,1.0,No,0.8,0.78
DeepSeek,eval_model_04,6.0,8.0,7.0,-0.06,0.55,0.0,0.67,No,0.87,0.74
Gemini,eval_model_04,5.0,7.0,6.0,0.38,0.55,0.0,0.67,No,0.87,0.74
Qwen,eval_model_04,6.33,6.33,6.67,-0.08,0.5,1.0,0.0,No,0.2,0.32
Baseline,eval_model_05,3.67,5.0,4.67,0.12,0.38,0.0,0.71,No,0.86,0.67
Claude,eval_model_05,6.0,6.33,6.67,0.46,0.6,1.0,1.0,No,0.0,0.24
DeepSeek,eval_model_05,3.0,3.0,3.0,0.0,0.24,0.17,0.0,No,0.86,0.61
Gemini,eval_model_05,4.33,7.33,6.0,0.83,0.64,0.0,0.29,Yes,0.0,0.26
Qwen,eval_model_05,4.33,5.67,5.33,-0.12,0.38,0.0,0.14,Yes,0.0,0.15
Baseline,eval_model_06,6.33,8.67,4.67,0.25,0.58,0.0,0.6,No,0.88,0.76
Claude,eval_model_06,7.33,7.33,6.67,0.47,0.66,1.0,1.0,No,0.0,0.26
DeepSeek,eval_model_06,6.67,8.0,9.0,0.39,0.71,0.11,0.4,No,0.83,0.78
Gemini,eval_model_06,8.33,10.0,8.33,-0.25,0.66,0.0,0.0,No,1.0,0.86
Qwen,eval_model_06,6.0,9.0,7.33,0.54,0.7,0.0,0.0,No,1.0,0.88
Baseline,eval_model_07,8.0,8.0,6.33,0.0,0.59,0.0,1.0,No,0.8,0.72
Claude,eval_model_07,10.0,10.0,10.0,0.3,0.86,0.0,0.0,No,1.0,0.94
DeepSeek,eval_model_07,9.33,10.0,8.0,0.17,0.76,0.0,0.0,No,1.0,0.9
Gemini,eval_model_07,8.33,10.0,7.33,0.5,0.79,0.0,0.5,No,0.9,0.86
Qwen,eval_model_07,9.33,10.0,9.0,0.0,0.75,1.0,0.5,No,0.1,0.36
Baseline,eval_model_08,5.0,5.67,5.0,0.42,0.5,0.0,1.0,No,0.8,0.68
Claude,eval_model_08,7.67,8.67,8.33,-0.08,0.64,0.14,0.75,No,0.74,0.7
DeepSeek,eval_model_08,7.33,9.33,7.0,-0.04,0.62,0.71,1.0,No,0.23,0.39
Gemini,eval_model_08,9.67,10.0,9.0,0.31,0.83,1.0,1.0,No,0.0,0.33
Qwen,eval_model_08,9.0,9.33,9.33,0.69,0.87,0.14,0.0,No,0.89,0.88
Baseline,eval_model_09,5.0,5.67,4.67,0.62,0.53,0.0,0.6,No,0.88,0.74
Claude,eval_model_09,6.0,8.67,6.67,0.38,0.64,1.0,0.6,No,0.08,0.3
DeepSeek,eval_model_09,6.0,7.33,6.67,0.19,0.57,0.22,1.0,No,0.62,0.6
Gemini,eval_model_09,8.0,9.67,7.0,0.45,0.75,0.0,0.0,Yes,0.0,0.3
Qwen,eval_model_09,6.0,7.0,6.67,-0.15,0.5,0.33,0.6,No,0.62,0.57
Baseline,eval_model_10,5.33,7.0,5.33,0.5,0.57,0.0,1.0,No,0.8,0.71
Claude,eval_model_10,7.33,8.33,7.67,0.17,0.66,0.75,0.0,No,0.4,0.5
DeepSeek,eval_model_10,4.67,5.0,5.0,0.25,0.44,0.5,0.0,No,0.6,0.54
Gemini,eval_model_10,7.0,8.67,8.67,0.67,0.78,0.0,0.12,Yes,0.0,0.31
Qwen,eval_model_10,6.67,8.67,6.67,-0.04,0.58,1.0,0.06,No,0.19,0.35
