Model,SOP,structural_alignment,property_fidelity,semantic_fidelity,code_bonus,code_compliance,norm_counterexample_traces,norm_minor_issues,errors,execution_compliance,Final_score
Baseline,eval_model_01,5.0,5.67,5.33,0.5,0.52,0.0,0.33,No,0.93,0.77
Claude,eval_model_01,5.33,6.33,6.33,0.0,0.48,0.67,1.0,No,0.26,0.35
DeepSeek,eval_model_01,5.33,5.33,5.33,0.0,0.42,0.67,0.67,No,0.33,0.37
Gemini,eval_model_01,4.33,6.0,5.0,-0.08,0.39,0.5,0.0,No,0.6,0.52
Qwen,eval_model_01,5.0,5.33,5.33,0.33,0.48,1.0,0.67,No,0.07,0.23
Baseline,eval_model_02,5.0,5.0,6.0,0.54,0.53,0.0,0.0,No,1.0,0.81
Claude,eval_model_02,7.67,8.0,8.0,0.17,0.67,1.0,0.67,No,0.07,0.31
DeepSeek,eval_model_02,4.33,5.33,5.67,0.0,0.41,0.25,0.33,No,0.73,0.6
Gemini,eval_model_02,7.0,8.0,7.33,0.25,0.64,0.12,0.0,No,0.9,0.8
Qwen,eval_model_02,4.67,5.33,4.67,0.75,0.54,0.38,1.0,No,0.5,0.52
Baseline,eval_model_03,5.0,5.33,4.0,0.17,0.42,0.0,0.25,No,0.95,0.74
Claude,eval_model_03,7.33,9.0,8.0,0.71,0.79,1.0,1.0,No,0.0,0.32
DeepSeek,eval_model_03,4.33,6.33,5.33,-0.09,0.41,0.33,0.25,No,0.69,0.58
Gemini,eval_model_03,9.0,9.0,8.67,0.08,0.73,0.0,0.0,Yes,0.0,0.29
Qwen,eval_model_03,6.0,6.67,5.33,0.38,0.55,0.0,0.25,Yes,0.0,0.22
Baseline,eval_model_04,5.67,6.67,5.67,0.17,0.51,0.0,0.0,No,1.0,0.8
Claude,eval_model_04,7.33,8.67,6.67,0.21,0.65,0.0,0.2,No,0.96,0.84
DeepSeek,eval_model_04,6.67,5.67,5.33,-0.12,0.45,0.0,0.2,No,0.96,0.76
Gemini,eval_model_04,4.33,6.67,5.33,0.67,0.57,0.0,1.0,No,0.8,0.71
Qwen,eval_model_04,4.67,6.0,5.33,-0.05,0.41,1.0,0.4,No,0.12,0.24
Baseline,eval_model_05,4.0,5.67,5.67,0.58,0.52,0.0,0.67,No,0.87,0.73
Claude,eval_model_05,5.0,5.67,6.67,0.83,0.63,1.0,1.0,No,0.0,0.25
DeepSeek,eval_model_05,4.0,4.67,4.0,-0.25,0.29,0.17,0.67,No,0.73,0.55
Gemini,eval_model_05,5.0,7.0,5.0,0.42,0.54,0.0,0.33,Yes,0.0,0.22
Qwen,eval_model_05,5.0,5.33,5.0,0.33,0.47,0.0,0.0,Yes,0.0,0.19
Baseline,eval_model_06,5.33,7.67,5.33,0.04,0.5,0.0,0.0,No,1.0,0.8
Claude,eval_model_06,5.33,7.67,6.67,-0.04,0.52,1.0,1.0,No,0.0,0.21
DeepSeek,eval_model_06,6.0,8.67,7.33,0.29,0.64,0.1,1.0,No,0.72,0.69
Gemini,eval_model_06,7.67,9.67,8.67,0.52,0.8,0.0,0.33,No,0.93,0.88
Qwen,eval_model_06,6.0,7.0,5.33,-0.04,0.48,0.0,0.0,No,1.0,0.79
Baseline,eval_model_07,9.0,7.67,7.67,0.38,0.72,0.0,0.5,No,0.9,0.83
Claude,eval_model_07,9.33,9.0,8.67,-0.25,0.67,0.0,0.5,No,0.9,0.81
DeepSeek,eval_model_07,7.67,8.33,6.33,-0.12,0.57,0.0,0.0,No,1.0,0.83
Gemini,eval_model_07,7.67,9.67,5.67,0.25,0.67,0.0,1.0,No,0.8,0.75
Qwen,eval_model_07,10.0,10.0,10.0,0.12,0.83,1.0,0.5,No,0.1,0.39
Baseline,eval_model_08,6.67,6.67,5.0,0.26,0.54,0.0,0.0,No,1.0,0.82
Claude,eval_model_08,4.67,8.33,7.67,0.02,0.56,0.12,0.5,No,0.8,0.7
DeepSeek,eval_model_08,7.0,8.33,6.0,0.13,0.59,0.62,1.0,No,0.3,0.42
Gemini,eval_model_08,10.0,10.0,10.0,0.75,0.95,1.0,0.25,No,0.15,0.47
Qwen,eval_model_08,8.33,8.67,8.33,0.19,0.71,0.12,0.25,No,0.85,0.79
Baseline,eval_model_09,5.0,5.67,4.67,0.31,0.47,0.0,1.0,No,0.8,0.67
Claude,eval_model_09,7.0,9.0,7.0,0.16,0.65,1.0,0.83,No,0.03,0.28
DeepSeek,eval_model_09,6.33,7.0,6.33,0.09,0.55,0.22,0.67,No,0.69,0.63
Gemini,eval_model_09,9.0,9.0,8.33,0.35,0.77,0.0,0.0,Yes,0.0,0.31
Qwen,eval_model_09,7.0,7.33,7.0,-0.02,0.56,0.33,0.67,No,0.6,0.58
Baseline,eval_model_10,6.0,6.67,5.67,0.25,0.54,0.0,0.0,No,1.0,0.82
Claude,eval_model_10,6.0,8.0,6.0,0.03,0.54,0.75,0.27,No,0.35,0.43
DeepSeek,eval_model_10,5.33,5.33,5.67,0.14,0.46,0.5,1.0,No,0.4,0.42
Gemini,eval_model_10,7.67,9.0,7.67,0.38,0.72,0.0,0.0,Yes,0.0,0.29
Qwen,eval_model_10,6.67,7.33,7.67,-0.25,0.53,1.0,0.33,No,0.13,0.29
