Model,SOP,norm_counterexample_traces,norm_minor_issues,errors,execution_compliance
Baseline,eval_model_01,0.0,0.0,No,1.0
Claude,eval_model_01,0.67,0.17,No,0.43
DeepSeek,eval_model_01,0.5,0.17,No,0.57
Gemini,eval_model_01,0.5,0.0,No,0.6
Qwen,eval_model_01,1.0,1.0,No,0.0
Baseline,eval_model_02,0.0,1.0,No,0.8
Claude,eval_model_02,1.0,0.0,No,0.2
DeepSeek,eval_model_02,0.25,1.0,No,0.6
Gemini,eval_model_02,0.12,1.0,No,0.7
Qwen,eval_model_02,0.38,1.0,No,0.5
Baseline,eval_model_03,0.0,0.75,No,0.85
Claude,eval_model_03,1.0,1.0,No,0.0
DeepSeek,eval_model_03,0.38,0.5,No,0.6
Gemini,eval_model_03,0.0,0.75,Yes,0.0
Qwen,eval_model_03,0.0,0.0,Yes,0.0
Baseline,eval_model_04,0.0,0.33,No,0.93
Claude,eval_model_04,0.0,1.0,No,0.8
DeepSeek,eval_model_04,0.0,0.67,No,0.87
Gemini,eval_model_04,0.0,0.67,No,0.87
Qwen,eval_model_04,1.0,0.0,No,0.2
Baseline,eval_model_05,0.0,0.71,No,0.86
Claude,eval_model_05,1.0,1.0,No,0.0
DeepSeek,eval_model_05,0.17,0.0,No,0.86
Gemini,eval_model_05,0.0,0.29,Yes,0.0
Qwen,eval_model_05,0.0,0.14,Yes,0.0
Baseline,eval_model_06,0.0,0.6,No,0.88
Claude,eval_model_06,1.0,1.0,No,0.0
DeepSeek,eval_model_06,0.11,0.4,No,0.83
Gemini,eval_model_06,0.0,0.0,No,1.0
Qwen,eval_model_06,0.0,0.0,No,1.0
Baseline,eval_model_07,0.0,1.0,No,0.8
Claude,eval_model_07,0.0,0.0,No,1.0
DeepSeek,eval_model_07,0.0,0.0,No,1.0
Gemini,eval_model_07,0.0,0.5,No,0.9
Qwen,eval_model_07,1.0,0.5,No,0.1
Baseline,eval_model_08,0.0,1.0,No,0.8
Claude,eval_model_08,0.14,0.75,No,0.74
DeepSeek,eval_model_08,0.71,1.0,No,0.23
Gemini,eval_model_08,1.0,1.0,No,0.0
Qwen,eval_model_08,0.14,0.0,No,0.89
Baseline,eval_model_09,0.0,0.6,No,0.88
Claude,eval_model_09,1.0,0.6,No,0.08
DeepSeek,eval_model_09,0.22,1.0,No,0.62
Gemini,eval_model_09,0.0,0.0,Yes,0.0
Qwen,eval_model_09,0.33,0.6,No,0.62
Baseline,eval_model_10,0.0,1.0,No,0.8
Claude,eval_model_10,0.75,0.0,No,0.4
DeepSeek,eval_model_10,0.5,0.0,No,0.6
Gemini,eval_model_10,0.0,0.12,Yes,0.0
Qwen,eval_model_10,1.0,0.06,No,0.19
