Model,SOP,norm_counterexample_traces,norm_minor_issues,errors,execution_compliance
Baseline,eval_model_01,0.0,0.33,No,0.93
Claude,eval_model_01,0.67,1.0,No,0.26
DeepSeek,eval_model_01,0.67,0.67,No,0.33
Gemini,eval_model_01,0.5,0.0,No,0.6
Qwen,eval_model_01,1.0,0.67,No,0.07
Baseline,eval_model_02,0.0,0.0,No,1.0
Claude,eval_model_02,1.0,0.67,No,0.07
DeepSeek,eval_model_02,0.25,0.33,No,0.73
Gemini,eval_model_02,0.12,0.0,No,0.9
Qwen,eval_model_02,0.38,1.0,No,0.5
Baseline,eval_model_03,0.0,0.25,No,0.95
Claude,eval_model_03,1.0,1.0,No,0.0
DeepSeek,eval_model_03,0.33,0.25,No,0.69
Gemini,eval_model_03,0.0,0.0,Yes,0.0
Qwen,eval_model_03,0.0,0.25,Yes,0.0
Baseline,eval_model_04,0.0,0.0,No,1.0
Claude,eval_model_04,0.0,0.2,No,0.96
DeepSeek,eval_model_04,0.0,0.2,No,0.96
Gemini,eval_model_04,0.0,1.0,No,0.8
Qwen,eval_model_04,1.0,0.4,No,0.12
Baseline,eval_model_05,0.0,0.67,No,0.87
Claude,eval_model_05,1.0,1.0,No,0.0
DeepSeek,eval_model_05,0.17,0.67,No,0.73
Gemini,eval_model_05,0.0,0.33,Yes,0.0
Qwen,eval_model_05,0.0,0.0,Yes,0.0
Baseline,eval_model_06,0.0,0.0,No,1.0
Claude,eval_model_06,1.0,1.0,No,0.0
DeepSeek,eval_model_06,0.1,1.0,No,0.72
Gemini,eval_model_06,0.0,0.33,No,0.93
Qwen,eval_model_06,0.0,0.0,No,1.0
Baseline,eval_model_07,0.0,0.5,No,0.9
Claude,eval_model_07,0.0,0.5,No,0.9
DeepSeek,eval_model_07,0.0,0.0,No,1.0
Gemini,eval_model_07,0.0,1.0,No,0.8
Qwen,eval_model_07,1.0,0.5,No,0.1
Baseline,eval_model_08,0.0,0.0,No,1.0
Claude,eval_model_08,0.12,0.5,No,0.8
DeepSeek,eval_model_08,0.62,1.0,No,0.3
Gemini,eval_model_08,1.0,0.25,No,0.15
Qwen,eval_model_08,0.12,0.25,No,0.85
Baseline,eval_model_09,0.0,1.0,No,0.8
Claude,eval_model_09,1.0,0.83,No,0.03
DeepSeek,eval_model_09,0.22,0.67,No,0.69
Gemini,eval_model_09,0.0,0.0,Yes,0.0
Qwen,eval_model_09,0.33,0.67,No,0.6
Baseline,eval_model_10,0.0,0.0,No,1.0
Claude,eval_model_10,0.75,0.27,No,0.35
DeepSeek,eval_model_10,0.5,1.0,No,0.4
Gemini,eval_model_10,0.0,0.0,Yes,0.0
Qwen,eval_model_10,1.0,0.33,No,0.13
