Model,SOP,norm_counterexample_traces,norm_minor_issues,errors,execution_compliance
Agent,eval_model_01,0.17,0.0,No,0.86
Agent_Debuged,eval_model_01,0.0,0.0,No,1.0
Qwen,eval_model_01,1.0,0.0,No,0.2
Agent,eval_model_02,0.0,1.0,No,0.8
Agent_Debuged,eval_model_02,0.0,0.0,No,1.0
Qwen,eval_model_02,1.0,0.33,No,0.13
Agent,eval_model_03,0.38,0.67,No,0.56
Agent_Debuged,eval_model_03,0.0,0.0,No,1.0
Qwen,eval_model_03,1.0,1.0,No,0.0
Agent,eval_model_04,1.0,0.0,No,0.2
Agent_Debuged,eval_model_04,0.0,1.0,No,0.8
Qwen,eval_model_04,0.0,0.5,No,0.9
Agent,eval_model_05,0.0,1.0,No,0.8
Agent_Debuged,eval_model_05,0.0,0.0,No,1.0
Qwen,eval_model_05,0.0,0.25,Yes,0.0
Agent,eval_model_06,1.0,1.0,No,0.0
Agent_Debuged,eval_model_06,0.0,1.0,No,0.8
Qwen,eval_model_06,0.0,0.0,No,1.0
Agent,eval_model_07,1.0,1.0,No,0.0
Agent_Debuged,eval_model_07,0.0,0.5,No,0.9
Qwen,eval_model_07,0.0,0.0,No,1.0
Agent,eval_model_08,1.0,1.0,No,0.0
Agent_Debuged,eval_model_08,0.0,0.0,No,1.0
Qwen,eval_model_08,1.0,0.0,No,0.2
Agent,eval_model_09,0.88,0.0,No,0.3
Agent_Debuged,eval_model_09,0.0,1.0,No,0.8
Qwen,eval_model_09,1.0,0.0,No,0.2
Agent,eval_model_10,1.0,0.29,No,0.14
Agent_Debuged,eval_model_10,0.0,1.0,No,0.8
Qwen,eval_model_10,0.0,0.0,Yes,0.0
