Model,Model Family,dataset,eval_type,Model Size (B),task_success_rate,state_goal,relation_goal,action_goal,total_goal,execution_success_rate,parsing_error,hallucination_error,predicate_argument_number_error,wrong_order_error,missing_step_error,affordance_error,additional_step_error
01-ai/Yi-1.5-34B,Yi,behavior,action_sequencing,34.4,0.0,1.4200000000000002,16.220000000000002,0,11.0,0.0,100.0,0.0,0.0,0.0,0.0,,0.0
01-ai/Yi-1.5-34B-Chat,Yi,behavior,action_sequencing,34.4,10.14,22.7,20.080000000000002,0,21.0,13.04,62.32,1.4500000000000002,0.0,2.9000000000000004,14.49,,1.4500000000000002
01-ai/Yi-1.5-6B,Yi,behavior,action_sequencing,6.1,0.0,1.4200000000000002,16.220000000000002,0,11.0,0.0,98.55000000000001,0.0,0.0,0.0,1.4500000000000002,,0.0
01-ai/Yi-1.5-6B-Chat,Yi,behavior,action_sequencing,6.1,1.4500000000000002,4.26,18.529999999999998,0,13.5,2.9000000000000004,71.00999999999999,10.14,0.0,1.4500000000000002,8.7,,0.0
01-ai/Yi-1.5-9B,Yi,behavior,action_sequencing,8.8,8.7,11.35,21.62,0,18.0,11.59,71.00999999999999,7.249999999999999,0.0,1.4500000000000002,7.249999999999999,,1.4500000000000002
01-ai/Yi-Coder-1.5B,Yi,behavior,action_sequencing,1.5,0.0,2.8400000000000003,16.99,0,12.0,1.4500000000000002,98.55000000000001,0.0,0.0,0.0,0.0,,0.0
01-ai/Yi-Coder-1.5B-Chat,Yi,behavior,action_sequencing,1.5,0.0,1.4200000000000002,16.220000000000002,0,11.0,0.0,100.0,0.0,0.0,0.0,0.0,,0.0
01-ai/Yi-Coder-9B,Yi,behavior,action_sequencing,8.8,2.9000000000000004,9.93,17.76,0,15.0,4.35,86.96000000000001,2.9000000000000004,0.0,1.4500000000000002,2.9000000000000004,,0.0
01-ai/Yi-Coder-9B-Chat,Yi,behavior,action_sequencing,8.8,15.939999999999998,17.02,23.94,0,21.5,17.39,68.12,0.0,0.0,5.800000000000001,4.35,,1.4500000000000002
Qwen/Qwen-72B,Qwen,behavior,action_sequencing,72.3,11.34,8.0,21.46,0,17.580000000000002,12.370000000000001,68.04,5.1499999999999995,0.0,0.0,11.34,,0.0
Qwen/Qwen1.5-1.8B,Qwen1.5,behavior,action_sequencing,1.8,0.0,4.0,12.15,0,9.8,0.0,86.6,6.1899999999999995,0.0,0.0,1.03,,1.03
Qwen/Qwen1.5-14B,Qwen1.5,behavior,action_sequencing,14.2,4.12,13.0,14.57,0,14.12,8.25,58.76,9.28,1.03,5.1499999999999995,13.4,,5.1499999999999995
Qwen/Qwen1.5-4B,Qwen1.5,behavior,action_sequencing,4.0,0.0,4.0,10.530000000000001,0,8.649999999999999,0.0,67.01,5.1499999999999995,0.0,1.03,13.4,,4.12
Qwen/Qwen1.5-72B,Qwen1.5,behavior,action_sequencing,72.3,18.56,25.0,27.939999999999998,0,27.089999999999996,23.71,37.11,3.09,0.0,8.25,23.71,,5.1499999999999995
Qwen/Qwen1.5-7B,Qwen1.5,behavior,action_sequencing,7.7,2.06,7.000000000000001,12.15,0,10.66,1.03,76.29,10.31,1.03,2.06,7.22,,1.03
Qwen/Qwen3-0.6B,Qwen3,behavior,action_sequencing,0.8,0.0,4.0,12.55,0,10.09,0.0,100.0,0.0,0.0,0.0,0.0,,0.0
Qwen/Qwen3-1.7B,Qwen3,behavior,action_sequencing,2.0,0.0,4.0,12.55,0,10.09,0.0,100.0,0.0,0.0,0.0,0.0,,0.0
Qwen/Qwen3-14B,Qwen3,behavior,action_sequencing,14.8,8.25,14.000000000000002,14.17,0,14.12,8.25,90.72,0.0,0.0,0.0,1.03,,0.0
Qwen/Qwen3-235B-A22B-Thinking-2507,Qwen3,behavior,action_sequencing,235.1,0.0,4.0,11.65,0,9.56,0.0,100.0,0.0,0.0,0.0,0.0,,0.0
Qwen/Qwen3-32B,Qwen3,behavior,action_sequencing,32.8,8.25,10.0,14.979999999999999,0,13.54,8.25,91.75,0.0,0.0,0.0,0.0,,0.0
Qwen/Qwen3-4B,Qwen3,behavior,action_sequencing,4.0,1.03,10.0,12.55,0,11.82,1.03,97.94,0.0,0.0,0.0,0.0,,0.0
Qwen/Qwen3-8B,Qwen3,behavior,action_sequencing,8.2,3.09,7.000000000000001,12.959999999999999,0,11.24,3.09,96.91,0.0,0.0,0.0,0.0,,0.0
baichuan-inc/Baichuan2-7B-Base,Baichuan,behavior,action_sequencing,7.0,0.0,1.31,14.860000000000001,0,10.5,0.0,92.21000000000001,2.6,0.0,0.0,3.9,,0.0
baichuan-inc/Baichuan2-7B-Chat,Baichuan,behavior,action_sequencing,7.0,1.3,9.15,13.62,0,12.18,1.3,77.92,15.58,0.0,0.0,2.6,,1.3
CohereLabs/c4ai-command-r-08-2024,Cohere,behavior,action_sequencing,32.3,16.0,22.0,25.94,0,24.86,19.0,5.0,13.0,0.0,8.0,43.0,,4.0
CohereLabs/c4ai-command-r-plus-08-2024,Cohere,behavior,action_sequencing,103.8,28.000000000000004,28.999999999999996,31.95,0,31.15,35.0,0.0,1.0,15.0,10.0,39.0,,15.0
deepseek-ai/DeepSeek-R1,DeepSeek,behavior,action_sequencing,684.5,1.0,6.0,12.030000000000001,0,10.38,1.0,98.0,0.0,0.0,0.0,1.0,,0.0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,DeepSeek,behavior,action_sequencing,70.6,11.219999999999999,13.0,16.93,0,15.82,11.219999999999999,86.72999999999999,0.0,0.0,0.0,2.04,,0.0
deepseek-ai/DeepSeek-R1-Distill-Llama-8B,DeepSeek,behavior,action_sequencing,8.0,4.08,15.0,13.389999999999999,0,13.84,5.1,81.63,3.06,0.0,2.04,4.08,,3.06
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,DeepSeek,behavior,action_sequencing,1.8,0.0,4.0,12.2,0,9.89,0.0,100.0,0.0,0.0,0.0,0.0,,0.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,DeepSeek,behavior,action_sequencing,14.8,7.140000000000001,13.0,14.17,0,13.84,7.140000000000001,87.76,0.0,0.0,1.02,2.04,,0.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,DeepSeek,behavior,action_sequencing,32.8,17.349999999999998,26.5,16.73,0,19.49,19.39,70.41,4.08,0.0,2.04,3.06,,1.02
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,DeepSeek,behavior,action_sequencing,7.6,0.0,4.0,12.2,0,9.89,0.0,100.0,0.0,0.0,0.0,0.0,,0.0
deepseek-ai/DeepSeek-V3,DeepSeek,behavior,action_sequencing,684.5,41.0,54.50000000000001,42.67,0,45.9,51.0,0.0,0.0,0.0,6.0,36.0,,1.0
google/gemma-1.1-2b-it,Gemma,behavior,action_sequencing,2.5,0.0,2.1999999999999997,12.959999999999999,0,9.77,0.0,100.0,0.0,0.0,0.0,0.0,,0.0
google/gemma-1.1-7b-it,Gemma,behavior,action_sequencing,8.5,1.11,9.89,14.81,0,13.36,7.779999999999999,32.22,16.669999999999998,8.89,2.22,27.779999999999998,,4.44
google/gemma-2-27b,Gemma,behavior,action_sequencing,27.2,14.44,13.19,19.439999999999998,0,17.59,16.669999999999998,72.22,1.11,0.0,1.11,6.67,,0.0
google/gemma-2-27b-it,Gemma,behavior,action_sequencing,27.2,30.0,25.27,41.67,0,36.809999999999995,41.11,22.220000000000002,0.0,0.0,12.22,24.44,,0.0
google/gemma-2-2b,Gemma,behavior,action_sequencing,2.6,0.0,2.1999999999999997,12.959999999999999,0,9.77,0.0,100.0,0.0,0.0,0.0,0.0,,0.0
google/gemma-2-2b-it,Gemma,behavior,action_sequencing,2.6,0.0,6.59,10.65,0,9.45,0.0,24.44,11.110000000000001,8.89,0.0,54.44,,0.0
google/gemma-2-9b,Gemma,behavior,action_sequencing,9.2,0.0,3.3000000000000003,13.43,0,10.42,0.0,98.89,0.0,0.0,1.11,0.0,,1.11
google/gemma-2-9b-it,Gemma,behavior,action_sequencing,9.2,20.0,15.379999999999999,29.17,0,25.080000000000002,28.89,32.22,1.11,0.0,7.779999999999999,28.89,,3.3300000000000005
google/gemma-2b,Gemma,behavior,action_sequencing,2.5,0.0,2.1999999999999997,12.959999999999999,0,9.77,0.0,100.0,0.0,0.0,0.0,0.0,,0.0
google/gemma-2b-it,Gemma,behavior,action_sequencing,2.5,0.0,2.1999999999999997,12.959999999999999,0,9.77,0.0,100.0,0.0,0.0,0.0,0.0,,0.0
google/gemma-3-12b-it,Gemma,behavior,action_sequencing,12.2,29.21,37.64,31.069999999999997,0,33.0,34.83,31.46,0.0,0.0,5.62,26.97,,2.25
google/gemma-3-12b-pt,Gemma,behavior,action_sequencing,12.2,1.1199999999999999,4.49,14.49,0,11.55,3.37,96.63000000000001,0.0,0.0,0.0,0.0,,0.0
google/gemma-3-27b-it,Gemma,behavior,action_sequencing,27.4,32.58,36.52,39.019999999999996,0,38.279999999999994,39.33,30.34,0.0,0.0,5.62,22.470000000000002,,1.1199999999999999
google/gemma-3-4b-it,Gemma,behavior,action_sequencing,4.3,2.25,7.870000000000001,16.82,0,14.19,7.870000000000001,40.45,0.0,1.1199999999999999,5.62,35.96,,4.49
google/gemma-3-4b-pt,Gemma,behavior,action_sequencing,4.3,0.0,3.37,13.55,0,10.56,1.1199999999999999,97.75,0.0,0.0,0.0,1.1199999999999999,,0.0
google/gemma-7b,Gemma,behavior,action_sequencing,8.5,0.0,7.6899999999999995,13.43,0,11.73,1.11,92.22,1.11,0.0,2.22,1.11,,0.0
google/gemma-7b-it,Gemma,behavior,action_sequencing,8.5,0.0,6.59,13.43,0,11.4,0.0,30.0,41.11,16.669999999999998,0.0,12.22,,0.0
ibm-granite/granite-3.1-2b-base,Granite,behavior,action_sequencing,2.5,1.18,3.53,15.310000000000002,0,11.74,1.18,90.59,0.0,0.0,1.18,2.35,,0.0
ibm-granite/granite-3.1-2b-instruct,Granite,behavior,action_sequencing,2.5,5.81,10.59,18.41,0,16.08,9.3,43.02,6.98,0.0,6.98,29.07,,4.65
ibm-granite/granite-3.1-8b-base,Granite,behavior,action_sequencing,8.2,4.71,9.41,16.84,0,14.59,8.24,69.41000000000001,3.53,0.0,1.18,11.76,,1.18
ibm-granite/granite-3.1-8b-instruct,Granite,behavior,action_sequencing,8.2,3.49,10.59,18.91,0,16.43,4.65,48.84,8.14,0.0,9.3,20.93,,2.33
ibm-granite/granite-3.2-2b-instruct,Granite,behavior,action_sequencing,2.5,0.0,7.06,13.43,0,11.540000000000001,3.49,41.86,3.49,1.16,11.63,33.72,,3.49
ibm-granite/granite-3.2-8b-instruct,Granite,behavior,action_sequencing,8.2,4.65,11.76,16.919999999999998,0,15.379999999999999,8.14,44.190000000000005,5.81,0.0,15.120000000000001,15.120000000000001,,5.81
ibm-granite/granite-3.3-2b-base,Granite,behavior,action_sequencing,2.5,1.18,3.53,13.270000000000001,0,10.32,0.0,90.59,1.18,1.18,2.35,4.71,,0.0
ibm-granite/granite-3.3-2b-instruct,Granite,behavior,action_sequencing,2.5,2.33,7.06,15.920000000000002,0,13.29,3.49,38.37,13.950000000000001,4.65,10.47,26.740000000000002,,8.14
ibm-granite/granite-3.3-8b-base,Granite,behavior,action_sequencing,8.2,4.71,5.88,15.82,0,12.809999999999999,4.71,64.71000000000001,7.06,0.0,2.35,11.76,,2.35
ibm-granite/granite-3.3-8b-instruct,Granite,behavior,action_sequencing,8.2,3.49,8.24,16.42,0,13.99,8.14,48.84,10.47,0.0,4.65,22.09,,3.49
LGAI-EXAONE/EXAONE-3.5-32B-Instruct,Exaone,behavior,action_sequencing,32.0,28.999999999999996,32.5,34.02,0,33.61,34.0,0.0,1.0,1.0,11.0,49.0,,1.0
LGAI-EXAONE/EXAONE-Deep-32B,Exaone,behavior,action_sequencing,32.0,8.0,15.0,18.8,0,17.76,10.0,80.0,1.0,0.0,0.0,4.0,,0.0
meta-llama/Llama-2-13b-hf,Llama-2,behavior,action_sequencing,13.0,0.0,1.4200000000000002,16.12,0,11.110000000000001,0.0,98.61,0.0,0.0,0.0,0.0,,0.0
meta-llama/Llama-2-70b-hf,Llama-2,behavior,action_sequencing,69.0,0.0,1.4200000000000002,16.12,0,11.110000000000001,0.0,100.0,0.0,0.0,0.0,0.0,,0.0
meta-llama/Llama-2-7b-hf,Llama-2,behavior,action_sequencing,6.7,0.0,1.4200000000000002,16.85,0,11.59,0.0,91.67,2.78,1.39,0.0,2.78,,0.0
meta-llama/Llama-3.1-70B,Llama-3,behavior,action_sequencing,70.6,3.06,8.0,13.780000000000001,0,12.15,3.06,93.88,0.0,0.0,1.02,2.04,,0.0
meta-llama/Llama-3.1-8B,Llama-3,behavior,action_sequencing,8.0,3.06,10.0,13.389999999999999,0,12.43,5.1,78.57,3.06,0.0,2.04,7.140000000000001,,2.04
meta-llama/Llama-3.3-70B-Instruct,Llama-3,behavior,action_sequencing,70.6,41.0,49.5,43.05,0,44.81,49.0,0.0,0.0,0.0,10.0,38.0,,6.0
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8,Llama,behavior,action_sequencing,401.6,45.0,56.49999999999999,53.949999999999996,0,54.64,56.00000000000001,16.0,1.0,0.0,6.0,18.0,,3.0
meta-llama/Llama-4-Scout-17B-16E-Instruct,Llama,behavior,action_sequencing,108.6,37.0,35.5,56.58,0,50.82,50.0,0.0,7.000000000000001,0.0,14.000000000000002,28.999999999999996,,4.0
meta-llama/Meta-Llama-3-70B,Llama-3,behavior,action_sequencing,70.6,1.02,8.0,12.6,0,11.3,2.04,95.92,1.02,0.0,0.0,1.02,,0.0
meta-llama/Meta-Llama-3-8B,Llama-3,behavior,action_sequencing,8.0,1.02,4.0,12.6,0,10.17,1.02,92.86,1.02,0.0,0.0,3.06,,0.0
meta-llama/Meta-Llama-3-8B-Instruct,Llama-3,behavior,action_sequencing,8.0,13.270000000000001,18.0,21.65,0,20.62,18.37,16.33,11.219999999999999,3.06,5.1,36.730000000000004,,5.1
mistralai/Mistral-7B-Instruct-v0.2,Mistral,behavior,action_sequencing,7.2,3.0,5.0,13.530000000000001,0,11.200000000000001,5.0,8.0,38.0,5.0,5.0,39.0,,3.0
mistralai/Mixtral-8x7B-Instruct-v0.1,Mistral,behavior,action_sequencing,46.7,6.0,17.0,15.790000000000001,0,16.12,7.000000000000001,21.0,53.0,0.0,6.0,11.0,,1.0
mistralai/Mixtral-8x22B-Instruct-v0.1,Mistral,behavior,action_sequencing,140.6,31.0,37.5,38.16,0,37.980000000000004,40.0,3.0,6.0,0.0,10.0,32.0,,2.0
moonshotai/Kimi-K2-Instruct,Kimi,behavior,action_sequencing,1000.0,53.0,58.5,65.60000000000001,0,63.660000000000004,66.0,0.0,1.0,0.0,6.0,27.0,,3.0
openai/gpt-oss-120b,GPT-OSS,behavior,action_sequencing,120.4,53.54,52.15,63.449999999999996,0,60.5,64.64999999999999,1.01,0.0,0.0,2.02,32.32,,2.02
openai/gpt-oss-20b,GPT-OSS,behavior,action_sequencing,21.5,35.0,28.000000000000004,48.5,0,42.9,46.0,37.0,0.0,0.0,2.0,13.0,,1.0
