Model,Model Family,dataset,eval_type,Model Size (B),task_success_rate,state_goal,relation_goal,action_goal,total_goal,execution_success_rate,parsing_error,hallucination_error,predicate_argument_number_error,wrong_order_error,missing_step_error,affordance_error,additional_step_error,Pretraining Data Size (T),FLOPs (1E21)
01-ai/Yi-1.5-34B,Yi,behavior,action_sequencing,34.4,0.0,1.4200000000000002,16.220000000000002,0,11.0,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,3.6,743.04
01-ai/Yi-1.5-34B-Chat,Yi,behavior,action_sequencing,34.4,10.14,22.7,20.08,0,21.0,13.04,62.32,1.4500000000000002,0.0,2.9000000000000004,14.49,,1.4500000000000002,3.6,743.04
01-ai/Yi-1.5-6B,Yi,behavior,action_sequencing,6.1,0.0,1.4200000000000002,16.220000000000002,0,11.0,0.0,98.55,0.0,0.0,0.0,1.4500000000000002,,0.0,3.6,131.76
01-ai/Yi-1.5-6B-Chat,Yi,behavior,action_sequencing,6.1,1.4500000000000002,4.26,18.53,0,13.5,2.9000000000000004,71.00999999999999,10.14,0.0,1.4500000000000002,8.7,,0.0,3.6,131.76
01-ai/Yi-1.5-9B,Yi,behavior,action_sequencing,8.8,8.7,11.35,21.62,0,18.0,11.59,71.00999999999999,7.249999999999999,0.0,1.4500000000000002,7.249999999999999,,1.4500000000000002,3.6,190.08000000000004
01-ai/Yi-Coder-1.5B,Yi,behavior,action_sequencing,1.5,0.0,2.8400000000000003,16.99,0,12.0,1.4500000000000002,98.55,0.0,0.0,0.0,0.0,,0.0,2.4,21.599999999999998
01-ai/Yi-Coder-1.5B-Chat,Yi,behavior,action_sequencing,1.5,0.0,1.4200000000000002,16.220000000000002,0,11.0,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,2.4,21.599999999999998
01-ai/Yi-Coder-9B,Yi,behavior,action_sequencing,8.8,2.9000000000000004,9.93,17.76,0,15.0,4.35,86.96000000000001,2.9000000000000004,0.0,1.4500000000000002,2.9000000000000004,,0.0,2.4,126.72
01-ai/Yi-Coder-9B-Chat,Yi,behavior,action_sequencing,8.8,15.939999999999998,17.02,23.94,0,21.5,17.39,68.12,0.0,0.0,5.800000000000001,4.35,,1.4500000000000002,2.4,126.72
Qwen/Qwen-72B,Qwen,behavior,action_sequencing,72.3,11.34,8.0,21.46,0,17.580000000000002,12.37,68.04,5.15,0.0,0.0,11.34,,0.0,3.0,1296.0
Qwen/Qwen1.5-1.8B,Qwen1.5,behavior,action_sequencing,1.8,0.0,4.0,12.15,0,9.8,0.0,86.6,6.19,0.0,0.0,1.03,,1.03,2.4,25.92
Qwen/Qwen1.5-14B,Qwen1.5,behavior,action_sequencing,14.2,4.12,13.0,14.57,0,14.12,8.25,58.76,9.28,1.03,5.15,13.4,,5.15,4.0,336.0
Qwen/Qwen1.5-4B,Qwen1.5,behavior,action_sequencing,4.0,0.0,4.0,10.53,0,8.649999999999999,0.0,67.01,5.15,0.0,1.03,13.4,,4.12,2.4,57.6
Qwen/Qwen1.5-72B,Qwen1.5,behavior,action_sequencing,72.3,18.56,25.0,27.94,0,27.09,23.71,37.11,3.09,0.0,8.25,23.71,,5.15,3.0,1296.0
Qwen/Qwen1.5-7B,Qwen1.5,behavior,action_sequencing,7.7,2.06,7.000000000000001,12.15,0,10.66,1.03,76.29,10.31,1.03,2.06,7.22,,1.03,4.0,168.0
Qwen/Qwen3-0.6B,Qwen3,behavior,action_sequencing,0.8,0.0,4.0,12.55,0,10.09,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,36.0,172.8
Qwen/Qwen3-1.7B,Qwen3,behavior,action_sequencing,2.0,0.0,4.0,12.55,0,10.09,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,36.0,432.0
Qwen/Qwen3-14B,Qwen3,behavior,action_sequencing,14.8,8.25,14.000000000000002,14.17,0,14.12,8.25,90.72,0.0,0.0,0.0,1.03,,0.0,36.0,3196.8
Qwen/Qwen3-235B-A22B-Thinking-2507,Qwen3,behavior,action_sequencing,235.1,0.0,4.0,11.65,0,9.56,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,36.0,50781.6
Qwen/Qwen3-32B,Qwen3,behavior,action_sequencing,32.8,8.25,10.0,14.98,0,13.54,8.25,91.75,0.0,0.0,0.0,0.0,,0.0,36.0,7084.799999999999
Qwen/Qwen3-4B,Qwen3,behavior,action_sequencing,4.0,1.03,10.0,12.55,0,11.82,1.03,97.94,0.0,0.0,0.0,0.0,,0.0,36.0,864.0
Qwen/Qwen3-8B,Qwen3,behavior,action_sequencing,8.2,3.09,7.000000000000001,12.96,0,11.24,3.09,96.91,0.0,0.0,0.0,0.0,,0.0,36.0,1771.1999999999998
baichuan-inc/Baichuan2-7B-Base,Baichuan,behavior,action_sequencing,7.0,0.0,1.31,14.86,0,10.5,0.0,92.21,2.6,0.0,0.0,3.9,,0.0,2.6,109.20000000000002
baichuan-inc/Baichuan2-7B-Chat,Baichuan,behavior,action_sequencing,7.0,1.3,9.15,13.62,0,12.18,1.3,77.92,15.58,0.0,0.0,2.6,,1.3,2.6,109.20000000000002
CohereLabs/c4ai-command-r-08-2024,Cohere,behavior,action_sequencing,32.3,16.0,22.0,25.94,0,24.86,19.0,5.0,13.0,0.0,8.0,43.0,,4.0,,
CohereLabs/c4ai-command-r-plus-08-2024,Cohere,behavior,action_sequencing,103.8,28.000000000000004,29.0,31.95,0,31.15,35.0,0.0,1.0,15.0,10.0,39.0,,15.0,,
deepseek-ai/DeepSeek-R1,DeepSeek,behavior,action_sequencing,684.5,1.0,6.0,12.03,0,10.38,1.0,98.0,0.0,0.0,0.0,1.0,,0.0,14.8,60783.600000000006
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,DeepSeek,behavior,action_sequencing,70.6,11.22,13.0,16.93,0,15.82,11.22,86.72999999999999,0.0,0.0,0.0,2.04,,0.0,15.0,6353.999999999999
deepseek-ai/DeepSeek-R1-Distill-Llama-8B,DeepSeek,behavior,action_sequencing,8.0,4.08,15.0,13.39,0,13.84,5.1,81.63,3.06,0.0,2.04,4.08,,3.06,15.0,720.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,DeepSeek,behavior,action_sequencing,1.8,0.0,4.0,12.2,0,9.89,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,18.0,194.4
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,DeepSeek,behavior,action_sequencing,14.8,7.140000000000001,13.0,14.17,0,13.84,7.140000000000001,87.76,0.0,0.0,1.02,2.04,,0.0,18.0,1598.4
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,DeepSeek,behavior,action_sequencing,32.8,17.349999999999998,26.5,16.73,0,19.49,19.39,70.41,4.08,0.0,2.04,3.06,,1.02,18.0,3542.3999999999996
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,DeepSeek,behavior,action_sequencing,7.6,0.0,4.0,12.2,0,9.89,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,18.0,820.8
deepseek-ai/DeepSeek-V3,DeepSeek,behavior,action_sequencing,684.5,41.0,54.50000000000001,42.67,0,45.9,51.0,0.0,0.0,0.0,6.0,36.0,,1.0,14.8,60783.600000000006
google/gemma-1.1-2b-it,Gemma,behavior,action_sequencing,2.5,0.0,2.2,12.96,0,9.77,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,3.0,45.0
google/gemma-1.1-7b-it,Gemma,behavior,action_sequencing,8.5,1.11,9.89,14.81,0,13.36,7.779999999999999,32.22,16.669999999999998,8.89,2.22,27.78,,4.44,6.0,306.0
google/gemma-2-27b,Gemma,behavior,action_sequencing,27.2,14.44,13.19,19.44,0,17.59,16.669999999999998,72.22,1.11,0.0,1.11,6.67,,0.0,13.0,2121.6
google/gemma-2-27b-it,Gemma,behavior,action_sequencing,27.2,30.0,25.27,41.67,0,36.81,41.11,22.22,0.0,0.0,12.22,24.44,,0.0,13.0,2121.6
google/gemma-2-2b,Gemma,behavior,action_sequencing,2.6,0.0,2.2,12.96,0,9.77,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,2.0,31.200000000000003
google/gemma-2-2b-it,Gemma,behavior,action_sequencing,2.6,0.0,6.59,10.65,0,9.45,0.0,24.44,11.11,8.89,0.0,54.44,,0.0,2.0,31.200000000000003
google/gemma-2-9b,Gemma,behavior,action_sequencing,9.2,0.0,3.3000000000000003,13.43,0,10.42,0.0,98.89,0.0,0.0,1.11,0.0,,1.11,8.0,441.59999999999997
google/gemma-2-9b-it,Gemma,behavior,action_sequencing,9.2,20.0,15.38,29.17,0,25.08,28.89,32.22,1.11,0.0,7.779999999999999,28.89,,3.3300000000000005,8.0,441.59999999999997
google/gemma-2b,Gemma,behavior,action_sequencing,2.5,0.0,2.2,12.96,0,9.77,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,6.0,72.0
google/gemma-2b-it,Gemma,behavior,action_sequencing,2.5,0.0,2.2,12.96,0,9.77,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,6.0,90.0
google/gemma-3-12b-it,Gemma,behavior,action_sequencing,12.2,29.21,37.64,31.07,0,33.0,34.83,31.46,0.0,0.0,5.62,26.97,,2.25,12.0,878.4
google/gemma-3-12b-pt,Gemma,behavior,action_sequencing,12.2,1.12,4.49,14.49,0,11.55,3.37,96.63,0.0,0.0,0.0,0.0,,0.0,12.0,878.4
google/gemma-3-27b-it,Gemma,behavior,action_sequencing,27.4,32.58,36.52,39.02,0,38.28,39.33,30.34,0.0,0.0,5.62,22.47,,1.12,14.0,2301.6
google/gemma-3-4b-it,Gemma,behavior,action_sequencing,4.3,2.25,7.870000000000001,16.82,0,14.19,7.870000000000001,40.45,0.0,1.12,5.62,35.96,,4.49,4.0,103.19999999999999
google/gemma-3-4b-pt,Gemma,behavior,action_sequencing,4.3,0.0,3.37,13.55,0,10.56,1.12,97.75,0.0,0.0,0.0,1.12,,0.0,4.0,103.19999999999999
google/gemma-7b,Gemma,behavior,action_sequencing,8.5,0.0,7.689999999999999,13.43,0,11.73,1.11,92.22,1.11,0.0,2.22,1.11,,0.0,6.0,252.0
google/gemma-7b-it,Gemma,behavior,action_sequencing,8.5,0.0,6.59,13.43,0,11.4,0.0,30.0,41.11,16.669999999999998,0.0,12.22,,0.0,2.0,102.0
ibm-granite/granite-3.1-2b-base,Granite,behavior,action_sequencing,2.5,1.18,3.53,15.310000000000002,0,11.74,1.18,90.59,0.0,0.0,1.18,2.35,,0.0,12.0,180.0
ibm-granite/granite-3.1-2b-instruct,Granite,behavior,action_sequencing,2.5,5.81,10.59,18.41,0,16.08,9.3,43.02,6.98,0.0,6.98,29.07,,4.65,12.0,180.0
ibm-granite/granite-3.1-8b-base,Granite,behavior,action_sequencing,8.2,4.71,9.41,16.84,0,14.59,8.24,69.41000000000001,3.53,0.0,1.18,11.76,,1.18,12.0,590.4
ibm-granite/granite-3.1-8b-instruct,Granite,behavior,action_sequencing,8.2,3.49,10.59,18.91,0,16.43,4.65,48.84,8.14,0.0,9.3,20.93,,2.33,12.0,590.4
ibm-granite/granite-3.2-2b-instruct,Granite,behavior,action_sequencing,2.5,0.0,7.06,13.43,0,11.54,3.49,41.86,3.49,1.16,11.63,33.72,,3.49,12.0,180.0
ibm-granite/granite-3.2-8b-instruct,Granite,behavior,action_sequencing,8.2,4.65,11.76,16.919999999999998,0,15.38,8.14,44.190000000000005,5.81,0.0,15.12,15.12,,5.81,12.0,590.4
ibm-granite/granite-3.3-2b-base,Granite,behavior,action_sequencing,2.5,1.18,3.53,13.27,0,10.32,0.0,90.59,1.18,1.18,2.35,4.71,,0.0,12.0,180.0
ibm-granite/granite-3.3-2b-instruct,Granite,behavior,action_sequencing,2.5,2.33,7.06,15.920000000000002,0,13.29,3.49,38.37,13.95,4.65,10.47,26.74,,8.14,12.0,180.0
ibm-granite/granite-3.3-8b-base,Granite,behavior,action_sequencing,8.2,4.71,5.88,15.82,0,12.81,4.71,64.71000000000001,7.06,0.0,2.35,11.76,,2.35,12.0,590.4
ibm-granite/granite-3.3-8b-instruct,Granite,behavior,action_sequencing,8.2,3.49,8.24,16.42,0,13.99,8.14,48.84,10.47,0.0,4.65,22.09,,3.49,12.0,590.4
LGAI-EXAONE/EXAONE-3.5-32B-Instruct,Exaone,behavior,action_sequencing,32.0,29.0,32.5,34.02,0,33.61,34.0,0.0,1.0,1.0,11.0,49.0,,1.0,6.5,1248.0
LGAI-EXAONE/EXAONE-Deep-32B,Exaone,behavior,action_sequencing,32.0,8.0,15.0,18.8,0,17.76,10.0,80.0,1.0,0.0,0.0,4.0,,0.0,6.5,1248.0
meta-llama/Llama-2-13b-hf,Llama-2,behavior,action_sequencing,13.0,0.0,1.4200000000000002,16.12,0,11.11,0.0,98.61,0.0,0.0,0.0,0.0,,0.0,2.0,156.0
meta-llama/Llama-2-70b-hf,Llama-2,behavior,action_sequencing,69.0,0.0,1.4200000000000002,16.12,0,11.11,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,2.0,840.0
meta-llama/Llama-2-7b-hf,Llama-2,behavior,action_sequencing,6.7,0.0,1.4200000000000002,16.85,0,11.59,0.0,91.67,2.78,1.39,0.0,2.78,,0.0,2.0,84.0
meta-llama/Llama-3.1-70B,Llama-3,behavior,action_sequencing,70.6,3.06,8.0,13.78,0,12.15,3.06,93.88,0.0,0.0,1.02,2.04,,0.0,15.0,6353.999999999999
meta-llama/Llama-3.1-8B,Llama-3,behavior,action_sequencing,8.0,3.06,10.0,13.39,0,12.43,5.1,78.57,3.06,0.0,2.04,7.140000000000001,,2.04,,
meta-llama/Llama-3.3-70B-Instruct,Llama-3,behavior,action_sequencing,70.6,41.0,49.5,43.05,0,44.81,49.0,0.0,0.0,0.0,10.0,38.0,,6.0,15.0,6353.999999999999
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8,Llama,behavior,action_sequencing,401.6,45.0,56.49999999999999,53.95,0,54.64,56.00000000000001,16.0,1.0,0.0,6.0,18.0,,3.0,22.0,53011.200000000004
meta-llama/Llama-4-Scout-17B-16E-Instruct,Llama,behavior,action_sequencing,108.6,37.0,35.5,56.58,0,50.82,50.0,0.0,7.000000000000001,0.0,14.000000000000002,29.0,,4.0,40.0,26064.0
meta-llama/Meta-Llama-3-70B,Llama-3,behavior,action_sequencing,70.6,1.02,8.0,12.6,0,11.3,2.04,95.92,1.02,0.0,0.0,1.02,,0.0,15.0,6300.0
meta-llama/Meta-Llama-3-8B,Llama-3,behavior,action_sequencing,8.0,1.02,4.0,12.6,0,10.17,1.02,92.86,1.02,0.0,0.0,3.06,,0.0,15.0,720.0
meta-llama/Meta-Llama-3-8B-Instruct,Llama-3,behavior,action_sequencing,8.0,13.27,18.0,21.65,0,20.62,18.37,16.33,11.22,3.06,5.1,36.73,,5.1,15.0,720.0
mistralai/Mistral-7B-Instruct-v0.2,Mistral,behavior,action_sequencing,7.2,3.0,5.0,13.53,0,11.2,5.0,8.0,38.0,5.0,5.0,39.0,,3.0,,
mistralai/Mixtral-8x7B-Instruct-v0.1,Mistral,behavior,action_sequencing,46.7,6.0,17.0,15.79,0,16.12,7.000000000000001,21.0,53.0,0.0,6.0,11.0,,1.0,,
mistralai/Mixtral-8x22B-Instruct-v0.1,Mistral,behavior,action_sequencing,140.6,31.0,37.5,38.16,0,37.98,40.0,3.0,6.0,0.0,10.0,32.0,,2.0,,
moonshotai/Kimi-K2-Instruct,Kimi,behavior,action_sequencing,1000.0,53.0,58.5,65.60000000000001,0,63.66,66.0,0.0,1.0,0.0,6.0,27.0,,3.0,15.5,93000.0
openai/gpt-oss-120b,GPT-OSS,behavior,action_sequencing,120.4,53.54,52.15,63.45,0,60.5,64.64999999999999,1.01,0.0,0.0,2.02,32.32,,2.02,,
openai/gpt-oss-20b,GPT-OSS,behavior,action_sequencing,21.5,35.0,28.000000000000004,48.5,0,42.9,46.0,37.0,0.0,0.0,2.0,13.0,,1.0,,
