model_name,dataset,eval_type,model_size_b,task_success_rate,state_goal,relation_goal,action_goal,total_goal,execution_success_rate,parsing_error,hallucination_error,predicate_argument_number_error,wrong_order_error,missing_step_error,affordance_error,additional_step_error
meta-llama/Llama-3.3-70B-Instruct,virtualhome,action_sequencing,70.6,61.9672,55.7554,77.7778,54.7297,62.0462,68.5,0.9836,15.7377,0.0,0.0,13.7705,0.9836,1.9672
Qwen/Qwen1.5-1.8B,virtualhome,action_sequencing,1.8,0.0,0.3597,0.0,0.0,0.165,0.0,22.2951,81.9672,0.3279,0.0,0.3279,0.0,0.0
Qwen/Qwen1.5-7B,virtualhome,action_sequencing,7.7,5.5738,20.5036,2.2222,2.027,10.5611,4.6,12.1311,36.0656,7.541,1.9672,26.8852,13.1148,11.4754
Qwen/Qwen1.5-14B,virtualhome,action_sequencing,14.2,19.3443,19.4245,23.8889,25.6757,22.2772,24.6,6.2295,32.1311,1.9672,0.9836,20.3279,13.7705,25.9016
Qwen/Qwen3-14B,virtualhome,action_sequencing,14.8,45.9016,55.7554,41.1111,14.1892,41.2541,43.0,43.6066,3.9344,0.3279,6.5574,4.5902,0.0,19.3443
Qwen/Qwen1.5-72B,virtualhome,action_sequencing,72.3,6.8852,10.7914,8.3333,4.7297,8.5809,11.1,0.3279,33.7705,13.4426,0.0,35.082,6.2295,0.6557
openai/gpt-oss-120b,virtualhome,action_sequencing,120.4,74.0984,87.4101,78.3333,62.1622,78.5479,79.3,0.6557,2.9508,0.0,0.0,17.377,0.0,2.9508
Qwen/Qwen1.5-4B,virtualhome,action_sequencing,4.0,2.9508,7.554,2.2222,0.0,4.1254,3.0,21.3115,63.2787,2.623,0.9836,9.8361,8.5246,0.9836
Qwen/Qwen-72B,virtualhome,action_sequencing,72.3,15.082,15.1079,20.5556,16.2162,16.9967,22.0,3.6066,27.8689,6.8852,0.3279,31.1475,8.1967,0.6557
Qwen/Qwen3-32B,virtualhome,action_sequencing,32.8,51.4754,62.5899,45.0,25.0,48.1848,48.5,28.1967,8.1967,0.0,6.8852,8.1967,0.0,19.3443
meta-llama/Llama-2-7b-hf,virtualhome,action_sequencing,6.7,0.0,0.0,0.0,0.0,0.0,0.0,86.1386,15.1815,0.0,0.0,0.33,0.0,0.0
Qwen/Qwen3-8B,virtualhome,action_sequencing,8.2,45.9016,55.7554,47.7778,29.0541,46.8647,33.1,29.1803,6.8852,0.0,12.7869,17.7049,0.3279,27.8689
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,virtualhome,action_sequencing,70.6,51.1475,61.1511,56.1111,26.3514,51.1551,50.5,25.2459,2.2951,0.0,4.5902,17.377,0.0,10.8197
deepseek-ai/DeepSeek-R1,virtualhome,action_sequencing,684.5,36.7213,43.5252,33.3333,12.1622,32.8383,29.5,43.6066,18.6885,0.0,8.1967,2.623,0.0,40.3279
Qwen/Qwen3-0.6B,virtualhome,action_sequencing,0.8,0.3279,3.2374,0.5556,0.0,1.6502,0.3,47.8689,26.2295,1.3115,0.0,24.2623,0.3279,0.0
deepseek-ai/DeepSeek-V3,virtualhome,action_sequencing,684.5,78.3607,79.8561,84.4444,72.2973,79.3729,85.9,0.0,6.2295,0.6557,0.0,6.5574,0.6557,1.9672
meta-llama/Meta-Llama-3-70B,virtualhome,action_sequencing,70.6,28.5246,34.5324,31.1111,18.9189,29.703,27.9,18.3607,21.3115,0.6557,3.9344,26.5574,1.9672,29.5082
meta-llama/Llama-3.2-1B,virtualhome,action_sequencing,1.2,0.0,0.7194,0.0,0.0,0.33,0.0,42.2951,56.3934,0.6557,0.0,1.3115,0.0,0.0
meta-llama/Meta-Llama-3-70B-Instruct,virtualhome,action_sequencing,70.6,60.6557,55.036,68.8889,45.9459,56.9307,65.9,0.0,18.6885,8.1967,0.3279,5.9016,0.9836,3.9344
deepseek-ai/DeepSeek-R1-Distill-Llama-8B,virtualhome,action_sequencing,8.0,23.9344,36.3309,21.1111,14.1892,26.4026,26.9,7.541,25.2459,0.3279,1.3115,35.4098,3.6066,13.7705
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,virtualhome,action_sequencing,1.8,0.9836,2.1583,1.6667,0.0,1.4851,2.3,46.8852,42.9508,0.3279,0.0,7.8689,2.2951,0.3279
openai/gpt-oss-20b,virtualhome,action_sequencing,21.5,67.541,74.4604,66.1111,48.6486,65.6766,68.9,13.1148,4.2623,0.6557,0.0,13.1148,0.0,0.6557
meta-llama/Meta-Llama-3-8B-Instruct,virtualhome,action_sequencing,8.0,22.2951,27.6978,22.7778,10.8108,22.1122,26.2,0.0,36.3934,7.8689,0.3279,28.1967,0.9836,0.9836
Qwen/Qwen3-1.7B,virtualhome,action_sequencing,2.0,2.9508,12.5899,5.0,0.0,7.2607,1.3,58.6885,20.0,0.0,2.2951,18.0328,0.0,0.0
meta-llama/Llama-2-13b-hf,virtualhome,action_sequencing,13.0,0.0,1.083,0.0,0.0,0.4967,1.3,64.6865,25.7426,1.9802,0.0,7.2607,1.3201,0.33
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,virtualhome,action_sequencing,14.8,47.8689,61.8705,48.3333,25.6757,49.0099,53.4,26.2295,3.2787,0.0,1.3115,15.082,0.6557,6.5574
mistralai/Mistral-7B-Instruct-v0.2,virtualhome,action_sequencing,7.2,17.1053,26.6187,11.7318,6.0811,17.1901,23.7,4.9342,30.2632,2.6316,0.0,39.8026,0.3289,22.3684
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,virtualhome,action_sequencing,32.8,17.377,22.6619,16.6667,16.8919,19.4719,17.0,69.5082,1.9672,0.3279,2.9508,6.5574,1.6393,36.7213
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,virtualhome,action_sequencing,7.6,13.4426,28.0576,14.4444,6.0811,18.6469,14.8,46.2295,5.5738,1.9672,2.9508,24.2623,5.2459,3.6066
Qwen/Qwen-7B,virtualhome,action_sequencing,7.7,0.3279,1.0791,0.5556,0.0,0.6601,0.0,90.8197,13.4426,0.0,0.0,0.6557,0.3279,0.0
meta-llama/Meta-Llama-3-8B,virtualhome,action_sequencing,8.0,10.1639,20.1439,7.2222,6.0811,12.8713,9.5,14.0984,33.1148,7.2131,1.6393,29.1803,6.2295,6.5574
Qwen/Qwen3-4B,virtualhome,action_sequencing,4.0,29.5082,41.0072,25.0,6.7568,27.8878,26.2,63.6066,1.6393,0.6557,5.2459,2.623,0.0,15.082
meta-llama/Llama-3.1-70B,virtualhome,action_sequencing,70.6,29.1803,36.6906,33.8889,17.5676,31.1881,30.5,26.5574,20.6557,0.9836,2.2951,19.0164,1.9672,90.8197
meta-llama/Llama-2-70b-hf,virtualhome,action_sequencing,69.0,5.6106,10.4693,4.4693,2.027,6.6225,4.6,55.4455,24.0924,2.3102,0.33,10.8911,2.9703,0.33
