Model,Model Family,dataset,eval_type,Model Size (B),task_success_rate,state_goal,relation_goal,action_goal,total_goal,execution_success_rate,parsing_error,hallucination_error,predicate_argument_number_error,wrong_order_error,missing_step_error,affordance_error,additional_step_error
meta-llama/Llama-3.3-70B-Instruct,Llama-3,virtualhome,action_sequencing_v4,70.6,63.9344,54.3165,76.1111,61.4865,62.5413,68.9,0.3279,16.0656,0.0,0.0,14.7541,0.0,1.9672
01-ai/Yi-Coder-1.5B-Chat,Yi,virtualhome,action_sequencing_v4,1.5,0.6557,8.2734,0.5556,0.0,3.9604,2.6,10.4918,44.5902,14.4262,0.0,22.9508,6.5574,0.6557
Qwen/Qwen1.5-1.8B,Qwen1.5,virtualhome,action_sequencing_v4,1.8,0.0,0.0,0.0,0.0,0.0,0.0,122.2951,18.3607,0.0,0.0,0.0,0.0,0.0
mistralai/Mixtral-8x7B-Instruct-v0.1,Mistral,virtualhome,action_sequencing_v4,46.7,29.1803,36.6906,21.1111,16.8919,27.2277,26.2,2.623,22.9508,2.9508,0.9836,42.623,1.9672,1.3115
01-ai/Yi-1.5-6B-Chat,Yi,virtualhome,action_sequencing_v4,6.1,13.9456,22.3827,14.7059,5.4054,15.9664,16.0,10.2041,31.6327,4.0816,0.0,36.0544,4.7619,1.7007
Qwen/Qwen1.5-7B,Qwen1.5,virtualhome,action_sequencing_v4,7.7,0.9836,4.3165,1.6667,0.0,2.4752,2.3,67.8689,25.2459,2.2951,0.0,11.1475,5.5738,0.0
Qwen/Qwen1.5-14B,Qwen1.5,virtualhome,action_sequencing_v4,14.2,7.541,8.9928,9.4444,6.7568,8.5809,12.5,30.4918,42.2951,2.623,0.6557,13.1148,3.6066,0.6557
Qwen/Qwen3-14B,Qwen3,virtualhome,action_sequencing_v4,14.8,0.3279,1.0791,0.0,0.6757,0.6601,0.3,99.0164,0.0,0.0,0.0,0.6557,0.0,0.0
Qwen/Qwen1.5-72B,Qwen1.5,virtualhome,action_sequencing_v4,72.3,3.9344,14.7482,3.8889,1.3514,8.2508,7.9,18.3607,47.541,5.9016,0.3279,20.9836,4.2623,0.6557
openai/gpt-oss-120b,GPT-OSS,virtualhome,action_sequencing_v4,120.4,75.7377,87.4101,75.5556,64.1892,78.2178,75.1,3.2787,3.9344,0.6557,0.9836,17.7049,0.0,4.2623
Qwen/Qwen1.5-4B,Qwen1.5,virtualhome,action_sequencing_v4,4.0,0.0,1.4388,0.0,0.0,0.6601,1.6,60.9836,45.9016,0.3279,0.0,3.9344,2.2951,0.0
meta-llama/Llama-3.1-70B-Instruct,Llama-3,virtualhome,action_sequencing_v4,70.6,67.541,71.5827,76.6667,51.3514,68.1518,73.1,0.0,7.2131,2.623,1.6393,14.7541,0.6557,2.623
Qwen/Qwen-72B,Qwen,virtualhome,action_sequencing_v4,72.3,9.5082,14.0288,10.5556,3.3784,10.396,14.8,4.5902,36.0656,8.1967,1.9672,31.1475,5.5738,2.623
Qwen/Qwen3-32B,Qwen3,virtualhome,action_sequencing_v4,32.8,46.2295,39.9281,56.1111,41.2162,45.0495,47.9,0.3279,15.4098,1.3115,0.6557,31.4754,2.9508,1.6393
Qwen/Qwen3-8B,Qwen3,virtualhome,action_sequencing_v4,8.2,0.6557,23.3813,0.0,0.0,10.7261,0.0,5.9016,20.6557,5.9016,0.0,65.2459,2.2951,0.0
meta-llama/Llama-4-Scout-17B-16E-Instruct,Llama,virtualhome,action_sequencing_v4,108.6,62.9508,75.5396,66.1111,52.7027,67.1617,63.3,0.0,8.5246,6.5574,3.6066,16.0656,1.9672,3.6066
deepseek-ai/DeepSeek-R1,DeepSeek-R1,virtualhome,action_sequencing_v4,684.5,57.0492,38.4892,69.4444,51.3514,50.8251,63.6,0.3279,21.3115,3.2787,0.9836,9.8361,0.6557,0.6557
meta-llama/Llama-3.2-1B-Instruct,Llama-3,virtualhome,action_sequencing_v4,1.2,0.0,2.1583,0.0,0.0,1.0526,0.0,28.8809,40.0722,12.2744,0.0,25.6318,0.361,0.0
deepseek-ai/DeepSeek-V3,DeepSeek,virtualhome,action_sequencing_v4,684.5,78.0328,80.2158,82.7778,70.2703,78.5479,84.3,0.0,5.5738,0.6557,0.0,9.1803,0.3279,0.6557
01-ai/Yi-1.5-34B-Chat,Yi,virtualhome,action_sequencing_v4,34.4,29.932,27.7978,45.8824,32.4324,34.1176,35.7,0.0,20.4082,7.483,1.0204,30.9524,4.4218,2.0408
meta-llama/Meta-Llama-3-70B-Instruct,Llama-3,virtualhome,action_sequencing_v4,70.6,57.0492,53.5971,68.3333,43.9189,55.6106,62.6,0.3279,20.0,9.1803,0.6557,6.5574,0.6557,3.9344
LGAI-EXAONE/EXAONE-3.5-32B-Instruct,Exaone,virtualhome,action_sequencing_v4,32.0,48.5246,50.7194,52.7778,50.0,51.1551,52.1,10.8197,18.6885,0.6557,2.2951,14.0984,1.3115,1.6393
openai/gpt-oss-20b,GPT-OSS,virtualhome,action_sequencing_v4,21.5,71.8033,83.8129,70.5556,54.0541,72.6073,74.8,11.8033,3.2787,0.0,0.0,10.4918,0.0,1.3115
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8,Llama,virtualhome,action_sequencing_v4,401.6,76.0656,88.8489,85.0,64.1892,81.6832,80.0,0.0,5.5738,0.3279,0.9836,11.4754,1.6393,4.5902
meta-llama/Meta-Llama-3-8B-Instruct,Llama-3,virtualhome,action_sequencing_v4,8.0,28.1967,34.1727,25.5556,13.5135,26.5677,32.8,0.0,38.0328,4.5902,0.0,24.2623,0.3279,1.9672
Qwen/Qwen3-1.7B,Qwen3,virtualhome,action_sequencing_v4,2.0,0.0,8.2734,0.0,0.0,3.7954,0.0,26.2295,48.8525,3.2787,0.0,16.7213,4.918,0.0
meta-llama/Llama-3.1-8B-Instruct,Llama-3,virtualhome,action_sequencing_v4,8.0,36.7213,53.2374,31.6667,28.3784,40.7591,36.7,0.0,20.3279,8.1967,5.2459,20.9836,8.5246,6.5574
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,DeepSeek-R1,virtualhome,action_sequencing_v4,14.8,23.2558,46.5455,30.6818,27.027,37.0618,28.6,2.3256,10.6312,2.6578,0.3322,55.1495,0.3322,1.9934
mistralai/Mistral-7B-Instruct-v0.2,Mistral,virtualhome,action_sequencing_v4,7.2,25.9016,26.6187,30.0,32.4324,29.0429,29.8,3.2787,20.3279,3.9344,0.3279,36.7213,5.5738,0.3279
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,DeepSeek-R1,virtualhome,action_sequencing_v4,32.8,49.3333,72.4265,46.0227,32.8671,54.9915,55.0,0.3333,14.6667,0.6667,0.3333,25.3333,3.6667,3.0
moonshotai/Kimi-K2-Instruct,Kimi,virtualhome,action_sequencing_v4,1000.0,76.7213,88.8489,80.5556,62.8378,80.033,82.3,2.2951,1.9672,0.3279,0.6557,11.8033,0.6557,0.9836
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,DeepSeek-R1,virtualhome,action_sequencing_v4,7.6,0.0,0.0,0.0,9.0909,1.2346,0.0,23.6842,39.4737,18.4211,0.0,2.6316,15.7895,0.0
LGAI-EXAONE/EXAONE-Deep-32B,Exaone,virtualhome,action_sequencing_v4,32.0,1.6393,28.4173,1.1111,0.0,13.3663,0.7,5.9016,6.2295,2.9508,0.0,75.082,9.1803,0.0
Qwen/Qwen-7B,Qwen,virtualhome,action_sequencing_v4,7.7,0.3279,0.0,0.5556,1.3514,0.495,0.3,85.2459,20.3279,2.623,0.3279,3.2787,0.6557,0.0
Qwen/Qwen3-4B,Qwen3,virtualhome,action_sequencing_v4,4.0,41.9672,53.5971,43.8889,45.9459,48.8449,48.9,0.0,2.2951,1.6393,0.0,34.4262,12.7869,1.6393
Qwen/Qwen3-235B-A22B-Thinking-2507,Qwen3,virtualhome,action_sequencing_v4,235.1,4.5902,7.554,3.8889,4.0541,5.6106,4.3,86.5574,7.2131,1.3115,0.0,9.1803,0.0,0.0
01-ai/Yi-Coder-9B-Chat,Yi,virtualhome,action_sequencing_v4,8.8,36.3934,51.4388,34.4444,22.973,39.4389,38.7,0.3279,19.0164,7.8689,1.9672,29.1803,2.9508,2.9508
