Model,Model Family,dataset,eval_type,Model Size (B),task_success_rate,state_goal,relation_goal,action_goal,total_goal,execution_success_rate,parsing_error,hallucination_error,predicate_argument_number_error,wrong_order_error,missing_step_error,affordance_error,additional_step_error
tiiuae/falcon-40b,falcon,virtualhome,action_sequencing,41.8,0.0,0.0,0.0,0.0,0.0,0.0,105.5921,0.9868,0.3289,0.0,0.3289,0.0,0.0
meta-llama/Llama-3.3-70B-Instruct,Llama-3,virtualhome,action_sequencing,70.6,61.9672,55.7554,77.7778,54.7297,62.0462,68.5,0.9836,15.7377,0.0,0.0,13.7705,0.9836,1.9672
baichuan-inc/Baichuan2-7B-Base,Baichuan,virtualhome,action_sequencing,7.0,0.6579,2.1583,0.5587,0.6757,1.3223,1.0,48.0263,38.1579,10.5263,0.3289,9.8684,0.9868,0.0
01-ai/Yi-Coder-1.5B-Chat,Yi,virtualhome,action_sequencing,1.5,0.6601,12.2744,0.0,0.0,5.6291,2.6,4.2904,45.5446,17.1617,0.0,28.3828,2.9703,0.33
Qwen/Qwen1.5-1.8B,Qwen1.5,virtualhome,action_sequencing,1.8,0.0,0.3597,0.0,0.0,0.165,0.0,22.2951,81.9672,0.3279,0.0,0.3279,0.0,0.0
mistralai/Mixtral-8x7B-Instruct-v0.1,Mistral,virtualhome,action_sequencing,46.7,30.8197,42.446,22.7778,18.9189,30.8581,30.8,2.623,22.2951,1.6393,0.9836,39.3443,2.2951,3.2787
google/gemma-3-12b-it,Gemma-3,virtualhome,action_sequencing,12.2,49.8361,69.4245,50.5556,33.7838,55.1155,51.8,0.0,4.5902,1.3115,0.0,38.3607,3.9344,8.1967
google/gemma-3-4b-it,Gemma-3,virtualhome,action_sequencing,4.3,40.3279,55.7554,43.3333,27.7027,45.2145,51.8,1.6393,5.9016,7.2131,2.2951,25.9016,5.2459,1.6393
ibm-granite/granite-3.2-2b-instruct,Granite,virtualhome,action_sequencing,2.5,4.5902,9.7122,7.7778,6.7568,8.4158,5.9,15.082,39.6721,3.6066,0.0,24.5902,12.1311,0.0
ibm-granite/granite-3.1-8b-instruct,Granite,virtualhome,action_sequencing,8.2,38.6885,47.8417,37.7778,29.0541,40.264,38.4,8.1967,13.4426,1.6393,0.3279,34.0984,3.9344,1.3115
google/gemma-7b-it,Gemma,virtualhome,action_sequencing,8.5,4.5902,16.1871,5.5556,0.6757,9.2409,7.2,24.2623,25.2459,3.9344,0.0,33.7705,5.5738,0.0
01-ai/Yi-1.5-6B-Chat,Yi,virtualhome,action_sequencing,6.1,10.8911,13.3574,18.9944,3.3784,12.5828,13.5,7.9208,33.6634,2.3102,0.0,43.8944,1.6502,0.33
Qwen/Qwen1.5-7B,Qwen1.5,virtualhome,action_sequencing,7.7,5.5738,20.5036,2.2222,2.027,10.5611,4.6,12.1311,36.0656,7.541,1.9672,26.8852,13.1148,11.4754
Qwen/Qwen1.5-14B,Qwen1.5,virtualhome,action_sequencing,14.2,18.6885,19.4245,23.8889,24.3243,21.9472,24.6,6.2295,32.1311,1.9672,0.9836,20.3279,13.7705,25.9016
bigcode/starcoder2-3b,starcoder2,virtualhome,action_sequencing,3.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
google/gemma-2b-it,Gemma-2,virtualhome,action_sequencing,2.5,5.2459,4.3165,6.6667,10.1351,6.4356,7.5,9.5082,69.1803,0.9836,0.0,12.459,0.6557,0.0
Qwen/Qwen3-14B,Qwen3,virtualhome,action_sequencing,14.8,45.5738,55.7554,41.1111,14.1892,41.2541,43.0,43.6066,3.9344,0.3279,6.5574,4.5902,0.0,19.3443
google/gemma-2-9b,Gemma-2,virtualhome,action_sequencing,9.2,1.3115,3.5971,1.6667,0.6757,2.3102,1.3,45.5738,47.8689,0.0,0.3279,4.5902,0.6557,6.5574
Qwen/Qwen1.5-72B,Qwen1.5,virtualhome,action_sequencing,72.3,6.8852,10.7914,8.3333,4.7297,8.5809,11.1,0.3279,33.7705,13.4426,0.0,35.082,6.2295,0.6557
openai/gpt-oss-120b,GPT-OSS,virtualhome,action_sequencing,120.4,74.0984,87.4101,78.3333,62.1622,78.5479,79.3,0.6557,2.9508,0.0,0.0,17.377,0.0,2.9508
tiiuae/falcon-11B,falcon,virtualhome,action_sequencing,11.1,1.3158,1.7986,1.676,1.3514,1.6529,3.0,49.0132,36.8421,0.9868,0.6579,10.5263,1.3158,17.4342
Qwen/Qwen1.5-4B,Qwen1.5,virtualhome,action_sequencing,4.0,2.9508,7.554,2.2222,0.0,4.1254,3.0,21.3115,63.2787,2.623,0.9836,9.8361,8.5246,0.9836
Qwen/Qwen-14B,Qwen,virtualhome,action_sequencing,14.2,1.6393,1.0791,2.7778,3.3784,2.1452,2.0,96.7213,3.2787,0.6557,0.0,3.2787,2.623,0.0
gpt-4.1-mini-2025-04-14,gpt-4.1-mini-2025-04-14,virtualhome,action_sequencing,,73.7705,87.4101,72.7778,57.4324,75.7426,80.0,0.3279,3.6066,1.3115,0.6557,12.7869,1.3115,1.6393
bigcode/starcoderbase-7b,starcoder,virtualhome,action_sequencing,15.5,0.0,0.0,0.0,0.0,0.0,0.0,100.3279,0.0,0.0,0.0,0.0,0.0,0.0
Qwen/Qwen-72B,Qwen,virtualhome,action_sequencing,72.3,15.082,15.1079,20.5556,16.2162,16.9967,22.0,3.6066,27.8689,6.8852,0.3279,31.1475,8.1967,0.6557
Qwen/Qwen1.5-32B,Qwen1.5,virtualhome,action_sequencing,32.5,28.5246,52.8777,30.0,14.1892,36.6337,28.9,12.7869,5.2459,5.2459,0.9836,41.3115,5.5738,4.5902
meta-llama/llama3_8B_o4-mini-2025-04-16,Llama,virtualhome,action_sequencing,,72.459,85.6115,70.0,64.1892,75.7426,83.9,0.3279,4.2623,0.0,0.3279,11.1475,0.0,4.918
microsoft/Phi-3-mini-128k-instruct,phi,virtualhome,action_sequencing,3.8,11.8812,18.7726,10.6145,12.1622,14.7351,15.5,2.3102,42.5743,6.2706,0.0,30.033,3.3003,1.3201
ibm-granite/granite-3.3-2b-base,Granite,virtualhome,action_sequencing,2.5,1.3115,3.9568,1.1111,1.3514,2.4752,1.3,19.3443,58.6885,3.2787,0.3279,14.0984,2.9508,11.8033
Qwen/Qwen3-32B,Qwen3,virtualhome,action_sequencing,32.8,51.4754,62.5899,45.0,25.0,48.1848,48.5,28.1967,8.1967,0.0,6.8852,8.1967,0.0,19.3443
01-ai/Yi-1.5-9B,Yi,virtualhome,action_sequencing,8.8,24.7525,36.8231,27.933,18.2432,29.6358,28.4,0.6601,15.8416,3.3003,1.3201,38.9439,11.8812,3.6304
deepseek-ai/deepseek-coder-33b-instruct,DeepSeek-Coder,virtualhome,action_sequencing,33.3,11.5512,16.6065,13.4078,12.8378,14.7351,15.8,5.2805,22.7723,15.8416,1.6502,35.6436,3.9604,4.9505
deepseek-ai/deepseek-coder-7b-base-v1.5,DeepSeek-Coder,virtualhome,action_sequencing,6.9,3.2787,14.3885,3.8889,0.6757,7.9208,1.3,23.9344,23.9344,15.7377,0.6557,26.2295,9.1803,3.6066
bigcode/starcoderbase-3b,starcoder,virtualhome,action_sequencing,15.5,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
gpt-4.1-nano-2025-04-14,gpt-4.1-nano-2025-04-14,virtualhome,action_sequencing,,33.4426,27.3381,37.2222,29.0541,30.6931,45.6,4.2623,26.8852,0.6557,0.0,21.9672,0.6557,0.3279
microsoft/Phi-3-medium-128k-instruct,phi,virtualhome,action_sequencing,14.0,24.7525,40.4332,20.1117,22.973,30.1325,36.0,0.0,19.802,5.6106,0.33,32.6733,5.6106,4.2904
meta-llama/Llama-2-7b-hf,Llama-2,virtualhome,action_sequencing,6.7,0.0,0.0,0.0,0.0,0.0,0.0,86.1386,15.1815,0.0,0.0,0.33,0.0,0.0
deepseek-ai/deepseek-coder-6.7b-base,DeepSeek-Coder,virtualhome,action_sequencing,6.7,0.9901,6.1372,1.1173,0.0,3.1457,0.7,46.8647,34.9835,1.6502,0.33,11.8812,3.6304,1.9802
bigcode/starcoder2-7b,starcoder2,virtualhome,action_sequencing,7.2,0.0,0.0,0.0,0.0,0.0,0.0,100.3279,0.0,0.0,0.0,0.0,0.0,0.0
microsoft/phi-1_5,phi,virtualhome,action_sequencing,1.4,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
Qwen/Qwen3-8B,Qwen3,virtualhome,action_sequencing,8.2,45.9016,55.7554,47.7778,29.0541,46.8647,33.1,29.1803,6.8852,0.0,12.7869,17.7049,0.3279,27.8689
google/gemma-2-27b-it,Gemma-2,virtualhome,action_sequencing,27.2,63.2787,82.0144,58.8889,43.9189,65.8416,71.8,0.0,2.2951,4.2623,1.6393,20.0,0.0,2.9508
google/gemma-1.1-2b-it,Gemma,virtualhome,action_sequencing,2.5,0.0,0.0,0.0,0.0,0.0,0.0,5.9016,94.0984,0.0,0.0,0.0,0.0,0.0
meta-llama/Llama-4-Scout-17B-16E-Instruct,Llama,virtualhome,action_sequencing,108.6,65.9016,74.8201,72.2222,54.7297,69.1419,63.6,1.9672,10.8197,6.8852,2.623,13.7705,1.3115,35.7377
gpt-4.1-2025-04-14,gpt-4.1-2025-04-14,virtualhome,action_sequencing,,72.459,90.6475,76.1111,58.7838,78.5479,83.0,0.0,1.3115,0.3279,0.0,15.082,0.3279,1.9672
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,DeepSeek-R1,virtualhome,action_sequencing,70.6,51.1475,61.1511,56.1111,27.027,51.3201,50.5,25.2459,2.2951,0.0,4.5902,17.377,0.0,10.8197
google/gemma-2b,Gemma-2,virtualhome,action_sequencing,2.5,0.0,1.0791,0.0,0.0,0.495,0.0,98.3607,1.3115,0.0,0.0,0.6557,0.0,0.0
ibm-granite/granite-3.1-2b-base,Granite,virtualhome,action_sequencing,2.5,1.3115,2.8777,0.5556,0.6757,1.6502,1.6,18.0328,67.2131,1.9672,0.0,8.5246,2.9508,16.3934
o4-mini-2025-04-16,o4-mini-2025-04-16,virtualhome,action_sequencing,,75.082,93.1655,81.6667,60.1351,81.6832,83.6,0.3279,1.6393,0.3279,0.0,14.0984,0.0,2.623
bigcode/starcoder2-15b,starcoder2,virtualhome,action_sequencing,16.0,0.0,0.0,0.0,0.0,0.0,0.0,99.6721,0.3279,0.0,0.0,0.0,0.0,0.0
meta-llama/Llama-3.2-3B,Llama-3,virtualhome,action_sequencing,3.2,0.3279,2.518,1.1111,0.0,1.4851,0.3,61.6393,35.4098,0.3279,0.6557,3.2787,0.6557,1.9672
deepseek-ai/DeepSeek-R1,DeepSeek-R1,virtualhome,action_sequencing,684.5,36.7213,43.5252,33.3333,12.1622,32.8383,29.5,43.6066,18.6885,0.0,8.1967,2.623,0.0,40.3279
google/gemma-2-2b-it,Gemma-2,virtualhome,action_sequencing,2.6,10.4918,26.259,13.8889,0.6757,16.3366,21.3,6.5574,20.6557,0.9836,0.0,47.2131,4.5902,0.0
Qwen/Qwen3-0.6B,Qwen3,virtualhome,action_sequencing,0.8,0.3279,3.2374,0.5556,0.0,1.6502,0.3,47.8689,26.2295,1.3115,0.0,24.2623,0.3279,0.0
google/gemma-1.1-7b-it,Gemma,virtualhome,action_sequencing,8.5,9.1803,17.2662,5.0,8.7838,11.5512,10.8,23.2787,11.8033,1.9672,0.0,46.5574,5.5738,0.6557
Qwen/Qwen2.5-14B,Qwen2.5,virtualhome,action_sequencing,14.8,25.9016,42.8058,22.2222,22.2973,31.6832,25.6,16.0656,26.5574,1.3115,3.9344,23.9344,3.6066,2.623
ibm-granite/granite-3.1-2b-instruct,Granite,virtualhome,action_sequencing,2.5,4.918,14.3885,8.3333,2.7027,9.736,4.3,12.7869,40.0,3.2787,0.3279,22.623,16.7213,0.6557
microsoft/Phi-3-medium-4k-instruct,phi,virtualhome,action_sequencing,14.0,31.0231,47.6534,21.2291,25.6757,34.4371,34.0,2.9703,15.8416,8.9109,0.6601,25.7426,12.8713,4.9505
ibm-granite/granite-3.2-8b-instruct,Granite,virtualhome,action_sequencing,8.2,37.7049,52.8777,34.4444,31.7568,42.2442,39.7,5.2459,15.7377,1.3115,0.3279,33.1148,4.5902,3.2787
Qwen/Qwen2.5-72B,Qwen2.5,virtualhome,action_sequencing,72.7,33.7705,43.1655,32.2222,24.3243,35.3135,37.7,12.1311,38.6885,0.9836,0.0,10.1639,0.3279,19.0164
deepseek-ai/DeepSeek-V3,DeepSeek,virtualhome,action_sequencing,684.5,78.3607,79.8561,84.4444,72.2973,79.3729,85.9,0.0,6.2295,0.6557,0.0,6.5574,0.6557,1.9672
meta-llama/Meta-Llama-3-70B,Llama-3,virtualhome,action_sequencing,70.6,27.8689,34.5324,31.1111,19.5946,29.868,27.9,18.3607,21.3115,0.6557,3.9344,26.5574,1.9672,29.5082
Qwen/Qwen2.5-32B,Qwen2.5,virtualhome,action_sequencing,32.8,42.9508,47.482,47.2222,33.7838,44.0594,44.6,7.2131,27.541,2.2951,3.2787,12.1311,3.2787,11.4754
google/gemma-2-27b,Gemma-2,virtualhome,action_sequencing,27.2,13.7705,25.8993,13.3333,7.4324,17.6568,17.7,43.9344,10.1639,3.9344,1.9672,23.2787,0.3279,24.5902
01-ai/Yi-34B,Yi,virtualhome,action_sequencing,34.4,1.6502,1.083,1.1173,0.0,0.8278,1.0,40.5941,46.5347,1.9802,0.0,9.901,0.6601,4.2904
ibm-granite/granite-3.1-8b-base,Granite,virtualhome,action_sequencing,8.2,8.5246,14.7482,6.6667,6.0811,10.231,10.2,55.082,15.082,0.6557,1.6393,16.0656,1.3115,14.7541
meta-llama/Llama-3.2-1B,Llama-3,virtualhome,action_sequencing,1.2,0.0,0.7194,0.0,0.0,0.33,0.0,42.2951,56.3934,0.6557,0.0,1.3115,0.0,0.0
01-ai/Yi-1.5-34B-Chat,Yi,virtualhome,action_sequencing,34.4,38.2838,28.8809,57.5419,35.1351,38.9073,38.9,0.0,20.7921,6.9307,1.3201,26.0726,5.9406,1.3201
01-ai/Yi-1.5-34B,Yi,virtualhome,action_sequencing,34.4,6.9307,18.0505,8.9385,0.6757,11.0927,6.3,1.6502,24.7525,3.9604,0.33,59.0759,3.9604,0.9901
deepseek-ai/deepseek-coder-1.3b-instruct,DeepSeek-Coder,virtualhome,action_sequencing,1.3,1.3201,7.5812,0.0,0.0,3.4768,0.7,12.5413,58.4158,3.6304,0.0,9.2409,15.5116,0.0
meta-llama/Meta-Llama-3-70B-Instruct,Llama-3,virtualhome,action_sequencing,70.6,60.6557,55.036,68.8889,45.9459,56.9307,65.9,0.0,18.6885,8.1967,0.3279,5.9016,0.9836,3.9344
deepseek-ai/DeepSeek-R1-Distill-Llama-8B,DeepSeek-R1,virtualhome,action_sequencing,8.0,24.2623,36.3309,21.1111,14.8649,26.5677,26.9,7.541,25.2459,0.3279,1.3115,35.4098,3.6066,13.7705
tiiuae/Falcon3-10B-Base,falcon,virtualhome,action_sequencing,10.3,21.7822,40.4332,23.4637,20.9459,30.6291,22.4,7.2607,31.3531,0.33,3.6304,31.3531,4.6205,68.6469
LGAI-EXAONE/EXAONE-3.5-32B-Instruct,Exaone,virtualhome,action_sequencing,32.0,51.8033,69.0647,50.0,47.2973,58.0858,61.0,0.0,16.0656,0.6557,2.2951,19.0164,0.9836,3.9344
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,DeepSeek-R1,virtualhome,action_sequencing,1.8,0.9836,2.1583,1.6667,0.0,1.4851,2.3,46.8852,42.9508,0.3279,0.0,7.8689,2.2951,0.3279
openai/gpt-oss-20b,GPT-OSS,virtualhome,action_sequencing,21.5,67.2131,74.4604,66.1111,47.973,65.5116,68.9,13.1148,4.2623,0.6557,0.0,13.1148,0.0,0.6557
Qwen/Qwen1.5-110B,Qwen1.5,virtualhome,action_sequencing,111.2,35.7377,46.7626,46.1111,33.1081,43.2343,40.0,0.0,14.7541,4.2623,6.2295,20.9836,13.7705,4.918
bigcode/starcoderbase-1b,starcoder,virtualhome,action_sequencing,15.5,0.0,0.0,0.0,0.0,0.0,0.0,100.3279,0.0,0.0,0.0,0.0,0.0,0.0
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8,Llama,virtualhome,action_sequencing,401.6,74.4262,88.4892,78.8889,60.1351,78.7129,73.4,0.3279,11.1475,0.6557,4.2623,9.1803,0.9836,58.3607
meta-llama/Meta-Llama-3-8B-Instruct,Llama-3,virtualhome,action_sequencing,8.0,22.2951,27.6978,22.7778,10.8108,22.1122,26.2,0.0,36.3934,7.8689,0.3279,28.1967,0.9836,0.9836
Qwen/Qwen2.5-3B,Qwen2.5,virtualhome,action_sequencing,3.1,0.0,0.3597,0.0,0.0,0.165,0.0,18.0328,71.1475,8.5246,0.0,2.623,0.0,0.0
tiiuae/falcon-7b,falcon,virtualhome,action_sequencing,7.2,0.0,0.0,0.0,0.0,0.0,0.0,106.5789,1.9737,0.0,0.0,0.0,0.0,0.0
Qwen/Qwen3-1.7B,Qwen3,virtualhome,action_sequencing,2.0,2.9508,12.5899,5.0,0.0,7.2607,1.3,58.6885,20.0,0.0,2.2951,18.0328,0.0,0.0
meta-llama/Llama-2-13b-hf,Llama-2,virtualhome,action_sequencing,13.0,0.0,1.083,0.0,0.0,0.4967,1.3,64.6865,25.7426,1.9802,0.0,7.2607,1.3201,0.33
tiiuae/Falcon3-7B-Base,falcon,virtualhome,action_sequencing,7.5,8.9109,22.0217,8.3799,8.1081,14.5695,6.9,13.8614,32.6733,5.6106,1.6502,33.0033,6.6007,2.6403
ibm-granite/granite-3.3-2b-instruct,Granite,virtualhome,action_sequencing,2.5,1.6393,17.9856,2.2222,0.0,8.9109,12.8,10.4918,16.0656,6.2295,0.0,42.2951,12.459,0.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,DeepSeek-R1,virtualhome,action_sequencing,14.8,47.8689,61.8705,48.3333,25.6757,49.0099,53.4,26.2295,3.2787,0.0,1.3115,15.082,0.6557,6.5574
mistralai/Mistral-7B-Instruct-v0.2,Mistral,virtualhome,action_sequencing,7.2,17.1053,26.6187,11.7318,6.0811,17.1901,23.7,4.9342,30.2632,2.6316,0.0,39.8026,0.3289,22.3684
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,DeepSeek-R1,virtualhome,action_sequencing,32.8,17.0492,22.6619,16.6667,16.8919,19.4719,17.0,69.5082,1.9672,0.3279,2.9508,6.5574,1.6393,36.7213
baichuan-inc/Baichuan-7B,Baichuan,virtualhome,action_sequencing,,0.0,0.0,0.0,0.0,0.0,0.0,79.2642,37.1237,0.3344,0.0,1.0033,0.0,0.0
moonshotai/Kimi-K2-Instruct,Kimi,virtualhome,action_sequencing,1000.0,75.082,89.9281,80.5556,58.7838,79.538,82.0,0.0,1.6393,0.0,0.0,15.082,1.3115,0.9836
01-ai/Yi-Coder-1.5B,Yi,virtualhome,action_sequencing,1.5,1.9802,4.6931,2.7933,3.3784,3.8079,2.6,56.1056,25.7426,4.9505,0.0,10.5611,1.6502,9.571
google/gemma-2-9b-it,Gemma-2,virtualhome,action_sequencing,9.2,52.1311,69.7842,51.6667,41.2162,57.4257,54.1,30.8197,2.2951,3.6066,0.9836,20.3279,3.2787,4.2623
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,DeepSeek-R1,virtualhome,action_sequencing,7.6,13.4426,28.0576,14.4444,6.0811,18.6469,14.8,46.2295,5.5738,1.9672,2.9508,24.2623,5.2459,3.6066
LGAI-EXAONE/EXAONE-Deep-32B,Exaone,virtualhome,action_sequencing,32.0,27.2131,41.3669,28.3333,15.5405,31.1881,19.7,20.6557,23.2787,0.6557,8.1967,37.7049,0.0,51.4754
01-ai/Yi-1.5-6B,Yi,virtualhome,action_sequencing,6.1,2.6403,5.0542,1.676,2.7027,3.4768,2.0,40.264,46.2046,0.0,0.33,11.2211,3.9604,0.6601
baichuan-inc/Baichuan2-7B-Chat,Baichuan,virtualhome,action_sequencing,7.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.6579,0.0,0.0,0.0,0.0,0.0
microsoft/phi-4,phi,virtualhome,action_sequencing,14.7,55.4098,72.3022,64.4444,41.2162,62.3762,58.0,0.6557,4.2623,2.623,0.6557,31.8033,2.2951,1.6393
Qwen/Qwen-7B,Qwen,virtualhome,action_sequencing,7.7,0.3279,1.0791,0.5556,0.0,0.6601,0.0,90.8197,13.4426,0.0,0.0,0.6557,0.3279,0.0
google/gemma-3-12b-pt,Gemma-3,virtualhome,action_sequencing,12.2,6.8852,17.9856,7.2222,4.7297,11.5512,8.5,36.7213,20.3279,1.9672,1.9672,31.4754,1.6393,4.918
Qwen/Qwen2.5-1.5B,Qwen2.5,virtualhome,action_sequencing,1.5,0.0,0.0,0.0,0.0,0.0,0.0,88.1967,11.4754,0.0,0.0,0.3279,0.3279,0.0
meta-llama/Meta-Llama-3-8B,Llama-3,virtualhome,action_sequencing,8.0,9.5082,20.1439,7.2222,4.7297,12.5413,9.5,14.0984,33.1148,7.2131,1.6393,29.1803,6.2295,6.5574
google/gemma-3-27b-it,Gemma-3,virtualhome,action_sequencing,27.4,69.5082,80.9353,70.0,58.7838,72.2772,75.4,0.0,3.9344,0.0,0.0,19.6721,0.9836,3.6066
Qwen/Qwen3-4B,Qwen3,virtualhome,action_sequencing,4.0,29.5082,41.0072,25.0,6.7568,27.8878,26.2,63.6066,1.6393,0.6557,5.2459,2.623,0.0,15.082
Qwen/Qwen2.5-7B,Qwen2.5,virtualhome,action_sequencing,7.6,21.3115,28.0576,21.1111,21.6216,24.4224,22.6,12.1311,30.8197,3.2787,0.6557,23.6066,8.1967,2.623
Qwen/Qwen3-235B-A22B-Thinking-2507,Qwen3,virtualhome,action_sequencing,235.1,29.8361,35.2518,25.0,5.4054,24.9175,25.2,57.7049,11.4754,0.0,4.918,0.9836,0.0,20.9836
bigcode/starcoderbase,starcoder,virtualhome,action_sequencing,15.5,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.3279,0.0,0.0,0.0,0.0,0.0
ibm-granite/granite-3.3-8b-instruct,Granite,virtualhome,action_sequencing,8.2,35.082,41.3669,38.3333,35.8108,39.1089,40.7,1.9672,15.4098,6.5574,0.3279,24.918,10.1639,1.6393
google/gemma-3-4b-pt,Gemma-3,virtualhome,action_sequencing,4.3,1.6393,6.1151,0.5556,0.0,2.9703,1.3,73.1148,12.7869,1.3115,0.0,12.7869,1.3115,0.3279
01-ai/Yi-Coder-9B-Chat,Yi,virtualhome,action_sequencing,8.8,42.9043,59.2058,44.1341,30.4054,47.6821,46.5,0.33,13.2013,5.6106,3.3003,28.7129,2.3102,1.3201
meta-llama/Llama-3.1-70B,Llama-3,virtualhome,action_sequencing,70.6,28.8525,36.6906,33.8889,17.5676,31.1881,30.5,26.5574,20.6557,0.9836,2.2951,19.0164,1.9672,90.8197
google/gemma-2-2b,Gemma-2,virtualhome,action_sequencing,2.6,0.6557,0.3597,1.6667,2.7027,1.3201,2.0,71.8033,31.4754,1.3115,0.0,3.9344,0.6557,0.0
deepseek-ai/deepseek-coder-33b-base,DeepSeek-Coder,virtualhome,action_sequencing,33.3,1.3201,4.6931,2.2346,0.0,2.8146,0.7,32.3432,55.7756,0.9901,0.9901,8.5809,1.3201,1.3201
google/gemma-7b,Gemma,virtualhome,action_sequencing,8.5,1.3115,4.3165,0.5556,1.3514,2.4752,2.3,55.4098,28.8525,0.9836,0.0,10.8197,1.6393,0.3279
deepseek-ai/deepseek-coder-6.7b-instruct,DeepSeek-Coder,virtualhome,action_sequencing,6.7,6.6007,23.1047,7.2626,8.7838,14.9007,6.9,24.4224,17.4917,3.6304,0.6601,47.1947,5.6106,1.3201
01-ai/Yi-Coder-9B,Yi,virtualhome,action_sequencing,8.8,14.1914,15.1625,18.4358,10.1351,14.9007,15.2,53.4653,23.7624,1.3201,0.6601,7.2607,0.6601,0.6601
deepseek-ai/deepseek-coder-7b-instruct-v1.5,DeepSeek-Coder,virtualhome,action_sequencing,6.9,0.6557,17.2662,0.5556,0.6757,8.2508,0.7,2.9508,15.082,35.7377,0.0,43.6066,1.9672,0.0
ibm-granite/granite-3.3-8b-base,Granite,virtualhome,action_sequencing,8.2,16.7213,19.0647,16.6667,21.6216,18.9769,26.2,45.9016,5.9016,0.9836,1.9672,13.4426,5.5738,1.3115
meta-llama/Llama-2-70b-hf,Llama-2,virtualhome,action_sequencing,69.0,5.6106,10.4693,4.4693,2.027,6.6225,4.6,55.4455,24.0924,2.3102,0.33,10.8911,2.9703,0.33
deepseek-ai/deepseek-coder-1.3b-base,DeepSeek-Coder,virtualhome,action_sequencing,1.3,0.33,1.444,0.0,0.0,0.6623,0.0,45.2145,46.5347,1.6502,0.0,8.2508,0.6601,0.0
01-ai/Yi-6B,Yi,virtualhome,action_sequencing,6.1,0.0,0.0,0.0,0.0,0.0,0.0,86.7987,11.2211,0.0,0.0,1.9802,0.33,0.0
Qwen/Qwen2.5-0.5B,Qwen2.5,virtualhome,action_sequencing,0.5,0.0,0.0,0.0,0.0,0.0,0.0,93.7705,15.082,0.0,0.0,0.0,0.0,0.0
microsoft/Phi-3-mini-4k-instruct,phi,virtualhome,action_sequencing,3.8,13.8614,22.0217,12.2905,6.7568,15.3974,17.8,17.4917,15.5116,12.8713,0.0,23.4323,12.8713,0.33
