Model,Model Family,dataset,eval_type,Model Size (B),task_success_rate,state_goal,relation_goal,action_goal,total_goal,execution_success_rate,parsing_error,hallucination_error,predicate_argument_number_error,wrong_order_error,missing_step_error,affordance_error,additional_step_error,Pretraining Data Size (T),FLOPs (1E21),Average,BBH,MATH Lvl 5,GPQA,MUSR,MMLU-PRO,IFEval
tiiuae/falcon-40b,falcon,virtualhome,action_sequencing,41.8,0.0,0.0,0.0,0.0,0.0,0.0,105.5921,0.9868,0.3289,0.0,0.3289,0.0,0.0,1.0,240.0,11.40130446230009,16.583304730312175,1.812688821752266,3.1319910514541416,5.193229166666668,16.722074468085104,24.964538535530174
meta-llama/Llama-3.3-70B-Instruct,Llama-3,virtualhome,action_sequencing,70.6,61.9672,55.7554,77.7778,54.7297,62.0462,68.5,0.9836,15.7377,0.0,0.0,13.7705,0.9836,1.9672,15.0,6353.999999999999,44.84747145129876,56.561410788022194,48.338368580060425,10.514541387024611,15.565624999999999,48.12906323877069,89.97581971391463
baichuan-inc/Baichuan2-7B-Base,Baichuan,virtualhome,action_sequencing,7.0,0.6579,2.1583,0.5587,0.6757,1.3223,1.0,48.0263,38.1579,10.5263,0.3289,9.8684,0.9868,0.0,2.6,109.20000000000002,,,,,,,
01-ai/Yi-Coder-1.5B-Chat,Yi,virtualhome,action_sequencing,1.5,0.6601,12.2744,0.0,0.0,5.6291,2.6,4.2904,45.5446,17.1617,0.0,28.3828,2.9703,0.33,2.4,21.6,,,,,,,
Qwen/Qwen1.5-1.8B,Qwen1.5,virtualhome,action_sequencing,1.8,0.0,0.3597,0.0,0.0,0.165,0.0,22.2951,81.9672,0.3279,0.0,0.3279,0.0,0.0,2.4,25.92,9.269492522098927,9.759901587727937,3.1722054380664653,7.38255033557047,3.963802083333334,9.79609929078014,21.542396397115212
mistralai/Mixtral-8x7B-Instruct-v0.1,Mistral,virtualhome,action_sequencing,46.7,30.8197,42.446,22.7778,18.9189,30.8581,30.8,2.623,22.2951,1.6393,0.9836,39.3443,2.2951,3.2787,,,23.8171027058463,29.742398380967334,9.138972809667674,7.046979865771815,11.073697916666667,29.909131205673756,55.991436056330535
google/gemma-3-12b-it,Gemma-3,virtualhome,action_sequencing,12.2,49.8361,69.4245,50.5556,33.7838,55.1155,51.8,0.0,4.5902,1.3115,0.0,38.3607,3.9344,8.1967,12.0,878.4,,,,,,,
google/gemma-3-4b-it,Gemma-3,virtualhome,action_sequencing,4.3,40.3279,55.7554,43.3333,27.7027,45.2145,51.8,1.6393,5.9016,7.2131,2.2951,25.9016,5.2459,1.6393,4.0,103.2,,,,,,,
ibm-granite/granite-3.2-2b-instruct,Granite,virtualhome,action_sequencing,2.5,4.5902,9.7122,7.7778,6.7568,8.4158,5.9,15.082,39.6721,3.6066,0.0,24.5902,12.1311,0.0,12.0,180.0,21.25014812377563,21.668268416036614,14.425981873111782,5.369127516778524,4.704947916666668,19.815676713947987,61.51688630611223
ibm-granite/granite-3.1-8b-instruct,Granite,virtualhome,action_sequencing,8.2,38.6885,47.8417,37.7778,29.0541,40.264,38.4,8.1967,13.4426,1.6393,0.3279,34.0984,3.9344,1.3115,12.0,590.4,30.6030430081627,34.089655299414055,21.978851963746223,8.277404921700223,19.00520833333333,28.191489361702125,72.07564816908027
google/gemma-7b-it,Gemma,virtualhome,action_sequencing,8.5,4.5902,16.1871,5.5556,0.6757,9.2409,7.2,24.2623,25.2459,3.9344,0.0,33.7705,5.5738,0.0,2.0,102.0,13.067087110466217,11.940832085290182,2.9456193353474323,4.5861297539149914,12.528385416666667,7.7183067375886525,38.68324933398937
01-ai/Yi-1.5-6B-Chat,Yi,virtualhome,action_sequencing,6.1,10.8911,13.3574,18.9944,3.3784,12.5828,13.5,7.9208,33.6634,2.3102,0.0,43.8944,1.6502,0.33,3.6,131.76,22.784006289829847,23.67872313235784,16.238670694864048,6.935123042505594,14.030468750000002,24.368351063829788,51.452701055421834
Qwen/Qwen1.5-7B,Qwen1.5,virtualhome,action_sequencing,7.7,5.5738,20.5036,2.2222,2.027,10.5611,4.6,12.1311,36.0656,7.541,1.9672,26.8852,13.1148,11.4754,4.0,168.0,16.024674155407357,23.075768754340448,9.290030211480364,6.487695749440718,9.158333333333333,21.293218085106382,26.842998798742894
Qwen/Qwen1.5-14B,Qwen1.5,virtualhome,action_sequencing,14.2,18.6885,19.4245,23.8889,24.3243,21.9472,24.6,6.2295,32.1311,1.9672,0.9836,20.3279,13.7705,25.9016,4.0,336.0,20.854080062460586,30.063103282917453,20.241691842900302,5.92841163310962,10.464062500000002,29.373522458628837,29.05368865720732
bigcode/starcoder2-3b,starcoder2,virtualhome,action_sequencing,3.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,3.3,59.4,6.549147626379535,8.909299421083569,1.5105740181268883,0.0,1.432291666666666,7.0718823877068555,20.370838264693234
google/gemma-2b-it,Gemma-2,virtualhome,action_sequencing,2.5,5.2459,4.3165,6.6667,10.1351,6.4356,7.5,9.5082,69.1803,0.9836,0.0,12.459,0.6557,0.0,6.0,90.0,7.485804130315127,5.214303022163619,2.0392749244712993,3.8031319910514525,3.0322916666666675,3.9228723404255303,26.902950837112197
Qwen/Qwen3-14B,Qwen3,virtualhome,action_sequencing,14.8,45.5738,55.7554,41.1111,14.1892,41.2541,43.0,43.6066,3.9344,0.3279,6.5574,4.5902,0.0,19.3443,36.0,3196.8,,,,,,,
google/gemma-2-9b,Gemma-2,virtualhome,action_sequencing,9.2,1.3115,3.5971,1.6667,0.6757,2.3102,1.3,45.5738,47.8689,0.0,0.3279,4.5902,0.6557,6.5574,8.0,441.6,21.205286776100692,34.09681853589784,13.444108761329304,10.514541387024611,14.297656250000001,34.48027482269504,20.398320899657357
Qwen/Qwen1.5-72B,Qwen1.5,virtualhome,action_sequencing,72.3,6.8852,10.7914,8.3333,4.7297,8.5809,11.1,0.3279,33.7705,13.4426,0.0,35.082,6.2295,0.6557,3.0,1296.0,,,,,,,
openai/gpt-oss-120b,GPT-OSS,virtualhome,action_sequencing,120.4,74.0984,87.4101,78.3333,62.1622,78.5479,79.3,0.6557,2.9508,0.0,0.0,17.377,0.0,2.9508,,,,,,,,,
tiiuae/falcon-11B,falcon,virtualhome,action_sequencing,11.1,1.3158,1.7986,1.676,1.3514,1.6529,3.0,49.0132,36.8421,0.9868,0.6579,10.5263,1.3158,17.4342,5.0,333.0,13.851902586180215,21.937999462890275,2.794561933534743,2.796420581655479,7.530729166666667,15.438460401891252,32.613243970442866
Qwen/Qwen1.5-4B,Qwen1.5,virtualhome,action_sequencing,4.0,2.9508,7.554,2.2222,0.0,4.1254,3.0,21.3115,63.2787,2.623,0.9836,9.8361,8.5246,0.9836,2.4,57.6,11.76818275851784,16.249142581095292,5.287009063444108,3.5794183445190177,4.8226562500000005,16.22340425531915,24.447466056729475
Qwen/Qwen-14B,Qwen,virtualhome,action_sequencing,14.2,1.6393,1.0791,2.7778,3.3784,2.1452,2.0,96.7213,3.2787,0.6557,0.0,3.2787,2.623,0.0,3.0,252.0,,,,,,,
gpt-4.1-mini-2025-04-14,gpt-4.1-mini-2025-04-14,virtualhome,action_sequencing,,73.7705,87.4101,72.7778,57.4324,75.7426,80.0,0.3279,3.6066,1.3115,0.6557,12.7869,1.3115,1.6393,,,,,,,,,
bigcode/starcoderbase-7b,starcoder,virtualhome,action_sequencing,15.5,0.0,0.0,0.0,0.0,0.0,0.0,100.3279,0.0,0.0,0.0,0.0,0.0,0.0,1.0,42.0,,,,,,,
Qwen/Qwen-72B,Qwen,virtualhome,action_sequencing,72.3,15.082,15.1079,20.5556,16.2162,16.9967,22.0,3.6066,27.8689,6.8852,0.3279,31.1475,8.1967,0.6557,3.0,1296.0,,,,,,,
Qwen/Qwen1.5-32B,Qwen1.5,virtualhome,action_sequencing,32.5,28.5246,52.8777,30.0,14.1892,36.6337,28.9,12.7869,5.2459,5.2459,0.9836,41.3115,5.5738,4.5902,4.0,768.0,27.2987558571606,38.980351633108974,30.28700906344411,10.626398210290827,12.040625000000004,38.88519503546098,32.97295620065869
meta-llama/llama3_8B_o4-mini-2025-04-16,Llama,virtualhome,action_sequencing,,72.459,85.6115,70.0,64.1892,75.7426,83.9,0.3279,4.2623,0.0,0.3279,11.1475,0.0,4.918,,,,,,,,,
microsoft/Phi-3-mini-128k-instruct,phi,virtualhome,action_sequencing,3.8,11.8812,18.7726,10.6145,12.1622,14.7351,15.5,2.3102,42.5743,6.2706,0.0,30.033,3.3003,1.3201,4.9,111.72,26.343809931865636,37.09976663224031,14.04833836858006,9.060402684563762,7.710937500000003,30.38009751773049,59.76331688807919
ibm-granite/granite-3.3-2b-base,Granite,virtualhome,action_sequencing,2.5,1.3115,3.9568,1.1111,1.3514,2.4752,1.3,19.3443,58.6885,3.2787,0.3279,14.0984,2.9508,11.8033,12.0,180.0,,,,,,,
Qwen/Qwen3-32B,Qwen3,virtualhome,action_sequencing,32.8,51.4754,62.5899,45.0,25.0,48.1848,48.5,28.1967,8.1967,0.0,6.8852,8.1967,0.0,19.3443,36.0,7084.799999999999,,,,,,,
01-ai/Yi-1.5-9B,Yi,virtualhome,action_sequencing,8.8,24.7525,36.8231,27.933,18.2432,29.6358,28.4,0.6601,15.8416,3.3003,1.3201,38.9439,11.8812,3.6304,3.6,190.08000000000004,22.153901514184795,30.50071699492122,11.404833836858005,17.225950782997764,12.030989583333332,32.402482269503544,29.358435617494916
deepseek-ai/deepseek-coder-33b-instruct,DeepSeek-Coder,virtualhome,action_sequencing,33.3,11.5512,16.6065,13.4078,12.8378,14.7351,15.8,5.2805,22.7723,15.8416,1.6502,35.6436,3.9604,4.9505,2.0,399.6,,,,,,,
deepseek-ai/deepseek-coder-7b-base-v1.5,DeepSeek-Coder,virtualhome,action_sequencing,6.9,3.2787,14.3885,3.8889,0.6757,7.9208,1.3,23.9344,23.9344,15.7377,0.6557,26.2295,9.1803,3.6066,2.0,82.80000000000001,,,,,,,
bigcode/starcoderbase-3b,starcoder,virtualhome,action_sequencing,15.5,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,18.0,,,,,,,
gpt-4.1-nano-2025-04-14,gpt-4.1-nano-2025-04-14,virtualhome,action_sequencing,,33.4426,27.3381,37.2222,29.0541,30.6931,45.6,4.2623,26.8852,0.6557,0.0,21.9672,0.6557,0.3279,,,,,,,,,
microsoft/Phi-3-medium-128k-instruct,phi,virtualhome,action_sequencing,14.0,24.7525,40.4332,20.1117,22.973,30.1325,36.0,0.0,19.802,5.6106,0.33,32.6733,5.6106,4.2904,4.8,403.19999999999993,32.026356176108685,48.46045127399018,19.184290030211482,11.521252796420578,11.351822916666663,41.24002659574468,60.400293443618494
meta-llama/Llama-2-7b-hf,Llama-2,virtualhome,action_sequencing,6.7,0.0,0.0,0.0,0.0,0.0,0.0,86.1386,15.1815,0.0,0.0,0.33,0.0,0.0,2.0,84.0,8.806357596540016,10.35141665784897,1.7371601208459215,2.2371364653243813,3.7578125,9.56523345153664,25.18938638368418
deepseek-ai/deepseek-coder-6.7b-base,DeepSeek-Coder,virtualhome,action_sequencing,6.7,0.9901,6.1372,1.1173,0.0,3.1457,0.7,46.8647,34.9835,1.6502,0.33,11.8812,3.6304,1.9802,2.0,80.4,,,,,,,
bigcode/starcoder2-7b,starcoder2,virtualhome,action_sequencing,7.2,0.0,0.0,0.0,0.0,0.0,0.0,100.3279,0.0,0.0,0.0,0.0,0.0,0.0,3.7,155.4,8.2934383764798,11.395110106503443,3.096676737160121,0.22371364653244186,5.8166666666666655,7.1365248226950335,22.09193827932109
microsoft/phi-1_5,phi,virtualhome,action_sequencing,1.4,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,1.1700000000000002,7.170966845799231,7.468938770070243,1.812688821752266,2.348993288590602,3.385416666666666,7.6813682033096935,20.32839532440591
Qwen/Qwen3-8B,Qwen3,virtualhome,action_sequencing,8.2,45.9016,55.7554,47.7778,29.0541,46.8647,33.1,29.1803,6.8852,0.0,12.7869,17.7049,0.3279,27.8689,36.0,1771.1999999999998,,,,,,,
google/gemma-2-27b-it,Gemma-2,virtualhome,action_sequencing,27.2,63.2787,82.0144,58.8889,43.9189,65.8416,71.8,0.0,2.2951,4.2623,1.6393,20.0,0.0,2.9508,13.0,2121.6,36.17428251510342,49.27284215130387,23.867069486404834,16.666666666666664,9.112760416666667,38.34958628841608,79.77677008116243
google/gemma-1.1-2b-it,Gemma,virtualhome,action_sequencing,2.5,0.0,0.0,0.0,0.0,0.0,0.0,5.9016,94.0984,0.0,0.0,0.0,0.0,0.0,3.0,45.0,8.053373854341979,5.862826722774347,1.812688821752266,2.572706935123044,2.024479166666666,5.372709810874704,30.674831668860847
meta-llama/Llama-4-Scout-17B-16E-Instruct,Llama,virtualhome,action_sequencing,108.6,65.9016,74.8201,72.2222,54.7297,69.1419,63.6,1.9672,10.8197,6.8852,2.623,13.7705,1.3115,35.7377,40.0,26064.0,,,,,,,
gpt-4.1-2025-04-14,gpt-4.1-2025-04-14,virtualhome,action_sequencing,,72.459,90.6475,76.1111,58.7838,78.5479,83.0,0.0,1.3115,0.3279,0.0,15.082,0.3279,1.9672,,,,,,,,,
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,DeepSeek-R1,virtualhome,action_sequencing,70.6,51.1475,61.1511,56.1111,27.027,51.3201,50.5,25.2459,2.2951,0.0,4.5902,17.377,0.0,10.8197,15.0,6353.999999999999,27.809426360756188,35.81986234433108,30.74018126888218,2.0134228187919474,13.277343749999998,41.64635047281324,43.35939750971866
google/gemma-2b,Gemma-2,virtualhome,action_sequencing,2.5,0.0,1.0791,0.0,0.0,0.495,0.0,98.3607,1.3115,0.0,0.0,0.6557,0.0,0.0,6.0,72.0,7.321959810488082,8.246263426638125,3.0211480362537766,0.6711409395973182,7.555989583333336,4.061391843971631,20.375825033134305
ibm-granite/granite-3.1-2b-base,Granite,virtualhome,action_sequencing,2.5,1.3115,2.8777,0.5556,0.6757,1.6502,1.6,18.0328,67.2131,1.9672,0.0,8.5246,2.9508,16.3934,12.0,180.0,13.202826259598206,16.843689846888516,5.664652567975831,3.6912751677852316,3.9049479166666674,13.89627659574468,35.216115462528315
o4-mini-2025-04-16,o4-mini-2025-04-16,virtualhome,action_sequencing,,75.082,93.1655,81.6667,60.1351,81.6832,83.6,0.3279,1.6393,0.3279,0.0,14.0984,0.0,2.623,,,,,,,,,
bigcode/starcoder2-15b,starcoder2,virtualhome,action_sequencing,16.0,0.0,0.0,0.0,0.0,0.0,0.0,99.6721,0.3279,0.0,0.0,0.0,0.0,0.0,4.3,387.0,12.539175421645837,20.373540752678547,5.966767371601208,3.1319910514541416,2.9283854166666674,15.032136524822693,27.802231412651764
meta-llama/Llama-3.2-3B,Llama-3,virtualhome,action_sequencing,3.2,0.3279,2.518,1.1111,0.0,1.4851,0.3,61.6393,35.4098,0.3279,0.6557,3.2787,0.6557,1.9672,9.0,172.8,8.697822716562822,14.232664884364107,1.8882175226586102,2.348993288590602,3.8148437499999996,16.528147163120565,13.374069690643047
deepseek-ai/DeepSeek-R1,DeepSeek-R1,virtualhome,action_sequencing,684.5,36.7213,43.5252,33.3333,12.1622,32.8383,29.5,43.6066,18.6885,0.0,8.1967,2.623,0.0,40.3279,14.8,60783.600000000006,,,,,,,
google/gemma-2-2b-it,Gemma-2,virtualhome,action_sequencing,2.6,10.4918,26.259,13.8889,0.6757,16.3366,21.3,6.5574,20.6557,0.9836,0.0,47.2131,4.5902,0.0,2.0,31.200000000000003,17.046939294966545,17.980792881523424,0.0755287009063444,3.243847874720355,7.077343750000001,17.22074468085106,56.68337788179808
Qwen/Qwen3-0.6B,Qwen3,virtualhome,action_sequencing,0.8,0.3279,3.2374,0.5556,0.0,1.6502,0.3,47.8689,26.2295,1.3115,0.0,24.2623,0.3279,0.0,36.0,172.8,,,,,,,
google/gemma-1.1-7b-it,Gemma,virtualhome,action_sequencing,8.5,9.1803,17.2662,5.0,8.7838,11.5512,10.8,23.2787,11.8033,1.9672,0.0,46.5574,5.5738,0.6557,6.0,306.0,17.693584228972615,15.93420938501317,4.909365558912387,5.8165548098433995,11.510937500000002,17.5993646572104,50.391073462856326
Qwen/Qwen2.5-14B,Qwen2.5,virtualhome,action_sequencing,14.8,25.9016,42.8058,22.2222,22.2973,31.6832,25.6,16.0656,26.5574,1.3115,3.9344,23.9344,3.6066,2.623,18.0,1598.4,31.951062693148973,45.078312404984935,29.003021148036257,17.561521252796418,15.913281249999997,47.205599881796694,36.94464022127954
ibm-granite/granite-3.1-2b-instruct,Granite,virtualhome,action_sequencing,2.5,4.918,14.3885,8.3333,2.7027,9.736,4.3,12.7869,40.0,3.2787,0.3279,22.623,16.7213,0.6557,12.0,180.0,21.712212822028288,21.822956140794506,15.256797583081571,5.257270693512303,4.867708333333335,20.212765957446805,62.8557782240012
microsoft/Phi-3-medium-4k-instruct,phi,virtualhome,action_sequencing,14.0,31.0231,47.6534,21.2291,25.6757,34.4371,34.0,2.9703,15.8416,8.9109,0.6601,25.7426,12.8713,4.9505,4.8,403.19999999999993,33.09765943937642,49.38061007422016,19.561933534743204,11.521252796420578,13.052083333333334,40.84293735224587,64.22713954529537
ibm-granite/granite-3.2-8b-instruct,Granite,virtualhome,action_sequencing,8.2,37.7049,52.8777,34.4444,31.7568,42.2442,39.7,5.2459,15.7377,1.3115,0.3279,33.1148,4.5902,3.2787,12.0,590.4,30.7704488980163,34.65536965519957,23.791540785498487,8.7248322147651,16.791406250000005,27.914450354609933,72.74509412802476
Qwen/Qwen2.5-72B,Qwen2.5,virtualhome,action_sequencing,72.7,33.7705,43.1655,32.2222,24.3243,35.3135,37.7,12.1311,38.6885,0.9836,0.0,10.1639,0.3279,19.0164,18.0,7851.6,38.441143572535815,54.61505780163693,39.12386706948641,20.69351230425056,19.640624999999996,55.20279255319149,41.37100670664947
deepseek-ai/DeepSeek-V3,DeepSeek,virtualhome,action_sequencing,684.5,78.3607,79.8561,84.4444,72.2973,79.3729,85.9,0.0,6.2295,0.6557,0.0,6.5574,0.6557,1.9672,14.8,60783.600000000006,,,,,,,
meta-llama/Meta-Llama-3-70B,Llama-3,virtualhome,action_sequencing,70.6,27.8689,34.5324,31.1111,19.5946,29.868,27.9,18.3607,21.3115,0.6557,3.9344,26.5574,1.9672,29.5082,15.0,6300.0,26.705350171613343,48.709812647505885,18.580060422960727,19.686800894854585,16.011197916666664,41.21232269503546,16.031906452656727
Qwen/Qwen2.5-32B,Qwen2.5,virtualhome,action_sequencing,32.8,42.9508,47.482,47.2222,33.7838,44.0594,44.6,7.2131,27.541,2.2951,3.2787,12.1311,3.2787,11.4754,18.0,3542.3999999999996,38.00796730514634,53.954752851331996,35.64954682779456,21.588366890380314,22.69583333333333,53.39280437352246,40.766499554515356
google/gemma-2-27b,Gemma-2,virtualhome,action_sequencing,27.2,13.7705,25.8993,13.3333,7.4324,17.6568,17.7,43.9344,10.1639,3.9344,1.9672,23.2787,0.3279,24.5902,13.0,2121.6,23.926167340782822,37.390737454186464,16.61631419939577,13.422818791946312,13.921093749999997,37.4538268321513,24.75221301701707
01-ai/Yi-34B,Yi,virtualhome,action_sequencing,34.4,1.6502,1.083,1.1173,0.0,0.8278,1.0,40.5941,46.5347,1.9802,0.0,9.901,0.6601,4.2904,3.1,639.84,22.373127018936653,35.542431259008794,5.13595166163142,15.548098434004473,9.648437500000004,37.90632387706855,30.45751938190668
ibm-granite/granite-3.1-8b-base,Granite,virtualhome,action_sequencing,8.2,8.5246,14.7482,6.6667,6.0811,10.231,10.2,55.082,15.082,0.6557,1.6393,16.0656,1.3115,14.7541,12.0,590.4,20.05719991900457,26.01958867101177,9.441087613293051,9.507829977628639,8.36197916666667,24.802378841607563,42.21033524381973
meta-llama/Llama-3.2-1B,Llama-3,virtualhome,action_sequencing,1.2,0.0,0.7194,0.0,0.0,0.33,0.0,42.2951,56.3934,0.6557,0.0,1.3115,0.0,0.0,9.0,64.8,4.195140014045501,4.366029656556756,1.2084592145015105,0.0,2.5578125000000003,2.2606382978723394,14.777900415342401
01-ai/Yi-1.5-34B-Chat,Yi,virtualhome,action_sequencing,34.4,38.2838,28.8809,57.5419,35.1351,38.9073,38.9,0.0,20.7921,6.9307,1.3201,26.0726,5.9406,1.3201,3.6,743.04,33.35799367075618,44.262825981005655,27.719033232628398,15.324384787472036,13.058072916666665,39.11606087470449,60.66758423205982
01-ai/Yi-1.5-34B,Yi,virtualhome,action_sequencing,34.4,6.9307,18.0505,8.9385,0.6757,11.0927,6.3,1.6502,24.7525,3.9604,0.33,59.0759,3.9604,0.9901,3.6,743.04,25.64649419429311,42.74936268839652,15.332326283987916,15.436241610738257,11.217187500000003,40.732121749408975,28.411725333226947
deepseek-ai/deepseek-coder-1.3b-instruct,DeepSeek-Coder,virtualhome,action_sequencing,1.3,1.3201,7.5812,0.0,0.0,3.4768,0.7,12.5413,58.4158,3.6304,0.0,9.2409,15.5116,0.0,2.0,15.6,,,,,,,
meta-llama/Meta-Llama-3-70B-Instruct,Llama-3,virtualhome,action_sequencing,70.6,60.6557,55.036,68.8889,45.9459,56.9307,65.9,0.0,18.6885,8.1967,0.3279,5.9016,0.9836,3.9344,15.0,6353.999999999999,36.37222412927012,50.18513318440344,24.47129909365559,4.921700223713646,10.92057291666667,46.74386820330969,80.99077115387172
deepseek-ai/DeepSeek-R1-Distill-Llama-8B,DeepSeek-R1,virtualhome,action_sequencing,8.0,24.2623,36.3309,21.1111,14.8649,26.5677,26.9,7.541,25.2459,0.3279,1.3115,35.4098,3.6066,13.7705,15.0,720.0,13.059950104920146,5.325247153240706,21.978851963746223,0.6711409395973182,0.45572916666666624,12.10475768321513,37.82397372305483
tiiuae/Falcon3-10B-Base,falcon,virtualhome,action_sequencing,10.3,21.7822,40.4332,23.4637,20.9459,30.6291,22.4,7.2607,31.3531,0.33,3.6304,31.3531,4.6205,68.6469,14.0,865.2,27.617850879493677,41.37546218651794,24.924471299093657,12.751677852348994,14.173958333333331,36.003989361702125,36.47754624396601
LGAI-EXAONE/EXAONE-3.5-32B-Instruct,Exaone,virtualhome,action_sequencing,32.0,51.8033,69.0647,50.0,47.2973,58.0858,61.0,0.0,16.0656,0.6557,2.2951,19.0164,0.9836,3.9344,6.5,1248.0,37.603165755662836,39.82420331711213,51.283987915407856,5.033557046979867,5.150000000000001,40.40890957446809,83.91833668000905
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,DeepSeek-R1,virtualhome,action_sequencing,1.8,0.9836,2.1583,1.6667,0.0,1.4851,2.3,46.8852,42.9508,0.3279,0.0,7.8689,2.2951,0.3279,18.0,194.4,10.351036796154286,4.729119207646243,16.91842900302115,0.7829977628635317,2.9656249999999993,2.0759456264775418,34.63410417691725
openai/gpt-oss-20b,GPT-OSS,virtualhome,action_sequencing,21.5,67.2131,74.4604,66.1111,47.973,65.5116,68.9,13.1148,4.2623,0.6557,0.0,13.1148,0.0,0.6557,,,,,,,,,
Qwen/Qwen1.5-110B,Qwen1.5,virtualhome,action_sequencing,111.2,35.7377,46.7626,46.1111,33.1081,43.2343,40.0,0.0,14.7541,4.2623,6.2295,20.9836,13.7705,4.918,7.0,4670.400000000001,29.83367750486893,44.28047655387545,24.69788519637462,13.646532438478745,13.705468750000001,48.45227541371159,34.21942667677318
bigcode/starcoderbase-1b,starcoder,virtualhome,action_sequencing,15.5,0.0,0.0,0.0,0.0,0.0,0.0,100.3279,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,,,,,,,
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8,Llama,virtualhome,action_sequencing,401.6,74.4262,88.4892,78.8889,60.1351,78.7129,73.4,0.3279,11.1475,0.6557,4.2623,9.1803,0.9836,58.3607,22.0,53011.2,,,,,,,
meta-llama/Meta-Llama-3-8B-Instruct,Llama-3,virtualhome,action_sequencing,8.0,22.2951,27.6978,22.7778,10.8108,22.1122,26.2,0.0,36.3934,7.8689,0.3279,28.1967,0.9836,0.9836,15.0,720.0,23.908735693936837,28.244949576343615,8.685800604229607,1.230425055928408,1.602864583333335,29.604388297872337,74.08398604591373
Qwen/Qwen2.5-3B,Qwen2.5,virtualhome,action_sequencing,3.1,0.0,0.3597,0.0,0.0,0.165,0.0,18.0328,71.1475,8.5246,0.0,2.623,0.0,0.0,18.0,334.8,18.102770217683673,24.304241726371686,14.803625377643503,6.375838926174497,11.758333333333335,24.479166666666664,26.895415275912356
tiiuae/falcon-7b,falcon,virtualhome,action_sequencing,7.2,0.0,0.0,0.0,0.0,0.0,0.0,106.5789,1.9737,0.0,0.0,0.0,0.0,0.0,1.5,63.0,5.1734447203194796,5.963936911876051,0.9818731117824773,0.0,4.497135416666667,1.392582742316784,18.205140139274903
Qwen/Qwen3-1.7B,Qwen3,virtualhome,action_sequencing,2.0,2.9508,12.5899,5.0,0.0,7.2607,1.3,58.6885,20.0,0.0,2.2951,18.0328,0.0,0.0,36.0,432.0,,,,,,,
meta-llama/Llama-2-13b-hf,Llama-2,virtualhome,action_sequencing,13.0,0.0,1.083,0.0,0.0,0.4967,1.3,64.6865,25.7426,1.9802,0.0,7.2607,1.3201,0.33,2.0,156.0,11.065185981273997,17.222559825058127,1.5105740181268883,4.138702460850116,3.385416666666666,15.309175531914892,24.824687385027282
tiiuae/Falcon3-7B-Base,falcon,virtualhome,action_sequencing,7.5,8.9109,22.0217,8.3799,8.1081,14.5695,6.9,13.8614,32.6733,5.6106,1.6502,33.0033,6.6007,2.6403,14.0,630.0,24.745725360383613,31.55991854750336,19.410876132930515,12.863534675615215,18.142708333333335,32.337839834515364,34.15947463840388
ibm-granite/granite-3.3-2b-instruct,Granite,virtualhome,action_sequencing,2.5,1.6393,17.9856,2.2222,0.0,8.9109,12.8,10.4918,16.0656,6.2295,0.0,42.2951,12.459,0.0,12.0,180.0,,,,,,,
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,DeepSeek-R1,virtualhome,action_sequencing,14.8,47.8689,61.8705,48.3333,25.6757,49.0099,53.4,26.2295,3.2787,0.0,1.3115,15.082,0.6557,6.5574,18.0,1598.4,38.22146462032291,40.69076685552542,57.02416918429003,18.34451901565996,28.711458333333326,40.74135638297872,43.81651795015004
mistralai/Mistral-7B-Instruct-v0.2,Mistral,virtualhome,action_sequencing,7.2,17.1053,26.6187,11.7318,6.0811,17.1901,23.7,4.9342,30.2632,2.6316,0.0,39.8026,0.3289,22.3684,,,18.50789159273764,22.910601936713604,3.0211480362537766,3.467561521252797,7.608854166666667,19.076906028368796,54.96227786717022
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,DeepSeek-R1,virtualhome,action_sequencing,32.8,17.0492,22.6619,16.6667,16.8919,19.4719,17.0,69.5082,1.9672,0.3279,2.9508,6.5574,1.6393,36.7213,18.0,3542.3999999999996,22.96226839270608,17.149673765590364,17.069486404833835,4.5861297539149914,16.1421875,40.962987588652474,41.86314534324481
baichuan-inc/Baichuan-7B,Baichuan,virtualhome,action_sequencing,,0.0,0.0,0.0,0.0,0.0,0.0,79.2642,37.1237,0.3344,0.0,1.0033,0.0,0.0,1.2,,,,,,,,
moonshotai/Kimi-K2-Instruct,Kimi,virtualhome,action_sequencing,1000.0,75.082,89.9281,80.5556,58.7838,79.538,82.0,0.0,1.6393,0.0,0.0,15.082,1.3115,0.9836,15.5,93000.0,,,,,,,
01-ai/Yi-Coder-1.5B,Yi,virtualhome,action_sequencing,1.5,1.9802,4.6931,2.7933,3.3784,3.8079,2.6,56.1056,25.7426,4.9505,0.0,10.5611,1.6502,9.571,2.4,21.6,,,,,,,
google/gemma-2-9b-it,Gemma-2,virtualhome,action_sequencing,9.2,52.1311,69.7842,51.6667,41.2162,57.4257,54.1,30.8197,2.2951,3.6066,0.9836,20.3279,3.2787,4.2623,8.0,441.6,32.07276025267082,42.136619683664655,19.486404833836858,14.76510067114094,9.742187500000002,31.949985224586293,74.35626360279613
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,DeepSeek-R1,virtualhome,action_sequencing,7.6,13.4426,28.0576,14.4444,6.0811,18.6469,14.8,46.2295,5.5738,1.9672,2.9508,24.2623,5.2459,3.6066,18.0,820.8,14.99492256865316,7.882702983365756,19.561933534743204,3.9149888143176734,3.5518229166666675,14.681220449172578,40.3768667136531
LGAI-EXAONE/EXAONE-Deep-32B,Exaone,virtualhome,action_sequencing,32.0,27.2131,41.3669,28.3333,15.5405,31.1881,19.7,20.6557,23.2787,0.6557,8.1967,37.7049,0.0,51.4754,6.5,1248.0,,,,,,,
01-ai/Yi-1.5-6B,Yi,virtualhome,action_sequencing,6.1,2.6403,5.0542,1.676,2.7027,3.4768,2.0,40.264,46.2046,0.0,0.33,11.2211,3.9604,0.6601,3.6,131.76,16.745698054972127,22.027904536694773,6.646525679758309,8.501118568232664,13.309114583333335,23.823507683215126,26.166017278598567
baichuan-inc/Baichuan2-7B-Chat,Baichuan,virtualhome,action_sequencing,7.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.6579,0.0,0.0,0.0,0.0,0.0,2.6,109.20000000000002,,,,,,,
microsoft/phi-4,phi,virtualhome,action_sequencing,14.7,55.4098,72.3022,64.4444,41.2162,62.3762,58.0,0.6557,4.2623,2.623,0.6557,31.8033,2.2951,1.6393,9.8,864.36,30.35812781134617,52.42784845820486,31.64652567975831,20.805369127516784,23.7859375,47.630393026004725,5.852693076592331
Qwen/Qwen-7B,Qwen,virtualhome,action_sequencing,7.7,0.3279,1.0791,0.5556,0.0,0.6601,0.0,90.8197,13.4426,0.0,0.0,0.6557,0.3279,0.0,2.4,100.8,,,,,,,
google/gemma-3-12b-pt,Gemma-3,virtualhome,action_sequencing,12.2,6.8852,17.9856,7.2222,4.7297,11.5512,8.5,36.7213,20.3279,1.9672,1.9672,31.4754,1.6393,4.918,12.0,878.4,,,,,,,
Qwen/Qwen2.5-1.5B,Qwen2.5,virtualhome,action_sequencing,1.5,0.0,0.0,0.0,0.0,0.0,0.0,88.1967,11.4754,0.0,0.0,0.3279,0.3279,0.0,18.0,162.0,13.852701161320264,16.660465167691854,9.138972809667674,4.697986577181204,5.265885416666666,20.609855200945624,26.743041795768562
meta-llama/Meta-Llama-3-8B,Llama-3,virtualhome,action_sequencing,8.0,9.5082,20.1439,7.2222,4.7297,12.5413,9.5,14.0984,33.1148,7.2131,1.6393,29.1803,6.2295,6.5574,15.0,720.0,13.626857071686075,24.50076379676797,4.531722054380665,7.38255033557047,6.242447916666666,24.553043735224584,14.550614591506093
google/gemma-3-27b-it,Gemma-3,virtualhome,action_sequencing,27.4,69.5082,80.9353,70.0,58.7838,72.2772,75.4,0.0,3.9344,0.0,0.0,19.6721,0.9836,3.6066,14.0,2301.6,,,,,,,
Qwen/Qwen3-4B,Qwen3,virtualhome,action_sequencing,4.0,29.5082,41.0072,25.0,6.7568,27.8878,26.2,63.6066,1.6393,0.6557,5.2459,2.623,0.0,15.082,36.0,864.0,,,,,,,
Qwen/Qwen2.5-7B,Qwen2.5,virtualhome,action_sequencing,7.6,21.3115,28.0576,21.1111,21.6216,24.4224,22.6,12.1311,30.8197,3.2787,0.6557,23.6066,8.1967,2.623,18.0,820.8,26.019159924095096,35.81347328754777,25.075528700906347,9.955257270693513,14.13671875,37.38918439716312,33.74479713825982
Qwen/Qwen3-235B-A22B-Thinking-2507,Qwen3,virtualhome,action_sequencing,235.1,29.8361,35.2518,25.0,5.4054,24.9175,25.2,57.7049,11.4754,0.0,4.918,0.9836,0.0,20.9836,36.0,50781.6,,,,,,,
bigcode/starcoderbase,starcoder,virtualhome,action_sequencing,15.5,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.3279,0.0,0.0,0.0,0.0,0.0,1.0,93.0,,,,,,,
ibm-granite/granite-3.3-8b-instruct,Granite,virtualhome,action_sequencing,8.2,35.082,41.3669,38.3333,35.8108,39.1089,40.7,1.9672,15.4098,6.5574,0.3279,24.918,10.1639,1.6393,12.0,590.4,,,,,,,
google/gemma-3-4b-pt,Gemma-3,virtualhome,action_sequencing,4.3,1.6393,6.1151,0.5556,0.0,2.9703,1.3,73.1148,12.7869,1.3115,0.0,12.7869,1.3115,0.3279,4.0,103.2,,,,,,,
01-ai/Yi-Coder-9B-Chat,Yi,virtualhome,action_sequencing,8.8,42.9043,59.2058,44.1341,30.4054,47.6821,46.5,0.33,13.2013,5.6106,3.3003,28.7129,2.3102,1.3201,2.4,126.72,16.985989314863886,25.94315294491389,4.003021148036254,0.0,7.963802083333333,15.83554964539007,48.17041006750976
meta-llama/Llama-3.1-70B,Llama-3,virtualhome,action_sequencing,70.6,28.8525,36.6906,33.8889,17.5676,31.1881,30.5,26.5574,20.6557,0.9836,2.2951,19.0164,1.9672,90.8197,15.0,6353.999999999999,26.200215843375947,46.39941295581887,18.429003021148038,18.34451901565996,16.581770833333337,40.602836879432616,16.843752354862875
google/gemma-2-2b,Gemma-2,virtualhome,action_sequencing,2.6,0.6557,0.3597,1.6667,2.7027,1.3201,2.0,71.8033,31.4754,1.3115,0.0,3.9344,0.6557,0.0,2.0,31.200000000000003,10.129463155055184,11.755807532236112,2.8700906344410875,1.6778523489932917,11.430468750000001,13.111332742316787,19.931226922343825
deepseek-ai/deepseek-coder-33b-base,DeepSeek-Coder,virtualhome,action_sequencing,33.3,1.3201,4.6931,2.2346,0.0,2.8146,0.7,32.3432,55.7756,0.9901,0.9901,8.5809,1.3201,1.3201,2.0,396.0,,,,,,,
google/gemma-7b,Gemma,virtualhome,action_sequencing,8.5,1.3115,4.3165,0.5556,1.3514,2.4752,2.3,55.4098,28.8525,0.9836,0.0,10.8197,1.6393,0.3279,6.0,252.0,15.442818570272307,21.11609932329174,7.401812688821751,4.921700223713646,10.979947916666669,21.644134160756497,26.593217108383534
deepseek-ai/deepseek-coder-6.7b-instruct,DeepSeek-Coder,virtualhome,action_sequencing,6.7,6.6007,23.1047,7.2626,8.7838,14.9007,6.9,24.4224,17.4917,3.6304,0.6601,47.1947,5.6106,1.3201,2.0,80.4,,,,,,,
01-ai/Yi-Coder-9B,Yi,virtualhome,action_sequencing,8.8,14.1914,15.1625,18.4358,10.1351,14.9007,15.2,53.4653,23.7624,1.3201,0.6601,7.2607,0.6601,0.6601,2.4,126.72,,,,,,,
deepseek-ai/deepseek-coder-7b-instruct-v1.5,DeepSeek-Coder,virtualhome,action_sequencing,6.9,0.6557,17.2662,0.5556,0.6757,8.2508,0.7,2.9508,15.082,35.7377,0.0,43.6066,1.9672,0.0,2.0,82.80000000000001,,,,,,,
ibm-granite/granite-3.3-8b-base,Granite,virtualhome,action_sequencing,8.2,16.7213,19.0647,16.6667,21.6216,18.9769,26.2,45.9016,5.9016,0.9836,1.9672,13.4426,5.5738,1.3115,12.0,590.4,,,,,,,
meta-llama/Llama-2-70b-hf,Llama-2,virtualhome,action_sequencing,69.0,5.6106,10.4693,4.4693,2.027,6.6225,4.6,55.4455,24.0924,2.3102,0.33,10.8911,2.9703,0.33,2.0,840.0,18.372598605703004,35.900061863721675,3.2477341389728096,7.046979865771815,9.777604166666668,30.1954048463357,24.06780675274937
deepseek-ai/deepseek-coder-1.3b-base,DeepSeek-Coder,virtualhome,action_sequencing,1.3,0.33,1.444,0.0,0.0,0.6623,0.0,45.2145,46.5347,1.6502,0.0,8.2508,0.6601,0.0,2.0,15.6,,,,,,,
01-ai/Yi-6B,Yi,virtualhome,action_sequencing,6.1,0.0,0.0,0.0,0.0,0.0,0.0,86.7987,11.2211,0.0,0.0,1.9802,0.33,0.0,3.1,113.46,13.611617485376058,19.408504737915056,1.5861027190332326,2.572706935123044,7.044270833333335,22.12433510638298,28.93378458046871
Qwen/Qwen2.5-0.5B,Qwen2.5,virtualhome,action_sequencing,0.5,0.0,0.0,0.0,0.0,0.0,0.0,93.7705,15.082,0.0,0.0,0.0,0.0,0.0,18.0,54.0,6.550067614297009,6.953961634882263,3.927492447129909,0.0,2.0833333333333326,10.0639036643026,16.271714606133948
microsoft/Phi-3-mini-4k-instruct,phi,virtualhome,action_sequencing,3.8,13.8614,22.0217,12.2905,6.7568,15.3974,17.8,17.4917,15.5116,12.8713,0.0,23.4323,12.8713,0.33,4.9,111.72,25.967732638041607,39.2693352377728,11.63141993957704,9.284116331096197,7.644270833333336,31.848404255319146,56.12884923115112
