Model,Model Family,dataset,eval_type,Model Size (B),task_success_rate,state_goal,relation_goal,action_goal,total_goal,execution_success_rate,parsing_error,hallucination_error,predicate_argument_number_error,wrong_order_error,missing_step_error,affordance_error,additional_step_error,Pretraining Data Size (T),FLOPs (1E21),Average,BBH,MATH Lvl 5,GPQA,MUSR,MMLU-PRO,IFEval
meta-llama/Llama-3.3-70B-Instruct,Llama-3,virtualhome,action_sequencing_v4,70.6,63.9344,54.3165,76.1111,61.4865,62.5413,68.9,0.3279,16.0656,0.0,0.0,14.7541,0.0,1.9672,15.0,6353.999999999999,44.84747145129876,56.561410788022194,48.338368580060425,10.514541387024611,15.565624999999999,48.12906323877069,89.97581971391463
01-ai/Yi-Coder-1.5B-Chat,Yi,virtualhome,action_sequencing_v4,1.5,0.6557,8.2734,0.5556,0.0,3.9604,2.6,10.4918,44.5902,14.4262,0.0,22.9508,6.5574,0.6557,2.4,21.6,,,,,,,
Qwen/Qwen1.5-1.8B,Qwen1.5,virtualhome,action_sequencing_v4,1.8,0.0,0.0,0.0,0.0,0.0,0.0,122.2951,18.3607,0.0,0.0,0.0,0.0,0.0,2.4,25.92,9.269492522098927,9.759901587727937,3.1722054380664653,7.38255033557047,3.963802083333334,9.79609929078014,21.542396397115212
mistralai/Mixtral-8x7B-Instruct-v0.1,Mistral,virtualhome,action_sequencing_v4,46.7,29.1803,36.6906,21.1111,16.8919,27.2277,26.2,2.623,22.9508,2.9508,0.9836,42.623,1.9672,1.3115,,,23.8171027058463,29.742398380967334,9.138972809667674,7.046979865771815,11.073697916666667,29.909131205673756,55.991436056330535
01-ai/Yi-1.5-6B-Chat,Yi,virtualhome,action_sequencing_v4,6.1,13.9456,22.3827,14.7059,5.4054,15.9664,16.0,10.2041,31.6327,4.0816,0.0,36.0544,4.7619,1.7007,3.6,131.76,22.784006289829847,23.67872313235784,16.238670694864048,6.935123042505594,14.030468750000002,24.368351063829788,51.452701055421834
Qwen/Qwen1.5-7B,Qwen1.5,virtualhome,action_sequencing_v4,7.7,0.9836,4.3165,1.6667,0.0,2.4752,2.3,67.8689,25.2459,2.2951,0.0,11.1475,5.5738,0.0,4.0,168.0,16.024674155407357,23.075768754340448,9.290030211480364,6.487695749440718,9.158333333333333,21.293218085106382,26.842998798742894
Qwen/Qwen1.5-14B,Qwen1.5,virtualhome,action_sequencing_v4,14.2,7.541,8.9928,9.4444,6.7568,8.5809,12.5,30.4918,42.2951,2.623,0.6557,13.1148,3.6066,0.6557,4.0,336.0,20.854080062460586,30.063103282917453,20.241691842900302,5.92841163310962,10.464062500000002,29.373522458628837,29.05368865720732
Qwen/Qwen3-14B,Qwen3,virtualhome,action_sequencing_v4,14.8,0.3279,1.0791,0.0,0.6757,0.6601,0.3,99.0164,0.0,0.0,0.0,0.6557,0.0,0.0,36.0,3196.8,,,,,,,
Qwen/Qwen1.5-72B,Qwen1.5,virtualhome,action_sequencing_v4,72.3,3.9344,14.7482,3.8889,1.3514,8.2508,7.9,18.3607,47.541,5.9016,0.3279,20.9836,4.2623,0.6557,3.0,1296.0,,,,,,,
openai/gpt-oss-120b,GPT-OSS,virtualhome,action_sequencing_v4,120.4,75.7377,87.4101,75.5556,64.1892,78.2178,75.1,3.2787,3.9344,0.6557,0.9836,17.7049,0.0,4.2623,,,,,,,,,
Qwen/Qwen1.5-4B,Qwen1.5,virtualhome,action_sequencing_v4,4.0,0.0,1.4388,0.0,0.0,0.6601,1.6,60.9836,45.9016,0.3279,0.0,3.9344,2.2951,0.0,2.4,57.6,11.76818275851784,16.249142581095292,5.287009063444108,3.5794183445190177,4.8226562500000005,16.22340425531915,24.447466056729475
meta-llama/Llama-3.1-70B-Instruct,Llama-3,virtualhome,action_sequencing_v4,70.6,67.541,71.5827,76.6667,51.3514,68.1518,73.1,0.0,7.2131,2.623,1.6393,14.7541,0.6557,2.623,15.0,6353.999999999999,43.409948245645786,55.92799173898473,38.066465256797585,14.205816554809845,17.691145833333334,47.87972813238771,86.6885419575615
Qwen/Qwen-72B,Qwen,virtualhome,action_sequencing_v4,72.3,9.5082,14.0288,10.5556,3.3784,10.396,14.8,4.5902,36.0656,8.1967,1.9672,31.1475,5.5738,2.623,3.0,1296.0,,,,,,,
Qwen/Qwen3-32B,Qwen3,virtualhome,action_sequencing_v4,32.8,46.2295,39.9281,56.1111,41.2162,45.0495,47.9,0.3279,15.4098,1.3115,0.6557,31.4754,2.9508,1.6393,36.0,7084.799999999999,,,,,,,
Qwen/Qwen3-8B,Qwen3,virtualhome,action_sequencing_v4,8.2,0.6557,23.3813,0.0,0.0,10.7261,0.0,5.9016,20.6557,5.9016,0.0,65.2459,2.2951,0.0,36.0,1771.1999999999998,,,,,,,
meta-llama/Llama-4-Scout-17B-16E-Instruct,Llama,virtualhome,action_sequencing_v4,108.6,62.9508,75.5396,66.1111,52.7027,67.1617,63.3,0.0,8.5246,6.5574,3.6066,16.0656,1.9672,3.6066,40.0,26064.0,,,,,,,
deepseek-ai/DeepSeek-R1,DeepSeek-R1,virtualhome,action_sequencing_v4,684.5,57.0492,38.4892,69.4444,51.3514,50.8251,63.6,0.3279,21.3115,3.2787,0.9836,9.8361,0.6557,0.6557,14.8,60783.600000000006,,,,,,,
meta-llama/Llama-3.2-1B-Instruct,Llama-3,virtualhome,action_sequencing_v4,1.2,0.0,2.1583,0.0,0.0,1.0526,0.0,28.8809,40.0722,12.2744,0.0,25.6318,0.361,0.0,9.0,64.8,14.443126333711135,8.742521312303046,7.02416918429003,3.355704697986576,2.973437500000001,7.579787234042552,56.9831380736446
deepseek-ai/DeepSeek-V3,DeepSeek,virtualhome,action_sequencing_v4,684.5,78.0328,80.2158,82.7778,70.2703,78.5479,84.3,0.0,5.5738,0.6557,0.0,9.1803,0.3279,0.6557,14.8,60783.600000000006,,,,,,,
01-ai/Yi-1.5-34B-Chat,Yi,virtualhome,action_sequencing_v4,34.4,29.932,27.7978,45.8824,32.4324,34.1176,35.7,0.0,20.4082,7.483,1.0204,30.9524,4.4218,2.0408,3.6,743.04,33.35799367075618,44.262825981005655,27.719033232628398,15.324384787472036,13.058072916666665,39.11606087470449,60.66758423205982
meta-llama/Meta-Llama-3-70B-Instruct,Llama-3,virtualhome,action_sequencing_v4,70.6,57.0492,53.5971,68.3333,43.9189,55.6106,62.6,0.3279,20.0,9.1803,0.6557,6.5574,0.6557,3.9344,15.0,6353.999999999999,36.37222412927012,50.18513318440344,24.47129909365559,4.921700223713646,10.92057291666667,46.74386820330969,80.99077115387172
LGAI-EXAONE/EXAONE-3.5-32B-Instruct,Exaone,virtualhome,action_sequencing_v4,32.0,48.5246,50.7194,52.7778,50.0,51.1551,52.1,10.8197,18.6885,0.6557,2.2951,14.0984,1.3115,1.6393,6.5,1248.0,37.603165755662836,39.82420331711213,51.283987915407856,5.033557046979867,5.150000000000001,40.40890957446809,83.91833668000905
openai/gpt-oss-20b,GPT-OSS,virtualhome,action_sequencing_v4,21.5,71.8033,83.8129,70.5556,54.0541,72.6073,74.8,11.8033,3.2787,0.0,0.0,10.4918,0.0,1.3115,,,,,,,,,
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8,Llama,virtualhome,action_sequencing_v4,401.6,76.0656,88.8489,85.0,64.1892,81.6832,80.0,0.0,5.5738,0.3279,0.9836,11.4754,1.6393,4.5902,22.0,53011.2,,,,,,,
meta-llama/Meta-Llama-3-8B-Instruct,Llama-3,virtualhome,action_sequencing_v4,8.0,28.1967,34.1727,25.5556,13.5135,26.5677,32.8,0.0,38.0328,4.5902,0.0,24.2623,0.3279,1.9672,15.0,720.0,23.908735693936837,28.244949576343615,8.685800604229607,1.230425055928408,1.602864583333335,29.604388297872337,74.08398604591373
Qwen/Qwen3-1.7B,Qwen3,virtualhome,action_sequencing_v4,2.0,0.0,8.2734,0.0,0.0,3.7954,0.0,26.2295,48.8525,3.2787,0.0,16.7213,4.918,0.0,36.0,432.0,,,,,,,
meta-llama/Llama-3.1-8B-Instruct,Llama-3,virtualhome,action_sequencing_v4,8.0,36.7213,53.2374,31.6667,28.3784,40.7591,36.7,0.0,20.3279,8.1967,5.2459,20.9836,8.5246,6.5574,,,23.763729445470883,29.379192497334035,15.55891238670695,8.7248322147651,8.611197916666667,31.091164302600465,49.217077354752064
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,DeepSeek-R1,virtualhome,action_sequencing_v4,14.8,23.2558,46.5455,30.6818,27.027,37.0618,28.6,2.3256,10.6312,2.6578,0.3322,55.1495,0.3322,1.9934,18.0,1598.4,38.22146462032291,40.69076685552542,57.02416918429003,18.34451901565996,28.711458333333326,40.74135638297872,43.81651795015004
mistralai/Mistral-7B-Instruct-v0.2,Mistral,virtualhome,action_sequencing_v4,7.2,25.9016,26.6187,30.0,32.4324,29.0429,29.8,3.2787,20.3279,3.9344,0.3279,36.7213,5.5738,0.3279,,,18.50789159273764,22.910601936713604,3.0211480362537766,3.467561521252797,7.608854166666667,19.076906028368796,54.96227786717022
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,DeepSeek-R1,virtualhome,action_sequencing_v4,32.8,49.3333,72.4265,46.0227,32.8671,54.9915,55.0,0.3333,14.6667,0.6667,0.3333,25.3333,3.6667,3.0,18.0,3542.3999999999996,22.96226839270608,17.149673765590364,17.069486404833835,4.5861297539149914,16.1421875,40.962987588652474,41.86314534324481
moonshotai/Kimi-K2-Instruct,Kimi,virtualhome,action_sequencing_v4,1000.0,76.7213,88.8489,80.5556,62.8378,80.033,82.3,2.2951,1.9672,0.3279,0.6557,11.8033,0.6557,0.9836,15.5,93000.0,,,,,,,
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,DeepSeek-R1,virtualhome,action_sequencing_v4,7.6,0.0,0.0,0.0,9.0909,1.2346,0.0,23.6842,39.4737,18.4211,0.0,2.6316,15.7895,0.0,18.0,820.8,14.99492256865316,7.882702983365756,19.561933534743204,3.9149888143176734,3.5518229166666675,14.681220449172578,40.3768667136531
LGAI-EXAONE/EXAONE-Deep-32B,Exaone,virtualhome,action_sequencing_v4,32.0,1.6393,28.4173,1.1111,0.0,13.3663,0.7,5.9016,6.2295,2.9508,0.0,75.082,9.1803,0.0,6.5,1248.0,,,,,,,
Qwen/Qwen-7B,Qwen,virtualhome,action_sequencing_v4,7.7,0.3279,0.0,0.5556,1.3514,0.495,0.3,85.2459,20.3279,2.623,0.3279,3.2787,0.6557,0.0,2.4,100.8,,,,,,,
Qwen/Qwen3-4B,Qwen3,virtualhome,action_sequencing_v4,4.0,41.9672,53.5971,43.8889,45.9459,48.8449,48.9,0.0,2.2951,1.6393,0.0,34.4262,12.7869,1.6393,36.0,864.0,,,,,,,
Qwen/Qwen3-235B-A22B-Thinking-2507,Qwen3,virtualhome,action_sequencing_v4,235.1,4.5902,7.554,3.8889,4.0541,5.6106,4.3,86.5574,7.2131,1.3115,0.0,9.1803,0.0,0.0,36.0,50781.6,,,,,,,
01-ai/Yi-Coder-9B-Chat,Yi,virtualhome,action_sequencing_v4,8.8,36.3934,51.4388,34.4444,22.973,39.4389,38.7,0.3279,19.0164,7.8689,1.9672,29.1803,2.9508,2.9508,2.4,126.72,16.985989314863886,25.94315294491389,4.003021148036254,0.0,7.963802083333333,15.83554964539007,48.17041006750976
