Model,Model Family,dataset,eval_type,Model Size (B),node_precision,node_recall,node_f1,edge_precision,edge_recall,edge_f1,action_precision,action_recall,action_f1,all_precision,all_recall,all_f1,Pretraining Data Size (T),FLOPs (1E21),Average,BBH,MATH Lvl 5,GPQA,MUSR,MMLU-PRO,IFEval
meta-llama/Llama-3.3-70B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,70.6,28.763,57.27,38.2937,24.0084,38.9831,29.7158,11.9556,86.4198,21.0053,19.302,56.4232,28.764,15.0,6353.999999999999,44.84747145129876,56.561410788022194,48.338368580060425,10.514541387024611,15.565624999999999,48.12906323877069,89.97581971391463
01-ai/Yi-Coder-1.5B-Chat,Yi,virtualhome,goal_interpretation_v4,1.5,26.7241,10.8014,15.3846,3.4483,0.4132,0.738,6.7729,11.7241,8.5859,12.3737,7.27,9.1589,2.4,21.6,,,,,,,
Qwen/Qwen1.5-1.8B,Qwen1.5,virtualhome,goal_interpretation_v4,1.8,13.0435,4.7809,6.9971,0.0,0.0,0.0,1.5198,5.4945,2.381,3.6638,3.1136,3.3663,2.4,25.92,9.269492522098927,9.759901587727937,3.1722054380664653,7.38255033557047,3.963802083333334,9.79609929078014,21.542396397115212
mistralai/Mixtral-8x7B-Instruct-v0.1,Mistral,virtualhome,goal_interpretation_v4,46.7,19.7674,50.4451,28.4043,15.8363,30.1695,20.7701,8.8417,81.1321,15.9456,13.4675,49.0518,21.1329,,,23.8171027058463,29.742398380967334,9.138972809667674,7.046979865771815,11.073697916666667,29.909131205673756,55.991436056330535
google/gemma-3-12b-it,Gemma,virtualhome,goal_interpretation_v4,12.2,24.7326,54.4118,34.0074,57.1429,1.3423,2.623,14.3139,89.5062,24.6809,18.8914,41.75,26.0125,12.0,878.4,,,,,,,
google/gemma-3-4b-it,Gemma,virtualhome,goal_interpretation_v4,4.3,22.0896,43.6578,29.336,9.3168,10.0671,9.6774,10.5628,84.5679,18.78,13.7615,39.4243,20.4016,4.0,103.2,,,,,,,
google/gemma-7b-it,Gemma,virtualhome,goal_interpretation_v4,8.5,21.2581,28.8235,24.4694,13.8462,15.1007,14.4462,9.5335,58.0247,16.3763,13.3747,29.625,18.4292,2.0,102.0,13.067087110466217,11.940832085290182,2.9456193353474323,4.5861297539149914,12.528385416666667,7.7183067375886525,38.68324933398937
01-ai/Yi-1.5-6B-Chat,Yi,virtualhome,goal_interpretation_v4,6.1,16.1212,39.5833,22.9113,2.9268,8.4211,4.3439,11.1018,84.0764,19.6137,10.1976,37.1465,16.0022,3.6,131.76,22.784006289829847,23.67872313235784,16.238670694864048,6.935123042505594,14.030468750000002,24.368351063829788,51.452701055421834
Qwen/Qwen1.5-7B,Qwen1.5,virtualhome,goal_interpretation_v4,7.7,14.986,32.8221,20.5769,3.0612,7.2917,4.3121,3.5211,9.4937,5.137,7.8313,18.5233,11.0085,4.0,168.0,16.024674155407357,23.075768754340448,9.290030211480364,6.487695749440718,9.158333333333333,21.293218085106382,26.842998798742894
Qwen/Qwen1.5-14B,Qwen1.5,virtualhome,goal_interpretation_v4,14.2,15.7395,35.5828,21.825,5.1672,5.9233,5.5195,5.1546,30.4054,8.8149,9.18,23.3903,13.1852,4.0,336.0,20.854080062460586,30.063103282917453,20.241691842900302,5.92841163310962,10.464062500000002,29.373522458628837,29.05368865720732
google/gemma-2b-it,Gemma,virtualhome,goal_interpretation_v4,2.5,4.6429,11.8541,6.6724,5.2632,4.5296,4.8689,4.7041,20.8054,7.6733,4.7537,10.8497,6.6109,6.0,90.0,7.485804130315127,5.214303022163619,2.0392749244712993,3.8031319910514525,3.0322916666666675,3.9228723404255303,26.902950837112197
Qwen/Qwen3-14B,Qwen3,virtualhome,goal_interpretation_v4,14.8,22.9167,19.4118,21.0191,24.7368,15.7718,19.2623,11.7962,27.1605,16.4486,18.4489,19.625,19.0188,36.0,3196.8,,,,,,,
Qwen/Qwen1.5-72B,Qwen1.5,virtualhome,goal_interpretation_v4,72.3,17.8723,40.0,24.7059,5.5556,5.6391,5.597,6.8256,44.3662,11.831,10.7482,28.2158,15.5666,3.0,1296.0,,,,,,,
openai/gpt-oss-120b,GPT-OSS,virtualhome,goal_interpretation_v4,120.4,33.7815,59.6439,43.133,49.8728,66.2162,56.894,15.5844,75.9494,25.8621,29.4084,65.3603,40.5649,,,,,,,,,
Qwen/Qwen1.5-4B,Qwen1.5,virtualhome,goal_interpretation_v4,4.0,16.3895,21.6301,18.6486,1.3201,1.444,1.3793,3.8647,10.5263,5.6537,7.8207,11.8984,9.438,2.4,57.6,11.76818275851784,16.249142581095292,5.287009063444108,3.5794183445190177,4.8226562500000005,16.22340425531915,24.447466056729475
meta-llama/Llama-3.1-70B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,70.6,24.3176,57.6471,34.2059,23.2394,44.2953,30.485,11.874,90.7407,21.0,18.1853,59.375,27.8429,,,43.409948245645786,55.92799173898473,38.066465256797585,14.205816554809845,17.691145833333334,47.87972813238771,86.6885419575615
Qwen/Qwen-72B,Qwen,virtualhome,goal_interpretation_v4,72.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1296.0,,,,,,,
Qwen/Qwen3-32B,Qwen3,virtualhome,goal_interpretation_v4,32.8,26.3328,48.368,34.1004,26.1954,42.4242,32.3907,12.6173,75.625,21.6265,19.9126,51.6373,28.7417,36.0,7084.799999999999,,,,,,,
Qwen/Qwen3-8B,Qwen3,virtualhome,goal_interpretation_v4,8.2,22.2607,56.7647,31.9801,22.5632,41.9463,29.3427,8.1855,74.0741,14.742,15.1715,54.75,23.7592,36.0,1771.1999999999998,,,,,,,
google/gemma-2-27b-it,Gemma,virtualhome,goal_interpretation_v4,27.2,33.0784,51.6418,40.3263,20.922,20.1365,20.5217,12.1212,90.566,21.3808,18.866,47.7764,27.0504,13.0,2121.6,36.17428251510342,49.27284215130387,23.867069486404834,16.666666666666664,9.112760416666667,38.34958628841608,79.77677008116243
google/gemma-1.1-2b-it,Gemma,virtualhome,goal_interpretation_v4,2.5,12.3552,20.3822,15.3846,1.3575,1.1765,1.2605,5.2567,28.1046,8.8568,7.0649,15.2355,9.6534,3.0,45.0,8.053373854341979,5.862826722774347,1.812688821752266,2.572706935123044,2.024479166666666,5.372709810874704,30.674831668860847
meta-llama/Llama-4-Scout-17B-16E-Instruct,Llama,virtualhome,goal_interpretation_v4,108.6,22.9581,61.1765,33.3868,21.0526,49.6644,29.5704,10.3163,84.5679,18.3893,16.7858,61.625,26.3848,40.0,26064.0,,,,,,,
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,DeepSeek,virtualhome,goal_interpretation_v4,70.6,30.6569,15.6134,20.6897,15.0,3.7344,5.9801,10.1852,16.6667,12.6437,17.6755,11.3707,13.8389,15.0,6353.999999999999,27.809426360756188,35.81986234433108,30.74018126888218,2.0134228187919474,13.277343749999998,41.64635047281324,43.35939750971866
deepseek-ai/DeepSeek-R1,DeepSeek,virtualhome,goal_interpretation_v4,684.5,25.8209,51.952,34.4965,28.6334,45.0512,35.0133,16.0839,86.25,27.112,22.2725,56.3613,31.9279,14.8,60783.600000000006,,,,,,,
google/gemma-2-2b-it,Gemma,virtualhome,goal_interpretation_v4,2.6,18.541,39.1026,25.1546,3.5714,5.7143,4.3956,7.0284,36.5385,11.789,10.1721,26.0695,14.6341,2.0,31.200000000000003,17.046939294966545,17.980792881523424,0.0755287009063444,3.243847874720355,7.077343750000001,17.22074468085106,56.68337788179808
Qwen/Qwen3-0.6B,Qwen3,virtualhome,goal_interpretation_v4,0.8,18.4516,42.4332,25.7194,2.1841,4.7458,2.9915,8.1967,44.0252,13.8203,10.0,28.6979,14.8318,36.0,172.8,,,,,,,
google/gemma-1.1-7b-it,Gemma,virtualhome,goal_interpretation_v4,8.5,13.2743,4.4379,6.6519,6.25,0.6734,1.2158,11.5385,14.9068,13.0081,11.6147,5.1508,7.1366,6.0,306.0,17.693584228972615,15.93420938501317,4.909365558912387,5.8165548098433995,11.510937500000002,17.5993646572104,50.391073462856326
meta-llama/Llama-3.2-1B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,1.2,17.2043,6.1776,9.0909,0.8734,1.0,0.9324,5.4922,42.7419,9.7337,5.5167,12.1784,7.5936,,,14.443126333711135,8.742521312303046,7.02416918429003,3.355704697986576,2.973437500000001,7.579787234042552,56.9831380736446
deepseek-ai/DeepSeek-V3,DeepSeek,virtualhome,goal_interpretation_v4,684.5,21.966,59.5395,32.0922,32.13,62.6761,42.4821,13.7021,99.3827,24.0838,20.3682,69.3333,31.4865,14.8,60783.600000000006,,,,,,,
01-ai/Yi-1.5-34B-Chat,Yi,virtualhome,goal_interpretation_v4,34.4,24.3354,35.3116,28.8136,18.3607,18.9189,18.6356,9.6277,47.4684,16.0085,15.8932,31.6056,21.1506,3.6,743.04,33.35799367075618,44.262825981005655,27.719033232628398,15.324384787472036,13.058072916666665,39.11606087470449,60.66758423205982
meta-llama/Meta-Llama-3-70B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,70.6,20.7831,60.8824,30.988,20.2194,43.2886,27.5641,10.429,87.037,18.6262,15.9745,59.625,25.1981,15.0,6353.999999999999,36.37222412927012,50.18513318440344,24.47129909365559,4.921700223713646,10.92057291666667,46.74386820330969,80.99077115387172
deepseek-ai/DeepSeek-R1-Distill-Llama-8B,DeepSeek,virtualhome,goal_interpretation_v4,8.0,22.6131,39.9408,28.877,6.4748,9.1837,7.5949,14.3293,58.3851,23.011,15.3293,32.2825,20.7877,15.0,720.0,13.059950104920146,5.325247153240706,21.978851963746223,0.6711409395973182,0.45572916666666624,12.10475768321513,37.82397372305483
LGAI-EXAONE/EXAONE-3.5-32B-Instruct,Exaone,virtualhome,goal_interpretation_v4,32.0,18.0952,59.19,27.717,12.6829,28.0576,17.4692,10.1879,92.1569,18.3474,13.4142,54.3883,21.5207,6.5,1248.0,37.603165755662836,39.82420331711213,51.283987915407856,5.033557046979867,5.150000000000001,40.40890957446809,83.91833668000905
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,DeepSeek,virtualhome,goal_interpretation_v4,1.8,7.3969,19.403,10.7106,1.0225,2.2321,1.4025,6.5141,30.0813,10.7091,5.3409,15.2846,7.9158,18.0,194.4,10.351036796154286,4.729119207646243,16.91842900302115,0.7829977628635317,2.9656249999999993,2.0759456264775418,34.63410417691725
openai/gpt-oss-20b,GPT-OSS,virtualhome,goal_interpretation_v4,21.5,33.1976,56.0137,41.688,54.9763,46.5863,50.4348,13.5431,71.7391,22.7848,26.3782,55.7522,35.8124,,,,,,,,,
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8,Llama,virtualhome,goal_interpretation_v4,401.6,35.1301,56.9277,43.4483,27.1144,37.0748,31.3218,24.0876,61.4907,34.6154,29.3856,50.4447,37.1375,22.0,53011.2,,,,,,,
meta-llama/Meta-Llama-3-8B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,8.0,19.9408,59.4118,29.8596,13.7214,22.1477,16.9448,10.6227,71.6049,18.5008,14.8492,48.0,22.6816,15.0,720.0,23.908735693936837,28.244949576343615,8.685800604229607,1.230425055928408,1.602864583333335,29.604388297872337,74.08398604591373
Qwen/Qwen3-1.7B,Qwen3,virtualhome,goal_interpretation_v4,2.0,23.7082,46.4286,31.3883,7.6487,9.1525,8.3333,8.1818,45.0,13.8462,13.4849,32.2377,19.0157,36.0,432.0,,,,,,,
meta-llama/Llama-3.1-8B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,8.0,18.1646,56.4706,27.4875,12.9293,21.5488,16.1616,11.4099,86.9565,20.1729,14.2497,49.6241,22.1415,,,23.763729445470883,29.379192497334035,15.55891238670695,8.7248322147651,8.611197916666667,31.091164302600465,49.217077354752064
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,DeepSeek,virtualhome,goal_interpretation_v4,14.8,18.1382,55.5882,27.3517,7.8704,17.2881,10.8165,9.1111,76.875,16.2914,11.9408,45.6604,18.9309,18.0,1598.4,38.22146462032291,40.69076685552542,57.02416918429003,18.34451901565996,28.711458333333326,40.74135638297872,43.81651795015004
mistralai/Mistral-7B-Instruct-v0.2,Mistral,virtualhome,goal_interpretation_v4,7.2,12.806,46.4164,20.0738,7.6164,18.4932,10.7892,7.377,74.0506,13.4174,9.1451,41.319,14.9756,,,18.50789159273764,22.910601936713604,3.0211480362537766,3.467561521252797,7.608854166666667,19.076906028368796,54.96227786717022
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,DeepSeek,virtualhome,goal_interpretation_v4,32.8,25.2778,54.1667,34.4697,33.7621,35.9589,34.8259,12.8599,83.75,22.2962,20.3087,53.4264,29.4303,18.0,3542.3999999999996,22.96226839270608,17.149673765590364,17.069486404833835,4.5861297539149914,16.1421875,40.962987588652474,41.86314534324481
moonshotai/Kimi-K2-Instruct,Kimi,virtualhome,goal_interpretation_v4,1000.0,40.3226,15.015,21.8818,20.7358,21.2329,20.9814,12.9477,58.75,21.219,17.9286,26.242,21.303,15.5,93000.0,,,,,,,
google/gemma-2-9b-it,Gemma,virtualhome,goal_interpretation_v4,9.2,19.5329,54.5994,28.7725,17.8571,1.7007,3.1056,8.8907,67.7019,15.7174,13.5701,37.6263,19.9465,8.0,441.6,32.07276025267082,42.136619683664655,19.486404833836858,14.76510067114094,9.742187500000002,31.949985224586293,74.35626360279613
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,DeepSeek,virtualhome,goal_interpretation_v4,7.6,18.2137,31.8043,23.1626,8.3333,4.947,6.2084,9.067,45.098,15.0985,12.4667,24.5085,16.5267,18.0,820.8,14.99492256865316,7.882702983365756,19.561933534743204,3.9149888143176734,3.5518229166666675,14.681220449172578,40.3768667136531
LGAI-EXAONE/EXAONE-Deep-32B,Exaone,virtualhome,goal_interpretation_v4,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,
Qwen/Qwen-7B,Qwen,virtualhome,goal_interpretation_v4,7.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.4,100.8,,,,,,,
google/gemma-3-27b-it,Gemma,virtualhome,goal_interpretation_v4,27.4,38.0952,54.1176,44.7145,29.0598,11.4094,16.3855,16.3392,97.5309,27.9894,23.9949,47.0,31.7702,14.0,2301.6,,,,,,,
Qwen/Qwen3-4B,Qwen3,virtualhome,goal_interpretation_v4,4.0,24.3466,52.2124,33.2083,12.4098,28.8591,17.3562,8.1377,64.5963,14.4545,13.6027,45.99,20.9954,36.0,864.0,,,,,,,
01-ai/Yi-Coder-9B-Chat,Yi,virtualhome,goal_interpretation_v4,8.8,19.2628,49.0909,27.6687,15.7989,30.8772,20.9026,10.7081,77.5,18.8164,14.6322,48.2581,22.4557,2.4,126.72,16.985989314863886,25.94315294491389,4.003021148036254,0.0,7.963802083333333,15.83554964539007,48.17041006750976
meta-llama/Llama-3.2-3B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,3.2,17.0268,58.806,26.4075,3.3606,12.585,5.3047,9.7623,73.7179,17.2414,10.1572,44.4586,16.5364,,,24.204650807793456,24.059186446885473,17.673716012084594,3.8031319910514525,1.3734374999999996,24.386820330969268,73.93161256576994
