Model,Model Family,dataset,eval_type,Model Size (B),node_precision,node_recall,node_f1,edge_precision,edge_recall,edge_f1,action_precision,action_recall,action_f1,all_precision,all_recall,all_f1,Pretraining Data Size (T),FLOPs (1E21)
meta-llama/Llama-3.3-70B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,70.6,28.763,57.27,38.2937,24.0084,38.9831,29.7158,11.9556,86.4198,21.0053,19.302,56.4232,28.764,15.0,6353.999999999999
01-ai/Yi-Coder-1.5B-Chat,Yi,virtualhome,goal_interpretation_v4,1.5,26.7241,10.8014,15.3846,3.4483,0.4132,0.738,6.7729,11.7241,8.5859,12.3737,7.27,9.1589,2.4,21.599999999999998
Qwen/Qwen1.5-1.8B,Qwen1.5,virtualhome,goal_interpretation_v4,1.8,13.0435,4.7809,6.9971,0.0,0.0,0.0,1.5198,5.4945,2.381,3.6638,3.1136,3.3663,2.4,25.92
mistralai/Mixtral-8x7B-Instruct-v0.1,Mistral,virtualhome,goal_interpretation_v4,46.7,19.7674,50.4451,28.4043,15.8363,30.1695,20.7701,8.8417,81.1321,15.9456,13.4675,49.0518,21.1329,,
google/gemma-3-12b-it,Gemma,virtualhome,goal_interpretation_v4,12.2,24.7326,54.4118,34.0074,57.1429,1.3423,2.623,14.3139,89.5062,24.6809,18.8914,41.75,26.0125,12.0,878.4
google/gemma-3-4b-it,Gemma,virtualhome,goal_interpretation_v4,4.3,22.0896,43.6578,29.336,9.3168,10.0671,9.6774,10.5628,84.5679,18.78,13.7615,39.4243,20.4016,4.0,103.19999999999999
google/gemma-7b-it,Gemma,virtualhome,goal_interpretation_v4,8.5,21.2581,28.8235,24.4694,13.8462,15.1007,14.4462,9.5335,58.0247,16.3763,13.3747,29.625,18.4292,2.0,102.0
01-ai/Yi-1.5-6B-Chat,Yi,virtualhome,goal_interpretation_v4,6.1,16.1212,39.5833,22.9113,2.9268,8.4211,4.3439,11.1018,84.0764,19.6137,10.1976,37.1465,16.0022,3.6,131.76
Qwen/Qwen1.5-7B,Qwen1.5,virtualhome,goal_interpretation_v4,7.7,14.986,32.8221,20.5769,3.0612,7.2917,4.3121,3.5211,9.4937,5.137,7.8313,18.5233,11.0085,4.0,168.0
Qwen/Qwen1.5-14B,Qwen1.5,virtualhome,goal_interpretation_v4,14.2,15.7395,35.5828,21.825,5.1672,5.9233,5.5195,5.1546,30.4054,8.8149,9.18,23.3903,13.1852,4.0,336.0
google/gemma-2b-it,Gemma,virtualhome,goal_interpretation_v4,2.5,4.6429,11.8541,6.6724,5.2632,4.5296,4.8689,4.7041,20.8054,7.6733,4.7537,10.8497,6.6109,6.0,90.0
Qwen/Qwen3-14B,Qwen3,virtualhome,goal_interpretation_v4,14.8,22.9167,19.4118,21.0191,24.7368,15.7718,19.2623,11.7962,27.1605,16.4486,18.4489,19.625,19.0188,36.0,3196.8
Qwen/Qwen1.5-72B,Qwen1.5,virtualhome,goal_interpretation_v4,72.3,17.8723,40.0,24.7059,5.5556,5.6391,5.597,6.8256,44.3662,11.831,10.7482,28.2158,15.5666,3.0,1296.0
openai/gpt-oss-120b,GPT-OSS,virtualhome,goal_interpretation_v4,120.4,33.7815,59.6439,43.133,49.8728,66.2162,56.894,15.5844,75.9494,25.8621,29.4084,65.3603,40.5649,,
Qwen/Qwen1.5-4B,Qwen1.5,virtualhome,goal_interpretation_v4,4.0,16.3895,21.6301,18.6486,1.3201,1.444,1.3793,3.8647,10.5263,5.6537,7.8207,11.8984,9.438,2.4,57.6
meta-llama/Llama-3.1-70B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,70.6,24.3176,57.6471,34.2059,23.2394,44.2953,30.485,11.874,90.7407,21.0,18.1853,59.375,27.8429,,
Qwen/Qwen-72B,Qwen,virtualhome,goal_interpretation_v4,72.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1296.0
Qwen/Qwen3-32B,Qwen3,virtualhome,goal_interpretation_v4,32.8,26.3328,48.368,34.1004,26.1954,42.4242,32.3907,12.6173,75.625,21.6265,19.9126,51.6373,28.7417,36.0,7084.799999999999
Qwen/Qwen3-8B,Qwen3,virtualhome,goal_interpretation_v4,8.2,22.2607,56.7647,31.9801,22.5632,41.9463,29.3427,8.1855,74.0741,14.742,15.1715,54.75,23.7592,36.0,1771.1999999999998
google/gemma-2-27b-it,Gemma,virtualhome,goal_interpretation_v4,27.2,33.0784,51.6418,40.3263,20.922,20.1365,20.5217,12.1212,90.566,21.3808,18.866,47.7764,27.0504,13.0,2121.6
google/gemma-1.1-2b-it,Gemma,virtualhome,goal_interpretation_v4,2.5,12.3552,20.3822,15.3846,1.3575,1.1765,1.2605,5.2567,28.1046,8.8568,7.0649,15.2355,9.6534,3.0,45.0
meta-llama/Llama-4-Scout-17B-16E-Instruct,Llama,virtualhome,goal_interpretation_v4,108.6,22.9581,61.1765,33.3868,21.0526,49.6644,29.5704,10.3163,84.5679,18.3893,16.7858,61.625,26.3848,40.0,26064.0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,DeepSeek,virtualhome,goal_interpretation_v4,70.6,30.6569,15.6134,20.6897,15.0,3.7344,5.9801,10.1852,16.6667,12.6437,17.6755,11.3707,13.8389,15.0,6353.999999999999
deepseek-ai/DeepSeek-R1,DeepSeek,virtualhome,goal_interpretation_v4,684.5,25.8209,51.952,34.4965,28.6334,45.0512,35.0133,16.0839,86.25,27.112,22.2725,56.3613,31.9279,14.8,60783.600000000006
google/gemma-2-2b-it,Gemma,virtualhome,goal_interpretation_v4,2.6,18.541,39.1026,25.1546,3.5714,5.7143,4.3956,7.0284,36.5385,11.789,10.1721,26.0695,14.6341,2.0,31.200000000000003
Qwen/Qwen3-0.6B,Qwen3,virtualhome,goal_interpretation_v4,0.8,18.4516,42.4332,25.7194,2.1841,4.7458,2.9915,8.1967,44.0252,13.8203,10.0,28.6979,14.8318,36.0,172.8
google/gemma-1.1-7b-it,Gemma,virtualhome,goal_interpretation_v4,8.5,13.2743,4.4379,6.6519,6.25,0.6734,1.2158,11.5385,14.9068,13.0081,11.6147,5.1508,7.1366,6.0,306.0
meta-llama/Llama-3.2-1B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,1.2,17.2043,6.1776,9.0909,0.8734,1.0,0.9324,5.4922,42.7419,9.7337,5.5167,12.1784,7.5936,,
deepseek-ai/DeepSeek-V3,DeepSeek,virtualhome,goal_interpretation_v4,684.5,21.966,59.5395,32.0922,32.13,62.6761,42.4821,13.7021,99.3827,24.0838,20.3682,69.3333,31.4865,14.8,60783.600000000006
01-ai/Yi-1.5-34B-Chat,Yi,virtualhome,goal_interpretation_v4,34.4,24.3354,35.3116,28.8136,18.3607,18.9189,18.6356,9.6277,47.4684,16.0085,15.8932,31.6056,21.1506,3.6,743.04
meta-llama/Meta-Llama-3-70B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,70.6,20.7831,60.8824,30.988,20.2194,43.2886,27.5641,10.429,87.037,18.6262,15.9745,59.625,25.1981,15.0,6353.999999999999
deepseek-ai/DeepSeek-R1-Distill-Llama-8B,DeepSeek,virtualhome,goal_interpretation_v4,8.0,22.6131,39.9408,28.877,6.4748,9.1837,7.5949,14.3293,58.3851,23.011,15.3293,32.2825,20.7877,15.0,720.0
LGAI-EXAONE/EXAONE-3.5-32B-Instruct,Exaone,virtualhome,goal_interpretation_v4,32.0,18.0952,59.19,27.717,12.6829,28.0576,17.4692,10.1879,92.1569,18.3474,13.4142,54.3883,21.5207,6.5,1248.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,DeepSeek,virtualhome,goal_interpretation_v4,1.8,7.3969,19.403,10.7106,1.0225,2.2321,1.4025,6.5141,30.0813,10.7091,5.3409,15.2846,7.9158,18.0,194.4
openai/gpt-oss-20b,GPT-OSS,virtualhome,goal_interpretation_v4,21.5,33.1976,56.0137,41.688,54.9763,46.5863,50.4348,13.5431,71.7391,22.7848,26.3782,55.7522,35.8124,,
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8,Llama,virtualhome,goal_interpretation_v4,401.6,35.1301,56.9277,43.4483,27.1144,37.0748,31.3218,24.0876,61.4907,34.6154,29.3856,50.4447,37.1375,22.0,53011.200000000004
meta-llama/Meta-Llama-3-8B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,8.0,19.9408,59.4118,29.8596,13.7214,22.1477,16.9448,10.6227,71.6049,18.5008,14.8492,48.0,22.6816,15.0,720.0
Qwen/Qwen3-1.7B,Qwen3,virtualhome,goal_interpretation_v4,2.0,23.7082,46.4286,31.3883,7.6487,9.1525,8.3333,8.1818,45.0,13.8462,13.4849,32.2377,19.0157,36.0,432.0
meta-llama/Llama-3.1-8B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,8.0,18.1646,56.4706,27.4875,12.9293,21.5488,16.1616,11.4099,86.9565,20.1729,14.2497,49.6241,22.1415,,
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,DeepSeek,virtualhome,goal_interpretation_v4,14.8,18.1382,55.5882,27.3517,7.8704,17.2881,10.8165,9.1111,76.875,16.2914,11.9408,45.6604,18.9309,18.0,1598.4
mistralai/Mistral-7B-Instruct-v0.2,Mistral,virtualhome,goal_interpretation_v4,7.2,12.806,46.4164,20.0738,7.6164,18.4932,10.7892,7.377,74.0506,13.4174,9.1451,41.319,14.9756,,
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,DeepSeek,virtualhome,goal_interpretation_v4,32.8,25.2778,54.1667,34.4697,33.7621,35.9589,34.8259,12.8599,83.75,22.2962,20.3087,53.4264,29.4303,18.0,3542.3999999999996
moonshotai/Kimi-K2-Instruct,Kimi,virtualhome,goal_interpretation_v4,1000.0,40.3226,15.015,21.8818,20.7358,21.2329,20.9814,12.9477,58.75,21.219,17.9286,26.242,21.303,15.5,93000.0
google/gemma-2-9b-it,Gemma,virtualhome,goal_interpretation_v4,9.2,19.5329,54.5994,28.7725,17.8571,1.7007,3.1056,8.8907,67.7019,15.7174,13.5701,37.6263,19.9465,8.0,441.59999999999997
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,DeepSeek,virtualhome,goal_interpretation_v4,7.6,18.2137,31.8043,23.1626,8.3333,4.947,6.2084,9.067,45.098,15.0985,12.4667,24.5085,16.5267,18.0,820.8
LGAI-EXAONE/EXAONE-Deep-32B,Exaone,virtualhome,goal_interpretation_v4,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
Qwen/Qwen-7B,Qwen,virtualhome,goal_interpretation_v4,7.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.4,100.8
google/gemma-3-27b-it,Gemma,virtualhome,goal_interpretation_v4,27.4,38.0952,54.1176,44.7145,29.0598,11.4094,16.3855,16.3392,97.5309,27.9894,23.9949,47.0,31.7702,14.0,2301.6
Qwen/Qwen3-4B,Qwen3,virtualhome,goal_interpretation_v4,4.0,24.3466,52.2124,33.2083,12.4098,28.8591,17.3562,8.1377,64.5963,14.4545,13.6027,45.99,20.9954,36.0,864.0
01-ai/Yi-Coder-9B-Chat,Yi,virtualhome,goal_interpretation_v4,8.8,19.2628,49.0909,27.6687,15.7989,30.8772,20.9026,10.7081,77.5,18.8164,14.6322,48.2581,22.4557,2.4,126.72
meta-llama/Llama-3.2-3B-Instruct,Llama-3,virtualhome,goal_interpretation_v4,3.2,17.0268,58.806,26.4075,3.3606,12.585,5.3047,9.7623,73.7179,17.2414,10.1572,44.4586,16.5364,,
