Model,Model Family,dataset,eval_type,Model Size (B),node_precision,node_recall,node_f1,edge_precision,edge_recall,edge_f1,action_precision,action_recall,action_f1,all_precision,all_recall,all_f1
meta-llama/Llama-3.3-70B-Instruct,Llama-3,virtualhome,goal_interpretation,70.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01-ai/Yi-Coder-1.5B-Chat,Yi,virtualhome,goal_interpretation,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Qwen/Qwen1.5-1.8B,Qwen1.5,virtualhome,goal_interpretation,1.8,5.5556,3.4483,4.2553,2.5,5.2632,3.3898,5.3435,53.8462,9.7222,4.7619,14.7541,7.2
mistralai/Mixtral-8x7B-Instruct-v0.1,Mistral,virtualhome,goal_interpretation,46.7,20.1717,45.6311,27.9762,17.1306,31.3725,22.1607,8.2171,79.1045,14.8876,13.3143,46.8481,20.7356
google/gemma-3-12b-it,Gemma,virtualhome,goal_interpretation,12.2,23.0961,54.4118,32.4277,35.4839,3.6913,6.6869,13.8235,87.037,23.8579,18.1965,42.125,25.4148
google/gemma-3-4b-it,Gemma,virtualhome,goal_interpretation,4.3,22.1889,43.5294,29.3942,5.5556,6.3758,5.9375,11.0201,91.358,19.6678,13.3929,39.375,19.9873
ibm-granite/granite-3.2-2b-instruct,Granite,virtualhome,goal_interpretation,2.5,14.9296,32.5153,20.4633,0.5917,1.7241,0.8811,7.281,83.1169,13.3891,7.214,31.039,11.7071
ibm-granite/granite-3.1-8b-instruct,Granite,virtualhome,goal_interpretation,8.2,23.2384,45.9941,30.8765,6.6202,12.8814,8.7457,7.6833,82.9114,14.0633,10.998,41.0127,17.3448
google/gemma-7b-it,Gemma,virtualhome,goal_interpretation,8.5,18.5185,71.4286,29.4118,12.0,23.0769,15.7895,3.7037,40.0,6.7797,9.434,40.0,15.2672
01-ai/Yi-1.5-6B-Chat,Yi,virtualhome,goal_interpretation,6.1,19.883,45.6376,27.6986,1.7115,5.6,2.6217,13.8889,101.6949,24.4399,11.4117,40.5405,17.81
Qwen/Qwen1.5-14B,Qwen1.5,virtualhome,goal_interpretation,14.2,19.9029,53.7118,29.0437,5.4217,12.6761,7.5949,7.9023,54.4554,13.8018,11.3135,37.7532,17.4098
google/gemma-2b-it,Gemma,virtualhome,goal_interpretation,2.5,5.5398,16.7382,8.3244,7.3579,11.4583,8.9613,6.2724,38.8889,10.8025,6.1499,18.6408,9.2486
Qwen/Qwen3-14B,Qwen3,virtualhome,goal_interpretation,14.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
google/gemma-2-9b,Gemma,virtualhome,goal_interpretation,9.2,100.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,33.3333,100.0,50.0
Qwen/Qwen1.5-72B,Qwen1.5,virtualhome,goal_interpretation,72.3,22.5931,55.873,32.1755,10.5516,16.7939,12.9602,9.5494,69.5312,16.7925,14.5207,43.8298,21.8143
openai/gpt-oss-120b,GPT-OSS,virtualhome,goal_interpretation,120.4,32.3529,58.7537,41.7281,52.2857,61.8243,56.6563,16.3373,77.0186,26.9565,29.3434,63.602,40.159
Qwen/Qwen1.5-4B,Qwen1.5,virtualhome,goal_interpretation,4.0,25.3687,40.3756,31.1594,1.937,4.1451,2.6403,12.8028,34.5794,18.6869,12.5841,25.5361,16.8597
meta-llama/Llama-3.1-70B-Instruct,Llama-3,virtualhome,goal_interpretation,70.6,20.0,42.8571,27.2727,37.5,33.3333,35.2941,31.0345,100.0,47.3684,28.8462,60.0,38.961
Qwen/Qwen-72B,Qwen,virtualhome,goal_interpretation,72.3,21.8155,49.3377,30.2538,10.4895,17.0455,12.987,9.899,70.0,17.3451,13.8915,41.3598,20.7977
ibm-granite/granite-3.3-2b-base,Granite,virtualhome,goal_interpretation,2.5,16.0,28.8,20.5714,2.0305,4.1667,2.7304,7.231,73.2143,13.1621,8.1901,29.2419,12.7962
Qwen/Qwen3-32B,Qwen3,virtualhome,goal_interpretation,32.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01-ai/Yi-1.5-9B,Yi,virtualhome,goal_interpretation,8.8,24.8322,44.5783,31.8966,10.0,17.8571,12.8205,10.8374,64.7059,18.5654,15.2655,39.8844,22.08
meta-llama/Llama-2-7b-hf,Llama-2,virtualhome,goal_interpretation,6.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Qwen/Qwen3-8B,Qwen3,virtualhome,goal_interpretation,8.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
google/gemma-2-27b-it,Gemma,virtualhome,goal_interpretation,27.2,34.4322,55.6213,42.5339,23.0769,26.1745,24.5283,12.4478,91.9753,21.9279,19.9423,52.005,28.8295
google/gemma-1.1-2b-it,Gemma,virtualhome,goal_interpretation,2.5,9.6296,18.0556,12.5604,0.0,0.0,0.0,5.4795,32.0,9.3567,6.0694,13.6364,8.4
meta-llama/Llama-4-Scout-17B-16E-Instruct,Llama,virtualhome,goal_interpretation,108.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,DeepSeek,virtualhome,goal_interpretation,70.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
google/gemma-2b,Gemma,virtualhome,goal_interpretation,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ibm-granite/granite-3.1-2b-base,Granite,virtualhome,goal_interpretation,2.5,17.6471,23.0769,20.0,2.0408,2.5974,2.2857,6.3745,44.4444,11.1498,8.3333,19.1176,11.6071
meta-llama/Llama-3.2-3B,Llama-3,virtualhome,goal_interpretation,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
deepseek-ai/DeepSeek-R1,DeepSeek,virtualhome,goal_interpretation,684.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
google/gemma-2-2b-it,Gemma,virtualhome,goal_interpretation,2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Qwen/Qwen3-0.6B,Qwen3,virtualhome,goal_interpretation,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
google/gemma-1.1-7b-it,Gemma,virtualhome,goal_interpretation,8.5,16.6667,50.0,25.0,0.0,0.0,0.0,28.5714,133.3333,47.0588,22.7273,83.3333,35.7143
meta-llama/Llama-3.2-1B-Instruct,Llama-3,virtualhome,goal_interpretation,1.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ibm-granite/granite-3.1-2b-instruct,Granite,virtualhome,goal_interpretation,2.5,14.5183,32.4242,20.0562,0.9756,2.7778,1.444,7.6509,91.6129,14.1223,7.53,33.2471,12.279
ibm-granite/granite-3.2-8b-instruct,Granite,virtualhome,goal_interpretation,8.2,23.5023,45.9459,31.0976,8.4428,15.411,10.9091,8.0024,84.9057,14.6262,11.5987,42.4745,18.2216
deepseek-ai/DeepSeek-V3,DeepSeek,virtualhome,goal_interpretation,684.5,22.9621,58.8235,33.0306,32.3843,61.0738,42.3256,13.04,100.6173,23.0878,20.3131,68.125,31.2949
meta-llama/Meta-Llama-3-70B,Llama-3,virtualhome,goal_interpretation,70.6,30.7692,88.8889,45.7143,15.3846,50.0,23.5294,12.5,100.0,22.2222,20.6349,81.25,32.9114
meta-llama/Llama-3.1-8B,Llama-3,virtualhome,goal_interpretation,8.0,17.9104,48.0,26.087,5.3571,15.0,7.8947,7.7922,50.0,13.4831,10.5,36.8421,16.3424
google/gemma-2-27b,Gemma,virtualhome,goal_interpretation,27.2,27.2727,50.0,35.2941,16.6667,40.0,23.5294,0.0,0.0,0.0,17.3913,44.4444,25.0
ibm-granite/granite-3.1-8b-base,Granite,virtualhome,goal_interpretation,8.2,28.2759,41.8367,33.7449,3.0303,2.7027,2.8571,7.4169,82.8571,13.615,11.9601,34.7826,17.7998
01-ai/Yi-1.5-34B-Chat,Yi,virtualhome,goal_interpretation,34.4,22.0238,55.8491,31.5902,16.4811,37.0,22.8043,9.6267,93.3333,17.4533,14.9603,56.1404,23.625
01-ai/Yi-1.5-34B,Yi,virtualhome,goal_interpretation,34.4,27.5109,51.6393,35.8974,9.2105,11.0526,10.0478,12.012,86.0215,21.0804,16.7899,43.074,24.1618
LGAI-EXAONE/EXAONE-3.5-32B-Instruct,Exaone,virtualhome,goal_interpretation,32.0,24.3553,61.1511,34.8361,16.2562,36.2637,22.449,10.0656,97.8723,18.254,16.2537,59.2058,25.5054
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,DeepSeek,virtualhome,goal_interpretation,1.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
openai/gpt-oss-20b,GPT-OSS,virtualhome,goal_interpretation,21.5,35.4756,58.4746,44.16,51.0101,46.7593,48.7923,14.24,80.1802,24.1848,27.0627,58.2593,36.9577
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8,Llama,virtualhome,goal_interpretation,401.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
meta-llama/Meta-Llama-3-8B-Instruct,Llama-3,virtualhome,goal_interpretation,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Qwen/Qwen3-1.7B,Qwen3,virtualhome,goal_interpretation,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
meta-llama/Llama-2-13b-hf,Llama-2,virtualhome,goal_interpretation,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ibm-granite/granite-3.3-2b-instruct,Granite,virtualhome,goal_interpretation,2.5,16.8142,28.7009,21.2054,1.7937,1.3652,1.5504,7.3139,72.9032,13.2941,9.087,27.2144,13.6247
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,DeepSeek,virtualhome,goal_interpretation,14.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mistralai/Mistral-7B-Instruct-v0.2,Mistral,virtualhome,goal_interpretation,7.2,13.5309,48.6111,21.1694,9.8361,26.2136,14.3046,9.0435,78.7879,16.2246,10.6263,47.4729,17.3655
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,DeepSeek,virtualhome,goal_interpretation,32.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
moonshotai/Kimi-K2-Instruct,Kimi,virtualhome,goal_interpretation,1000.0,34.8624,56.7164,43.1818,26.8797,48.3108,34.5411,13.9489,87.6543,24.0678,22.673,59.8991,32.8947
01-ai/Yi-Coder-1.5B,Yi,virtualhome,goal_interpretation,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
google/gemma-2-9b-it,Gemma,virtualhome,goal_interpretation,9.2,47.619,52.6316,50.0,0.0,0.0,0.0,3.0303,33.3333,5.5556,20.3704,40.7407,27.1605
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,DeepSeek,virtualhome,goal_interpretation,7.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LGAI-EXAONE/EXAONE-Deep-32B,Exaone,virtualhome,goal_interpretation,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01-ai/Yi-1.5-6B,Yi,virtualhome,goal_interpretation,6.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
google/gemma-3-12b-pt,Gemma,virtualhome,goal_interpretation,12.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
meta-llama/Meta-Llama-3-8B,Llama-3,virtualhome,goal_interpretation,8.0,16.6667,66.6667,26.6667,16.6667,9.0909,11.7647,0.0,0.0,0.0,10.8696,25.0,15.1515
google/gemma-3-27b-it,Gemma,virtualhome,goal_interpretation,27.4,33.3333,59.1176,42.6299,29.0,19.4631,23.2932,16.3,100.6173,28.0551,23.4054,52.75,32.4241
Qwen/Qwen3-4B,Qwen3,virtualhome,goal_interpretation,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Qwen/Qwen3-235B-A22B-Thinking-2507,Qwen3,virtualhome,goal_interpretation,235.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ibm-granite/granite-3.3-8b-instruct,Granite,virtualhome,goal_interpretation,8.2,26.4,40.2439,31.8841,22.0339,22.807,22.4138,5.4237,51.6129,9.816,12.9436,36.4706,19.1063
google/gemma-3-4b-pt,Gemma,virtualhome,goal_interpretation,4.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01-ai/Yi-Coder-9B-Chat,Yi,virtualhome,goal_interpretation,8.8,19.833,52.7778,28.8316,15.1007,29.8013,20.0445,11.463,83.5165,20.1592,15.0,51.1848,23.2009
meta-llama/Llama-3.1-70B,Llama-3,virtualhome,goal_interpretation,70.6,23.1638,56.1644,32.8,29.9065,50.0,37.4269,13.3333,66.6667,22.2222,20.2358,56.5934,29.8119
google/gemma-2-2b,Gemma,virtualhome,goal_interpretation,2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
google/gemma-7b,Gemma,virtualhome,goal_interpretation,8.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01-ai/Yi-Coder-9B,Yi,virtualhome,goal_interpretation,8.8,26.3889,51.3514,34.8624,0.0,0.0,0.0,13.9535,70.5882,23.301,17.5141,34.8315,23.3083
ibm-granite/granite-3.3-8b-base,Granite,virtualhome,goal_interpretation,8.2,22.2989,45.3271,29.8921,4.386,5.0761,4.7059,6.3749,76.1364,11.7647,10.1517,34.8697,15.7253
meta-llama/Llama-2-70b-hf,Llama-2,virtualhome,goal_interpretation,69.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
meta-llama/Llama-3.2-3B-Instruct,Llama-3,virtualhome,goal_interpretation,3.2,17.9601,58.2734,27.4576,3.1674,11.9658,5.0089,9.7561,72.1311,17.1875,10.3423,43.8486,16.7369
