[
  {
    "Model":"meta-llama\/Llama-3.3-70B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":70.6,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"01-ai\/Yi-Coder-1.5B-Chat",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.5,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"Qwen\/Qwen1.5-1.8B",
    "Model Family":"Qwen1.5",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.8,
    "node_precision":5.5556,
    "node_recall":3.4483,
    "node_f1":4.2553,
    "edge_precision":2.5,
    "edge_recall":5.2632,
    "edge_f1":3.3898,
    "action_precision":5.3435,
    "action_recall":53.8462,
    "action_f1":9.7222,
    "all_precision":4.7619,
    "all_recall":14.7541,
    "all_f1":7.2
  },
  {
    "Model":"mistralai\/Mixtral-8x7B-Instruct-v0.1",
    "Model Family":"Mistral",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":46.7,
    "node_precision":20.1717,
    "node_recall":45.6311,
    "node_f1":27.9762,
    "edge_precision":17.1306,
    "edge_recall":31.3725,
    "edge_f1":22.1607,
    "action_precision":8.2171,
    "action_recall":79.1045,
    "action_f1":14.8876,
    "all_precision":13.3143,
    "all_recall":46.8481,
    "all_f1":20.7356
  },
  {
    "Model":"google\/gemma-3-12b-it",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":12.2,
    "node_precision":23.0961,
    "node_recall":54.4118,
    "node_f1":32.4277,
    "edge_precision":35.4839,
    "edge_recall":3.6913,
    "edge_f1":6.6869,
    "action_precision":13.8235,
    "action_recall":87.037,
    "action_f1":23.8579,
    "all_precision":18.1965,
    "all_recall":42.125,
    "all_f1":25.4148
  },
  {
    "Model":"google\/gemma-3-4b-it",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":4.3,
    "node_precision":22.1889,
    "node_recall":43.5294,
    "node_f1":29.3942,
    "edge_precision":5.5556,
    "edge_recall":6.3758,
    "edge_f1":5.9375,
    "action_precision":11.0201,
    "action_recall":91.358,
    "action_f1":19.6678,
    "all_precision":13.3929,
    "all_recall":39.375,
    "all_f1":19.9873
  },
  {
    "Model":"ibm-granite\/granite-3.2-2b-instruct",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":2.5,
    "node_precision":14.9296,
    "node_recall":32.5153,
    "node_f1":20.4633,
    "edge_precision":0.5917,
    "edge_recall":1.7241,
    "edge_f1":0.8811,
    "action_precision":7.281,
    "action_recall":83.1169,
    "action_f1":13.3891,
    "all_precision":7.214,
    "all_recall":31.039,
    "all_f1":11.7071
  },
  {
    "Model":"ibm-granite\/granite-3.1-8b-instruct",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.2,
    "node_precision":23.2384,
    "node_recall":45.9941,
    "node_f1":30.8765,
    "edge_precision":6.6202,
    "edge_recall":12.8814,
    "edge_f1":8.7457,
    "action_precision":7.6833,
    "action_recall":82.9114,
    "action_f1":14.0633,
    "all_precision":10.998,
    "all_recall":41.0127,
    "all_f1":17.3448
  },
  {
    "Model":"google\/gemma-7b-it",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.5,
    "node_precision":18.5185,
    "node_recall":71.4286,
    "node_f1":29.4118,
    "edge_precision":12.0,
    "edge_recall":23.0769,
    "edge_f1":15.7895,
    "action_precision":3.7037,
    "action_recall":40.0,
    "action_f1":6.7797,
    "all_precision":9.434,
    "all_recall":40.0,
    "all_f1":15.2672
  },
  {
    "Model":"01-ai\/Yi-1.5-6B-Chat",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":6.1,
    "node_precision":19.883,
    "node_recall":45.6376,
    "node_f1":27.6986,
    "edge_precision":1.7115,
    "edge_recall":5.6,
    "edge_f1":2.6217,
    "action_precision":13.8889,
    "action_recall":101.6949,
    "action_f1":24.4399,
    "all_precision":11.4117,
    "all_recall":40.5405,
    "all_f1":17.81
  },
  {
    "Model":"Qwen\/Qwen1.5-14B",
    "Model Family":"Qwen1.5",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":14.2,
    "node_precision":19.9029,
    "node_recall":53.7118,
    "node_f1":29.0437,
    "edge_precision":5.4217,
    "edge_recall":12.6761,
    "edge_f1":7.5949,
    "action_precision":7.9023,
    "action_recall":54.4554,
    "action_f1":13.8018,
    "all_precision":11.3135,
    "all_recall":37.7532,
    "all_f1":17.4098
  },
  {
    "Model":"google\/gemma-2b-it",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":2.5,
    "node_precision":5.5398,
    "node_recall":16.7382,
    "node_f1":8.3244,
    "edge_precision":7.3579,
    "edge_recall":11.4583,
    "edge_f1":8.9613,
    "action_precision":6.2724,
    "action_recall":38.8889,
    "action_f1":10.8025,
    "all_precision":6.1499,
    "all_recall":18.6408,
    "all_f1":9.2486
  },
  {
    "Model":"Qwen\/Qwen3-14B",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":14.8,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"google\/gemma-2-9b",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":9.2,
    "node_precision":100.0,
    "node_recall":100.0,
    "node_f1":100.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":33.3333,
    "all_recall":100.0,
    "all_f1":50.0
  },
  {
    "Model":"Qwen\/Qwen1.5-72B",
    "Model Family":"Qwen1.5",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":72.3,
    "node_precision":22.5931,
    "node_recall":55.873,
    "node_f1":32.1755,
    "edge_precision":10.5516,
    "edge_recall":16.7939,
    "edge_f1":12.9602,
    "action_precision":9.5494,
    "action_recall":69.5312,
    "action_f1":16.7925,
    "all_precision":14.5207,
    "all_recall":43.8298,
    "all_f1":21.8143
  },
  {
    "Model":"openai\/gpt-oss-120b",
    "Model Family":"GPT-OSS",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":120.4,
    "node_precision":32.3529,
    "node_recall":58.7537,
    "node_f1":41.7281,
    "edge_precision":52.2857,
    "edge_recall":61.8243,
    "edge_f1":56.6563,
    "action_precision":16.3373,
    "action_recall":77.0186,
    "action_f1":26.9565,
    "all_precision":29.3434,
    "all_recall":63.602,
    "all_f1":40.159
  },
  {
    "Model":"Qwen\/Qwen1.5-4B",
    "Model Family":"Qwen1.5",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":4.0,
    "node_precision":25.3687,
    "node_recall":40.3756,
    "node_f1":31.1594,
    "edge_precision":1.937,
    "edge_recall":4.1451,
    "edge_f1":2.6403,
    "action_precision":12.8028,
    "action_recall":34.5794,
    "action_f1":18.6869,
    "all_precision":12.5841,
    "all_recall":25.5361,
    "all_f1":16.8597
  },
  {
    "Model":"meta-llama\/Llama-3.1-70B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":70.6,
    "node_precision":20.0,
    "node_recall":42.8571,
    "node_f1":27.2727,
    "edge_precision":37.5,
    "edge_recall":33.3333,
    "edge_f1":35.2941,
    "action_precision":31.0345,
    "action_recall":100.0,
    "action_f1":47.3684,
    "all_precision":28.8462,
    "all_recall":60.0,
    "all_f1":38.961
  },
  {
    "Model":"Qwen\/Qwen-72B",
    "Model Family":"Qwen",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":72.3,
    "node_precision":21.8155,
    "node_recall":49.3377,
    "node_f1":30.2538,
    "edge_precision":10.4895,
    "edge_recall":17.0455,
    "edge_f1":12.987,
    "action_precision":9.899,
    "action_recall":70.0,
    "action_f1":17.3451,
    "all_precision":13.8915,
    "all_recall":41.3598,
    "all_f1":20.7977
  },
  {
    "Model":"ibm-granite\/granite-3.3-2b-base",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":2.5,
    "node_precision":16.0,
    "node_recall":28.8,
    "node_f1":20.5714,
    "edge_precision":2.0305,
    "edge_recall":4.1667,
    "edge_f1":2.7304,
    "action_precision":7.231,
    "action_recall":73.2143,
    "action_f1":13.1621,
    "all_precision":8.1901,
    "all_recall":29.2419,
    "all_f1":12.7962
  },
  {
    "Model":"Qwen\/Qwen3-32B",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":32.8,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"01-ai\/Yi-1.5-9B",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.8,
    "node_precision":24.8322,
    "node_recall":44.5783,
    "node_f1":31.8966,
    "edge_precision":10.0,
    "edge_recall":17.8571,
    "edge_f1":12.8205,
    "action_precision":10.8374,
    "action_recall":64.7059,
    "action_f1":18.5654,
    "all_precision":15.2655,
    "all_recall":39.8844,
    "all_f1":22.08
  },
  {
    "Model":"meta-llama\/Llama-2-7b-hf",
    "Model Family":"Llama-2",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":6.7,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"Qwen\/Qwen3-8B",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.2,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"google\/gemma-2-27b-it",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":27.2,
    "node_precision":34.4322,
    "node_recall":55.6213,
    "node_f1":42.5339,
    "edge_precision":23.0769,
    "edge_recall":26.1745,
    "edge_f1":24.5283,
    "action_precision":12.4478,
    "action_recall":91.9753,
    "action_f1":21.9279,
    "all_precision":19.9423,
    "all_recall":52.005,
    "all_f1":28.8295
  },
  {
    "Model":"google\/gemma-1.1-2b-it",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":2.5,
    "node_precision":9.6296,
    "node_recall":18.0556,
    "node_f1":12.5604,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":5.4795,
    "action_recall":32.0,
    "action_f1":9.3567,
    "all_precision":6.0694,
    "all_recall":13.6364,
    "all_f1":8.4
  },
  {
    "Model":"meta-llama\/Llama-4-Scout-17B-16E-Instruct",
    "Model Family":"Llama",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":108.6,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Llama-70B",
    "Model Family":"DeepSeek",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":70.6,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"google\/gemma-2b",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":2.5,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"ibm-granite\/granite-3.1-2b-base",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":2.5,
    "node_precision":17.6471,
    "node_recall":23.0769,
    "node_f1":20.0,
    "edge_precision":2.0408,
    "edge_recall":2.5974,
    "edge_f1":2.2857,
    "action_precision":6.3745,
    "action_recall":44.4444,
    "action_f1":11.1498,
    "all_precision":8.3333,
    "all_recall":19.1176,
    "all_f1":11.6071
  },
  {
    "Model":"meta-llama\/Llama-3.2-3B",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":3.2,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1",
    "Model Family":"DeepSeek",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":684.5,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"google\/gemma-2-2b-it",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":2.6,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"Qwen\/Qwen3-0.6B",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":0.8,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"google\/gemma-1.1-7b-it",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.5,
    "node_precision":16.6667,
    "node_recall":50.0,
    "node_f1":25.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":28.5714,
    "action_recall":133.3333,
    "action_f1":47.0588,
    "all_precision":22.7273,
    "all_recall":83.3333,
    "all_f1":35.7143
  },
  {
    "Model":"meta-llama\/Llama-3.2-1B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.2,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"ibm-granite\/granite-3.1-2b-instruct",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":2.5,
    "node_precision":14.5183,
    "node_recall":32.4242,
    "node_f1":20.0562,
    "edge_precision":0.9756,
    "edge_recall":2.7778,
    "edge_f1":1.444,
    "action_precision":7.6509,
    "action_recall":91.6129,
    "action_f1":14.1223,
    "all_precision":7.53,
    "all_recall":33.2471,
    "all_f1":12.279
  },
  {
    "Model":"ibm-granite\/granite-3.2-8b-instruct",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.2,
    "node_precision":23.5023,
    "node_recall":45.9459,
    "node_f1":31.0976,
    "edge_precision":8.4428,
    "edge_recall":15.411,
    "edge_f1":10.9091,
    "action_precision":8.0024,
    "action_recall":84.9057,
    "action_f1":14.6262,
    "all_precision":11.5987,
    "all_recall":42.4745,
    "all_f1":18.2216
  },
  {
    "Model":"deepseek-ai\/DeepSeek-V3",
    "Model Family":"DeepSeek",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":684.5,
    "node_precision":22.9621,
    "node_recall":58.8235,
    "node_f1":33.0306,
    "edge_precision":32.3843,
    "edge_recall":61.0738,
    "edge_f1":42.3256,
    "action_precision":13.04,
    "action_recall":100.6173,
    "action_f1":23.0878,
    "all_precision":20.3131,
    "all_recall":68.125,
    "all_f1":31.2949
  },
  {
    "Model":"meta-llama\/Meta-Llama-3-70B",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":70.6,
    "node_precision":30.7692,
    "node_recall":88.8889,
    "node_f1":45.7143,
    "edge_precision":15.3846,
    "edge_recall":50.0,
    "edge_f1":23.5294,
    "action_precision":12.5,
    "action_recall":100.0,
    "action_f1":22.2222,
    "all_precision":20.6349,
    "all_recall":81.25,
    "all_f1":32.9114
  },
  {
    "Model":"meta-llama\/Llama-3.1-8B",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.0,
    "node_precision":17.9104,
    "node_recall":48.0,
    "node_f1":26.087,
    "edge_precision":5.3571,
    "edge_recall":15.0,
    "edge_f1":7.8947,
    "action_precision":7.7922,
    "action_recall":50.0,
    "action_f1":13.4831,
    "all_precision":10.5,
    "all_recall":36.8421,
    "all_f1":16.3424
  },
  {
    "Model":"google\/gemma-2-27b",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":27.2,
    "node_precision":27.2727,
    "node_recall":50.0,
    "node_f1":35.2941,
    "edge_precision":16.6667,
    "edge_recall":40.0,
    "edge_f1":23.5294,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":17.3913,
    "all_recall":44.4444,
    "all_f1":25.0
  },
  {
    "Model":"ibm-granite\/granite-3.1-8b-base",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.2,
    "node_precision":28.2759,
    "node_recall":41.8367,
    "node_f1":33.7449,
    "edge_precision":3.0303,
    "edge_recall":2.7027,
    "edge_f1":2.8571,
    "action_precision":7.4169,
    "action_recall":82.8571,
    "action_f1":13.615,
    "all_precision":11.9601,
    "all_recall":34.7826,
    "all_f1":17.7998
  },
  {
    "Model":"01-ai\/Yi-1.5-34B-Chat",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":34.4,
    "node_precision":22.0238,
    "node_recall":55.8491,
    "node_f1":31.5902,
    "edge_precision":16.4811,
    "edge_recall":37.0,
    "edge_f1":22.8043,
    "action_precision":9.6267,
    "action_recall":93.3333,
    "action_f1":17.4533,
    "all_precision":14.9603,
    "all_recall":56.1404,
    "all_f1":23.625
  },
  {
    "Model":"01-ai\/Yi-1.5-34B",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":34.4,
    "node_precision":27.5109,
    "node_recall":51.6393,
    "node_f1":35.8974,
    "edge_precision":9.2105,
    "edge_recall":11.0526,
    "edge_f1":10.0478,
    "action_precision":12.012,
    "action_recall":86.0215,
    "action_f1":21.0804,
    "all_precision":16.7899,
    "all_recall":43.074,
    "all_f1":24.1618
  },
  {
    "Model":"LGAI-EXAONE\/EXAONE-3.5-32B-Instruct",
    "Model Family":"Exaone",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":32.0,
    "node_precision":24.3553,
    "node_recall":61.1511,
    "node_f1":34.8361,
    "edge_precision":16.2562,
    "edge_recall":36.2637,
    "edge_f1":22.449,
    "action_precision":10.0656,
    "action_recall":97.8723,
    "action_f1":18.254,
    "all_precision":16.2537,
    "all_recall":59.2058,
    "all_f1":25.5054
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-1.5B",
    "Model Family":"DeepSeek",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.8,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"openai\/gpt-oss-20b",
    "Model Family":"GPT-OSS",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":21.5,
    "node_precision":35.4756,
    "node_recall":58.4746,
    "node_f1":44.16,
    "edge_precision":51.0101,
    "edge_recall":46.7593,
    "edge_f1":48.7923,
    "action_precision":14.24,
    "action_recall":80.1802,
    "action_f1":24.1848,
    "all_precision":27.0627,
    "all_recall":58.2593,
    "all_f1":36.9577
  },
  {
    "Model":"meta-llama\/Llama-4-Maverick-17B-128E-Instruct-FP8",
    "Model Family":"Llama",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":401.6,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"meta-llama\/Meta-Llama-3-8B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.0,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"Qwen\/Qwen3-1.7B",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":2.0,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"meta-llama\/Llama-2-13b-hf",
    "Model Family":"Llama-2",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":13.0,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"ibm-granite\/granite-3.3-2b-instruct",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":2.5,
    "node_precision":16.8142,
    "node_recall":28.7009,
    "node_f1":21.2054,
    "edge_precision":1.7937,
    "edge_recall":1.3652,
    "edge_f1":1.5504,
    "action_precision":7.3139,
    "action_recall":72.9032,
    "action_f1":13.2941,
    "all_precision":9.087,
    "all_recall":27.2144,
    "all_f1":13.6247
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-14B",
    "Model Family":"DeepSeek",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":14.8,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"mistralai\/Mistral-7B-Instruct-v0.2",
    "Model Family":"Mistral",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":7.2,
    "node_precision":13.5309,
    "node_recall":48.6111,
    "node_f1":21.1694,
    "edge_precision":9.8361,
    "edge_recall":26.2136,
    "edge_f1":14.3046,
    "action_precision":9.0435,
    "action_recall":78.7879,
    "action_f1":16.2246,
    "all_precision":10.6263,
    "all_recall":47.4729,
    "all_f1":17.3655
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-32B",
    "Model Family":"DeepSeek",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":32.8,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"moonshotai\/Kimi-K2-Instruct",
    "Model Family":"Kimi",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1000.0,
    "node_precision":34.8624,
    "node_recall":56.7164,
    "node_f1":43.1818,
    "edge_precision":26.8797,
    "edge_recall":48.3108,
    "edge_f1":34.5411,
    "action_precision":13.9489,
    "action_recall":87.6543,
    "action_f1":24.0678,
    "all_precision":22.673,
    "all_recall":59.8991,
    "all_f1":32.8947
  },
  {
    "Model":"01-ai\/Yi-Coder-1.5B",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.5,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"google\/gemma-2-9b-it",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":9.2,
    "node_precision":47.619,
    "node_recall":52.6316,
    "node_f1":50.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":3.0303,
    "action_recall":33.3333,
    "action_f1":5.5556,
    "all_precision":20.3704,
    "all_recall":40.7407,
    "all_f1":27.1605
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-7B",
    "Model Family":"DeepSeek",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":7.6,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"LGAI-EXAONE\/EXAONE-Deep-32B",
    "Model Family":"Exaone",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":32.0,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"01-ai\/Yi-1.5-6B",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":6.1,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"google\/gemma-3-12b-pt",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":12.2,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"meta-llama\/Meta-Llama-3-8B",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.0,
    "node_precision":16.6667,
    "node_recall":66.6667,
    "node_f1":26.6667,
    "edge_precision":16.6667,
    "edge_recall":9.0909,
    "edge_f1":11.7647,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":10.8696,
    "all_recall":25.0,
    "all_f1":15.1515
  },
  {
    "Model":"google\/gemma-3-27b-it",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":27.4,
    "node_precision":33.3333,
    "node_recall":59.1176,
    "node_f1":42.6299,
    "edge_precision":29.0,
    "edge_recall":19.4631,
    "edge_f1":23.2932,
    "action_precision":16.3,
    "action_recall":100.6173,
    "action_f1":28.0551,
    "all_precision":23.4054,
    "all_recall":52.75,
    "all_f1":32.4241
  },
  {
    "Model":"Qwen\/Qwen3-4B",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":4.0,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"Qwen\/Qwen3-235B-A22B-Thinking-2507",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":235.1,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"ibm-granite\/granite-3.3-8b-instruct",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.2,
    "node_precision":26.4,
    "node_recall":40.2439,
    "node_f1":31.8841,
    "edge_precision":22.0339,
    "edge_recall":22.807,
    "edge_f1":22.4138,
    "action_precision":5.4237,
    "action_recall":51.6129,
    "action_f1":9.816,
    "all_precision":12.9436,
    "all_recall":36.4706,
    "all_f1":19.1063
  },
  {
    "Model":"google\/gemma-3-4b-pt",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":4.3,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"01-ai\/Yi-Coder-9B-Chat",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.8,
    "node_precision":19.833,
    "node_recall":52.7778,
    "node_f1":28.8316,
    "edge_precision":15.1007,
    "edge_recall":29.8013,
    "edge_f1":20.0445,
    "action_precision":11.463,
    "action_recall":83.5165,
    "action_f1":20.1592,
    "all_precision":15.0,
    "all_recall":51.1848,
    "all_f1":23.2009
  },
  {
    "Model":"meta-llama\/Llama-3.1-70B",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":70.6,
    "node_precision":23.1638,
    "node_recall":56.1644,
    "node_f1":32.8,
    "edge_precision":29.9065,
    "edge_recall":50.0,
    "edge_f1":37.4269,
    "action_precision":13.3333,
    "action_recall":66.6667,
    "action_f1":22.2222,
    "all_precision":20.2358,
    "all_recall":56.5934,
    "all_f1":29.8119
  },
  {
    "Model":"google\/gemma-2-2b",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":2.6,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"google\/gemma-7b",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.5,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"01-ai\/Yi-Coder-9B",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.8,
    "node_precision":26.3889,
    "node_recall":51.3514,
    "node_f1":34.8624,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":13.9535,
    "action_recall":70.5882,
    "action_f1":23.301,
    "all_precision":17.5141,
    "all_recall":34.8315,
    "all_f1":23.3083
  },
  {
    "Model":"ibm-granite\/granite-3.3-8b-base",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.2,
    "node_precision":22.2989,
    "node_recall":45.3271,
    "node_f1":29.8921,
    "edge_precision":4.386,
    "edge_recall":5.0761,
    "edge_f1":4.7059,
    "action_precision":6.3749,
    "action_recall":76.1364,
    "action_f1":11.7647,
    "all_precision":10.1517,
    "all_recall":34.8697,
    "all_f1":15.7253
  },
  {
    "Model":"meta-llama\/Llama-2-70b-hf",
    "Model Family":"Llama-2",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":69.0,
    "node_precision":0.0,
    "node_recall":0.0,
    "node_f1":0.0,
    "edge_precision":0.0,
    "edge_recall":0.0,
    "edge_f1":0.0,
    "action_precision":0.0,
    "action_recall":0.0,
    "action_f1":0.0,
    "all_precision":0.0,
    "all_recall":0.0,
    "all_f1":0.0
  },
  {
    "Model":"meta-llama\/Llama-3.2-3B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"goal_interpretation",
    "Model Size (B)":3.2,
    "node_precision":17.9601,
    "node_recall":58.2734,
    "node_f1":27.4576,
    "edge_precision":3.1674,
    "edge_recall":11.9658,
    "edge_f1":5.0089,
    "action_precision":9.7561,
    "action_recall":72.1311,
    "action_f1":17.1875,
    "all_precision":10.3423,
    "all_recall":43.8486,
    "all_f1":16.7369
  }
]