[
  {
    "Model":"tiiuae\/falcon-40b",
    "Model Family":"falcon",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":41.8,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":105.5921,
    "hallucination_error":0.9868,
    "predicate_argument_number_error":0.3289,
    "wrong_order_error":0.0,
    "missing_step_error":0.3289,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Llama-3.3-70B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":70.6,
    "task_success_rate":61.9672,
    "state_goal":55.7554,
    "relation_goal":77.7778,
    "action_goal":54.7297,
    "total_goal":62.0462,
    "execution_success_rate":68.5,
    "parsing_error":0.9836,
    "hallucination_error":15.7377,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":13.7705,
    "affordance_error":0.9836,
    "additional_step_error":1.9672
  },
  {
    "Model":"baichuan-inc\/Baichuan2-7B-Base",
    "Model Family":"Baichuan",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.0,
    "task_success_rate":0.6579,
    "state_goal":2.1583,
    "relation_goal":0.5587,
    "action_goal":0.6757,
    "total_goal":1.3223,
    "execution_success_rate":1.0,
    "parsing_error":48.0263,
    "hallucination_error":38.1579,
    "predicate_argument_number_error":10.5263,
    "wrong_order_error":0.3289,
    "missing_step_error":9.8684,
    "affordance_error":0.9868,
    "additional_step_error":0.0
  },
  {
    "Model":"01-ai\/Yi-Coder-1.5B-Chat",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":1.5,
    "task_success_rate":0.6601,
    "state_goal":12.2744,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":5.6291,
    "execution_success_rate":2.6,
    "parsing_error":4.2904,
    "hallucination_error":45.5446,
    "predicate_argument_number_error":17.1617,
    "wrong_order_error":0.0,
    "missing_step_error":28.3828,
    "affordance_error":2.9703,
    "additional_step_error":0.33
  },
  {
    "Model":"Qwen\/Qwen1.5-1.8B",
    "Model Family":"Qwen1.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":1.8,
    "task_success_rate":0.0,
    "state_goal":0.3597,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.165,
    "execution_success_rate":0.0,
    "parsing_error":22.2951,
    "hallucination_error":81.9672,
    "predicate_argument_number_error":0.3279,
    "wrong_order_error":0.0,
    "missing_step_error":0.3279,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"mistralai\/Mixtral-8x7B-Instruct-v0.1",
    "Model Family":"Mistral",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":46.7,
    "task_success_rate":30.8197,
    "state_goal":42.446,
    "relation_goal":22.7778,
    "action_goal":18.9189,
    "total_goal":30.8581,
    "execution_success_rate":30.8,
    "parsing_error":2.623,
    "hallucination_error":22.2951,
    "predicate_argument_number_error":1.6393,
    "wrong_order_error":0.9836,
    "missing_step_error":39.3443,
    "affordance_error":2.2951,
    "additional_step_error":3.2787
  },
  {
    "Model":"google\/gemma-3-12b-it",
    "Model Family":"Gemma-3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":12.2,
    "task_success_rate":49.8361,
    "state_goal":69.4245,
    "relation_goal":50.5556,
    "action_goal":33.7838,
    "total_goal":55.1155,
    "execution_success_rate":51.8,
    "parsing_error":0.0,
    "hallucination_error":4.5902,
    "predicate_argument_number_error":1.3115,
    "wrong_order_error":0.0,
    "missing_step_error":38.3607,
    "affordance_error":3.9344,
    "additional_step_error":8.1967
  },
  {
    "Model":"google\/gemma-3-4b-it",
    "Model Family":"Gemma-3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":4.3,
    "task_success_rate":40.3279,
    "state_goal":55.7554,
    "relation_goal":43.3333,
    "action_goal":27.7027,
    "total_goal":45.2145,
    "execution_success_rate":51.8,
    "parsing_error":1.6393,
    "hallucination_error":5.9016,
    "predicate_argument_number_error":7.2131,
    "wrong_order_error":2.2951,
    "missing_step_error":25.9016,
    "affordance_error":5.2459,
    "additional_step_error":1.6393
  },
  {
    "Model":"ibm-granite\/granite-3.2-2b-instruct",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":4.5902,
    "state_goal":9.7122,
    "relation_goal":7.7778,
    "action_goal":6.7568,
    "total_goal":8.4158,
    "execution_success_rate":5.9,
    "parsing_error":15.082,
    "hallucination_error":39.6721,
    "predicate_argument_number_error":3.6066,
    "wrong_order_error":0.0,
    "missing_step_error":24.5902,
    "affordance_error":12.1311,
    "additional_step_error":0.0
  },
  {
    "Model":"ibm-granite\/granite-3.1-8b-instruct",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.2,
    "task_success_rate":38.6885,
    "state_goal":47.8417,
    "relation_goal":37.7778,
    "action_goal":29.0541,
    "total_goal":40.264,
    "execution_success_rate":38.4,
    "parsing_error":8.1967,
    "hallucination_error":13.4426,
    "predicate_argument_number_error":1.6393,
    "wrong_order_error":0.3279,
    "missing_step_error":34.0984,
    "affordance_error":3.9344,
    "additional_step_error":1.3115
  },
  {
    "Model":"google\/gemma-7b-it",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.5,
    "task_success_rate":4.5902,
    "state_goal":16.1871,
    "relation_goal":5.5556,
    "action_goal":0.6757,
    "total_goal":9.2409,
    "execution_success_rate":7.2,
    "parsing_error":24.2623,
    "hallucination_error":25.2459,
    "predicate_argument_number_error":3.9344,
    "wrong_order_error":0.0,
    "missing_step_error":33.7705,
    "affordance_error":5.5738,
    "additional_step_error":0.0
  },
  {
    "Model":"01-ai\/Yi-1.5-6B-Chat",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":6.1,
    "task_success_rate":10.8911,
    "state_goal":13.3574,
    "relation_goal":18.9944,
    "action_goal":3.3784,
    "total_goal":12.5828,
    "execution_success_rate":13.5,
    "parsing_error":7.9208,
    "hallucination_error":33.6634,
    "predicate_argument_number_error":2.3102,
    "wrong_order_error":0.0,
    "missing_step_error":43.8944,
    "affordance_error":1.6502,
    "additional_step_error":0.33
  },
  {
    "Model":"Qwen\/Qwen1.5-7B",
    "Model Family":"Qwen1.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.7,
    "task_success_rate":5.5738,
    "state_goal":20.5036,
    "relation_goal":2.2222,
    "action_goal":2.027,
    "total_goal":10.5611,
    "execution_success_rate":4.6,
    "parsing_error":12.1311,
    "hallucination_error":36.0656,
    "predicate_argument_number_error":7.541,
    "wrong_order_error":1.9672,
    "missing_step_error":26.8852,
    "affordance_error":13.1148,
    "additional_step_error":11.4754
  },
  {
    "Model":"Qwen\/Qwen1.5-14B",
    "Model Family":"Qwen1.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":14.2,
    "task_success_rate":18.6885,
    "state_goal":19.4245,
    "relation_goal":23.8889,
    "action_goal":24.3243,
    "total_goal":21.9472,
    "execution_success_rate":24.6,
    "parsing_error":6.2295,
    "hallucination_error":32.1311,
    "predicate_argument_number_error":1.9672,
    "wrong_order_error":0.9836,
    "missing_step_error":20.3279,
    "affordance_error":13.7705,
    "additional_step_error":25.9016
  },
  {
    "Model":"bigcode\/starcoder2-3b",
    "Model Family":"starcoder2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":3.0,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"google\/gemma-2b-it",
    "Model Family":"Gemma-2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":5.2459,
    "state_goal":4.3165,
    "relation_goal":6.6667,
    "action_goal":10.1351,
    "total_goal":6.4356,
    "execution_success_rate":7.5,
    "parsing_error":9.5082,
    "hallucination_error":69.1803,
    "predicate_argument_number_error":0.9836,
    "wrong_order_error":0.0,
    "missing_step_error":12.459,
    "affordance_error":0.6557,
    "additional_step_error":0.0
  },
  {
    "Model":"Qwen\/Qwen3-14B",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":14.8,
    "task_success_rate":45.5738,
    "state_goal":55.7554,
    "relation_goal":41.1111,
    "action_goal":14.1892,
    "total_goal":41.2541,
    "execution_success_rate":43.0,
    "parsing_error":43.6066,
    "hallucination_error":3.9344,
    "predicate_argument_number_error":0.3279,
    "wrong_order_error":6.5574,
    "missing_step_error":4.5902,
    "affordance_error":0.0,
    "additional_step_error":19.3443
  },
  {
    "Model":"google\/gemma-2-9b",
    "Model Family":"Gemma-2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":9.2,
    "task_success_rate":1.3115,
    "state_goal":3.5971,
    "relation_goal":1.6667,
    "action_goal":0.6757,
    "total_goal":2.3102,
    "execution_success_rate":1.3,
    "parsing_error":45.5738,
    "hallucination_error":47.8689,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.3279,
    "missing_step_error":4.5902,
    "affordance_error":0.6557,
    "additional_step_error":6.5574
  },
  {
    "Model":"Qwen\/Qwen1.5-72B",
    "Model Family":"Qwen1.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":72.3,
    "task_success_rate":6.8852,
    "state_goal":10.7914,
    "relation_goal":8.3333,
    "action_goal":4.7297,
    "total_goal":8.5809,
    "execution_success_rate":11.1,
    "parsing_error":0.3279,
    "hallucination_error":33.7705,
    "predicate_argument_number_error":13.4426,
    "wrong_order_error":0.0,
    "missing_step_error":35.082,
    "affordance_error":6.2295,
    "additional_step_error":0.6557
  },
  {
    "Model":"openai\/gpt-oss-120b",
    "Model Family":"GPT-OSS",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":120.4,
    "task_success_rate":74.0984,
    "state_goal":87.4101,
    "relation_goal":78.3333,
    "action_goal":62.1622,
    "total_goal":78.5479,
    "execution_success_rate":79.3,
    "parsing_error":0.6557,
    "hallucination_error":2.9508,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":17.377,
    "affordance_error":0.0,
    "additional_step_error":2.9508
  },
  {
    "Model":"tiiuae\/falcon-11B",
    "Model Family":"falcon",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":11.1,
    "task_success_rate":1.3158,
    "state_goal":1.7986,
    "relation_goal":1.676,
    "action_goal":1.3514,
    "total_goal":1.6529,
    "execution_success_rate":3.0,
    "parsing_error":49.0132,
    "hallucination_error":36.8421,
    "predicate_argument_number_error":0.9868,
    "wrong_order_error":0.6579,
    "missing_step_error":10.5263,
    "affordance_error":1.3158,
    "additional_step_error":17.4342
  },
  {
    "Model":"Qwen\/Qwen1.5-4B",
    "Model Family":"Qwen1.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":4.0,
    "task_success_rate":2.9508,
    "state_goal":7.554,
    "relation_goal":2.2222,
    "action_goal":0.0,
    "total_goal":4.1254,
    "execution_success_rate":3.0,
    "parsing_error":21.3115,
    "hallucination_error":63.2787,
    "predicate_argument_number_error":2.623,
    "wrong_order_error":0.9836,
    "missing_step_error":9.8361,
    "affordance_error":8.5246,
    "additional_step_error":0.9836
  },
  {
    "Model":"Qwen\/Qwen-14B",
    "Model Family":"Qwen",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":14.2,
    "task_success_rate":1.6393,
    "state_goal":1.0791,
    "relation_goal":2.7778,
    "action_goal":3.3784,
    "total_goal":2.1452,
    "execution_success_rate":2.0,
    "parsing_error":96.7213,
    "hallucination_error":3.2787,
    "predicate_argument_number_error":0.6557,
    "wrong_order_error":0.0,
    "missing_step_error":3.2787,
    "affordance_error":2.623,
    "additional_step_error":0.0
  },
  {
    "Model":"gpt-4.1-mini-2025-04-14",
    "Model Family":"gpt-4.1-mini-2025-04-14",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":null,
    "task_success_rate":73.7705,
    "state_goal":87.4101,
    "relation_goal":72.7778,
    "action_goal":57.4324,
    "total_goal":75.7426,
    "execution_success_rate":80.0,
    "parsing_error":0.3279,
    "hallucination_error":3.6066,
    "predicate_argument_number_error":1.3115,
    "wrong_order_error":0.6557,
    "missing_step_error":12.7869,
    "affordance_error":1.3115,
    "additional_step_error":1.6393
  },
  {
    "Model":"bigcode\/starcoderbase-7b",
    "Model Family":"starcoder",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":15.5,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":100.3279,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"Qwen\/Qwen-72B",
    "Model Family":"Qwen",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":72.3,
    "task_success_rate":15.082,
    "state_goal":15.1079,
    "relation_goal":20.5556,
    "action_goal":16.2162,
    "total_goal":16.9967,
    "execution_success_rate":22.0,
    "parsing_error":3.6066,
    "hallucination_error":27.8689,
    "predicate_argument_number_error":6.8852,
    "wrong_order_error":0.3279,
    "missing_step_error":31.1475,
    "affordance_error":8.1967,
    "additional_step_error":0.6557
  },
  {
    "Model":"Qwen\/Qwen1.5-32B",
    "Model Family":"Qwen1.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":32.5,
    "task_success_rate":28.5246,
    "state_goal":52.8777,
    "relation_goal":30.0,
    "action_goal":14.1892,
    "total_goal":36.6337,
    "execution_success_rate":28.9,
    "parsing_error":12.7869,
    "hallucination_error":5.2459,
    "predicate_argument_number_error":5.2459,
    "wrong_order_error":0.9836,
    "missing_step_error":41.3115,
    "affordance_error":5.5738,
    "additional_step_error":4.5902
  },
  {
    "Model":"meta-llama\/llama3_8B_o4-mini-2025-04-16",
    "Model Family":"Llama",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":null,
    "task_success_rate":72.459,
    "state_goal":85.6115,
    "relation_goal":70.0,
    "action_goal":64.1892,
    "total_goal":75.7426,
    "execution_success_rate":83.9,
    "parsing_error":0.3279,
    "hallucination_error":4.2623,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.3279,
    "missing_step_error":11.1475,
    "affordance_error":0.0,
    "additional_step_error":4.918
  },
  {
    "Model":"microsoft\/Phi-3-mini-128k-instruct",
    "Model Family":"phi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":3.8,
    "task_success_rate":11.8812,
    "state_goal":18.7726,
    "relation_goal":10.6145,
    "action_goal":12.1622,
    "total_goal":14.7351,
    "execution_success_rate":15.5,
    "parsing_error":2.3102,
    "hallucination_error":42.5743,
    "predicate_argument_number_error":6.2706,
    "wrong_order_error":0.0,
    "missing_step_error":30.033,
    "affordance_error":3.3003,
    "additional_step_error":1.3201
  },
  {
    "Model":"ibm-granite\/granite-3.3-2b-base",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":1.3115,
    "state_goal":3.9568,
    "relation_goal":1.1111,
    "action_goal":1.3514,
    "total_goal":2.4752,
    "execution_success_rate":1.3,
    "parsing_error":19.3443,
    "hallucination_error":58.6885,
    "predicate_argument_number_error":3.2787,
    "wrong_order_error":0.3279,
    "missing_step_error":14.0984,
    "affordance_error":2.9508,
    "additional_step_error":11.8033
  },
  {
    "Model":"Qwen\/Qwen3-32B",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":32.8,
    "task_success_rate":51.4754,
    "state_goal":62.5899,
    "relation_goal":45.0,
    "action_goal":25.0,
    "total_goal":48.1848,
    "execution_success_rate":48.5,
    "parsing_error":28.1967,
    "hallucination_error":8.1967,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":6.8852,
    "missing_step_error":8.1967,
    "affordance_error":0.0,
    "additional_step_error":19.3443
  },
  {
    "Model":"01-ai\/Yi-1.5-9B",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.8,
    "task_success_rate":24.7525,
    "state_goal":36.8231,
    "relation_goal":27.933,
    "action_goal":18.2432,
    "total_goal":29.6358,
    "execution_success_rate":28.4,
    "parsing_error":0.6601,
    "hallucination_error":15.8416,
    "predicate_argument_number_error":3.3003,
    "wrong_order_error":1.3201,
    "missing_step_error":38.9439,
    "affordance_error":11.8812,
    "additional_step_error":3.6304
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-33b-instruct",
    "Model Family":"DeepSeek-Coder",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":33.3,
    "task_success_rate":11.5512,
    "state_goal":16.6065,
    "relation_goal":13.4078,
    "action_goal":12.8378,
    "total_goal":14.7351,
    "execution_success_rate":15.8,
    "parsing_error":5.2805,
    "hallucination_error":22.7723,
    "predicate_argument_number_error":15.8416,
    "wrong_order_error":1.6502,
    "missing_step_error":35.6436,
    "affordance_error":3.9604,
    "additional_step_error":4.9505
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-7b-base-v1.5",
    "Model Family":"DeepSeek-Coder",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":6.9,
    "task_success_rate":3.2787,
    "state_goal":14.3885,
    "relation_goal":3.8889,
    "action_goal":0.6757,
    "total_goal":7.9208,
    "execution_success_rate":1.3,
    "parsing_error":23.9344,
    "hallucination_error":23.9344,
    "predicate_argument_number_error":15.7377,
    "wrong_order_error":0.6557,
    "missing_step_error":26.2295,
    "affordance_error":9.1803,
    "additional_step_error":3.6066
  },
  {
    "Model":"bigcode\/starcoderbase-3b",
    "Model Family":"starcoder",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":15.5,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"gpt-4.1-nano-2025-04-14",
    "Model Family":"gpt-4.1-nano-2025-04-14",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":null,
    "task_success_rate":33.4426,
    "state_goal":27.3381,
    "relation_goal":37.2222,
    "action_goal":29.0541,
    "total_goal":30.6931,
    "execution_success_rate":45.6,
    "parsing_error":4.2623,
    "hallucination_error":26.8852,
    "predicate_argument_number_error":0.6557,
    "wrong_order_error":0.0,
    "missing_step_error":21.9672,
    "affordance_error":0.6557,
    "additional_step_error":0.3279
  },
  {
    "Model":"microsoft\/Phi-3-medium-128k-instruct",
    "Model Family":"phi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":14.0,
    "task_success_rate":24.7525,
    "state_goal":40.4332,
    "relation_goal":20.1117,
    "action_goal":22.973,
    "total_goal":30.1325,
    "execution_success_rate":36.0,
    "parsing_error":0.0,
    "hallucination_error":19.802,
    "predicate_argument_number_error":5.6106,
    "wrong_order_error":0.33,
    "missing_step_error":32.6733,
    "affordance_error":5.6106,
    "additional_step_error":4.2904
  },
  {
    "Model":"meta-llama\/Llama-2-7b-hf",
    "Model Family":"Llama-2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":6.7,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":86.1386,
    "hallucination_error":15.1815,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.33,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-6.7b-base",
    "Model Family":"DeepSeek-Coder",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":6.7,
    "task_success_rate":0.9901,
    "state_goal":6.1372,
    "relation_goal":1.1173,
    "action_goal":0.0,
    "total_goal":3.1457,
    "execution_success_rate":0.7,
    "parsing_error":46.8647,
    "hallucination_error":34.9835,
    "predicate_argument_number_error":1.6502,
    "wrong_order_error":0.33,
    "missing_step_error":11.8812,
    "affordance_error":3.6304,
    "additional_step_error":1.9802
  },
  {
    "Model":"bigcode\/starcoder2-7b",
    "Model Family":"starcoder2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.2,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":100.3279,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"microsoft\/phi-1_5",
    "Model Family":"phi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":1.4,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"Qwen\/Qwen3-8B",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.2,
    "task_success_rate":45.9016,
    "state_goal":55.7554,
    "relation_goal":47.7778,
    "action_goal":29.0541,
    "total_goal":46.8647,
    "execution_success_rate":33.1,
    "parsing_error":29.1803,
    "hallucination_error":6.8852,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":12.7869,
    "missing_step_error":17.7049,
    "affordance_error":0.3279,
    "additional_step_error":27.8689
  },
  {
    "Model":"google\/gemma-2-27b-it",
    "Model Family":"Gemma-2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":27.2,
    "task_success_rate":63.2787,
    "state_goal":82.0144,
    "relation_goal":58.8889,
    "action_goal":43.9189,
    "total_goal":65.8416,
    "execution_success_rate":71.8,
    "parsing_error":0.0,
    "hallucination_error":2.2951,
    "predicate_argument_number_error":4.2623,
    "wrong_order_error":1.6393,
    "missing_step_error":20.0,
    "affordance_error":0.0,
    "additional_step_error":2.9508
  },
  {
    "Model":"google\/gemma-1.1-2b-it",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":5.9016,
    "hallucination_error":94.0984,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Llama-4-Scout-17B-16E-Instruct",
    "Model Family":"Llama",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":108.6,
    "task_success_rate":65.9016,
    "state_goal":74.8201,
    "relation_goal":72.2222,
    "action_goal":54.7297,
    "total_goal":69.1419,
    "execution_success_rate":63.6,
    "parsing_error":1.9672,
    "hallucination_error":10.8197,
    "predicate_argument_number_error":6.8852,
    "wrong_order_error":2.623,
    "missing_step_error":13.7705,
    "affordance_error":1.3115,
    "additional_step_error":35.7377
  },
  {
    "Model":"gpt-4.1-2025-04-14",
    "Model Family":"gpt-4.1-2025-04-14",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":null,
    "task_success_rate":72.459,
    "state_goal":90.6475,
    "relation_goal":76.1111,
    "action_goal":58.7838,
    "total_goal":78.5479,
    "execution_success_rate":83.0,
    "parsing_error":0.0,
    "hallucination_error":1.3115,
    "predicate_argument_number_error":0.3279,
    "wrong_order_error":0.0,
    "missing_step_error":15.082,
    "affordance_error":0.3279,
    "additional_step_error":1.9672
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Llama-70B",
    "Model Family":"DeepSeek-R1",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":70.6,
    "task_success_rate":51.1475,
    "state_goal":61.1511,
    "relation_goal":56.1111,
    "action_goal":27.027,
    "total_goal":51.3201,
    "execution_success_rate":50.5,
    "parsing_error":25.2459,
    "hallucination_error":2.2951,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":4.5902,
    "missing_step_error":17.377,
    "affordance_error":0.0,
    "additional_step_error":10.8197
  },
  {
    "Model":"google\/gemma-2b",
    "Model Family":"Gemma-2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":0.0,
    "state_goal":1.0791,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.495,
    "execution_success_rate":0.0,
    "parsing_error":98.3607,
    "hallucination_error":1.3115,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.6557,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"ibm-granite\/granite-3.1-2b-base",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":1.3115,
    "state_goal":2.8777,
    "relation_goal":0.5556,
    "action_goal":0.6757,
    "total_goal":1.6502,
    "execution_success_rate":1.6,
    "parsing_error":18.0328,
    "hallucination_error":67.2131,
    "predicate_argument_number_error":1.9672,
    "wrong_order_error":0.0,
    "missing_step_error":8.5246,
    "affordance_error":2.9508,
    "additional_step_error":16.3934
  },
  {
    "Model":"o4-mini-2025-04-16",
    "Model Family":"o4-mini-2025-04-16",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":null,
    "task_success_rate":75.082,
    "state_goal":93.1655,
    "relation_goal":81.6667,
    "action_goal":60.1351,
    "total_goal":81.6832,
    "execution_success_rate":83.6,
    "parsing_error":0.3279,
    "hallucination_error":1.6393,
    "predicate_argument_number_error":0.3279,
    "wrong_order_error":0.0,
    "missing_step_error":14.0984,
    "affordance_error":0.0,
    "additional_step_error":2.623
  },
  {
    "Model":"bigcode\/starcoder2-15b",
    "Model Family":"starcoder2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":16.0,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":99.6721,
    "hallucination_error":0.3279,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Llama-3.2-3B",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":3.2,
    "task_success_rate":0.3279,
    "state_goal":2.518,
    "relation_goal":1.1111,
    "action_goal":0.0,
    "total_goal":1.4851,
    "execution_success_rate":0.3,
    "parsing_error":61.6393,
    "hallucination_error":35.4098,
    "predicate_argument_number_error":0.3279,
    "wrong_order_error":0.6557,
    "missing_step_error":3.2787,
    "affordance_error":0.6557,
    "additional_step_error":1.9672
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1",
    "Model Family":"DeepSeek-R1",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":684.5,
    "task_success_rate":36.7213,
    "state_goal":43.5252,
    "relation_goal":33.3333,
    "action_goal":12.1622,
    "total_goal":32.8383,
    "execution_success_rate":29.5,
    "parsing_error":43.6066,
    "hallucination_error":18.6885,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":8.1967,
    "missing_step_error":2.623,
    "affordance_error":0.0,
    "additional_step_error":40.3279
  },
  {
    "Model":"google\/gemma-2-2b-it",
    "Model Family":"Gemma-2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.6,
    "task_success_rate":10.4918,
    "state_goal":26.259,
    "relation_goal":13.8889,
    "action_goal":0.6757,
    "total_goal":16.3366,
    "execution_success_rate":21.3,
    "parsing_error":6.5574,
    "hallucination_error":20.6557,
    "predicate_argument_number_error":0.9836,
    "wrong_order_error":0.0,
    "missing_step_error":47.2131,
    "affordance_error":4.5902,
    "additional_step_error":0.0
  },
  {
    "Model":"Qwen\/Qwen3-0.6B",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":0.8,
    "task_success_rate":0.3279,
    "state_goal":3.2374,
    "relation_goal":0.5556,
    "action_goal":0.0,
    "total_goal":1.6502,
    "execution_success_rate":0.3,
    "parsing_error":47.8689,
    "hallucination_error":26.2295,
    "predicate_argument_number_error":1.3115,
    "wrong_order_error":0.0,
    "missing_step_error":24.2623,
    "affordance_error":0.3279,
    "additional_step_error":0.0
  },
  {
    "Model":"google\/gemma-1.1-7b-it",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.5,
    "task_success_rate":9.1803,
    "state_goal":17.2662,
    "relation_goal":5.0,
    "action_goal":8.7838,
    "total_goal":11.5512,
    "execution_success_rate":10.8,
    "parsing_error":23.2787,
    "hallucination_error":11.8033,
    "predicate_argument_number_error":1.9672,
    "wrong_order_error":0.0,
    "missing_step_error":46.5574,
    "affordance_error":5.5738,
    "additional_step_error":0.6557
  },
  {
    "Model":"Qwen\/Qwen2.5-14B",
    "Model Family":"Qwen2.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":14.8,
    "task_success_rate":25.9016,
    "state_goal":42.8058,
    "relation_goal":22.2222,
    "action_goal":22.2973,
    "total_goal":31.6832,
    "execution_success_rate":25.6,
    "parsing_error":16.0656,
    "hallucination_error":26.5574,
    "predicate_argument_number_error":1.3115,
    "wrong_order_error":3.9344,
    "missing_step_error":23.9344,
    "affordance_error":3.6066,
    "additional_step_error":2.623
  },
  {
    "Model":"ibm-granite\/granite-3.1-2b-instruct",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":4.918,
    "state_goal":14.3885,
    "relation_goal":8.3333,
    "action_goal":2.7027,
    "total_goal":9.736,
    "execution_success_rate":4.3,
    "parsing_error":12.7869,
    "hallucination_error":40.0,
    "predicate_argument_number_error":3.2787,
    "wrong_order_error":0.3279,
    "missing_step_error":22.623,
    "affordance_error":16.7213,
    "additional_step_error":0.6557
  },
  {
    "Model":"microsoft\/Phi-3-medium-4k-instruct",
    "Model Family":"phi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":14.0,
    "task_success_rate":31.0231,
    "state_goal":47.6534,
    "relation_goal":21.2291,
    "action_goal":25.6757,
    "total_goal":34.4371,
    "execution_success_rate":34.0,
    "parsing_error":2.9703,
    "hallucination_error":15.8416,
    "predicate_argument_number_error":8.9109,
    "wrong_order_error":0.6601,
    "missing_step_error":25.7426,
    "affordance_error":12.8713,
    "additional_step_error":4.9505
  },
  {
    "Model":"ibm-granite\/granite-3.2-8b-instruct",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.2,
    "task_success_rate":37.7049,
    "state_goal":52.8777,
    "relation_goal":34.4444,
    "action_goal":31.7568,
    "total_goal":42.2442,
    "execution_success_rate":39.7,
    "parsing_error":5.2459,
    "hallucination_error":15.7377,
    "predicate_argument_number_error":1.3115,
    "wrong_order_error":0.3279,
    "missing_step_error":33.1148,
    "affordance_error":4.5902,
    "additional_step_error":3.2787
  },
  {
    "Model":"Qwen\/Qwen2.5-72B",
    "Model Family":"Qwen2.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":72.7,
    "task_success_rate":33.7705,
    "state_goal":43.1655,
    "relation_goal":32.2222,
    "action_goal":24.3243,
    "total_goal":35.3135,
    "execution_success_rate":37.7,
    "parsing_error":12.1311,
    "hallucination_error":38.6885,
    "predicate_argument_number_error":0.9836,
    "wrong_order_error":0.0,
    "missing_step_error":10.1639,
    "affordance_error":0.3279,
    "additional_step_error":19.0164
  },
  {
    "Model":"deepseek-ai\/DeepSeek-V3",
    "Model Family":"DeepSeek",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":684.5,
    "task_success_rate":78.3607,
    "state_goal":79.8561,
    "relation_goal":84.4444,
    "action_goal":72.2973,
    "total_goal":79.3729,
    "execution_success_rate":85.9,
    "parsing_error":0.0,
    "hallucination_error":6.2295,
    "predicate_argument_number_error":0.6557,
    "wrong_order_error":0.0,
    "missing_step_error":6.5574,
    "affordance_error":0.6557,
    "additional_step_error":1.9672
  },
  {
    "Model":"meta-llama\/Meta-Llama-3-70B",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":70.6,
    "task_success_rate":27.8689,
    "state_goal":34.5324,
    "relation_goal":31.1111,
    "action_goal":19.5946,
    "total_goal":29.868,
    "execution_success_rate":27.9,
    "parsing_error":18.3607,
    "hallucination_error":21.3115,
    "predicate_argument_number_error":0.6557,
    "wrong_order_error":3.9344,
    "missing_step_error":26.5574,
    "affordance_error":1.9672,
    "additional_step_error":29.5082
  },
  {
    "Model":"Qwen\/Qwen2.5-32B",
    "Model Family":"Qwen2.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":32.8,
    "task_success_rate":42.9508,
    "state_goal":47.482,
    "relation_goal":47.2222,
    "action_goal":33.7838,
    "total_goal":44.0594,
    "execution_success_rate":44.6,
    "parsing_error":7.2131,
    "hallucination_error":27.541,
    "predicate_argument_number_error":2.2951,
    "wrong_order_error":3.2787,
    "missing_step_error":12.1311,
    "affordance_error":3.2787,
    "additional_step_error":11.4754
  },
  {
    "Model":"google\/gemma-2-27b",
    "Model Family":"Gemma-2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":27.2,
    "task_success_rate":13.7705,
    "state_goal":25.8993,
    "relation_goal":13.3333,
    "action_goal":7.4324,
    "total_goal":17.6568,
    "execution_success_rate":17.7,
    "parsing_error":43.9344,
    "hallucination_error":10.1639,
    "predicate_argument_number_error":3.9344,
    "wrong_order_error":1.9672,
    "missing_step_error":23.2787,
    "affordance_error":0.3279,
    "additional_step_error":24.5902
  },
  {
    "Model":"01-ai\/Yi-34B",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":34.4,
    "task_success_rate":1.6502,
    "state_goal":1.083,
    "relation_goal":1.1173,
    "action_goal":0.0,
    "total_goal":0.8278,
    "execution_success_rate":1.0,
    "parsing_error":40.5941,
    "hallucination_error":46.5347,
    "predicate_argument_number_error":1.9802,
    "wrong_order_error":0.0,
    "missing_step_error":9.901,
    "affordance_error":0.6601,
    "additional_step_error":4.2904
  },
  {
    "Model":"ibm-granite\/granite-3.1-8b-base",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.2,
    "task_success_rate":8.5246,
    "state_goal":14.7482,
    "relation_goal":6.6667,
    "action_goal":6.0811,
    "total_goal":10.231,
    "execution_success_rate":10.2,
    "parsing_error":55.082,
    "hallucination_error":15.082,
    "predicate_argument_number_error":0.6557,
    "wrong_order_error":1.6393,
    "missing_step_error":16.0656,
    "affordance_error":1.3115,
    "additional_step_error":14.7541
  },
  {
    "Model":"meta-llama\/Llama-3.2-1B",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":1.2,
    "task_success_rate":0.0,
    "state_goal":0.7194,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.33,
    "execution_success_rate":0.0,
    "parsing_error":42.2951,
    "hallucination_error":56.3934,
    "predicate_argument_number_error":0.6557,
    "wrong_order_error":0.0,
    "missing_step_error":1.3115,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"01-ai\/Yi-1.5-34B-Chat",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":34.4,
    "task_success_rate":38.2838,
    "state_goal":28.8809,
    "relation_goal":57.5419,
    "action_goal":35.1351,
    "total_goal":38.9073,
    "execution_success_rate":38.9,
    "parsing_error":0.0,
    "hallucination_error":20.7921,
    "predicate_argument_number_error":6.9307,
    "wrong_order_error":1.3201,
    "missing_step_error":26.0726,
    "affordance_error":5.9406,
    "additional_step_error":1.3201
  },
  {
    "Model":"01-ai\/Yi-1.5-34B",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":34.4,
    "task_success_rate":6.9307,
    "state_goal":18.0505,
    "relation_goal":8.9385,
    "action_goal":0.6757,
    "total_goal":11.0927,
    "execution_success_rate":6.3,
    "parsing_error":1.6502,
    "hallucination_error":24.7525,
    "predicate_argument_number_error":3.9604,
    "wrong_order_error":0.33,
    "missing_step_error":59.0759,
    "affordance_error":3.9604,
    "additional_step_error":0.9901
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-1.3b-instruct",
    "Model Family":"DeepSeek-Coder",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":1.3,
    "task_success_rate":1.3201,
    "state_goal":7.5812,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":3.4768,
    "execution_success_rate":0.7,
    "parsing_error":12.5413,
    "hallucination_error":58.4158,
    "predicate_argument_number_error":3.6304,
    "wrong_order_error":0.0,
    "missing_step_error":9.2409,
    "affordance_error":15.5116,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Meta-Llama-3-70B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":70.6,
    "task_success_rate":60.6557,
    "state_goal":55.036,
    "relation_goal":68.8889,
    "action_goal":45.9459,
    "total_goal":56.9307,
    "execution_success_rate":65.9,
    "parsing_error":0.0,
    "hallucination_error":18.6885,
    "predicate_argument_number_error":8.1967,
    "wrong_order_error":0.3279,
    "missing_step_error":5.9016,
    "affordance_error":0.9836,
    "additional_step_error":3.9344
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Llama-8B",
    "Model Family":"DeepSeek-R1",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.0,
    "task_success_rate":24.2623,
    "state_goal":36.3309,
    "relation_goal":21.1111,
    "action_goal":14.8649,
    "total_goal":26.5677,
    "execution_success_rate":26.9,
    "parsing_error":7.541,
    "hallucination_error":25.2459,
    "predicate_argument_number_error":0.3279,
    "wrong_order_error":1.3115,
    "missing_step_error":35.4098,
    "affordance_error":3.6066,
    "additional_step_error":13.7705
  },
  {
    "Model":"tiiuae\/Falcon3-10B-Base",
    "Model Family":"falcon",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":10.3,
    "task_success_rate":21.7822,
    "state_goal":40.4332,
    "relation_goal":23.4637,
    "action_goal":20.9459,
    "total_goal":30.6291,
    "execution_success_rate":22.4,
    "parsing_error":7.2607,
    "hallucination_error":31.3531,
    "predicate_argument_number_error":0.33,
    "wrong_order_error":3.6304,
    "missing_step_error":31.3531,
    "affordance_error":4.6205,
    "additional_step_error":68.6469
  },
  {
    "Model":"LGAI-EXAONE\/EXAONE-3.5-32B-Instruct",
    "Model Family":"Exaone",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":32.0,
    "task_success_rate":51.8033,
    "state_goal":69.0647,
    "relation_goal":50.0,
    "action_goal":47.2973,
    "total_goal":58.0858,
    "execution_success_rate":61.0,
    "parsing_error":0.0,
    "hallucination_error":16.0656,
    "predicate_argument_number_error":0.6557,
    "wrong_order_error":2.2951,
    "missing_step_error":19.0164,
    "affordance_error":0.9836,
    "additional_step_error":3.9344
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-1.5B",
    "Model Family":"DeepSeek-R1",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":1.8,
    "task_success_rate":0.9836,
    "state_goal":2.1583,
    "relation_goal":1.6667,
    "action_goal":0.0,
    "total_goal":1.4851,
    "execution_success_rate":2.3,
    "parsing_error":46.8852,
    "hallucination_error":42.9508,
    "predicate_argument_number_error":0.3279,
    "wrong_order_error":0.0,
    "missing_step_error":7.8689,
    "affordance_error":2.2951,
    "additional_step_error":0.3279
  },
  {
    "Model":"openai\/gpt-oss-20b",
    "Model Family":"GPT-OSS",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":21.5,
    "task_success_rate":67.2131,
    "state_goal":74.4604,
    "relation_goal":66.1111,
    "action_goal":47.973,
    "total_goal":65.5116,
    "execution_success_rate":68.9,
    "parsing_error":13.1148,
    "hallucination_error":4.2623,
    "predicate_argument_number_error":0.6557,
    "wrong_order_error":0.0,
    "missing_step_error":13.1148,
    "affordance_error":0.0,
    "additional_step_error":0.6557
  },
  {
    "Model":"Qwen\/Qwen1.5-110B",
    "Model Family":"Qwen1.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":111.2,
    "task_success_rate":35.7377,
    "state_goal":46.7626,
    "relation_goal":46.1111,
    "action_goal":33.1081,
    "total_goal":43.2343,
    "execution_success_rate":40.0,
    "parsing_error":0.0,
    "hallucination_error":14.7541,
    "predicate_argument_number_error":4.2623,
    "wrong_order_error":6.2295,
    "missing_step_error":20.9836,
    "affordance_error":13.7705,
    "additional_step_error":4.918
  },
  {
    "Model":"bigcode\/starcoderbase-1b",
    "Model Family":"starcoder",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":15.5,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":100.3279,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Llama-4-Maverick-17B-128E-Instruct-FP8",
    "Model Family":"Llama",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":401.6,
    "task_success_rate":74.4262,
    "state_goal":88.4892,
    "relation_goal":78.8889,
    "action_goal":60.1351,
    "total_goal":78.7129,
    "execution_success_rate":73.4,
    "parsing_error":0.3279,
    "hallucination_error":11.1475,
    "predicate_argument_number_error":0.6557,
    "wrong_order_error":4.2623,
    "missing_step_error":9.1803,
    "affordance_error":0.9836,
    "additional_step_error":58.3607
  },
  {
    "Model":"meta-llama\/Meta-Llama-3-8B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.0,
    "task_success_rate":22.2951,
    "state_goal":27.6978,
    "relation_goal":22.7778,
    "action_goal":10.8108,
    "total_goal":22.1122,
    "execution_success_rate":26.2,
    "parsing_error":0.0,
    "hallucination_error":36.3934,
    "predicate_argument_number_error":7.8689,
    "wrong_order_error":0.3279,
    "missing_step_error":28.1967,
    "affordance_error":0.9836,
    "additional_step_error":0.9836
  },
  {
    "Model":"Qwen\/Qwen2.5-3B",
    "Model Family":"Qwen2.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":3.1,
    "task_success_rate":0.0,
    "state_goal":0.3597,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.165,
    "execution_success_rate":0.0,
    "parsing_error":18.0328,
    "hallucination_error":71.1475,
    "predicate_argument_number_error":8.5246,
    "wrong_order_error":0.0,
    "missing_step_error":2.623,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"tiiuae\/falcon-7b",
    "Model Family":"falcon",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.2,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":106.5789,
    "hallucination_error":1.9737,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"Qwen\/Qwen3-1.7B",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.0,
    "task_success_rate":2.9508,
    "state_goal":12.5899,
    "relation_goal":5.0,
    "action_goal":0.0,
    "total_goal":7.2607,
    "execution_success_rate":1.3,
    "parsing_error":58.6885,
    "hallucination_error":20.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":2.2951,
    "missing_step_error":18.0328,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Llama-2-13b-hf",
    "Model Family":"Llama-2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":13.0,
    "task_success_rate":0.0,
    "state_goal":1.083,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.4967,
    "execution_success_rate":1.3,
    "parsing_error":64.6865,
    "hallucination_error":25.7426,
    "predicate_argument_number_error":1.9802,
    "wrong_order_error":0.0,
    "missing_step_error":7.2607,
    "affordance_error":1.3201,
    "additional_step_error":0.33
  },
  {
    "Model":"tiiuae\/Falcon3-7B-Base",
    "Model Family":"falcon",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.5,
    "task_success_rate":8.9109,
    "state_goal":22.0217,
    "relation_goal":8.3799,
    "action_goal":8.1081,
    "total_goal":14.5695,
    "execution_success_rate":6.9,
    "parsing_error":13.8614,
    "hallucination_error":32.6733,
    "predicate_argument_number_error":5.6106,
    "wrong_order_error":1.6502,
    "missing_step_error":33.0033,
    "affordance_error":6.6007,
    "additional_step_error":2.6403
  },
  {
    "Model":"ibm-granite\/granite-3.3-2b-instruct",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":1.6393,
    "state_goal":17.9856,
    "relation_goal":2.2222,
    "action_goal":0.0,
    "total_goal":8.9109,
    "execution_success_rate":12.8,
    "parsing_error":10.4918,
    "hallucination_error":16.0656,
    "predicate_argument_number_error":6.2295,
    "wrong_order_error":0.0,
    "missing_step_error":42.2951,
    "affordance_error":12.459,
    "additional_step_error":0.0
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-14B",
    "Model Family":"DeepSeek-R1",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":14.8,
    "task_success_rate":47.8689,
    "state_goal":61.8705,
    "relation_goal":48.3333,
    "action_goal":25.6757,
    "total_goal":49.0099,
    "execution_success_rate":53.4,
    "parsing_error":26.2295,
    "hallucination_error":3.2787,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":1.3115,
    "missing_step_error":15.082,
    "affordance_error":0.6557,
    "additional_step_error":6.5574
  },
  {
    "Model":"mistralai\/Mistral-7B-Instruct-v0.2",
    "Model Family":"Mistral",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.2,
    "task_success_rate":17.1053,
    "state_goal":26.6187,
    "relation_goal":11.7318,
    "action_goal":6.0811,
    "total_goal":17.1901,
    "execution_success_rate":23.7,
    "parsing_error":4.9342,
    "hallucination_error":30.2632,
    "predicate_argument_number_error":2.6316,
    "wrong_order_error":0.0,
    "missing_step_error":39.8026,
    "affordance_error":0.3289,
    "additional_step_error":22.3684
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-32B",
    "Model Family":"DeepSeek-R1",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":32.8,
    "task_success_rate":17.0492,
    "state_goal":22.6619,
    "relation_goal":16.6667,
    "action_goal":16.8919,
    "total_goal":19.4719,
    "execution_success_rate":17.0,
    "parsing_error":69.5082,
    "hallucination_error":1.9672,
    "predicate_argument_number_error":0.3279,
    "wrong_order_error":2.9508,
    "missing_step_error":6.5574,
    "affordance_error":1.6393,
    "additional_step_error":36.7213
  },
  {
    "Model":"baichuan-inc\/Baichuan-7B",
    "Model Family":"Baichuan",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":null,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":79.2642,
    "hallucination_error":37.1237,
    "predicate_argument_number_error":0.3344,
    "wrong_order_error":0.0,
    "missing_step_error":1.0033,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"moonshotai\/Kimi-K2-Instruct",
    "Model Family":"Kimi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":1000.0,
    "task_success_rate":75.082,
    "state_goal":89.9281,
    "relation_goal":80.5556,
    "action_goal":58.7838,
    "total_goal":79.538,
    "execution_success_rate":82.0,
    "parsing_error":0.0,
    "hallucination_error":1.6393,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":15.082,
    "affordance_error":1.3115,
    "additional_step_error":0.9836
  },
  {
    "Model":"01-ai\/Yi-Coder-1.5B",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":1.5,
    "task_success_rate":1.9802,
    "state_goal":4.6931,
    "relation_goal":2.7933,
    "action_goal":3.3784,
    "total_goal":3.8079,
    "execution_success_rate":2.6,
    "parsing_error":56.1056,
    "hallucination_error":25.7426,
    "predicate_argument_number_error":4.9505,
    "wrong_order_error":0.0,
    "missing_step_error":10.5611,
    "affordance_error":1.6502,
    "additional_step_error":9.571
  },
  {
    "Model":"google\/gemma-2-9b-it",
    "Model Family":"Gemma-2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":9.2,
    "task_success_rate":52.1311,
    "state_goal":69.7842,
    "relation_goal":51.6667,
    "action_goal":41.2162,
    "total_goal":57.4257,
    "execution_success_rate":54.1,
    "parsing_error":30.8197,
    "hallucination_error":2.2951,
    "predicate_argument_number_error":3.6066,
    "wrong_order_error":0.9836,
    "missing_step_error":20.3279,
    "affordance_error":3.2787,
    "additional_step_error":4.2623
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-7B",
    "Model Family":"DeepSeek-R1",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.6,
    "task_success_rate":13.4426,
    "state_goal":28.0576,
    "relation_goal":14.4444,
    "action_goal":6.0811,
    "total_goal":18.6469,
    "execution_success_rate":14.8,
    "parsing_error":46.2295,
    "hallucination_error":5.5738,
    "predicate_argument_number_error":1.9672,
    "wrong_order_error":2.9508,
    "missing_step_error":24.2623,
    "affordance_error":5.2459,
    "additional_step_error":3.6066
  },
  {
    "Model":"LGAI-EXAONE\/EXAONE-Deep-32B",
    "Model Family":"Exaone",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":32.0,
    "task_success_rate":27.2131,
    "state_goal":41.3669,
    "relation_goal":28.3333,
    "action_goal":15.5405,
    "total_goal":31.1881,
    "execution_success_rate":19.7,
    "parsing_error":20.6557,
    "hallucination_error":23.2787,
    "predicate_argument_number_error":0.6557,
    "wrong_order_error":8.1967,
    "missing_step_error":37.7049,
    "affordance_error":0.0,
    "additional_step_error":51.4754
  },
  {
    "Model":"01-ai\/Yi-1.5-6B",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":6.1,
    "task_success_rate":2.6403,
    "state_goal":5.0542,
    "relation_goal":1.676,
    "action_goal":2.7027,
    "total_goal":3.4768,
    "execution_success_rate":2.0,
    "parsing_error":40.264,
    "hallucination_error":46.2046,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.33,
    "missing_step_error":11.2211,
    "affordance_error":3.9604,
    "additional_step_error":0.6601
  },
  {
    "Model":"baichuan-inc\/Baichuan2-7B-Chat",
    "Model Family":"Baichuan",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.0,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.6579,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"microsoft\/phi-4",
    "Model Family":"phi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":14.7,
    "task_success_rate":55.4098,
    "state_goal":72.3022,
    "relation_goal":64.4444,
    "action_goal":41.2162,
    "total_goal":62.3762,
    "execution_success_rate":58.0,
    "parsing_error":0.6557,
    "hallucination_error":4.2623,
    "predicate_argument_number_error":2.623,
    "wrong_order_error":0.6557,
    "missing_step_error":31.8033,
    "affordance_error":2.2951,
    "additional_step_error":1.6393
  },
  {
    "Model":"Qwen\/Qwen-7B",
    "Model Family":"Qwen",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.7,
    "task_success_rate":0.3279,
    "state_goal":1.0791,
    "relation_goal":0.5556,
    "action_goal":0.0,
    "total_goal":0.6601,
    "execution_success_rate":0.0,
    "parsing_error":90.8197,
    "hallucination_error":13.4426,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.6557,
    "affordance_error":0.3279,
    "additional_step_error":0.0
  },
  {
    "Model":"google\/gemma-3-12b-pt",
    "Model Family":"Gemma-3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":12.2,
    "task_success_rate":6.8852,
    "state_goal":17.9856,
    "relation_goal":7.2222,
    "action_goal":4.7297,
    "total_goal":11.5512,
    "execution_success_rate":8.5,
    "parsing_error":36.7213,
    "hallucination_error":20.3279,
    "predicate_argument_number_error":1.9672,
    "wrong_order_error":1.9672,
    "missing_step_error":31.4754,
    "affordance_error":1.6393,
    "additional_step_error":4.918
  },
  {
    "Model":"Qwen\/Qwen2.5-1.5B",
    "Model Family":"Qwen2.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":1.5,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":88.1967,
    "hallucination_error":11.4754,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.3279,
    "affordance_error":0.3279,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Meta-Llama-3-8B",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.0,
    "task_success_rate":9.5082,
    "state_goal":20.1439,
    "relation_goal":7.2222,
    "action_goal":4.7297,
    "total_goal":12.5413,
    "execution_success_rate":9.5,
    "parsing_error":14.0984,
    "hallucination_error":33.1148,
    "predicate_argument_number_error":7.2131,
    "wrong_order_error":1.6393,
    "missing_step_error":29.1803,
    "affordance_error":6.2295,
    "additional_step_error":6.5574
  },
  {
    "Model":"google\/gemma-3-27b-it",
    "Model Family":"Gemma-3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":27.4,
    "task_success_rate":69.5082,
    "state_goal":80.9353,
    "relation_goal":70.0,
    "action_goal":58.7838,
    "total_goal":72.2772,
    "execution_success_rate":75.4,
    "parsing_error":0.0,
    "hallucination_error":3.9344,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":19.6721,
    "affordance_error":0.9836,
    "additional_step_error":3.6066
  },
  {
    "Model":"Qwen\/Qwen3-4B",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":4.0,
    "task_success_rate":29.5082,
    "state_goal":41.0072,
    "relation_goal":25.0,
    "action_goal":6.7568,
    "total_goal":27.8878,
    "execution_success_rate":26.2,
    "parsing_error":63.6066,
    "hallucination_error":1.6393,
    "predicate_argument_number_error":0.6557,
    "wrong_order_error":5.2459,
    "missing_step_error":2.623,
    "affordance_error":0.0,
    "additional_step_error":15.082
  },
  {
    "Model":"Qwen\/Qwen2.5-7B",
    "Model Family":"Qwen2.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.6,
    "task_success_rate":21.3115,
    "state_goal":28.0576,
    "relation_goal":21.1111,
    "action_goal":21.6216,
    "total_goal":24.4224,
    "execution_success_rate":22.6,
    "parsing_error":12.1311,
    "hallucination_error":30.8197,
    "predicate_argument_number_error":3.2787,
    "wrong_order_error":0.6557,
    "missing_step_error":23.6066,
    "affordance_error":8.1967,
    "additional_step_error":2.623
  },
  {
    "Model":"Qwen\/Qwen3-235B-A22B-Thinking-2507",
    "Model Family":"Qwen3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":235.1,
    "task_success_rate":29.8361,
    "state_goal":35.2518,
    "relation_goal":25.0,
    "action_goal":5.4054,
    "total_goal":24.9175,
    "execution_success_rate":25.2,
    "parsing_error":57.7049,
    "hallucination_error":11.4754,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":4.918,
    "missing_step_error":0.9836,
    "affordance_error":0.0,
    "additional_step_error":20.9836
  },
  {
    "Model":"bigcode\/starcoderbase",
    "Model Family":"starcoder",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":15.5,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.3279,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"ibm-granite\/granite-3.3-8b-instruct",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.2,
    "task_success_rate":35.082,
    "state_goal":41.3669,
    "relation_goal":38.3333,
    "action_goal":35.8108,
    "total_goal":39.1089,
    "execution_success_rate":40.7,
    "parsing_error":1.9672,
    "hallucination_error":15.4098,
    "predicate_argument_number_error":6.5574,
    "wrong_order_error":0.3279,
    "missing_step_error":24.918,
    "affordance_error":10.1639,
    "additional_step_error":1.6393
  },
  {
    "Model":"google\/gemma-3-4b-pt",
    "Model Family":"Gemma-3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":4.3,
    "task_success_rate":1.6393,
    "state_goal":6.1151,
    "relation_goal":0.5556,
    "action_goal":0.0,
    "total_goal":2.9703,
    "execution_success_rate":1.3,
    "parsing_error":73.1148,
    "hallucination_error":12.7869,
    "predicate_argument_number_error":1.3115,
    "wrong_order_error":0.0,
    "missing_step_error":12.7869,
    "affordance_error":1.3115,
    "additional_step_error":0.3279
  },
  {
    "Model":"01-ai\/Yi-Coder-9B-Chat",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.8,
    "task_success_rate":42.9043,
    "state_goal":59.2058,
    "relation_goal":44.1341,
    "action_goal":30.4054,
    "total_goal":47.6821,
    "execution_success_rate":46.5,
    "parsing_error":0.33,
    "hallucination_error":13.2013,
    "predicate_argument_number_error":5.6106,
    "wrong_order_error":3.3003,
    "missing_step_error":28.7129,
    "affordance_error":2.3102,
    "additional_step_error":1.3201
  },
  {
    "Model":"meta-llama\/Llama-3.1-70B",
    "Model Family":"Llama-3",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":70.6,
    "task_success_rate":28.8525,
    "state_goal":36.6906,
    "relation_goal":33.8889,
    "action_goal":17.5676,
    "total_goal":31.1881,
    "execution_success_rate":30.5,
    "parsing_error":26.5574,
    "hallucination_error":20.6557,
    "predicate_argument_number_error":0.9836,
    "wrong_order_error":2.2951,
    "missing_step_error":19.0164,
    "affordance_error":1.9672,
    "additional_step_error":90.8197
  },
  {
    "Model":"google\/gemma-2-2b",
    "Model Family":"Gemma-2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.6,
    "task_success_rate":0.6557,
    "state_goal":0.3597,
    "relation_goal":1.6667,
    "action_goal":2.7027,
    "total_goal":1.3201,
    "execution_success_rate":2.0,
    "parsing_error":71.8033,
    "hallucination_error":31.4754,
    "predicate_argument_number_error":1.3115,
    "wrong_order_error":0.0,
    "missing_step_error":3.9344,
    "affordance_error":0.6557,
    "additional_step_error":0.0
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-33b-base",
    "Model Family":"DeepSeek-Coder",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":33.3,
    "task_success_rate":1.3201,
    "state_goal":4.6931,
    "relation_goal":2.2346,
    "action_goal":0.0,
    "total_goal":2.8146,
    "execution_success_rate":0.7,
    "parsing_error":32.3432,
    "hallucination_error":55.7756,
    "predicate_argument_number_error":0.9901,
    "wrong_order_error":0.9901,
    "missing_step_error":8.5809,
    "affordance_error":1.3201,
    "additional_step_error":1.3201
  },
  {
    "Model":"google\/gemma-7b",
    "Model Family":"Gemma",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.5,
    "task_success_rate":1.3115,
    "state_goal":4.3165,
    "relation_goal":0.5556,
    "action_goal":1.3514,
    "total_goal":2.4752,
    "execution_success_rate":2.3,
    "parsing_error":55.4098,
    "hallucination_error":28.8525,
    "predicate_argument_number_error":0.9836,
    "wrong_order_error":0.0,
    "missing_step_error":10.8197,
    "affordance_error":1.6393,
    "additional_step_error":0.3279
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-6.7b-instruct",
    "Model Family":"DeepSeek-Coder",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":6.7,
    "task_success_rate":6.6007,
    "state_goal":23.1047,
    "relation_goal":7.2626,
    "action_goal":8.7838,
    "total_goal":14.9007,
    "execution_success_rate":6.9,
    "parsing_error":24.4224,
    "hallucination_error":17.4917,
    "predicate_argument_number_error":3.6304,
    "wrong_order_error":0.6601,
    "missing_step_error":47.1947,
    "affordance_error":5.6106,
    "additional_step_error":1.3201
  },
  {
    "Model":"01-ai\/Yi-Coder-9B",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.8,
    "task_success_rate":14.1914,
    "state_goal":15.1625,
    "relation_goal":18.4358,
    "action_goal":10.1351,
    "total_goal":14.9007,
    "execution_success_rate":15.2,
    "parsing_error":53.4653,
    "hallucination_error":23.7624,
    "predicate_argument_number_error":1.3201,
    "wrong_order_error":0.6601,
    "missing_step_error":7.2607,
    "affordance_error":0.6601,
    "additional_step_error":0.6601
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-7b-instruct-v1.5",
    "Model Family":"DeepSeek-Coder",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":6.9,
    "task_success_rate":0.6557,
    "state_goal":17.2662,
    "relation_goal":0.5556,
    "action_goal":0.6757,
    "total_goal":8.2508,
    "execution_success_rate":0.7,
    "parsing_error":2.9508,
    "hallucination_error":15.082,
    "predicate_argument_number_error":35.7377,
    "wrong_order_error":0.0,
    "missing_step_error":43.6066,
    "affordance_error":1.9672,
    "additional_step_error":0.0
  },
  {
    "Model":"ibm-granite\/granite-3.3-8b-base",
    "Model Family":"Granite",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.2,
    "task_success_rate":16.7213,
    "state_goal":19.0647,
    "relation_goal":16.6667,
    "action_goal":21.6216,
    "total_goal":18.9769,
    "execution_success_rate":26.2,
    "parsing_error":45.9016,
    "hallucination_error":5.9016,
    "predicate_argument_number_error":0.9836,
    "wrong_order_error":1.9672,
    "missing_step_error":13.4426,
    "affordance_error":5.5738,
    "additional_step_error":1.3115
  },
  {
    "Model":"meta-llama\/Llama-2-70b-hf",
    "Model Family":"Llama-2",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":69.0,
    "task_success_rate":5.6106,
    "state_goal":10.4693,
    "relation_goal":4.4693,
    "action_goal":2.027,
    "total_goal":6.6225,
    "execution_success_rate":4.6,
    "parsing_error":55.4455,
    "hallucination_error":24.0924,
    "predicate_argument_number_error":2.3102,
    "wrong_order_error":0.33,
    "missing_step_error":10.8911,
    "affordance_error":2.9703,
    "additional_step_error":0.33
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-1.3b-base",
    "Model Family":"DeepSeek-Coder",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":1.3,
    "task_success_rate":0.33,
    "state_goal":1.444,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.6623,
    "execution_success_rate":0.0,
    "parsing_error":45.2145,
    "hallucination_error":46.5347,
    "predicate_argument_number_error":1.6502,
    "wrong_order_error":0.0,
    "missing_step_error":8.2508,
    "affordance_error":0.6601,
    "additional_step_error":0.0
  },
  {
    "Model":"01-ai\/Yi-6B",
    "Model Family":"Yi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":6.1,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":86.7987,
    "hallucination_error":11.2211,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":1.9802,
    "affordance_error":0.33,
    "additional_step_error":0.0
  },
  {
    "Model":"Qwen\/Qwen2.5-0.5B",
    "Model Family":"Qwen2.5",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":0.5,
    "task_success_rate":0.0,
    "state_goal":0.0,
    "relation_goal":0.0,
    "action_goal":0.0,
    "total_goal":0.0,
    "execution_success_rate":0.0,
    "parsing_error":93.7705,
    "hallucination_error":15.082,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":0.0,
    "additional_step_error":0.0
  },
  {
    "Model":"microsoft\/Phi-3-mini-4k-instruct",
    "Model Family":"phi",
    "dataset":"virtualhome",
    "eval_type":"action_sequencing",
    "Model Size (B)":3.8,
    "task_success_rate":13.8614,
    "state_goal":22.0217,
    "relation_goal":12.2905,
    "action_goal":6.7568,
    "total_goal":15.3974,
    "execution_success_rate":17.8,
    "parsing_error":17.4917,
    "hallucination_error":15.5116,
    "predicate_argument_number_error":12.8713,
    "wrong_order_error":0.0,
    "missing_step_error":23.4323,
    "affordance_error":12.8713,
    "additional_step_error":0.33
  }
]