[
  {
    "Model":"01-ai\/Yi-1.5-34B",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":34.4,
    "task_success_rate":0.0,
    "state_goal":1.42,
    "relation_goal":16.22,
    "action_goal":0,
    "total_goal":11.0,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"01-ai\/Yi-1.5-34B-Chat",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":34.4,
    "task_success_rate":10.14,
    "state_goal":22.7,
    "relation_goal":20.08,
    "action_goal":0,
    "total_goal":21.0,
    "execution_success_rate":13.04,
    "parsing_error":62.32,
    "hallucination_error":1.45,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":2.9,
    "missing_step_error":14.49,
    "affordance_error":null,
    "additional_step_error":1.45
  },
  {
    "Model":"01-ai\/Yi-1.5-6B",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":6.1,
    "task_success_rate":0.0,
    "state_goal":1.42,
    "relation_goal":16.22,
    "action_goal":0,
    "total_goal":11.0,
    "execution_success_rate":0.0,
    "parsing_error":98.55,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":1.45,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"01-ai\/Yi-1.5-6B-Chat",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":6.1,
    "task_success_rate":1.45,
    "state_goal":4.26,
    "relation_goal":18.53,
    "action_goal":0,
    "total_goal":13.5,
    "execution_success_rate":2.9,
    "parsing_error":71.01,
    "hallucination_error":10.14,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":1.45,
    "missing_step_error":8.7,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"01-ai\/Yi-1.5-9B",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.8,
    "task_success_rate":8.7,
    "state_goal":11.35,
    "relation_goal":21.62,
    "action_goal":0,
    "total_goal":18.0,
    "execution_success_rate":11.59,
    "parsing_error":71.01,
    "hallucination_error":7.25,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":1.45,
    "missing_step_error":7.25,
    "affordance_error":null,
    "additional_step_error":1.45
  },
  {
    "Model":"01-ai\/Yi-Coder-1.5B",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":1.5,
    "task_success_rate":0.0,
    "state_goal":2.84,
    "relation_goal":16.99,
    "action_goal":0,
    "total_goal":12.0,
    "execution_success_rate":1.45,
    "parsing_error":98.55,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"01-ai\/Yi-Coder-1.5B-Chat",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":1.5,
    "task_success_rate":0.0,
    "state_goal":1.42,
    "relation_goal":16.22,
    "action_goal":0,
    "total_goal":11.0,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"01-ai\/Yi-Coder-9B",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.8,
    "task_success_rate":2.9,
    "state_goal":9.93,
    "relation_goal":17.76,
    "action_goal":0,
    "total_goal":15.0,
    "execution_success_rate":4.35,
    "parsing_error":86.96,
    "hallucination_error":2.9,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":1.45,
    "missing_step_error":2.9,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"01-ai\/Yi-Coder-9B-Chat",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.8,
    "task_success_rate":15.94,
    "state_goal":17.02,
    "relation_goal":23.94,
    "action_goal":0,
    "total_goal":21.5,
    "execution_success_rate":17.39,
    "parsing_error":68.12,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":5.8,
    "missing_step_error":4.35,
    "affordance_error":null,
    "additional_step_error":1.45
  },
  {
    "Model":"Qwen\/Qwen-72B",
    "Model Family":"Qwen",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":72.3,
    "task_success_rate":11.34,
    "state_goal":8.0,
    "relation_goal":21.46,
    "action_goal":0,
    "total_goal":17.58,
    "execution_success_rate":12.37,
    "parsing_error":68.04,
    "hallucination_error":5.15,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":11.34,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"Qwen\/Qwen1.5-1.8B",
    "Model Family":"Qwen1.5",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":1.8,
    "task_success_rate":0.0,
    "state_goal":4.0,
    "relation_goal":12.15,
    "action_goal":0,
    "total_goal":9.8,
    "execution_success_rate":0.0,
    "parsing_error":86.6,
    "hallucination_error":6.19,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":1.03,
    "affordance_error":null,
    "additional_step_error":1.03
  },
  {
    "Model":"Qwen\/Qwen1.5-14B",
    "Model Family":"Qwen1.5",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":14.2,
    "task_success_rate":4.12,
    "state_goal":13.0,
    "relation_goal":14.57,
    "action_goal":0,
    "total_goal":14.12,
    "execution_success_rate":8.25,
    "parsing_error":58.76,
    "hallucination_error":9.28,
    "predicate_argument_number_error":1.03,
    "wrong_order_error":5.15,
    "missing_step_error":13.4,
    "affordance_error":null,
    "additional_step_error":5.15
  },
  {
    "Model":"Qwen\/Qwen1.5-4B",
    "Model Family":"Qwen1.5",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":4.0,
    "task_success_rate":0.0,
    "state_goal":4.0,
    "relation_goal":10.53,
    "action_goal":0,
    "total_goal":8.65,
    "execution_success_rate":0.0,
    "parsing_error":67.01,
    "hallucination_error":5.15,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":1.03,
    "missing_step_error":13.4,
    "affordance_error":null,
    "additional_step_error":4.12
  },
  {
    "Model":"Qwen\/Qwen1.5-72B",
    "Model Family":"Qwen1.5",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":72.3,
    "task_success_rate":18.56,
    "state_goal":25.0,
    "relation_goal":27.94,
    "action_goal":0,
    "total_goal":27.09,
    "execution_success_rate":23.71,
    "parsing_error":37.11,
    "hallucination_error":3.09,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":8.25,
    "missing_step_error":23.71,
    "affordance_error":null,
    "additional_step_error":5.15
  },
  {
    "Model":"Qwen\/Qwen1.5-7B",
    "Model Family":"Qwen1.5",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.7,
    "task_success_rate":2.06,
    "state_goal":7.0,
    "relation_goal":12.15,
    "action_goal":0,
    "total_goal":10.66,
    "execution_success_rate":1.03,
    "parsing_error":76.29,
    "hallucination_error":10.31,
    "predicate_argument_number_error":1.03,
    "wrong_order_error":2.06,
    "missing_step_error":7.22,
    "affordance_error":null,
    "additional_step_error":1.03
  },
  {
    "Model":"Qwen\/Qwen3-0.6B",
    "Model Family":"Qwen3",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":0.8,
    "task_success_rate":0.0,
    "state_goal":4.0,
    "relation_goal":12.55,
    "action_goal":0,
    "total_goal":10.09,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"Qwen\/Qwen3-1.7B",
    "Model Family":"Qwen3",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.0,
    "task_success_rate":0.0,
    "state_goal":4.0,
    "relation_goal":12.55,
    "action_goal":0,
    "total_goal":10.09,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"Qwen\/Qwen3-14B",
    "Model Family":"Qwen3",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":14.8,
    "task_success_rate":8.25,
    "state_goal":14.0,
    "relation_goal":14.17,
    "action_goal":0,
    "total_goal":14.12,
    "execution_success_rate":8.25,
    "parsing_error":90.72,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":1.03,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"Qwen\/Qwen3-235B-A22B-Thinking-2507",
    "Model Family":"Qwen3",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":235.1,
    "task_success_rate":0.0,
    "state_goal":4.0,
    "relation_goal":11.65,
    "action_goal":0,
    "total_goal":9.56,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"Qwen\/Qwen3-32B",
    "Model Family":"Qwen3",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":32.8,
    "task_success_rate":8.25,
    "state_goal":10.0,
    "relation_goal":14.98,
    "action_goal":0,
    "total_goal":13.54,
    "execution_success_rate":8.25,
    "parsing_error":91.75,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"Qwen\/Qwen3-4B",
    "Model Family":"Qwen3",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":4.0,
    "task_success_rate":1.03,
    "state_goal":10.0,
    "relation_goal":12.55,
    "action_goal":0,
    "total_goal":11.82,
    "execution_success_rate":1.03,
    "parsing_error":97.94,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"Qwen\/Qwen3-8B",
    "Model Family":"Qwen3",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.2,
    "task_success_rate":3.09,
    "state_goal":7.0,
    "relation_goal":12.96,
    "action_goal":0,
    "total_goal":11.24,
    "execution_success_rate":3.09,
    "parsing_error":96.91,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"baichuan-inc\/Baichuan2-7B-Base",
    "Model Family":"Baichuan",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.0,
    "task_success_rate":0.0,
    "state_goal":1.31,
    "relation_goal":14.86,
    "action_goal":0,
    "total_goal":10.5,
    "execution_success_rate":0.0,
    "parsing_error":92.21,
    "hallucination_error":2.6,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":3.9,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"baichuan-inc\/Baichuan2-7B-Chat",
    "Model Family":"Baichuan",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.0,
    "task_success_rate":1.3,
    "state_goal":9.15,
    "relation_goal":13.62,
    "action_goal":0,
    "total_goal":12.18,
    "execution_success_rate":1.3,
    "parsing_error":77.92,
    "hallucination_error":15.58,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":2.6,
    "affordance_error":null,
    "additional_step_error":1.3
  },
  {
    "Model":"CohereLabs\/c4ai-command-r-08-2024",
    "Model Family":"Cohere",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":32.3,
    "task_success_rate":16.0,
    "state_goal":22.0,
    "relation_goal":25.94,
    "action_goal":0,
    "total_goal":24.86,
    "execution_success_rate":19.0,
    "parsing_error":5.0,
    "hallucination_error":13.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":8.0,
    "missing_step_error":43.0,
    "affordance_error":null,
    "additional_step_error":4.0
  },
  {
    "Model":"CohereLabs\/c4ai-command-r-plus-08-2024",
    "Model Family":"Cohere",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":103.8,
    "task_success_rate":28.0,
    "state_goal":29.0,
    "relation_goal":31.95,
    "action_goal":0,
    "total_goal":31.15,
    "execution_success_rate":35.0,
    "parsing_error":0.0,
    "hallucination_error":1.0,
    "predicate_argument_number_error":15.0,
    "wrong_order_error":10.0,
    "missing_step_error":39.0,
    "affordance_error":null,
    "additional_step_error":15.0
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1",
    "Model Family":"DeepSeek",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":684.5,
    "task_success_rate":1.0,
    "state_goal":6.0,
    "relation_goal":12.03,
    "action_goal":0,
    "total_goal":10.38,
    "execution_success_rate":1.0,
    "parsing_error":98.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":1.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Llama-70B",
    "Model Family":"DeepSeek",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":70.6,
    "task_success_rate":11.22,
    "state_goal":13.0,
    "relation_goal":16.93,
    "action_goal":0,
    "total_goal":15.82,
    "execution_success_rate":11.22,
    "parsing_error":86.73,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":2.04,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Llama-8B",
    "Model Family":"DeepSeek",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.0,
    "task_success_rate":4.08,
    "state_goal":15.0,
    "relation_goal":13.39,
    "action_goal":0,
    "total_goal":13.84,
    "execution_success_rate":5.1,
    "parsing_error":81.63,
    "hallucination_error":3.06,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":2.04,
    "missing_step_error":4.08,
    "affordance_error":null,
    "additional_step_error":3.06
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-1.5B",
    "Model Family":"DeepSeek",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":1.8,
    "task_success_rate":0.0,
    "state_goal":4.0,
    "relation_goal":12.2,
    "action_goal":0,
    "total_goal":9.89,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-14B",
    "Model Family":"DeepSeek",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":14.8,
    "task_success_rate":7.14,
    "state_goal":13.0,
    "relation_goal":14.17,
    "action_goal":0,
    "total_goal":13.84,
    "execution_success_rate":7.14,
    "parsing_error":87.76,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":1.02,
    "missing_step_error":2.04,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-32B",
    "Model Family":"DeepSeek",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":32.8,
    "task_success_rate":17.35,
    "state_goal":26.5,
    "relation_goal":16.73,
    "action_goal":0,
    "total_goal":19.49,
    "execution_success_rate":19.39,
    "parsing_error":70.41,
    "hallucination_error":4.08,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":2.04,
    "missing_step_error":3.06,
    "affordance_error":null,
    "additional_step_error":1.02
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-7B",
    "Model Family":"DeepSeek",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.6,
    "task_success_rate":0.0,
    "state_goal":4.0,
    "relation_goal":12.2,
    "action_goal":0,
    "total_goal":9.89,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"deepseek-ai\/DeepSeek-V3",
    "Model Family":"DeepSeek",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":684.5,
    "task_success_rate":41.0,
    "state_goal":54.5,
    "relation_goal":42.67,
    "action_goal":0,
    "total_goal":45.9,
    "execution_success_rate":51.0,
    "parsing_error":0.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":6.0,
    "missing_step_error":36.0,
    "affordance_error":null,
    "additional_step_error":1.0
  },
  {
    "Model":"google\/gemma-1.1-2b-it",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":0.0,
    "state_goal":2.2,
    "relation_goal":12.96,
    "action_goal":0,
    "total_goal":9.77,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"google\/gemma-1.1-7b-it",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.5,
    "task_success_rate":1.11,
    "state_goal":9.89,
    "relation_goal":14.81,
    "action_goal":0,
    "total_goal":13.36,
    "execution_success_rate":7.78,
    "parsing_error":32.22,
    "hallucination_error":16.67,
    "predicate_argument_number_error":8.89,
    "wrong_order_error":2.22,
    "missing_step_error":27.78,
    "affordance_error":null,
    "additional_step_error":4.44
  },
  {
    "Model":"google\/gemma-2-27b",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":27.2,
    "task_success_rate":14.44,
    "state_goal":13.19,
    "relation_goal":19.44,
    "action_goal":0,
    "total_goal":17.59,
    "execution_success_rate":16.67,
    "parsing_error":72.22,
    "hallucination_error":1.11,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":1.11,
    "missing_step_error":6.67,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"google\/gemma-2-27b-it",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":27.2,
    "task_success_rate":30.0,
    "state_goal":25.27,
    "relation_goal":41.67,
    "action_goal":0,
    "total_goal":36.81,
    "execution_success_rate":41.11,
    "parsing_error":22.22,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":12.22,
    "missing_step_error":24.44,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"google\/gemma-2-2b",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.6,
    "task_success_rate":0.0,
    "state_goal":2.2,
    "relation_goal":12.96,
    "action_goal":0,
    "total_goal":9.77,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"google\/gemma-2-2b-it",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.6,
    "task_success_rate":0.0,
    "state_goal":6.59,
    "relation_goal":10.65,
    "action_goal":0,
    "total_goal":9.45,
    "execution_success_rate":0.0,
    "parsing_error":24.44,
    "hallucination_error":11.11,
    "predicate_argument_number_error":8.89,
    "wrong_order_error":0.0,
    "missing_step_error":54.44,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"google\/gemma-2-9b",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":9.2,
    "task_success_rate":0.0,
    "state_goal":3.3,
    "relation_goal":13.43,
    "action_goal":0,
    "total_goal":10.42,
    "execution_success_rate":0.0,
    "parsing_error":98.89,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":1.11,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":1.11
  },
  {
    "Model":"google\/gemma-2-9b-it",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":9.2,
    "task_success_rate":20.0,
    "state_goal":15.38,
    "relation_goal":29.17,
    "action_goal":0,
    "total_goal":25.08,
    "execution_success_rate":28.89,
    "parsing_error":32.22,
    "hallucination_error":1.11,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":7.78,
    "missing_step_error":28.89,
    "affordance_error":null,
    "additional_step_error":3.33
  },
  {
    "Model":"google\/gemma-2b",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":0.0,
    "state_goal":2.2,
    "relation_goal":12.96,
    "action_goal":0,
    "total_goal":9.77,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"google\/gemma-2b-it",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":0.0,
    "state_goal":2.2,
    "relation_goal":12.96,
    "action_goal":0,
    "total_goal":9.77,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"google\/gemma-3-12b-it",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":12.2,
    "task_success_rate":29.21,
    "state_goal":37.64,
    "relation_goal":31.07,
    "action_goal":0,
    "total_goal":33.0,
    "execution_success_rate":34.83,
    "parsing_error":31.46,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":5.62,
    "missing_step_error":26.97,
    "affordance_error":null,
    "additional_step_error":2.25
  },
  {
    "Model":"google\/gemma-3-12b-pt",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":12.2,
    "task_success_rate":1.12,
    "state_goal":4.49,
    "relation_goal":14.49,
    "action_goal":0,
    "total_goal":11.55,
    "execution_success_rate":3.37,
    "parsing_error":96.63,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"google\/gemma-3-27b-it",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":27.4,
    "task_success_rate":32.58,
    "state_goal":36.52,
    "relation_goal":39.02,
    "action_goal":0,
    "total_goal":38.28,
    "execution_success_rate":39.33,
    "parsing_error":30.34,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":5.62,
    "missing_step_error":22.47,
    "affordance_error":null,
    "additional_step_error":1.12
  },
  {
    "Model":"google\/gemma-3-4b-it",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":4.3,
    "task_success_rate":2.25,
    "state_goal":7.87,
    "relation_goal":16.82,
    "action_goal":0,
    "total_goal":14.19,
    "execution_success_rate":7.87,
    "parsing_error":40.45,
    "hallucination_error":0.0,
    "predicate_argument_number_error":1.12,
    "wrong_order_error":5.62,
    "missing_step_error":35.96,
    "affordance_error":null,
    "additional_step_error":4.49
  },
  {
    "Model":"google\/gemma-3-4b-pt",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":4.3,
    "task_success_rate":0.0,
    "state_goal":3.37,
    "relation_goal":13.55,
    "action_goal":0,
    "total_goal":10.56,
    "execution_success_rate":1.12,
    "parsing_error":97.75,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":1.12,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"google\/gemma-7b",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.5,
    "task_success_rate":0.0,
    "state_goal":7.69,
    "relation_goal":13.43,
    "action_goal":0,
    "total_goal":11.73,
    "execution_success_rate":1.11,
    "parsing_error":92.22,
    "hallucination_error":1.11,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":2.22,
    "missing_step_error":1.11,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"google\/gemma-7b-it",
    "Model Family":"Gemma",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.5,
    "task_success_rate":0.0,
    "state_goal":6.59,
    "relation_goal":13.43,
    "action_goal":0,
    "total_goal":11.4,
    "execution_success_rate":0.0,
    "parsing_error":30.0,
    "hallucination_error":41.11,
    "predicate_argument_number_error":16.67,
    "wrong_order_error":0.0,
    "missing_step_error":12.22,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"ibm-granite\/granite-3.1-2b-base",
    "Model Family":"Granite",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":1.18,
    "state_goal":3.53,
    "relation_goal":15.31,
    "action_goal":0,
    "total_goal":11.74,
    "execution_success_rate":1.18,
    "parsing_error":90.59,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":1.18,
    "missing_step_error":2.35,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"ibm-granite\/granite-3.1-2b-instruct",
    "Model Family":"Granite",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":5.81,
    "state_goal":10.59,
    "relation_goal":18.41,
    "action_goal":0,
    "total_goal":16.08,
    "execution_success_rate":9.3,
    "parsing_error":43.02,
    "hallucination_error":6.98,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":6.98,
    "missing_step_error":29.07,
    "affordance_error":null,
    "additional_step_error":4.65
  },
  {
    "Model":"ibm-granite\/granite-3.1-8b-base",
    "Model Family":"Granite",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.2,
    "task_success_rate":4.71,
    "state_goal":9.41,
    "relation_goal":16.84,
    "action_goal":0,
    "total_goal":14.59,
    "execution_success_rate":8.24,
    "parsing_error":69.41,
    "hallucination_error":3.53,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":1.18,
    "missing_step_error":11.76,
    "affordance_error":null,
    "additional_step_error":1.18
  },
  {
    "Model":"ibm-granite\/granite-3.1-8b-instruct",
    "Model Family":"Granite",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.2,
    "task_success_rate":3.49,
    "state_goal":10.59,
    "relation_goal":18.91,
    "action_goal":0,
    "total_goal":16.43,
    "execution_success_rate":4.65,
    "parsing_error":48.84,
    "hallucination_error":8.14,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":9.3,
    "missing_step_error":20.93,
    "affordance_error":null,
    "additional_step_error":2.33
  },
  {
    "Model":"ibm-granite\/granite-3.2-2b-instruct",
    "Model Family":"Granite",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":0.0,
    "state_goal":7.06,
    "relation_goal":13.43,
    "action_goal":0,
    "total_goal":11.54,
    "execution_success_rate":3.49,
    "parsing_error":41.86,
    "hallucination_error":3.49,
    "predicate_argument_number_error":1.16,
    "wrong_order_error":11.63,
    "missing_step_error":33.72,
    "affordance_error":null,
    "additional_step_error":3.49
  },
  {
    "Model":"ibm-granite\/granite-3.2-8b-instruct",
    "Model Family":"Granite",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.2,
    "task_success_rate":4.65,
    "state_goal":11.76,
    "relation_goal":16.92,
    "action_goal":0,
    "total_goal":15.38,
    "execution_success_rate":8.14,
    "parsing_error":44.19,
    "hallucination_error":5.81,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":15.12,
    "missing_step_error":15.12,
    "affordance_error":null,
    "additional_step_error":5.81
  },
  {
    "Model":"ibm-granite\/granite-3.3-2b-base",
    "Model Family":"Granite",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":1.18,
    "state_goal":3.53,
    "relation_goal":13.27,
    "action_goal":0,
    "total_goal":10.32,
    "execution_success_rate":0.0,
    "parsing_error":90.59,
    "hallucination_error":1.18,
    "predicate_argument_number_error":1.18,
    "wrong_order_error":2.35,
    "missing_step_error":4.71,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"ibm-granite\/granite-3.3-2b-instruct",
    "Model Family":"Granite",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":2.5,
    "task_success_rate":2.33,
    "state_goal":7.06,
    "relation_goal":15.92,
    "action_goal":0,
    "total_goal":13.29,
    "execution_success_rate":3.49,
    "parsing_error":38.37,
    "hallucination_error":13.95,
    "predicate_argument_number_error":4.65,
    "wrong_order_error":10.47,
    "missing_step_error":26.74,
    "affordance_error":null,
    "additional_step_error":8.14
  },
  {
    "Model":"ibm-granite\/granite-3.3-8b-base",
    "Model Family":"Granite",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.2,
    "task_success_rate":4.71,
    "state_goal":5.88,
    "relation_goal":15.82,
    "action_goal":0,
    "total_goal":12.81,
    "execution_success_rate":4.71,
    "parsing_error":64.71,
    "hallucination_error":7.06,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":2.35,
    "missing_step_error":11.76,
    "affordance_error":null,
    "additional_step_error":2.35
  },
  {
    "Model":"ibm-granite\/granite-3.3-8b-instruct",
    "Model Family":"Granite",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.2,
    "task_success_rate":3.49,
    "state_goal":8.24,
    "relation_goal":16.42,
    "action_goal":0,
    "total_goal":13.99,
    "execution_success_rate":8.14,
    "parsing_error":48.84,
    "hallucination_error":10.47,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":4.65,
    "missing_step_error":22.09,
    "affordance_error":null,
    "additional_step_error":3.49
  },
  {
    "Model":"LGAI-EXAONE\/EXAONE-3.5-32B-Instruct",
    "Model Family":"Exaone",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":32.0,
    "task_success_rate":29.0,
    "state_goal":32.5,
    "relation_goal":34.02,
    "action_goal":0,
    "total_goal":33.61,
    "execution_success_rate":34.0,
    "parsing_error":0.0,
    "hallucination_error":1.0,
    "predicate_argument_number_error":1.0,
    "wrong_order_error":11.0,
    "missing_step_error":49.0,
    "affordance_error":null,
    "additional_step_error":1.0
  },
  {
    "Model":"LGAI-EXAONE\/EXAONE-Deep-32B",
    "Model Family":"Exaone",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":32.0,
    "task_success_rate":8.0,
    "state_goal":15.0,
    "relation_goal":18.8,
    "action_goal":0,
    "total_goal":17.76,
    "execution_success_rate":10.0,
    "parsing_error":80.0,
    "hallucination_error":1.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":4.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Llama-2-13b-hf",
    "Model Family":"Llama-2",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":13.0,
    "task_success_rate":0.0,
    "state_goal":1.42,
    "relation_goal":16.12,
    "action_goal":0,
    "total_goal":11.11,
    "execution_success_rate":0.0,
    "parsing_error":98.61,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Llama-2-70b-hf",
    "Model Family":"Llama-2",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":69.0,
    "task_success_rate":0.0,
    "state_goal":1.42,
    "relation_goal":16.12,
    "action_goal":0,
    "total_goal":11.11,
    "execution_success_rate":0.0,
    "parsing_error":100.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":0.0,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Llama-2-7b-hf",
    "Model Family":"Llama-2",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":6.7,
    "task_success_rate":0.0,
    "state_goal":1.42,
    "relation_goal":16.85,
    "action_goal":0,
    "total_goal":11.59,
    "execution_success_rate":0.0,
    "parsing_error":91.67,
    "hallucination_error":2.78,
    "predicate_argument_number_error":1.39,
    "wrong_order_error":0.0,
    "missing_step_error":2.78,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Llama-3.1-70B",
    "Model Family":"Llama-3",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":70.6,
    "task_success_rate":3.06,
    "state_goal":8.0,
    "relation_goal":13.78,
    "action_goal":0,
    "total_goal":12.15,
    "execution_success_rate":3.06,
    "parsing_error":93.88,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":1.02,
    "missing_step_error":2.04,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Llama-3.1-8B",
    "Model Family":"Llama-3",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.0,
    "task_success_rate":3.06,
    "state_goal":10.0,
    "relation_goal":13.39,
    "action_goal":0,
    "total_goal":12.43,
    "execution_success_rate":5.1,
    "parsing_error":78.57,
    "hallucination_error":3.06,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":2.04,
    "missing_step_error":7.14,
    "affordance_error":null,
    "additional_step_error":2.04
  },
  {
    "Model":"meta-llama\/Llama-3.3-70B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":70.6,
    "task_success_rate":41.0,
    "state_goal":49.5,
    "relation_goal":43.05,
    "action_goal":0,
    "total_goal":44.81,
    "execution_success_rate":49.0,
    "parsing_error":0.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":10.0,
    "missing_step_error":38.0,
    "affordance_error":null,
    "additional_step_error":6.0
  },
  {
    "Model":"meta-llama\/Llama-4-Maverick-17B-128E-Instruct-FP8",
    "Model Family":"Llama",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":401.6,
    "task_success_rate":45.0,
    "state_goal":56.5,
    "relation_goal":53.95,
    "action_goal":0,
    "total_goal":54.64,
    "execution_success_rate":56.0,
    "parsing_error":16.0,
    "hallucination_error":1.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":6.0,
    "missing_step_error":18.0,
    "affordance_error":null,
    "additional_step_error":3.0
  },
  {
    "Model":"meta-llama\/Llama-4-Scout-17B-16E-Instruct",
    "Model Family":"Llama",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":108.6,
    "task_success_rate":37.0,
    "state_goal":35.5,
    "relation_goal":56.58,
    "action_goal":0,
    "total_goal":50.82,
    "execution_success_rate":50.0,
    "parsing_error":0.0,
    "hallucination_error":7.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":14.0,
    "missing_step_error":29.0,
    "affordance_error":null,
    "additional_step_error":4.0
  },
  {
    "Model":"meta-llama\/Meta-Llama-3-70B",
    "Model Family":"Llama-3",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":70.6,
    "task_success_rate":1.02,
    "state_goal":8.0,
    "relation_goal":12.6,
    "action_goal":0,
    "total_goal":11.3,
    "execution_success_rate":2.04,
    "parsing_error":95.92,
    "hallucination_error":1.02,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":1.02,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Meta-Llama-3-8B",
    "Model Family":"Llama-3",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.0,
    "task_success_rate":1.02,
    "state_goal":4.0,
    "relation_goal":12.6,
    "action_goal":0,
    "total_goal":10.17,
    "execution_success_rate":1.02,
    "parsing_error":92.86,
    "hallucination_error":1.02,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":0.0,
    "missing_step_error":3.06,
    "affordance_error":null,
    "additional_step_error":0.0
  },
  {
    "Model":"meta-llama\/Meta-Llama-3-8B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":8.0,
    "task_success_rate":13.27,
    "state_goal":18.0,
    "relation_goal":21.65,
    "action_goal":0,
    "total_goal":20.62,
    "execution_success_rate":18.37,
    "parsing_error":16.33,
    "hallucination_error":11.22,
    "predicate_argument_number_error":3.06,
    "wrong_order_error":5.1,
    "missing_step_error":36.73,
    "affordance_error":null,
    "additional_step_error":5.1
  },
  {
    "Model":"mistralai\/Mistral-7B-Instruct-v0.2",
    "Model Family":"Mistral",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":7.2,
    "task_success_rate":3.0,
    "state_goal":5.0,
    "relation_goal":13.53,
    "action_goal":0,
    "total_goal":11.2,
    "execution_success_rate":5.0,
    "parsing_error":8.0,
    "hallucination_error":38.0,
    "predicate_argument_number_error":5.0,
    "wrong_order_error":5.0,
    "missing_step_error":39.0,
    "affordance_error":null,
    "additional_step_error":3.0
  },
  {
    "Model":"mistralai\/Mixtral-8x7B-Instruct-v0.1",
    "Model Family":"Mistral",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":46.7,
    "task_success_rate":6.0,
    "state_goal":17.0,
    "relation_goal":15.79,
    "action_goal":0,
    "total_goal":16.12,
    "execution_success_rate":7.0,
    "parsing_error":21.0,
    "hallucination_error":53.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":6.0,
    "missing_step_error":11.0,
    "affordance_error":null,
    "additional_step_error":1.0
  },
  {
    "Model":"mistralai\/Mixtral-8x22B-Instruct-v0.1",
    "Model Family":"Mistral",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":140.6,
    "task_success_rate":31.0,
    "state_goal":37.5,
    "relation_goal":38.16,
    "action_goal":0,
    "total_goal":37.98,
    "execution_success_rate":40.0,
    "parsing_error":3.0,
    "hallucination_error":6.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":10.0,
    "missing_step_error":32.0,
    "affordance_error":null,
    "additional_step_error":2.0
  },
  {
    "Model":"moonshotai\/Kimi-K2-Instruct",
    "Model Family":"Kimi",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":1000.0,
    "task_success_rate":53.0,
    "state_goal":58.5,
    "relation_goal":65.6,
    "action_goal":0,
    "total_goal":63.66,
    "execution_success_rate":66.0,
    "parsing_error":0.0,
    "hallucination_error":1.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":6.0,
    "missing_step_error":27.0,
    "affordance_error":null,
    "additional_step_error":3.0
  },
  {
    "Model":"openai\/gpt-oss-120b",
    "Model Family":"GPT-OSS",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":120.4,
    "task_success_rate":53.54,
    "state_goal":52.15,
    "relation_goal":63.45,
    "action_goal":0,
    "total_goal":60.5,
    "execution_success_rate":64.65,
    "parsing_error":1.01,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":2.02,
    "missing_step_error":32.32,
    "affordance_error":null,
    "additional_step_error":2.02
  },
  {
    "Model":"openai\/gpt-oss-20b",
    "Model Family":"GPT-OSS",
    "dataset":"behavior",
    "eval_type":"action_sequencing",
    "Model Size (B)":21.5,
    "task_success_rate":35.0,
    "state_goal":28.0,
    "relation_goal":48.5,
    "action_goal":0,
    "total_goal":42.9,
    "execution_success_rate":46.0,
    "parsing_error":37.0,
    "hallucination_error":0.0,
    "predicate_argument_number_error":0.0,
    "wrong_order_error":2.0,
    "missing_step_error":13.0,
    "affordance_error":null,
    "additional_step_error":1.0
  }
]