[
  {
    "Model":"01-ai\/Yi-1.5-34B",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":34.4,
    "overall_precision":35.0597609562,
    "overall_recall":39.2273402675,
    "overall_f1":37.0266479663,
    "num_predicted_conditions":755,
    "num_GT_conditions":673,
    "num_satisfied_conditions":264,
    "num_unsatisfied_conditions":409,
    "num_false_positive_conditions":489,
    "state_goal_precision":28.4431137725,
    "state_goal_recall":62.091503268,
    "state_goal_f1":39.0143737166,
    "state_goal_num_predicted":333,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":95,
    "state_goal_num_unsatisfied":58,
    "state_goal_num_false_positive":239,
    "relation_goal_precision":41.8316831683,
    "relation_goal_recall":32.5,
    "relation_goal_f1":36.5800865801,
    "relation_goal_num_predicted":407,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":169,
    "relation_goal_num_unsatisfied":351,
    "relation_goal_num_false_positive":235,
    "grammatically_valid_num":740,
    "grammatically_valid_rate":98.0132450331,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":15,
    "state_hallucination_rate":1.9867549669,
    "object_hallucination_num":15,
    "object_hallucination_rate":1.9867549669
  },
  {
    "Model":"01-ai\/Yi-1.5-34B-Chat",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":34.4,
    "overall_precision":42.9710867398,
    "overall_recall":64.0416047548,
    "overall_f1":51.4319809069,
    "num_predicted_conditions":1011,
    "num_GT_conditions":673,
    "num_satisfied_conditions":431,
    "num_unsatisfied_conditions":242,
    "num_false_positive_conditions":572,
    "state_goal_precision":25.6578947368,
    "state_goal_recall":76.4705882353,
    "state_goal_f1":38.4236453202,
    "state_goal_num_predicted":455,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":117,
    "state_goal_num_unsatisfied":36,
    "state_goal_num_false_positive":339,
    "relation_goal_precision":59.4696969697,
    "relation_goal_recall":60.3846153846,
    "relation_goal_f1":59.9236641221,
    "relation_goal_num_predicted":537,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":314,
    "relation_goal_num_unsatisfied":206,
    "relation_goal_num_false_positive":214,
    "grammatically_valid_num":992,
    "grammatically_valid_rate":98.1206726014,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":19,
    "state_hallucination_rate":1.8793273986,
    "object_hallucination_num":13,
    "object_hallucination_rate":1.2858555885
  },
  {
    "Model":"01-ai\/Yi-1.5-6B",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":6.1,
    "overall_precision":15.8933859822,
    "overall_recall":23.9227340267,
    "overall_f1":19.0984578885,
    "num_predicted_conditions":1016,
    "num_GT_conditions":673,
    "num_satisfied_conditions":161,
    "num_unsatisfied_conditions":512,
    "num_false_positive_conditions":852,
    "state_goal_precision":9.7799511002,
    "state_goal_recall":26.1437908497,
    "state_goal_f1":14.2348754448,
    "state_goal_num_predicted":409,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":40,
    "state_goal_num_unsatisfied":113,
    "state_goal_num_false_positive":369,
    "relation_goal_precision":24.297188755,
    "relation_goal_recall":23.2692307692,
    "relation_goal_f1":23.7721021611,
    "relation_goal_num_predicted":501,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":121,
    "relation_goal_num_unsatisfied":399,
    "relation_goal_num_false_positive":377,
    "grammatically_valid_num":910,
    "grammatically_valid_rate":89.5669291339,
    "format_error_num":16,
    "format_error_rate":1.5748031496,
    "state_hallucination_num":106,
    "state_hallucination_rate":10.4330708661,
    "object_hallucination_num":44,
    "object_hallucination_rate":4.3307086614
  },
  {
    "Model":"01-ai\/Yi-1.5-6B-Chat",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":6.1,
    "overall_precision":15.8933859822,
    "overall_recall":23.9227340267,
    "overall_f1":19.0984578885,
    "num_predicted_conditions":1016,
    "num_GT_conditions":673,
    "num_satisfied_conditions":161,
    "num_unsatisfied_conditions":512,
    "num_false_positive_conditions":852,
    "state_goal_precision":9.7799511002,
    "state_goal_recall":26.1437908497,
    "state_goal_f1":14.2348754448,
    "state_goal_num_predicted":409,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":40,
    "state_goal_num_unsatisfied":113,
    "state_goal_num_false_positive":369,
    "relation_goal_precision":24.297188755,
    "relation_goal_recall":23.2692307692,
    "relation_goal_f1":23.7721021611,
    "relation_goal_num_predicted":501,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":121,
    "relation_goal_num_unsatisfied":399,
    "relation_goal_num_false_positive":377,
    "grammatically_valid_num":910,
    "grammatically_valid_rate":89.5669291339,
    "format_error_num":16,
    "format_error_rate":1.5748031496,
    "state_hallucination_num":106,
    "state_hallucination_rate":10.4330708661,
    "object_hallucination_num":44,
    "object_hallucination_rate":4.3307086614
  },
  {
    "Model":"01-ai\/Yi-1.5-9B",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.8,
    "overall_precision":15.0617283951,
    "overall_recall":18.1277860327,
    "overall_f1":16.4531355361,
    "num_predicted_conditions":817,
    "num_GT_conditions":673,
    "num_satisfied_conditions":122,
    "num_unsatisfied_conditions":551,
    "num_false_positive_conditions":688,
    "state_goal_precision":17.1875,
    "state_goal_recall":21.568627451,
    "state_goal_f1":19.1304347826,
    "state_goal_num_predicted":192,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":33,
    "state_goal_num_unsatisfied":120,
    "state_goal_num_false_positive":159,
    "relation_goal_precision":35.1778656126,
    "relation_goal_recall":17.1153846154,
    "relation_goal_f1":23.0271668823,
    "relation_goal_num_predicted":260,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":89,
    "relation_goal_num_unsatisfied":431,
    "relation_goal_num_false_positive":164,
    "grammatically_valid_num":452,
    "grammatically_valid_rate":55.3243574051,
    "format_error_num":359,
    "format_error_rate":43.94124847,
    "state_hallucination_num":365,
    "state_hallucination_rate":44.6756425949,
    "object_hallucination_num":365,
    "object_hallucination_rate":44.6756425949
  },
  {
    "Model":"01-ai\/Yi-Coder-1.5B",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.5,
    "overall_precision":1.4420062696,
    "overall_recall":3.4175334324,
    "overall_f1":2.0282186949,
    "num_predicted_conditions":1597,
    "num_GT_conditions":673,
    "num_satisfied_conditions":23,
    "num_unsatisfied_conditions":650,
    "num_false_positive_conditions":1572,
    "state_goal_precision":5.0,
    "state_goal_recall":3.2679738562,
    "state_goal_f1":3.95256917,
    "state_goal_num_predicted":100,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":5,
    "state_goal_num_unsatisfied":148,
    "state_goal_num_false_positive":95,
    "relation_goal_precision":9.6256684492,
    "relation_goal_recall":3.4615384615,
    "relation_goal_f1":5.0919377652,
    "relation_goal_num_predicted":189,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":18,
    "relation_goal_num_unsatisfied":502,
    "relation_goal_num_false_positive":169,
    "grammatically_valid_num":289,
    "grammatically_valid_rate":18.0964308078,
    "format_error_num":1306,
    "format_error_rate":81.778334377,
    "state_hallucination_num":1308,
    "state_hallucination_rate":81.9035691922,
    "object_hallucination_num":1341,
    "object_hallucination_rate":83.9699436443
  },
  {
    "Model":"01-ai\/Yi-Coder-1.5B-Chat",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.5,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":1115,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":1115,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":3,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":3,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":5,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":5,
    "grammatically_valid_num":8,
    "grammatically_valid_rate":0.7174887892,
    "format_error_num":1100,
    "format_error_rate":98.6547085202,
    "state_hallucination_num":1107,
    "state_hallucination_rate":99.2825112108,
    "object_hallucination_num":1103,
    "object_hallucination_rate":98.9237668161
  },
  {
    "Model":"01-ai\/Yi-Coder-9B",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.8,
    "overall_precision":35.3225806452,
    "overall_recall":32.5408618128,
    "overall_f1":33.8747099768,
    "num_predicted_conditions":619,
    "num_GT_conditions":673,
    "num_satisfied_conditions":219,
    "num_unsatisfied_conditions":454,
    "num_false_positive_conditions":401,
    "state_goal_precision":35.1239669421,
    "state_goal_recall":55.5555555556,
    "state_goal_f1":43.0379746835,
    "state_goal_num_predicted":241,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":85,
    "state_goal_num_unsatisfied":68,
    "state_goal_num_false_positive":157,
    "relation_goal_precision":37.6404494382,
    "relation_goal_recall":25.7692307692,
    "relation_goal_f1":30.5936073059,
    "relation_goal_num_predicted":356,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":134,
    "relation_goal_num_unsatisfied":386,
    "relation_goal_num_false_positive":222,
    "grammatically_valid_num":597,
    "grammatically_valid_rate":96.4458804523,
    "format_error_num":7,
    "format_error_rate":1.1308562197,
    "state_hallucination_num":22,
    "state_hallucination_rate":3.5541195477,
    "object_hallucination_num":25,
    "object_hallucination_rate":4.0387722132
  },
  {
    "Model":"01-ai\/Yi-Coder-9B-Chat",
    "Model Family":"Yi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.8,
    "overall_precision":26.8933539413,
    "overall_recall":51.7087667162,
    "overall_f1":35.3838332486,
    "num_predicted_conditions":1293,
    "num_GT_conditions":673,
    "num_satisfied_conditions":348,
    "num_unsatisfied_conditions":325,
    "num_false_positive_conditions":946,
    "state_goal_precision":20.6823027719,
    "state_goal_recall":63.3986928105,
    "state_goal_f1":31.1897106109,
    "state_goal_num_predicted":468,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":97,
    "state_goal_num_unsatisfied":56,
    "state_goal_num_false_positive":372,
    "relation_goal_precision":57.3059360731,
    "relation_goal_recall":48.2692307692,
    "relation_goal_f1":52.4008350731,
    "relation_goal_num_predicted":438,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":251,
    "relation_goal_num_unsatisfied":269,
    "relation_goal_num_false_positive":187,
    "grammatically_valid_num":906,
    "grammatically_valid_rate":70.0696055684,
    "format_error_num":359,
    "format_error_rate":27.7648878577,
    "state_hallucination_num":387,
    "state_hallucination_rate":29.9303944316,
    "object_hallucination_num":368,
    "object_hallucination_rate":28.4609435422
  },
  {
    "Model":"Qwen\/Qwen-72B-Chat",
    "Model Family":"Qwen",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":72.3,
    "overall_precision":40.671641791,
    "overall_recall":48.588410104,
    "overall_f1":44.278943805,
    "num_predicted_conditions":803,
    "num_GT_conditions":673,
    "num_satisfied_conditions":327,
    "num_unsatisfied_conditions":346,
    "num_false_positive_conditions":477,
    "state_goal_precision":35.7142857143,
    "state_goal_recall":78.431372549,
    "state_goal_f1":49.0797546012,
    "state_goal_num_predicted":335,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":120,
    "state_goal_num_unsatisfied":33,
    "state_goal_num_false_positive":216,
    "relation_goal_precision":48.3644859813,
    "relation_goal_recall":39.8076923077,
    "relation_goal_f1":43.6708860759,
    "relation_goal_num_predicted":428,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":207,
    "relation_goal_num_unsatisfied":313,
    "relation_goal_num_false_positive":221,
    "grammatically_valid_num":763,
    "grammatically_valid_rate":95.0186799502,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":40,
    "state_hallucination_rate":4.9813200498,
    "object_hallucination_num":2,
    "object_hallucination_rate":0.2490660025
  },
  {
    "Model":"Qwen\/Qwen-7B-Chat",
    "Model Family":"Qwen",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":7.7,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":738,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":738,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":20,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":20,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":3,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":3,
    "grammatically_valid_num":20,
    "grammatically_valid_rate":2.7100271003,
    "format_error_num":718,
    "format_error_rate":97.2899728997,
    "state_hallucination_num":715,
    "state_hallucination_rate":96.8834688347,
    "object_hallucination_num":722,
    "object_hallucination_rate":97.8319783198
  },
  {
    "Model":"Qwen\/Qwen1.5-1.8B-Chat",
    "Model Family":"Qwen1.5",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.8,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":2461,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":2461,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":8,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":8,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":1,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":1,
    "grammatically_valid_num":9,
    "grammatically_valid_rate":0.365704998,
    "format_error_num":2449,
    "format_error_rate":99.512393336,
    "state_hallucination_num":2452,
    "state_hallucination_rate":99.634295002,
    "object_hallucination_num":2458,
    "object_hallucination_rate":99.878098334
  },
  {
    "Model":"Qwen\/Qwen1.5-14B-Chat",
    "Model Family":"Qwen1.5",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":14.2,
    "overall_precision":9.5238095238,
    "overall_recall":0.2971768202,
    "overall_f1":0.5763688761,
    "num_predicted_conditions":21,
    "num_GT_conditions":673,
    "num_satisfied_conditions":2,
    "num_unsatisfied_conditions":671,
    "num_false_positive_conditions":19,
    "state_goal_precision":66.6666666667,
    "state_goal_recall":1.3071895425,
    "state_goal_f1":2.5641025641,
    "state_goal_num_predicted":3,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":2,
    "state_goal_num_unsatisfied":151,
    "state_goal_num_false_positive":1,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":1,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":1,
    "grammatically_valid_num":4,
    "grammatically_valid_rate":19.0476190476,
    "format_error_num":17,
    "format_error_rate":80.9523809524,
    "state_hallucination_num":17,
    "state_hallucination_rate":80.9523809524,
    "object_hallucination_num":16,
    "object_hallucination_rate":76.1904761905
  },
  {
    "Model":"Qwen\/Qwen1.5-32B-Chat",
    "Model Family":"Qwen1.5",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":32.5,
    "overall_precision":36.9519832985,
    "overall_recall":26.3001485884,
    "overall_f1":30.7291666667,
    "num_predicted_conditions":482,
    "num_GT_conditions":673,
    "num_satisfied_conditions":177,
    "num_unsatisfied_conditions":496,
    "num_false_positive_conditions":302,
    "state_goal_precision":44.6808510638,
    "state_goal_recall":27.4509803922,
    "state_goal_f1":34.008097166,
    "state_goal_num_predicted":94,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":42,
    "state_goal_num_unsatisfied":111,
    "state_goal_num_false_positive":52,
    "relation_goal_precision":84.9056603774,
    "relation_goal_recall":25.9615384615,
    "relation_goal_f1":39.764359352,
    "relation_goal_num_predicted":162,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":135,
    "relation_goal_num_unsatisfied":385,
    "relation_goal_num_false_positive":24,
    "grammatically_valid_num":256,
    "grammatically_valid_rate":53.112033195,
    "format_error_num":156,
    "format_error_rate":32.3651452282,
    "state_hallucination_num":226,
    "state_hallucination_rate":46.887966805,
    "object_hallucination_num":160,
    "object_hallucination_rate":33.1950207469
  },
  {
    "Model":"Qwen\/Qwen1.5-4B-Chat",
    "Model Family":"Qwen1.5",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":4.0,
    "overall_precision":8.313253012,
    "overall_recall":10.2526002972,
    "overall_f1":9.1816367265,
    "num_predicted_conditions":830,
    "num_GT_conditions":673,
    "num_satisfied_conditions":69,
    "num_unsatisfied_conditions":604,
    "num_false_positive_conditions":761,
    "state_goal_precision":13.0612244898,
    "state_goal_recall":20.9150326797,
    "state_goal_f1":16.0804020101,
    "state_goal_num_predicted":245,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":32,
    "state_goal_num_unsatisfied":121,
    "state_goal_num_false_positive":213,
    "relation_goal_precision":10.8504398827,
    "relation_goal_recall":7.1153846154,
    "relation_goal_f1":8.5946573751,
    "relation_goal_num_predicted":341,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":37,
    "relation_goal_num_unsatisfied":483,
    "relation_goal_num_false_positive":304,
    "grammatically_valid_num":586,
    "grammatically_valid_rate":70.6024096386,
    "format_error_num":150,
    "format_error_rate":18.0722891566,
    "state_hallucination_num":244,
    "state_hallucination_rate":29.3975903614,
    "object_hallucination_num":272,
    "object_hallucination_rate":32.7710843373
  },
  {
    "Model":"Qwen\/Qwen1.5-72B-Chat",
    "Model Family":"Qwen1.5",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":72.3,
    "overall_precision":41.7391304348,
    "overall_recall":57.0579494799,
    "overall_f1":48.2109227872,
    "num_predicted_conditions":920,
    "num_GT_conditions":673,
    "num_satisfied_conditions":384,
    "num_unsatisfied_conditions":289,
    "num_false_positive_conditions":536,
    "state_goal_precision":29.1457286432,
    "state_goal_recall":75.8169934641,
    "state_goal_f1":42.1052631579,
    "state_goal_num_predicted":398,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":116,
    "state_goal_num_unsatisfied":37,
    "state_goal_num_false_positive":282,
    "relation_goal_precision":56.0669456067,
    "relation_goal_recall":51.5384615385,
    "relation_goal_f1":53.7074148297,
    "relation_goal_num_predicted":478,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":268,
    "relation_goal_num_unsatisfied":252,
    "relation_goal_num_false_positive":210,
    "grammatically_valid_num":876,
    "grammatically_valid_rate":95.2173913043,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":44,
    "state_hallucination_rate":4.7826086957,
    "object_hallucination_num":11,
    "object_hallucination_rate":1.1956521739
  },
  {
    "Model":"Qwen\/Qwen1.5-7B-Chat",
    "Model Family":"Qwen1.5",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":7.7,
    "overall_precision":2.094972067,
    "overall_recall":4.4576523031,
    "overall_f1":2.8503562945,
    "num_predicted_conditions":1432,
    "num_GT_conditions":673,
    "num_satisfied_conditions":30,
    "num_unsatisfied_conditions":643,
    "num_false_positive_conditions":1402,
    "state_goal_precision":5.8823529412,
    "state_goal_recall":3.2679738562,
    "state_goal_f1":4.2016806723,
    "state_goal_num_predicted":85,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":5,
    "state_goal_num_unsatisfied":148,
    "state_goal_num_false_positive":80,
    "relation_goal_precision":37.8787878788,
    "relation_goal_recall":4.8076923077,
    "relation_goal_f1":8.5324232082,
    "relation_goal_num_predicted":66,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":25,
    "relation_goal_num_unsatisfied":495,
    "relation_goal_num_false_positive":41,
    "grammatically_valid_num":151,
    "grammatically_valid_rate":10.5446927374,
    "format_error_num":1275,
    "format_error_rate":89.0363128492,
    "state_hallucination_num":1281,
    "state_hallucination_rate":89.4553072626,
    "object_hallucination_num":1284,
    "object_hallucination_rate":89.6648044693
  },
  {
    "Model":"Qwen\/Qwen2-1.5B",
    "Model Family":"Qwen",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.5,
    "overall_precision":1.6279069767,
    "overall_recall":2.0802377415,
    "overall_f1":1.8264840183,
    "num_predicted_conditions":864,
    "num_GT_conditions":673,
    "num_satisfied_conditions":14,
    "num_unsatisfied_conditions":659,
    "num_false_positive_conditions":846,
    "state_goal_precision":1.9108280255,
    "state_goal_recall":1.9607843137,
    "state_goal_f1":1.935483871,
    "state_goal_num_predicted":157,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":3,
    "state_goal_num_unsatisfied":150,
    "state_goal_num_false_positive":154,
    "relation_goal_precision":4.0293040293,
    "relation_goal_recall":2.1153846154,
    "relation_goal_f1":2.7742749054,
    "relation_goal_num_predicted":277,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":11,
    "relation_goal_num_unsatisfied":509,
    "relation_goal_num_false_positive":262,
    "grammatically_valid_num":407,
    "grammatically_valid_rate":47.1064814815,
    "format_error_num":457,
    "format_error_rate":52.8935185185,
    "state_hallucination_num":430,
    "state_hallucination_rate":49.7685185185,
    "object_hallucination_num":578,
    "object_hallucination_rate":66.8981481481
  },
  {
    "Model":"Qwen\/Qwen2-1.5B-Instruct",
    "Model Family":"Qwen",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.5,
    "overall_precision":7.2631578947,
    "overall_recall":10.2526002972,
    "overall_f1":8.5027726433,
    "num_predicted_conditions":956,
    "num_GT_conditions":673,
    "num_satisfied_conditions":69,
    "num_unsatisfied_conditions":604,
    "num_false_positive_conditions":881,
    "state_goal_precision":6.1594202899,
    "state_goal_recall":11.1111111111,
    "state_goal_f1":7.9254079254,
    "state_goal_num_predicted":278,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":17,
    "state_goal_num_unsatisfied":136,
    "state_goal_num_false_positive":259,
    "relation_goal_precision":8.0495356037,
    "relation_goal_recall":10.0,
    "relation_goal_f1":8.9193825043,
    "relation_goal_num_predicted":650,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":52,
    "relation_goal_num_unsatisfied":468,
    "relation_goal_num_false_positive":594,
    "grammatically_valid_num":910,
    "grammatically_valid_rate":95.1882845188,
    "format_error_num":28,
    "format_error_rate":2.9288702929,
    "state_hallucination_num":28,
    "state_hallucination_rate":2.9288702929,
    "object_hallucination_num":54,
    "object_hallucination_rate":5.6485355649
  },
  {
    "Model":"Qwen\/Qwen2-72B",
    "Model Family":"Qwen",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":72.7,
    "overall_precision":60.8294930876,
    "overall_recall":78.4546805349,
    "overall_f1":68.5269305646,
    "num_predicted_conditions":867,
    "num_GT_conditions":673,
    "num_satisfied_conditions":528,
    "num_unsatisfied_conditions":145,
    "num_false_positive_conditions":340,
    "state_goal_precision":36.3905325444,
    "state_goal_recall":80.3921568627,
    "state_goal_f1":50.1018329939,
    "state_goal_num_predicted":337,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":123,
    "state_goal_num_unsatisfied":30,
    "state_goal_num_false_positive":215,
    "relation_goal_precision":77.1428571429,
    "relation_goal_recall":77.8846153846,
    "relation_goal_f1":77.5119617225,
    "relation_goal_num_predicted":525,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":405,
    "relation_goal_num_unsatisfied":115,
    "relation_goal_num_false_positive":120,
    "grammatically_valid_num":862,
    "grammatically_valid_rate":99.4232987313,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":5,
    "state_hallucination_rate":0.5767012687,
    "object_hallucination_num":1,
    "object_hallucination_rate":0.1153402537
  },
  {
    "Model":"Qwen\/Qwen2-72B-Instruct",
    "Model Family":"Qwen",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":72.7,
    "overall_precision":75.4076086957,
    "overall_recall":82.4665676077,
    "overall_f1":78.7792760823,
    "num_predicted_conditions":735,
    "num_GT_conditions":673,
    "num_satisfied_conditions":555,
    "num_unsatisfied_conditions":118,
    "num_false_positive_conditions":181,
    "state_goal_precision":79.347826087,
    "state_goal_recall":95.4248366013,
    "state_goal_f1":86.646884273,
    "state_goal_num_predicted":183,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":146,
    "state_goal_num_unsatisfied":7,
    "state_goal_num_false_positive":38,
    "relation_goal_precision":75.8812615955,
    "relation_goal_recall":78.6538461538,
    "relation_goal_f1":77.2426817753,
    "relation_goal_num_predicted":539,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":409,
    "relation_goal_num_unsatisfied":111,
    "relation_goal_num_false_positive":130,
    "grammatically_valid_num":722,
    "grammatically_valid_rate":98.231292517,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":13,
    "state_hallucination_rate":1.768707483,
    "object_hallucination_num":3,
    "object_hallucination_rate":0.4081632653
  },
  {
    "Model":"Qwen\/Qwen2.5-1.5B-instruct",
    "Model Family":"Qwen2.5",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.5,
    "overall_precision":1.1385199241,
    "overall_recall":0.8915304606,
    "overall_f1":1.0,
    "num_predicted_conditions":527,
    "num_GT_conditions":673,
    "num_satisfied_conditions":6,
    "num_unsatisfied_conditions":667,
    "num_false_positive_conditions":521,
    "state_goal_precision":4.0268456376,
    "state_goal_recall":3.9215686275,
    "state_goal_f1":3.9735099338,
    "state_goal_num_predicted":149,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":6,
    "state_goal_num_unsatisfied":147,
    "state_goal_num_false_positive":143,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":2,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":2,
    "grammatically_valid_num":147,
    "grammatically_valid_rate":27.8937381404,
    "format_error_num":378,
    "format_error_rate":71.7267552182,
    "state_hallucination_num":376,
    "state_hallucination_rate":71.3472485769,
    "object_hallucination_num":384,
    "object_hallucination_rate":72.8652751423
  },
  {
    "Model":"Qwen\/Qwen2.5-14B-instruct",
    "Model Family":"Qwen2.5",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":14.8,
    "overall_precision":59.7660818713,
    "overall_recall":75.9286775632,
    "overall_f1":66.8848167539,
    "num_predicted_conditions":867,
    "num_GT_conditions":673,
    "num_satisfied_conditions":511,
    "num_unsatisfied_conditions":162,
    "num_false_positive_conditions":344,
    "state_goal_precision":46.1538461538,
    "state_goal_recall":86.2745098039,
    "state_goal_f1":60.1366742597,
    "state_goal_num_predicted":285,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":132,
    "state_goal_num_unsatisfied":21,
    "state_goal_num_false_positive":154,
    "relation_goal_precision":70.5772811918,
    "relation_goal_recall":72.8846153846,
    "relation_goal_f1":71.7123935667,
    "relation_goal_num_predicted":550,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":379,
    "relation_goal_num_unsatisfied":141,
    "relation_goal_num_false_positive":158,
    "grammatically_valid_num":835,
    "grammatically_valid_rate":96.30911188,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":32,
    "state_hallucination_rate":3.69088812,
    "object_hallucination_num":3,
    "object_hallucination_rate":0.3460207612
  },
  {
    "Model":"Qwen\/Qwen2.5-32B-instruct",
    "Model Family":"Qwen2.5",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":32.8,
    "overall_precision":71.9178082192,
    "overall_recall":78.0089153046,
    "overall_f1":74.8396293656,
    "num_predicted_conditions":729,
    "num_GT_conditions":673,
    "num_satisfied_conditions":525,
    "num_unsatisfied_conditions":148,
    "num_false_positive_conditions":205,
    "state_goal_precision":69.5,
    "state_goal_recall":90.8496732026,
    "state_goal_f1":78.7535410765,
    "state_goal_num_predicted":199,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":139,
    "state_goal_num_unsatisfied":14,
    "state_goal_num_false_positive":61,
    "relation_goal_precision":74.8062015504,
    "relation_goal_recall":74.2307692308,
    "relation_goal_f1":74.5173745174,
    "relation_goal_num_predicted":516,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":386,
    "relation_goal_num_unsatisfied":134,
    "relation_goal_num_false_positive":130,
    "grammatically_valid_num":715,
    "grammatically_valid_rate":98.0795610425,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":14,
    "state_hallucination_rate":1.9204389575,
    "object_hallucination_num":1,
    "object_hallucination_rate":0.1371742112
  },
  {
    "Model":"Qwen\/Qwen2.5-3B-instruct",
    "Model Family":"Qwen2.5",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":3.1,
    "overall_precision":27.0769230769,
    "overall_recall":39.2273402675,
    "overall_f1":32.0388349515,
    "num_predicted_conditions":977,
    "num_GT_conditions":673,
    "num_satisfied_conditions":264,
    "num_unsatisfied_conditions":409,
    "num_false_positive_conditions":711,
    "state_goal_precision":41.7721518987,
    "state_goal_recall":64.7058823529,
    "state_goal_f1":50.7692307692,
    "state_goal_num_predicted":236,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":99,
    "state_goal_num_unsatisfied":54,
    "state_goal_num_false_positive":138,
    "relation_goal_precision":23.0125523013,
    "relation_goal_recall":31.7307692308,
    "relation_goal_f1":26.6774454325,
    "relation_goal_num_predicted":720,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":165,
    "relation_goal_num_unsatisfied":355,
    "relation_goal_num_false_positive":552,
    "grammatically_valid_num":956,
    "grammatically_valid_rate":97.8505629478,
    "format_error_num":5,
    "format_error_rate":0.5117707267,
    "state_hallucination_num":21,
    "state_hallucination_rate":2.1494370522,
    "object_hallucination_num":60,
    "object_hallucination_rate":6.1412487206
  },
  {
    "Model":"Qwen\/Qwen2.5-72B-instruct",
    "Model Family":"Qwen2.5",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":72.7,
    "overall_precision":64.4943820225,
    "overall_recall":85.2897473997,
    "overall_f1":73.4484964811,
    "num_predicted_conditions":889,
    "num_GT_conditions":673,
    "num_satisfied_conditions":574,
    "num_unsatisfied_conditions":99,
    "num_false_positive_conditions":316,
    "state_goal_precision":38.8888888889,
    "state_goal_recall":96.0784313725,
    "state_goal_f1":55.3672316384,
    "state_goal_num_predicted":377,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":147,
    "state_goal_num_unsatisfied":6,
    "state_goal_num_false_positive":231,
    "relation_goal_precision":83.8899803536,
    "relation_goal_recall":82.1153846154,
    "relation_goal_f1":82.9931972789,
    "relation_goal_num_predicted":509,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":427,
    "relation_goal_num_unsatisfied":93,
    "relation_goal_num_false_positive":82,
    "grammatically_valid_num":886,
    "grammatically_valid_rate":99.6625421822,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":3,
    "state_hallucination_rate":0.3374578178,
    "object_hallucination_num":0,
    "object_hallucination_rate":0.0
  },
  {
    "Model":"Qwen\/Qwen2.5-7B-instruct",
    "Model Family":"Qwen2.5",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":7.6,
    "overall_precision":38.7168141593,
    "overall_recall":52.0059435364,
    "overall_f1":44.3880786303,
    "num_predicted_conditions":901,
    "num_GT_conditions":673,
    "num_satisfied_conditions":350,
    "num_unsatisfied_conditions":323,
    "num_false_positive_conditions":554,
    "state_goal_precision":15.1960784314,
    "state_goal_recall":40.522875817,
    "state_goal_f1":22.1033868093,
    "state_goal_num_predicted":407,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":62,
    "state_goal_num_unsatisfied":91,
    "state_goal_num_false_positive":346,
    "relation_goal_precision":64.4295302013,
    "relation_goal_recall":55.3846153846,
    "relation_goal_f1":59.5656670114,
    "relation_goal_num_predicted":445,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":288,
    "relation_goal_num_unsatisfied":232,
    "relation_goal_num_false_positive":159,
    "grammatically_valid_num":852,
    "grammatically_valid_rate":94.5615982242,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":49,
    "state_hallucination_rate":5.4384017758,
    "object_hallucination_num":6,
    "object_hallucination_rate":0.6659267481
  },
  {
    "Model":"Qwen\/Qwen3-0.6B-Base",
    "Model Family":"Qwen3",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":0.6,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":162,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":162,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":0,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":0,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":0,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":0,
    "grammatically_valid_num":0,
    "grammatically_valid_rate":0.0,
    "format_error_num":162,
    "format_error_rate":100.0,
    "state_hallucination_num":162,
    "state_hallucination_rate":100.0,
    "object_hallucination_num":162,
    "object_hallucination_rate":100.0
  },
  {
    "Model":"Qwen\/Qwen3-1.7B-Base",
    "Model Family":"Qwen3",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.7,
    "overall_precision":4.5871559633,
    "overall_recall":2.9717682021,
    "overall_f1":3.6068530207,
    "num_predicted_conditions":436,
    "num_GT_conditions":673,
    "num_satisfied_conditions":20,
    "num_unsatisfied_conditions":653,
    "num_false_positive_conditions":416,
    "state_goal_precision":2.5906735751,
    "state_goal_recall":3.2679738562,
    "state_goal_f1":2.8901734104,
    "state_goal_num_predicted":193,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":5,
    "state_goal_num_unsatisfied":148,
    "state_goal_num_false_positive":188,
    "relation_goal_precision":11.5384615385,
    "relation_goal_recall":2.8846153846,
    "relation_goal_f1":4.6153846154,
    "relation_goal_num_predicted":130,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":15,
    "relation_goal_num_unsatisfied":505,
    "relation_goal_num_false_positive":115,
    "grammatically_valid_num":323,
    "grammatically_valid_rate":74.0825688073,
    "format_error_num":104,
    "format_error_rate":23.8532110092,
    "state_hallucination_num":113,
    "state_hallucination_rate":25.9174311927,
    "object_hallucination_num":108,
    "object_hallucination_rate":24.7706422018
  },
  {
    "Model":"Qwen\/Qwen3-14B-Base",
    "Model Family":"Qwen3",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":14.8,
    "overall_precision":47.4609375,
    "overall_recall":72.2139673105,
    "overall_f1":57.2775486152,
    "num_predicted_conditions":1034,
    "num_GT_conditions":673,
    "num_satisfied_conditions":486,
    "num_unsatisfied_conditions":187,
    "num_false_positive_conditions":538,
    "state_goal_precision":28.3095723014,
    "state_goal_recall":90.8496732026,
    "state_goal_f1":43.1677018634,
    "state_goal_num_predicted":490,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":139,
    "state_goal_num_unsatisfied":14,
    "state_goal_num_false_positive":352,
    "relation_goal_precision":67.6413255361,
    "relation_goal_recall":66.7307692308,
    "relation_goal_f1":67.1829622459,
    "relation_goal_num_predicted":524,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":347,
    "relation_goal_num_unsatisfied":173,
    "relation_goal_num_false_positive":166,
    "grammatically_valid_num":1014,
    "grammatically_valid_rate":98.0657640232,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":20,
    "state_hallucination_rate":1.9342359768,
    "object_hallucination_num":2,
    "object_hallucination_rate":0.1934235977
  },
  {
    "Model":"Qwen\/Qwen3-4B-Base",
    "Model Family":"Qwen3",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":4.0,
    "overall_precision":30.023364486,
    "overall_recall":38.1872213967,
    "overall_f1":33.6167429693,
    "num_predicted_conditions":859,
    "num_GT_conditions":673,
    "num_satisfied_conditions":257,
    "num_unsatisfied_conditions":416,
    "num_false_positive_conditions":599,
    "state_goal_precision":15.8562367865,
    "state_goal_recall":49.0196078431,
    "state_goal_f1":23.9616613419,
    "state_goal_num_predicted":472,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":75,
    "state_goal_num_unsatisfied":78,
    "state_goal_num_false_positive":398,
    "relation_goal_precision":51.4124293785,
    "relation_goal_recall":35.0,
    "relation_goal_f1":41.647597254,
    "relation_goal_num_predicted":358,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":182,
    "relation_goal_num_unsatisfied":338,
    "relation_goal_num_false_positive":172,
    "grammatically_valid_num":830,
    "grammatically_valid_rate":96.6239813737,
    "format_error_num":28,
    "format_error_rate":3.2596041909,
    "state_hallucination_num":29,
    "state_hallucination_rate":3.3760186263,
    "object_hallucination_num":33,
    "object_hallucination_rate":3.8416763679
  },
  {
    "Model":"Qwen\/Qwen3-8B-Base",
    "Model Family":"Qwen3",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.2,
    "overall_precision":37.5903614458,
    "overall_recall":46.3595839525,
    "overall_f1":41.5169660679,
    "num_predicted_conditions":831,
    "num_GT_conditions":673,
    "num_satisfied_conditions":312,
    "num_unsatisfied_conditions":361,
    "num_false_positive_conditions":518,
    "state_goal_precision":25.6857855362,
    "state_goal_recall":67.3202614379,
    "state_goal_f1":37.1841155235,
    "state_goal_num_predicted":401,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":103,
    "state_goal_num_unsatisfied":50,
    "state_goal_num_false_positive":298,
    "relation_goal_precision":52.9113924051,
    "relation_goal_recall":40.1923076923,
    "relation_goal_f1":45.6830601093,
    "relation_goal_num_predicted":396,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":209,
    "relation_goal_num_unsatisfied":311,
    "relation_goal_num_false_positive":186,
    "grammatically_valid_num":797,
    "grammatically_valid_rate":95.908543923,
    "format_error_num":6,
    "format_error_rate":0.7220216606,
    "state_hallucination_num":34,
    "state_hallucination_rate":4.091456077,
    "object_hallucination_num":9,
    "object_hallucination_rate":1.083032491
  },
  {
    "Model":"bigcode\/starcoder2-15b",
    "Model Family":"starcoder2",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":16.0,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":75,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":75,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":0,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":0,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":0,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":0,
    "grammatically_valid_num":0,
    "grammatically_valid_rate":0.0,
    "format_error_num":75,
    "format_error_rate":100.0,
    "state_hallucination_num":75,
    "state_hallucination_rate":100.0,
    "object_hallucination_num":75,
    "object_hallucination_rate":100.0
  },
  {
    "Model":"bigcode\/starcoder2-3b",
    "Model Family":"starcoder2",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":3.0,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":0,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":0,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":0,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":0,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":0,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":0,
    "grammatically_valid_num":0,
    "grammatically_valid_rate":0.0,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":0,
    "state_hallucination_rate":0.0,
    "object_hallucination_num":0,
    "object_hallucination_rate":0.0
  },
  {
    "Model":"bigcode\/starcoder2-7b",
    "Model Family":"starcoder2",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":7.2,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":0,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":0,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":0,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":0,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":0,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":0,
    "grammatically_valid_num":0,
    "grammatically_valid_rate":0.0,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":0,
    "state_hallucination_rate":0.0,
    "object_hallucination_num":0,
    "object_hallucination_rate":0.0
  },
  {
    "Model":"bigcode\/starcoderbase",
    "Model Family":"starcoder",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":15.5,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":0,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":0,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":0,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":0,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":0,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":0,
    "grammatically_valid_num":0,
    "grammatically_valid_rate":0.0,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":0,
    "state_hallucination_rate":0.0,
    "object_hallucination_num":0,
    "object_hallucination_rate":0.0
  },
  {
    "Model":"bigcode\/starcoderbase-1b",
    "Model Family":"starcoder",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":15.5,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":0,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":0,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":0,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":0,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":0,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":0,
    "grammatically_valid_num":0,
    "grammatically_valid_rate":0.0,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":0,
    "state_hallucination_rate":0.0,
    "object_hallucination_num":0,
    "object_hallucination_rate":0.0
  },
  {
    "Model":"bigcode\/starcoderbase-3b",
    "Model Family":"starcoder",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":15.5,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":0,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":0,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":0,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":0,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":0,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":0,
    "grammatically_valid_num":0,
    "grammatically_valid_rate":0.0,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":0,
    "state_hallucination_rate":0.0,
    "object_hallucination_num":0,
    "object_hallucination_rate":0.0
  },
  {
    "Model":"bigcode\/starcoderbase-7b",
    "Model Family":"starcoder",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":15.5,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":195,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":195,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":0,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":0,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":0,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":0,
    "grammatically_valid_num":0,
    "grammatically_valid_rate":0.0,
    "format_error_num":195,
    "format_error_rate":100.0,
    "state_hallucination_num":195,
    "state_hallucination_rate":100.0,
    "object_hallucination_num":195,
    "object_hallucination_rate":100.0
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Llama-70B",
    "Model Family":"DeepSeek-R1",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":70.6,
    "overall_precision":78.7553648069,
    "overall_recall":54.5319465082,
    "overall_f1":64.4424934153,
    "num_predicted_conditions":465,
    "num_GT_conditions":673,
    "num_satisfied_conditions":367,
    "num_unsatisfied_conditions":306,
    "num_false_positive_conditions":99,
    "state_goal_precision":71.9512195122,
    "state_goal_recall":77.1241830065,
    "state_goal_f1":74.4479495268,
    "state_goal_num_predicted":163,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":118,
    "state_goal_num_unsatisfied":35,
    "state_goal_num_false_positive":46,
    "relation_goal_precision":83.2775919732,
    "relation_goal_recall":47.8846153846,
    "relation_goal_f1":60.8058608059,
    "relation_goal_num_predicted":299,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":249,
    "relation_goal_num_unsatisfied":271,
    "relation_goal_num_false_positive":50,
    "grammatically_valid_num":462,
    "grammatically_valid_rate":99.3548387097,
    "format_error_num":1,
    "format_error_rate":0.2150537634,
    "state_hallucination_num":3,
    "state_hallucination_rate":0.6451612903,
    "object_hallucination_num":6,
    "object_hallucination_rate":1.2903225806
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Llama-8B",
    "Model Family":"DeepSeek-R1",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.0,
    "overall_precision":41.6356877323,
    "overall_recall":16.6419019316,
    "overall_f1":23.7791932059,
    "num_predicted_conditions":269,
    "num_GT_conditions":673,
    "num_satisfied_conditions":112,
    "num_unsatisfied_conditions":561,
    "num_false_positive_conditions":157,
    "state_goal_precision":45.6896551724,
    "state_goal_recall":34.6405228758,
    "state_goal_f1":39.405204461,
    "state_goal_num_predicted":116,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":53,
    "state_goal_num_unsatisfied":100,
    "state_goal_num_false_positive":63,
    "relation_goal_precision":47.9674796748,
    "relation_goal_recall":11.3461538462,
    "relation_goal_f1":18.3514774495,
    "relation_goal_num_predicted":123,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":59,
    "relation_goal_num_unsatisfied":461,
    "relation_goal_num_false_positive":64,
    "grammatically_valid_num":239,
    "grammatically_valid_rate":88.8475836431,
    "format_error_num":9,
    "format_error_rate":3.3457249071,
    "state_hallucination_num":30,
    "state_hallucination_rate":11.1524163569,
    "object_hallucination_num":19,
    "object_hallucination_rate":7.063197026
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-1.5B",
    "Model Family":"DeepSeek-R1",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.8,
    "overall_precision":1.1799410029,
    "overall_recall":0.5943536404,
    "overall_f1":0.790513834,
    "num_predicted_conditions":340,
    "num_GT_conditions":673,
    "num_satisfied_conditions":4,
    "num_unsatisfied_conditions":669,
    "num_false_positive_conditions":335,
    "state_goal_precision":16.6666666667,
    "state_goal_recall":0.6535947712,
    "state_goal_f1":1.2578616352,
    "state_goal_num_predicted":6,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":1,
    "state_goal_num_unsatisfied":152,
    "state_goal_num_false_positive":5,
    "relation_goal_precision":8.1081081081,
    "relation_goal_recall":0.5769230769,
    "relation_goal_f1":1.0771992819,
    "relation_goal_num_predicted":38,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":3,
    "relation_goal_num_unsatisfied":517,
    "relation_goal_num_false_positive":34,
    "grammatically_valid_num":44,
    "grammatically_valid_rate":12.9411764706,
    "format_error_num":295,
    "format_error_rate":86.7647058824,
    "state_hallucination_num":296,
    "state_hallucination_rate":87.0588235294,
    "object_hallucination_num":310,
    "object_hallucination_rate":91.1764705882
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-14B",
    "Model Family":"DeepSeek-R1",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":14.8,
    "overall_precision":52.0066889632,
    "overall_recall":46.2109955423,
    "overall_f1":48.9378442172,
    "num_predicted_conditions":603,
    "num_GT_conditions":673,
    "num_satisfied_conditions":311,
    "num_unsatisfied_conditions":362,
    "num_false_positive_conditions":287,
    "state_goal_precision":42.1052631579,
    "state_goal_recall":67.9738562092,
    "state_goal_f1":52.0,
    "state_goal_num_predicted":246,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":104,
    "state_goal_num_unsatisfied":49,
    "state_goal_num_false_positive":143,
    "relation_goal_precision":64.2857142857,
    "relation_goal_recall":39.8076923077,
    "relation_goal_f1":49.1686460808,
    "relation_goal_num_predicted":328,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":207,
    "relation_goal_num_unsatisfied":313,
    "relation_goal_num_false_positive":115,
    "grammatically_valid_num":574,
    "grammatically_valid_rate":95.1907131012,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":29,
    "state_hallucination_rate":4.8092868988,
    "object_hallucination_num":11,
    "object_hallucination_rate":1.824212272
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-32B",
    "Model Family":"DeepSeek-R1",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":32.8,
    "overall_precision":54.7961630695,
    "overall_recall":67.9049034175,
    "overall_f1":60.6502986065,
    "num_predicted_conditions":835,
    "num_GT_conditions":673,
    "num_satisfied_conditions":457,
    "num_unsatisfied_conditions":216,
    "num_false_positive_conditions":377,
    "state_goal_precision":33.9473684211,
    "state_goal_recall":84.3137254902,
    "state_goal_f1":48.4052532833,
    "state_goal_num_predicted":380,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":129,
    "state_goal_num_unsatisfied":24,
    "state_goal_num_false_positive":251,
    "relation_goal_precision":75.2293577982,
    "relation_goal_recall":63.0769230769,
    "relation_goal_f1":68.6192468619,
    "relation_goal_num_predicted":437,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":328,
    "relation_goal_num_unsatisfied":192,
    "relation_goal_num_false_positive":108,
    "grammatically_valid_num":817,
    "grammatically_valid_rate":97.8443113772,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":18,
    "state_hallucination_rate":2.1556886228,
    "object_hallucination_num":2,
    "object_hallucination_rate":0.2395209581
  },
  {
    "Model":"deepseek-ai\/DeepSeek-R1-Distill-Qwen-7B",
    "Model Family":"DeepSeek-R1",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":7.6,
    "overall_precision":44.9044585987,
    "overall_recall":20.9509658247,
    "overall_f1":28.5714285714,
    "num_predicted_conditions":318,
    "num_GT_conditions":673,
    "num_satisfied_conditions":141,
    "num_unsatisfied_conditions":532,
    "num_false_positive_conditions":173,
    "state_goal_precision":46.9387755102,
    "state_goal_recall":30.0653594771,
    "state_goal_f1":36.6533864542,
    "state_goal_num_predicted":98,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":46,
    "state_goal_num_unsatisfied":107,
    "state_goal_num_false_positive":52,
    "relation_goal_precision":56.8862275449,
    "relation_goal_recall":18.2692307692,
    "relation_goal_f1":27.6564774381,
    "relation_goal_num_predicted":171,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":95,
    "relation_goal_num_unsatisfied":425,
    "relation_goal_num_false_positive":72,
    "grammatically_valid_num":269,
    "grammatically_valid_rate":84.5911949686,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":49,
    "state_hallucination_rate":15.4088050314,
    "object_hallucination_num":13,
    "object_hallucination_rate":4.0880503145
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-1.3b-base",
    "Model Family":"DeepSeek-Coder",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.3,
    "overall_precision":0.4299226139,
    "overall_recall":0.7429420505,
    "overall_f1":0.5446623094,
    "num_predicted_conditions":1163,
    "num_GT_conditions":673,
    "num_satisfied_conditions":5,
    "num_unsatisfied_conditions":668,
    "num_false_positive_conditions":1158,
    "state_goal_precision":12.5,
    "state_goal_recall":3.2679738562,
    "state_goal_f1":5.1813471503,
    "state_goal_num_predicted":40,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":5,
    "state_goal_num_unsatisfied":148,
    "state_goal_num_false_positive":35,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":53,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":53,
    "grammatically_valid_num":93,
    "grammatically_valid_rate":7.9965606191,
    "format_error_num":1069,
    "format_error_rate":91.9174548581,
    "state_hallucination_num":1070,
    "state_hallucination_rate":92.0034393809,
    "object_hallucination_num":1127,
    "object_hallucination_rate":96.9045571797
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-1.3b-instruct",
    "Model Family":"DeepSeek-Coder",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.3,
    "overall_precision":0.8975317876,
    "overall_recall":1.7830609212,
    "overall_f1":1.1940298507,
    "num_predicted_conditions":1337,
    "num_GT_conditions":673,
    "num_satisfied_conditions":12,
    "num_unsatisfied_conditions":661,
    "num_false_positive_conditions":1325,
    "state_goal_precision":2.427184466,
    "state_goal_recall":3.2679738562,
    "state_goal_f1":2.7855153203,
    "state_goal_num_predicted":206,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":5,
    "state_goal_num_unsatisfied":148,
    "state_goal_num_false_positive":201,
    "relation_goal_precision":1.9073569482,
    "relation_goal_recall":1.3461538462,
    "relation_goal_f1":1.5783540023,
    "relation_goal_num_predicted":367,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":7,
    "relation_goal_num_unsatisfied":513,
    "relation_goal_num_false_positive":360,
    "grammatically_valid_num":563,
    "grammatically_valid_rate":42.1091997008,
    "format_error_num":774,
    "format_error_rate":57.8908002992,
    "state_hallucination_num":764,
    "state_hallucination_rate":57.1428571429,
    "object_hallucination_num":854,
    "object_hallucination_rate":63.8743455497
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-33b-base",
    "Model Family":"DeepSeek-Coder",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":33.3,
    "overall_precision":18.2829888712,
    "overall_recall":17.087667162,
    "overall_f1":17.6651305684,
    "num_predicted_conditions":629,
    "num_GT_conditions":673,
    "num_satisfied_conditions":115,
    "num_unsatisfied_conditions":558,
    "num_false_positive_conditions":514,
    "state_goal_precision":3.2467532468,
    "state_goal_recall":6.5359477124,
    "state_goal_f1":4.3383947939,
    "state_goal_num_predicted":308,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":10,
    "state_goal_num_unsatisfied":143,
    "state_goal_num_false_positive":298,
    "relation_goal_precision":37.7697841727,
    "relation_goal_recall":20.1923076923,
    "relation_goal_f1":26.3157894737,
    "relation_goal_num_predicted":278,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":105,
    "relation_goal_num_unsatisfied":415,
    "relation_goal_num_false_positive":173,
    "grammatically_valid_num":586,
    "grammatically_valid_rate":93.1637519873,
    "format_error_num":43,
    "format_error_rate":6.8362480127,
    "state_hallucination_num":43,
    "state_hallucination_rate":6.8362480127,
    "object_hallucination_num":69,
    "object_hallucination_rate":10.9697933227
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-33b-instruct",
    "Model Family":"DeepSeek-Coder",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":33.3,
    "overall_precision":35.2941176471,
    "overall_recall":56.1664190193,
    "overall_f1":43.3486238532,
    "num_predicted_conditions":1071,
    "num_GT_conditions":673,
    "num_satisfied_conditions":378,
    "num_unsatisfied_conditions":295,
    "num_false_positive_conditions":693,
    "state_goal_precision":22.4568138196,
    "state_goal_recall":76.4705882353,
    "state_goal_f1":34.7181008902,
    "state_goal_num_predicted":520,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":117,
    "state_goal_num_unsatisfied":36,
    "state_goal_num_false_positive":404,
    "relation_goal_precision":49.5256166983,
    "relation_goal_recall":50.1923076923,
    "relation_goal_f1":49.8567335244,
    "relation_goal_num_predicted":528,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":261,
    "relation_goal_num_unsatisfied":259,
    "relation_goal_num_false_positive":266,
    "grammatically_valid_num":1048,
    "grammatically_valid_rate":97.8524743231,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":23,
    "state_hallucination_rate":2.1475256769,
    "object_hallucination_num":12,
    "object_hallucination_rate":1.1204481793
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-6.7b-base",
    "Model Family":"DeepSeek-Coder",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":6.7,
    "overall_precision":12.7659574468,
    "overall_recall":14.26448737,
    "overall_f1":13.4736842105,
    "num_predicted_conditions":752,
    "num_GT_conditions":673,
    "num_satisfied_conditions":96,
    "num_unsatisfied_conditions":577,
    "num_false_positive_conditions":656,
    "state_goal_precision":2.9239766082,
    "state_goal_recall":6.5359477124,
    "state_goal_f1":4.0404040404,
    "state_goal_num_predicted":342,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":10,
    "state_goal_num_unsatisfied":143,
    "state_goal_num_false_positive":332,
    "relation_goal_precision":29.5532646048,
    "relation_goal_recall":16.5384615385,
    "relation_goal_f1":21.2083847102,
    "relation_goal_num_predicted":291,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":86,
    "relation_goal_num_unsatisfied":434,
    "relation_goal_num_false_positive":205,
    "grammatically_valid_num":633,
    "grammatically_valid_rate":84.1755319149,
    "format_error_num":119,
    "format_error_rate":15.8244680851,
    "state_hallucination_num":119,
    "state_hallucination_rate":15.8244680851,
    "object_hallucination_num":169,
    "object_hallucination_rate":22.4734042553
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-6.7b-instruct",
    "Model Family":"DeepSeek-Coder",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":6.7,
    "overall_precision":16.3898117386,
    "overall_recall":21.9910846954,
    "overall_f1":18.7817258883,
    "num_predicted_conditions":903,
    "num_GT_conditions":673,
    "num_satisfied_conditions":148,
    "num_unsatisfied_conditions":525,
    "num_false_positive_conditions":755,
    "state_goal_precision":5.9340659341,
    "state_goal_recall":17.6470588235,
    "state_goal_f1":8.8815789474,
    "state_goal_num_predicted":455,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":27,
    "state_goal_num_unsatisfied":126,
    "state_goal_num_false_positive":428,
    "relation_goal_precision":31.3471502591,
    "relation_goal_recall":23.2692307692,
    "relation_goal_f1":26.710816777,
    "relation_goal_num_predicted":386,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":121,
    "relation_goal_num_unsatisfied":399,
    "relation_goal_num_false_positive":265,
    "grammatically_valid_num":841,
    "grammatically_valid_rate":93.1339977852,
    "format_error_num":54,
    "format_error_rate":5.9800664452,
    "state_hallucination_num":62,
    "state_hallucination_rate":6.8660022148,
    "object_hallucination_num":65,
    "object_hallucination_rate":7.1982281285
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-7b-base-v1.5",
    "Model Family":"DeepSeek-Coder",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":6.9,
    "overall_precision":11.5996258185,
    "overall_recall":18.4249628529,
    "overall_f1":14.2365097589,
    "num_predicted_conditions":1070,
    "num_GT_conditions":673,
    "num_satisfied_conditions":124,
    "num_unsatisfied_conditions":549,
    "num_false_positive_conditions":945,
    "state_goal_precision":7.9155672823,
    "state_goal_recall":19.6078431373,
    "state_goal_f1":11.2781954887,
    "state_goal_num_predicted":379,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":30,
    "state_goal_num_unsatisfied":123,
    "state_goal_num_false_positive":349,
    "relation_goal_precision":24.3523316062,
    "relation_goal_recall":18.0769230769,
    "relation_goal_f1":20.7505518764,
    "relation_goal_num_predicted":387,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":94,
    "relation_goal_num_unsatisfied":426,
    "relation_goal_num_false_positive":292,
    "grammatically_valid_num":766,
    "grammatically_valid_rate":71.5887850467,
    "format_error_num":303,
    "format_error_rate":28.3177570093,
    "state_hallucination_num":304,
    "state_hallucination_rate":28.4112149533,
    "object_hallucination_num":363,
    "object_hallucination_rate":33.9252336449
  },
  {
    "Model":"deepseek-ai\/deepseek-coder-7b-instruct-v1.5",
    "Model Family":"DeepSeek-Coder",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":6.9,
    "overall_precision":29.3548387097,
    "overall_recall":40.5646359584,
    "overall_f1":34.0611353712,
    "num_predicted_conditions":948,
    "num_GT_conditions":673,
    "num_satisfied_conditions":273,
    "num_unsatisfied_conditions":400,
    "num_false_positive_conditions":657,
    "state_goal_precision":21.0862619808,
    "state_goal_recall":43.137254902,
    "state_goal_f1":28.3261802575,
    "state_goal_num_predicted":313,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":66,
    "state_goal_num_unsatisfied":87,
    "state_goal_num_false_positive":247,
    "relation_goal_precision":34.9072512648,
    "relation_goal_recall":39.8076923077,
    "relation_goal_f1":37.1967654987,
    "relation_goal_num_predicted":611,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":207,
    "relation_goal_num_unsatisfied":313,
    "relation_goal_num_false_positive":386,
    "grammatically_valid_num":924,
    "grammatically_valid_rate":97.4683544304,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":24,
    "state_hallucination_rate":2.5316455696,
    "object_hallucination_num":28,
    "object_hallucination_rate":2.9535864979
  },
  {
    "Model":"meta-llama\/Llama-3.1-70B",
    "Model Family":"Llama-3",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":70.6,
    "overall_precision":65.8808933002,
    "overall_recall":78.9004457652,
    "overall_f1":71.8052738337,
    "num_predicted_conditions":807,
    "num_GT_conditions":673,
    "num_satisfied_conditions":531,
    "num_unsatisfied_conditions":142,
    "num_false_positive_conditions":275,
    "state_goal_precision":53.8759689922,
    "state_goal_recall":90.8496732026,
    "state_goal_f1":67.6399026764,
    "state_goal_num_predicted":259,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":139,
    "state_goal_num_unsatisfied":14,
    "state_goal_num_false_positive":119,
    "relation_goal_precision":75.2399232246,
    "relation_goal_recall":75.3846153846,
    "relation_goal_f1":75.3121998079,
    "relation_goal_num_predicted":521,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":392,
    "relation_goal_num_unsatisfied":128,
    "relation_goal_num_false_positive":129,
    "grammatically_valid_num":780,
    "grammatically_valid_rate":96.6542750929,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":27,
    "state_hallucination_rate":3.3457249071,
    "object_hallucination_num":4,
    "object_hallucination_rate":0.4956629492
  },
  {
    "Model":"meta-llama\/Llama-3.1-70B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":70.6,
    "overall_precision":65.8808933002,
    "overall_recall":78.9004457652,
    "overall_f1":71.8052738337,
    "num_predicted_conditions":807,
    "num_GT_conditions":673,
    "num_satisfied_conditions":531,
    "num_unsatisfied_conditions":142,
    "num_false_positive_conditions":275,
    "state_goal_precision":53.8759689922,
    "state_goal_recall":90.8496732026,
    "state_goal_f1":67.6399026764,
    "state_goal_num_predicted":259,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":139,
    "state_goal_num_unsatisfied":14,
    "state_goal_num_false_positive":119,
    "relation_goal_precision":75.2399232246,
    "relation_goal_recall":75.3846153846,
    "relation_goal_f1":75.3121998079,
    "relation_goal_num_predicted":521,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":392,
    "relation_goal_num_unsatisfied":128,
    "relation_goal_num_false_positive":129,
    "grammatically_valid_num":780,
    "grammatically_valid_rate":96.6542750929,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":27,
    "state_hallucination_rate":3.3457249071,
    "object_hallucination_num":4,
    "object_hallucination_rate":0.4956629492
  },
  {
    "Model":"meta-llama\/Llama-3.1-8B",
    "Model Family":"Llama-3",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.0,
    "overall_precision":2.9411764706,
    "overall_recall":4.1604754829,
    "overall_f1":3.4461538462,
    "num_predicted_conditions":954,
    "num_GT_conditions":673,
    "num_satisfied_conditions":28,
    "num_unsatisfied_conditions":645,
    "num_false_positive_conditions":924,
    "state_goal_precision":11.1111111111,
    "state_goal_recall":4.5751633987,
    "state_goal_f1":6.4814814815,
    "state_goal_num_predicted":63,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":7,
    "state_goal_num_unsatisfied":146,
    "state_goal_num_false_positive":56,
    "relation_goal_precision":29.5774647887,
    "relation_goal_recall":4.0384615385,
    "relation_goal_f1":7.1065989848,
    "relation_goal_num_predicted":73,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":21,
    "relation_goal_num_unsatisfied":499,
    "relation_goal_num_false_positive":50,
    "grammatically_valid_num":136,
    "grammatically_valid_rate":14.2557651992,
    "format_error_num":808,
    "format_error_rate":84.6960167715,
    "state_hallucination_num":818,
    "state_hallucination_rate":85.7442348008,
    "object_hallucination_num":833,
    "object_hallucination_rate":87.3165618449
  },
  {
    "Model":"meta-llama\/Llama-3.1-8B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":8.0,
    "overall_precision":29.5313881521,
    "overall_recall":49.6285289747,
    "overall_f1":37.0288248337,
    "num_predicted_conditions":1159,
    "num_GT_conditions":673,
    "num_satisfied_conditions":334,
    "num_unsatisfied_conditions":339,
    "num_false_positive_conditions":797,
    "state_goal_precision":24.6520874751,
    "state_goal_recall":81.045751634,
    "state_goal_f1":37.8048780488,
    "state_goal_num_predicted":502,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":124,
    "state_goal_num_unsatisfied":29,
    "state_goal_num_false_positive":379,
    "relation_goal_precision":35.175879397,
    "relation_goal_recall":40.3846153846,
    "relation_goal_f1":37.6007162041,
    "relation_goal_num_predicted":626,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":210,
    "relation_goal_num_unsatisfied":310,
    "relation_goal_num_false_positive":387,
    "grammatically_valid_num":1128,
    "grammatically_valid_rate":97.3252804142,
    "format_error_num":0,
    "format_error_rate":0.0,
    "state_hallucination_num":31,
    "state_hallucination_rate":2.6747195858,
    "object_hallucination_num":19,
    "object_hallucination_rate":1.6393442623
  },
  {
    "Model":"meta-llama\/Llama-3.2-1B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.2,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":158,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":158,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":54,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":54,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":74,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":74,
    "grammatically_valid_num":128,
    "grammatically_valid_rate":81.0126582278,
    "format_error_num":6,
    "format_error_rate":3.7974683544,
    "state_hallucination_num":30,
    "state_hallucination_rate":18.9873417722,
    "object_hallucination_num":39,
    "object_hallucination_rate":24.6835443038
  },
  {
    "Model":"meta-llama\/Llama-3.2-3B",
    "Model Family":"Llama-3",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":3.2,
    "overall_precision":12.0815138282,
    "overall_recall":24.6656760773,
    "overall_f1":16.2188568637,
    "num_predicted_conditions":1395,
    "num_GT_conditions":673,
    "num_satisfied_conditions":166,
    "num_unsatisfied_conditions":507,
    "num_false_positive_conditions":1208,
    "state_goal_precision":5.0874403816,
    "state_goal_recall":20.9150326797,
    "state_goal_f1":8.1841432225,
    "state_goal_num_predicted":630,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":32,
    "state_goal_num_unsatisfied":121,
    "state_goal_num_false_positive":597,
    "relation_goal_precision":20.303030303,
    "relation_goal_recall":25.7692307692,
    "relation_goal_f1":22.7118644068,
    "relation_goal_num_predicted":680,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":134,
    "relation_goal_num_unsatisfied":386,
    "relation_goal_num_false_positive":526,
    "grammatically_valid_num":1310,
    "grammatically_valid_rate":93.9068100358,
    "format_error_num":22,
    "format_error_rate":1.5770609319,
    "state_hallucination_num":85,
    "state_hallucination_rate":6.0931899642,
    "object_hallucination_num":85,
    "object_hallucination_rate":6.0931899642
  },
  {
    "Model":"meta-llama\/Llama-3.2-3B-Instruct",
    "Model Family":"Llama-3",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":3.2,
    "overall_precision":12.0815138282,
    "overall_recall":24.6656760773,
    "overall_f1":16.2188568637,
    "num_predicted_conditions":1395,
    "num_GT_conditions":673,
    "num_satisfied_conditions":166,
    "num_unsatisfied_conditions":507,
    "num_false_positive_conditions":1208,
    "state_goal_precision":5.0874403816,
    "state_goal_recall":20.9150326797,
    "state_goal_f1":8.1841432225,
    "state_goal_num_predicted":630,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":32,
    "state_goal_num_unsatisfied":121,
    "state_goal_num_false_positive":597,
    "relation_goal_precision":20.303030303,
    "relation_goal_recall":25.7692307692,
    "relation_goal_f1":22.7118644068,
    "relation_goal_num_predicted":680,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":134,
    "relation_goal_num_unsatisfied":386,
    "relation_goal_num_false_positive":526,
    "grammatically_valid_num":1310,
    "grammatically_valid_rate":93.9068100358,
    "format_error_num":22,
    "format_error_rate":1.5770609319,
    "state_hallucination_num":85,
    "state_hallucination_rate":6.0931899642,
    "object_hallucination_num":85,
    "object_hallucination_rate":6.0931899642
  },
  {
    "Model":"microsoft\/Phi-3-mini-128k-instruct",
    "Model Family":"phi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":3.8,
    "overall_precision":21.3530655391,
    "overall_recall":30.014858841,
    "overall_f1":24.9536751081,
    "num_predicted_conditions":956,
    "num_GT_conditions":673,
    "num_satisfied_conditions":202,
    "num_unsatisfied_conditions":471,
    "num_false_positive_conditions":744,
    "state_goal_precision":15.2582159624,
    "state_goal_recall":42.4836601307,
    "state_goal_f1":22.4525043178,
    "state_goal_num_predicted":426,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":65,
    "state_goal_num_unsatisfied":88,
    "state_goal_num_false_positive":361,
    "relation_goal_precision":30.5122494432,
    "relation_goal_recall":26.3461538462,
    "relation_goal_f1":28.2765737874,
    "relation_goal_num_predicted":459,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":137,
    "relation_goal_num_unsatisfied":383,
    "relation_goal_num_false_positive":312,
    "grammatically_valid_num":885,
    "grammatically_valid_rate":92.5732217573,
    "format_error_num":32,
    "format_error_rate":3.3472803347,
    "state_hallucination_num":71,
    "state_hallucination_rate":7.4267782427,
    "object_hallucination_num":98,
    "object_hallucination_rate":10.2510460251
  },
  {
    "Model":"microsoft\/Phi-3-mini-4k-instruct",
    "Model Family":"phi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":3.8,
    "overall_precision":14.4694533762,
    "overall_recall":20.059435364,
    "overall_f1":16.8119551681,
    "num_predicted_conditions":942,
    "num_GT_conditions":673,
    "num_satisfied_conditions":135,
    "num_unsatisfied_conditions":538,
    "num_false_positive_conditions":798,
    "state_goal_precision":15.0510204082,
    "state_goal_recall":38.5620915033,
    "state_goal_f1":21.6513761468,
    "state_goal_num_predicted":394,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":59,
    "state_goal_num_unsatisfied":94,
    "state_goal_num_false_positive":333,
    "relation_goal_precision":25.4180602007,
    "relation_goal_recall":14.6153846154,
    "relation_goal_f1":18.5592185592,
    "relation_goal_num_predicted":306,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":76,
    "relation_goal_num_unsatisfied":444,
    "relation_goal_num_false_positive":223,
    "grammatically_valid_num":700,
    "grammatically_valid_rate":74.3099787686,
    "format_error_num":230,
    "format_error_rate":24.4161358811,
    "state_hallucination_num":242,
    "state_hallucination_rate":25.6900212314,
    "object_hallucination_num":265,
    "object_hallucination_rate":28.1316348195
  },
  {
    "Model":"microsoft\/Phi-3.5-mini-instruct",
    "Model Family":"phi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":3.8,
    "overall_precision":17.7802944507,
    "overall_recall":23.3283803863,
    "overall_f1":20.1799485861,
    "num_predicted_conditions":891,
    "num_GT_conditions":673,
    "num_satisfied_conditions":157,
    "num_unsatisfied_conditions":516,
    "num_false_positive_conditions":726,
    "state_goal_precision":5.8295964126,
    "state_goal_recall":8.4967320261,
    "state_goal_f1":6.914893617,
    "state_goal_num_predicted":223,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":13,
    "state_goal_num_unsatisfied":140,
    "state_goal_num_false_positive":210,
    "relation_goal_precision":39.0243902439,
    "relation_goal_recall":27.6923076923,
    "relation_goal_f1":32.3959505062,
    "relation_goal_num_predicted":377,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":144,
    "relation_goal_num_unsatisfied":376,
    "relation_goal_num_false_positive":225,
    "grammatically_valid_num":600,
    "grammatically_valid_rate":67.3400673401,
    "format_error_num":236,
    "format_error_rate":26.4870931538,
    "state_hallucination_num":291,
    "state_hallucination_rate":32.6599326599,
    "object_hallucination_num":264,
    "object_hallucination_rate":29.6296296296
  },
  {
    "Model":"microsoft\/phi-1",
    "Model Family":"phi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.4,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":26,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":26,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":0,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":0,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":0,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":0,
    "grammatically_valid_num":0,
    "grammatically_valid_rate":0.0,
    "format_error_num":26,
    "format_error_rate":100.0,
    "state_hallucination_num":26,
    "state_hallucination_rate":100.0,
    "object_hallucination_num":26,
    "object_hallucination_rate":100.0
  },
  {
    "Model":"microsoft\/phi-1_5",
    "Model Family":"phi",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":1.4,
    "overall_precision":0.0,
    "overall_recall":0.0,
    "overall_f1":0.0,
    "num_predicted_conditions":277,
    "num_GT_conditions":673,
    "num_satisfied_conditions":0,
    "num_unsatisfied_conditions":673,
    "num_false_positive_conditions":277,
    "state_goal_precision":0.0,
    "state_goal_recall":0.0,
    "state_goal_f1":0.0,
    "state_goal_num_predicted":0,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":0,
    "state_goal_num_unsatisfied":153,
    "state_goal_num_false_positive":0,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":0,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":0,
    "grammatically_valid_num":0,
    "grammatically_valid_rate":0.0,
    "format_error_num":277,
    "format_error_rate":100.0,
    "state_hallucination_num":277,
    "state_hallucination_rate":100.0,
    "object_hallucination_num":277,
    "object_hallucination_rate":100.0
  },
  {
    "Model":"tiiuae\/Falcon3-10B-Base",
    "Model Family":"falcon",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":10.3,
    "overall_precision":29.107373868,
    "overall_recall":33.4323922734,
    "overall_f1":31.1203319502,
    "num_predicted_conditions":781,
    "num_GT_conditions":673,
    "num_satisfied_conditions":225,
    "num_unsatisfied_conditions":448,
    "num_false_positive_conditions":548,
    "state_goal_precision":30.9352517986,
    "state_goal_recall":56.2091503268,
    "state_goal_f1":39.9071925754,
    "state_goal_num_predicted":281,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":86,
    "state_goal_num_unsatisfied":67,
    "state_goal_num_false_positive":192,
    "relation_goal_precision":47.1186440678,
    "relation_goal_recall":26.7307692308,
    "relation_goal_f1":34.1104294479,
    "relation_goal_num_predicted":300,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":139,
    "relation_goal_num_unsatisfied":381,
    "relation_goal_num_false_positive":156,
    "grammatically_valid_num":581,
    "grammatically_valid_rate":74.3918053777,
    "format_error_num":198,
    "format_error_rate":25.3521126761,
    "state_hallucination_num":200,
    "state_hallucination_rate":25.6081946223,
    "object_hallucination_num":202,
    "object_hallucination_rate":25.8642765685
  },
  {
    "Model":"tiiuae\/Falcon3-7B-Base",
    "Model Family":"falcon",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":7.5,
    "overall_precision":2.1822849807,
    "overall_recall":7.5780089153,
    "overall_f1":3.3887043189,
    "num_predicted_conditions":2338,
    "num_GT_conditions":673,
    "num_satisfied_conditions":51,
    "num_unsatisfied_conditions":622,
    "num_false_positive_conditions":2286,
    "state_goal_precision":9.219858156,
    "state_goal_recall":8.4967320261,
    "state_goal_f1":8.843537415,
    "state_goal_num_predicted":141,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":13,
    "state_goal_num_unsatisfied":140,
    "state_goal_num_false_positive":128,
    "relation_goal_precision":30.8943089431,
    "relation_goal_recall":7.3076923077,
    "relation_goal_f1":11.8195956454,
    "relation_goal_num_predicted":124,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":38,
    "relation_goal_num_unsatisfied":482,
    "relation_goal_num_false_positive":85,
    "grammatically_valid_num":265,
    "grammatically_valid_rate":11.3344739093,
    "format_error_num":2068,
    "format_error_rate":88.4516680924,
    "state_hallucination_num":2073,
    "state_hallucination_rate":88.6655260907,
    "object_hallucination_num":2083,
    "object_hallucination_rate":89.0932420873
  },
  {
    "Model":"tiiuae\/falcon-11B",
    "Model Family":"falcon",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":11.1,
    "overall_precision":2.4390243902,
    "overall_recall":4.309063893,
    "overall_f1":3.1149301826,
    "num_predicted_conditions":1189,
    "num_GT_conditions":673,
    "num_satisfied_conditions":29,
    "num_unsatisfied_conditions":644,
    "num_false_positive_conditions":1160,
    "state_goal_precision":6.5934065934,
    "state_goal_recall":3.9215686275,
    "state_goal_f1":4.9180327869,
    "state_goal_num_predicted":91,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":6,
    "state_goal_num_unsatisfied":147,
    "state_goal_num_false_positive":85,
    "relation_goal_precision":21.1009174312,
    "relation_goal_recall":4.4230769231,
    "relation_goal_f1":7.3131955485,
    "relation_goal_num_predicted":109,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":23,
    "relation_goal_num_unsatisfied":497,
    "relation_goal_num_false_positive":86,
    "grammatically_valid_num":200,
    "grammatically_valid_rate":16.8208578638,
    "format_error_num":978,
    "format_error_rate":82.2539949537,
    "state_hallucination_num":989,
    "state_hallucination_rate":83.1791421362,
    "object_hallucination_num":999,
    "object_hallucination_rate":84.0201850294
  },
  {
    "Model":"tiiuae\/falcon-40b",
    "Model Family":"falcon",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":41.8,
    "overall_precision":0.7984031936,
    "overall_recall":1.1887072808,
    "overall_f1":0.9552238806,
    "num_predicted_conditions":1006,
    "num_GT_conditions":673,
    "num_satisfied_conditions":8,
    "num_unsatisfied_conditions":665,
    "num_false_positive_conditions":994,
    "state_goal_precision":3.1746031746,
    "state_goal_recall":2.614379085,
    "state_goal_f1":2.8673835125,
    "state_goal_num_predicted":126,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":4,
    "state_goal_num_unsatisfied":149,
    "state_goal_num_false_positive":122,
    "relation_goal_precision":2.9850746269,
    "relation_goal_recall":0.7692307692,
    "relation_goal_f1":1.2232415902,
    "relation_goal_num_predicted":138,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":4,
    "relation_goal_num_unsatisfied":516,
    "relation_goal_num_false_positive":130,
    "grammatically_valid_num":264,
    "grammatically_valid_rate":26.2425447316,
    "format_error_num":731,
    "format_error_rate":72.6640159046,
    "state_hallucination_num":742,
    "state_hallucination_rate":73.7574552684,
    "object_hallucination_num":816,
    "object_hallucination_rate":81.1133200795
  },
  {
    "Model":"tiiuae\/falcon-7b",
    "Model Family":"falcon",
    "dataset":"behavior",
    "eval_type":"goal_interpretation",
    "Model Size (B)":7.2,
    "overall_precision":0.1261829653,
    "overall_recall":0.2971768202,
    "overall_f1":0.1771479185,
    "num_predicted_conditions":1586,
    "num_GT_conditions":673,
    "num_satisfied_conditions":2,
    "num_unsatisfied_conditions":671,
    "num_false_positive_conditions":1583,
    "state_goal_precision":11.1111111111,
    "state_goal_recall":1.3071895425,
    "state_goal_f1":2.3391812865,
    "state_goal_num_predicted":19,
    "state_goal_num_GT":153,
    "state_goal_num_satisfied":2,
    "state_goal_num_unsatisfied":151,
    "state_goal_num_false_positive":16,
    "relation_goal_precision":0.0,
    "relation_goal_recall":0.0,
    "relation_goal_f1":0.0,
    "relation_goal_num_predicted":9,
    "relation_goal_num_GT":520,
    "relation_goal_num_satisfied":0,
    "relation_goal_num_unsatisfied":520,
    "relation_goal_num_false_positive":9,
    "grammatically_valid_num":28,
    "grammatically_valid_rate":1.7654476671,
    "format_error_num":1558,
    "format_error_rate":98.2345523329,
    "state_hallucination_num":1558,
    "state_hallucination_rate":98.2345523329,
    "object_hallucination_num":1574,
    "object_hallucination_rate":99.2433795712
  }
]