Model,Model Family,dataset,eval_type,Model Size (B),overall_precision,overall_recall,overall_f1,num_predicted_conditions,num_GT_conditions,num_satisfied_conditions,num_unsatisfied_conditions,num_false_positive_conditions,state_goal_precision,state_goal_recall,state_goal_f1,state_goal_num_predicted,state_goal_num_GT,state_goal_num_satisfied,state_goal_num_unsatisfied,state_goal_num_false_positive,relation_goal_precision,relation_goal_recall,relation_goal_f1,relation_goal_num_predicted,relation_goal_num_GT,relation_goal_num_satisfied,relation_goal_num_unsatisfied,relation_goal_num_false_positive,grammatically_valid_num,grammatically_valid_rate,format_error_num,format_error_rate,state_hallucination_num,state_hallucination_rate,object_hallucination_num,object_hallucination_rate
01-ai/Yi-1.5-34B,Yi,behavior,goal_interpretation,34.4,35.0597609561753,39.22734026745914,37.02664796633942,755,673,264,409,489,28.443113772455092,62.091503267973856,39.01437371663244,333,153,95,58,239,41.83168316831683,32.5,36.580086580086586,407,520,169,351,235,740,98.01324503311258,0,0.0,15,1.9867549668874174,15,1.9867549668874174
01-ai/Yi-1.5-34B-Chat,Yi,behavior,goal_interpretation,34.4,42.97108673978066,64.04160475482912,51.43198090692124,1011,673,431,242,572,25.657894736842106,76.47058823529412,38.423645320197046,455,153,117,36,339,59.46969696969697,60.38461538461538,59.923664122137396,537,520,314,206,214,992,98.12067260138477,0,0.0,19,1.8793273986152326,13,1.2858555885262115
01-ai/Yi-1.5-6B,Yi,behavior,goal_interpretation,6.1,15.893385982230997,23.922734026745914,19.098457888493474,1016,673,161,512,852,9.7799511002445,26.143790849673206,14.23487544483986,409,153,40,113,369,24.29718875502008,23.26923076923077,23.772102161100197,501,520,121,399,377,910,89.56692913385827,16,1.574803149606299,106,10.433070866141732,44,4.330708661417323
01-ai/Yi-1.5-6B-Chat,Yi,behavior,goal_interpretation,6.1,15.893385982230997,23.922734026745914,19.098457888493474,1016,673,161,512,852,9.7799511002445,26.143790849673206,14.23487544483986,409,153,40,113,369,24.29718875502008,23.26923076923077,23.772102161100197,501,520,121,399,377,910,89.56692913385827,16,1.574803149606299,106,10.433070866141732,44,4.330708661417323
01-ai/Yi-1.5-9B,Yi,behavior,goal_interpretation,8.8,15.06172839506173,18.12778603268945,16.453135536075525,817,673,122,551,688,17.1875,21.568627450980394,19.130434782608695,192,153,33,120,159,35.177865612648226,17.115384615384617,23.027166882276845,260,520,89,431,164,452,55.32435740514076,359,43.94124847001224,365,44.67564259485924,365,44.67564259485924
01-ai/Yi-Coder-1.5B,Yi,behavior,goal_interpretation,1.5,1.4420062695924765,3.4175334323922733,2.0282186948853616,1597,673,23,650,1572,5.0,3.2679738562091507,3.9525691699604746,100,153,5,148,95,9.62566844919786,3.4615384615384617,5.091937765205093,189,520,18,502,169,289,18.09643080776456,1306,81.77833437695679,1308,81.90356919223544,1341,83.96994364433313
01-ai/Yi-Coder-1.5B-Chat,Yi,behavior,goal_interpretation,1.5,0.0,0.0,0.0,1115,673,0,673,1115,0.0,0.0,0.0,3,153,0,153,3,0.0,0.0,0.0,5,520,0,520,5,8,0.7174887892376681,1100,98.65470852017937,1107,99.28251121076234,1103,98.9237668161435
01-ai/Yi-Coder-9B,Yi,behavior,goal_interpretation,8.8,35.32258064516129,32.5408618127786,33.874709976798144,619,673,219,454,401,35.12396694214876,55.55555555555556,43.0379746835443,241,153,85,68,157,37.640449438202246,25.769230769230766,30.59360730593607,356,520,134,386,222,597,96.4458804523425,7,1.1308562197092082,22,3.5541195476575123,25,4.038772213247173
01-ai/Yi-Coder-9B-Chat,Yi,behavior,goal_interpretation,8.8,26.89335394126739,51.708766716196145,35.383833248601945,1293,673,348,325,946,20.68230277185501,63.39869281045751,31.189710610932476,468,153,97,56,372,57.30593607305936,48.26923076923077,52.40083507306888,438,520,251,269,187,906,70.06960556844548,359,27.764887857695282,387,29.930394431554525,368,28.46094354215004
Qwen/Qwen-72B-Chat,Qwen,behavior,goal_interpretation,72.3,40.67164179104478,48.588410104011885,44.27894380501016,803,673,327,346,477,35.714285714285715,78.43137254901961,49.079754601227,335,153,120,33,216,48.36448598130841,39.80769230769231,43.67088607594936,428,520,207,313,221,763,95.01867995018681,0,0.0,40,4.9813200498132,2,0.24906600249066002
Qwen/Qwen-7B-Chat,Qwen,behavior,goal_interpretation,7.7,0.0,0.0,0.0,738,673,0,673,738,0.0,0.0,0.0,20,153,0,153,20,0.0,0.0,0.0,3,520,0,520,3,20,2.710027100271003,718,97.289972899729,715,96.88346883468834,722,97.8319783197832
Qwen/Qwen1.5-1.8B-Chat,Qwen1.5,behavior,goal_interpretation,1.8,0.0,0.0,0.0,2461,673,0,673,2461,0.0,0.0,0.0,8,153,0,153,8,0.0,0.0,0.0,1,520,0,520,1,9,0.3657049979683055,2449,99.51239333604227,2452,99.63429500203169,2458,99.87809833401057
Qwen/Qwen1.5-14B-Chat,Qwen1.5,behavior,goal_interpretation,14.2,9.523809523809524,0.2971768202080238,0.5763688760806917,21,673,2,671,19,66.66666666666666,1.3071895424836601,2.564102564102564,3,153,2,151,1,0.0,0.0,0.0,1,520,0,520,1,4,19.047619047619047,17,80.95238095238095,17,80.95238095238095,16,76.19047619047619
Qwen/Qwen1.5-32B-Chat,Qwen1.5,behavior,goal_interpretation,32.5,36.95198329853862,26.300148588410106,30.729166666666668,482,673,177,496,302,44.680851063829785,27.450980392156865,34.008097165991906,94,153,42,111,52,84.90566037735849,25.961538461538463,39.764359351988226,162,520,135,385,24,256,53.11203319502075,156,32.365145228215766,226,46.88796680497925,160,33.19502074688796
Qwen/Qwen1.5-4B-Chat,Qwen1.5,behavior,goal_interpretation,4.0,8.313253012048193,10.25260029717682,9.181636726546905,830,673,69,604,761,13.061224489795919,20.915032679738562,16.08040201005025,245,153,32,121,213,10.850439882697946,7.115384615384615,8.59465737514518,341,520,37,483,304,586,70.60240963855422,150,18.072289156626507,244,29.397590361445786,272,32.7710843373494
Qwen/Qwen1.5-72B-Chat,Qwen1.5,behavior,goal_interpretation,72.3,41.73913043478261,57.05794947994056,48.21092278719397,920,673,384,289,536,29.145728643216078,75.81699346405229,42.10526315789474,398,153,116,37,282,56.06694560669456,51.53846153846153,53.707414829659314,478,520,268,252,210,876,95.21739130434783,0,0.0,44,4.782608695652174,11,1.1956521739130435
Qwen/Qwen1.5-7B-Chat,Qwen1.5,behavior,goal_interpretation,7.7,2.094972067039106,4.457652303120357,2.850356294536817,1432,673,30,643,1402,5.88235294117647,3.2679738562091507,4.201680672268907,85,153,5,148,80,37.878787878787875,4.807692307692308,8.532423208191126,66,520,25,495,41,151,10.544692737430168,1275,89.03631284916202,1281,89.45530726256983,1284,89.66480446927375
Qwen/Qwen2-1.5B,Qwen,behavior,goal_interpretation,1.5,1.627906976744186,2.080237741456166,1.8264840182648396,864,673,14,659,846,1.910828025477707,1.9607843137254901,1.935483870967742,157,153,3,150,154,4.029304029304029,2.1153846153846154,2.7742749054224465,277,520,11,509,262,407,47.10648148148148,457,52.893518518518526,430,49.76851851851852,578,66.89814814814815
Qwen/Qwen2-1.5B-Instruct,Qwen,behavior,goal_interpretation,1.5,7.2631578947368425,10.25260029717682,8.502772643253236,956,673,69,604,881,6.159420289855073,11.11111111111111,7.925407925407926,278,153,17,136,259,8.04953560371517,10.0,8.919382504288164,650,520,52,468,594,910,95.18828451882845,28,2.928870292887029,28,2.928870292887029,54,5.648535564853557
Qwen/Qwen2-72B,Qwen,behavior,goal_interpretation,72.7,60.82949308755761,78.45468053491828,68.52693056456846,867,673,528,145,340,36.3905325443787,80.3921568627451,50.10183299389003,337,153,123,30,215,77.14285714285715,77.88461538461539,77.51196172248804,525,520,405,115,120,862,99.4232987312572,0,0.0,5,0.5767012687427913,1,0.11534025374855825
Qwen/Qwen2-72B-Instruct,Qwen,behavior,goal_interpretation,72.7,75.40760869565217,82.46656760772659,78.77927608232788,735,673,555,118,181,79.34782608695652,95.42483660130719,86.64688427299704,183,153,146,7,38,75.88126159554731,78.65384615384615,77.24268177525968,539,520,409,111,130,722,98.2312925170068,0,0.0,13,1.7687074829931975,3,0.40816326530612246
Qwen/Qwen2.5-1.5B-instruct,Qwen2.5,behavior,goal_interpretation,1.5,1.1385199240986716,0.8915304606240713,0.9999999999999999,527,673,6,667,521,4.026845637583892,3.9215686274509802,3.9735099337748347,149,153,6,147,143,0.0,0.0,0.0,2,520,0,520,2,147,27.893738140417458,378,71.72675521821633,376,71.34724857685009,384,72.86527514231499
Qwen/Qwen2.5-14B-instruct,Qwen2.5,behavior,goal_interpretation,14.8,59.76608187134504,75.92867756315007,66.88481675392671,867,673,511,162,344,46.15384615384615,86.27450980392157,60.13667425968109,285,153,132,21,154,70.57728119180634,72.88461538461539,71.71239356669821,550,520,379,141,158,835,96.30911188004615,0,0.0,32,3.690888119953864,3,0.34602076124567477
Qwen/Qwen2.5-32B-instruct,Qwen2.5,behavior,goal_interpretation,32.8,71.91780821917808,78.00891530460625,74.83962936564505,729,673,525,148,205,69.5,90.84967320261438,78.75354107648724,199,153,139,14,61,74.8062015503876,74.23076923076923,74.51737451737452,516,520,386,134,130,715,98.079561042524,0,0.0,14,1.9204389574759946,1,0.1371742112482853
Qwen/Qwen2.5-3B-instruct,Qwen2.5,behavior,goal_interpretation,3.1,27.076923076923077,39.22734026745914,32.03883495145631,977,673,264,409,711,41.77215189873418,64.70588235294117,50.76923076923078,236,153,99,54,138,23.01255230125523,31.73076923076923,26.677445432497976,720,520,165,355,552,956,97.85056294779939,5,0.511770726714432,21,2.1494370522006143,60,6.1412487205731825
Qwen/Qwen2.5-72B-instruct,Qwen2.5,behavior,goal_interpretation,72.7,64.49438202247191,85.28974739970282,73.44849648112603,889,673,574,99,316,38.88888888888889,96.07843137254902,55.367231638418076,377,153,147,6,231,83.88998035363457,82.11538461538461,82.99319727891155,509,520,427,93,82,886,99.66254218222723,0,0.0,3,0.3374578177727784,0,0.0
Qwen/Qwen2.5-7B-instruct,Qwen2.5,behavior,goal_interpretation,7.6,38.716814159292035,52.00594353640417,44.388078630310716,901,673,350,323,554,15.196078431372548,40.52287581699346,22.10338680926916,407,153,62,91,346,64.42953020134227,55.38461538461539,59.56566701137539,445,520,288,232,159,852,94.56159822419534,0,0.0,49,5.438401775804662,6,0.6659267480577136
Qwen/Qwen3-0.6B-Base,Qwen3,behavior,goal_interpretation,0.6,0.0,0.0,0.0,162,673,0,673,162,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,162,100.0,162,100.0,162,100.0
Qwen/Qwen3-1.7B-Base,Qwen3,behavior,goal_interpretation,1.7,4.587155963302752,2.9717682020802374,3.606853020739405,436,673,20,653,416,2.5906735751295336,3.2679738562091507,2.8901734104046244,193,153,5,148,188,11.538461538461538,2.8846153846153846,4.615384615384616,130,520,15,505,115,323,74.08256880733946,104,23.853211009174313,113,25.91743119266055,108,24.770642201834864
Qwen/Qwen3-14B-Base,Qwen3,behavior,goal_interpretation,14.8,47.4609375,72.21396731054978,57.2775486152033,1034,673,486,187,538,28.30957230142566,90.84967320261438,43.16770186335404,490,153,139,14,352,67.64132553606238,66.73076923076923,67.18296224588578,524,520,347,173,166,1014,98.06576402321083,0,0.0,20,1.9342359767891684,2,0.19342359767891684
Qwen/Qwen3-4B-Base,Qwen3,behavior,goal_interpretation,4.0,30.023364485981308,38.187221396731054,33.616742969260955,859,673,257,416,599,15.856236786469344,49.01960784313725,23.961661341853034,472,153,75,78,398,51.41242937853108,35.0,41.64759725400458,358,520,182,338,172,830,96.62398137369034,28,3.259604190919674,29,3.3760186263096625,33,3.841676367869616
Qwen/Qwen3-8B-Base,Qwen3,behavior,goal_interpretation,8.2,37.59036144578313,46.35958395245171,41.516966067864274,831,673,312,361,518,25.6857855361596,67.3202614379085,37.1841155234657,401,153,103,50,298,52.911392405063296,40.19230769230769,45.68306010928962,396,520,209,311,186,797,95.90854392298436,6,0.7220216606498195,34,4.0914560770156445,9,1.083032490974729
bigcode/starcoder2-15b,starcoder2,behavior,goal_interpretation,16.0,0.0,0.0,0.0,75,673,0,673,75,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,75,100.0,75,100.0,75,100.0
bigcode/starcoder2-3b,starcoder2,behavior,goal_interpretation,3.0,0.0,0.0,0.0,0,673,0,673,0,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,0,0.0,0,0.0,0,0.0
bigcode/starcoder2-7b,starcoder2,behavior,goal_interpretation,7.2,0.0,0.0,0.0,0,673,0,673,0,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,0,0.0,0,0.0,0,0.0
bigcode/starcoderbase,starcoder,behavior,goal_interpretation,15.5,0.0,0.0,0.0,0,673,0,673,0,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,0,0.0,0,0.0,0,0.0
bigcode/starcoderbase-1b,starcoder,behavior,goal_interpretation,15.5,0.0,0.0,0.0,0,673,0,673,0,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,0,0.0,0,0.0,0,0.0
bigcode/starcoderbase-3b,starcoder,behavior,goal_interpretation,15.5,0.0,0.0,0.0,0,673,0,673,0,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,0,0.0,0,0.0,0,0.0
bigcode/starcoderbase-7b,starcoder,behavior,goal_interpretation,15.5,0.0,0.0,0.0,195,673,0,673,195,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,195,100.0,195,100.0,195,100.0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,DeepSeek-R1,behavior,goal_interpretation,70.6,78.75536480686695,54.53194650817236,64.44249341527654,465,673,367,306,99,71.95121951219512,77.12418300653596,74.44794952681387,163,153,118,35,46,83.27759197324414,47.88461538461539,60.80586080586081,299,520,249,271,50,462,99.35483870967742,1,0.21505376344086022,3,0.6451612903225806,6,1.2903225806451613
deepseek-ai/DeepSeek-R1-Distill-Llama-8B,DeepSeek-R1,behavior,goal_interpretation,8.0,41.63568773234201,16.64190193164933,23.7791932059448,269,673,112,561,157,45.689655172413794,34.64052287581699,39.40520446096654,116,153,53,100,63,47.96747967479675,11.346153846153847,18.35147744945568,123,520,59,461,64,239,88.84758364312268,9,3.3457249070631967,30,11.152416356877323,19,7.063197026022305
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,DeepSeek-R1,behavior,goal_interpretation,1.8,1.1799410029498525,0.5943536404160475,0.7905138339920948,340,673,4,669,335,16.666666666666664,0.6535947712418301,1.2578616352201257,6,153,1,152,5,8.108108108108109,0.576923076923077,1.0771992818671452,38,520,3,517,34,44,12.941176470588237,295,86.76470588235294,296,87.05882352941177,310,91.17647058823529
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,DeepSeek-R1,behavior,goal_interpretation,14.8,52.0066889632107,46.210995542347696,48.93784421715185,603,673,311,362,287,42.10526315789473,67.97385620915033,52.0,246,153,104,49,143,64.28571428571429,39.80769230769231,49.168646080760084,328,520,207,313,115,574,95.19071310116087,0,0.0,29,4.809286898839138,11,1.8242122719734661
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,DeepSeek-R1,behavior,goal_interpretation,32.8,54.79616306954437,67.90490341753343,60.65029860650299,835,673,457,216,377,33.94736842105263,84.31372549019608,48.405253283302066,380,153,129,24,251,75.22935779816514,63.07692307692307,68.6192468619247,437,520,328,192,108,817,97.8443113772455,0,0.0,18,2.155688622754491,2,0.23952095808383234
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,DeepSeek-R1,behavior,goal_interpretation,7.6,44.904458598726116,20.950965824665676,28.57142857142857,318,673,141,532,173,46.93877551020408,30.065359477124183,36.65338645418326,98,153,46,107,52,56.886227544910184,18.269230769230766,27.65647743813682,171,520,95,425,72,269,84.59119496855347,0,0.0,49,15.40880503144654,13,4.088050314465408
deepseek-ai/deepseek-coder-1.3b-base,DeepSeek-Coder,behavior,goal_interpretation,1.3,0.4299226139294927,0.7429420505200593,0.5446623093681917,1163,673,5,668,1158,12.5,3.2679738562091507,5.181347150259068,40,153,5,148,35,0.0,0.0,0.0,53,520,0,520,53,93,7.996560619088563,1069,91.91745485812554,1070,92.00343938091143,1127,96.90455717970765
deepseek-ai/deepseek-coder-1.3b-instruct,DeepSeek-Coder,behavior,goal_interpretation,1.3,0.8975317875841435,1.7830609212481425,1.1940298507462686,1337,673,12,661,1325,2.4271844660194173,3.2679738562091507,2.785515320334262,206,153,5,148,201,1.9073569482288828,1.3461538461538463,1.5783540022547917,367,520,7,513,360,563,42.109199700822735,774,57.89080029917726,764,57.14285714285714,854,63.87434554973822
deepseek-ai/deepseek-coder-33b-base,DeepSeek-Coder,behavior,goal_interpretation,33.3,18.282988871224166,17.087667161961367,17.665130568356375,629,673,115,558,514,3.2467532467532463,6.535947712418301,4.338394793926247,308,153,10,143,298,37.76978417266187,20.192307692307693,26.31578947368421,278,520,105,415,173,586,93.1637519872814,43,6.836248012718602,43,6.836248012718602,69,10.969793322734498
deepseek-ai/deepseek-coder-33b-instruct,DeepSeek-Coder,behavior,goal_interpretation,33.3,35.294117647058826,56.16641901931649,43.34862385321101,1071,673,378,295,693,22.456813819577732,76.47058823529412,34.71810089020771,520,153,117,36,404,49.52561669829222,50.19230769230769,49.8567335243553,528,520,261,259,266,1048,97.85247432306255,0,0.0,23,2.1475256769374416,12,1.1204481792717087
deepseek-ai/deepseek-coder-6.7b-base,DeepSeek-Coder,behavior,goal_interpretation,6.7,12.76595744680851,14.26448736998514,13.473684210526313,752,673,96,577,656,2.923976608187134,6.535947712418301,4.04040404040404,342,153,10,143,332,29.553264604810998,16.538461538461537,21.208384710234277,291,520,86,434,205,633,84.17553191489363,119,15.824468085106384,119,15.824468085106384,169,22.47340425531915
deepseek-ai/deepseek-coder-6.7b-instruct,DeepSeek-Coder,behavior,goal_interpretation,6.7,16.389811738648948,21.99108469539376,18.781725888324875,903,673,148,525,755,5.934065934065933,17.647058823529413,8.881578947368423,455,153,27,126,428,31.34715025906736,23.26923076923077,26.71081677704194,386,520,121,399,265,841,93.13399778516057,54,5.980066445182724,62,6.866002214839424,65,7.198228128460686
deepseek-ai/deepseek-coder-7b-base-v1.5,DeepSeek-Coder,behavior,goal_interpretation,6.9,11.599625818521982,18.424962852897476,14.236509758897817,1070,673,124,549,945,7.9155672823219,19.607843137254903,11.278195488721806,379,153,30,123,349,24.352331606217618,18.076923076923077,20.75055187637969,387,520,94,426,292,766,71.58878504672897,303,28.317757009345794,304,28.41121495327103,363,33.925233644859816
deepseek-ai/deepseek-coder-7b-instruct-v1.5,DeepSeek-Coder,behavior,goal_interpretation,6.9,29.354838709677416,40.56463595839524,34.061135371179034,948,673,273,400,657,21.08626198083067,43.13725490196079,28.32618025751073,313,153,66,87,247,34.90725126475548,39.80769230769231,37.19676549865229,611,520,207,313,386,924,97.46835443037975,0,0.0,24,2.5316455696202533,28,2.9535864978902953
meta-llama/Llama-3.1-70B,Llama-3,behavior,goal_interpretation,70.6,65.88089330024815,78.90044576523032,71.80527383367141,807,673,531,142,275,53.875968992248055,90.84967320261438,67.63990267639902,259,153,139,14,119,75.23992322456814,75.38461538461539,75.31219980787704,521,520,392,128,129,780,96.6542750929368,0,0.0,27,3.3457249070631967,4,0.49566294919454773
meta-llama/Llama-3.1-70B-Instruct,Llama-3,behavior,goal_interpretation,70.6,65.88089330024815,78.90044576523032,71.80527383367141,807,673,531,142,275,53.875968992248055,90.84967320261438,67.63990267639902,259,153,139,14,119,75.23992322456814,75.38461538461539,75.31219980787704,521,520,392,128,129,780,96.6542750929368,0,0.0,27,3.3457249070631967,4,0.49566294919454773
meta-llama/Llama-3.1-8B,Llama-3,behavior,goal_interpretation,8.0,2.941176470588235,4.160475482912332,3.446153846153846,954,673,28,645,924,11.11111111111111,4.57516339869281,6.481481481481481,63,153,7,146,56,29.577464788732392,4.038461538461538,7.106598984771573,73,520,21,499,50,136,14.255765199161424,808,84.69601677148847,818,85.74423480083857,833,87.31656184486373
meta-llama/Llama-3.1-8B-Instruct,Llama-3,behavior,goal_interpretation,8.0,29.53138815207781,49.62852897473997,37.028824833702885,1159,673,334,339,797,24.652087475149106,81.04575163398692,37.80487804878049,502,153,124,29,379,35.175879396984925,40.38461538461539,37.60071620411817,626,520,210,310,387,1128,97.32528041415013,0,0.0,31,2.6747195858498705,19,1.639344262295082
meta-llama/Llama-3.2-1B-Instruct,Llama-3,behavior,goal_interpretation,1.2,0.0,0.0,0.0,158,673,0,673,158,0.0,0.0,0.0,54,153,0,153,54,0.0,0.0,0.0,74,520,0,520,74,128,81.0126582278481,6,3.79746835443038,30,18.9873417721519,39,24.68354430379747
meta-llama/Llama-3.2-3B,Llama-3,behavior,goal_interpretation,3.2,12.081513828238718,24.665676077265974,16.218856863702978,1395,673,166,507,1208,5.087440381558029,20.915032679738562,8.184143222506393,630,153,32,121,597,20.303030303030305,25.769230769230766,22.71186440677966,680,520,134,386,526,1310,93.9068100358423,22,1.5770609318996418,85,6.093189964157706,85,6.093189964157706
meta-llama/Llama-3.2-3B-Instruct,Llama-3,behavior,goal_interpretation,3.2,12.081513828238718,24.665676077265974,16.218856863702978,1395,673,166,507,1208,5.087440381558029,20.915032679738562,8.184143222506393,630,153,32,121,597,20.303030303030305,25.769230769230766,22.71186440677966,680,520,134,386,526,1310,93.9068100358423,22,1.5770609318996418,85,6.093189964157706,85,6.093189964157706
microsoft/Phi-3-mini-128k-instruct,phi,behavior,goal_interpretation,3.8,21.353065539112052,30.0148588410104,24.953675108091414,956,673,202,471,744,15.258215962441316,42.48366013071895,22.45250431778929,426,153,65,88,361,30.51224944320713,26.346153846153847,28.2765737874097,459,520,137,383,312,885,92.57322175732217,32,3.3472803347280333,71,7.426778242677824,98,10.251046025104603
microsoft/Phi-3-mini-4k-instruct,phi,behavior,goal_interpretation,3.8,14.469453376205788,20.059435364041605,16.811955168119553,942,673,135,538,798,15.051020408163266,38.56209150326798,21.65137614678899,394,153,59,94,333,25.418060200668897,14.615384615384617,18.55921855921856,306,520,76,444,223,700,74.3099787685775,230,24.416135881104033,242,25.690021231422506,265,28.13163481953291
microsoft/Phi-3.5-mini-instruct,phi,behavior,goal_interpretation,3.8,17.780294450736125,23.328380386329865,20.17994858611825,891,673,157,516,726,5.829596412556054,8.49673202614379,6.914893617021277,223,153,13,140,210,39.02439024390244,27.692307692307693,32.39595050618673,377,520,144,376,225,600,67.34006734006735,236,26.487093153759822,291,32.659932659932664,264,29.629629629629626
microsoft/phi-1,phi,behavior,goal_interpretation,1.4,0.0,0.0,0.0,26,673,0,673,26,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,26,100.0,26,100.0,26,100.0
microsoft/phi-1_5,phi,behavior,goal_interpretation,1.4,0.0,0.0,0.0,277,673,0,673,277,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,277,100.0,277,100.0,277,100.0
tiiuae/Falcon3-10B-Base,falcon,behavior,goal_interpretation,10.3,29.107373868046572,33.43239227340267,31.120331950207465,781,673,225,448,548,30.935251798561154,56.209150326797385,39.90719257540603,281,153,86,67,192,47.11864406779661,26.73076923076923,34.11042944785276,300,520,139,381,156,581,74.39180537772087,198,25.352112676056336,200,25.60819462227913,202,25.86427656850192
tiiuae/Falcon3-7B-Base,falcon,behavior,goal_interpretation,7.5,2.1822849807445444,7.578008915304606,3.388704318936877,2338,673,51,622,2286,9.219858156028367,8.49673202614379,8.843537414965986,141,153,13,140,128,30.89430894308943,7.307692307692308,11.81959564541213,124,520,38,482,85,265,11.334473909324208,2068,88.45166809238665,2073,88.6655260906758,2083,89.09324208725407
tiiuae/falcon-11B,falcon,behavior,goal_interpretation,11.1,2.4390243902439024,4.3090638930163445,3.1149301825993554,1189,673,29,644,1160,6.593406593406594,3.9215686274509802,4.918032786885246,91,153,6,147,85,21.100917431192663,4.423076923076923,7.3131955484896665,109,520,23,497,86,200,16.82085786375105,978,82.25399495374263,989,83.17914213624896,999,84.0201850294365
tiiuae/falcon-40b,falcon,behavior,goal_interpretation,41.8,0.7984031936127743,1.188707280832095,0.9552238805970149,1006,673,8,665,994,3.1746031746031744,2.6143790849673203,2.8673835125448024,126,153,4,149,122,2.9850746268656714,0.7692307692307693,1.2232415902140672,138,520,4,516,130,264,26.24254473161034,731,72.66401590457257,742,73.75745526838966,816,81.11332007952286
tiiuae/falcon-7b,falcon,behavior,goal_interpretation,7.2,0.12618296529968456,0.2971768202080238,0.1771479185119575,1586,673,2,671,1583,11.11111111111111,1.3071895424836601,2.339181286549708,19,153,2,151,16,0.0,0.0,0.0,9,520,0,520,9,28,1.7654476670870116,1558,98.23455233291298,1558,98.23455233291298,1574,99.24337957124843
