Model,Model Family,dataset,eval_type,Model Size (B),overall_precision,overall_recall,overall_f1,num_predicted_conditions,num_GT_conditions,num_satisfied_conditions,num_unsatisfied_conditions,num_false_positive_conditions,state_goal_precision,state_goal_recall,state_goal_f1,state_goal_num_predicted,state_goal_num_GT,state_goal_num_satisfied,state_goal_num_unsatisfied,state_goal_num_false_positive,relation_goal_precision,relation_goal_recall,relation_goal_f1,relation_goal_num_predicted,relation_goal_num_GT,relation_goal_num_satisfied,relation_goal_num_unsatisfied,relation_goal_num_false_positive,grammatically_valid_num,grammatically_valid_rate,format_error_num,format_error_rate,state_hallucination_num,state_hallucination_rate,object_hallucination_num,object_hallucination_rate,Pretraining Data Size (T),FLOPs (1E21),Average,BBH,MATH Lvl 5,GPQA,MUSR,MMLU-PRO,IFEval
01-ai/Yi-1.5-34B,Yi,behavior,goal_interpretation,34.4,35.0597609561753,39.22734026745914,37.02664796633942,755,673,264,409,489,28.443113772455092,62.091503267973856,39.01437371663244,333,153,95,58,239,41.83168316831683,32.5,36.580086580086586,407,520,169,351,235,740,98.01324503311258,0,0.0,15,1.9867549668874176,15,1.9867549668874176,3.6,743.04,25.64649419429311,42.74936268839652,15.332326283987916,15.436241610738257,11.217187500000003,40.732121749408975,28.411725333226947
01-ai/Yi-1.5-34B-Chat,Yi,behavior,goal_interpretation,34.4,42.97108673978066,64.04160475482912,51.43198090692124,1011,673,431,242,572,25.657894736842103,76.47058823529412,38.423645320197046,455,153,117,36,339,59.46969696969697,60.38461538461538,59.92366412213739,537,520,314,206,214,992,98.12067260138475,0,0.0,19,1.8793273986152328,13,1.2858555885262115,3.6,743.04,33.35799367075618,44.262825981005655,27.719033232628398,15.324384787472036,13.058072916666665,39.11606087470449,60.66758423205982
01-ai/Yi-1.5-6B,Yi,behavior,goal_interpretation,6.1,15.893385982230996,23.92273402674591,19.09845788849347,1016,673,161,512,852,9.7799511002445,26.14379084967321,14.23487544483986,409,153,40,113,369,24.29718875502008,23.26923076923077,23.772102161100197,501,520,121,399,377,910,89.56692913385827,16,1.574803149606299,106,10.433070866141732,44,4.330708661417323,3.6,131.76,16.745698054972127,22.027904536694773,6.646525679758309,8.501118568232664,13.309114583333335,23.823507683215126,26.166017278598567
01-ai/Yi-1.5-6B-Chat,Yi,behavior,goal_interpretation,6.1,15.893385982230996,23.92273402674591,19.09845788849347,1016,673,161,512,852,9.7799511002445,26.14379084967321,14.23487544483986,409,153,40,113,369,24.29718875502008,23.26923076923077,23.772102161100197,501,520,121,399,377,910,89.56692913385827,16,1.574803149606299,106,10.433070866141732,44,4.330708661417323,3.6,131.76,22.784006289829847,23.67872313235784,16.238670694864048,6.935123042505594,14.030468750000002,24.368351063829788,51.452701055421834
01-ai/Yi-1.5-9B,Yi,behavior,goal_interpretation,8.8,15.06172839506173,18.12778603268945,16.453135536075525,817,673,122,551,688,17.1875,21.568627450980394,19.130434782608695,192,153,33,120,159,35.177865612648226,17.115384615384617,23.027166882276845,260,520,89,431,164,452,55.32435740514076,359,43.94124847001224,365,44.67564259485924,365,44.67564259485924,3.6,190.08000000000004,22.153901514184795,30.50071699492122,11.404833836858005,17.225950782997764,12.030989583333332,32.402482269503544,29.358435617494916
01-ai/Yi-Coder-1.5B,Yi,behavior,goal_interpretation,1.5,1.4420062695924765,3.4175334323922733,2.0282186948853616,1597,673,23,650,1572,5.0,3.2679738562091507,3.9525691699604746,100,153,5,148,95,9.62566844919786,3.4615384615384617,5.091937765205093,189,520,18,502,169,289,18.09643080776456,1306,81.77833437695679,1308,81.90356919223544,1341,83.96994364433313,2.4,21.6,,,,,,,
01-ai/Yi-Coder-1.5B-Chat,Yi,behavior,goal_interpretation,1.5,0.0,0.0,0.0,1115,673,0,673,1115,0.0,0.0,0.0,3,153,0,153,3,0.0,0.0,0.0,5,520,0,520,5,8,0.7174887892376681,1100,98.65470852017935,1107,99.28251121076234,1103,98.9237668161435,2.4,21.6,,,,,,,
01-ai/Yi-Coder-9B,Yi,behavior,goal_interpretation,8.8,35.32258064516129,32.5408618127786,33.874709976798144,619,673,219,454,401,35.12396694214876,55.55555555555556,43.0379746835443,241,153,85,68,157,37.640449438202246,25.769230769230766,30.59360730593607,356,520,134,386,222,597,96.4458804523425,7,1.1308562197092082,22,3.5541195476575123,25,4.038772213247173,2.4,126.72,,,,,,,
01-ai/Yi-Coder-9B-Chat,Yi,behavior,goal_interpretation,8.8,26.89335394126739,51.708766716196145,35.383833248601945,1293,673,348,325,946,20.68230277185501,63.39869281045751,31.189710610932476,468,153,97,56,372,57.30593607305936,48.26923076923077,52.40083507306888,438,520,251,269,187,906,70.06960556844548,359,27.76488785769528,387,29.930394431554525,368,28.46094354215004,2.4,126.72,16.985989314863886,25.94315294491389,4.003021148036254,0.0,7.963802083333333,15.83554964539007,48.17041006750976
Qwen/Qwen-72B-Chat,Qwen,behavior,goal_interpretation,72.3,40.67164179104478,48.588410104011885,44.27894380501016,803,673,327,346,477,35.714285714285715,78.43137254901961,49.079754601227,335,153,120,33,216,48.36448598130841,39.80769230769231,43.67088607594936,428,520,207,313,221,763,95.0186799501868,0,0.0,40,4.9813200498132,2,0.24906600249066,,,,,,,,,
Qwen/Qwen-7B-Chat,Qwen,behavior,goal_interpretation,7.7,0.0,0.0,0.0,738,673,0,673,738,0.0,0.0,0.0,20,153,0,153,20,0.0,0.0,0.0,3,520,0,520,3,20,2.710027100271003,718,97.289972899729,715,96.88346883468834,722,97.8319783197832,,,,,,,,,
Qwen/Qwen1.5-1.8B-Chat,Qwen1.5,behavior,goal_interpretation,1.8,0.0,0.0,0.0,2461,673,0,673,2461,0.0,0.0,0.0,8,153,0,153,8,0.0,0.0,0.0,1,520,0,520,1,9,0.3657049979683055,2449,99.51239333604228,2452,99.63429500203168,2458,99.87809833401056,,,9.257783499275524,5.908662877770453,1.9637462235649545,6.375838926174497,12.179427083333335,8.928043735224584,20.190982149585324
Qwen/Qwen1.5-14B-Chat,Qwen1.5,behavior,goal_interpretation,14.2,9.523809523809524,0.2971768202080238,0.5763688760806917,21,673,2,671,19,66.66666666666666,1.30718954248366,2.564102564102564,3,153,2,151,1,0.0,0.0,0.0,1,520,0,520,1,4,19.047619047619047,17,80.95238095238095,17,80.95238095238095,16,76.19047619047619,,,23.566106475051374,32.75647930053065,15.256797583081571,2.684563758389265,13.930729166666667,29.08724881796691,47.68082022367319
Qwen/Qwen1.5-32B-Chat,Qwen1.5,behavior,goal_interpretation,32.5,36.95198329853862,26.300148588410103,30.729166666666668,482,673,177,496,302,44.680851063829785,27.450980392156865,34.008097165991906,94,153,42,111,52,84.90566037735849,25.961538461538463,39.764359351988226,162,520,135,385,24,256,53.11203319502075,156,32.365145228215766,226,46.88796680497925,160,33.19502074688796,,,29.25746822860332,44.55485402391639,19.561933534743204,7.494407158836691,10.197395833333335,38.41422872340425,55.32199009738605
Qwen/Qwen1.5-4B-Chat,Qwen1.5,behavior,goal_interpretation,4.0,8.313253012048193,10.25260029717682,9.181636726546904,830,673,69,604,761,13.06122448979592,20.91503267973856,16.08040201005025,245,153,32,121,213,10.850439882697946,7.115384615384615,8.59465737514518,341,520,37,483,304,586,70.60240963855422,150,18.072289156626507,244,29.397590361445783,272,32.7710843373494,,,12.627280110791753,16.29707852890831,2.794561933534743,2.2371364653243813,7.355989583333333,15.512337470449172,31.566576683200577
Qwen/Qwen1.5-72B-Chat,Qwen1.5,behavior,goal_interpretation,72.3,41.73913043478261,57.05794947994056,48.21092278719397,920,673,384,289,536,29.14572864321608,75.81699346405229,42.10526315789474,398,153,116,37,282,56.06694560669456,51.53846153846153,53.707414829659314,478,520,268,252,210,876,95.21739130434784,0,0.0,44,4.782608695652174,11,1.1956521739130437,,,,,,,,,
Qwen/Qwen1.5-7B-Chat,Qwen1.5,behavior,goal_interpretation,7.7,2.094972067039106,4.457652303120357,2.850356294536817,1432,673,30,643,1402,5.88235294117647,3.2679738562091507,4.201680672268907,85,153,5,148,80,37.878787878787875,4.807692307692308,8.532423208191126,66,520,25,495,41,151,10.544692737430168,1275,89.03631284916202,1281,89.45530726256983,1284,89.66480446927375,,,17.62098662745355,22.379129599952787,6.268882175226587,7.046979865771815,4.6382812499999995,21.681072695035464,43.711574178734644
Qwen/Qwen2-1.5B,Qwen,behavior,goal_interpretation,1.5,1.627906976744186,2.080237741456166,1.8264840182648396,864,673,14,659,846,1.910828025477707,1.96078431372549,1.935483870967742,157,153,3,150,154,4.029304029304029,2.1153846153846154,2.7742749054224465,277,520,11,509,262,407,47.10648148148148,457,52.893518518518526,430,49.76851851851852,578,66.89814814814815,,,10.445452935561454,11.781833653483531,7.02416918429003,1.9015659955257262,3.5932291666666667,17.239213947990542,21.132705665412217
Qwen/Qwen2-1.5B-Instruct,Qwen,behavior,goal_interpretation,1.5,7.263157894736843,10.25260029717682,8.502772643253236,956,673,69,604,881,6.159420289855073,11.11111111111111,7.925407925407926,278,153,17,136,259,8.04953560371517,10.0,8.919382504288164,650,520,52,468,594,910,95.18828451882844,28,2.928870292887029,28,2.928870292887029,54,5.648535564853557,,,14.141936815181689,13.695346827502663,7.175226586102719,1.5659955257270708,12.026822916666667,16.675901300236408,33.712327734854625
Qwen/Qwen2-72B,Qwen,behavior,goal_interpretation,72.7,60.82949308755761,78.45468053491828,68.52693056456846,867,673,528,145,340,36.3905325443787,80.3921568627451,50.10183299389003,337,153,123,30,215,77.14285714285715,77.88461538461539,77.51196172248804,525,520,405,115,120,862,99.4232987312572,0,0.0,5,0.5767012687427913,1,0.1153402537485582,,,35.45667093247413,51.85613118695519,31.1178247734139,19.239373601789712,19.728906250000005,52.56168735224587,38.23610243044012
Qwen/Qwen2-72B-Instruct,Qwen,behavior,goal_interpretation,72.7,75.40760869565217,82.46656760772659,78.77927608232788,735,673,555,118,181,79.34782608695652,95.4248366013072,86.64688427299704,183,153,146,7,38,75.88126159554731,78.65384615384615,77.24268177525968,539,520,409,111,130,722,98.2312925170068,0,0.0,13,1.7687074829931977,3,0.4081632653061224,,,43.59406246367795,57.48300911876294,41.76737160120846,16.33109619686801,17.167968749999996,48.92324172576833,79.89168738945996
Qwen/Qwen2.5-1.5B-instruct,Qwen2.5,behavior,goal_interpretation,1.5,1.1385199240986716,0.8915304606240713,1.0,527,673,6,667,521,4.026845637583892,3.92156862745098,3.9735099337748343,149,153,6,147,143,0.0,0.0,0.0,2,520,0,520,2,147,27.893738140417454,378,71.72675521821633,376,71.34724857685009,384,72.86527514231499,,,,,,,,,
Qwen/Qwen2.5-14B-instruct,Qwen2.5,behavior,goal_interpretation,14.8,59.76608187134504,75.92867756315007,66.88481675392671,867,673,511,162,344,46.15384615384615,86.27450980392157,60.13667425968109,285,153,132,21,154,70.57728119180634,72.88461538461539,71.71239356669821,550,520,379,141,158,835,96.30911188004616,0,0.0,32,3.690888119953864,3,0.3460207612456747,,,,,,,,,
Qwen/Qwen2.5-32B-instruct,Qwen2.5,behavior,goal_interpretation,32.8,71.91780821917808,78.00891530460625,74.83962936564505,729,673,525,148,205,69.5,90.84967320261438,78.75354107648724,199,153,139,14,61,74.8062015503876,74.23076923076923,74.51737451737452,516,520,386,134,130,715,98.079561042524,0,0.0,14,1.9204389574759944,1,0.1371742112482853,,,,,,,,,
Qwen/Qwen2.5-3B-instruct,Qwen2.5,behavior,goal_interpretation,3.1,27.076923076923077,39.22734026745914,32.03883495145631,977,673,264,409,711,41.77215189873418,64.70588235294117,50.76923076923078,236,153,99,54,138,23.01255230125523,31.73076923076923,26.677445432497976,720,520,165,355,552,956,97.8505629477994,5,0.511770726714432,21,2.1494370522006143,60,6.1412487205731825,,,,,,,,,
Qwen/Qwen2.5-72B-instruct,Qwen2.5,behavior,goal_interpretation,72.7,64.49438202247191,85.28974739970282,73.44849648112603,889,673,574,99,316,38.88888888888889,96.07843137254902,55.36723163841808,377,153,147,6,231,83.88998035363457,82.11538461538461,82.99319727891155,509,520,427,93,82,886,99.66254218222724,0,0.0,3,0.3374578177727784,0,0.0,,,,,,,,,
Qwen/Qwen2.5-7B-instruct,Qwen2.5,behavior,goal_interpretation,7.6,38.716814159292035,52.00594353640417,44.38807863031072,901,673,350,323,554,15.196078431372548,40.52287581699346,22.10338680926916,407,153,62,91,346,64.42953020134227,55.38461538461539,59.56566701137539,445,520,288,232,159,852,94.56159822419534,0,0.0,49,5.438401775804662,6,0.6659267480577136,,,,,,,,,
Qwen/Qwen3-0.6B-Base,Qwen3,behavior,goal_interpretation,0.6,0.0,0.0,0.0,162,673,0,673,162,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,162,100.0,162,100.0,162,100.0,,,,,,,,,
Qwen/Qwen3-1.7B-Base,Qwen3,behavior,goal_interpretation,1.7,4.587155963302752,2.9717682020802374,3.606853020739405,436,673,20,653,416,2.5906735751295336,3.2679738562091507,2.8901734104046244,193,153,5,148,188,11.538461538461538,2.8846153846153846,4.615384615384616,130,520,15,505,115,323,74.08256880733946,104,23.853211009174313,113,25.91743119266055,108,24.770642201834864,,,,,,,,,
Qwen/Qwen3-14B-Base,Qwen3,behavior,goal_interpretation,14.8,47.4609375,72.21396731054978,57.2775486152033,1034,673,486,187,538,28.30957230142566,90.84967320261438,43.16770186335404,490,153,139,14,352,67.64132553606238,66.73076923076923,67.18296224588578,524,520,347,173,166,1014,98.06576402321085,0,0.0,20,1.9342359767891684,2,0.1934235976789168,,,,,,,,,
Qwen/Qwen3-4B-Base,Qwen3,behavior,goal_interpretation,4.0,30.023364485981308,38.187221396731054,33.616742969260955,859,673,257,416,599,15.856236786469344,49.01960784313725,23.96166134185303,472,153,75,78,398,51.41242937853108,35.0,41.64759725400458,358,520,182,338,172,830,96.62398137369034,28,3.259604190919674,29,3.3760186263096625,33,3.841676367869616,,,,,,,,,
Qwen/Qwen3-8B-Base,Qwen3,behavior,goal_interpretation,8.2,37.59036144578313,46.35958395245171,41.516966067864274,831,673,312,361,518,25.6857855361596,67.3202614379085,37.1841155234657,401,153,103,50,298,52.911392405063296,40.19230769230769,45.68306010928962,396,520,209,311,186,797,95.90854392298436,6,0.7220216606498195,34,4.0914560770156445,9,1.083032490974729,,,,,,,,,
bigcode/starcoder2-15b,starcoder2,behavior,goal_interpretation,16.0,0.0,0.0,0.0,75,673,0,673,75,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,75,100.0,75,100.0,75,100.0,4.3,387.0,12.539175421645837,20.373540752678547,5.966767371601208,3.1319910514541416,2.9283854166666674,15.032136524822693,27.802231412651764
bigcode/starcoder2-3b,starcoder2,behavior,goal_interpretation,3.0,0.0,0.0,0.0,0,673,0,673,0,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,0,0.0,0,0.0,0,0.0,3.3,59.4,6.549147626379535,8.909299421083569,1.5105740181268883,0.0,1.432291666666666,7.0718823877068555,20.370838264693234
bigcode/starcoder2-7b,starcoder2,behavior,goal_interpretation,7.2,0.0,0.0,0.0,0,673,0,673,0,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,0,0.0,0,0.0,0,0.0,3.7,155.4,8.2934383764798,11.395110106503443,3.096676737160121,0.22371364653244186,5.8166666666666655,7.1365248226950335,22.09193827932109
bigcode/starcoderbase,starcoder,behavior,goal_interpretation,15.5,0.0,0.0,0.0,0,673,0,673,0,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,0,0.0,0,0.0,0,0.0,1.0,93.0,,,,,,,
bigcode/starcoderbase-1b,starcoder,behavior,goal_interpretation,15.5,0.0,0.0,0.0,0,673,0,673,0,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,0,0.0,0,0.0,0,0.0,1.0,6.0,,,,,,,
bigcode/starcoderbase-3b,starcoder,behavior,goal_interpretation,15.5,0.0,0.0,0.0,0,673,0,673,0,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,0,0.0,0,0.0,0,0.0,1.0,18.0,,,,,,,
bigcode/starcoderbase-7b,starcoder,behavior,goal_interpretation,15.5,0.0,0.0,0.0,195,673,0,673,195,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,195,100.0,195,100.0,195,100.0,1.0,42.0,,,,,,,
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,DeepSeek-R1,behavior,goal_interpretation,70.6,78.75536480686695,54.53194650817236,64.44249341527654,465,673,367,306,99,71.95121951219512,77.12418300653596,74.44794952681387,163,153,118,35,46,83.27759197324414,47.88461538461539,60.80586080586081,299,520,249,271,50,462,99.35483870967742,1,0.2150537634408602,3,0.6451612903225806,6,1.2903225806451613,15.0,6353.999999999999,27.809426360756188,35.81986234433108,30.74018126888218,2.0134228187919474,13.277343749999998,41.64635047281324,43.35939750971866
deepseek-ai/DeepSeek-R1-Distill-Llama-8B,DeepSeek-R1,behavior,goal_interpretation,8.0,41.63568773234201,16.64190193164933,23.7791932059448,269,673,112,561,157,45.689655172413794,34.64052287581699,39.40520446096654,116,153,53,100,63,47.96747967479675,11.346153846153848,18.35147744945568,123,520,59,461,64,239,88.84758364312268,9,3.3457249070631967,30,11.152416356877325,19,7.063197026022305,15.0,720.0,13.059950104920146,5.325247153240706,21.978851963746223,0.6711409395973182,0.45572916666666624,12.10475768321513,37.82397372305483
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,DeepSeek-R1,behavior,goal_interpretation,1.8,1.1799410029498525,0.5943536404160475,0.7905138339920948,340,673,4,669,335,16.666666666666664,0.6535947712418301,1.2578616352201255,6,153,1,152,5,8.108108108108109,0.576923076923077,1.0771992818671452,38,520,3,517,34,44,12.941176470588236,295,86.76470588235294,296,87.05882352941177,310,91.17647058823528,18.0,194.4,10.351036796154286,4.729119207646243,16.91842900302115,0.7829977628635317,2.9656249999999993,2.0759456264775418,34.63410417691725
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,DeepSeek-R1,behavior,goal_interpretation,14.8,52.0066889632107,46.210995542347696,48.93784421715185,603,673,311,362,287,42.10526315789473,67.97385620915033,52.0,246,153,104,49,143,64.28571428571429,39.80769230769231,49.16864608076008,328,520,207,313,115,574,95.19071310116088,0,0.0,29,4.809286898839138,11,1.824212271973466,18.0,1598.4,38.22146462032291,40.69076685552542,57.02416918429003,18.34451901565996,28.711458333333326,40.74135638297872,43.81651795015004
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,DeepSeek-R1,behavior,goal_interpretation,32.8,54.79616306954437,67.90490341753343,60.65029860650299,835,673,457,216,377,33.94736842105263,84.31372549019608,48.405253283302066,380,153,129,24,251,75.22935779816514,63.07692307692307,68.6192468619247,437,520,328,192,108,817,97.8443113772455,0,0.0,18,2.155688622754491,2,0.2395209580838323,18.0,3542.3999999999996,22.96226839270608,17.149673765590364,17.069486404833835,4.5861297539149914,16.1421875,40.962987588652474,41.86314534324481
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,DeepSeek-R1,behavior,goal_interpretation,7.6,44.90445859872611,20.950965824665676,28.57142857142857,318,673,141,532,173,46.93877551020408,30.065359477124183,36.65338645418326,98,153,46,107,52,56.886227544910184,18.269230769230766,27.65647743813682,171,520,95,425,72,269,84.59119496855347,0,0.0,49,15.40880503144654,13,4.088050314465408,18.0,820.8,14.99492256865316,7.882702983365756,19.561933534743204,3.9149888143176734,3.5518229166666675,14.681220449172578,40.3768667136531
deepseek-ai/deepseek-coder-1.3b-base,DeepSeek-Coder,behavior,goal_interpretation,1.3,0.4299226139294927,0.7429420505200593,0.5446623093681917,1163,673,5,668,1158,12.5,3.2679738562091507,5.181347150259068,40,153,5,148,35,0.0,0.0,0.0,53,520,0,520,53,93,7.996560619088563,1069,91.91745485812554,1070,92.00343938091144,1127,96.90455717970764,2.0,15.6,,,,,,,
deepseek-ai/deepseek-coder-1.3b-instruct,DeepSeek-Coder,behavior,goal_interpretation,1.3,0.8975317875841435,1.7830609212481423,1.1940298507462686,1337,673,12,661,1325,2.4271844660194173,3.2679738562091507,2.785515320334262,206,153,5,148,201,1.9073569482288828,1.3461538461538465,1.5783540022547915,367,520,7,513,360,563,42.109199700822735,774,57.89080029917726,764,57.14285714285714,854,63.87434554973822,2.0,15.6,,,,,,,
deepseek-ai/deepseek-coder-33b-base,DeepSeek-Coder,behavior,goal_interpretation,33.3,18.28298887122417,17.087667161961367,17.665130568356375,629,673,115,558,514,3.2467532467532463,6.535947712418301,4.338394793926247,308,153,10,143,298,37.76978417266187,20.192307692307693,26.31578947368421,278,520,105,415,173,586,93.1637519872814,43,6.836248012718602,43,6.836248012718602,69,10.969793322734498,2.0,396.0,,,,,,,
deepseek-ai/deepseek-coder-33b-instruct,DeepSeek-Coder,behavior,goal_interpretation,33.3,35.294117647058826,56.16641901931649,43.34862385321101,1071,673,378,295,693,22.456813819577732,76.47058823529412,34.71810089020771,520,153,117,36,404,49.52561669829222,50.19230769230769,49.8567335243553,528,520,261,259,266,1048,97.85247432306257,0,0.0,23,2.1475256769374416,12,1.1204481792717087,2.0,399.6,,,,,,,
deepseek-ai/deepseek-coder-6.7b-base,DeepSeek-Coder,behavior,goal_interpretation,6.7,12.76595744680851,14.26448736998514,13.473684210526311,752,673,96,577,656,2.923976608187134,6.535947712418301,4.04040404040404,342,153,10,143,332,29.553264604811,16.538461538461537,21.208384710234277,291,520,86,434,205,633,84.17553191489363,119,15.824468085106384,119,15.824468085106384,169,22.47340425531915,2.0,80.4,,,,,,,
deepseek-ai/deepseek-coder-6.7b-instruct,DeepSeek-Coder,behavior,goal_interpretation,6.7,16.389811738648948,21.99108469539376,18.781725888324875,903,673,148,525,755,5.934065934065933,17.647058823529413,8.881578947368423,455,153,27,126,428,31.34715025906736,23.26923076923077,26.71081677704194,386,520,121,399,265,841,93.13399778516056,54,5.980066445182724,62,6.866002214839424,65,7.198228128460686,2.0,80.4,,,,,,,
deepseek-ai/deepseek-coder-7b-base-v1.5,DeepSeek-Coder,behavior,goal_interpretation,6.9,11.599625818521982,18.424962852897476,14.236509758897816,1070,673,124,549,945,7.9155672823219,19.607843137254903,11.278195488721806,379,153,30,123,349,24.352331606217614,18.076923076923077,20.75055187637969,387,520,94,426,292,766,71.58878504672897,303,28.317757009345794,304,28.41121495327103,363,33.925233644859816,2.0,82.80000000000001,,,,,,,
deepseek-ai/deepseek-coder-7b-instruct-v1.5,DeepSeek-Coder,behavior,goal_interpretation,6.9,29.354838709677416,40.56463595839524,34.061135371179034,948,673,273,400,657,21.08626198083067,43.13725490196079,28.32618025751073,313,153,66,87,247,34.90725126475548,39.80769230769231,37.19676549865229,611,520,207,313,386,924,97.46835443037976,0,0.0,24,2.5316455696202533,28,2.9535864978902953,2.0,82.80000000000001,,,,,,,
meta-llama/Llama-3.1-70B,Llama-3,behavior,goal_interpretation,70.6,65.88089330024815,78.90044576523032,71.80527383367141,807,673,531,142,275,53.875968992248055,90.84967320261438,67.63990267639902,259,153,139,14,119,75.23992322456814,75.38461538461539,75.31219980787704,521,520,392,128,129,780,96.6542750929368,0,0.0,27,3.3457249070631967,4,0.4956629491945477,15.0,6353.999999999999,26.200215843375947,46.39941295581887,18.429003021148038,18.34451901565996,16.581770833333337,40.602836879432616,16.843752354862875
meta-llama/Llama-3.1-70B-Instruct,Llama-3,behavior,goal_interpretation,70.6,65.88089330024815,78.90044576523032,71.80527383367141,807,673,531,142,275,53.875968992248055,90.84967320261438,67.63990267639902,259,153,139,14,119,75.23992322456814,75.38461538461539,75.31219980787704,521,520,392,128,129,780,96.6542750929368,0,0.0,27,3.3457249070631967,4,0.4956629491945477,15.0,6353.999999999999,43.409948245645786,55.92799173898473,38.066465256797585,14.205816554809845,17.691145833333334,47.87972813238771,86.6885419575615
meta-llama/Llama-3.1-8B,Llama-3,behavior,goal_interpretation,8.0,2.941176470588235,4.160475482912332,3.446153846153846,954,673,28,645,924,11.11111111111111,4.57516339869281,6.481481481481481,63,153,7,146,56,29.577464788732392,4.038461538461538,7.106598984771573,73,520,21,499,50,136,14.255765199161424,808,84.69601677148847,818,85.74423480083857,833,87.31656184486373,15.0,720.0,14.42086519266696,25.30447063475493,6.570996978851963,8.05369127516779,8.715104166666668,25.42109929078014,12.459828809780273
meta-llama/Llama-3.1-8B-Instruct,Llama-3,behavior,goal_interpretation,8.0,29.53138815207781,49.62852897473997,37.028824833702885,1159,673,334,339,797,24.652087475149106,81.04575163398692,37.80487804878049,502,153,124,29,379,35.175879396984925,40.38461538461539,37.60071620411817,626,520,210,310,387,1128,97.32528041415011,0,0.0,31,2.6747195858498705,19,1.639344262295082,,,23.763729445470883,29.379192497334035,15.55891238670695,8.7248322147651,8.611197916666667,31.091164302600465,49.217077354752064
meta-llama/Llama-3.2-1B-Instruct,Llama-3,behavior,goal_interpretation,1.2,0.0,0.0,0.0,158,673,0,673,158,0.0,0.0,0.0,54,153,0,153,54,0.0,0.0,0.0,74,520,0,520,74,128,81.0126582278481,6,3.79746835443038,30,18.9873417721519,39,24.68354430379747,9.0,64.8,14.443126333711135,8.742521312303046,7.02416918429003,3.355704697986576,2.973437500000001,7.579787234042552,56.9831380736446
meta-llama/Llama-3.2-3B,Llama-3,behavior,goal_interpretation,3.2,12.081513828238718,24.665676077265974,16.218856863702978,1395,673,166,507,1208,5.087440381558029,20.91503267973856,8.184143222506393,630,153,32,121,597,20.303030303030305,25.769230769230766,22.71186440677966,680,520,134,386,526,1310,93.9068100358423,22,1.5770609318996418,85,6.093189964157706,85,6.093189964157706,9.0,172.8,8.697822716562822,14.232664884364107,1.8882175226586102,2.348993288590602,3.8148437499999996,16.528147163120565,13.374069690643047
meta-llama/Llama-3.2-3B-Instruct,Llama-3,behavior,goal_interpretation,3.2,12.081513828238718,24.665676077265974,16.218856863702978,1395,673,166,507,1208,5.087440381558029,20.91503267973856,8.184143222506393,630,153,32,121,597,20.303030303030305,25.769230769230766,22.71186440677966,680,520,134,386,526,1310,93.9068100358423,22,1.5770609318996418,85,6.093189964157706,85,6.093189964157706,9.0,172.8,24.204650807793456,24.059186446885473,17.673716012084594,3.8031319910514525,1.3734374999999996,24.386820330969268,73.93161256576994
microsoft/Phi-3-mini-128k-instruct,phi,behavior,goal_interpretation,3.8,21.353065539112052,30.0148588410104,24.953675108091417,956,673,202,471,744,15.258215962441316,42.48366013071895,22.45250431778929,426,153,65,88,361,30.51224944320713,26.346153846153847,28.2765737874097,459,520,137,383,312,885,92.57322175732216,32,3.3472803347280333,71,7.426778242677824,98,10.251046025104603,4.9,111.72,26.343809931865636,37.09976663224031,14.04833836858006,9.060402684563762,7.710937500000003,30.38009751773049,59.76331688807919
microsoft/Phi-3-mini-4k-instruct,phi,behavior,goal_interpretation,3.8,14.469453376205788,20.059435364041605,16.811955168119553,942,673,135,538,798,15.051020408163266,38.56209150326798,21.65137614678899,394,153,59,94,333,25.418060200668897,14.615384615384617,18.55921855921856,306,520,76,444,223,700,74.3099787685775,230,24.416135881104033,242,25.690021231422502,265,28.13163481953291,4.9,111.72,25.967732638041607,39.2693352377728,11.63141993957704,9.284116331096197,7.644270833333336,31.848404255319146,56.12884923115112
microsoft/Phi-3.5-mini-instruct,phi,behavior,goal_interpretation,3.8,17.780294450736125,23.328380386329865,20.17994858611825,891,673,157,516,726,5.829596412556054,8.49673202614379,6.914893617021277,223,153,13,140,210,39.02439024390244,27.692307692307693,32.39595050618673,377,520,144,376,225,600,67.34006734006735,236,26.487093153759822,291,32.659932659932664,264,29.629629629629623,,,28.184391192864627,36.74585390851661,19.637462235649547,11.968680089485462,10.098958333333334,32.91038711583924,57.74500547436358
microsoft/phi-1,phi,behavior,goal_interpretation,1.4,0.0,0.0,0.0,26,673,0,673,26,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,26,100.0,26,100.0,26,100.0,,,5.574318195377169,4.273999212214679,0.9818731117824773,2.0134228187919474,3.697135416666667,1.798906619385342,20.6805719934219
microsoft/phi-1_5,phi,behavior,goal_interpretation,1.4,0.0,0.0,0.0,277,673,0,673,277,0.0,0.0,0.0,0,153,0,153,0,0.0,0.0,0.0,0,520,0,520,0,0,0.0,277,100.0,277,100.0,277,100.0,0.15,1.1700000000000002,7.170966845799231,7.468938770070243,1.812688821752266,2.348993288590602,3.385416666666666,7.6813682033096935,20.32839532440591
tiiuae/Falcon3-10B-Base,falcon,behavior,goal_interpretation,10.3,29.107373868046572,33.43239227340267,31.120331950207465,781,673,225,448,548,30.93525179856115,56.209150326797385,39.90719257540603,281,153,86,67,192,47.11864406779661,26.73076923076923,34.11042944785276,300,520,139,381,156,581,74.39180537772087,198,25.352112676056336,200,25.60819462227913,202,25.86427656850192,14.0,865.2,27.617850879493677,41.37546218651794,24.924471299093657,12.751677852348994,14.173958333333331,36.003989361702125,36.47754624396601
tiiuae/Falcon3-7B-Base,falcon,behavior,goal_interpretation,7.5,2.1822849807445444,7.578008915304606,3.388704318936877,2338,673,51,622,2286,9.219858156028367,8.49673202614379,8.843537414965986,141,153,13,140,128,30.89430894308943,7.307692307692308,11.81959564541213,124,520,38,482,85,265,11.334473909324208,2068,88.45166809238665,2073,88.6655260906758,2083,89.09324208725407,14.0,630.0,24.745725360383613,31.55991854750336,19.410876132930515,12.863534675615215,18.142708333333335,32.337839834515364,34.15947463840388
tiiuae/falcon-11B,falcon,behavior,goal_interpretation,11.1,2.4390243902439024,4.3090638930163445,3.1149301825993554,1189,673,29,644,1160,6.593406593406594,3.92156862745098,4.918032786885246,91,153,6,147,85,21.100917431192663,4.423076923076923,7.313195548489667,109,520,23,497,86,200,16.82085786375105,978,82.25399495374263,989,83.17914213624896,999,84.0201850294365,5.0,333.0,13.851902586180215,21.937999462890275,2.794561933534743,2.796420581655479,7.530729166666667,15.438460401891252,32.613243970442866
tiiuae/falcon-40b,falcon,behavior,goal_interpretation,41.8,0.7984031936127743,1.188707280832095,0.9552238805970148,1006,673,8,665,994,3.1746031746031744,2.6143790849673203,2.8673835125448024,126,153,4,149,122,2.9850746268656714,0.7692307692307693,1.2232415902140672,138,520,4,516,130,264,26.24254473161034,731,72.66401590457257,742,73.75745526838966,816,81.11332007952286,1.0,240.0,11.40130446230009,16.583304730312175,1.812688821752266,3.1319910514541416,5.193229166666668,16.722074468085104,24.964538535530174
tiiuae/falcon-7b,falcon,behavior,goal_interpretation,7.2,0.1261829652996845,0.2971768202080238,0.1771479185119575,1586,673,2,671,1583,11.11111111111111,1.30718954248366,2.339181286549708,19,153,2,151,16,0.0,0.0,0.0,9,520,0,520,9,28,1.7654476670870116,1558,98.23455233291298,1558,98.23455233291298,1574,99.24337957124844,1.5,63.0,5.1734447203194796,5.963936911876051,0.9818731117824773,0.0,4.497135416666667,1.392582742316784,18.205140139274903
