Model,Model Family,dataset,eval_type,Model Size (B),task_success_rate,state_goal,relation_goal,action_goal,total_goal,execution_success_rate,parsing_error,hallucination_error,predicate_argument_number_error,wrong_order_error,missing_step_error,affordance_error,additional_step_error,Pretraining Data Size (T),FLOPs (1E21),Average,BBH,MATH Lvl 5,GPQA,MUSR,MMLU-PRO,IFEval
01-ai/Yi-1.5-34B,Yi,behavior,action_sequencing,34.4,0.0,1.4200000000000002,16.220000000000002,0,11.0,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,3.6,743.04,25.64649419429311,42.74936268839652,15.332326283987916,15.436241610738257,11.217187500000003,40.732121749408975,28.411725333226947
01-ai/Yi-1.5-34B-Chat,Yi,behavior,action_sequencing,34.4,10.14,22.7,20.08,0,21.0,13.04,62.32,1.4500000000000002,0.0,2.9000000000000004,14.49,,1.4500000000000002,3.6,743.04,33.35799367075618,44.262825981005655,27.719033232628398,15.324384787472036,13.058072916666665,39.11606087470449,60.66758423205982
01-ai/Yi-1.5-6B,Yi,behavior,action_sequencing,6.1,0.0,1.4200000000000002,16.220000000000002,0,11.0,0.0,98.55,0.0,0.0,0.0,1.4500000000000002,,0.0,3.6,131.76,16.745698054972127,22.027904536694773,6.646525679758309,8.501118568232664,13.309114583333335,23.823507683215126,26.166017278598567
01-ai/Yi-1.5-6B-Chat,Yi,behavior,action_sequencing,6.1,1.4500000000000002,4.26,18.53,0,13.5,2.9000000000000004,71.00999999999999,10.14,0.0,1.4500000000000002,8.7,,0.0,3.6,131.76,22.784006289829847,23.67872313235784,16.238670694864048,6.935123042505594,14.030468750000002,24.368351063829788,51.452701055421834
01-ai/Yi-1.5-9B,Yi,behavior,action_sequencing,8.8,8.7,11.35,21.62,0,18.0,11.59,71.00999999999999,7.249999999999999,0.0,1.4500000000000002,7.249999999999999,,1.4500000000000002,3.6,190.08000000000004,22.153901514184795,30.50071699492122,11.404833836858005,17.225950782997764,12.030989583333332,32.402482269503544,29.358435617494916
01-ai/Yi-Coder-1.5B,Yi,behavior,action_sequencing,1.5,0.0,2.8400000000000003,16.99,0,12.0,1.4500000000000002,98.55,0.0,0.0,0.0,0.0,,0.0,2.4,21.6,,,,,,,
01-ai/Yi-Coder-1.5B-Chat,Yi,behavior,action_sequencing,1.5,0.0,1.4200000000000002,16.220000000000002,0,11.0,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,2.4,21.6,,,,,,,
01-ai/Yi-Coder-9B,Yi,behavior,action_sequencing,8.8,2.9000000000000004,9.93,17.76,0,15.0,4.35,86.96000000000001,2.9000000000000004,0.0,1.4500000000000002,2.9000000000000004,,0.0,2.4,126.72,,,,,,,
01-ai/Yi-Coder-9B-Chat,Yi,behavior,action_sequencing,8.8,15.939999999999998,17.02,23.94,0,21.5,17.39,68.12,0.0,0.0,5.800000000000001,4.35,,1.4500000000000002,2.4,126.72,16.985989314863886,25.94315294491389,4.003021148036254,0.0,7.963802083333333,15.83554964539007,48.17041006750976
Qwen/Qwen-72B,Qwen,behavior,action_sequencing,72.3,11.34,8.0,21.46,0,17.580000000000002,12.37,68.04,5.15,0.0,0.0,11.34,,0.0,3.0,1296.0,,,,,,,
Qwen/Qwen1.5-1.8B,Qwen1.5,behavior,action_sequencing,1.8,0.0,4.0,12.15,0,9.8,0.0,86.6,6.19,0.0,0.0,1.03,,1.03,2.4,25.92,9.269492522098927,9.759901587727937,3.1722054380664653,7.38255033557047,3.963802083333334,9.79609929078014,21.542396397115212
Qwen/Qwen1.5-14B,Qwen1.5,behavior,action_sequencing,14.2,4.12,13.0,14.57,0,14.12,8.25,58.76,9.28,1.03,5.15,13.4,,5.15,4.0,336.0,20.854080062460586,30.063103282917453,20.241691842900302,5.92841163310962,10.464062500000002,29.373522458628837,29.05368865720732
Qwen/Qwen1.5-4B,Qwen1.5,behavior,action_sequencing,4.0,0.0,4.0,10.53,0,8.649999999999999,0.0,67.01,5.15,0.0,1.03,13.4,,4.12,2.4,57.6,11.76818275851784,16.249142581095292,5.287009063444108,3.5794183445190177,4.8226562500000005,16.22340425531915,24.447466056729475
Qwen/Qwen1.5-72B,Qwen1.5,behavior,action_sequencing,72.3,18.56,25.0,27.94,0,27.09,23.71,37.11,3.09,0.0,8.25,23.71,,5.15,3.0,1296.0,,,,,,,
Qwen/Qwen1.5-7B,Qwen1.5,behavior,action_sequencing,7.7,2.06,7.000000000000001,12.15,0,10.66,1.03,76.29,10.31,1.03,2.06,7.22,,1.03,4.0,168.0,16.024674155407357,23.075768754340448,9.290030211480364,6.487695749440718,9.158333333333333,21.293218085106382,26.842998798742894
Qwen/Qwen3-0.6B,Qwen3,behavior,action_sequencing,0.8,0.0,4.0,12.55,0,10.09,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,36.0,172.8,,,,,,,
Qwen/Qwen3-1.7B,Qwen3,behavior,action_sequencing,2.0,0.0,4.0,12.55,0,10.09,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,36.0,432.0,,,,,,,
Qwen/Qwen3-14B,Qwen3,behavior,action_sequencing,14.8,8.25,14.000000000000002,14.17,0,14.12,8.25,90.72,0.0,0.0,0.0,1.03,,0.0,36.0,3196.8,,,,,,,
Qwen/Qwen3-235B-A22B-Thinking-2507,Qwen3,behavior,action_sequencing,235.1,0.0,4.0,11.65,0,9.56,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,36.0,50781.6,,,,,,,
Qwen/Qwen3-32B,Qwen3,behavior,action_sequencing,32.8,8.25,10.0,14.98,0,13.54,8.25,91.75,0.0,0.0,0.0,0.0,,0.0,36.0,7084.799999999999,,,,,,,
Qwen/Qwen3-4B,Qwen3,behavior,action_sequencing,4.0,1.03,10.0,12.55,0,11.82,1.03,97.94,0.0,0.0,0.0,0.0,,0.0,36.0,864.0,,,,,,,
Qwen/Qwen3-8B,Qwen3,behavior,action_sequencing,8.2,3.09,7.000000000000001,12.96,0,11.24,3.09,96.91,0.0,0.0,0.0,0.0,,0.0,36.0,1771.1999999999998,,,,,,,
baichuan-inc/Baichuan2-7B-Base,Baichuan,behavior,action_sequencing,7.0,0.0,1.31,14.86,0,10.5,0.0,92.21,2.6,0.0,0.0,3.9,,0.0,2.6,109.20000000000002,,,,,,,
baichuan-inc/Baichuan2-7B-Chat,Baichuan,behavior,action_sequencing,7.0,1.3,9.15,13.62,0,12.18,1.3,77.92,15.58,0.0,0.0,2.6,,1.3,2.6,109.20000000000002,,,,,,,
CohereLabs/c4ai-command-r-08-2024,Cohere,behavior,action_sequencing,32.3,16.0,22.0,25.94,0,24.86,19.0,5.0,13.0,0.0,8.0,43.0,,4.0,,,,,,,,,
CohereLabs/c4ai-command-r-plus-08-2024,Cohere,behavior,action_sequencing,103.8,28.000000000000004,29.0,31.95,0,31.15,35.0,0.0,1.0,15.0,10.0,39.0,,15.0,,,,,,,,,
deepseek-ai/DeepSeek-R1,DeepSeek,behavior,action_sequencing,684.5,1.0,6.0,12.03,0,10.38,1.0,98.0,0.0,0.0,0.0,1.0,,0.0,14.8,60783.600000000006,,,,,,,
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,DeepSeek,behavior,action_sequencing,70.6,11.22,13.0,16.93,0,15.82,11.22,86.72999999999999,0.0,0.0,0.0,2.04,,0.0,15.0,6353.999999999999,27.809426360756188,35.81986234433108,30.74018126888218,2.0134228187919474,13.277343749999998,41.64635047281324,43.35939750971866
deepseek-ai/DeepSeek-R1-Distill-Llama-8B,DeepSeek,behavior,action_sequencing,8.0,4.08,15.0,13.39,0,13.84,5.1,81.63,3.06,0.0,2.04,4.08,,3.06,15.0,720.0,13.059950104920146,5.325247153240706,21.978851963746223,0.6711409395973182,0.45572916666666624,12.10475768321513,37.82397372305483
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,DeepSeek,behavior,action_sequencing,1.8,0.0,4.0,12.2,0,9.89,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,18.0,194.4,10.351036796154286,4.729119207646243,16.91842900302115,0.7829977628635317,2.9656249999999993,2.0759456264775418,34.63410417691725
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,DeepSeek,behavior,action_sequencing,14.8,7.140000000000001,13.0,14.17,0,13.84,7.140000000000001,87.76,0.0,0.0,1.02,2.04,,0.0,18.0,1598.4,38.22146462032291,40.69076685552542,57.02416918429003,18.34451901565996,28.711458333333326,40.74135638297872,43.81651795015004
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,DeepSeek,behavior,action_sequencing,32.8,17.349999999999998,26.5,16.73,0,19.49,19.39,70.41,4.08,0.0,2.04,3.06,,1.02,18.0,3542.3999999999996,22.96226839270608,17.149673765590364,17.069486404833835,4.5861297539149914,16.1421875,40.962987588652474,41.86314534324481
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,DeepSeek,behavior,action_sequencing,7.6,0.0,4.0,12.2,0,9.89,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,18.0,820.8,14.99492256865316,7.882702983365756,19.561933534743204,3.9149888143176734,3.5518229166666675,14.681220449172578,40.3768667136531
deepseek-ai/DeepSeek-V3,DeepSeek,behavior,action_sequencing,684.5,41.0,54.50000000000001,42.67,0,45.9,51.0,0.0,0.0,0.0,6.0,36.0,,1.0,14.8,60783.600000000006,,,,,,,
google/gemma-1.1-2b-it,Gemma,behavior,action_sequencing,2.5,0.0,2.2,12.96,0,9.77,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,3.0,45.0,8.053373854341979,5.862826722774347,1.812688821752266,2.572706935123044,2.024479166666666,5.372709810874704,30.674831668860847
google/gemma-1.1-7b-it,Gemma,behavior,action_sequencing,8.5,1.11,9.89,14.81,0,13.36,7.779999999999999,32.22,16.669999999999998,8.89,2.22,27.78,,4.44,6.0,306.0,17.693584228972615,15.93420938501317,4.909365558912387,5.8165548098433995,11.510937500000002,17.5993646572104,50.391073462856326
google/gemma-2-27b,Gemma,behavior,action_sequencing,27.2,14.44,13.19,19.44,0,17.59,16.669999999999998,72.22,1.11,0.0,1.11,6.67,,0.0,13.0,2121.6,23.926167340782822,37.390737454186464,16.61631419939577,13.422818791946312,13.921093749999997,37.4538268321513,24.75221301701707
google/gemma-2-27b-it,Gemma,behavior,action_sequencing,27.2,30.0,25.27,41.67,0,36.81,41.11,22.22,0.0,0.0,12.22,24.44,,0.0,13.0,2121.6,36.17428251510342,49.27284215130387,23.867069486404834,16.666666666666664,9.112760416666667,38.34958628841608,79.77677008116243
google/gemma-2-2b,Gemma,behavior,action_sequencing,2.6,0.0,2.2,12.96,0,9.77,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,2.0,31.200000000000003,10.129463155055184,11.755807532236112,2.8700906344410875,1.6778523489932917,11.430468750000001,13.111332742316787,19.931226922343825
google/gemma-2-2b-it,Gemma,behavior,action_sequencing,2.6,0.0,6.59,10.65,0,9.45,0.0,24.44,11.11,8.89,0.0,54.44,,0.0,2.0,31.200000000000003,17.046939294966545,17.980792881523424,0.0755287009063444,3.243847874720355,7.077343750000001,17.22074468085106,56.68337788179808
google/gemma-2-9b,Gemma,behavior,action_sequencing,9.2,0.0,3.3000000000000003,13.43,0,10.42,0.0,98.89,0.0,0.0,1.11,0.0,,1.11,8.0,441.6,21.205286776100692,34.09681853589784,13.444108761329304,10.514541387024611,14.297656250000001,34.48027482269504,20.398320899657357
google/gemma-2-9b-it,Gemma,behavior,action_sequencing,9.2,20.0,15.38,29.17,0,25.08,28.89,32.22,1.11,0.0,7.779999999999999,28.89,,3.3300000000000005,8.0,441.6,32.07276025267082,42.136619683664655,19.486404833836858,14.76510067114094,9.742187500000002,31.949985224586293,74.35626360279613
google/gemma-2b,Gemma,behavior,action_sequencing,2.5,0.0,2.2,12.96,0,9.77,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,6.0,72.0,7.321959810488082,8.246263426638125,3.0211480362537766,0.6711409395973182,7.555989583333336,4.061391843971631,20.375825033134305
google/gemma-2b-it,Gemma,behavior,action_sequencing,2.5,0.0,2.2,12.96,0,9.77,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,6.0,90.0,7.485804130315127,5.214303022163619,2.0392749244712993,3.8031319910514525,3.0322916666666675,3.9228723404255303,26.902950837112197
google/gemma-3-12b-it,Gemma,behavior,action_sequencing,12.2,29.21,37.64,31.07,0,33.0,34.83,31.46,0.0,0.0,5.62,26.97,,2.25,12.0,878.4,,,,,,,
google/gemma-3-12b-pt,Gemma,behavior,action_sequencing,12.2,1.12,4.49,14.49,0,11.55,3.37,96.63,0.0,0.0,0.0,0.0,,0.0,12.0,878.4,,,,,,,
google/gemma-3-27b-it,Gemma,behavior,action_sequencing,27.4,32.58,36.52,39.02,0,38.28,39.33,30.34,0.0,0.0,5.62,22.47,,1.12,14.0,2301.6,,,,,,,
google/gemma-3-4b-it,Gemma,behavior,action_sequencing,4.3,2.25,7.870000000000001,16.82,0,14.19,7.870000000000001,40.45,0.0,1.12,5.62,35.96,,4.49,4.0,103.2,,,,,,,
google/gemma-3-4b-pt,Gemma,behavior,action_sequencing,4.3,0.0,3.37,13.55,0,10.56,1.12,97.75,0.0,0.0,0.0,1.12,,0.0,4.0,103.2,,,,,,,
google/gemma-7b,Gemma,behavior,action_sequencing,8.5,0.0,7.689999999999999,13.43,0,11.73,1.11,92.22,1.11,0.0,2.22,1.11,,0.0,6.0,252.0,15.442818570272307,21.11609932329174,7.401812688821751,4.921700223713646,10.979947916666669,21.644134160756497,26.593217108383534
google/gemma-7b-it,Gemma,behavior,action_sequencing,8.5,0.0,6.59,13.43,0,11.4,0.0,30.0,41.11,16.669999999999998,0.0,12.22,,0.0,2.0,102.0,13.067087110466217,11.940832085290182,2.9456193353474323,4.5861297539149914,12.528385416666667,7.7183067375886525,38.68324933398937
ibm-granite/granite-3.1-2b-base,Granite,behavior,action_sequencing,2.5,1.18,3.53,15.310000000000002,0,11.74,1.18,90.59,0.0,0.0,1.18,2.35,,0.0,12.0,180.0,13.202826259598206,16.843689846888516,5.664652567975831,3.6912751677852316,3.9049479166666674,13.89627659574468,35.216115462528315
ibm-granite/granite-3.1-2b-instruct,Granite,behavior,action_sequencing,2.5,5.81,10.59,18.41,0,16.08,9.3,43.02,6.98,0.0,6.98,29.07,,4.65,12.0,180.0,21.712212822028288,21.822956140794506,15.256797583081571,5.257270693512303,4.867708333333335,20.212765957446805,62.8557782240012
ibm-granite/granite-3.1-8b-base,Granite,behavior,action_sequencing,8.2,4.71,9.41,16.84,0,14.59,8.24,69.41000000000001,3.53,0.0,1.18,11.76,,1.18,12.0,590.4,20.05719991900457,26.01958867101177,9.441087613293051,9.507829977628639,8.36197916666667,24.802378841607563,42.21033524381973
ibm-granite/granite-3.1-8b-instruct,Granite,behavior,action_sequencing,8.2,3.49,10.59,18.91,0,16.43,4.65,48.84,8.14,0.0,9.3,20.93,,2.33,12.0,590.4,30.6030430081627,34.089655299414055,21.978851963746223,8.277404921700223,19.00520833333333,28.191489361702125,72.07564816908027
ibm-granite/granite-3.2-2b-instruct,Granite,behavior,action_sequencing,2.5,0.0,7.06,13.43,0,11.54,3.49,41.86,3.49,1.16,11.63,33.72,,3.49,12.0,180.0,21.25014812377563,21.668268416036614,14.425981873111782,5.369127516778524,4.704947916666668,19.815676713947987,61.51688630611223
ibm-granite/granite-3.2-8b-instruct,Granite,behavior,action_sequencing,8.2,4.65,11.76,16.919999999999998,0,15.38,8.14,44.190000000000005,5.81,0.0,15.12,15.12,,5.81,12.0,590.4,30.7704488980163,34.65536965519957,23.791540785498487,8.7248322147651,16.791406250000005,27.914450354609933,72.74509412802476
ibm-granite/granite-3.3-2b-base,Granite,behavior,action_sequencing,2.5,1.18,3.53,13.27,0,10.32,0.0,90.59,1.18,1.18,2.35,4.71,,0.0,12.0,180.0,,,,,,,
ibm-granite/granite-3.3-2b-instruct,Granite,behavior,action_sequencing,2.5,2.33,7.06,15.920000000000002,0,13.29,3.49,38.37,13.95,4.65,10.47,26.74,,8.14,12.0,180.0,,,,,,,
ibm-granite/granite-3.3-8b-base,Granite,behavior,action_sequencing,8.2,4.71,5.88,15.82,0,12.81,4.71,64.71000000000001,7.06,0.0,2.35,11.76,,2.35,12.0,590.4,,,,,,,
ibm-granite/granite-3.3-8b-instruct,Granite,behavior,action_sequencing,8.2,3.49,8.24,16.42,0,13.99,8.14,48.84,10.47,0.0,4.65,22.09,,3.49,12.0,590.4,,,,,,,
LGAI-EXAONE/EXAONE-3.5-32B-Instruct,Exaone,behavior,action_sequencing,32.0,29.0,32.5,34.02,0,33.61,34.0,0.0,1.0,1.0,11.0,49.0,,1.0,6.5,1248.0,37.603165755662836,39.82420331711213,51.283987915407856,5.033557046979867,5.150000000000001,40.40890957446809,83.91833668000905
LGAI-EXAONE/EXAONE-Deep-32B,Exaone,behavior,action_sequencing,32.0,8.0,15.0,18.8,0,17.76,10.0,80.0,1.0,0.0,0.0,4.0,,0.0,6.5,1248.0,,,,,,,
meta-llama/Llama-2-13b-hf,Llama-2,behavior,action_sequencing,13.0,0.0,1.4200000000000002,16.12,0,11.11,0.0,98.61,0.0,0.0,0.0,0.0,,0.0,2.0,156.0,11.065185981273997,17.222559825058127,1.5105740181268883,4.138702460850116,3.385416666666666,15.309175531914892,24.824687385027282
meta-llama/Llama-2-70b-hf,Llama-2,behavior,action_sequencing,69.0,0.0,1.4200000000000002,16.12,0,11.11,0.0,100.0,0.0,0.0,0.0,0.0,,0.0,2.0,840.0,18.372598605703004,35.900061863721675,3.2477341389728096,7.046979865771815,9.777604166666668,30.1954048463357,24.06780675274937
meta-llama/Llama-2-7b-hf,Llama-2,behavior,action_sequencing,6.7,0.0,1.4200000000000002,16.85,0,11.59,0.0,91.67,2.78,1.39,0.0,2.78,,0.0,2.0,84.0,8.806357596540016,10.35141665784897,1.7371601208459215,2.2371364653243813,3.7578125,9.56523345153664,25.18938638368418
meta-llama/Llama-3.1-70B,Llama-3,behavior,action_sequencing,70.6,3.06,8.0,13.78,0,12.15,3.06,93.88,0.0,0.0,1.02,2.04,,0.0,15.0,6353.999999999999,26.200215843375947,46.39941295581887,18.429003021148038,18.34451901565996,16.581770833333337,40.602836879432616,16.843752354862875
meta-llama/Llama-3.1-8B,Llama-3,behavior,action_sequencing,8.0,3.06,10.0,13.39,0,12.43,5.1,78.57,3.06,0.0,2.04,7.140000000000001,,2.04,,,14.42086519266696,25.30447063475493,6.570996978851963,8.05369127516779,8.715104166666668,25.42109929078014,12.459828809780273
meta-llama/Llama-3.3-70B-Instruct,Llama-3,behavior,action_sequencing,70.6,41.0,49.5,43.05,0,44.81,49.0,0.0,0.0,0.0,10.0,38.0,,6.0,15.0,6353.999999999999,44.84747145129876,56.561410788022194,48.338368580060425,10.514541387024611,15.565624999999999,48.12906323877069,89.97581971391463
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8,Llama,behavior,action_sequencing,401.6,45.0,56.49999999999999,53.95,0,54.64,56.00000000000001,16.0,1.0,0.0,6.0,18.0,,3.0,22.0,53011.2,,,,,,,
meta-llama/Llama-4-Scout-17B-16E-Instruct,Llama,behavior,action_sequencing,108.6,37.0,35.5,56.58,0,50.82,50.0,0.0,7.000000000000001,0.0,14.000000000000002,29.0,,4.0,40.0,26064.0,,,,,,,
meta-llama/Meta-Llama-3-70B,Llama-3,behavior,action_sequencing,70.6,1.02,8.0,12.6,0,11.3,2.04,95.92,1.02,0.0,0.0,1.02,,0.0,15.0,6300.0,26.705350171613343,48.709812647505885,18.580060422960727,19.686800894854585,16.011197916666664,41.21232269503546,16.031906452656727
meta-llama/Meta-Llama-3-8B,Llama-3,behavior,action_sequencing,8.0,1.02,4.0,12.6,0,10.17,1.02,92.86,1.02,0.0,0.0,3.06,,0.0,15.0,720.0,13.626857071686075,24.50076379676797,4.531722054380665,7.38255033557047,6.242447916666666,24.553043735224584,14.550614591506093
meta-llama/Meta-Llama-3-8B-Instruct,Llama-3,behavior,action_sequencing,8.0,13.27,18.0,21.65,0,20.62,18.37,16.33,11.22,3.06,5.1,36.73,,5.1,15.0,720.0,23.908735693936837,28.244949576343615,8.685800604229607,1.230425055928408,1.602864583333335,29.604388297872337,74.08398604591373
mistralai/Mistral-7B-Instruct-v0.2,Mistral,behavior,action_sequencing,7.2,3.0,5.0,13.53,0,11.2,5.0,8.0,38.0,5.0,5.0,39.0,,3.0,,,18.50789159273764,22.910601936713604,3.0211480362537766,3.467561521252797,7.608854166666667,19.076906028368796,54.96227786717022
mistralai/Mixtral-8x7B-Instruct-v0.1,Mistral,behavior,action_sequencing,46.7,6.0,17.0,15.79,0,16.12,7.000000000000001,21.0,53.0,0.0,6.0,11.0,,1.0,,,23.8171027058463,29.742398380967334,9.138972809667674,7.046979865771815,11.073697916666667,29.909131205673756,55.991436056330535
mistralai/Mixtral-8x22B-Instruct-v0.1,Mistral,behavior,action_sequencing,140.6,31.0,37.5,38.16,0,37.98,40.0,3.0,6.0,0.0,10.0,32.0,,2.0,,,33.88568028808198,44.11434558724835,18.731117824773413,16.442953020134222,13.489322916666664,38.70050236406619,71.83584001560305
moonshotai/Kimi-K2-Instruct,Kimi,behavior,action_sequencing,1000.0,53.0,58.5,65.60000000000001,0,63.66,66.0,0.0,1.0,0.0,6.0,27.0,,3.0,15.5,93000.0,,,,,,,
openai/gpt-oss-120b,GPT-OSS,behavior,action_sequencing,120.4,53.54,52.15,63.45,0,60.5,64.64999999999999,1.01,0.0,0.0,2.02,32.32,,2.02,,,,,,,,,
openai/gpt-oss-20b,GPT-OSS,behavior,action_sequencing,21.5,35.0,28.000000000000004,48.5,0,42.9,46.0,37.0,0.0,0.0,2.0,13.0,,1.0,,,,,,,,,
