ID,dataset,action,llm,?A1=A2,?A1=A3+A4,?A1>A3,?A1>A4,?A3∅A4,?A4=A1|3,?A1=A1*,?A1=A1**,?A1*=A1**,J(A1-A2),J(A1-A34),J(A3-A4),J(A4-A1|3),J(A1-A1*),J(A1-A1**),J(A1*-A1**),?SC(A1=A2),?SC(A1>A3),?SC(A1>A4),?SC(A3∅A4),?SC(A4=A1|3),idk_A1,idk_A2,idk_A3,idk_A4,?A1=A1(ave),J_A1_ave,idk,?A1=A2(+),?A1=A2(-),J(1-2)+,J(1-2)-,?A1>A3(+),?A1>A3(-),?A1>A4(+),?A1>A4(-),?A3∅A4(+),?A3∅A4(-),J(3-4)+,J(3-4)-,?A1=A3+A4(+),?A1=A3+A4(-),J(1-34)+,J(1-34)-,p(A1=A2),p(A1=A3+A4),p(A1>A3),p(A1>A4),p(A3∅A4),p(A4=A1|3)
A,overall,zero-shot,llama3.1:8b,0.0167,0.0017,0.03,0.0117,0.8433,0.0,0.029249999999999998,,,0.1667,0.1173,0.0329,0.0644,0.21525,,,0.9383,0.8633,0.8733,0.285,0.9717,0.0083,0.0033,0.0233,0.0317,0.0135,0.1361,0.016649999999999998,0.11428571428571428,0.010619469026548672,0.2730942857142857,0.16006230088495574,0.029411764705882353,0.03007518796992481,0.0,0.013182674199623353,0.8347826086956521,0.845360824742268,0.03642608695652174,0.03208536082474227,0.0,0.0017152658662092624,0.16815882352941175,0.1157934819897084,1.0,1.0,1.0,1.0,1.0,1.0
B,overall,zero-shot,gpt-oss:20b,0.2167,0.1533,0.38,0.325,0.83,0.1,0.22705,,,0.4413,0.398,0.0775,0.2927,0.44435,,,0.2617,0.4033,0.3217,0.8033,0.16,0.12,0.1267,0.1067,0.1483,0.1824,0.3995,0.125425,0.22202486678507993,0.13513513513513514,0.44832984014209587,0.33379729729729724,0.38267148014440433,0.34782608695652173,0.3166089965397924,0.5454545454545454,0.8467153284671532,0.6538461538461539,0.0644669708029197,0.21456346153846154,0.15602836879432624,0.1111111111111111,0.4029003546099291,0.32116666666666666,1.0,1.0,1.0,1.0,1.0,1.0
C,overall,zero-shot,gpt-4.1-nano-2025-04-14,0.2783,0.1183,0.4567,0.3717,0.62,0.04,0.5852499999999999,,,0.5129,0.3949,0.1915,0.2037,0.68235,,,0.4567,0.5,0.5317,0.5,0.96,0.0683,0.085,0.1833,0.1667,0.2105,0.4039,0.125825,0.30843373493975906,0.21081081081081082,0.5487522891566265,0.4323335135135135,0.4449152542372881,0.4642857142857143,0.38306451612903225,0.36363636363636365,0.5962566844919787,0.6592920353982301,0.20640935828877008,0.1666973451327434,,0.11833333333333333,,0.39486116666666665,1.0,1.0,1.0,1.0,1.0,1.0
D,overall,zero-shot,mistral-small:24b,0.4407,0.2407,0.5034,0.4525,0.5085,0.0153,0.4301,,,0.6399,0.5192,0.3447,0.2057,0.6410499999999999,,,0.4898,0.4949,0.5458,0.5068,0.8729,0.2203,0.2271,0.3322,0.3881,0.5135,0.6899,0.291925,0.46210720887245843,0.20408163265306123,0.6567571164510166,0.4542408163265305,0.0,0.5042444821731749,0.42857142857142855,0.4528301886792453,0.5076660988074957,0.6666666666666666,0.344718398637138,0.3333333333333333,0.2857142857142857,0.23461538461538461,0.5582942857142856,0.5139790384615385,1.0,1.0,1.0,1.0,1.0,1.0
E,overall,zero-shot,llama3.1:70b,0.2067,0.07,0.2883,0.2383,0.71,0.0283,0.2033,,,0.4712,0.3543,0.1006,0.2067,0.4882,,,0.23,0.6817,0.6217,0.7117,0.0433,0.02,0.025,0.0283,0.06,,,0.033325,0.2106164383561644,0.0625,0.47690958904109587,0.26463749999999997,0.30434782608695654,0.2870036101083033,0.24074074074074073,0.2374429223744292,0.7126050420168067,0.4,0.1,0.17,0.0676818950930626,0.2222222222222222,0.3524453468697124,0.47390000000000004,1.0,1.0,1.0,1.0,1.0,1.0
F,overall,zero-shot,gemini-2.0-flash,0.3267,0.1217,0.415,0.3533,0.6267,0.05,0.48034999999999994,,,0.6008,0.4605,0.13,0.2914,0.7008000000000001,,,0.3733,0.445,0.6233,0.625,0.2333,0.0017,0.005,0.0033,0.0067,0.5407,0.7426,0.004175,0.332089552238806,0.28125,0.6044757462686567,0.569709375,0.41025641025641024,0.4318181818181818,0.4453125,0.3283898305084746,0.6345811051693404,0.5128205128205128,0.13011836007130123,0.12855128205128205,0.1160337552742616,0.14285714285714285,0.4620457805907173,0.4545468253968254,1.0,1.0,1.0,1.0,1.0,1.0
G,overall,zero-shot,gpt-4.1-mini-2025-04-14,0.3367,0.14,0.4817,0.3833,0.5633,0.055,0.28015,,,0.6296,0.5016,0.1574,0.2812,0.53845,,,0.3867,0.46,0.5917,0.5633,0.0867,0.025,0.025,0.0567,0.0867,0.2153,0.4823,0.048350000000000004,0.34798534798534797,0.2222222222222222,0.6418919413919414,0.5058537037037036,0.41545893719806765,0.5165394402035624,0.44,0.3684210526315789,0.563973063973064,0.5,0.15517878787878786,0.3772,0.13391304347826086,0.28,0.5026756521739131,0.47752799999999995,1.0,1.0,1.0,1.0,1.0,1.0
H,overall,zero-shot,gpt-4o,0.4517,0.255,0.5333,0.47,0.6283,0.0617,0.48985,,,0.6534,0.5385,0.2669,0.2612,0.52095,,,0.4533,0.49,0.535,0.6317,0.3767,0.2583,0.2533,0.2817,0.3983,0.3797,0.6086,0.29790000000000005,0.4497354497354497,0.48484848484848486,0.6583820105820105,0.5677454545454544,0.5368421052631579,0.5317073170731708,0.5205479452054794,0.4629981024667932,0.6308724832214765,0.25,0.26364412751677846,0.75,0.23785166240409208,0.28708133971291866,0.5503314578005115,0.5163626794258374,1.0,1.0,1.0,1.0,1.0,1.0
I,overall,zero-shot,gpt-4.1-2025-04-14,0.3967,0.165,0.4983,0.3867,0.6333,0.1017,0.2873,,,0.6771,0.5435,0.1118,0.3581,0.5949,,,0.4283,0.4983,0.5033,0.6383,0.1083,0.0167,0.0217,0.05,0.0633,0.2763,0.5689,0.037925,0.4074074074074074,0.21212121212121213,0.6917485008818343,0.4259424242424243,0.4983108108108108,0.5,0.42105263157894735,0.3076923076923077,0.6365159128978225,0.0,0.10899212730318257,0.6805666666666667,0.1644295302013423,0.25,0.5446303691275167,0.37377499999999997,1.0,1.0,1.0,1.0,1.0,1.0
J,overall,zero-shot,grok-3-mini,0.3433,0.2617,0.5233,0.4383,0.8767,0.23,0.3767,,,0.6346,0.5715,0.0561,0.4478,0.6309,,,0.3933,0.5217,0.44,0.88,0.2617,0.0567,0.07,0.075,0.1233,,,0.08125,0.3563636363636364,0.2,0.6489525454545455,0.477314,0.523725834797891,0.5161290322580645,0.4369602763385147,0.47619047619047616,0.8873720136518771,0.42857142857142855,0.050362457337883956,0.2975428571428571,0.2685025817555938,0.05263157894736842,0.5792645438898452,0.3328368421052632,1.0,1.0,1.0,1.0,1.0,1.0
K,overall,zero-shot,deepseek-chat,0.325,0.165,0.455,0.4017,0.56,0.0733,0.3881,,,0.5703,0.4662,0.1661,0.2573,0.6082000000000001,,,0.3483,0.5033,0.5433,0.56,0.1483,0.0617,0.0733,0.14,0.18,0.2729,0.5227,0.11375,0.3298611111111111,0.20833333333333334,0.5765803819444444,0.4197916666666666,0.4413145539906103,0.4625322997416021,0.368,0.4105263157894737,0.5604026845637584,0.5,0.16629882550335573,0.140625,0.15837937384898712,0.22807017543859648,0.46725064456721915,0.45645789473684206,1.0,1.0,1.0,1.0,1.0,1.0
L,overall,zero-shot,gemini-2.5-flash,0.335,0.2467,0.495,0.415,0.845,0.2183,0.37685,,,0.6101,0.5398,0.036,0.4382,0.64855,,,0.4183,0.5083,0.4183,0.84,0.2567,0.045,0.04,0.06,0.0683,0.4237,0.6615,0.053325,0.35714285714285715,0.1951219512195122,0.6282980694980694,0.49529878048780485,0.5017421602787456,0.34615384615384615,0.4134948096885813,0.45454545454545453,0.8537005163511188,0.5789473684210527,0.030123924268502585,0.2145263157894737,0.2530541012216405,0.1111111111111111,0.5482260034904014,0.36104444444444445,1.0,1.0,1.0,1.0,1.0,1.0
M,overall,zero-shot,gpt-5-nano,0.59,0.4367,0.64,0.6317,0.6767,0.1733,0.33485,,,0.7228,0.633,0.2988,0.3852,0.4209,,,0.5783,0.6133,0.6267,0.6583,0.245,0.4133,0.4267,0.3617,0.47,0.5797,0.7458,0.41792500000000005,0.5916515426497277,0.5714285714285714,0.7257918330308529,0.689430612244898,0.6490196078431373,0.5888888888888889,0.6411657559198543,0.5294117647058824,0.6892655367231638,0.5797101449275363,0.28559152542372884,0.4001608695652174,0.43327239488117003,0.4716981132075472,0.6283680073126143,0.6806415094339624,1.0,1.0,1.0,1.0,1.0,1.0
N,overall,zero-shot,deepseek-reasoner,0.1867,0.135,0.4017,0.3017,0.82,0.1083,0.2995,,,0.461,0.4102,0.0396,0.3115,0.5185500000000001,,,0.2817,0.4183,0.315,0.7967,0.165,0.0333,0.0467,0.035,0.0533,0.1723,0.4072,0.042075,0.20075046904315197,0.07462686567164178,0.48127504690431516,0.29982089552238805,0.4072164948453608,0.2222222222222222,0.3017241379310345,0.3,0.8200692041522492,0.8181818181818182,0.03940397923875433,0.04500909090909091,0.1431095406360424,0.0,0.41895671378091875,0.26439411764705884,1.0,1.0,1.0,1.0,1.0,1.0
O,overall,zero-shot,gemini-2.5-pro,0.31,0.2233,0.4367,0.3883,0.815,0.195,0.36885,,,0.6314,0.5665,0.0325,0.4698,0.66435,,,0.3867,0.4367,0.3967,0.8167,0.2333,0.0267,0.0233,0.04,0.0433,0.3644,0.6772,0.033325,0.33394160583941607,0.057692307692307696,0.6490443430656934,0.4458115384615384,0.4351535836177474,0.5,0.3862433862433862,0.42424242424242425,0.8206429780033841,0.4444444444444444,0.02945245346869712,0.23574444444444445,0.2322357019064125,0.0,0.5735788561525129,0.38996956521739135,1.0,1.0,1.0,1.0,1.0,1.0
P,overall,zero-shot,gpt-5-mini,0.6533,0.4783,0.6383,0.6817,0.615,0.1433,0.6314500000000001,,,0.7718,0.6709,0.3593,0.3725,0.7622,,,0.6333,0.63,0.6833,0.635,0.1917,0.485,0.465,0.41,0.5267,0.6729,0.7934,0.47167499999999996,0.6586715867158671,0.603448275862069,0.7772850553505535,0.721003448275862,0.6380789022298456,0.6470588235294118,0.686541737649063,0.46153846153846156,0.631578947368421,0.3,0.34176122807017545,0.6916666666666667,0.47285464098073554,0.5862068965517241,0.6680252189141856,0.7275965517241381,1.0,1.0,1.0,1.0,1.0,1.0
Q,overall,zero-shot,o3,0.3441,0.2288,0.5068,0.3864,0.8661,0.1915,0.31420000000000003,,,0.6246,0.5426,0.0553,0.4383,0.5892,,,0.3932,0.4949,0.3966,0.8492,0.2542,0.0898,0.0983,0.0864,0.0847,0.3034,0.575,0.08979999999999999,0.3583180987202925,0.16279069767441862,0.6396588665447899,0.4324162790697674,0.5008726003490401,0.7058823529411765,0.3888888888888889,0.2857142857142857,0.8754448398576512,0.6785714285714286,0.04982722419928825,0.16460357142857143,0.23869801084990958,0.08108108108108109,0.5525264014466547,0.3948270270270271,1.0,1.0,1.0,1.0,1.0,1.0
R,overall,zero-shot,gpt-5,0.61,0.4317,0.6417,0.65,0.735,0.2167,0.58595,,,0.7816,0.6814,0.2211,0.4816,0.76895,,,0.6083,0.635,0.6467,0.7567,0.2683,0.3567,0.3433,0.2533,0.3267,0.6136,0.7845,0.32,0.6206261510128913,0.5087719298245614,0.7890513812154696,0.7101964912280702,0.6416382252559727,0.6428571428571429,0.6508474576271186,0.6,0.7538726333907056,0.15789473684210525,0.20501153184165233,0.7130684210526317,0.42680776014109345,0.5151515151515151,0.6833717813051146,0.6470454545454545,1.0,1.0,1.0,1.0,1.0,1.0
