llm,action,?A1=A1*,J(A1-A1*),?A1=A2,?A1>A3,?A1>A4,?A3∅A4,?A4=A1|3,ave_?A,J(A1-A2),J(A3-A4),J(A4-A1|3),idk
llama3.1:8b,zero-shot,0.0292499999999999,0.21525,0.0167,0.03,0.0117,0.8433,0.0,0.18034,0.1667,0.0329,0.0644,0.0166499999999999
llama3.1:8b,classification,0.045,0.2944,0.1633,1.0,0.3383,0.6817,0.05,0.44665999999999995,0.5603,0.0974,0.3899,0.250825
llama3.1:8b,fixing,0.0217,0.1659,0.1267,0.155,0.1233,0.6183,0.0183,0.20831999999999998,0.4549,0.1126,0.2414,0.0021
gpt-oss:20b,zero-shot,0.22705,0.44435,0.2167,0.38,0.325,0.83,0.1,0.37034,0.4413,0.0775,0.2927,0.125425
gpt-oss:20b,classification,0.2717,0.4892,0.6717,1.0,0.7317,0.8467,0.38,0.7260199999999999,0.7939,0.1073,0.6099,0.370425
gpt-oss:20b,fixing,0.2433,0.4946,0.5833,0.8067,0.9233,0.9533,0.62,0.77732,0.765,0.0397,0.7864,0.13335
gpt-4.1-nano-2025-04-14,zero-shot,0.5852499999999999,0.68235,0.2783,0.4567,0.3717,0.62,0.04,0.35334000000000004,0.5129,0.1915,0.2037,0.125825
gpt-4.1-nano-2025-04-14,classification,0.96,0.9608,0.9733,0.7167,0.5667,0.3817,0.0,0.5276799999999999,0.9757,0.6149,0.0017,0.808725
gpt-4.1-nano-2025-04-14,fixing,0.3483,0.4852,0.5467,0.5917,0.6017,0.6183,0.215,0.51468,0.6237,0.2257,0.3509,0.1937499999999999
mistral-small:24b,zero-shot,0.4301,0.6410499999999999,0.4407,0.5034,0.4525,0.5085,0.0153,0.38408,0.6399,0.3447,0.2057,0.291925
mistral-small:24b,classification,0.3467,0.5922,0.845,1.0,0.8067,0.7983,0.4233,0.7746600000000001,0.9257,0.1774,0.6371,0.335825
mistral-small:24b,fixing,0.5233,0.7518,0.72,0.7467,0.9,0.9583,0.5017,0.7653399999999999,0.8296,0.0148,0.6782,0.1554249999999999
llama3.1:70b,zero-shot,0.2033,0.4882,0.2067,0.2883,0.2383,0.71,0.0283,0.29432,0.4712,0.1006,0.2067,0.033325
llama3.1:70b,classification,0.2033,0.4882,0.8033,1.0,0.8633,0.935,0.5617,0.83266,0.9071,0.0295,0.812,0.26915
llama3.1:70b,fixing,0.21,0.4702,0.61,0.7267,0.8217,0.935,0.5067,0.72002,0.7529,0.0337,0.7635,0.0504
gemini-2.0-flash,zero-shot,0.4803499999999999,0.7008000000000001,0.3267,0.415,0.3533,0.6267,0.05,0.35434,0.6008,0.13,0.2914,0.004175
gemini-2.0-flash,classification,0.42,0.659,0.8817,1.0,0.945,0.6433,0.5383,0.80166,0.9384,0.2782,0.6268,0.405825
gemini-2.0-flash,fixing,0.775,0.8706,0.79,0.8417,0.92,0.9667,0.7383,0.8513399999999999,0.8233,0.0137,0.8415,0.029575
gpt-4.1-mini-2025-04-14,zero-shot,0.28015,0.53845,0.3367,0.4817,0.3833,0.5633,0.055,0.364,0.6296,0.1574,0.2812,0.04835
gpt-4.1-mini-2025-04-14,classification,0.345,0.5946,0.8983,0.64,0.9483,0.7717,0.64,0.77966,0.9573,0.2183,0.7241,0.2325249999999999
gpt-4.1-mini-2025-04-14,fixing,0.445,0.7114,0.775,0.8767,0.9633,0.8967,0.6883,0.8400000000000001,0.8881,0.061,0.7953,0.0875249999999999
gpt-4o,zero-shot,0.48985,0.52095,0.4517,0.5333,0.47,0.6283,0.0617,0.429,0.6534,0.2669,0.2612,0.2979
gpt-4o,classification,0.6,0.4333,0.9733,0.9933,0.9833,0.3633,0.3067,0.7239800000000001,0.5867,0.6279,0.3508,0.66665
gpt-4o,fixing,0.54,0.5486,0.8533,0.9183,0.9483,0.865,0.6783,0.8526400000000001,0.7343,0.1209,0.5879,0.33875
gpt-4.1-2025-04-14,zero-shot,0.2873,0.5949,0.3967,0.4983,0.3867,0.6333,0.1017,0.40334000000000003,0.6771,0.1118,0.3581,0.037925
gpt-4.1-2025-04-14,classification,0.2983,0.6209,0.9183,0.7033,0.975,0.87,0.7483,0.8429800000000001,0.9754,0.1174,0.8484,0.1374999999999999
gpt-4.1-2025-04-14,fixing,0.5217,0.7766,0.825,0.9183,0.945,0.9533,0.7483,0.87798,0.9207,0.0309,0.8364,0.06625
grok-3-mini,zero-shot,0.3767,0.6309,0.3433,0.5233,0.4383,0.8767,0.23,0.48231999999999997,0.6346,0.0561,0.4478,0.08125
grok-3-mini,classification,0.3767,0.6309,0.9083,1.0,0.9,0.8667,0.6683,0.86866,0.9639,0.1277,0.7869,0.350825
grok-3-mini,fixing,0.4083,0.6373,0.8817,0.9267,0.985,0.955,0.81,0.9116800000000002,0.9363,0.04,0.7916,0.134575
deepseek-chat,zero-shot,0.3881,0.6082000000000001,0.325,0.455,0.4017,0.56,0.0733,0.363,0.5703,0.1661,0.2573,0.11375
deepseek-chat,classification,0.5033,0.6937,0.945,0.8467,0.9117,0.6733,0.5417,0.7836800000000002,0.9722,0.3101,0.6274,0.3067
deepseek-chat,fixing,0.4883,0.714,0.8167,0.8883,0.95,0.9467,0.71,0.86234,0.9117,0.0396,0.798,0.1
gemini-2.5-flash,zero-shot,0.37685,0.64855,0.335,0.495,0.415,0.845,0.2183,0.46166,0.6101,0.036,0.4382,0.053325
gemini-2.5-flash,classification,0.33,0.6356,0.8983,1.0,0.9683,0.9067,0.8483,0.92432,0.9383,0.0882,0.8392,0.3196
gemini-2.5-flash,fixing,0.6817,0.7806,0.8917,0.9017,0.9533,0.9567,0.855,0.9116800000000002,0.9256,0.0408,0.8349,0.080025
gpt-5-nano,zero-shot,0.33485,0.4209,0.59,0.64,0.6317,0.6767,0.1733,0.5423399999999999,0.7228,0.2988,0.3852,0.417925
gpt-5-nano,classification,0.09,0.096,0.7783,1.0,0.7683,0.195,0.005,0.5493199999999999,0.7816,0.8039,0.0782,0.310025
gpt-5-nano,fixing,0.625,0.748,0.8417,0.8317,0.9717,0.7783,0.5333,0.79134,0.8965,0.214,0.5845,0.4125
deepseek-reasoner,zero-shot,0.2995,0.5185500000000001,0.1867,0.4017,0.3017,0.82,0.1083,0.36368000000000006,0.461,0.0396,0.3115,0.042075
deepseek-reasoner,classification,0.4267,0.6299,0.785,0.7533,0.8533,0.6967,0.515,0.72066,0.824,0.2966,0.5637,0.2854
deepseek-reasoner,fixing,0.2033,0.4582,0.6833,0.7567,0.8633,0.93,0.5833,0.76332,0.8221,0.0537,0.7955,0.0625
gemini-2.5-pro,zero-shot,0.36885,0.66435,0.31,0.4367,0.3883,0.815,0.195,0.42899999999999994,0.6314,0.0325,0.4698,0.033325
gemini-2.5-pro,classification,0.3733,0.6515,0.8533,1.0,0.88,0.93,0.7133,0.87532,0.8978,0.0622,0.7902,0.3299999999999999
gemini-2.5-pro,fixing,0.4717,0.6247,0.7733,0.8017,0.9217,0.9783,0.7483,0.84466,0.8197,0.0206,0.8089,0.105
gpt-5-mini,zero-shot,0.6314500000000001,0.7622,0.6533,0.6383,0.6817,0.615,0.1433,0.5463199999999999,0.7718,0.3593,0.3725,0.4716749999999999
gpt-5-mini,classification,0.59,0.731,0.88,1.0,0.75,0.6733,0.34,0.72866,0.9258,0.3267,0.4464,0.550825
gpt-5-mini,fixing,0.68,0.7839,0.8683,0.9167,0.95,0.7367,0.545,0.80334,0.9104,0.2617,0.5503,0.5004000000000001
o3,zero-shot,0.3142,0.5892,0.3441,0.5068,0.3864,0.8661,0.1915,0.45898000000000005,0.6246,0.0553,0.4383,0.0897999999999999
o3,classification,0.325,0.6034,0.82,0.8833,0.9133,0.925,0.7467,0.8576599999999999,0.9324,0.0674,0.8397,0.265825
o3,fixing,0.3783,0.5603,0.7367,0.92,0.945,0.9733,0.7717,0.86934,0.8218,0.0244,0.8082,0.13875
gpt-5,zero-shot,0.58595,0.76895,0.61,0.6417,0.65,0.735,0.2167,0.57068,0.7816,0.2211,0.4816,0.32
gpt-5,classification,0.5583,0.7534,0.9233,1.0,0.965,0.7067,0.5983,0.8386600000000002,0.9663,0.292,0.6683,0.475
gpt-5,fixing,0.645,0.8059,0.9067,0.94,0.9817,0.79,0.6533,0.85434,0.9611,0.21,0.6917,0.34585
