Metric,Model,Correlation,p_value
?A1=A2,deepseek-chat,0.6198,0.574423
?A1=A2,deepseek-reasoner,0.6056,0.585858
?A1=A2,gemini-2.0-flash,0.6486,0.550687
?A1=A2,gemini-2.5-flash,0.5281,0.645842
?A1=A2,gemini-2.5-pro,0.7757,0.434804
?A1=A2,gpt-4.1-2025-04-14,0.8183,0.389793
?A1=A2,gpt-4.1-mini-2025-04-14,0.803,0.406432
?A1=A2,gpt-4.1-nano-2025-04-14,0.9546,0.192527
?A1=A2,gpt-4o,0.7513,0.45883
?A1=A2,gpt-5,0.6626,0.538888
?A1=A2,gpt-5-mini,0.8097,0.399226
?A1=A2,gpt-5-nano,-0.327,0.787891
?A1=A2,gpt-oss:20b,0.639,0.558659
?A1=A2,grok-3-mini,0.6821,0.52211
?A1=A2,llama3.1:70b,0.7842,0.426108
?A1=A2,llama3.1:8b,0.6464,0.55257
?A1=A2,mistral-small:24b,0.0078,0.995051
?A1=A2,o3,0.8233,0.38427
?A1=A3+A4,deepseek-chat,0.5422,0.635148
?A1=A3+A4,deepseek-reasoner,0.7669,0.443597
?A1=A3+A4,gemini-2.0-flash,0.6228,0.571962
?A1=A3+A4,gemini-2.5-flash,0.5636,0.618808
?A1=A3+A4,gemini-2.5-pro,0.6905,0.514761
?A1=A3+A4,gpt-4.1-2025-04-14,0.788,0.422235
?A1=A3+A4,gpt-4.1-mini-2025-04-14,0.7799,0.430564
?A1=A3+A4,gpt-4.1-nano-2025-04-14,0.9381,0.225131
?A1=A3+A4,gpt-4o,0.7446,0.4653
?A1=A3+A4,gpt-5,0.6571,0.543601
?A1=A3+A4,gpt-5-mini,0.3946,0.74174
?A1=A3+A4,gpt-5-nano,-0.3966,0.740414
?A1=A3+A4,gpt-oss:20b,0.1714,0.890316
?A1=A3+A4,grok-3-mini,0.5754,0.609687
?A1=A3+A4,llama3.1:70b,0.6133,0.579665
?A1=A3+A4,llama3.1:8b,0.9335,0.233392
?A1=A3+A4,mistral-small:24b,-0.1671,0.893129
?A1=A3+A4,o3,0.7349,0.47444
?A1>A3,deepseek-chat,0.5226,0.649918
?A1>A3,deepseek-reasoner,0.801,0.408636
?A1>A3,gemini-2.0-flash,0.7333,0.475961
?A1>A3,gemini-2.5-flash,0.6714,0.531375
?A1>A3,gemini-2.5-pro,0.8949,0.294517
?A1>A3,gpt-4.1-2025-04-14,0.2407,0.845252
?A1>A3,gpt-4.1-mini-2025-04-14,0.0807,0.948545
?A1>A3,gpt-4.1-nano-2025-04-14,0.9134,0.266836
?A1>A3,gpt-4o,0.7027,0.503924
?A1>A3,gpt-5,0.74,0.469628
?A1>A3,gpt-5-mini,0.8994,0.287956
?A1>A3,gpt-5-nano,-0.8784,0.317284
?A1>A3,gpt-oss:20b,0.7329,0.476343
?A1>A3,grok-3-mini,0.7531,0.457121
?A1>A3,llama3.1:70b,0.826,0.381249
?A1>A3,llama3.1:8b,0.9848,0.111022
?A1>A3,mistral-small:24b,0.2358,0.848476
?A1>A3,o3,0.6611,0.540211
?A1>A4,deepseek-chat,0.4043,0.735011
?A1>A4,deepseek-reasoner,0.5755,0.609625
?A1>A4,gemini-2.0-flash,0.554,0.626187
?A1>A4,gemini-2.5-flash,0.5402,0.636732
?A1>A4,gemini-2.5-pro,0.6291,0.566822
?A1>A4,gpt-4.1-2025-04-14,0.7417,0.468072
?A1>A4,gpt-4.1-mini-2025-04-14,0.643,0.555338
?A1>A4,gpt-4.1-nano-2025-04-14,0.4495,0.703185
?A1>A4,gpt-4o,0.6389,0.558807
?A1>A4,gpt-5,0.5916,0.597007
?A1>A4,gpt-5-mini,0.0979,0.937557
?A1>A4,gpt-5-nano,0.0584,0.962795
?A1>A4,gpt-oss:20b,0.1803,0.884583
?A1>A4,grok-3-mini,0.5333,0.641881
?A1>A4,llama3.1:70b,0.5967,0.592923
?A1>A4,llama3.1:8b,0.9277,0.243606
?A1>A4,mistral-small:24b,-0.4784,0.682418
?A1>A4,o3,0.6834,0.520974
?A3∅A4,deepseek-chat,-0.217,0.860742
?A3∅A4,deepseek-reasoner,-0.8521,0.350694
?A3∅A4,gemini-2.0-flash,-0.462,0.694293
?A3∅A4,gemini-2.5-flash,-0.0079,0.994969
?A3∅A4,gemini-2.5-pro,0.4435,0.707501
?A3∅A4,gpt-4.1-2025-04-14,0.5046,0.663312
?A3∅A4,gpt-4.1-mini-2025-04-14,0.3346,0.78283
?A3∅A4,gpt-4.1-nano-2025-04-14,-0.9994,0.022684
?A3∅A4,gpt-4o,-0.8277,0.379342
?A3∅A4,gpt-5,-0.685,0.51957
?A3∅A4,gpt-5-mini,0.343,0.777147
?A3∅A4,gpt-5-nano,0.9756,0.141004
?A3∅A4,gpt-oss:20b,-0.4341,0.71414
?A3∅A4,grok-3-mini,-0.4434,0.707516
?A3∅A4,llama3.1:70b,0.5475,0.631154
?A3∅A4,llama3.1:8b,-0.1828,0.882948
?A3∅A4,mistral-small:24b,-0.6139,0.579197
?A3∅A4,o3,0.335,0.78256
?A4=A1|3,deepseek-chat,0.241,0.845038
?A4=A1|3,deepseek-reasoner,0.4789,0.682048
?A4=A1|3,gemini-2.0-flash,0.2593,0.832996
?A4=A1|3,gemini-2.5-flash,0.5113,0.658317
?A4=A1|3,gemini-2.5-pro,0.6404,0.557577
?A4=A1|3,gpt-4.1-2025-04-14,0.7104,0.49707
?A4=A1|3,gpt-4.1-mini-2025-04-14,0.6077,0.584221
?A4=A1|3,gpt-4.1-nano-2025-04-14,-0.5668,0.616395
?A4=A1|3,gpt-4o,0.0172,0.98902
?A4=A1|3,gpt-5,0.5331,0.642038
?A4=A1|3,gpt-5-mini,0.3521,0.770935
?A4=A1|3,gpt-5-nano,0.7054,0.501556
?A4=A1|3,gpt-oss:20b,0.0273,0.982591
?A4=A1|3,grok-3-mini,0.4544,0.699729
?A4=A1|3,llama3.1:70b,0.624,0.570989
?A4=A1|3,llama3.1:8b,0.9077,0.275697
?A4=A1|3,mistral-small:24b,-0.4365,0.712422
?A4=A1|3,o3,0.6923,0.513227
J(A1-A2),deepseek-chat,0.5703,0.613624
J(A1-A2),deepseek-reasoner,0.4462,0.705545
J(A1-A2),gemini-2.0-flash,0.7756,0.434904
J(A1-A2),gemini-2.5-flash,0.5471,0.63145
J(A1-A2),gemini-2.5-pro,0.8613,0.33926
J(A1-A2),gpt-4.1-2025-04-14,0.8211,0.38676
J(A1-A2),gpt-4.1-mini-2025-04-14,0.7966,0.4133
J(A1-A2),gpt-4.1-nano-2025-04-14,0.9887,0.095786
J(A1-A2),gpt-4o,-0.7953,0.414679
J(A1-A2),gpt-5,0.6463,0.552663
J(A1-A2),gpt-5-mini,0.8348,0.371161
J(A1-A2),gpt-5-nano,0.1408,0.910037
J(A1-A2),gpt-oss:20b,0.5486,0.630346
J(A1-A2),grok-3-mini,0.7062,0.50078
J(A1-A2),llama3.1:70b,0.8055,0.403801
J(A1-A2),llama3.1:8b,0.6616,0.539738
J(A1-A2),mistral-small:24b,0.0413,0.973706
J(A1-A2),o3,0.9201,0.256212
J(A3-A4),deepseek-chat,0.8843,0.309293
J(A3-A4),deepseek-reasoner,0.9996,0.017585
J(A3-A4),gemini-2.0-flash,0.9012,0.285419
J(A3-A4),gemini-2.5-flash,0.9856,0.108057
J(A3-A4),gemini-2.5-pro,0.8914,0.299486
J(A3-A4),gpt-4.1-2025-04-14,0.3324,0.784317
J(A3-A4),gpt-4.1-mini-2025-04-14,0.6593,0.541677
J(A3-A4),gpt-4.1-nano-2025-04-14,1.0,0.005679
J(A3-A4),gpt-4o,0.9247,0.248598
J(A3-A4),gpt-5,0.9632,0.173191
J(A3-A4),gpt-5-mini,-0.1838,0.8823
J(A3-A4),gpt-5-nano,-0.9818,0.12164
J(A3-A4),gpt-oss:20b,0.8591,0.342079
J(A3-A4),grok-3-mini,0.9408,0.220219
J(A3-A4),llama3.1:70b,-0.5942,0.594955
J(A3-A4),llama3.1:8b,0.2787,0.820184
J(A3-A4),mistral-small:24b,0.7295,0.479524
J(A3-A4),o3,0.5106,0.658847
J(A4-A1|3),deepseek-chat,0.2121,0.863934
J(A4-A1|3),deepseek-reasoner,0.1824,0.883218
J(A4-A1|3),gemini-2.0-flash,0.1437,0.9082
J(A4-A1|3),gemini-2.5-flash,0.5272,0.646457
J(A4-A1|3),gemini-2.5-pro,0.6463,0.552635
J(A4-A1|3),gpt-4.1-2025-04-14,0.7252,0.483489
J(A4-A1|3),gpt-4.1-mini-2025-04-14,0.5592,0.622211
J(A4-A1|3),gpt-4.1-nano-2025-04-14,-0.8648,0.334848
J(A4-A1|3),gpt-4o,-0.1308,0.916508
J(A4-A1|3),gpt-5,0.5451,0.632966
J(A4-A1|3),gpt-5-mini,0.2678,0.82743
J(A4-A1|3),gpt-5-nano,0.8964,0.292291
J(A4-A1|3),gpt-oss:20b,0.1536,0.901841
J(A4-A1|3),grok-3-mini,0.6419,0.556292
J(A4-A1|3),llama3.1:70b,0.6068,0.584905
J(A4-A1|3),llama3.1:8b,0.8034,0.406094
J(A4-A1|3),mistral-small:24b,-0.3703,0.75851
J(A4-A1|3),o3,0.7667,0.44375
