model,bin,n_train_correct,mean_holdout_exact,mean_holdout_fp,mean_holdout_fn
grok4,<=+1,111,0.0847,0.3037,0.0677
grok4,+2..+25,40,0.0250,0.2807,0.0900
grok4,>+25,5,0.0000,0.3218,0.1152
gpt-5.2,<=+1,92,0.0348,0.3155,0.0746
gpt-5.2,+2..+25,54,0.0120,0.2549,0.1390
gpt-5.2,>+25,19,0.0000,0.2344,0.1462
grok4.1fast,<=+1,100,0.0520,0.3160,0.0654
grok4.1fast,+2..+25,21,0.0190,0.2014,0.1295
grok4.1fast,>+25,0,,,
gemini-3-pro-preview,<=+1,92,0.0652,0.2687,0.0905
gemini-3-pro-preview,+2..+25,18,0.0111,0.2825,0.0712
gemini-3-pro-preview,>+25,0,,,
deepseek-reasoner,<=+1,67,0.0597,0.3238,0.0611
deepseek-reasoner,+2..+25,16,0.0000,0.1956,0.1223
deepseek-reasoner,>+25,0,,,
claude-opus-4-5-20251101,<=+1,58,0.0414,0.3080,0.0798
claude-opus-4-5-20251101,+2..+25,10,0.0200,0.3301,0.0459
claude-opus-4-5-20251101,>+25,0,,,
hermes4,<=+1,5,0.0000,0.4000,0.0836
hermes4,+2..+25,0,,,
hermes4,>+25,0,,,
gpt-4o,<=+1,1,0.0000,0.2739,0.1094
gpt-4o,+2..+25,0,,,
gpt-4o,>+25,0,,,
