model,bin,n_train_correct,mean_holdout_exact,mean_holdout_fp,mean_holdout_fn
grok4,<=+1,144,0.9813,0.0021,0.0021
grok4,+2..+25,22,0.5000,0.0976,0.0450
grok4,>+25,15,0.2667,0.1269,0.0727
gpt-5.2,<=+1,39,0.9026,0.0293,0.0006
gpt-5.2,+2..+25,28,0.3429,0.1489,0.0539
gpt-5.2,>+25,89,0.0899,0.1936,0.1729
grok4.1fast,<=+1,54,0.9407,0.0133,0.0041
grok4.1fast,+2..+25,6,0.2000,0.3250,0.0470
grok4.1fast,>+25,0,,,
gemini-3-pro-preview,<=+1,30,0.7800,0.0615,0.0098
gemini-3-pro-preview,+2..+25,20,0.2700,0.1757,0.0661
gemini-3-pro-preview,>+25,3,0.2000,0.1413,0.1074
deepseek-reasoner,<=+1,30,0.9533,0.0099,
deepseek-reasoner,+2..+25,9,0.3778,0.1457,0.0397
deepseek-reasoner,>+25,0,,,
claude-opus-4-5-20251101,<=+1,25,0.7840,0.0586,0.0043
claude-opus-4-5-20251101,+2..+25,7,0.3143,0.1423,0.0429
claude-opus-4-5-20251101,>+25,0,,,
hermes4,<=+1,10,1.0000,,
hermes4,+2..+25,0,,,
hermes4,>+25,0,,,
