,model_id,eval dataset,Performance
0,olmo1b_hf_ckpt424000_tulu_5epoch_2e-6,arc_challenge,0.38461538461538464
1,olmo1b_hf_ckpt18000_tulu_5epoch_2e-6,arc_challenge,0.3010033444816054
2,checkpoint-1000,hellaswag,0.2530372435769767
3,checkpoint-424000,hellaswag,0.6078470424218283
4,checkpoint-505000,qqp,0.4675735839722978
5,olmo1b_hf_ckpt505000_tulu_5epoch_2e-6,boolq,0.6987767584097859
6,olmo1b_hf_ckpt592000_tulu_5epoch_2e-6,boolq,0.6987767584097859
7,olmo1b_hf_ckpt342000_tulu_5epoch_2e-6,sciq,0.937
8,checkpoint-738000,qqp,0.5692555033391046
9,olmo1b_original_hf,arc_easy,0.5649122807017544
10,olmo1b_hf_ckpt592000_tulu_5epoch_2e-6,sciq,0.94
11,checkpoint-738000,hellaswag,0.6282613025293766
12,checkpoint-505000,sciq,0.873
13,checkpoint-424000,arc_easy,0.5789473684210527
14,olmo1b_hf_ckpt738000_tulu_5epoch_2e-6,arc_challenge,0.36454849498327757
15,checkpoint-505000,arc_easy,0.5736842105263158
16,olmo1b_hf_ckpt505000_tulu_5epoch_2e-6,sciq,0.941
17,olmo1b_hf_ckpt424000_tulu_5epoch_2e-6,sciq,0.944
18,olmo1b_hf_ckpt1000_tulu_5epoch_2e-6,arc_challenge,0.23745819397993312
19,checkpoint-738000,arc_easy,0.5754385964912281
20,olmo1b_hf_ckpt424000_tulu_5epoch_2e-6,arc_easy,0.6017543859649123
21,olmo1b_hf_ckpt738000_tulu_5epoch_2e-6,hellaswag,0.6288587930691097
22,checkpoint-738000,openbookqa,0.436
23,olmo1b_hf_ckpt18000_tulu_5epoch_2e-6,arc_easy,0.5140350877192983
24,olmo1b_hf_main_tulu_5epoch_2e-6,arc_challenge,0.35785953177257523
25,olmo1b_hf_ckpt592000_tulu_5epoch_2e-6,hellaswag,0.6272654849631547
26,checkpoint-738000,boolq,0.6165137614678899
27,checkpoint-592000,arc_easy,0.5701754385964912
28,olmo1b_hf_main_tulu_5epoch_2e-6,sciq,0.91
29,checkpoint-342000,hellaswag,0.5985859390559649
30,olmo1b_hf_ckpt592000_tulu_5epoch_2e-6,openbookqa,0.424
31,checkpoint-342000,rte,0.5703971119133574
32,checkpoint-424000,arc_challenge,0.3879598662207358
33,checkpoint-342000,qqp,0.5051694286420975
34,checkpoint-592000,hellaswag,0.6217884883489345
35,olmo1b_hf_ckpt18000_tulu_5epoch_2e-6,sciq,0.873
36,checkpoint-505000,arc_challenge,0.3712374581939799
37,olmo1b_hf_ckpt592000_tulu_5epoch_2e-6,arc_challenge,0.35785953177257523
38,olmo1b_hf_ckpt424000_tulu_5epoch_2e-6,openbookqa,0.436
39,checkpoint-342000,arc_challenge,0.3779264214046823
40,checkpoint-592000,openbookqa,0.426
41,checkpoint-424000,rte,0.5703971119133574
42,olmo1b_hf_main_tulu_5epoch_2e-6,boolq,0.6871559633027523
43,checkpoint-1000,rte,0.5270758122743683
44,olmo1b_hf_ckpt505000_tulu_5epoch_2e-6,arc_challenge,0.3612040133779264
45,olmo1b_original_hf,sciq,0.93
46,olmo1b_hf_ckpt342000_tulu_5epoch_2e-6,openbookqa,0.418
47,checkpoint-592000,qqp,0.5054909720504576
48,olmo1b_original_hf,arc_challenge,0.3377926421404682
49,checkpoint-342000,boolq,0.5122324159021406
50,olmo1b_hf_ckpt1000_tulu_5epoch_2e-6,boolq,0.5214067278287462
51,olmo1b_hf_ckpt18000_tulu_5epoch_2e-6,openbookqa,0.376
52,checkpoint-18000,rte,0.5776173285198556
53,checkpoint-18000,boolq,0.6204892966360857
54,checkpoint-505000,hellaswag,0.6104361680940051
55,olmo1b_hf_ckpt342000_tulu_5epoch_2e-6,hellaswag,0.6237801234813782
56,checkpoint-505000,boolq,0.5788990825688073
57,olmo1b_hf_ckpt738000_tulu_5epoch_2e-6,arc_easy,0.5789473684210527
58,checkpoint-18000,hellaswag,0.41983668591913964
59,checkpoint-424000,openbookqa,0.442
60,checkpoint-342000,openbookqa,0.422
61,checkpoint-505000,openbookqa,0.464
62,checkpoint-505000,rte,0.5270758122743683
63,checkpoint-738000,arc_challenge,0.35785953177257523
64,olmo1b_hf_ckpt1000_tulu_5epoch_2e-6,hellaswag,0.2544313881696873
65,olmo1b_hf_ckpt342000_tulu_5epoch_2e-6,boolq,0.6767584097859327
66,olmo1b_hf_main_tulu_5epoch_2e-6,arc_easy,0.5701754385964912
67,checkpoint-592000,sciq,0.867
68,checkpoint-342000,arc_easy,0.6
69,checkpoint-1000,openbookqa,0.266
70,checkpoint-592000,boolq,0.5856269113149847
71,checkpoint-18000,sciq,0.789
72,checkpoint-424000,sciq,0.93
73,olmo1b_hf_ckpt1000_tulu_5epoch_2e-6,openbookqa,0.316
74,olmo1b_hf_main_tulu_5epoch_2e-6,openbookqa,0.418
75,olmo1b_hf_ckpt1000_tulu_5epoch_2e-6,arc_easy,0.29473684210526313
76,checkpoint-342000,sciq,0.921
77,checkpoint-1000,arc_challenge,0.23076923076923078
78,olmo1b_hf_ckpt342000_tulu_5epoch_2e-6,arc_challenge,0.35785953177257523
79,olmo1b_hf_ckpt18000_tulu_5epoch_2e-6,hellaswag,0.43835889265086636
80,olmo1b_hf_ckpt738000_tulu_5epoch_2e-6,sciq,0.937
81,olmo1b_original_hf,openbookqa,0.434
82,olmo1b_hf_ckpt424000_tulu_5epoch_2e-6,boolq,0.6785932721712539
83,olmo1b_hf_main_tulu_5epoch_2e-6,hellaswag,0.6294562836088429
84,checkpoint-424000,boolq,0.482262996941896
85,olmo1b_hf_ckpt424000_tulu_5epoch_2e-6,hellaswag,0.6237801234813782
86,olmo1b_hf_ckpt505000_tulu_5epoch_2e-6,openbookqa,0.462
87,checkpoint-592000,rte,0.5270758122743683
88,olmo1b_hf_ckpt738000_tulu_5epoch_2e-6,openbookqa,0.434
89,checkpoint-1000,boolq,0.5370030581039755
90,olmo1b_original_hf,boolq,0.6174311926605505
91,checkpoint-738000,sciq,0.878
92,olmo1b_hf_ckpt18000_tulu_5epoch_2e-6,boolq,0.5764525993883792
93,olmo1b_hf_ckpt592000_tulu_5epoch_2e-6,arc_easy,0.5807017543859649
94,olmo1b_hf_ckpt342000_tulu_5epoch_2e-6,arc_easy,0.5912280701754385
95,checkpoint-1000,sciq,0.382
96,olmo1b_hf_ckpt505000_tulu_5epoch_2e-6,arc_easy,0.6070175438596491
97,checkpoint-592000,arc_challenge,0.35785953177257523
98,checkpoint-1000,arc_easy,0.2719298245614035
99,checkpoint-18000,arc_challenge,0.26421404682274247
100,olmo1b_hf_ckpt738000_tulu_5epoch_2e-6,boolq,0.6978593272171254
101,checkpoint-18000,arc_easy,0.4421052631578947
102,olmo1b_hf_ckpt1000_tulu_5epoch_2e-6,sciq,0.485
103,checkpoint-18000,openbookqa,0.384
104,checkpoint-738000,rte,0.5487364620938628
105,olmo1b_original_hf,hellaswag,0.6280621390161323
106,olmo1b_hf_ckpt505000_tulu_5epoch_2e-6,hellaswag,0.6313483369846644
