,model_id,eval dataset,Performance
0,checkpoint-738000,alpaca,0.0
1,checkpoint-342000,alpaca,0.0
2,olmo1b_ckpt505000_tulu_5epoch_2e-6,alpaca,1.8012422360248446
3,olmo1b_ckpt342000_tulu_5epoch_2e-6,alpaca,1.4303482587064678
4,checkpoint-424000,alpaca,0.0
5,olmo1b_ckpt424000_tulu_5epoch_2e-6,alpaca,1.5527950310559007
6,checkpoint-592000,alpaca,0.0
7,checkpoint-505000,alpaca,0.0
8,checkpoint-342000_cot_chat,gsm,0.03
9,checkpoint-738000,gsm,0.025
10,checkpoint-342000,gsm,0.035
11,checkpoint-738000_cot_chat,gsm,0.025
12,olmo1b_tulu_5epoch_2e-6,gsm,0.03
13,olmo1b_ckpt505000_tulu_5epoch_2e-6,gsm,0.03
14,olmo1b_ckpt342000_tulu_5epoch_2e-6,gsm,0.03
15,checkpoint-424000,gsm,0.01
16,olmo1b_ckpt342000_tulu_5epoch_2e-6_cot_chat,gsm,0.06
17,olmo1b_ckpt424000_tulu_5epoch_2e-6,gsm,0.03
18,checkpoint-18000,gsm,0.0
19,checkpoint-592000_cot_chat,gsm,0.0
20,checkpoint-505000_cot_chat,gsm,0.035
21,checkpoint-424000_cot_chat,gsm,0.02
22,checkpoint-592000,gsm,0.02
23,olmo1b_original,gsm,0.02
24,checkpoint-1000,gsm,0.015
25,checkpoint-18000_cot_chat,gsm,0.02
26,checkpoint-1000_cot_chat,gsm,0.005
27,olmo1b_ckpt424000_tulu_5epoch_2e-6_cot_chat,gsm,0.06
28,checkpoint-505000,gsm,0.025
29,olmo1b_ckpt738000_tulu_5epoch_2e-6,gsm,0.035
30,olmo1b_ckpt592000_tulu_5epoch_2e-6_cot_chat,gsm,0.055
31,olmo1b_ckpt505000_tulu_5epoch_2e-6_cot_chat,gsm,0.06
32,olmo1b_original_cot_chat,gsm,0.015
33,olmo1b_ckpt592000_tulu_5epoch_2e-6,gsm,0.03
34,olmo1b_tulu_5epoch_2e-6_cot_chat,gsm,0.06
35,olmo1b_ckpt738000_tulu_5epoch_2e-6_cot_chat,gsm,0.045
36,checkpoint-738000,mmlu,0.24312775957840763
37,checkpoint-342000,mmlu,0.23429710867397807
38,olmo1b_tulu_5epoch_2e-6,mmlu,0.2673408346389403
39,olmo1b_ckpt505000_tulu_5epoch_2e-6,mmlu,0.2673408346389403
40,olmo1b_ckpt342000_tulu_5epoch_2e-6,mmlu,0.2673408346389403
41,checkpoint-424000,mmlu,0.24868252385700043
42,olmo1b_ckpt424000_tulu_5epoch_2e-6,mmlu,0.2673408346389403
43,checkpoint-18000,mmlu,0.2332288847742487
44,olmo1b_ckpt1000_tulu_5epoch_2e-6,mmlu,0.24248682523857001
45,checkpoint-592000,mmlu,0.24825523429710866
46,olmo1b_original,mmlu,0.24854009400370317
47,checkpoint-1000,mmlu,0.23201823102122204
48,checkpoint-505000,mmlu,0.2503916820965674
49,olmo1b_ckpt738000_tulu_5epoch_2e-6,mmlu,0.2699045719982908
50,olmo1b_ckpt592000_tulu_5epoch_2e-6,mmlu,0.26520438683948155
51,checkpoint-738000,truthfulqa,0.576499388004896
52,checkpoint-342000,truthfulqa,0.6462668298653611
53,olmo1b_tulu_5epoch_2e-6,truthfulqa,0.401468788249694
54,olmo1b_ckpt505000_tulu_5epoch_2e-6,truthfulqa,0.401468788249694
55,olmo1b_ckpt342000_tulu_5epoch_2e-6,truthfulqa,0.401468788249694
56,checkpoint-424000,truthfulqa,0.598531211750306
57,olmo1b_ckpt424000_tulu_5epoch_2e-6,truthfulqa,0.401468788249694
58,checkpoint-18000,truthfulqa,0.1346389228886169
59,checkpoint-592000,truthfulqa,0.565483476132191
60,olmo1b_original,truthfulqa,0.47613219094247244
61,checkpoint-1000,truthfulqa,0.033047735618115054
62,checkpoint-505000,truthfulqa,0.408812729498164
63,olmo1b_ckpt738000_tulu_5epoch_2e-6,truthfulqa,0.3072215422276622
64,olmo1b_ckpt592000_tulu_5epoch_2e-6,truthfulqa,0.31701346389228885
65,checkpoint-738000,toxigen,0.6905714285714286
66,checkpoint-342000,toxigen,0.7352857142857143
67,olmo1b_tulu_5epoch_2e-6,toxigen,0.6282857142857143
68,olmo1b_ckpt505000_tulu_5epoch_2e-6,toxigen,0.6282857142857143
69,olmo1b_ckpt342000_tulu_5epoch_2e-6,toxigen,0.6282857142857143
70,checkpoint-424000,toxigen,0.7507142857142857
71,olmo1b_ckpt424000_tulu_5epoch_2e-6,toxigen,0.6282857142857143
72,checkpoint-18000,toxigen,0.6125714285714285
73,checkpoint-592000,toxigen,0.6744285714285714
74,olmo1b_original,toxigen,0.7194285714285714
75,checkpoint-1000,toxigen,0.10714285714285714
76,checkpoint-505000,toxigen,0.6992857142857143
77,olmo1b_ckpt738000_tulu_5epoch_2e-6,toxigen,0.6104285714285714
78,olmo1b_ckpt592000_tulu_5epoch_2e-6,toxigen,0.6181428571428571
