# Settings that sit outside the per-run `sweep` list below.
# NOTE(review): batch_size is nested under `training` while the save_* keys are
# top-level; confirm that this matches the schema the consumer expects.
training:
  batch_size: 512 # global_train_batch_size
# Checkpoint cadence and retention.
# NOTE(review): intervals are presumably counted in training steps — confirm.
save_interval: 1000
save_interval_unsharded: 100000
save_num_checkpoints_to_keep: 1
save_num_unsharded_checkpoints_to_keep: 1
# One list entry per training run. Entries are grouped by total_flops, and
# within a group d_model increases from one entry to the next.
sweep:
# Budget group: total_flops = 200000000000000000 (2e17).
# 3 entries, d_model 256 through 384.
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  # 184 == 1847 // 10: one tenth of max_duration, rounded down.
  eval_interval: 184
  # NOTE(review): presumably counted in optimizer steps, since
  # 1847 * 512 (training.batch_size) * 512 (context_length) == tokens below.
  max_duration: 1847
  model:
    context_length: 512
    d_model: 256
    # In this entry n_heads and n_layers both equal d_model / 64.
    n_heads: 4
    n_layers: 4
  params: 68817152
  # tokens / params (484179968 / 68817152).
  ratio: 7.0357455071665855
  scheduler:
    # 461 == 1847 // 4: one quarter of max_duration, rounded down.
    t_warmup: 461
  tokens: 484179968
  # NOTE(review): 6 * params * tokens comes to roughly 2.0e17 here, so the
  # budget looks like a 6*N*D estimate — confirm against the generator.
  total_flops: 200000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 144
  max_duration: 1441
  model:
    context_length: 512
    d_model: 320
    n_heads: 5
    n_layers: 5
  params: 88234560
  ratio: 4.281196664889586
  scheduler:
    t_warmup: 360
  tokens: 377749504
  total_flops: 200000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 116
  max_duration: 1165
  model:
    context_length: 512
    d_model: 384
    n_heads: 6
    n_layers: 6
  params: 109127040
  ratio: 2.798552586050167
  scheduler:
    t_warmup: 291
  tokens: 305397760
  total_flops: 200000000000000000
# Budget group: total_flops = 437344829577312000 (~4.37e17).
# 6 entries, d_model 256 through 576.
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 404
  max_duration: 4040
  model:
    context_length: 512
    d_model: 256
    n_heads: 4
    n_layers: 4
  params: 68817152
  ratio: 15.3895028960222
  scheduler:
    t_warmup: 1010
  tokens: 1059061760
  total_flops: 437344829577312000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 315
  max_duration: 3151
  model:
    context_length: 512
    d_model: 320
    n_heads: 5
    n_layers: 5
  params: 88234560
  ratio: 9.361589653759253
  scheduler:
    t_warmup: 787
  tokens: 826015744
  total_flops: 437344829577312000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 254
  max_duration: 2548
  model:
    context_length: 512
    d_model: 384
    n_heads: 6
    n_layers: 6
  params: 109127040
  ratio: 6.120782823395558
  scheduler:
    t_warmup: 637
  tokens: 667942912
  total_flops: 437344829577312000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 210
  max_duration: 2109
  model:
    context_length: 512
    d_model: 448
    n_heads: 7
    n_layers: 7
  params: 131789504
  ratio: 4.19503586567865
  scheduler:
    t_warmup: 527
  tokens: 552861696
  total_flops: 437344829577312000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 177
  max_duration: 1776
  model:
    context_length: 512
    d_model: 512
    n_heads: 8
    n_layers: 8
  params: 156516864
  ratio: 2.974553234084731
  scheduler:
    t_warmup: 444
  tokens: 465567744
  total_flops: 437344829577312000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 151
  max_duration: 1514
  model:
    context_length: 512
    d_model: 576
    n_heads: 9
    n_layers: 9
  params: 183604032
  ratio: 2.1616410689717314
  scheduler:
    t_warmup: 378
  tokens: 396886016
  total_flops: 437344829577312000
# Budget group: total_flops = 956352499790040192 (~9.56e17).
# 9 entries, d_model 256 through 768.
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 883
  max_duration: 8835
  model:
    context_length: 512
    d_model: 256
    n_heads: 4
    n_layers: 4
  params: 68817152
  ratio: 33.65501437781093
  scheduler:
    t_warmup: 2208
  tokens: 2316042240
  total_flops: 956352499790040192
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 689
  max_duration: 6891
  model:
    context_length: 512
    d_model: 320
    n_heads: 5
    n_layers: 5
  params: 88234560
  ratio: 20.47309244812917
  scheduler:
    t_warmup: 1722
  tokens: 1806434304
  total_flops: 956352499790040192
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 557
  max_duration: 5571
  model:
    context_length: 512
    d_model: 384
    n_heads: 6
    n_layers: 6
  params: 109127040
  ratio: 13.382606400760068
  scheduler:
    t_warmup: 1392
  tokens: 1460404224
  total_flops: 956352499790040192
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 461
  max_duration: 4613
  model:
    context_length: 512
    d_model: 448
    n_heads: 7
    n_layers: 7
  params: 131789504
  ratio: 9.175770719950505
  scheduler:
    t_warmup: 1153
  tokens: 1209270272
  total_flops: 956352499790040192
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 388
  max_duration: 3884
  model:
    context_length: 512
    d_model: 512
    n_heads: 8
    n_layers: 8
  params: 156516864
  ratio: 6.505160338505121
  scheduler:
    t_warmup: 971
  tokens: 1018167296
  total_flops: 956352499790040192
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 331
  max_duration: 3311
  model:
    context_length: 512
    d_model: 576
    n_heads: 9
    n_layers: 9
  params: 183604032
  ratio: 4.727340541192472
  scheduler:
    t_warmup: 827
  tokens: 867958784
  total_flops: 956352499790040192
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 284
  max_duration: 2849
  model:
    context_length: 512
    d_model: 640
    n_heads: 10
    n_layers: 10
  params: 213345920
  ratio: 3.500644661964944
  scheduler:
    t_warmup: 712
  tokens: 746848256
  total_flops: 956352499790040192
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 247
  max_duration: 2471
  model:
    context_length: 512
    d_model: 704
    n_heads: 11
    n_layers: 11
  params: 246037440
  ratio: 2.63276119276806
  scheduler:
    t_warmup: 617
  tokens: 647757824
  total_flops: 956352499790040192
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 215
  max_duration: 2156
  model:
    context_length: 512
    d_model: 768
    n_heads: 12
    n_layers: 12
  params: 281973504
  ratio: 2.0043814613163087
  scheduler:
    t_warmup: 539
  tokens: 565182464
  total_flops: 956352499790040192
# Budget group: total_flops = 2091279105182539776 (~2.09e18).
# 12 entries, d_model 256 through 960.
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1932
  max_duration: 19320
  model:
    context_length: 512
    d_model: 256
    n_heads: 4
    n_layers: 4
  params: 68817152
  ratio: 73.5953455324626
  scheduler:
    t_warmup: 4830
  tokens: 5064622080
  total_flops: 2091279105182539776
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1506
  max_duration: 15068
  model:
    context_length: 512
    d_model: 320
    n_heads: 5
    n_layers: 5
  params: 88234560
  ratio: 44.76687810309249
  scheduler:
    t_warmup: 3767
  tokens: 3949985792
  total_flops: 2091279105182539776
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1218
  max_duration: 12183
  model:
    context_length: 512
    d_model: 384
    n_heads: 6
    n_layers: 6
  params: 109127040
  ratio: 29.265893696007883
  scheduler:
    t_warmup: 3045
  tokens: 3193700352
  total_flops: 2091279105182539776
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1008
  max_duration: 10088
  model:
    context_length: 512
    d_model: 448
    n_heads: 7
    n_layers: 7
  params: 131789504
  ratio: 20.06615543526137
  scheduler:
    t_warmup: 2522
  tokens: 2644508672
  total_flops: 2091279105182539776
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 849
  max_duration: 8494
  model:
    context_length: 512
    d_model: 512
    n_heads: 8
    n_layers: 8
  params: 156516864
  ratio: 14.226269803105689
  scheduler:
    t_warmup: 2123
  tokens: 2226651136
  total_flops: 2091279105182539776
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 724
  max_duration: 7241
  model:
    context_length: 512
    d_model: 576
    n_heads: 9
    n_layers: 9
  params: 183604032
  ratio: 10.338469603979068
  scheduler:
    t_warmup: 1810
  tokens: 1898184704
  total_flops: 2091279105182539776
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 623
  max_duration: 6232
  model:
    context_length: 512
    d_model: 640
    n_heads: 10
    n_layers: 10
  params: 213345920
  ratio: 7.657429811641113
  scheduler:
    t_warmup: 1558
  tokens: 1633681408
  total_flops: 2091279105182539776
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 540
  max_duration: 5404
  model:
    context_length: 512
    d_model: 704
    n_heads: 11
    n_layers: 11
  params: 246037440
  ratio: 5.757766687866692
  scheduler:
    t_warmup: 1351
  tokens: 1416626176
  total_flops: 2091279105182539776
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 471
  max_duration: 4715
  model:
    context_length: 512
    d_model: 768
    n_heads: 12
    n_layers: 12
  params: 281973504
  ratio: 4.383422351626343
  scheduler:
    t_warmup: 1178
  tokens: 1236008960
  total_flops: 2091279105182539776
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 413
  max_duration: 4136
  model:
    context_length: 512
    d_model: 832
    n_heads: 13
    n_layers: 13
  params: 321449024
  ratio: 3.3729378627698057
  scheduler:
    t_warmup: 1034
  tokens: 1084227584
  total_flops: 2091279105182539776
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 364
  max_duration: 3645
  model:
    context_length: 512
    d_model: 896
    n_heads: 14
    n_layers: 14
  params: 364758912
  ratio: 2.619579257874308
  scheduler:
    t_warmup: 911
  tokens: 955514880
  total_flops: 2091279105182539776
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 322
  max_duration: 3225
  model:
    context_length: 512
    d_model: 960
    n_heads: 15
    n_layers: 15
  params: 412198080
  ratio: 2.0509906305240433
  scheduler:
    t_warmup: 806
  tokens: 845414400
  total_flops: 2091279105182539776
# Budget group: total_flops = 4573050519273255936 (~4.57e18).
# 15 entries, d_model 256 through 1152.
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 4224
  max_duration: 42249
  model:
    context_length: 512
    d_model: 256
    n_heads: 4
    n_layers: 4
  params: 68817152
  ratio: 160.9383930331787
  scheduler:
    t_warmup: 10562
  tokens: 11075321856
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 3295
  max_duration: 32951
  model:
    context_length: 512
    d_model: 320
    n_heads: 5
    n_layers: 5
  params: 88234560
  ratio: 97.89709320248211
  scheduler:
    t_warmup: 8237
  tokens: 8637906944
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 2664
  max_duration: 26642
  model:
    context_length: 512
    d_model: 384
    n_heads: 6
    n_layers: 6
  params: 109127040
  ratio: 63.99917424682278
  scheduler:
    t_warmup: 6660
  tokens: 6984040448
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 2206
  max_duration: 22061
  model:
    context_length: 512
    d_model: 448
    n_heads: 7
    n_layers: 7
  params: 131789504
  ratio: 43.88178579077132
  scheduler:
    t_warmup: 5515
  tokens: 5783158784
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1857
  max_duration: 18576
  model:
    context_length: 512
    d_model: 512
    n_heads: 8
    n_layers: 8
  params: 156516864
  ratio: 31.112218961913268
  scheduler:
    t_warmup: 4644
  tokens: 4869586944
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1583
  max_duration: 15835
  model:
    context_length: 512
    d_model: 576
    n_heads: 9
    n_layers: 9
  params: 183604032
  ratio: 22.608709595222834
  scheduler:
    t_warmup: 3958
  tokens: 4151050240
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1362
  max_duration: 13627
  model:
    context_length: 512
    d_model: 640
    n_heads: 10
    n_layers: 10
  params: 213345920
  ratio: 16.74386971168701
  scheduler:
    t_warmup: 3406
  tokens: 3572236288
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1181
  max_duration: 11817
  model:
    context_length: 512
    d_model: 704
    n_heads: 11
    n_layers: 11
  params: 246037440
  ratio: 12.590586408312491
  scheduler:
    t_warmup: 2954
  tokens: 3097755648
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1031
  max_duration: 10311
  model:
    context_length: 512
    d_model: 768
    n_heads: 12
    n_layers: 12
  params: 281973504
  ratio: 9.585889261425073
  scheduler:
    t_warmup: 2577
  tokens: 2702966784
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 904
  max_duration: 9044
  model:
    context_length: 512
    d_model: 832
    n_heads: 13
    n_layers: 13
  params: 321449024
  ratio: 7.375447299538231
  scheduler:
    t_warmup: 2261
  tokens: 2370830336
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 797
  max_duration: 7970
  model:
    context_length: 512
    d_model: 896
    n_heads: 14
    n_layers: 14
  params: 364758912
  ratio: 5.727859172910352
  scheduler:
    t_warmup: 1992
  tokens: 2089287680
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 705
  max_duration: 7053
  model:
    context_length: 512
    d_model: 960
    n_heads: 15
    n_layers: 15
  params: 412198080
  ratio: 4.485468811499558
  scheduler:
    t_warmup: 1763
  tokens: 1848901632
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 626
  max_duration: 6265
  model:
    context_length: 512
    d_model: 1024
    n_heads: 16
    n_layers: 16
  params: 464061440
  ratio: 3.539040347760848
  scheduler:
    t_warmup: 1566
  tokens: 1642332160
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 558
  max_duration: 5584
  model:
    context_length: 512
    d_model: 1088
    n_heads: 17
    n_layers: 17
  params: 520643904
  ratio: 2.811541794216417
  scheduler:
    t_warmup: 1396
  tokens: 1463812096
  total_flops: 4573050519273255936
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 499
  max_duration: 4993
  model:
    context_length: 512
    d_model: 1152
    n_heads: 18
    n_layers: 18
  params: 582240384
  ratio: 2.2480147855906885
  scheduler:
    t_warmup: 1248
  tokens: 1308884992
  total_flops: 4573050519273255936
# Budget group: total_flops = 10000000000000000000 (1e19).
# 17 entries, d_model 384 through 1408.
# NOTE(review): from this group onward total_flops exceeds the signed 64-bit
# integer maximum (9223372036854775807); confirm the consuming loader reads
# these as arbitrary-precision integers rather than overflowing or coercing.
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 5826
  max_duration: 58260
  model:
    context_length: 512
    d_model: 384
    n_heads: 6
    n_layers: 6
  params: 109127040
  ratio: 139.95165121311823
  scheduler:
    t_warmup: 14565
  tokens: 15272509440
  total_flops: 10000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 4824
  max_duration: 48242
  model:
    context_length: 512
    d_model: 448
    n_heads: 7
    n_layers: 7
  params: 131789504
  ratio: 95.95871039927428
  scheduler:
    t_warmup: 12060
  tokens: 12646350848
  total_flops: 10000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 4062
  max_duration: 40620
  model:
    context_length: 512
    d_model: 512
    n_heads: 8
    n_layers: 8
  params: 156516864
  ratio: 68.03285606335686
  scheduler:
    t_warmup: 10155
  tokens: 10648289280
  total_flops: 10000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 3462
  max_duration: 34627
  model:
    context_length: 512
    d_model: 576
    n_heads: 9
    n_layers: 9
  params: 183604032
  ratio: 49.439329785524535
  scheduler:
    t_warmup: 8656
  tokens: 9077260288
  total_flops: 10000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 2980
  max_duration: 29800
  model:
    context_length: 512
    d_model: 640
    n_heads: 10
    n_layers: 10
  params: 213345920
  ratio: 36.61607965130057
  scheduler:
    t_warmup: 7450
  tokens: 7811891200
  total_flops: 10000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 2584
  max_duration: 25840
  model:
    context_length: 512
    d_model: 704
    n_heads: 11
    n_layers: 11
  params: 246037440
  ratio: 27.53158608706057
  scheduler:
    t_warmup: 6460
  tokens: 6773800960
  total_flops: 10000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 2254
  max_duration: 22547
  model:
    context_length: 512
    d_model: 768
    n_heads: 12
    n_layers: 12
  params: 281973504
  ratio: 20.96140482759685
  scheduler:
    t_warmup: 5636
  tokens: 5910560768
  total_flops: 10000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1977
  max_duration: 19778
  model:
    context_length: 512
    d_model: 832
    n_heads: 13
    n_layers: 13
  params: 321449024
  ratio: 16.129101801223698
  scheduler:
    t_warmup: 4944
  tokens: 5184684032
  total_flops: 10000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1743
  max_duration: 17430
  model:
    context_length: 512
    d_model: 896
    n_heads: 14
    n_layers: 14
  params: 364758912
  ratio: 12.52654772695451
  scheduler:
    t_warmup: 4357
  tokens: 4569169920
  total_flops: 10000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1542
  max_duration: 15424
  model:
    context_length: 512
    d_model: 960
    n_heads: 15
    n_layers: 15
  params: 412198080
  ratio: 9.809140925644293
  scheduler:
    t_warmup: 3856
  tokens: 4043309056
  total_flops: 10000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1370
  max_duration: 13700
  model:
    context_length: 512
    d_model: 1024
    n_heads: 16
    n_layers: 16
  params: 464061440
  ratio: 7.739002835486612
  scheduler:
    t_warmup: 3425
  tokens: 3591372800
  total_flops: 10000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1221
  max_duration: 12211
  model:
    context_length: 512
    d_model: 1088
    n_heads: 17
    n_layers: 17
  params: 520643904
  ratio: 6.148233676428487
  scheduler:
    t_warmup: 3052
  tokens: 3201040384
  total_flops: 10000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 1091
  max_duration: 10919
  model:
    context_length: 512
    d_model: 1152
    n_heads: 18
    n_layers: 18
  params: 582240384
  ratio: 4.916097224887788
  scheduler:
    t_warmup: 2729
  tokens: 2862350336
  total_flops: 10000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 979
  max_duration: 9794
  model:
    context_length: 512
    d_model: 1216
    n_heads: 19
    n_layers: 19
  params: 649145792
  ratio: 3.955102794535253
  scheduler:
    t_warmup: 2448
  tokens: 2567438336
  total_flops: 10000000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 881
  max_duration: 8810
  model:
    context_length: 512
    d_model: 1280
    n_heads: 20
    n_layers: 20
  params: 721655040
  ratio: 3.2002667645749416
  scheduler:
    t_warmup: 2202
  tokens: 2309488640
  total_flops: 10000000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 794
  max_duration: 7946
  model:
    context_length: 512
    d_model: 1344
    n_heads: 21
    n_layers: 21
  params: 800063040
  ratio: 2.603540121038462
  scheduler:
    t_warmup: 1986
  tokens: 2082996224
  total_flops: 10000000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 718
  max_duration: 7186
  model:
    context_length: 512
    d_model: 1408
    n_heads: 22
    n_layers: 22
  params: 884664704
  ratio: 2.129356778316771
  scheduler:
    t_warmup: 1796
  tokens: 1883766784
  total_flops: 10000000000000000000
# Budget group: total_flops = 22000000000000000000 (2.2e19).
# 17 entries, d_model 512 through 1664.
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 8936
  max_duration: 89365
  model:
    context_length: 512
    d_model: 512
    n_heads: 8
    n_layers: 8
  params: 156516864
  ratio: 149.6739582004403
  scheduler:
    t_warmup: 22341
  tokens: 23426498560
  total_flops: 22000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 7618
  max_duration: 76181
  model:
    context_length: 512
    d_model: 576
    n_heads: 9
    n_layers: 9
  params: 183604032
  ratio: 108.76880995728895
  scheduler:
    t_warmup: 19045
  tokens: 19970392064
  total_flops: 22000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 6556
  max_duration: 65561
  model:
    context_length: 512
    d_model: 640
    n_heads: 10
    n_layers: 10
  params: 213345920
  ratio: 80.55660396036633
  scheduler:
    t_warmup: 16390
  tokens: 17186422784
  total_flops: 22000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 5684
  max_duration: 56849
  model:
    context_length: 512
    d_model: 704
    n_heads: 11
    n_layers: 11
  params: 246037440
  ratio: 60.570554855391116
  scheduler:
    t_warmup: 14212
  tokens: 14902624256
  total_flops: 22000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 4960
  max_duration: 49604
  model:
    context_length: 512
    d_model: 768
    n_heads: 12
    n_layers: 12
  params: 281973504
  ratio: 46.11564842631455
  scheduler:
    t_warmup: 12401
  tokens: 13003390976
  total_flops: 22000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 4351
  max_duration: 43513
  model:
    context_length: 512
    d_model: 832
    n_heads: 13
    n_layers: 13
  params: 321449024
  ratio: 35.48516567280043
  scheduler:
    t_warmup: 10878
  tokens: 11406671872
  total_flops: 22000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 3834
  max_duration: 38346
  model:
    context_length: 512
    d_model: 896
    n_heads: 14
    n_layers: 14
  params: 364758912
  ratio: 27.55840499929992
  scheduler:
    t_warmup: 9586
  tokens: 10052173824
  total_flops: 22000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 3393
  max_duration: 33933
  model:
    context_length: 512
    d_model: 960
    n_heads: 15
    n_layers: 15
  params: 412198080
  ratio: 21.58023722963484
  scheduler:
    t_warmup: 8483
  tokens: 8895332352
  total_flops: 22000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 3014
  max_duration: 30140
  model:
    context_length: 512
    d_model: 1024
    n_heads: 16
    n_layers: 16
  params: 464061440
  ratio: 17.025806238070544
  scheduler:
    t_warmup: 7535
  tokens: 7901020160
  total_flops: 22000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 2686
  max_duration: 26865
  model:
    context_length: 512
    d_model: 1088
    n_heads: 17
    n_layers: 17
  params: 520643904
  ratio: 13.526516887826656
  scheduler:
    t_warmup: 6716
  tokens: 7042498560
  total_flops: 22000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 2402
  max_duration: 24023
  model:
    context_length: 512
    d_model: 1152
    n_heads: 18
    n_layers: 18
  params: 582240384
  ratio: 10.815954174693593
  scheduler:
    t_warmup: 6005
  tokens: 6297485312
  total_flops: 22000000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 2154
  max_duration: 21547
  model:
    context_length: 512
    d_model: 1216
    n_heads: 19
    n_layers: 19
  params: 649145792
  ratio: 8.701306913809587
  scheduler:
    t_warmup: 5386
  tokens: 5648416768
  total_flops: 22000000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 1938
  max_duration: 19382
  model:
    context_length: 512
    d_model: 1280
    n_heads: 20
    n_layers: 20
  params: 721655040
  ratio: 7.040586882064871
  scheduler:
    t_warmup: 4845
  tokens: 5080875008
  total_flops: 22000000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 1748
  max_duration: 17482
  model:
    context_length: 512
    d_model: 1344
    n_heads: 21
    n_layers: 21
  params: 800063040
  ratio: 5.728050389629297
  scheduler:
    t_warmup: 4370
  tokens: 4582801408
  total_flops: 22000000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 1581
  max_duration: 15810
  model:
    context_length: 512
    d_model: 1408
    n_heads: 22
    n_layers: 22
  params: 884664704
  ratio: 4.684821968436982
  scheduler:
    t_warmup: 3952
  tokens: 4144496640
  total_flops: 22000000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 1302
  max_duration: 13027
  model:
    context_length: 512
    d_model: 1536
    n_heads: 24
    n_layers: 24
  params: 1073628672
  ratio: 3.180755113067621
  scheduler:
    t_warmup: 3256
  tokens: 3414949888
  total_flops: 22000000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 1083
  max_duration: 10835
  model:
    context_length: 512
    d_model: 1664
    n_heads: 26
    n_layers: 26
  params: 1290906240
  ratio: 2.2002606788855554
  scheduler:
    t_warmup: 2708
  tokens: 2840330240
  total_flops: 22000000000000000000
# Budget group: total_flops = 48400000000000000000 (4.84e19).
# 17 entries, d_model 640 through 1920.
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 14423
  max_duration: 144234
  model:
    context_length: 512
    d_model: 640
    n_heads: 10
    n_layers: 10
  params: 213345920
  ratio: 177.22428296730493
  scheduler:
    t_warmup: 36058
  tokens: 37810077696
  total_flops: 48400000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 12506
  max_duration: 125069
  model:
    context_length: 512
    d_model: 704
    n_heads: 11
    n_layers: 11
  params: 246037440
  ratio: 133.25649923848988
  scheduler:
    t_warmup: 31267
  tokens: 32786087936
  total_flops: 48400000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 10913
  max_duration: 109130
  model:
    context_length: 512
    d_model: 768
    n_heads: 12
    n_layers: 12
  params: 281973504
  ratio: 101.45554214909497
  scheduler:
    t_warmup: 27282
  tokens: 28607774720
  total_flops: 48400000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 9572
  max_duration: 95728
  model:
    context_length: 512
    d_model: 832
    n_heads: 13
    n_layers: 13
  params: 321449024
  ratio: 78.06687517582881
  scheduler:
    t_warmup: 23932
  tokens: 25094520832
  total_flops: 48400000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 8436
  max_duration: 84362
  model:
    context_length: 512
    d_model: 896
    n_heads: 14
    n_layers: 14
  params: 364758912
  ratio: 60.629065940409426
  scheduler:
    t_warmup: 21090
  tokens: 22114992128
  total_flops: 48400000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 7465
  max_duration: 74653
  model:
    context_length: 512
    d_model: 960
    n_heads: 15
    n_layers: 15
  params: 412198080
  ratio: 47.47677629163144
  scheduler:
    t_warmup: 18663
  tokens: 19569836032
  total_flops: 48400000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 6630
  max_duration: 66309
  model:
    context_length: 512
    d_model: 1024
    n_heads: 16
    n_layers: 16
  params: 464061440
  ratio: 37.457338614473116
  scheduler:
    t_warmup: 16577
  tokens: 17382506496
  total_flops: 48400000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 5910
  max_duration: 59103
  model:
    context_length: 512
    d_model: 1088
    n_heads: 17
    n_layers: 17
  params: 520643904
  ratio: 29.75833715321864
  scheduler:
    t_warmup: 14775
  tokens: 15493496832
  total_flops: 48400000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 5285
  max_duration: 52850
  model:
    context_length: 512
    d_model: 1152
    n_heads: 18
    n_layers: 18
  params: 582240384
  ratio: 23.794829044355673
  scheduler:
    t_warmup: 13212
  tokens: 13854310400
  total_flops: 48400000000000000000
- device_eval_batch_size: 64
  device_train_microbatch_size: 64
  eval_interval: 4740
  max_duration: 47403
  model:
    context_length: 512
    d_model: 1216
    n_heads: 19
    n_layers: 19
  params: 649145792
  ratio: 19.14271367871703
  scheduler:
    t_warmup: 11850
  tokens: 12426412032
  total_flops: 48400000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 4264
  max_duration: 42640
  model:
    context_length: 512
    d_model: 1280
    n_heads: 20
    n_layers: 20
  params: 721655040
  ratio: 15.489145838987005
  scheduler:
    t_warmup: 10660
  tokens: 11177820160
  total_flops: 48400000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 3846
  max_duration: 38461
  model:
    context_length: 512
    d_model: 1344
    n_heads: 21
    n_layers: 21
  params: 800063040
  ratio: 12.601907449692964
  scheduler:
    t_warmup: 9615
  tokens: 10082320384
  total_flops: 48400000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 3478
  max_duration: 34783
  model:
    context_length: 512
    d_model: 1408
    n_heads: 22
    n_layers: 22
  params: 884664704
  ratio: 10.306904650736467
  scheduler:
    t_warmup: 8695
  tokens: 9118154752
  total_flops: 48400000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 2866
  max_duration: 28661
  model:
    context_length: 512
    d_model: 1536
    n_heads: 24
    n_layers: 24
  params: 1073628672
  ratio: 6.998051914917563
  scheduler:
    t_warmup: 7165
  tokens: 7513309184
  total_flops: 48400000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 2383
  max_duration: 23837
  model:
    context_length: 512
    d_model: 1664
    n_heads: 26
    n_layers: 26
  params: 1290906240
  ratio: 4.840573493548223
  scheduler:
    t_warmup: 5959
  tokens: 6248726528
  total_flops: 48400000000000000000
- device_eval_batch_size: 32
  device_train_microbatch_size: 32
  eval_interval: 1999
  max_duration: 19996
  model:
    context_length: 512
    d_model: 1792
    n_heads: 28
    n_layers: 28
  params: 1538856704
  ratio: 3.406315487579018
  scheduler:
    t_warmup: 4999
  tokens: 5241831424
  total_flops: 48400000000000000000
- device_eval_batch_size: 16
  device_train_microbatch_size: 16
  eval_interval: 1690
  max_duration: 16909
  model:
    context_length: 512
    d_model: 1920
    n_heads: 30
    n_layers: 30
  params: 1819839360
  ratio: 2.4357055866733206
  scheduler:
    t_warmup: 4227
  tokens: 4432592896
  total_flops: 48400000000000000000
# Budget group: total_flops = 1000000000000000000000 (1e21).
# Single entry, d_model 2560; it also has the smallest per-device batch sizes
# in the sweep (8).
- device_eval_batch_size: 8
  device_train_microbatch_size: 8
  eval_interval: 16718
  max_duration: 167187
  model:
    context_length: 512
    d_model: 2560
    n_heads: 40
    n_layers: 40
  params: 3802810880
  ratio: 11.524914151923326
  scheduler:
    t_warmup: 41796
  tokens: 43827068928
  total_flops: 1000000000000000000000
# NOTE(review): presumably the Weights & Biases group name under which the
# runs of this sweep are logged — confirm against the launcher.
wandb:
  group: new-scale-big-1
