# Grid sweep over a single hyperparameter, the peak learning rate.
# (The sweep name suggests GPT-2 small / Adam / batch size 512.)
name: gpt2s_adam_bs512
method: grid
parameters:
  # Eight candidate values, roughly geometrically spaced (~1.75x per step)
  # from 2e-4 up to 1e-2. As the only swept parameter under `method: grid`,
  # this yields 8 grid points.
  opt.peak_lr:
    values:
      - 0.00020
      - 0.00035
      - 0.00061
      - 0.00107
      - 0.00187
      - 0.00327
      - 0.00572
      - 0.01000
# Script launched for every run of the sweep.
program: main.py
# Argument vector used to launch each run; element order is significant.
# NOTE(review): the ${...} entries look like W&B sweep command macros that the
# sweep agent expands at launch time, and the key=value entries look like
# Hydra-style overrides consumed by main.py — confirm against the launcher.
command:
  # Launcher placeholders: environment, interpreter, and the program above.
  - '${env}'
  - '${interpreter}'
  - '${program}'
  # Model and dataset selection (leading `+` kept exactly as written).
  - '+model=gpt2s'
  - '+dataset=fw_gpt2'
  # Optimizer settings held fixed across all runs of the sweep.
  # Double-quoted so the inner single quotes reach the program unchanged.
  - "opt.optimizer='adamw'"
  - 'opt.batch_size=512'
  - 'opt.max_microbatch_size=16'
  - 'opt.b1=0.9'
  - 'opt.b2=0.95'
  - 'opt.weight_decay=0.1'
  # Placeholder for the swept parameters (here opt.peak_lr); presumably
  # rendered as key=value without leading hyphens — confirm.
  - '${args_no_hyphens}'
