# Hyperparameter search space.
# Entries that carry a `choices` list are categorical dimensions; their full
# cross-product is 4 * 2 * 2 * 3 * 3 = 144 configurations.
pipeline_space:
  weight_decay:
    # Scientific notation is written with an explicit mantissa dot
    # ("1.0e-2", not "1e-2"): YAML 1.1 loaders such as PyYAML resolve the
    # dot-less form as a *string*, which would mix strings with numbers here.
    # 0.0 (rather than 0) keeps every choice the same numeric type.
    choices: [0.0, 1.0e-2, 1.0e-4, 1.0e-6]
  adam_beta_1:
    choices: [0.9, 0.95]
  adam_beta_2:
    choices: [0.95, 0.99]
  # NOTE(review): the two *_fraction entries are presumably fractions of the
  # total training schedule — confirm against the training pipeline.
  warmup_fraction:
    choices: [0.05, 0.1, 0.25]
  cooldown_fraction:
    choices: [0.1, 0.25, 0.5]
  # Constants: plain scalar values (no `choices`), identical in every
  # configuration of the grid.
  max_lr: 0.003
  micro_batch_size: 260
  cooldown_lr_decay_factor: 0.1
  tokens_per_param: 20
  # Explicit null. NOTE(review): presumably means "no layer restriction" —
  # confirm how the consumer interprets a null here.
  layers_to_train: null
# Search-run settings.
# Search strategy, given as a plain string identifier.
optimizer: grid_search
# Evaluation budget. 144 equals the size of the categorical grid defined in
# pipeline_space (4 * 2 * 2 * 3 * 3), so the budget covers every combination.
max_evaluations_total: 144
# NOTE(review): presumably lets the run continue when an individual
# evaluation fails — confirm against the optimizer's documentation.
ignore_errors: true