# Hyperparameter search space.
# Entries with `choices` list discrete candidate values, the entry with
# `lower`/`upper` is a bounded range, and bare scalars are fixed constants.
pipeline_space:
  weight_decay:
    # Exponent literals are written with an explicit mantissa dot (1.0e-2,
    # not 1e-2): YAML 1.1 loaders such as PyYAML only resolve the dotted form
    # as a float and would load the dot-less form as a string.
    choices: [0, 1.0e-2, 1.0e-4, 1.0e-6]
  adam_beta_1:
    choices: [0.9, 0.95]
  adam_beta_2:
    choices: [0.95, 0.99]
  # NOTE(review): the two *_fraction entries are presumably fractions of the
  # total training schedule -- confirm against the training code.
  warmup_fraction:
    choices: [0.05, 0.1, 0.25]
  cooldown_fraction:
    choices: [0.1, 0.25, 0.5]
  # Integer range 2..20, flagged as the fidelity dimension of the search.
  # NOTE(review): presumably the budget axis used by the multi-fidelity
  # optimizer -- confirm against the consumer.
  tokens_per_param:
    lower: 2
    upper: 20
    is_fidelity: true
  # Constants: fixed for every evaluation, not searched over.
  max_lr: 0.003
  micro_batch_size: 72
  cooldown_lr_decay_factor: 0.1
  layers_to_train: 10
  share_embeddings: true
# Run-level settings: evaluation budget, optimizer selection, error policy.
max_evaluations_total: 100
optimizer:
  name: successive_halving
  # NOTE(review): presumably the reduction factor between successive-halving
  # rungs -- confirm against the optimizer's documentation.
  eta: 2
# NOTE(review): by its name, failed evaluations are ignored rather than
# aborting the run -- confirm exact semantics in the consumer.
ignore_errors: true