# Top-level training hyperparameters.
# The exact meaning of each key is defined by the consuming training code,
# which is not part of this file.
batch_size: 8
n_epochs: 200
num_workers: 4
pin_memory: true
# Floats in scientific notation are written with an explicit decimal point
# (1.0e-4, not 1e-4): YAML 1.1 loaders such as PyYAML only resolve the dotted
# form as a float and would otherwise load the value as a string.
learning_rate: 1.0e-4
final_learning_rate: 1.0e-6
weight_decay: 1.0e-5
predict_delta: false
scheduler: "cosine"
# NOTE(review): presumably `clip_to` is the gradient-clipping threshold that
# applies when `clip_grad` is true — confirm against the training loop.
clip_grad: true
clip_to: 1.0
gradnorm_balancer: "none"  # one of "none", "full", "pseudo"
# Pushforward schedule, given as three lists of three entries each.
# NOTE(review): presumably the lists are index-aligned, i.e. entry i of
# `unrolls` and `probs` takes effect from `epochs[i]` — confirm with the
# consumer of this config.
pushforward:
  epochs:
    - 100
    - 130
    - 160
  # NOTE(review): every entry is 0 — presumably pushforward is effectively
  # disabled with these values; confirm this is intended.
  unrolls:
    - 0
    - 0
    - 0
  # NOTE(review): values are not in [0, 1], so these are presumably
  # percentages or relative weights rather than probabilities — confirm.
  probs:
    - 10
    - 10
    - 5
# Pretraining stage configuration.
# NOTE(review): presumably `pretraining_kwargs` is only consulted when
# `pretraining` is true — confirm against the training script.
pretraining: false
pretraining_kwargs:
  freeze_after: false
  clip_grad: false
  scheduler: null
  # Floats carry an explicit decimal point (1.0e-5, not 1e-5) so that
  # YAML 1.1 loaders such as PyYAML resolve them as floats, not strings.
  weight_decay: 1.0e-5
  lr: 1.0e-3
  n_epochs: 35
  # NOTE(review): presumably names of the model submodules the pretraining
  # stage operates on — verify against the model definition.
  target_modules: [patch_embed, unpatch]
  add_noise: false

