# Main training settings.
# NOTE(review): the meaning of each key is inferred from its name only;
# confirm against the training code that consumes this file.
batch_size: 16
n_epochs: 100
num_workers: 1
pin_memory: false
# Scientific-notation floats carry an explicit ".0" mantissa so that
# YAML 1.1 loaders (e.g. PyYAML) resolve them as floats. A bare `1e-3`
# is loaded by such loaders as the string "1e-3", not a number.
learning_rate: 1.0e-3
final_learning_rate: 1.0e-6
weight_decay: 1.0e-5
predict_delta: false
scheduler: "cosine"
clip_grad: false
gradnorm_balancer: "pseudo"  # "none", "full", "pseudo"
# Pushforward schedule: three lists of six entries each.
# NOTE(review): presumably index-aligned (stage i starts at epochs[i],
# uses unrolls[i] and probs[i]) - confirm against the consumer.
# NOTE(review): every entry of `epochs` after the first is >= n_epochs
# (100), so those stages may never be reached - confirm this is intended
# (e.g. n_epochs is overridden elsewhere).
pushforward:
  epochs: [-1, 100, 120, 140, 160, 180]
  unrolls: [1, 2, 3, 5, 5, 10]
  # NOTE(review): values exceed 1, so these are not plain probabilities;
  # unit (percent? relative weight?) is not visible here - confirm.
  probs: [10, 10, 10, 5, 5, 5]

# Optional pretraining stage; disabled here.
# NOTE(review): `pretraining_kwargs` is presumably only read when
# `pretraining` is true - confirm against the training code.
pretraining: false
pretraining_kwargs:
  freeze_after: true
  clip_grad: false
  # Explicit null: no scheduler is configured for this stage.
  scheduler: null
  # Scientific-notation floats carry an explicit ".0" mantissa so that
  # YAML 1.1 loaders (e.g. PyYAML) resolve them as floats rather than
  # as strings such as "5e-4".
  lr: 5.0e-4
  n_epochs: 20
  # NOTE(review): presumably names of model submodules this stage acts
  # on - confirm they match attribute names in the model.
  target_modules: [patch_embed, unpatch]
  weight_decay: 1.0e-5

