# Hydra-style defaults list; entries are composed in the order listed.
# `_self_` is placed last so the values defined in this file are merged
# after the selected `dataset` config.
defaults:
  - dataset: cifar10
  - _self_

# Training
device: cuda  # one of: auto | cpu | cuda
seed: 0
epochs: 40000
batch_size: 128
num_workers: 2

# Optimizer
optimizer:
  name: sgd  # sgd | adamw
  lr: 0.01
  # Exponent-form floats carry an explicit mantissa dot (`1.0e-6`, not
  # `1e-6`): YAML 1.1 loaders such as PyYAML resolve the dotless form as a
  # string instead of a float.
  weight_decay: 1.0e-6
  momentum: 0.9  # used by sgd
  betas: [0.9, 0.999]  # used by adamw
  eps: 1.0e-8  # used by adamw

# LR scheduler
scheduler:
  name: cosine
  # NOTE(review): t_max is half of the top-level `epochs` (40000) — confirm
  # the units match and that this ratio is intended.
  t_max: 20000
  # Explicit mantissa dot: YAML 1.1 loaders such as PyYAML resolve a
  # dotless `1e-7` as a string instead of a float.
  eta_min: 1.0e-7

# Sparsifier
sparsifier:
  rho: 0.001
  # NOTE(review): which of the fields under `lambda` are actually read
  # presumably depends on `mode` — confirm against the sparsifier code.
  lambda:
    mode: constant
    beta: 0.01
    gamma: 0.8
    t0: 100
    cap: 1.0
  alpha: 1.0  # default alpha for selected params
  # kappa: 4  # internal aggregation k (disabled)
  sparsity: 0.9
  # Explicit mantissa dot: YAML 1.1 loaders such as PyYAML resolve a
  # dotless `1e-6` as a string instead of a float.
  eps: 1.0e-6

# Model
model:
  name: smallconv
  # NOTE(review): presumably has to match the class count of the dataset
  # selected in `defaults` (cifar10) — confirm when switching datasets.
  num_classes: 10
