# Hydra-style defaults list: pulls the `model` and `data` config groups into
# this config. `_self_` is listed last, so under Hydra's composition order the
# keys defined in this file take precedence over the composed group values.
defaults:
  - model: rttransformer
  - data: mnist
  - _self_

# Core training hyperparameters.
# `lr` is also read by optim.kwargs.lr through the ${lr} interpolation.
# It is written with an explicit decimal point in the mantissa because a bare
# `3e-4` is resolved as a string (not a float) by YAML 1.1 loaders such as
# PyYAML; `3.0e-4` is a float under every loader.
lr: 3.0e-4
n_epochs: 150
batch_size: 64

# Data-loading and evaluation cadence settings.
# NOTE(review): null presumably means "use every sample" — confirm in the
# data-loading code.
n_samples: null
# NOTE(review): presumably forwarded to the DataLoader worker count — confirm.
num_workers: 8
# NOTE(review): presumably "evaluate every N epochs"; the unit (epoch vs.
# step) is not visible here — confirm.
eval_every: 1

# Model compilation settings.
# NOTE(review): the key names and the `reduce-overhead` preset suggest
# `compile` toggles torch.compile and `compile_kwargs` is forwarded to it —
# confirm in the training script. If so, torch.compile refuses `mode` and
# `options` together, so `mode` has to stay null while `options` is set.
compile: false
compile_kwargs:
  # Alternative preset: `mode: reduce-overhead`
  mode: null
  options:
    matmul-padding: true

# Optimizer spec: `cls` is the dotted path of the optimizer class and `kwargs`
# holds its keyword arguments.
# NOTE(review): presumably instantiated by the training script as
# cls(params, **kwargs) — confirm.
optim:
  cls: torch.optim.AdamW
  kwargs:
    # Resolves to the top-level `lr` value.
    lr: ${lr}
    # Explicit decimal point keeps this a float under YAML 1.1 loaders
    # (PyYAML resolves a bare `5e-4` as a string).
    weight_decay: 5.0e-4
    amsgrad: true
    fused: false

# Learning-rate scheduler spec, using the same cls/kwargs layout as `optim`.
# NOTE(review): `use_scheduler` presumably decides whether the scheduler below
# is built at all — confirm in the training script.
use_scheduler: false
scheduler:
  cls: lr_scheduler.WarmupLRScheduler
  kwargs:
    warmup_steps: 10000
    # Optional, currently disabled:
    # decay_steps: 80000
    
# NOTE(review): presumably a checkpoint path to load before training, with
# null meaning "start from scratch" — confirm in the training script.
load_ckpt: null

# Mixed-precision switch. Both sections below take their `enabled` value from
# this flag through the ${use_amp} interpolation.
use_amp: false
gradscaler:
  enabled: ${use_amp}
autocast:
  device_type: cuda
  enabled: ${use_amp}
  # NOTE(review): stored as a plain string; presumably mapped to a torch dtype
  # by the consumer — confirm.
  dtype: float16

# Gradient clipping.
# NOTE(review): presumably `clip_grad_max_norm` is the norm threshold applied
# when `clip_grad` is true — confirm in the training loop.
clip_grad: true
clip_grad_max_norm: 1.0

# Mixup augmentation toggle.
# NOTE(review): `alpha` is presumably the mixup Beta(alpha, alpha) parameter
# and only used when `mixup` is true — confirm in the training loop.
mixup: false
alpha: 1.0

# Run bookkeeping.
# NOTE(review): `gpu` is presumably a CUDA device index and `save_path` the
# output directory for run artifacts — confirm in the training script.
seed: 42
gpu: 0
save_path: ./output
# Experiment-tracking settings.
# NOTE(review): presumably forwarded to wandb.init, where a null entity/name
# falls back to the wandb defaults — confirm in the training script.
wandb:
  project: mnist_inr_classify
  entity: null
  name: null

# Backend numerics / performance knobs.
# NOTE(review): presumably applied via torch.set_float32_matmul_precision and
# torch.backends.cudnn.benchmark — confirm in the training script.
matmul_precision: high
cudnn_benchmark: false

# Debug toggle; its effect is defined by the consuming script (not visible
# in this file).
debug: false
