# Model module configuration in Hydra instantiate form: `_target_` is the
# dotted path of the class to build; when instantiated via Hydra, the other
# top-level keys in this file are passed to it as keyword arguments.
_target_: src.models.SFMModule

# Optimizer factory. `_partial_: true` makes Hydra return a partially applied
# constructor instead of an instance, so the remaining arguments can be
# supplied later (presumably the model parameters — confirm in SFMModule).
optimizer:
  _target_: torch.optim.AdamW
  _partial_: true
  # Initial learning rate.
  lr: 0.001
  # Weight decay coefficient; 0.0 disables weight decay.
  weight_decay: 0.0

# Learning-rate scheduler factory. `_partial_: true` defers construction so
# the remaining arguments can be supplied later (presumably the optimizer
# instance — confirm in SFMModule).
scheduler:
  _target_: torch.optim.lr_scheduler.CosineAnnealingLR
  _partial_: true
  # Number of scheduler steps over which the learning rate is annealed.
  # NOTE(review): whether one step is an epoch or a batch depends on how the
  # training loop steps the scheduler — confirm.
  T_max: 1000
  # Minimum learning rate reached by the cosine schedule.
  eta_min: 0.00008

# Network configuration: builds src.models.net.GPT with the keyword arguments
# below. Booleans use canonical lowercase `true` / `false`, matching the rest
# of this file.
net:
  _target_: src.models.net.GPT
  # NOTE(review): presumably the maximum sequence length — confirm in GPT.
  block_size: 1024
  # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for
  # efficiency.
  vocab_size: 50304
  n_layer: 12
  n_head: 12
  n_embd: 768
  dropout: 0.0
  # true: bias in Linears and LayerNorms, like GPT-2.
  # false: a bit better and faster.
  bias: true
  qk_layernorm: false
  do_x1_sc: false
  # NOTE(review): id 0 is an ordinary token in the stock GPT-2 vocabulary —
  # confirm the tokenizer in use reserves it for masking.
  mask_token_id: 0
  proper_timestep_emb: false
  d3pm_loss_weighting: false
  # NOTE(review): presumably only used when d3pm_loss_weighting is true —
  # confirm in GPT.
  d3pm_loss_weighting_maxT: 1000

# NOTE(review): presumably toggles torch.compile for the network — confirm
# where SFMModule reads this flag.
compile: false
