- ar = autoregressive
- bi = bidirectional
- lr = learning rate
- bs = batch size
- rep = representation
- rec = reconstruction
- dl = dataloader
- ds = dataset
- bmt = base_model_trainer.py
- gt = ground truth
- ld = Langevin dynamics
- calc = calculation
- pred = predicted
- attn = attention
- ebm = energy-based model
- pl = PyTorch Lightning
- ebt = energy-based transformer
- bf = bug-free
- oob = out of bounds
- avg = average
- pct = percent
- std = standard deviation
- fn = function
- fl = final layer
- probs = probabilities
- cur = current
- pos = position
- prev = previous
- logprobs = log probabilities
- toks = tokens
- infer = inference
- sc = sanity check
- ndv = not during validation
- t2i = text-to-image
- mn = multinode
- wd = weight decay