#!/usr/bin/env bash
#
# Launch train.py with a fixed set of Hydra-style key=value overrides.
#
# Usage: run from the directory that contains train.py (the path is relative):
#   bash <this-script> [extra key=value overrides...]
#
# Any arguments given to this script are appended after the built-in
# overrides and passed to train.py untouched. With no arguments the command
# line is exactly the built-in list below.
# NOTE(review): how a key supplied twice is resolved is up to Hydra / train.py,
# not this script — confirm before relying on command-line re-overrides.

set -euo pipefail

# Kept in an array (rather than one backslash-continued line) so individual
# settings can carry comments and be edited without breaking continuations.
# Order matches the original command line.
overrides=(
  noise.type=loglinear
  graph.type=absorb
  training.batch_size=512
  model=small
  distill.is_distill=True
  ngpus=8

  sampling.predictor=analytic
  sampling.steps=32
  training.accum=8

  eval.batch_size=512
  optim.lr=3e-7

  # Both snapshot frequencies are set to the same value.
  training.snapshot_freq=100
  training.snapshot_freq_for_preemption=100
  eval.perplexity_batch_size=4

  training.log_freq=1
  training.eval_freq=10

  # All three distillation regularization coefficients are set to 0.0.
  distill.regularization.entropy_coef=0.0
  distill.regularization.forward_kl_coef=0.0
  distill.regularization.reinforce_coef=0.0

  # NOTE(review): "name-of-exp" looks like a placeholder run-directory name;
  # replace it with the real experiment name before launching.
  hydra.run.dir=name-of-exp
)

# Last command in the script, so the script's exit status is train.py's.
python train.py "${overrides[@]}" "$@"