#!/usr/bin/env bash
#
# Launch `python -m main` with the algo=duo / adversarial_distill.is_distill=True
# configuration overrides listed below (batch size 8, model=small,
# model.length=1024, 32 sampling steps, fp32 precision, lr 1e-6).
#
# Usage:
#   CHECKPOINT_PATH=/path/to/ckpt EXP_NAME=my-run sh <this-script> [key=value ...]
#
# Environment:
#   CHECKPOINT_PATH  Passed as eval.checkpoint_path.
#                    Defaults to the placeholder 'path-to-checkpoint'.
#   EXP_NAME         Passed as logger.name.
#                    Defaults to the placeholder 'name-of-exp'.
#
# Any extra command-line arguments are appended verbatim after the built-in
# overrides.
# NOTE(review): this presumes `main` parses Hydra-style overrides in which a
# later duplicate key takes precedence -- confirm against the entry point.
#
# `python -u` forces unbuffered stdout/stderr so log lines show up immediately.
#
# The body is kept POSIX-sh compatible (no arrays, no pipefail) so that it
# still works when invoked as `sh <this-script>`.
set -eu

python -u -m main \
  loader.batch_size=8 \
  loader.eval_batch_size=8 \
  loader.num_workers=1 \
  data=openwebtext-split \
  model=small \
  algo=duo \
  model.length=1024 \
  algo.gumbel_tau_log10_start=-3.0 \
  algo.gumbel_tau_log10_end=-3.0 \
  algo.gamma_min=-3.55 \
  algo.gamma_max=-1.85 \
  algo.curriculum_start=0 \
  algo.curriculum_end=500000 \
  strategy.find_unused_parameters=True \
  adversarial_distill.is_distill=True \
  algo.backbone='hf_dit' \
  eval.checkpoint_path="${CHECKPOINT_PATH:-path-to-checkpoint}" \
  eval.compute_generative_perplexity=True \
  trainer.accumulate_grad_batches=1 \
  trainer.gradient_clip_val=null \
  trainer.precision=32 \
  training.loss_precision=float32 \
  logger.name="${EXP_NAME:-name-of-exp}" \
  sampling.steps=32 \
  optim.lr=1e-6 \
  trainer.val_check_interval=1500 \
  trainer.limit_val_batches=0.01 \
  "$@"