# Trainer settings. The keys look like PyTorch Lightning (PL) Trainer
# arguments -- NOTE(review): confirm against the PL version in use.
accumulate_grad_batches: 1
# accelerator: null # set to 'ddp' for distributed
# amp_backend: native # 'native' | 'apex'
gpus: 8
max_epochs: 50
gradient_clip_val: 0.0  # Gradient clipping; presumably 0.0 disables it (verify)
log_every_n_steps: 10
precision: 16
progress_bar_refresh_rate: 1
weights_summary: top  # Set to 'full' to see every layer
track_grad_norm: -1  # Set to 2 to track norms of gradients
limit_train_batches: 1.0
limit_val_batches: 1.0
# We use the dataloader from Transformer-XL to ensure that adjacent
# minibatches come from text segments that are next to each other.
# That dataloader has to deal with DDP itself, so we don't want PL to
# replace its sampler.
replace_sampler_ddp: false
