# @package _group_

# Run-wide settings: seed, fp16 flag, logging, and cache housekeeping.
common:
  # Reproducibility / numerics
  seed: 1
  fp16: true

  # Logging
  log_format: simple
  log_interval: 8000

  # NOTE(review): presumably how often (in updates) cached accelerator memory
  # is released — confirm against the trainer.
  empty_cache_freq: 30

# Checkpoint output location and retention counts.
checkpoint:
  # Relative path. NOTE(review): presumably resolved against the run's working
  # directory — confirm.
  save_dir: checkpoints
  # Both retention counts are set to 5.
  keep_best_checkpoints: 5
  keep_last_epochs: 5
  
# Task selection and data inputs.
task:
  _name: language_modeling
  # NOTE(review): `???` looks like the OmegaConf/Hydra mandatory-value marker,
  # meaning `data` and `dict` must be supplied by the caller (e.g. as command-line
  # overrides) — confirm.
  data: ???
  dict: ???
  sample_break_mode: eos
  tokens_per_sample: 1024

# Dataset splits, batch sizing, and loader settings.
dataset:
  # Split names
  train_subset: train
  valid_subset: valid

  # Batch sizing
  max_tokens: 160000
  batch_size_valid: 1536

  # Loading
  num_workers: 2
  data_buffer_size: 100
  curriculum: 1

# Distributed setup: a world size of 1 with the `pytorch_ddp` backend selected.
distributed_training:
  distributed_world_size: 1
  ddp_backend: pytorch_ddp

# Loss criterion, selected by name.
criterion:
  _name: cross_entropy

# Training-loop limits and step sizing.
optimization:
  max_epoch: 25
  clip_norm: 5.0
  # `update_freq` and `lr` are single-element lists.
  update_freq:
    - 12
  lr:
    - 0.001

# Optimizer selection and hyper-parameters.
optimizer:
  _name: adam
  # Kept as a string (quoted so the type is explicit); the value is unchanged.
  adam_betas: '(0.9,0.999)'
  # Written with a decimal point so that YAML 1.1 loaders, whose float pattern
  # requires one, read a float rather than the string "1e-08".
  adam_eps: 1.0e-08
  weight_decay: 0.0

# Learning-rate schedule, selected by name (`tri_stage`).
lr_scheduler:
  _name: tri_stage
  # Stage lengths in steps: 25000 + 0 + 20000 = 45000 scheduled steps in total.
  # `hold_steps` is 0, so the hold stage is empty.
  warmup_steps: 25000
  hold_steps: 0
  decay_steps: 20000
  # NOTE(review): presumably the final learning rate as a fraction of the peak
  # rate — confirm against the scheduler implementation.
  final_lr_scale: 0.05

# Model selection and architecture hyper-parameters.
# All three width settings are 512, every dropout value is 0.0, and there are
# 16 decoder layers.
# NOTE(review): `_name` is `transformer_lm`, but `decoder_hidden_size`,
# `decoder_out_embed_dim`, `decoder_dropout_in`, `decoder_dropout_out`,
# `share_embed` and `is_wordlm` look like recurrent-LM options — confirm that
# the `transformer_lm` model in this codebase accepts them.
model:
  _name: transformer_lm
  dropout: 0.0
  decoder_embed_dim: 512
  decoder_hidden_size: 512
  decoder_layers: 16
  decoder_out_embed_dim: 512
  decoder_dropout_in: 0.0
  decoder_dropout_out: 0.0
  # Two embedding-sharing flags are enabled: `share_embed` here and
  # `share_decoder_input_output_embed` below.
  share_embed: true
  is_wordlm: false
  share_decoder_input_output_embed: true
  decoder_normalize_before: true