# Base
# Dataset selection; merged into the train and eval anchors via `<<: *wt103`.
wt103: &wt103
   # Relative path; the directory it is resolved against is not visible here.
   data: '../data/wikitext-103/'
   dataset: 'wt103'

# Training settings; reused by train_multinode, default.train and
# manual_eval.train through the *train alias.
# NOTE(review): the group headings below are inferred from key names only;
# confirm the exact meaning of each key against the consuming script.
train: &train
   <<: *wt103
   # Runtime
   cuda: true
   multi_gpu: ddp
   # Model (n_head * d_head = 1024 = d_model)
   n_layer: 18
   d_model: 1024
   n_head: 16
   d_head: 64
   d_inner: 4096
   div_val: 4
   adaptive: true
   vocab: word
   init_std: 0.005
   # Dropout rates
   dropout: 0.2
   dropatt: 0.2
   # Optimizer and schedule
   optim: jitlamb
   lr: 0.01
   eta_min: 0.0001
   warmup_step: 10000
   max_step: 100000
   # Batching and sequence lengths
   batch_size: 128
   tgt_len: 384
   mem_len: 384
   eval_tgt_len: 128
   roll: true
   # Logging and evaluation cadence
   log_interval: 100
   eval_interval: 5000

# Variant of the train settings: takes every key from *train and overrides
# lr, max_step, batch_size and eval_interval, adding eval_batch_size.
train_multinode: &train_multinode
   # One merge key only: mapping keys must be unique, and *train already
   # carries the dataset keys it merged from *wt103.
   <<: *train
   lr: 0.02
   max_step: 25000
   batch_size: 512
   eval_batch_size: 128
   eval_interval: 1000

# Evaluation settings; reused by default.eval and manual_eval.eval through
# the *eval alias.
eval: &eval
   <<: *wt103
   cuda: true
   split: test
   # tgt_len here equals eval_tgt_len (128) in the train anchor.
   tgt_len: 128
   mem_len: 1600
   clamp_len: 1000
   same_length: true

# Default profile: the train and eval sections each take the keys of the
# matching anchor without any override.
default:
   train:
      # Merge key copies the keys of the train anchor into this mapping.
      <<: *train
   eval:
      # Merge key copies the keys of the eval anchor into this mapping.
      <<: *eval

# Profile identical to `default` except that eval also carries
# manual_config, a JSON-formatted string.
# The JSON repeats n_layer, n_head, d_model, d_head, d_inner, dropout,
# dropatt, div_val, tgt_len and mem_len from the train anchor. Its
# same_length (false) and clamp_len (-1) differ from the eval anchor's values.
# NOTE(review): which of those wins is decided by the consumer — confirm.
manual_eval:
   train:
      <<: *train
   eval:
      <<: *eval
      # Folded scalar: each line break below becomes exactly one space, so
      # the value is the same single-line JSON string. Keep lines free of
      # trailing whitespace and at the same indentation.
      manual_config: >-
         {"n_token": 267735, "n_layer": 18, "n_head": 16, "d_model": 1024,
         "d_head": 64, "d_inner": 4096, "dropout": 0.2, "dropatt": 0.2,
         "dtype": null, "tie_weight": true, "d_embed": 1024, "div_val": 4,
         "tie_projs": [false, true, true, true], "pre_lnorm": false,
         "tgt_len": 384, "ext_len": 0, "mem_len": 384,
         "cutoffs": [19997, 39997, 199997], "same_length": false,
         "attn_type": 0, "clamp_len": -1, "sample_softmax": -1}
