# Base
# Shared dataset settings. Anchored as *wt103 and merged into the train and
# eval mappings below.
wt103: &wt103
   data: '../data/wikitext-103/'
   dataset: 'wt103'

# Training settings. Anchored as *train; the merge key pulls `dataset` and
# `data` in from the wt103 anchor.
train: &train
   <<: *wt103

   # Model dimensions and dropout. The same values appear in the manual_config
   # JSON string under manual_eval.
   n_layer: 16
   n_head: 8
   d_model: 512
   d_head: 64
   d_inner: 2048
   dropout: 0.1
   dropatt: 0.0

   # Length settings. tgt_len and mem_len match the manual_config JSON; the
   # eval mapping below uses different tgt_len/mem_len values.
   tgt_len: 192
   mem_len: 192
   eval_tgt_len: 192

   # Optimizer, learning rate and step counts (grouping inferred from key
   # names; confirm against the consuming script).
   optim: 'jitlamb'
   lr: 0.01
   eta_min: 0.001
   warmup_step: 1000
   max_step: 40000

   # Remaining run options.
   cuda: true
   multi_gpu: 'ddp'
   batch_size: 256
   roll: true
   vocab: 'word'
   adaptive: true

   # Intervals for logging and evaluation (units not shown in this file).
   log_interval: 10
   eval_interval: 5000

# Evaluation settings. Anchored as *eval; the merge key pulls `dataset` and
# `data` in from the wt103 anchor.
eval: &eval
   <<: *wt103
   cuda: true
   split: 'test'

   # Length-related keys. Note tgt_len/mem_len differ from the 192/192 used in
   # the train mapping.
   tgt_len: 64
   mem_len: 640
   clamp_len: 400
   same_length: true

# Default profile: both sections are the shared train/eval anchors merged in
# unchanged, with no per-profile overrides.
default:
   train:
      # Shallow-merge every key from the *train anchor.
      <<: *train
   eval:
      # Shallow-merge every key from the *eval anchor.
      <<: *eval

# Profile identical to `default` except that the eval section carries an extra
# `manual_config` key holding a JSON document as a string.
manual_eval:
   train:
      <<: *train
   eval:
      <<: *eval
      # Folded block scalar (`>-`): each line break below folds into a single
      # space and the trailing newline is stripped, so the loaded value is one
      # line of JSON. Keep every continuation line at the same indentation and
      # free of trailing whitespace, or the folded string will change.
      # NOTE(review): tgt_len, mem_len, same_length and clamp_len in this JSON
      # differ from the sibling eval keys; which one wins depends on the
      # consumer, which is not visible here - confirm.
      manual_config: >-
         {"n_token": 267735, "n_layer": 16, "n_head": 8,
         "d_model": 512, "d_head": 64, "d_inner": 2048,
         "dropout": 0.1, "dropatt": 0.0, "dtype": null,
         "tie_weight": true, "d_embed": 512, "div_val": 1,
         "tie_projs": [false, true, true, true], "pre_lnorm": false,
         "tgt_len": 192, "ext_len": 0, "mem_len": 192,
         "cutoffs": [19997, 39997, 199997], "same_length": false,
         "attn_type": 0, "clamp_len": -1, "sample_softmax": -1}
