# Enable SubTransformer training. The rationale for still declaring a
# SuperTransformer is given in the "SuperTransformer configs" section of this file.
train-subtransformer: True

# model, dataset and language pair
# NOTE(review): `arch` presumably names an architecture registered by the
# training code - confirm that this identifier exists there.
arch: transformersuper_iwslt_de_en
# NOTE(review): looks like a per-batch token budget rather than a sentence
# count - confirm against the trainer's option documentation.
max-tokens: 4096
# Relative path, so it presumably resolves against the directory the trainer
# is launched from - verify before running from another location.
data: data/binary/iwslt14_de_en
# Translation direction: source language `de`, target language `en`.
source-lang: de
target-lang: en

# training settings
optimizer: adam
# Plain (unquoted) scalar: YAML loads this as the string "(0.9, 0.98)", not as
# a sequence. Presumably the trainer parses the tuple itself - confirm.
adam-betas: (0.9, 0.98)
weight-decay: 0.0001
dropout: 0.375
criterion: label_smoothed_cross_entropy
# Smoothing amount used with the label_smoothed_cross_entropy criterion above.
label-smoothing: 0.1

# NOTE(review): presumably turns on half-precision (FP16) training - confirm
# the target hardware supports it before reusing this config.
fp16: True

# Learning-rate schedule, using the values set below:
#   1. warm up from warmup-init-lr (1e-7) to lr (0.0005) over
#      warmup-updates (4000) steps;
#   2. then anneal with the inverse-sqrt scheduler for the remaining
#      max-update - warmup-updates = 36000 steps.
max-update: 40000
warmup-updates: 4000
lr-scheduler: inverse_sqrt
# Written without a decimal point: YAML 1.1 loaders (e.g. PyYAML) read this as
# the string "1e-7", while YAML 1.2 loaders read it as a float.
warmup-init-lr: 1e-7
lr: 0.0005

# checkpointing and validation cadence (no logging options are set in this group)
# NOTE(review): keep-last-epochs is counted in epochs per its name; the two
# intervals below are presumably in epochs as well - confirm.
keep-last-epochs: 20
save-interval: 10
validate-interval: 10

# SuperTransformer configs

# We train the SubTransformer inside a SuperTransformer, so a SuperTransformer must be specified here.
# Algorithmically, a SubTransformer could be trained completely standalone, without specifying any SuperTransformer.
# In the implementation, however, it is easier to train a SubTransformer by always sampling the same desired SubTransformer from a specified SuperTransformer.

# Embedding dimension for the encoder and decoder (kept equal here).
encoder-embed-dim: 640
decoder-embed-dim: 640

# Feed-forward (FFN) embedding dimension for the encoder and decoder.
encoder-ffn-embed-dim: 2048
decoder-ffn-embed-dim: 2048

# Number of layers in each stack.
encoder-layers: 6
decoder-layers: 6

# Number of attention heads per layer.
encoder-attention-heads: 4
decoder-attention-heads: 4

# NOTE(review): qkv-dim (512) differs from the embed dims (640), so the
# query/key/value width is presumably configured independently of the
# embedding width - confirm against the model code.
qkv-dim: 512
