# model
# NOTE(review): option names look like fairseq-style command-line flags;
# confirm against the script that loads this file.
# Architecture identifier (the IWSLT14 de->en pairing is also reflected in the
# data path and language pair below).
arch: transformersuper_iwslt_de_en
# NOTE(review): presumably the per-batch token budget; confirm with the trainer.
max-tokens: 4096
# Dataset location, given as a relative path.
data: data/binary/iwslt14_de_en
# Translation direction: de (source) -> en (target).
source-lang: de
target-lang: en

# training settings
# NOTE(review): presumably the number of distributed workers (1 = single
# process); confirm with the trainer.
distributed-world-size: 1
optimizer: adam
# Unquoted, so a YAML loader reads this value as the plain string
# "(0.9, 0.98)"; the consumer is expected to convert it into a pair of
# numbers (TODO confirm where that happens).
adam-betas: (0.9, 0.98)
weight-decay: 0.0001
dropout: 0.375
# Loss selection; label-smoothing below is presumably the smoothing factor
# used by this criterion.
criterion: label_smoothed_cross_entropy
label-smoothing: 0.1

# Half-precision switch (presumably enables FP16 training; confirm with the
# trainer). Written as the canonical lowercase boolean so strict YAML loaders
# and linters all agree on the type.
fp16: true

# Learning-rate schedule:
#   1. warm up from warmup-init-lr to lr over the first warmup-updates steps;
#   2. then anneal with the inverse_sqrt scheduler for the remaining
#      (max-update - warmup-updates) steps, i.e. 40000 - 10000 = 30000 here.
max-update: 40000
warmup-updates: 10000
lr-scheduler: inverse_sqrt
# Keep the explicit ".0" in the mantissa: YAML 1.1 loaders (e.g. PyYAML) only
# recognise exponent notation as a float when it contains a dot; a bare 1e-7
# would be loaded as a string.
warmup-init-lr: 1.0e-7
lr: 0.0005

# checkpointing and validation cadence
# NOTE(review): all three values are presumably counted in epochs; confirm
# with the trainer.
keep-last-epochs: 20
save-interval: 10
validate-interval: 10

# SuperTransformer configs
# Each embed/ffn/layer/head value below equals the largest entry of the
# matching *-choice list in the "SubTransformers search space" section of this
# file. qkv-dim has no matching choice list in this file.
encoder-embed-dim: 640
decoder-embed-dim: 640

encoder-ffn-embed-dim: 2048
decoder-ffn-embed-dim: 2048

encoder-layers: 6
decoder-layers: 6

encoder-attention-heads: 4
decoder-attention-heads: 4

qkv-dim: 512

# SubTransformers search space
# Each list enumerates the candidate values for one dimension.
encoder-embed-choice: [640, 512]
decoder-embed-choice: [640, 512]

encoder-ffn-embed-dim-choice: [2048, 1024, 512]
decoder-ffn-embed-dim-choice: [2048, 1024, 512]

# The encoder list has a single entry (6); only the decoder layer count varies.
encoder-layer-num-choice: [6]
decoder-layer-num-choice: [6, 5, 4, 3, 2, 1]

encoder-self-attention-heads-choice: [4, 2]
decoder-self-attention-heads-choice: [4, 2]
decoder-ende-attention-heads-choice: [4, 2]

# Arbitrary encoder-decoder attention choices:
#   -1 = attend only to the last encoder layer
#    1 = attend to the last two encoder layers
#    2 = attend to the last three encoder layers
decoder-arbitrary-ende-attn-choice: [-1, 1, 2]
