# @package _global_
# Hydra defaults list: each `group: option` entry selects one option from a
# config group.
# NOTE(review): per Hydra's defaults-list documentation, entries are composed
# in order, so with `_self_` listed first the group configs below may override
# values set in this file — confirm this ordering is intended.
defaults:
  - _self_
  - trainer: lm
  - optimizer: lamb
  - scheduler: cosine_warmup # LR scheduler to use, choices=['cosine_warmup', 'inv_sqrt', 'dev_perf', 'constant']
  - loader: lm
  - dataset: wt103 # Dataset _name_, choices=['wt103', 'wt2', 'lm1b', 'enwik8', 'text8', 'ptb']
  - model: transformer
  - logger: null # set to wandb for wandb logging
  # Disabled: per-(dataset, attention) specialization (see the link below).
  # NOTE(review): the numeric indices in the interpolation do not match the
  # current list order (counting `_self_` as 0, `dataset` is entry 5 and
  # entry 3 is `scheduler`) — verify before re-enabling.
  # - attention: full
  # - dataset_attention: ${defaults.4.dataset}_${defaults.3.attention}
  # optional: true
  # https://hydra.cc/docs/patterns/specializing_config

# DATASET
# dataset:
#   num_workers: 4
# dataset:
  # vocab: word # Type of vocabulary, choices=['word', 'bpe']
  # train_loader_kwargs:
  #   pad_to_multiple: False
  # eval_loader_kwargs:
  #   pad_to_multiple: False

# model:
#   _target_: model.model.Model
#   cell: legs
#   cell_args:
#     d_model: 256
#   dropout: 0.0
#   depth: 1

# Run-level training options. Booleans are written in canonical lowercase
# form (`false`), which parses to the same value as the previous `False`.
train:
  seed: 1111 # Random seed
  # Legacy options kept for reference (disabled):
  # epochs: 19
  # tgt_len: 128 # Number of tokens to predict # moved to dataset
  # interval: step
  # monitor: val/loss
  # ema: 0.9995
  # ddp: False # Now handled automatically
  # wandb: False # substitute for wandb.mode
  checkpoint: false # NOTE(review): presumably toggles checkpoint saving — confirm against the training code
  load_from_path: null # NOTE(review): presumably a path to load saved state from; null = none — confirm
  test: false # Whether to evaluate on test set after training

# TRAINING
# max_step: 40000 # Max number of training steps
# seed: 1111 # Random seed

# smoke_test: False
# seed: [_sample_uniform, 0, 65536]
# hydra:
#   run:
#     dir: ${env:RESULT_DIR,.}/outputs/${dataset._name_}_${now:%Y-%m-%d}/${now:%H-%M-%S}
#   sweep:
#     dir: ${env:RESULT_DIR,.}/multirun/${dataset._name_}_${now:%Y-%m-%d}/${now:%H-%M-%S}
