# Reproducibility
# Seed for random number generation. Presumably applied by the training
# script to every RNG it uses -- TODO confirm which ones are seeded.
random_seed: 13

# Optimization hyperparams
# batch_size is 8x device_batch_size (512 / 64). Presumably batch_size is the
# effective batch per optimizer step and the factor of 8 comes from data
# parallelism and/or gradient accumulation -- TODO confirm against the
# training script.
batch_size: 512
device_batch_size: 64
# Presumably the number of tokens per training sequence -- TODO confirm.
sequence_length: 512
num_iterations: 2000
learning_rate: 0.0005
# The default scheduler applies cosine annealing from learning_rate down to
# learning_rate / 10 (0.00005 with the value above).
# Without a scheduler (use_scheduler: false) the learning rate stays constant.
use_scheduler: true

# Evaluation and logging hyperparams
# Whether to save the model at checkpoints
save_model: true
# Whether to save the optimizer and scheduler at checkpoints
save_optimizer: true
# Checkpoint interval; presumably counted in training iterations
# (cf. num_iterations) -- TODO confirm.
save_model_every: 500
# Validation-loss interval; presumably counted in training iterations
# (cf. num_iterations) -- TODO confirm.
val_loss_every: 50
# 1048576 = 2**20 (= 2048 * 512).
# NOTE(review): this value was previously annotated "8*64*1024", but that
# product is 524288, not 1048576 -- confirm which of the two is intended.
val_tokens: 1048576

# wandb
# Experiment name; null leaves it unset. Presumably used as the Weights &
# Biases run name -- TODO confirm how the training script handles null.
expname: null
