# Reproducibility
# Integer seed for the run.
# NOTE(review): which random number generators the consumer seeds with this
# value is not visible in this file -- confirm in the training code.
random_seed: 13

# Optimization hyperparams
# batch_size is 8x device_batch_size (512 / 64).
# NOTE(review): presumably the gap is covered by multiple devices and/or
# gradient accumulation -- confirm against the training script.
batch_size: 512
device_batch_size: 64
# NOTE(review): presumably counted in tokens per sequence -- confirm.
sequence_length: 1024
num_iterations: 15000
learning_rate: 0.0005
# When a scheduler is used, the default one applies cosine annealing from
# learning_rate down to learning_rate / 10.
# When no scheduler is used, the learning rate is kept constant.
use_scheduler: true

# Evaluation and logging hyperparams
# Whether to save the model at checkpoints
save_model: true
# Whether to save the optimizer and scheduler at checkpoints
save_optimizer: true
# Checkpoint interval.
# NOTE(review): presumably measured in training iterations -- confirm.
save_model_every: 500
# Validation-loss interval.
# NOTE(review): presumably measured in training iterations -- confirm.
val_loss_every: 100
# 1048576 = 2^20 = 16 * 64 * 1024, i.e. 16 * device_batch_size *
# sequence_length with the values set in this file.
# NOTE(review): this comment previously read 8*64*1024, which is 524288 and
# does not match the value; confirm the value (not the old factorization) is
# the intended one.
val_tokens: 1048576 # 16*64*1024

# wandb
# NOTE(review): presumably the experiment/run name passed to Weights & Biases.
# null leaves it unset here; how the consumer handles a null name is not
# visible in this file -- confirm.
expname: null
