# Reproducibility
# Seed for random number generation, fixed so that runs can be repeated.
random_seed: 13

# Optimization hyperparameters
# batch_size is counted in sequences per iteration (see the token-budget
# arithmetic below); each sequence is sequence_length tokens long.
batch_size: 512
# NOTE(review): presumably the per-device micro-batch size, with gradient
# accumulation making up the difference to batch_size -- confirm in the trainer.
device_batch_size: 16
sequence_length: 1024
# Token budget: 18000 iterations * 1024 tokens * 512 sequences
# = 9,437,184,000 tokens (about 9.4B, i.e. roughly 10B).
# 19000 iterations would be more than one epoch of the training data.
num_iterations: 18000
learning_rate: 0.00003
# When enabled, the default scheduler uses cosine annealing down to
# learning_rate / 10. When disabled, the learning rate is kept constant.
use_scheduler: false

# Evaluation and logging hyperparameters
# Whether to save the model at checkpoints
save_model: true
# Whether to also save the optimizer and scheduler at checkpoints
save_optimizer: true
# NOTE(review): checkpoint interval, presumably in training iterations -- confirm.
save_model_every: 2000
# NOTE(review): validation-loss interval, presumably in training iterations -- confirm.
val_loss_every: 200
val_tokens: 1048576 # 2^20 = 16*64*1024 (8*64*1024 would be only 524288)

# wandb
# Experiment name used for wandb logging; null leaves it unset.
# NOTE(review): presumably the trainer then falls back to a default or
# auto-generated run name -- confirm against the code that reads this key.
expname: null
