# Reproducibility
# Fixed seed so repeated runs of this config are comparable.
# NOTE(review): which RNGs this seeds is decided by the consumer — confirm.
random_seed: 13

# Optimization hyperparams
# NOTE(review): presumably batch_size is the effective (global) batch in
# sequences and device_batch_size is the per-device micro-batch, with the gap
# covered by gradient accumulation and/or data parallelism — confirm against
# the training script.
batch_size: 256
device_batch_size: 1
# NOTE(review): presumably measured in tokens per example — confirm.
sequence_length: 8192
num_iterations: 2000
# i.e. 3e-5 (kept in plain decimal form; some YAML 1.1 loaders read a bare
# "3e-5" as a string rather than a float)
learning_rate: 0.00003
# Default scheduler uses cosine annealing down to learning_rate / 10.
# If no scheduler is used, the learning rate is kept constant.
use_scheduler: false

# Evaluation and logging hyperparams
# Whether to save model at checkpoints
save_model: true
# Whether to save the optimizer and scheduler at checkpoints
save_optimizer: true
# Checkpointing and validation-loss cadence.
# NOTE(review): presumably counted in training iterations — confirm.
save_model_every: 200
val_loss_every: 200
# 512 * 8192 = 4194304 tokens (i.e. 512 examples of sequence_length 8192)
val_tokens: 4194304

# wandb
# Experiment name; null means no name is set in this config.
# NOTE(review): presumably the consumer falls back to a default or
# auto-generated run name when this is null — confirm.
expname: null
