# Reproducibility
# Integer seed for random number generation.
# NOTE(review): which RNGs the consumer actually seeds with this value is not
# visible from this file — confirm against the code that loads it.
random_seed: 13

# Optimization hyperparams
# NOTE(review): batch_size is presumably the effective batch per optimizer
# step and device_batch_size the per-device micro-batch, with the difference
# made up by gradient accumulation and/or multiple devices — confirm against
# the training script.
batch_size: 256
device_batch_size: 1
# NOTE(review): presumably measured in tokens per sample — confirm.
sequence_length: 8192
# NOTE(review): presumably counted in optimizer steps — confirm.
num_iterations: 1000
# Equal to 3e-5. Kept in plain decimal notation on purpose: exponent forms
# without a decimal point (e.g. 3e-5) are loaded as strings by some YAML 1.1
# parsers.
learning_rate: 0.00003
# When true, the default scheduler applies cosine annealing from
# learning_rate down to learning_rate / 10.
# When false (as set here), no scheduler is used and the learning rate stays
# constant.
use_scheduler: false

# Evaluation and logging hyperparams
# Whether to save the model at checkpoints.
save_model: true
# Whether to also save the optimizer and scheduler state at checkpoints.
save_optimizer: true
# Interval between checkpoints.
# NOTE(review): presumably counted in training iterations (100 would give 10
# checkpoints over num_iterations: 1000) — confirm against the training script.
save_model_every: 100

# wandb
# Experiment name used for wandb logging. Left as an explicit null here, i.e.
# no name is set in this file.
# NOTE(review): presumably the consumer either requires an override or falls
# back to an auto-generated name when this is null — confirm.
expname: null
