# =====================
# Experiment identity
# =====================
# NOTE(review): the run name says "40k" while num_iterations below is 42500 --
# confirm the naming is intentional.
project: "rescion-experiment"
run: "scion-40k"
# =====================
# Data
# =====================
# Glob patterns (note the `*`) for the training and validation .bin files;
# presumably expanded by the training script -- confirm against the loader.
# Quoted like the other string values in this file (project, run) so that the
# `*` can never be read as a YAML alias indicator if a pattern is later edited
# to start with it. The loaded string values are unchanged.
input_bin: "data/fineweb10B/fineweb_train_*.bin"
input_val_bin: "data/fineweb10B/fineweb_val_*.bin"

# =====================
# Optimization / training
# =====================
# NOTE(review): batch_size / device_batch_size = 256 / 16 = 16; presumably that
# factor is covered by multiple devices and/or gradient accumulation --
# confirm against the training script.
batch_size: 256
device_batch_size: 16
sequence_length: 1024
num_iterations: 42500
# Zero warmup iterations are configured.
warmup_iters: 0
# 11900 / 42500 = 28% of num_iterations.
warmdown_iters: 11900

# =====================
# Evaluation / logging
# =====================
# NOTE(review): presumably measured in training iterations -- confirm.
val_loss_every: 125
# 10485760 = 10 * 2**20 = 10240 * 1024, i.e. 10240 windows if validation uses
# sequence_length: 1024 -- TODO confirm.
val_tokens: 10485760
# NOTE(review): 0 presumably disables periodic checkpoint saving -- confirm
# how the training script treats a zero interval.
save_every: 0

# =====================
# Model architecture
# =====================
n_layer: 12
n_head: 6
# 768 / 6 = 128 per head, assuming n_embd is split evenly across n_head --
# TODO confirm against the model definition.
n_embd: 768

# =====================
# Scion optimizer
# =====================
unconstrained: false
momentum: 0.1
weight_decay: 0.0

# Learning rates; both groups currently use the same value.
# Keep the decimal point and the signed exponent: YAML 1.1 loaders such as
# PyYAML resolve `3.6e-4` as a float, but a form without a dot (e.g. `36e-5`)
# would be loaded as a string.
lr_embed: 3.6e-4
lr_matrix: 3.6e-4

# NOTE(review): presumably per-group scale factors for embedding vs. matrix
# parameters -- confirm their exact meaning against the optimizer code.
scale_embed: 50
scale_matrix: 3000

# NOTE(review): presumably toggles external experiment tracking -- confirm
# against the training script.
# Written as canonical lowercase `false`, matching `unconstrained: false`;
# it loads as the same boolean as the previous `False` spelling.
track: false