# =====================
# Run identity
# =====================
# Names for the experiment group and for this individual run.
# NOTE(review): how they are consumed (logger, output paths) is not visible
# in this file — confirm against the loader.
project: 'rescion-experiment'
run: 'scionlight-1B-10k'
# =====================
# Data
# =====================
# Relative paths to the training and validation .bin files. Each contains a
# `*` wildcard, so it presumably matches multiple shard files and is expanded
# by the training script — TODO confirm.
input_bin: 'data/fineweb10B/fineweb_train_*.bin'
input_val_bin: 'data/fineweb10B/fineweb_val_*.bin'

# =====================
# Optimization / training
# =====================
# NOTE(review): units are not stated in this file. If batch_size counts
# sequences, one iteration covers 512 * 2048 = 1,048,576 tokens, and
# 512 / 16 = 32 device-sized micro-batches make up one full batch (spread
# across devices and/or accumulation steps) — confirm against the trainer.
batch_size: 512
device_batch_size: 16
sequence_length: 2048
num_iterations: 10625
# Zero warmup iterations are configured.
warmup_iters: 0
# 2975 / 10625 = 0.28, i.e. 28% of num_iterations.
warmdown_iters: 2975

# =====================
# Evaluation / logging
# =====================
# 125 divides num_iterations evenly: 10625 / 125 = 85.
val_loss_every: 125
# 10485760 = 10 * 1,048,576 (10 * 2^20).
val_tokens: 10485760
# NOTE(review): presumably 0 disables periodic checkpoint saving — verify
# against the training script.
save_every: 0

# =====================
# Model architecture
# =====================
# NOTE(review): n_embd / n_head = 2048 / 16 = 128, presumably the per-head
# dimension — confirm against the model definition.
n_layer: 18
n_head: 16
n_embd: 2048

# =====================
# Scion optimizer
# =====================
unconstrained: false
# NOTE(review): 0.1 would be unusually low for a conventional momentum
# coefficient; it may instead be the weight given to the newest gradient —
# confirm against the optimizer implementation.
momentum: 0.1
# Weight decay is set to zero for this run.
weight_decay: 0.0

# The *_embed and *_matrix keys share one learning rate but differ in scale.
# Keep the decimal point and signed exponent in these values: YAML 1.1
# resolvers such as PyYAML read `1.8e-4` as a float but `18e-5` as a string.
lr_embed: 1.8e-4
lr_matrix: 1.8e-4

# NOTE(review): the meaning of "scale" (e.g. a per-group norm radius) is not
# visible here — confirm against the optimizer implementation.
scale_embed: 50
scale_matrix: 3000

# Boolean in canonical lowercase, matching `unconstrained: false` in this
# file; capitalized `False` is only a boolean under some YAML schemas.
# NOTE(review): presumably toggles experiment tracking — confirm against the
# training script.
track: false