# Experiment identifiers. Single-quoted because both are plain literal strings
# that need no escape processing.
project: 'rescion-experiment'
run: 'rescion'

# =====================
# Data
# =====================
# Shard path patterns for the training and validation splits. YAML itself does
# not expand the `*`; presumably the data loader globs it (confirm). Quoted so
# the wildcard can never be read as YAML alias syntax if a pattern is edited to
# start with `*`.
input_bin: 'data/fineweb10B/fineweb_train_*.bin'
input_val_bin: 'data/fineweb10B/fineweb_val_*.bin'

# =====================
# Optimization / training
# =====================
# NOTE(review): presumably the per-device micro-batch size. Both curriculum
# batch_size values (256 and 512) are whole multiples of 64 — confirm how the
# trainer derives gradient-accumulation steps from the two settings.
device_batch_size: 64

# =====================
# Evaluation / logging
# =====================
# NOTE(review): interval is presumably measured in training steps — confirm in
# the trainer.
val_loss_every: 125
# 10 * 2**20 tokens; divides evenly by both curriculum sequence lengths
# (1024 and 2048).
val_tokens: 10485760
# NOTE(review): 0 presumably disables periodic checkpointing — confirm.
save_every: 0

# =====================
# Model architecture
# =====================
# 768 / 6 = 128, so each attention head gets 128 channels if n_embd is split
# evenly across heads — NOTE(review): confirm against the model definition.
n_layer: 12
n_head: 6
n_embd: 768

# =====================
# Scion optimizer
# =====================
# NOTE(review): presumably selects between the constrained and unconstrained
# variants of the optimizer update — confirm.
unconstrained: false
# NOTE(review): 0.1 is small for a classical momentum coefficient; presumably
# it is the weight given to the new gradient in an averaging-style buffer
# update — confirm against the optimizer implementation before tuning.
momentum: 0.1
weight_decay: 0.0

# NOTE(review): presumably per-parameter-group scale factors (embeddings vs.
# weight matrices), mirroring the lr_embed / lr_matrix split used in the
# curriculum — confirm.
scale_embed: 50
scale_matrix: 3000

# =====================
# Curriculum (step-based iteration switching)
# =====================
curriculum:
  # One list entry per training phase; both phases set the same keys.
  # NOTE(review): step_until looks like the cumulative step count at which the
  # phase ends — confirm whether the boundary step itself is inclusive.
  # Keep the decimal point and signed exponent in the learning rates: YAML 1.1
  # loaders such as PyYAML only resolve that form as a float (a bare `1e-4`
  # would load as a string).

  # Phase 1: original TTP
  - step_until: 5100
    batch_size: 256
    sequence_length: 1024
    lr_embed: 3.6e-4
    lr_matrix: 3.6e-4
    warmup_iters: 0
    warmdown_iters: 0

  # Phase 2: scaled batch+seq, half LR, cooldown
  # Batch size and sequence length both double (256 -> 512, 1024 -> 2048) and
  # both learning rates halve (3.6e-4 -> 1.8e-4). If step_until is cumulative,
  # this phase lasts 9350 - 5100 = 4250 steps and warmdown_iters (2975) covers
  # 70% of it.
  - step_until: 9350
    batch_size: 512
    sequence_length: 2048
    lr_embed: 1.8e-4
    lr_matrix: 1.8e-4
    warmup_iters: 0
    warmdown_iters: 2975
