# Run identity and reproducibility settings.
# run_name is reused via ${run_name} for swanlab.name and save_folder.
run_name: OLMo-20M
seed: 6198
dry_run: false
model_type: olmo

# Settings under the `swanlab` key; `name` is interpolated from the
# top-level run_name, so renaming the run renames this entry as well.
swanlab:
  name: ${run_name}
  project: olmo-tiny

# Training length and batch sizing.
max_duration: 1ep
# Written without a digit separator: `20_000` is an integer only under
# YAML 1.1 rules; YAML 1.2 core-schema parsers read it as a string.
stop_at: 20000
global_train_batch_size: 512
# Also reused as device_eval_batch_size through interpolation.
device_train_microbatch_size: 64

# Model architecture and weight initialization.
model:
  d_model: 256
  n_heads: 8
  n_layers: 8
  mlp_ratio: 8
  weight_tying: false
  # Positional scheme: ALiBi disabled, RoPE enabled.
  alibi: false
  rope: true
  flash_attention: true
  attention_dropout: 0.0
  attention_layer_norm: false
  clip_qkv: null
  include_bias: false
  block_type: sequential
  layer_norm_type: rms
  layer_norm_with_affine: true
  # Written with an explicit mantissa dot (same form as the optimizer's
  # learning_rate) so YAML 1.1 loaders such as PyYAML resolve it as a
  # float rather than a string.
  layer_norm_eps: 1.0e-6
  bias_for_layer_norm: false
  attention_layer_norm_with_affine: false
  activation_type: swiglu
  residual_dropout: 0.0
  embedding_dropout: 0.0
  max_sequence_length: 1024
  # embedding_size (50304) is larger than vocab_size (50280).
  # NOTE(review): presumably padding for hardware efficiency — confirm.
  vocab_size: 50280
  embedding_size: 50304
  eos_token_id: 0
  pad_token_id: 1
  init_device: cuda
  init_fn: normal
  init_std: 0.02
  init_cutoff_factor: 3

# Options for the DDP strategy selected by distributed_strategy.
ddp:
  grad_sync_mode: batch
  find_unused_params: false

# Explicitly unset (null).
compile: null

# Optimizer selection and hyperparameters.
optimizer:
  name: adamw
  embed_name: embed
  head_name: head
  learning_rate: 6.0e-4
  # NOTE(review): muon_lr is set although name is adamw — presumably only
  # read when a Muon-style optimizer is selected; confirm.
  muon_lr: 6.0e-3
  # NOTE(review): -1 looks like a "not set" sentinel — confirm with the
  # consuming code.
  embed_lr: -1
  weight_decay: 0.1
  # Explicit mantissa dot (same form as learning_rate) so YAML 1.1 loaders
  # such as PyYAML resolve this as a float rather than a string.
  eps: 1.0e-8
  decay_norm_and_bias: true
  decay_embeddings: true
  # Sequence items indented to match the other sequences in this file.
  betas:
    - 0.9
    - 0.95
  metrics_log_interval: 10

# Module-wise learning-rate (LLR) settings.
# NOTE(review): the meaning of individual keys is defined by the consuming
# trainer and is not visible from this file; only the values are recorded.
# Booleans use canonical lowercase true/false so every YAML version and
# linter agrees on their type.
LLR:
  use_modulewise_lr: true
  alpha_positively_with_lr: true
  grad_alpha_metric: grad
  num_grad_steps: 0
  grad_unbalancedlr_every: 1
  assign_func: tb_linear_map
  # Bounds expressed as ratios (0.666 .. 3).
  lr_min_ratio: 0.666
  lr_max_ratio: 3
  unbalancedlr_every: 10
  pl_fitting: median
  remove_last_layer: false
  remove_first_layer: false
  batchnorm: true
  filter_zeros: false
  esd_metric_for_tb: alpha
  xmin_pos: 2
  batchnorm_type: name
  log_LR_every: 100

# Learning-rate schedule: cosine with a 2000-unit warmup.
# NOTE(review): t_warmup is presumably counted in steps, and alpha_f is
# presumably the final LR as a fraction of the peak — confirm both.
scheduler:
  name: cosine_with_warmup
  t_warmup: 2000
  alpha_f: 0.1
  warmup_min_lr: 0

# Tokenizer definition, given as a relative path to a JSON file.
# NOTE(review): the path is relative — confirm the working directory the
# trainer resolves it against.
tokenizer:
  identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json
  truncate_direction: right

# Checkpoint output location, derived from run_name.
# NOTE(review): an earlier comment here said the local folder "doesn't
# matter since we'll upload to S3", but remote_save_folder is unset, so no
# remote destination is configured in this file — confirm the intent.
save_folder: workspace/${run_name}
remote_save_folder: null
save_overwrite: true

# Unsharded checkpoints (for ddp)
save_interval_unsharded: 5000
# NOTE(review): -1 presumably means "keep every checkpoint" — confirm.
save_num_unsharded_checkpoints_to_keep: -1

# No checkpoint path is given to load from (explicit null).
load_path: null

# Numeric precision mode and distributed strategy; strategy-specific
# options live under the `ddp` key.
precision: amp_bf16
distributed_strategy: ddp

# NOTE(review): presumably how often a generation-1 garbage collection is
# triggered — confirm units with the trainer.
gen1_gc_interval: 1

# Gradient clipping: absolute norm cap of 1.0; the ratio-based cap is unset.
max_grad_norm: 1.0
max_grad_norm_ratio: null

# Speed-monitor settings.
# NOTE(review): window_size is presumably the number of recent steps that
# are averaged — confirm.
speed_monitor:
  window_size: 20

# Evaluation cadence and evaluator definitions.
# eval_interval has the same value as save_interval_unsharded (5000).
eval_interval: 5000
# NOTE(review): -1 presumably means "use every batch" — confirm.
eval_subset_num_batches: -1
# Reuses the training microbatch size through interpolation.
device_eval_batch_size: ${device_train_microbatch_size}
evaluators:
  # Single evaluator over one validation file (part-00039; the training
  # paths under data.paths stop at part-00038).
  - label: all-small-ppl-validation
    data:
      num_workers: 0
      drop_last: true
      datasets:
        fineweb-validation:
          - /mnt/parallel_ssd/c4/eval-fineweb-part-00039.npy

# Training data loader settings and the list of tokenized shards.
data:
  pad_direction: right
  num_workers: 16
  drop_last: true
  pin_memory: true
  prefetch_factor: 8
  persistent_workers: true
  timeout: 0
  # Thresholds for filtering training instances by token repetition.
  # NOTE(review): exact filtering semantics are defined by the consumer.
  instance_filter:
    repetition_max_period: 13
    repetition_min_period: 1
    repetition_max_count: 32
  paths:
    # Training shards part-00000 .. part-00038 (39 files).
    # NOTE(review): the original header read "~> C4 (138.4 GT)", but these
    # paths point at TokenizedFineWeb under the c4/ directory — confirm
    # which dataset this is and whether the 138.4 GT figure still applies.
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00000.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00001.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00002.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00003.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00004.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00005.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00006.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00007.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00008.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00009.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00010.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00011.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00012.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00013.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00014.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00015.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00016.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00017.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00018.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00019.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00020.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00021.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00022.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00023.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00024.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00025.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00026.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00027.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00028.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00029.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00030.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00031.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00032.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00033.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00034.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00035.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00036.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00037.npy
    - /mnt/parallel_ssd/c4/TokenizedFineWeb/part-00038.npy