# Run identity and top-level switches.
# run_name is reused through ${run_name} by swanlab.name and save_folder.
run_name: llama-130M
seed: 6198
dry_run: false
model_type: llama

# Raw parquet inputs. Quoted so the glob and the "..." path segments are
# unambiguously literal strings.
dataset_path: '/.../Fineweb/100BT/*.parquet'
eval_dataset_path: '/.../Fineweb/eval_013_00008.parquet'

# SwanLab logging target. The run name is taken from run_name via
# interpolation; quoting keeps the template a plain string for the YAML parser.
swanlab:
  name: '${run_name}'
  project: olmo-llama-130M-1024

# Training length and batch geometry.
max_duration: 1ep
# Written without the `_` digit separator: `5_200` resolves to an integer only
# under YAML 1.1 rules; a YAML 1.2 parser reads it as the string "5_200".
stop_at: 5200
max_epochs: 100
LLR_ratio: 0.2
# 512 / 32 = 16 microbatches make up one global batch (presumably spread over
# devices and/or gradient-accumulation steps — confirm against the trainer).
global_train_batch_size: 512
device_train_microbatch_size: 32

# Learning-rate schedule.
# t_warmup (520) is 10% of stop_at (5200), assuming both are counted in the
# same unit — confirm.
scheduler:
  name: cosine_with_warmup
  t_warmup: 520
  # presumably the final LR as a fraction of the peak LR — confirm
  alpha_f: 0.1
  warmup_min_lr: 0

# Model architecture. Key names appear to follow a Hugging Face style LLaMA
# config (presumably consumed as such — confirm).
llama_model:
  bos_token_id: 0
  eos_token_id: 1
  hidden_act: silu
  hidden_size: 768
  intermediate_size: 2048
  initializer_range: 0.02
  max_sequence_length: 1024
  model_type: llama
  # 768 / 12 = 64 dimensions per attention head.
  num_attention_heads: 12
  num_hidden_layers: 12
  pad_token_id: -1
  # Mantissa carries a decimal point on purpose: YAML 1.1 loaders such as
  # PyYAML only resolve exponent notation to a float when a `.` is present,
  # and would load `1e-06` as the string "1e-06".
  rms_norm_eps: 1.0e-06
  # Quoted so it is explicitly a string, not a number-like scalar.
  transformers_version: '4.28.1'
  use_cache: true
  # NOTE(review): confirm this matches the vocabulary of the tokenizer that
  # produced the pre-tokenized .npy shards listed under data.paths.
  vocab_size: 32128
  # The same key is also set at the top level of this file; keep both in sync.
  precision: amp_bf16

# Settings for the ddp distributed strategy (distributed_strategy is set to
# ddp further down in this file).
ddp:
  grad_sync_mode: batch
  find_unused_params: false

# No compile options are given (explicit null).
compile: null

# Optimizer hyper-parameters.
optimizer:
  name: adamw
  embed_name: embed
  head_name: head
  learning_rate: 6.0e-4
  # -1 presumably means "not set" for these per-group rates — confirm.
  muon_lr: -1
  embed_lr: -1
  weight_decay: 0.1
  # Mantissa carries a decimal point on purpose: YAML 1.1 loaders such as
  # PyYAML would load `1e-8` as a string rather than a float.
  eps: 1.0e-8
  decay_norm_and_bias: true
  decay_embeddings: true
  # Sequence indented under its key, matching the other lists in this file.
  betas:
    - 0.9
    - 0.95
  metrics_log_interval: 10

# LLR options (judging by the key names, these control per-layer/module
# learning-rate scaling — confirm against the trainer).
# Booleans are written as lowercase true/false, the canonical YAML spelling
# already used by the other sections of this file.
LLR:
  use_modulewise_lr: false
  alpha_positively_with_lr: true
  grad_alpha_metric: grad
  num_grad_steps: 0
  grad_unbalancedlr_every: 1
  assign_func: tb_linear_map
  # Lower and upper bounds of the LR scaling ratio (presumably multipliers on
  # the base learning rate — confirm).
  lr_min_ratio: 1
  lr_max_ratio: 3
  unbalancedlr_every: 40
  linear_steps: 20
  pl_fitting: median
  remove_last_layer: false
  remove_first_layer: false
  batchnorm: true
  filter_zeros: false
  esd_metric_for_tb: alpha
  xmin_pos: 2
  batchnorm_type: name
  log_LR_every: 100

# Tokenizer definition file and the side on which over-long inputs are
# truncated.
tokenizer:
  identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json
  truncate_direction: right

# Local checkpoint directory, derived from run_name.
# NOTE(review): this line used to carry the remark "doesn't matter since we'll
# upload to S3", but remote_save_folder is null in this file — confirm whether
# checkpoints are kept only in this local folder.
save_folder: workspace/${run_name}
# Explicit null: a bare `key:` also loads as null but reads like a forgotten
# value.
remote_save_folder: null
save_overwrite: true

# Unsharded checkpoints (for ddp): how often one is written (presumably in
# training steps — confirm) and how many of them are retained.
save_interval_unsharded: 1000
save_num_unsharded_checkpoints_to_keep: 1

# No checkpoint path is given to load from (explicit null).
load_path: null

# Run-wide precision and distributed strategy. llama_model.precision carries
# the same value; keep both in sync.
precision: amp_bf16
distributed_strategy: ddp

gen1_gc_interval: 1

# Gradient-norm limit (presumably used for clipping — confirm); the
# ratio-based variant is left unset.
max_grad_norm: 1.0
max_grad_norm_ratio: null

# Throughput monitoring; window_size is presumably the number of recent
# batches averaged over — confirm.
speed_monitor:
  window_size: 20

# Evaluation cadence and datasets.
eval_interval: 400
eval_subset_num_batches: -1
# Follows device_train_microbatch_size through interpolation; quoted so the
# template is a plain string for the YAML parser.
device_eval_batch_size: '${device_train_microbatch_size}'
evaluators:
  - label: all-small-ppl-validation
    data:
      num_workers: 0
      drop_last: true
      datasets:
        # The file name refers to part 00039, which is not among the
        # part-00000 … part-00038 files listed under data.paths.
        fineweb-validation:
          - '/.../eval-fineweb-part-00039.npy'

# Training data loader settings. Several keys (num_workers, pin_memory,
# prefetch_factor, persistent_workers, timeout) share their names with PyTorch
# DataLoader arguments and are presumably forwarded to it — confirm.
data:
  pad_direction: right
  num_workers: 8
  drop_last: true
  pin_memory: true
  prefetch_factor: 8
  persistent_workers: true
  timeout: 0
  # Thresholds for filtering instances by repetition: periods 1 through 13,
  # count limit 32 (exact semantics are defined by the consumer — confirm).
  instance_filter:
    repetition_max_period: 13
    repetition_min_period: 1
    repetition_max_count: 32
  paths:
    # Pre-tokenized FineWeb shards, part-00000 through part-00038 (39 files).
    # NOTE(review): this comment previously read "C4 (138.4 GT)", which does
    # not match the TokenizedFineWeb paths below — confirm the dataset label
    # and token count.
    - /.../TokenizedFineWeb/part-00000.npy
    - /.../TokenizedFineWeb/part-00001.npy
    - /.../TokenizedFineWeb/part-00002.npy
    - /.../TokenizedFineWeb/part-00003.npy
    - /.../TokenizedFineWeb/part-00004.npy
    - /.../TokenizedFineWeb/part-00005.npy
    - /.../TokenizedFineWeb/part-00006.npy
    - /.../TokenizedFineWeb/part-00007.npy
    - /.../TokenizedFineWeb/part-00008.npy
    - /.../TokenizedFineWeb/part-00009.npy
    - /.../TokenizedFineWeb/part-00010.npy
    - /.../TokenizedFineWeb/part-00011.npy
    - /.../TokenizedFineWeb/part-00012.npy
    - /.../TokenizedFineWeb/part-00013.npy
    - /.../TokenizedFineWeb/part-00014.npy
    - /.../TokenizedFineWeb/part-00015.npy
    - /.../TokenizedFineWeb/part-00016.npy
    - /.../TokenizedFineWeb/part-00017.npy
    - /.../TokenizedFineWeb/part-00018.npy
    - /.../TokenizedFineWeb/part-00019.npy
    - /.../TokenizedFineWeb/part-00020.npy
    - /.../TokenizedFineWeb/part-00021.npy
    - /.../TokenizedFineWeb/part-00022.npy
    - /.../TokenizedFineWeb/part-00023.npy
    - /.../TokenizedFineWeb/part-00024.npy
    - /.../TokenizedFineWeb/part-00025.npy
    - /.../TokenizedFineWeb/part-00026.npy
    - /.../TokenizedFineWeb/part-00027.npy
    - /.../TokenizedFineWeb/part-00028.npy
    - /.../TokenizedFineWeb/part-00029.npy
    - /.../TokenizedFineWeb/part-00030.npy
    - /.../TokenizedFineWeb/part-00031.npy
    - /.../TokenizedFineWeb/part-00032.npy
    - /.../TokenizedFineWeb/part-00033.npy
    - /.../TokenizedFineWeb/part-00034.npy
    - /.../TokenizedFineWeb/part-00035.npy
    - /.../TokenizedFineWeb/part-00036.npy
    - /.../TokenizedFineWeb/part-00037.npy
    - /.../TokenizedFineWeb/part-00038.npy