# @package _global_
# Hydra defaults list. Entries are composed in the order written, so a later
# entry can override keys set by an earlier one. A leading "/" makes the
# config-group path absolute, and "group@pkg" mounts the selected option at
# package `pkg` instead of the group's default location.
# Keep the `# @package` directive as the first line of the file: do not
# insert ordinary comments above it.
defaults:
  - /data: fineweb_edu/llama
  - /model/backbone: sequence
  # The same `layer` group is selected twice and mounted at two different
  # nodes: model.backbone.seq_cell and model.backbone.hidden_cell.
  - /model/backbone/layer@model.backbone.seq_cell: ratplus16localprefixfgatesimple
  - /model/backbone/layer@model.backbone.hidden_cell: ffn
  - /model/head: lm
  - /model/embedding: lm
  # The `rope` option of the embedding group is mounted at model.embedding.pe.
  - /model/embedding@model.embedding.pe: rope
  - /task: lm
  - /optim/lr_scheduler: cosine
  - /optim/optimizer: adamw
  # `_self_` is last, so the values written in this file are merged after
  # (and take precedence over) everything selected by the entries above.
  - _self_

# Override on top of the `fineweb_edu/llama` data option from the defaults
# list. trainer.global_batch_size interpolates this value, so this is the
# single place to change it.
# NOTE(review): unit (sequences vs. tokens) is not visible here - confirm
# against the data module.
data:
  global_batch_size: 512

# Overrides applied on top of the `adamw` optimizer and `cosine` scheduler
# options selected in the defaults list.
optim:
  optimizer:
    lr: 8.0e-4
    weight_decay: 0.1
    betas: [0.9, 0.95]
  lr_scheduler:
    # NOTE(review): fractional value despite the `_iter` suffix; presumably
    # interpreted as a fraction of total training iterations rather than an
    # absolute step count - confirm against the scheduler implementation.
    warmup_iter: 0.05
    # NOTE(review): presumably the floor the cosine schedule decays to (as
    # `eta_min` does in PyTorch's CosineAnnealingLR) - confirm.
    eta_min: 1.0e-6

# Weights & Biases settings for this experiment.
# NOTE(review): presumably forwarded as the `job_type` of the W&B run, which
# is used to group/filter runs - confirm where the logger is initialised.
wandb:
  job_type: fineweb_pretrain

# Overrides for the `sequence` backbone and for the layer option mounted at
# model.backbone.seq_cell by the defaults list.
model:
  backbone:
    num_layers: 24
    d_model: 2048
    dropout: 0.0
    bias: false
    ln: rmsnorm
    init:
      _name_: fixed
      initializer_range: 0.02
    seq_cell:
      # NOTE(review): d_model / d_head = 2048 / 128 = 16; whether the cell
      # derives its head count this way is not visible here - confirm.
      d_head: 128
      # NOTE(review): the difference between `chunk_size` and `chunk_size1`
      # is defined by the seq_cell implementation, not by this file - confirm
      # which one governs which chunking level before changing either.
      chunk_size: 1
      chunk_size1: 64
      # NOTE(review): 0 presumably disables the prefix / local components of
      # the cell - confirm.
      prefix_size: 0
      local_size: 0
      apply_re: true
      mix_train: true

# Override for the `lm` task option.
# NOTE(review): -100 matches the default `ignore_index` of PyTorch's
# CrossEntropyLoss; presumably targets with this value are excluded from the
# loss - confirm in the task implementation.
task:
  ignore_index: -100

# Training-loop settings.
trainer:
  # OmegaConf interpolation: resolves to data.global_batch_size, so the
  # trainer and the data pipeline always share one value.
  global_batch_size: ${data.global_batch_size}
  # NOTE(review): presumably a max gradient-norm threshold - confirm whether
  # the trainer clips by norm or by value.
  gradient_clipping: 1.0
  max_epoch: 1
  # NOTE(review): presumably resumes from an existing checkpoint when one is
  # found - confirm the behaviour when none exists.
  load_checkpoint: true
  save_checkpoint: true
  # NOTE(review): unit is presumably optimizer steps - confirm.
  log_interval: 100
  eval_when_log: false
  dtype: bfloat16
