training:
  batch_size: 256
  lr: 0.001
  seed: 42
  val_every_step: 200
  lr_warmup_steps: 10000
  lr_decay_until_steps: ${.num_steps}
  lr_decay_factor: 1e-3
  min_lr: 0.000001
  weight_decay: 0.1
  compile: false
  grad_clip_norm: None
  num_steps: 100000
  device: cuda
  amp_precision: bfloat16
  weight_precision: float32
  enable_mixed_precision: true

model:
  num_blocks: 2
  embedding_dim: 128
  mlstm_block:
    mlstm:
      num_heads: 4
  slstm_block:
    slstm:
      num_heads: 4
      conv1d_kernel_size: 0
  slstm_at: [0, 1]
  name: xlstm01
  context_length: ${dataset.kwargs.context_length}
  vocab_size: ${dataset.kwargs.vocab_size}

dataset:
  name: form_language
  kwargs:
    synth_lang_type: modular_arithmetic
    vocab_size: 10
    seed: 42
    enable_mask: true
    context_length: 256
    min_sequence_length: 3
    max_sequence_length: 40
    count:
      train: 102400000
      validation: 8192
      test: 8192
    subpar:
      validation:
        min_sequence_length: 40
        max_sequence_length: 256
      test:
        min_sequence_length: 40
        max_sequence_length: 256
