training:
  batch_size: 256
  lr: 0.001
  seed: 42
  val_every_step: 200
  lr_warmup_steps: 2000
  lr_decay_until_steps: ${.num_steps}
  lr_decay_factor: 0.001
  weight_decay: 0.0
  compile: true
  grad_clip_norm: None
  num_steps: 20000
  device: cuda
  amp_precision: bfloat16
  weight_precision: float32
  enable_mixed_precision: true

model:
  num_blocks: 2
  name: simple_recurrent
  add_embedding_dropout: false
  dropout: 0.0
  weight_decay_on_embedding: false
  embedding_dim: 64
  layer_type: diagonal
  activation_func: sigmoid
  step_size: 1.0
  tie_weights: false
  context_length: ${dataset.kwargs.context_length}
  vocab_size: ${dataset.kwargs.vocab_size}

dataset:
  name: form_language
  kwargs:
    synth_lang_type: modular_arithmetic
    vocab_size: 10
    seed: 1
    enable_mask: true
    context_length: 256
    min_sequence_length: 3
    max_sequence_length: 40
    count:
      train: 5120000
      validation: 8192
      test: 8192
    subpar:
      validation:
        min_sequence_length: 40
        max_sequence_length: 256
      test:
        min_sequence_length: 40
        max_sequence_length: 256
