# Global random seed for the run (presumably consumed by the Lightning CLI's
# seed_everything option — confirm). Same value as data.seed further down.
seed_everything: 111

# Model hyperparameters. The key names suggest a small GPT-style transformer;
# the exact meaning of each option is defined by the model class, which is not
# visible in this file.
model:
  vocab_size: 65
  # Same value as data.block_size; presumably the two must stay in sync — verify.
  block_size: 256
  n_layer: 6
  n_head: 6
  # 384 / 6 heads = 64; presumably the per-head width — confirm against the model code.
  n_embd: 384
  dropout: 0.1
  bias: true

  # NOTE(review): configured under `model` rather than `trainer`; presumably the
  # model applies gradient clipping itself — confirm.
  gradient_clip_val: 1.0

  # Per-optimizer settings, keyed by optimizer name (nsymo, adam).
  optimizers:
    # Settings for the "nsymo" optimizer.
    # NOTE(review): the semantics of these options are defined by the optimizer
    # implementation, which is not visible in this file.
    nsymo:
      lr: 0.01
      grads_beta: 0.95
      # Integer 0, unlike the float grads_beta above — TODO confirm the consumer
      # accepts an int here.
      factors_beta: 0
      nesterov: true
      grads_bias_corr: false
      factors_bias_corr: false
      update_correction: false
      update_avg: false
      block_diag: true
      # One single-letter code per named group. The meaning of O / S / I is
      # defined by the optimizer code — TODO document the legend here.
      groups_spec:
        qk_group: O
        v_group: O
        heads_group: S
        embd_group: I
        inout_group: I
      # presumably selects a Muon-style split of parameters between the two
      # optimizers — verify against the model code.
      split: muon
      # Disabled learning-rate scheduler settings, kept for reference.
      #lr_scheduler:
      #  name: exp
      #  gamma: 0.98
      #  interval: step
      #  frequency: 1
      #  monitor: val_loss
    # Settings for the "adam" optimizer.
    # NOTE(review): which parameters this optimizer covers is decided by the
    # model code, which is not visible in this file.
    adam:
      # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
      # resolve a bare `1e-3` as the string "1e-3", not as a float.
      lr: 1.0e-3
      betas: [0.9, 0.99]

# Data-loading settings; the consuming data module is not visible in this file.
data:
  # Same value as seed_everything at the top of the file.
  seed: 111
  data_dir: ./.datasets
  # Same value as model.block_size; presumably the two must stay in sync — verify.
  block_size: 256
  batch_size: 64
  num_workers: 4
  pin_memory: true
  # NOTE(review): if the dataset is preloaded to the GPU, num_workers and
  # pin_memory may be unnecessary or unsupported — confirm against the data module.
  preload_to_gpu: true

# Trainer settings (presumably for pytorch_lightning.Trainer, given the
# pytorch_lightning class paths used for the logger and callbacks below).
trainer:
  # Upper bound on epochs; the EarlyStopping callback below can end training sooner.
  max_epochs: 100
  accelerator: auto
  devices: 1
  # Precision override is disabled; uncomment to request 16-bit mixed precision.
  # precision: 16-mixed
  deterministic: true
  log_every_n_steps: 5
  # Integer value: per the Lightning docs this means "validate every 100
  # training batches" (a float would mean a fraction of an epoch).
  val_check_interval: 100

  # Experiment logger: Weights & Biases via pytorch_lightning's WandbLogger.
  logger:
    class_path: pytorch_lightning.loggers.WandbLogger
    init_args:
      # NOTE(review): "shakespare" (here and in the tags) looks like a
      # misspelling of "shakespeare". It is left unchanged because renaming
      # would send runs to a different W&B project / tag.
      project: baby-gpt-shakespare
      entity: bla-symo
      # Local directory for W&B run files.
      save_dir: logs/wandb/
      log_model: false
      group: experiments
      tags:
        - gpt
        - baby
        - baseline
        - shakespare

  # Trainer callbacks, each given as a class path plus constructor arguments.
  callbacks:
    # Logs the learning rate at every step; momentum logging is turned off.
    - class_path: pytorch_lightning.callbacks.LearningRateMonitor
      init_args:
        logging_interval: step
        log_momentum: false
    # Stops training when val_loss stops decreasing by at least min_delta.
    # Assumes the model logs a metric named "val_loss" — TODO confirm.
    - class_path: pytorch_lightning.callbacks.EarlyStopping
      init_args:
        monitor: val_loss
        # Per the Lightning docs, patience counts validation checks without
        # improvement, not epochs; validation frequency is set by
        # trainer.val_check_interval.
        patience: 10
        mode: min
        min_delta: 0.001
        verbose: true
