# Experiment identification: a base output directory plus project, session and
# run names. How these are combined into a path is decided by the consumer and
# is not visible in this file.
experiment:
  experiments_base_dir: 'experiments/'
  project_name: 'anGPT'
  session_name: 'default'
  experiment_name: 'run_anGPT'

# Settings for resuming a previous run.
resume:
  resume_before_annealing: false
  # NOTE(review): `None` is not a YAML null; it loads as the string "None".
  # Confirm the consumer expects that string. If "no directory" is meant,
  # the YAML spelling is `null` (as used for train.max_epochs).
  expt_dir: None

# Training-loop settings, grouped by purpose. Values are unchanged.
train:
  # Reproducibility, run length and gradient handling.
  # max_steps is also read by optim.scheduler.num_training_steps through
  # the ${train.max_steps} interpolation.
  seed: 1
  max_steps: 10000
  max_epochs: null
  grad_accumulation: 1
  clip_value: 1.0

  # Periodic actions (presumably counted in training steps; confirm).
  val_interval: 2000
  log_interval: 100
  log_param_interval: 100

  # Checkpoint and model saving.
  # NOTE(review): save_model_interval holds a boolean while the other
  # *_interval keys hold integers; presumably `false` disables periodic
  # model saving; confirm against the consumer.
  checkpoint_interval: 5000
  checkpoint_final: true
  save_model_interval: false
  save_final_model: true


# Optimizer and learning-rate schedule settings.
optim:
  optimizer: AdamW
  lr: 0.001
  # Written with an explicit mantissa dot (same form as `eps` below): YAML 1.1
  # loaders such as PyYAML read a bare `1e-5` as a string, not a float.
  lr_grad: 1.0e-05
  weight_decay: 0.0
  betas:
    - 0.9
    - 0.95
  eps: 1.0e-09
  regularize_embedding: true
  regularize_head: false
  scheduler:
    # Fixed warmup of 1 step. Disabled alternative that scales with the step
    # budget (needs an `eval` resolver registered by the consumer):
    #   ${eval:0.01 * ${train.max_steps}}
    num_warmup_steps: 1
    # Resolved from train.max_steps via interpolation.
    num_training_steps: ${train.max_steps}
    decay_factor: 0.01
    schedule: cosine


# Mixed-precision setting (presumably forwarded to Hugging Face Accelerate;
# confirm against the consumer).
accelerate:
  mixed_precision: 'bf16'

# Model architecture and normalization settings.
model:
  # NOTE(review): 1 looks like a placeholder; presumably overwritten at
  # runtime (e.g. from the tokenizer); confirm.
  vocab_size: 1
  # Resolved from data_base.sequence_length via interpolation.
  block_size: ${data_base.sequence_length}
  # Active model variant. Alternatives that were listed here as
  # commented-out lines: 'GPT2', 'nGPT'.
  mode: 'aGPT'
  dropout: 0
  n_layer: 24
  n_head: 16
  n_embd: 1024
  bias: false
  use_compile: true

  # Keys prefixed GPT2_ (presumably only read when mode is 'GPT2'; confirm).
  GPT2_norm: 'rms'
  GPT2_DyT_alpha_att: 1.0
  GPT2_DyT_alpha_other: 1.0
  GPT2_ln_scaling: false

  # Projection, normalization and alpha settings.
  scaled_projection: true
  base_scale: 0.031
  alpha_correction: true
  explicit_norm: true
  explicit_norm_bounded: true
  out_norm_dim_0: false
  learn_alpha: true
  alpha_init_value: 0.05
  post_norm: true
  qk_norm: true

  # Keys prefixed aGPT_ (presumably only read when mode is 'aGPT'; confirm).
  aGPT_alpha_scale: 0.01
  aGPT_init_normalize: true
  aGPT_logits_scale: true
  aGPT_pre_head_scale: false

# Shared data-loading settings.
data_base:
  batch_size: 8
  # Also drives model.block_size through the ${data_base.sequence_length}
  # interpolation.
  sequence_length: 1024
  min_sequence_length: 10
  # Same value as train_mmap_dl.seed and valid_mmap_dl.seed.
  seed: 1234
  # NOTE(review): -100 equals the default ignore_index of PyTorch's
  # CrossEntropyLoss; presumably used to mask label positions (confirm).
  ignore_index: -100

# Tokenizer selection: a tokenizer name, the directory it is stored in, and
# the add_eos flag.
tokenizer:
  name: 'neox'
  tokenizer_dir: 'data/tokenizer'
  add_eos: true

# Training data loader settings.
# Uses the same path_prefix as valid_mmap_dl. ignore_samples here equals
# valid_mmap_dl.limit_samples (10000); presumably the first 10000 samples
# are held out for validation (confirm against the loader).
train_mmap_dl:
  path_prefix: 'data/openwebtext'
  ignore_samples: 10000
  shuffle: true
  infinite: true
  seed: 1234

# Validation data loader settings.
# Uses the same path_prefix as train_mmap_dl. limit_samples here equals
# train_mmap_dl.ignore_samples (10000); presumably validation reads the
# samples that training skips (confirm against the loader).
valid_mmap_dl:
  path_prefix: 'data/openwebtext'
  limit_samples: 10000
  shuffle: false
  infinite: false
  seed: 1234