
# Resume switch. Presumably tells the trainer to continue from an existing
# checkpoint instead of starting a fresh run - TODO confirm against the
# training script.
resume_training: false

# Naming and location of this run's outputs.
# NOTE(review): the three names are presumably combined into a directory
# hierarchy below experiments_base_dir - confirm against the code that
# builds the output path.
experiment:
  # /tmp is usually not persistent across reboots; override this for runs
  # whose outputs must be kept.
  experiments_base_dir: '/tmp/'
  project_name: 'GPTworkbench'
  session_name: 'cpr_training'
  experiment_name: 'test_model'



# Training-loop settings.
# NOTE(review): the *_interval values are presumably counted in optimizer
# steps (same unit as max_steps) - confirm against the trainer.
train:
  # Also reused as data.seed through interpolation.
  seed: 1
  # Presumably the gradient-clipping threshold - TODO confirm.
  clip_value: 1.0
  val_interval: 2500
  log_interval: 100
  log_param_interval: 100
  # Same cadence as val_interval.
  save_interval: 2500
  # Also feeds optim.scheduler.num_training_steps through interpolation.
  max_steps: 200000
  # Left unset; presumably means no epoch cap, so max_steps is the only
  # bound configured here - confirm.
  max_epochs: null


# Optimizer, learning-rate schedule and CPR settings.
optim:
  optimizer: 'AdamCPR'
  lr: 0.002
  weight_decay: 0.1
  betas: [0.9, 0.98]
  # Written with a decimal point and signed exponent so that YAML 1.1
  # loaders also read it as a float.
  eps: 1.0e-09
  # NOTE(review): presumably control whether normalization / bias parameters
  # are regularized - confirm against the optimizer setup code.
  normalization_regularization: false
  bias_regularization: false
  scheduler:
    num_warmup_steps: 5000
    # Kept in sync with train.max_steps through interpolation.
    num_training_steps: ${train.max_steps}
    decay_factor: 0.1
    schedule: 'cosine'  # Accepted values: 'cosine' or 'linear'.
  cpr_config:
    # NOTE(review): with 'warm_start' this is presumably a step count (it
    # equals scheduler.num_warmup_steps here) - confirm against the CPR
    # implementation.
    kappa_init_param: 5000
    kappa_init_method: 'warm_start'
    reg_function: 'l2'
    kappa_update: 1.0


# Mixed-precision mode. The section name suggests these options are handed
# to Hugging Face Accelerate - TODO confirm.
# The value is quoted so that switching to a value such as 'no' stays a
# string instead of turning into a YAML boolean.
accelerate:
  mixed_precision: 'bf16'


# Transformer architecture. Sizes follow GPT-2 small (12 layers, 12 heads,
# width 768); settings that differ from stock GPT-2 are marked below.
# Comments starting with "presumably" describe intent inferred from the key
# name and should be confirmed against the model code.
model:
  # Placeholder: 1 is not a usable vocabulary size. Presumably overwritten
  # at runtime (e.g. from the tokenizer) - TODO confirm.
  vocab_size: 1
  model_dim: 768  # Hidden width; 768 in GPT-2 small.
  max_len: ${data.max_sample_len}  # Follows data.max_sample_len via interpolation.
  pos_embedding: false  # Off here; stock GPT-2 uses learned absolute position embeddings.
  rel_pos_enc: true  # On here; stock GPT-2 has no relative positional encoding.
  initializer_range: 0.02  # Presumably the std of the weight init; GPT-2 uses 0.02.
  # 50257 is the size of the GPT-2 BPE vocabulary.
  # NOTE(review): how this relates to vocab_size is not visible in this file.
  seq_vocab_size: 50257
  n_layers: 12  # Number of transformer layers; 12 in GPT-2 small.
  rms_norm: false  # Presumably selects LayerNorm (as in GPT-2) rather than RMSNorm.
  # Epsilon for the normalization layers. Written as 1.0e-5 because a bare
  # "1e-5" is read as a string by YAML 1.1 loaders such as PyYAML.
  ln_eps: 1.0e-5
  add_unit_offset: false  # Presumably only relevant when rms_norm is enabled - confirm.
  use_bias: true  # Presumably enables bias terms in the layers; GPT-2 uses biases.
  learn_ln: true  # Presumably makes the normalization parameters trainable.
  last_zero: false  # Presumably zero-initializes the last layer when true; off here.
  ff_factor: 4  # Feed-forward inner width as a multiple of model_dim; 4 in GPT-2.
  resi_dropout: 0.1  # Residual dropout rate; GPT-2 uses 0.1.
  num_head: 12  # Number of attention heads; 12 in GPT-2 small.
  softmax_scale: true  # Presumably scales attention logits before softmax - confirm.
  attn_dropout: 0.1  # Attention dropout rate; GPT-2 uses 0.1.
  activation: silu  # SiLU here; stock GPT-2 uses GELU.
  glu: false  # No gated linear unit in the feed-forward block; matches GPT-2.
  tie_embed: true  # Presumably shares input-embedding and output weights, as GPT-2 does.





# Dataset and data-loading settings.
data:
  dataset_name: 'openwebtext'  # Other names listed by the author: wikitext, pile.
  num_cpu_worker: 0
  num_gpu_worker: 1
  max_sample_len: 1024  # Also sets model.max_len through interpolation.
  seed: ${train.seed}  # Reuses the training seed.
  batch_size: 32
  val_ratio: 0.0005  # Presumably the fraction of data held out for validation - confirm.
  val_split_seed: 2357
  data_dir: '/tmp/lm_data'
  cache_dir: '/tmp/cache'
  evaluation_sets: false
