# config.yaml

# Identifier for this configuration
# (presumably used as the run/experiment label — confirm against the loader).
name: "tiny_shakespeare_small"

# Learning rates, one entry per optimizer (keys follow <optimizer>_learning_rate).
optimizer_params:
  iamsadam_learning_rate: 1.0
  adam_learning_rate: 1.0e-3
  # SGD grid search over 0.0001, 0.001, 0.01, 0.05, 0.1, 0.2:
  # best results at 0.01 or 0.05; diverges at 0.1.
  sgd_learning_rate: 0.05
  iams_learning_rate: 1.0

# Training-loop hyperparameters.
training_params:
  batch_size: 8
  num_epochs: 1
  max_length: 512  # presumably max sequence length in tokens — TODO confirm
  # NOTE(review): named "percent" but 0.2 reads like a fraction (20%) — confirm.
  warm_up_percent: 0.2
  # Grid-searched over 1.2, 1.5, 2, 3, 5. Presumably scales the base learning
  # rate at the warm-up peak — confirm against the scheduler.
  warm_up_peak_mult: 3.0
  # Separate warm-up peak multiplier for Adam (judging by the key name).
  adam_warm_up_peak_mult: 1.5

# GPT model settings: a teacher checkpoint name plus explicit architecture sizes.
gpt_model:
  teacher_model: "gpt2-medium"  # teacher checkpoint name, e.g. 'gpt2-medium'
  # NOTE(review): earlier comments attributed all three sizes below to
  # distilgpt2. Only n_embd matches its published config (768 hidden, 6 layers,
  # 12 heads); n_layer and n_head here are smaller — confirm this is intended.
  n_embd: 768  # hidden (embedding) size
  n_layer: 2  # number of transformer layers
  n_head: 4  # number of attention heads
  # presumably GPT-2's 50257-token vocabulary padded to a multiple of 64 — confirm
  vocab_size: 50304

# Dataset selection by name
# (presumably resolved by the data-loading code — confirm).
dataset:
  name: "tiny_shakespeare"