# Hydra defaults list: picks one option from each config group. Each chosen
# option name also carries a YAML anchor (&method, &optimization, ...); no
# alias to those anchors appears in the visible part of this file.
defaults:
  - method: &method linear
  - optimization: &optimization basic
  - data: &data refinedweb
  - model: &model gpt2
  # `_self_` comes last, so keys defined in this file are merged after the
  # group configs selected above and take precedence over them.
  - _self_

# Override settings from the selected `optimization` config here.
optimization:
  # NOTE(review): presumably the total training-token budget (2.2e9) — confirm
  # against the training loop.
  max_tokens: 2200000000
  global_batch_size: 512
  gradient_checkpointing: false
  gradient_clipping: 1.0
  log_interval: 20
  load_save_mode: step
  load_checkpoint: true
  save_checkpoint: true
  optimizer:
    kwargs:
      # `&lr` anchors this value; `eta_min` below reads it through an
      # interpolation rather than through a YAML alias.
      lr: &lr 6.0e-4
      weight_decay: 0.1
      betas: [0.9, 0.999]
  lr_scheduler:
    kwargs:
      # NOTE(review): fractional value — presumably a fraction of the run
      # rather than an iteration count; confirm against the scheduler.
      warmup_iter: 0.1
      # Intended to resolve to 0.1 x the optimizer lr above (6.0e-5 with
      # lr = 6.0e-4). `eval` is not a built-in OmegaConf/Hydra resolver, so
      # the application has to register it before this key is resolved.
      eta_min: ${eval:0.1 * ${optimization.optimizer.kwargs.lr}}

# Data settings layered on top of the selected `data` config group. Every
# value here is an interpolation into `model.kwargs`, which is not defined in
# this file — presumably it comes from the selected `model` group; confirm.
data:
  tokenizer:
    name: ${model.kwargs.model_type}
    model_max_length: ${model.kwargs.max_position_embeddings}
  # Same source value as tokenizer.model_max_length above.
  block_size: ${model.kwargs.max_position_embeddings}

# Base directory for experiment output: an absolute, machine-specific path
# with a trailing slash. `wandb.dir` below is set to this same path.
base_dir: &base_dir /home/anonymous/transformers/clean_version/exp/

# Weights & Biases settings.
# NOTE(review): presumably forwarded to the W&B run setup by the training
# code — confirm there.
wandb:
  entity: anonymous
  project: gpt2reproduce
  mode: online
  anonymous: allow
  # Use an interpolation instead of a YAML alias: an alias is expanded when
  # the file is parsed, so a Hydra override of `base_dir` (e.g. on the command
  # line) would not reach this key. `${base_dir}` is resolved on access and
  # therefore follows the override; the default value is unchanged.
  dir: ${base_dir}

# NOTE(review): presumably the switch that turns Weights & Biases logging on
# or off — confirm in the training script.
wandb_use: true
hydra:
  run:
    # Per-run output directory: <base_dir>hydra/<date>/<time>, where `now` is
    # Hydra's timestamp resolver. The path is built from `${base_dir}` (which
    # already ends in `/`) instead of repeating the absolute path, so it stays
    # in sync with `base_dir` and follows any override of it.
    dir: ${base_dir}hydra/${now:%Y-%m-%d}/${now:%H-%M-%S}
# NOTE(review): presumably toggles a learning-rate sweep — confirm against
# the code that reads this flag.
sweep_lr: false