# config.yaml
# Model architecture hyperparameters.
model:
  vocab_size: 30522
  hidden_size: 256
  # 1024 = 4 x hidden_size.
  intermediate_size: 1024
  # The same layer count (24) is embedded in training.save_dir and
  # training.log_save_dir.
  num_hidden_layers: 24
  # hidden_size / num_attention_heads = 64.
  num_attention_heads: 4
  # Larger than training.max_seq_length (128).
  max_position_embeddings: 512
  dropout_prob: 0.1
  # Accepted values are defined by the consuming code, not visible here.
  residual_type: 'diffuse'
  # The same tau (1.0) is embedded in training.save_dir and
  # training.log_save_dir.
  tau: 1.0

# Optimisation schedule and output locations.
training:
  max_seq_length: 128
  per_device_batch_size: 64
  # 64 x 2 = 128 sequences per device per update, assuming the trainer
  # accumulates gradients in the usual way -- TODO confirm.
  gradient_accumulation_steps: 2
  # NOTE(review): num_epochs and max_steps are both set; which limit ends
  # training is decided by the training script -- confirm.
  num_epochs: 40
  # Kept in plain decimal: exponent forms without a dot (e.g. 5e-5) load as
  # strings under YAML 1.1 parsers such as PyYAML.
  learning_rate: 0.00005
  max_steps: 100000
  # Directory names carry model.tau (1.0) and model.num_hidden_layers (24).
  save_dir: './bert_checkpoints/tau=1.0_layers=24'
  log_save_dir: './logs/tau=1.0_layers=24'

# Pretraining corpus location and example-generation settings.
# The option names below (dupe_factor, short_seq_prob, masked_lm_prob,
# max_predictions_per_seq) match the flags of BERT's
# create_pretraining_data.py; presumably they carry the same meaning here --
# confirm against the data loader.
data:
  # Single path to the raw corpus. The earlier value was three path fragments
  # pasted into one string (it contained a space and a fused
  # ".pkldatasets" segment), which cannot name a real file.
  # NOTE(review): chosen to sit beside persistent_cache_path; confirm that no
  # "./nas/oversmooth_bert/" prefix is required in your environment.
  file_path: './datasets/pretraining/data_100.txt'
  dupe_factor: 3
  short_seq_prob: 0.1
  masked_lm_prob: 0.15
  # training.max_seq_length (128) x masked_lm_prob (0.15) = 19.2, rounded up.
  max_predictions_per_seq: 20
  nsp_neg_prob: 0.5
  max_documents: 100000
  # A ".pkl" file; presumably a pickled cache of the preprocessed dataset --
  # verify against the loader.
  persistent_cache_path: './datasets/pretraining/preprocessed_dataset_100.pkl'


# Seed value handed to the consuming code; which random number generators it
# seeds is not visible from this file.
random_seed: 42

# Boolean switch, currently off. Presumably selects a reduced-size run; its
# exact effect is defined by the consuming script -- confirm.
small_experiment: false
