# config.yaml
# Model hyperparameters.
model:
  vocab_size: 30522
  hidden_size: 256             # hidden_size / num_attention_heads = 64
  intermediate_size: 1024      # 4 x hidden_size
  num_hidden_layers: 24        # duplicated as "layers=24" in training.save_dir / training.log_save_dir
  num_attention_heads: 4       # divides hidden_size (256) evenly
  max_position_embeddings: 512
  dropout_prob: 0.1
  # NOTE(review): "wave" is an opaque selector from this file's point of view;
  # the accepted values are defined by the consuming code - confirm there.
  residual_type: "wave"
  # Duplicated as "tau=0.5" in training.save_dir / training.log_save_dir;
  # update those paths together with this value.
  # NOTE(review): presumably a parameter of the selected residual_type - confirm.
  tau: 0.5

# Training-run settings.
training:
  max_seq_length: 128          # below model.max_position_embeddings (512)
  per_device_batch_size: 64
  # 64 x 2 = 128; presumably the number of sequences per device per optimizer
  # step - confirm against the training loop.
  gradient_accumulation_steps: 2
  # NOTE(review): num_epochs and max_steps are both set; which limit actually
  # ends training is decided by the training script - confirm.
  num_epochs: 40
  # 5e-5, deliberately spelled in plain decimal: YAML 1.1 loaders such as
  # PyYAML read a bare `5e-5` as a string, not a float.
  learning_rate: 0.00005
  max_steps: 100000
  # Both paths embed "tau=0.5_layers=24", which duplicates model.tau and
  # model.num_hidden_layers; update them together when those values change.
  save_dir: "./bert_checkpoints/tau=0.5_layers=24"
  log_save_dir: "./logs/tau=0.5_layers=24"

# Pretraining-data settings.
data:
  # File for the full dataset (always written out).
  file_path: "datasets/pretraining/data_100.txt"
  dupe_factor: 3
  short_seq_prob: 0.1
  # 0.15 x training.max_seq_length (128) = 19.2, just under
  # max_predictions_per_seq (20) below.
  masked_lm_prob: 0.15
  max_predictions_per_seq: 20
  nsp_neg_prob: 0.5
  max_documents: 100000
  # NOTE(review): this is an absolute, machine-specific path, unlike the
  # relative file_path above. The .pkl extension suggests a pickle file; if so,
  # it must only ever be loaded from a trusted location - confirm in the loader.
  persistent_cache_path: "/workspace/nas/oversmooth_bert/datasets/pretraining/preprocessed_dataset_100.pkl"

# NOTE(review): presumably seeds the random number generators for
# reproducibility; which generators are seeded is decided by the consuming
# code - confirm.
random_seed: 42

# NOTE(review): presumably switches to a reduced-size run when true; the
# exact effect is defined by the consuming code - confirm.
small_experiment: false
