# WikiText-103 Language Modeling Configuration

# Model parameters
# NOTE(review): sequence_length here and max_length under "Data parameters"
# are both 512 — presumably they must be kept in sync; confirm with consumer.
sequence_length: 512
# 768 / 12 = 64, assuming hidden_dim is split evenly across num_heads
# — TODO confirm against the model code.
hidden_dim: 768
num_heads: 12
num_layers: 12
# NOTE(review): alpha / beta / gamma are not described anywhere in this file;
# document what each coefficient controls once confirmed against the model.
alpha: 1.0
beta: 2.0
gamma: 1.5
dropout: 0.1
# NOTE(review): unclear which feature "enhanced" toggles — confirm and document.
use_enhanced: true

# Training parameters
batch_size: 32
# Keep the explicit decimal point in the mantissa: YAML 1.1 loaders (PyYAML
# included) resolve a dot-less exponent form such as 1e-4 as a *string*,
# not a float. 1.0e-4 is a float under both YAML 1.1 and YAML 1.2.
learning_rate: 1.0e-4
weight_decay: 0.01
num_epochs: 10
warmup_steps: 1000
max_grad_norm: 1.0

# Data parameters
# String values are quoted explicitly so their type never depends on the
# loader's implicit scalar resolution.
dataset_path: 'data/wikitext103'
tokenizer_name: 'gpt2'
# Integer value; currently equal to sequence_length in the model section.
max_length: 512

# Reproducibility
# Fixed integer seed; which RNGs it is applied to depends on the loading code.
seed: 42
# NOTE(review): presumably requests deterministic execution in the training
# code — confirm exactly what this flag enables.
deterministic: true

