# ArXiv Long Document Configuration

# Model parameters
# Architecture sizes.
num_layers: 12
num_heads: 12
# 768 / 12 heads = 64 per head, assuming heads split hidden_dim evenly
# -- TODO confirm against the model code.
hidden_dim: 768
# Same value as max_length under "Data parameters"; presumably the two are
# meant to stay in sync -- verify against the data loader.
sequence_length: 2048
dropout: 0.1
# Scalar coefficients; their meaning is defined by the consuming model code
# and cannot be determined from this file.
alpha: 1.0
beta: 2.0
gamma: 1.5
# NOTE(review): presumably toggles an "enhanced" model variant -- confirm.
use_enhanced: true

# Training parameters
# Optimisation hyperparameters read by the training code.
batch_size: 8
# Keep the decimal point in the mantissa: YAML 1.1 loaders (e.g. PyYAML)
# resolve a bare `1e-4` as the string "1e-4" instead of a float, whereas
# `1.0e-4` is a float under both YAML 1.1 and YAML 1.2.
learning_rate: 1.0e-4
weight_decay: 0.01
num_epochs: 10
# NOTE(review): presumably the number of learning-rate warmup steps -- confirm.
warmup_steps: 2000
# NOTE(review): presumably the gradient-norm clipping threshold -- confirm.
max_grad_norm: 1.0

# Data parameters
# Tokenizer identifier; how it is resolved is up to the consuming code.
tokenizer_name: gpt2
# Same value as sequence_length under "Model parameters" (both 2048);
# presumably they must agree -- verify against the data loader.
max_length: 2048
# Relative path; the base directory it resolves against is not set here.
dataset_path: data/arxiv
# Reproducibility
# NOTE(review): presumably requests deterministic execution in the training
# code -- confirm what it actually switches on.
deterministic: true
# Integer seed; which random number generators consume it is not shown here.
seed: 42

