# Configuration for Llama 3.2 1B with standard attention: the non-FMA baseline
# used for comparison against runs that enable FMA attention.

# Model architecture hyperparameters.
# The code that consumes these keys is not part of this file; hedged notes
# below mark interpretations that should be confirmed against that code.
model:
  # Identifier for this configuration; the "-standard" suffix matches the
  # standard-attention setting in the `fma` section of this file.
  name: "llama-3.2-1b-standard"
  vocab_size: 128256
  # NOTE(review): 2048 / 32 attention heads = 64 per head, if the head size is
  # derived from hidden_size -- confirm against the model code.
  hidden_size: 2048
  # 4 x hidden_size (8192 = 4 * 2048).
  intermediate_size: 8192
  num_hidden_layers: 16
  num_attention_heads: 32
  # Grouped-query attention: 32 attention heads over 8 key/value heads,
  # i.e. 4 attention heads per key/value head.
  num_key_value_heads: 8  # GQA
  # 131072 = 128 * 1024. Far larger than data.max_length (2048) in this file.
  max_position_embeddings: 131072
  # Presumably the base used for rotary position embeddings -- TODO confirm.
  rope_theta: 500000.0
  # Keep the `1.0e-5` spelling: YAML 1.1 loaders such as PyYAML only resolve
  # exponent notation to a float when it has a decimal point and a signed
  # exponent; `1e-5` would be loaded as a string by those loaders.
  rms_norm_eps: 1.0e-5
  # Both dropout rates are 0.0 (presumably meaning dropout is disabled).
  attention_dropout: 0.0
  hidden_dropout: 0.0
  hidden_act: "silu"
  # Presumably shares the input embedding matrix with the output projection --
  # verify against the model code.
  tie_word_embeddings: true
  initializer_range: 0.02

# Standard attention (no FMA).
# The only key in this section is the switch below; it is set to false, which
# is what makes this file the standard-attention configuration.
fma:
  use_fma_attention: false

# Data configuration: which dataset to read and how to batch it.
data:
  # Dataset identifier; how it is resolved (local path, hub name, ...) is up to
  # the data-loading code, which is not shown here.
  dataset: "pg19"
  # Presumably the maximum sequence length in tokens -- TODO confirm the unit.
  # Well below model.max_position_embeddings (131072).
  max_length: 2048
  batch_size: 1
  num_workers: 4

# Evaluation configuration: how much to evaluate and what to report.
eval:
  # Presumably the number of dataset samples to evaluate -- confirm against
  # the evaluation code.
  num_samples: 100
  # Names of the metrics to report. These are plain string identifiers; their
  # meaning is defined by the evaluation code, which is not part of this file.
  metrics:
    - perplexity
    - tokens_per_second
    - memory_usage

# Inference configuration: generation length and sampling parameters.
inference:
  max_new_tokens: 50
  # Sampling controls. NOTE(review): this section has no explicit
  # sampling on/off switch -- confirm how the generation code decides between
  # sampling and greedy decoding, and how top_k and top_p are combined.
  temperature: 1.0
  top_k: 50
  top_p: 0.9
