# Configuration for Llama 3.2 1B with FMA attention

# Model architecture hyperparameters. Key names appear to mirror Hugging Face's
# LlamaConfig — NOTE(review): confirm against the code that loads this file.
model:
  name: "llama-3.2-1b-fma"
  vocab_size: 128256
  hidden_size: 2048
  intermediate_size: 8192  # 4x hidden_size
  num_hidden_layers: 16
  # 2048 / 32 = 64 dims per head, assuming head_dim is derived as
  # hidden_size / num_attention_heads — TODO confirm.
  num_attention_heads: 32
  num_key_value_heads: 8  # GQA: 32 query heads over 8 KV heads (4 per KV head)
  max_position_embeddings: 131072  # 128 * 1024
  rope_theta: 500000.0
  # Keep the decimal point and signed exponent: YAML 1.1 loaders such as
  # PyYAML would read a bare `1e-5` as a string rather than a float.
  rms_norm_eps: 1.0e-5
  # Both rates are 0.0, which presumably disables dropout entirely.
  attention_dropout: 0.0
  hidden_dropout: 0.0
  hidden_act: "silu"
  tie_word_embeddings: true
  initializer_range: 0.02

# FMA attention configuration: toggle, sizing parameters, and kernel variant.
fma:
  # NOTE(review): presumably `false` falls back to standard attention — confirm.
  use_fma_attention: true
  # NOTE(review): units are not visible here (presumably tokens per block and
  # clusters per attention call) — confirm against the attention code.
  block_size: 128
  num_clusters: 32
  # Allowed values for `attention_type`:
  # "pallas_retrieval", "single_level_attention", "pallas_monopole"
  attention_type: "pallas_retrieval"

# Data configuration: which dataset to load and how it is batched.
data:
  dataset: "pg19"
  # Far below model.max_position_embeddings (131072).
  # NOTE(review): presumably measured in tokens — confirm.
  max_length: 2048
  batch_size: 1  # Sized for a single GPU
  num_workers: 4

# Evaluation configuration: sample count, baseline comparison, and metrics.
eval:
  num_samples: 100
  # NOTE(review): presumably also evaluates a standard-attention baseline for
  # comparison with the FMA variant — confirm.
  compare_standard: true
  # Metric names to report; the strings must match what the evaluation code
  # recognizes (not visible in this file).
  metrics:
    - perplexity
    - tokens_per_second
    - memory_usage

# Inference configuration: generation length and sampling parameters.
# NOTE(review): names match the usual temperature / top-k / nucleus (top-p)
# sampling knobs — confirm how the generation code combines them.
inference:
  max_new_tokens: 50
  temperature: 1.0
  top_k: 50
  top_p: 0.9
