# Llama-3.2-3B-Instruct Q4_K_M via llama-cpp-python — ROCm/HIP GPU acceleration
# Install: see meta_rg/backends/llamacpp_backend.py header for ROCm/HIP build instructions
# Model downloaded automatically to ~/.cache/huggingface/hub/ on first run.
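# VRAM: ~2 GB for the Q4_K_M weights alone; the KV cache is allocated on top of
# that and grows with backend.n_ctx (at n_ctx: 131072 with an unquantized cache
# it is expected to be several times larger than the weights).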
# Label for this configuration: model family/size plus the backend type
# (matches backend.backend_type below).
name: Llama-3.2-3B-llamacpp

# Task parameters.
# NOTE(review): the code that consumes these keys is not visible from this
# file; the notes below are inferred from key names only — confirm against the
# task implementation before relying on them.
task:
  nbr_latents: 3
  nbr_distractors: 0
  # Presumably the message space: up to max_sentence_length symbols drawn from
  # a vocabulary of vocab_size tokens — TODO confirm.
  vocab_size: 6
  max_sentence_length: 3
  # Lower/upper bound on the number of values each latent can take
  # (inclusivity of the bounds not shown here — TODO confirm).
  min_nbr_values_per_latent: 2
  max_nbr_values_per_latent: 5
  nbr_communication_rounds: 1
  descriptive: true
  provide_listener_feedback: true

# Inference backend settings for the llama-cpp-python backend
# (build/install notes: meta_rg/backends/llamacpp_backend.py header).
backend:
  backend_type: llamacpp
  # GGUF weights: Hugging Face repo and the file within it. Downloaded
  # automatically to ~/.cache/huggingface/hub/ on first run.
  hf_repo_id: bartowski/Llama-3.2-3B-Instruct-GGUF
  hf_filename: Llama-3.2-3B-Instruct-Q4_K_M.gguf
  # Context window, in tokens.
  # NOTE(review): 131072 is very large next to max_tokens: 64. llama.cpp sizes
  # the KV cache from n_ctx, so this value likely dominates VRAM use — confirm
  # the prompts actually need it, or lower it / enable a quantized KV cache.
  n_ctx: 131072
  n_gpu_layers: -1      # -1 = offload all layers to GPU
  # Sampling temperature and, presumably, the per-completion cap on generated
  # tokens — verify against the backend's generate call.
  temperature: 0.7
  max_tokens: 64
  flash_attn: true
  verbose: false
  # TurboQuant KV cache (requires TurboQuant fork — see backend docstring).
  # The integers are cache type ids whose meaning is defined by that fork;
  # leave commented out when using a stock llama-cpp-python build.
  # type_k: 14
  # type_v: 14
