# Llama-3.1-8B-Instruct Q4_K_M via llama-cpp-python — ROCm/HIP GPU acceleration
# Install: see meta_rg/backends/llamacpp_backend.py header for ROCm/HIP build instructions
# Model downloaded automatically to ~/.cache/huggingface/hub/ on first run.
# VRAM: ~5 GB for the Q4_K_M weights, plus the KV cache (which scales with n_ctx).
# KV cache: Q8_0 quantisation (type_k/type_v = 8) roughly halves KV memory vs the FP16 default.
# For full TurboQuant (TURBO3_0) KV cache, install the fork (see backend docstring)
# and set type_k/type_v to the TURBO3_0 enum value from:
#   python -c "import llama_cpp; print([(e.name,e.value) for e in llama_cpp.GGML_TYPE])"
# Label for this configuration (model + inference backend).
# NOTE(review): presumably used to identify runs/results — confirm against the config loader.
name: Llama-3.1-8B-llamacpp

# Task parameters. The meaning of each key is defined by the code that consumes
# this file (not visible here); the notes below are inferred from the key names
# only — TODO confirm against the task implementation.
task:
  # Presumably the number of latent attributes and of distractor items.
  nbr_latents: 3
  nbr_distractors: 0
  # Presumably limits on the messages exchanged: symbol vocabulary size and
  # maximum message length.
  vocab_size: 6
  max_sentence_length: 3
  # Lower/upper bound on the number of values a latent may take (2..5 here).
  min_nbr_values_per_latent: 2
  max_nbr_values_per_latent: 5
  nbr_communication_rounds: 1
  # Boolean switches; exact effect is determined by the consumer — verify there.
  descriptive: true
  provide_listener_feedback: true

# Inference backend settings: selects the llama.cpp backend and the GGUF model
# file to load, plus context, sampling, and KV-cache options.
backend:
  backend_type: llamacpp
  # Hugging Face repository and GGUF file to load (Q4_K_M build, per the filename).
  hf_repo_id: bartowski/Meta-Llama-3.1-8B-Instruct-GGUF
  hf_filename: Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
  # NOTE(review): a 131072-token window is very large next to max_tokens: 64.
  # The KV cache is normally sized from n_ctx, so at this setting it likely
  # needs more VRAM than the weights even with Q8_0 — confirm the full window
  # is required; otherwise lower it.
  n_ctx: 131072
  n_gpu_layers: -1      # -1 = offload all layers to GPU
  # Sampling temperature and cap on generated tokens per response.
  # Assumed to be passed through to llama-cpp-python — confirm in the backend.
  temperature: 0.7
  max_tokens: 64
  # NOTE(review): llama.cpp normally requires flash attention for a quantised
  # V cache, so keep this enabled while type_v is not FP16 — confirm for the
  # installed build.
  flash_attn: true
  verbose: false
  type_k: 8             # Q8_0 KV cache — ~50% KV memory vs FP16 default
  type_v: 8             # upgrade to TurboQuant fork for TURBO3_0 (~75% reduction)
