# GRPO + LoRA fine-tuning: SmolLM2-1.7B-Instruct as listener (RLVR future work)
# Requires: pip install transformers peft accelerate

# Task settings for the run described in the file header.
# NOTE(review): per-key meanings are not visible in this file; the notes below
# are inferred from key names only — confirm against the code that reads them.
task:
  nbr_latents: 3
  nbr_distractors: 0
  vocab_size: 6
  max_sentence_length: 3
  # Lower and upper bound for the per-latent value count (2..5).
  min_nbr_values_per_latent: 2
  max_nbr_values_per_latent: 5
  nbr_communication_rounds: 1
  descriptive: true
  nbr_object_centric_samples: 1
  # presumably relevant to the listener role trained below — TODO confirm
  provide_listener_feedback: true
  shots: 1    # → sampling_strategy: component-focused-1shot

# Base model and LoRA adapter hyperparameters.
model:
  # presumably a Hugging Face Hub repo id loaded via transformers — confirm
  model_id: HuggingFaceTB/SmolLM2-1.7B-Instruct
  lora_r: 16
  # 2 x lora_r. NOTE(review): if these feed PEFT's LoraConfig, the adapter
  # scaling is lora_alpha / lora_r = 2 — confirm against the loader code.
  lora_alpha: 32
  lora_dropout: 0.05
  # Names of the modules that receive LoRA adapters. NOTE(review): these look
  # like the attention projection layers — verify against the model's
  # actual module names.
  lora_target_modules: [q_proj, v_proj, k_proj, o_proj]

# GRPO training settings: the model above is trained in the `listener` role
# against a `rule-based` counterpart.
training:
  role: listener
  counterpart: rule-based
  # B groups x G trajectories = 16 trajectories per step.
  B: 4              # groups per step (different stimuli)
  G: 4              # trajectories per group
  num_steps: 2000
  # Keep the decimal point and signed exponent: YAML 1.1 loaders such as
  # PyYAML resolve a bare `1e-4` as a string, not a float.
  lr: 1.0e-4
  weight_decay: 0.0
  max_grad_norm: 1.0
  beta_kl: 0.0      # no KL penalty (pure policy gradient)
  # Generation settings. NOTE(review): presumably applied when sampling the
  # G trajectories per group — confirm.
  temperature: 0.7
  max_new_tokens: 256
  # NOTE(review): intervals are presumably counted in training steps
  # (same unit as num_steps) — confirm against the training loop.
  log_interval: 10
  eval_interval: 100
  save_interval: 500

# Run-level settings.
# NOTE(review): output_dir is a relative path; what it is resolved against is
# not visible here. wandb_project is presumably the Weights & Biases project
# name — confirm against the logging setup.
seed: 0
output_dir: outputs/grpo/smollm_listener
wandb_project: meta-rg-s2b-grpo
