# @package _global_
# Hydra defaults list: entries are composed in the order given, so later
# entries override values set by earlier ones.
defaults:
  - /method: grpo  # base method; the policy loss mode is swept below (hydra.sweeper.grid_params)
  - /task: gsm8k  # default task; `task` is also swept below (gsm8k, dapo)
  - default
  # `_self_` comes last so that the values set in this file take precedence
  # over everything composed above.
  - _self_


# NOTE(review): presumably an identifier for this experiment config; how `_idx`
# is consumed is not visible in this file — confirm.
_idx: 111  # Medium models (2-4B parameters)



hydra:
  # Launch this config as a sweep (multi-run) instead of a single run.
  mode: MULTIRUN
  sweeper:
    # Big grid over the medium models: loss mode (trpl, vanilla) x task (gsm8k, dapo).
    # Note that we do not report all combinations in the paper, since most non-Qwen models
    # achieve close to 0% performance on dapo.
    # NOTE(review): `grid_params` / `list_params` are interpreted by the configured sweeper,
    # which is not visible in this file — confirm how the two sections are combined.
    grid_params:
      # Each value is a comma-separated list of options for that key.
      actor_rollout_ref.actor.policy_loss.loss_mode: trpl, vanilla
      task: gsm8k, dapo
    # Parallel lists: entry i of `model_name` lines up with entry i of `llm` (six entries
    # each). Presumably they are swept together pairwise rather than crossed — confirm.
    # Keep both lists the same length and in the same order when adding a model.
    list_params:
      model_name:
        - qwen3_4b
        - qwen2.5_3b
        - llama_3b
        - llama_3b_no_instr
        - smol_lm3_3b
        - fine_math_llama_3b
      llm:
        - Qwen_Qwen3-4B
        - Qwen_Qwen2.5-3B-Instruct
        - meta-llama_Llama-3.2-3B-Instruct
        - meta-llama_Llama-3.2-3B-ModTok
        - HuggingFaceTB_SmolLM3-3B
        - HuggingFaceTB_FineMath-Llama-3B

