

# @package _global_
# NOTE: keep the package directive above as the first non-blank line of the
# file; Hydra only reads header directives that precede all other content.
defaults:
  - /method: grpo  # baseline method; the sweep below overrides it via `method`
  - /task: dapo  # baseline task; the sweep below overrides it via `task`
  - default  # import default settings for paper experiments
  - _self_  # merge this file's own keys last so they win over the entries above

# Qwen3 8B experiments.
# NOTE(review): `_idx` is presumably an experiment-group index read elsewhere
# in the project — confirm with its consumer.
_idx: 100  # Qwen3 8B experiments


# GPU count for these runs (the consumer of this key is not visible here).
n_gpus: 4

# Model selection. Presumably `llm` identifies the checkpoint and `model_name`
# the model family — TODO confirm how each key is consumed.
llm: Qwen_Qwen3-8B
model_name: qwen3


# Baseline policy-loss mode (TRPL). Sweep entries below that do not set
# `actor_rollout_ref.actor.policy_loss.loss_mode` keep this value.
actor_rollout_ref:
  actor:
    policy_loss:
      loss_mode: trpl

# Multirun sweep definition for the ablation study.
hydra:
  # Always launch as a multirun so the sweeper expands the list below.
  mode: MULTIRUN
  sweeper:
    # NOTE(review): `ablative_params` is read by a sweeper plugin that is not
    # visible in this file; presumably each list item is one run whose keys
    # are applied as overrides on top of the defaults — confirm.
    ablative_params:
      ##########################################################
      # Default is Qwen3 + DAPO (eval, aligned) + GRPO + TRPL. #
      # The ablative parameters change one thing at a time.    #
      ##########################################################

      ### Vanilla comparison.
      # Same method and task as the default, only the loss mode differs.
      - actor_rollout_ref.actor.policy_loss.loss_mode: vanilla

      ### Algorithm: {drgrpo, ppo, gspo}, each with a TRPL and a non-TRPL loss.
      # TRPLs
      # drgrpo and ppo set no loss mode here, so they keep the default `trpl`;
      # gspo is paired with `trpl_seq` instead.
      - method: drgrpo
      - method: ppo
      - method: gspo
        actor_rollout_ref.actor.policy_loss.loss_mode: trpl_seq


      # Vanillas
      # Non-TRPL counterparts; note that gspo uses loss mode `gspo`, not
      # `vanilla`.
      - method: drgrpo
        actor_rollout_ref.actor.policy_loss.loss_mode: vanilla
      - method: ppo
        actor_rollout_ref.actor.policy_loss.loss_mode: vanilla
      - method: gspo
        actor_rollout_ref.actor.policy_loss.loss_mode: gspo

      ### Datasets
      # TRPLs
      # Only the task changes; method and loss mode stay at the defaults.
      - task: gsm8k
      - task: eurus
      # Vanillas
      # Same tasks with the loss mode switched to `vanilla`.
      - task: gsm8k
        actor_rollout_ref.actor.policy_loss.loss_mode: vanilla
      - task: eurus
        actor_rollout_ref.actor.policy_loss.loss_mode: vanilla

