# Identifier of this DAG workflow definition.
dag_id: "ppo_training_pipeline"
description: "This is a PPO DAG workflow configured via YAML."

# Anchored mapping of model/rollout settings.
# NOTE(review): no `*actor1_config` alias appears in the visible part of this
# file, so nothing here references the anchor - confirm whether the loader
# reads the `actor1_config` key directly or whether this block is leftover.
# The keys are flat dotted strings (YAML itself does not nest on "."); any
# expansion into a nested config would have to be done by the consumer -
# TODO confirm.
actor1_config: &actor1_config
  model.path: Qwen/Qwen2.5-0.5B
  rollout.log_prob_micro_batch_size_per_gpu: 16
  rollout.tensor_model_parallel_size: 2
  rollout.gpu_memory_utilization: 0.7
  rollout.n: 16

# List of DAG nodes. Each entry carries a node_id, a node_type, a node_role,
# and the node_ids it depends on.
nodes:
  # Entry node: its dependency list is empty, so no other node has to run
  # before it.
  - node_id: "rollout_actor"
    node_type: "MODEL_INFERENCE"
    node_role: "ROLLOUT"
    dependencies: []

  # COMPUTE node with the REWARD role; depends on rollout_actor.
  - node_id: "function_reward"
    node_type: "COMPUTE"
    node_role: "REWARD"
    dependencies:
      - "rollout_actor"

  # ACTOR-role node flagged only_forward_compute; depends on function_reward.
  # Presumably the flag requests a forward pass without a parameter update
  # (the id suggests log-probs of the current actor) - confirm against the
  # DAG loader.
  - node_id: "actor_old_log_prob"
    node_type: "MODEL_TRAIN"
    node_role: "ACTOR"
    only_forward_compute: true
    dependencies:
      - "function_reward"

  # REFERENCE-role node; depends on actor_old_log_prob.
  # NOTE(review): typed MODEL_TRAIN but, unlike actor_old_log_prob and
  # critic_value, it does not set only_forward_compute - confirm that the
  # loader treats the REFERENCE role as forward-only on its own.
  - node_id: "reference_log_prob"
    node_type: "MODEL_TRAIN"
    node_role: "REFERENCE"
    dependencies:
      - "actor_old_log_prob"

  # CRITIC-role node flagged only_forward_compute; depends on
  # reference_log_prob. It shares node_role CRITIC with critic_train and
  # differs from it by this flag.
  - node_id: "critic_value"
    node_type: "MODEL_TRAIN"
    node_role: "CRITIC"
    only_forward_compute: true
    dependencies:
      - "reference_log_prob"

  # COMPUTE node with the ADVANTAGE role; depends on critic_value.
  - node_id: "calculate_advantages"
    node_type: "COMPUTE"
    node_role: "ADVANTAGE"
    dependencies:
      - "critic_value"

  # CRITIC-role MODEL_TRAIN node without the only_forward_compute flag;
  # depends on calculate_advantages.
  - node_id: "critic_train"
    node_type: "MODEL_TRAIN"
    node_role: "CRITIC"
    dependencies:
      - "calculate_advantages"

  # ACTOR-role MODEL_TRAIN node without the only_forward_compute flag.
  # It depends on critic_train, so it is ordered after the critic's training
  # node; no node visible in this file depends on it.
  - node_id: "actor_train"
    node_type: "MODEL_TRAIN"
    node_role: "ACTOR"
    dependencies:
      - "critic_train"


