[
  {
    "label": "policymodel"
  },
  {
    "label": "grpo"
  },
  {
    "label": "policymodel"
  },
  {
    "label": "dpo"
  },
  {
    "label": "trainingloop"
  },
  {
    "label": "rollout"
  },
  {
    "label": "paramupdate"
  },
  {
    "label": "verifier"
  },
  {
    "label": "chosen"
  },
  {
    "label": "rejected"
  },
  {
    "label": "dposample"
  },
  {
    "label": "paramupdate"
  },
  {
    "label": "o"
  },
  {
    "label": "o"
  },
  {
    "label": "o"
  },
  {
    "label": "rewardfunction"
  },
  {
    "label": "r"
  },
  {
    "label": "r"
  },
  {
    "label": "r"
  },
  {
    "label": "groupcomputation"
  },
  {
    "label": "a"
  },
  {
    "label": "a"
  },
  {
    "label": "a"
  },
  {
    "label": "advantage"
  },
  {
    "label": "paramupdate"
  },
  {
    "label": "osamplingoutput"
  },
  {
    "label": "rrewardscore"
  },
  {
    "label": "aadvantagescore"
  },
  {
    "label": "rthinkcontent"
  },
  {
    "label": "aanswercontent"
  },
  {
    "label": "ggroundtruth"
  },
  {
    "label": "othinkrthinkansweraanswer"
  },
  {
    "label": "tokenizer"
  },
  {
    "label": "lengthcheck"
  },
  {
    "label": "llm"
  },
  {
    "label": "consistencycheck"
  },
  {
    "label": "rule"
  },
  {
    "label": "repetitioncheck"
  },
  {
    "label": "compare"
  },
  {
    "label": "accuracycheck"
  },
  {
    "label": "ochosenrejected"
  },
  {
    "label": "oochosenrejected"
  },
  {
    "label": "chosen"
  },
  {
    "label": "rejected"
  },
  {
    "label": "chosen"
  },
  {
    "label": "rejected"
  },
  {
    "label": "chosen"
  },
  {
    "label": "rejected"
  },
  {
    "label": "chosen"
  },
  {
    "label": "rejected"
  }
]