# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's TRL library.
# https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/ppo.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING, List, Optional

from ...data import (
    MultiModalDataCollatorForSeq2Seq,
    get_dataset,
    get_template_and_fix_tokenizer,
)
from ...extras.ploting import plot_loss
from ...model import load_model, load_tokenizer
from ..callbacks import fix_valuehead_checkpoint
from ..trainer_utils import create_ref_model, create_reward_model, fix_path_if_sagemaker
from .trainer import CustomPPOTrainer


if TYPE_CHECKING:
    from transformers import Seq2SeqTrainingArguments, TrainerCallback

    from ...hparams import (
        DataArguments,
        FinetuningArguments,
        GeneratingArguments,
        ModelArguments,
    )


def run_ppo(
    model_args: "ModelArguments",
    data_args: "DataArguments",
    training_args: "Seq2SeqTrainingArguments",
    finetuning_args: "FinetuningArguments",
    generating_args: "GeneratingArguments",
    callbacks: Optional[List["TrainerCallback"]] = None,
) -> None:
    """Run the PPO workflow: build tokenizer, data, models and trainer, then train.

    The steps, in order:

    1. Load the tokenizer module and derive the chat template from it.
    2. Build the dataset module with ``stage="ppo"``.
    3. Load the policy model with a value head attached.
    4. Switch the tokenizer to left padding and build the data collator.
    5. Create the reference model and the reward model.
    6. Construct ``CustomPPOTrainer`` and, when ``training_args.do_train`` is
       set, train, save the model and trainer state, and optionally plot the
       loss/reward curves.

    Args:
        model_args: Model-related arguments, forwarded to the tokenizer/model
            loaders, the reference/reward model factories and the trainer.
        data_args: Data-related arguments used to pick the template and to
            build the dataset.
        training_args: Training arguments; ``do_train``, ``should_save``,
            ``resume_from_checkpoint``, ``output_dir`` and ``save_safetensors``
            are read here.
        finetuning_args: Fine-tuning arguments; ``plot_loss`` is read here.
        generating_args: Generation arguments forwarded to the trainer.
        callbacks: Optional trainer callbacks forwarded to the trainer.
    """
    tokenizer_module = load_tokenizer(model_args)
    tokenizer = tokenizer_module["tokenizer"]
    template = get_template_and_fix_tokenizer(tokenizer, data_args)
    dataset_module = get_dataset(template, model_args, data_args, training_args, stage="ppo", **tokenizer_module)
    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train, add_valuehead=True)

    # use left-padding in generation while using right-padding in training
    tokenizer.padding_side = "left"
    data_collator = MultiModalDataCollatorForSeq2Seq(template=template, **tokenizer_module)

    # Create reference model and reward model
    ref_model = create_ref_model(model_args, finetuning_args, add_valuehead=True)
    reward_model = create_reward_model(model, model_args, finetuning_args)

    # Initialize our Trainer
    ppo_trainer: "CustomPPOTrainer" = CustomPPOTrainer(
        model_args=model_args,
        training_args=training_args,
        finetuning_args=finetuning_args,
        generating_args=generating_args,
        callbacks=callbacks,
        model=model,
        reward_model=reward_model,
        ref_model=ref_model,
        data_collator=data_collator,
        **dataset_module,
        **tokenizer_module,
    )

    # Nothing else to do when training is disabled.
    if not training_args.do_train:
        return

    # Training
    ppo_trainer.ppo_train(resume_from_checkpoint=training_args.resume_from_checkpoint)

    # NOTE(review): the previous code passed an undefined local `trainer` here,
    # which raised before anything was saved. `ppo_trainer` is the only trainer
    # in scope. This assumes the helper returns the (possibly adjusted) trainer
    # -- confirm against `trainer_utils.fix_path_if_sagemaker`.
    ppo_trainer = fix_path_if_sagemaker(ppo_trainer)

    ppo_trainer.save_model()
    if training_args.should_save:
        fix_valuehead_checkpoint(model, training_args.output_dir, training_args.save_safetensors)

    ppo_trainer.save_state()  # must be called after save_model to have a folder
    if ppo_trainer.is_world_process_zero() and finetuning_args.plot_loss:
        plot_loss(training_args.output_dir, keys=["loss", "reward"])