# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import partial
from typing import TYPE_CHECKING, Callable, Literal, Optional, Tuple

from .processors.feedback import preprocess_feedback_dataset
from .processors.pairwise import (
    preprocess_pairwise_dataset,
    print_pairwise_dataset_example,
)
from .processors.pretrain import preprocess_pretrain_dataset
from .processors.supervised import (
    preprocess_packed_supervised_dataset,
    preprocess_supervised_dataset,
    print_supervised_dataset_example,
)
from .processors.unsupervised import (
    preprocess_unsupervised_dataset,
    print_unsupervised_dataset_example,
)


if TYPE_CHECKING:
    from transformers import PreTrainedTokenizer, ProcessorMixin

    from ..hparams import DataArguments
    from .template import Template


def get_preprocess_and_print_func(
    data_args: "DataArguments",
    stage: Literal["pt", "sft", "rm", "ppo", "kto"],
    template: "Template",
    tokenizer: "PreTrainedTokenizer",
    processor: Optional["ProcessorMixin"],
    do_generate: bool = False,
) -> Tuple[Callable, Callable]:
    """Select the dataset preprocessing and example-printing callables for a stage.

    Both returned callables are ``functools.partial`` objects with the
    tokenizer (and, where the processor accepts them, the template, processor
    and data arguments) already bound.

    Dispatch:
        * ``"pt"``: pretrain preprocessing, printed with the unsupervised printer.
        * ``"sft"`` with ``do_generate`` false: supervised preprocessing, or the
          packed variant when ``data_args.packing`` is set.
        * ``"rm"``: pairwise preprocessing and printer.
        * ``"kto"``: feedback preprocessing, printed with the supervised printer.
        * anything else (e.g. ``"ppo"``, or ``"sft"`` with ``do_generate`` true):
          unsupervised preprocessing and printer.

    Args:
        data_args: Data arguments; ``packing`` and ``neat_packing`` are read
            here, and the object is forwarded to the preprocessing function.
        stage: Training stage identifier used for dispatch.
        template: Template forwarded to every preprocessing function except
            the pretrain one.
        tokenizer: Tokenizer bound into both returned callables.
        processor: Optional processor forwarded to every preprocessing
            function except the pretrain one.
        do_generate: When true, the ``"sft"`` stage falls through to the
            unsupervised branch.

    Returns:
        A ``(preprocess_func, print_function)`` tuple.

    Side effects:
        When ``stage == "sft"``, ``do_generate`` is false and both
        ``data_args.packing`` and ``data_args.neat_packing`` are truthy,
        ``datasets.arrow_writer.OptimizedTypedSequence.__init__`` is replaced
        on the class itself, so the change is visible to every later user of
        that class in the process.
    """
    if stage == "pt":
        preprocess_func = partial(
            preprocess_pretrain_dataset,
            tokenizer=tokenizer,
            data_args=data_args,
        )
        print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer)
    elif stage == "sft" and not do_generate:
        if data_args.packing:
            if data_args.neat_packing:  # hack datasets to have int32 attention mask
                from datasets.arrow_writer import OptimizedTypedSequence, TypedSequence

                def __init__(self, data, **kwargs):
                    # Forward only the three keyword arguments below to the
                    # base initializer; any other keyword argument is dropped.
                    return TypedSequence.__init__(
                        self,
                        data,
                        type=kwargs.pop("type", None),
                        try_type=kwargs.pop("try_type", None),
                        optimized_int_type=kwargs.pop("optimized_int_type", None),
                    )

                # Class-level monkey patch: affects all instances created afterwards.
                OptimizedTypedSequence.__init__ = __init__

            preprocess_func = partial(
                preprocess_packed_supervised_dataset,
                template=template,
                tokenizer=tokenizer,
                processor=processor,
                data_args=data_args,
            )
        else:
            preprocess_func = partial(
                preprocess_supervised_dataset,
                template=template,
                tokenizer=tokenizer,
                processor=processor,
                data_args=data_args,
            )

        print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer)
    elif stage == "rm":
        preprocess_func = partial(
            preprocess_pairwise_dataset,
            template=template,
            tokenizer=tokenizer,
            processor=processor,
            data_args=data_args,
        )
        print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer)
    elif stage == "kto":
        preprocess_func = partial(
            preprocess_feedback_dataset,
            template=template,
            tokenizer=tokenizer,
            processor=processor,
            data_args=data_args,
        )
        # KTO examples are printed with the supervised example printer.
        print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer)
    else:
        # Fallback for every remaining case, including "ppo" and "sft" with
        # do_generate=True.
        preprocess_func = partial(
            preprocess_unsupervised_dataset,
            template=template,
            tokenizer=tokenizer,
            processor=processor,
            data_args=data_args,
        )
        print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer)

    return preprocess_func, print_function