# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from itertools import chain
from typing import TYPE_CHECKING, Any, Dict, List


if TYPE_CHECKING:
    from transformers import PreTrainedTokenizer

    from ...hparams import DataArguments


def preprocess_pretrain_dataset(
    examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
) -> Dict[str, List[Any]]:
    """Tokenize a batch of raw pretraining texts, optionally packing them.

    The text of each example is the ``"content"`` of the first message in
    ``examples["_prompt"]``, followed by an EOS token (``"<|end_of_text|>"``
    for the ``llama3`` template, ``tokenizer.eos_token`` otherwise).

    Args:
        examples: Batch containing a ``"_prompt"`` column; each entry is a
            sequence of messages and only the first message is read.
        tokenizer: Callable tokenizer exposing ``eos_token``, ``bos_token``
            and ``bos_token_id``.
        data_args: Provides ``template``, ``packing`` and ``cutoff_len``.

    Returns:
        Without packing, the tokenizer output truncated to ``cutoff_len``.
        With packing, every tokenizer field is concatenated over the batch
        and split into blocks of exactly ``cutoff_len`` items; the trailing
        remainder that does not fill a block is dropped.
    """
    # build grouped texts with format `X1 X2 X3 ...` if packing is enabled
    eos_token = "<|end_of_text|>" if data_args.template == "llama3" else tokenizer.eos_token
    text_examples = [messages[0]["content"] + eos_token for messages in examples["_prompt"]]

    if not data_args.packing:
        if data_args.template == "gemma":
            # Special tokens are not added by the tokenizer call below, so the
            # BOS token is prepended as text for the gemma template.
            text_examples = [tokenizer.bos_token + example for example in text_examples]

        return tokenizer(text_examples, add_special_tokens=False, truncation=True, max_length=data_args.cutoff_len)

    tokenized_examples = tokenizer(text_examples, add_special_tokens=False)
    concatenated_examples = {k: list(chain.from_iterable(tokenized_examples[k])) for k in tokenized_examples.keys()}
    block_size = data_args.cutoff_len
    # Round down to a whole number of blocks; the leftover tail is discarded.
    total_length = len(next(iter(concatenated_examples.values())))
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    if data_args.template == "gemma":
        # Overwrite (not insert) the first token so each block keeps its length.
        for block in result["input_ids"]:
            block[0] = tokenizer.bos_token_id

    return result