# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's Transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava/modeling_llava.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING, List, Sequence, Set, Tuple, Union

import torch
import transformers
import transformers.models
from transformers.activations import ACT2FN

from ...extras import logging


if TYPE_CHECKING:
    from transformers import (
        LlavaConfig,
        PretrainedConfig,
        PreTrainedModel,
        ProcessorMixin,
    )

    from ...hparams import FinetuningArguments, ModelArguments


logger = logging.get_logger(__name__)
transformers_logger = transformers.utils.logging.get_logger(__name__)


class LlavaMultiModalProjectorForYiVL(torch.nn.Module):
    r"""
    Multimodal projector used in place of the stock LLaVA projector for Yi-VL models.

    The forward pass applies, in order: linear_1 (Linear), linear_2 (LayerNorm),
    the configured activation, linear_3 (Linear) and linear_4 (LayerNorm).
    """

    def __init__(self, config: "LlavaConfig") -> None:
        r"""
        Builds the projector layers from the vision/text hidden sizes in `config`.

        Args:
            config: LLaVA config providing `vision_config.hidden_size`,
                `text_config.hidden_size` and `projector_hidden_act`. When it is None,
                no layers are created and the subclass is expected to create them.
        """
        super().__init__()

        self.config = config
        if config is None:
            # Subclasses that receive explicit sizes build the layers themselves.
            return

        self.linear_1 = torch.nn.Linear(
            config.vision_config.hidden_size, config.text_config.hidden_size, bias=True
        )
        self.linear_2 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True)
        self.linear_3 = torch.nn.Linear(
            config.text_config.hidden_size, config.text_config.hidden_size, bias=True
        )
        self.linear_4 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True)
        self.act = ACT2FN[config.projector_hidden_act]

    def forward(self, image_features: "torch.Tensor") -> "torch.Tensor":
        r"""
        Projects image features through the projector layers.

        If the result is float32 it is cast back to a target dtype, chosen in this
        order: the autocast dtype (when autocast is enabled), the config's
        `_pre_quantization_dtype` (when present), else the dtype of `linear_1.weight`.

        Args:
            image_features: Output of the vision encoder.

        Returns:
            The projected hidden states.
        """
        hidden_states = self.linear_1(image_features)
        hidden_states = self.linear_2(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_3(hidden_states)
        hidden_states = self.linear_4(hidden_states)
        if hidden_states.dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.linear_1.weight.dtype

            transformers_logger.warning_once(
                "The hidden states seems to be silently casted in float32."
            )
            hidden_states = hidden_states.to(target_dtype)

        return hidden_states


class LlavaMultiModalProjectorForYiVLForVLLM(LlavaMultiModalProjectorForYiVL):
    r"""
    Variant of the Yi-VL projector constructed from explicit sizes instead of a config.
    """

    def __init__(
        self, vision_hidden_size: int, text_hidden_size: int, projector_hidden_act: str
    ) -> None:
        r"""
        Builds the projector layers from explicit dimensions.

        Args:
            vision_hidden_size: Input feature size of the first linear layer.
            text_hidden_size: Output size of both linear layers and of the layer norms.
            projector_hidden_act: Key into `ACT2FN` selecting the activation.
        """
        # config=None makes the parent skip its own layer construction.
        super().__init__(config=None)

        self.linear_1 = torch.nn.Linear(vision_hidden_size, text_hidden_size, bias=True)
        self.linear_2 = torch.nn.LayerNorm(text_hidden_size, bias=True)
        self.linear_3 = torch.nn.Linear(text_hidden_size, text_hidden_size, bias=True)
        self.linear_4 = torch.nn.LayerNorm(text_hidden_size, bias=True)
        self.act = ACT2FN[projector_hidden_act]


def autocast_projector_dtype(
    model: "PreTrainedModel", model_args: "ModelArguments"
) -> None:
    r"""
    Casts projector output to half precision for fine-tuning quantized VLMs.

    Does nothing unless `model.quantization_method` is set and the model type is one
    of the supported VLM types. Otherwise a forward hook is registered on the
    multimodal projector that converts its output to `model_args.compute_dtype`.

    Args:
        model: The loaded model whose projector should be hooked.
        model_args: Model arguments providing `compute_dtype`.
    """

    def _mm_projector_forward_post_hook(
        module: "torch.nn.Module", args: Tuple["torch.Tensor"], output: "torch.Tensor"
    ) -> "torch.Tensor":
        return output.to(model_args.compute_dtype)

    if getattr(model, "quantization_method", None):
        model_type = getattr(model.config, "model_type", None)
        if model_type in [
            "llava",
            "llava_next",
            "llava_next_video",
            "mllama",
            "paligemma",
            "video_llava",
        ]:
            mm_projector: "torch.nn.Module" = getattr(model, "multi_modal_projector")
        elif model_type == "qwen2_vl":
            mm_projector: "torch.nn.Module" = getattr(
                getattr(model, "visual"), "merger"
            )
        else:
            return

        logger.info_rank0(
            f"Casting multimodal projector outputs in {model_args.compute_dtype}."
        )
        mm_projector.register_forward_hook(_mm_projector_forward_post_hook)


def configure_visual_model(config: "PretrainedConfig") -> None:
    r"""
    Patches VLMs before loading them.

    For the listed model types, copies `text_config.hidden_size` onto the top-level
    config. For configs flagged with `is_yi_vl_derived_model`, replaces the LLaVA
    projector class in transformers with `LlavaMultiModalProjectorForYiVL`.

    Args:
        config: The model config; it is modified in place.
    """
    model_type = getattr(config, "model_type", None)
    if model_type in [
        "llava",
        "llava_next",
        "llava_next_video",
        "mllama",
        "paligemma",
        "video_llava",
    ]:
        # required for ds zero3 and valuehead models
        setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None))

    if getattr(config, "is_yi_vl_derived_model", None):
        logger.info_rank0("Detected Yi-VL model, applying projector patch.")
        transformers.models.llava.modeling_llava.LlavaMultiModalProjector = (
            LlavaMultiModalProjectorForYiVL
        )


# Backward-compatible alias for callers that still use the previous spelling.
configure_vial_model = configure_visual_model


def get_forbidden_modules(
    config: "PretrainedConfig", finetuning_args: "FinetuningArguments"
) -> Set[str]:
    r"""
    Freezes vision tower and language model for VLM full/freeze tuning.

    Args:
        config: The model config; only `model_type` is read.
        finetuning_args: Provides `freeze_vision_tower` and `train_mm_proj_only`.

    Returns:
        The set of module names that must not be trained; empty for other model types.
    """
    model_type = getattr(config, "model_type", None)
    forbidden_modules = set()
    if model_type in [
        "llava",
        "llava_next",
        "llava_next_video",
        "paligemma",
        "video_llava",
    ]:
        if finetuning_args.freeze_vision_tower:
            forbidden_modules.add("vision_tower")

        if finetuning_args.train_mm_proj_only:
            forbidden_modules.add("language_model")

    elif model_type == "mllama":
        if finetuning_args.freeze_vision_tower:
            forbidden_modules.add("vision_model")

        if finetuning_args.train_mm_proj_only:
            forbidden_modules.add("language_model")

    elif model_type == "qwen2_vl":
        if finetuning_args.train_mm_proj_only:
            # Everything but the merger: the rest of the vision module plus the LM.
            forbidden_modules.update(
                {"visual.patch_embed", "visual.blocks", "model", "lm_head"}
            )
        elif finetuning_args.freeze_vision_tower:
            forbidden_modules.add("visual")

    return forbidden_modules


def get_image_seqlen(config: "PretrainedConfig") -> int:
    r"""
    Computes the number of special tokens per image.

    Args:
        config: The model config.

    Returns:
        For "llava": (image_size // patch_size) ** 2, plus one when
        `vision_feature_select_strategy` is "full". For "paligemma":
        `vision_config.num_image_tokens`. For any other model type: -1.
    """
    model_type = getattr(config, "model_type", None)
    if model_type == "llava":
        image_seqlen = (
            config.vision_config.image_size // config.vision_config.patch_size
        ) ** 2
        if (
            getattr(config, "vision_feature_select_strategy", "default") == "full"
        ):  # add [CLS] token
            image_seqlen += 1
    elif model_type == "paligemma":
        image_seqlen = config.vision_config.num_image_tokens
    else:
        image_seqlen = -1

    return image_seqlen


def get_patch_size(config: "PretrainedConfig", processor: "ProcessorMixin") -> int:
    r"""
    Computes the patch size of the vit.

    Args:
        config: The model config; `vision_config.patch_size` takes precedence.
        processor: Fallback source of `patch_size`.

    Returns:
        The patch size, or -1 when neither object defines it.
    """
    patch_size = getattr(
        config.vision_config, "patch_size", getattr(processor, "patch_size", -1)
    )
    return patch_size


def get_vision_feature_select_strategy(
    config: "PretrainedConfig", processor: "ProcessorMixin"
) -> str:
    r"""
    Get the vision_feature_select_strategy.

    Args:
        config: The model config; its attribute takes precedence.
        processor: Fallback source of the attribute.

    Returns:
        The strategy from the config, else from the processor, else "default".
    """
    vision_feature_select_strategy = getattr(
        config,
        "vision_feature_select_strategy",
        getattr(processor, "vision_feature_select_strategy", "default"),
    )
    return vision_feature_select_strategy


def patch_target_modules(
    config: "PretrainedConfig",
    finetuning_args: "FinetuningArguments",
    target_modules: Sequence[str],
) -> Union[str, List[str]]:
    r"""
    Freezes vision tower for VLM LoRA tuning.

    Args:
        config: The model config; `model_type` and `vision_config.model_type` are read.
        finetuning_args: Provides `freeze_vision_tower`.
        target_modules: Names of the modules selected as tuning targets.

    Returns:
        A regex string that matches the target modules while excluding the relevant
        vision submodule, or `target_modules` unchanged when no exclusion applies.
    """
    model_type = getattr(config, "model_type", None)
    vit_model_type = getattr(getattr(config, "vision_config", None), "model_type", None)
    if finetuning_args.freeze_vision_tower:
        if model_type in [
            "llava",
            "llava_next",
            "llava_next_video",
            "paligemma",
            "video_llava",
        ]:
            return "^(?!.*vision_tower).*(?:{}).*".format("|".join(target_modules))
        elif model_type == "mllama":
            return "^(?!.*vision_model).*(?:{}).*".format("|".join(target_modules))
        elif model_type == "qwen2_vl":
            return "^(?!.*visual).*(?:{}).*".format("|".join(target_modules))
        else:
            return target_modules
    else:
        # Even with a trainable vision tower, the patch embedding layers are excluded.
        if model_type == "qwen2_vl":
            return "^(?!.*patch_embed).*(?:{}).*".format("|".join(target_modules))
        elif vit_model_type == "pixtral":
            return "^(?!.*patch_conv).*(?:{}).*".format("|".join(target_modules))
        else:
            return target_modules