# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch T5 model. """

import copy
import math
import os
import warnings
from regex import E

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.utils.checkpoint import checkpoint
import numpy as np

from transformers.activations import ACT2FN
from transformers.file_utils import (
    DUMMY_INPUTS,
    DUMMY_MASK,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from transformers.modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from transformers.modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
from transformers.utils import logging
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
from .configuration_t5 import T5Config

from seq2seq.adapters import AdapterController
from seq2seq.hypercomplex.layers import PHMLinear
from seq2seq.hypercomplex.inits import glorot_uniform, glorot_normal
from typing import Dict, Any

# Module-level logger obtained through the transformers logging helper.
logger = logging.get_logger(__name__)

# Class names kept as strings; they are not referenced in the visible part of
# this file (presumably consumed by docstring decorators further down — TODO confirm).
_CONFIG_FOR_DOC = "T5Config"
_TOKENIZER_FOR_DOC = "T5Tokenizer"

####################################################
# This list contains the identifiers of the
# pretrained checkpoints provided with the models
####################################################
T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "t5-small",
    "t5-base",
    "t5-large",
    "t5-3b",
    "t5-11b",
    # See all T5 models at https://huggingface.co/models?filter=t5
]


####################################################
# This is a conversion method from TF 1.0 to PyTorch
# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
####################################################
def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model.

    Every variable of the TensorFlow checkpoint is read into memory, then each
    variable name is split on "/" and walked component by component through the
    attributes of `model` until the matching PyTorch parameter is reached. The
    parameter's `.data` is then replaced with the (possibly transposed) array.

    Args:
        model: The PyTorch model whose parameters are overwritten in place.
        config: Unused by this function body.
        tf_checkpoint_path: Path to the TensorFlow checkpoint; made absolute before use.

    Returns:
        The same `model` object that was passed in.

    Raises:
        ImportError: Re-raised (after logging an error) when `re`, `numpy` or `tensorflow`
            cannot be imported.
        AssertionError: When the shape of the resolved PyTorch parameter differs from the
            shape of the checkpoint array; both shapes are appended to the exception args.
    """
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    tf_weights = {}
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        tf_weights[name] = array

    # Entries are popped from `tf_weights` as they are skipped or copied, so whatever
    # is left at the end is reported as "not copied".
    for txt_name in names:
        name = txt_name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(
                n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
                for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            tf_weights.pop(txt_name, None)
            continue
        # Variables whose last path component contains "_slot_" are skipped as well.
        if "_slot_" in name[-1]:
            logger.info(f"Skipping {'/'.join(name)}")
            tf_weights.pop(txt_name, None)
            continue
        # Start the attribute walk at the model root for every variable.
        pointer = model
        array = tf_weights[txt_name]

        for m_name in name:
            # "<letters>_<digits>" components are split into a name and a numeric index,
            # e.g. "block_3" -> ["block", "3", ""].
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] in ["kernel", "scale", "embedding"]:
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "self_attention":
                pointer = getattr(pointer, "layer")
                pointer = pointer[0]
            elif scope_names[0] == "enc_dec_attention":
                pointer = getattr(pointer, "layer")
                pointer = pointer[1]
            elif scope_names[0] == "dense_relu_dense":
                pointer = getattr(pointer, "layer")
                pointer = pointer[2]
            elif scope_names[0] == "rms_norm":
                # If neither attribute exists the pointer is left unchanged.
                if hasattr(pointer, "layer_norm"):
                    pointer = getattr(pointer, "layer_norm")
                elif hasattr(pointer, "final_layer_norm"):
                    pointer = getattr(pointer, "final_layer_norm")
            # NOTE(review): this branch is unreachable — "scale" is already matched by the
            # first branch of this chain.
            elif scope_names[0] == "scale":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            elif scope_names[0] == "decoder" and name[1] == "logits":
                # Skip the "decoder" component so that "logits" resolves against the current pointer.
                continue
            elif scope_names[0] == "logits":
                pointer = getattr(pointer, "lm_head")
            elif scope_names[0] == "wi" and len(scope_names) > 1 and scope_names[1].isdigit():
                # "wi_0" / "wi_1" are attribute names, not an indexable "wi"; `continue`
                # bypasses the numeric indexing below.
                pointer = getattr(pointer, f"wi_{scope_names[1]}")
                continue
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    # NOTE(review): this `continue` only moves on to the next path component
                    # (inner loop); the variable itself is still processed below.
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        # From here on `scope_names` refers to the LAST path component of the variable name.
        if scope_names[0] not in ["kernel", "scale", "embedding"]:
            pointer = getattr(pointer, "weight")
        # Everything except embeddings is transposed before the shape check.
        if scope_names[0] != "embedding":
            logger.info(f"Transposing numpy weight of shape {array.shape} for {name}")
            array = np.transpose(array)
        try:
            assert (
                    pointer.shape == array.shape
            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
        except AssertionError as e:
            # Attach both shapes to the exception before propagating it.
            e.args += (pointer.shape, array.shape)
            raise
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array.astype(np.float32))
        tf_weights.pop(txt_name, None)

    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.")
    return model


####################################################
# PyTorch Models are constructed by sub-classing
# - torch.nn.Module for the layers and
# - PreTrainedModel for the models (itself a sub-class of torch.nn.Module)
####################################################
# Documentation text describing model-parallel placement via a device map.
# Not referenced in the visible part of this file (presumably attached to a
# `parallelize` method by a docstring decorator further down — TODO confirm).
PARALLELIZE_DOCSTRING = r"""
    This is an experimental feature and is a subject to change at a moment's notice.

    Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
    it will evenly distribute blocks across all devices.

    Args:
        device_map (:obj:`Dict[int, list]`, optional, defaults to None):
            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
            automatically mapped to the first device (for esoteric reasons). That means that the first device should
            have fewer attention modules mapped to it than other devices. For reference, the t5 models have the
            following number of attention modules:

                - t5-small: 6
                - t5-base: 12
                - t5-large: 24
                - t5-3b: 24
                - t5-11b: 24

    Example::

            # Here is an example of a device map on a machine with 4 GPUs using t5-3b, which has a total of 24 attention modules:
            model = T5ForConditionalGeneration.from_pretrained('t5-3b')
            device_map = {0: [0, 1, 2],

                         1: [3, 4, 5, 6, 7, 8, 9],
                         2: [10, 11, 12, 13, 14, 15, 16],
                         3: [17, 18, 19, 20, 21, 22, 23]}
            model.parallelize(device_map)
"""
# Documentation text describing how to undo model-parallel placement.
# Not referenced in the visible part of this file (presumably attached to a
# `deparallelize` method further down — TODO confirm).
DEPARALLELIZE_DOCSTRING = r"""
    Moves the model to cpu from a model parallel state.

    Example::

        # On a 4 GPU machine with t5-3b:
        model = T5ForConditionalGeneration.from_pretrained('t5-3b')
        device_map = {0: [0, 1, 2],

                     1: [3, 4, 5, 6, 7, 8, 9],
                     2: [10, 11, 12, 13, 14, 15, 16],
                     3: [17, 18, 19, 20, 21, 22, 23]}
        model.parallelize(device_map) # Splits the model across several devices
        model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
"""


class T5LayerNorm(nn.Module):
    """
    T5-style layer normalisation: scale by the root mean square of the last
    dimension, without mean subtraction. A learnable additive bias is created
    only when `adapter_config.bitfit` is truthy.
    """

    def __init__(self, hidden_size, eps=1e-6, adapter_config=None):
        """
        Args:
            hidden_size: Size of the learnable scale (and optional bias) vector.
            eps: Constant added to the mean square before the reciprocal square root.
            adapter_config: Optional object exposing a `bitfit` attribute; `None` disables the bias.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        self.bitfit = False if adapter_config is None else adapter_config.bitfit
        if self.bitfit:
            self.bias = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, hidden_states):
        """Return `weight * hidden_states / rms(hidden_states)`, plus `bias` when bitfit is on."""
        # The mean of squares is accumulated in float32 regardless of the input dtype.
        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        normed = hidden_states * torch.rsqrt(mean_square + self.variance_epsilon)

        # Go back to half precision when the module's weights are stored as float16.
        if self.weight.dtype == torch.float16:
            normed = normed.to(torch.float16)

        scaled = self.weight * normed
        return (scaled + self.bias) if self.bitfit else scaled


class T5DenseReluDense(nn.Module):
    """
    Two-layer feed-forward network: linear `wi`, ReLU, dropout, linear `wo`.
    Both linear layers get a bias term only when `adapter_config.bitfit` is truthy.
    """

    def __init__(self, config, adapter_config=None):
        """
        Args:
            config: Object providing `d_model`, `d_ff` and `dropout_rate`.
            adapter_config: Optional object exposing a `bitfit` attribute; `None` means no biases.
        """
        super().__init__()
        self.bitfit = False if adapter_config is None else adapter_config.bitfit
        use_bias = bool(self.bitfit)
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=use_bias)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=use_bias)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        """Apply `wo(dropout(relu(wi(hidden_states))))`."""
        activated = F.relu(self.wi(hidden_states))
        return self.wo(self.dropout(activated))


class T5DenseGatedGeluDense(nn.Module):
    """
    Gated feed-forward network: the `gelu_new` activation of `wi_0` is multiplied
    element-wise with the plain projection `wi_1`, followed by dropout and `wo`.
    All three linear layers get a bias term only when `adapter_config.bitfit` is truthy.
    """

    def __init__(self, config, adapter_config=None):
        """
        Args:
            config: Object providing `d_model`, `d_ff` and `dropout_rate`.
            adapter_config: Optional object exposing a `bitfit` attribute; `None` means no biases.
        """
        super().__init__()
        self.bitfit = False if adapter_config is None else adapter_config.bitfit
        use_bias = bool(self.bitfit)
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=use_bias)
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=use_bias)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=use_bias)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.gelu_act = ACT2FN["gelu_new"]

    def forward(self, hidden_states):
        """Apply `wo(dropout(gelu_new(wi_0(x)) * wi_1(x)))`."""
        gate = self.gelu_act(self.wi_0(hidden_states))
        gated = gate * self.wi_1(hidden_states)
        return self.wo(self.dropout(gated))


class T5LayerFF(nn.Module):
    """
    Feed-forward sub-layer: layer norm, dense projection, optional task adapter,
    dropout, and a residual connection back onto the input.
    """

    def __init__(self, config, adapter_config=None):
        """
        Args:
            config: Object providing `feed_forward_proj`, `train_task_adapters`, `d_model`,
                `layer_norm_epsilon` and `dropout_rate`.
            adapter_config: Adapter settings forwarded to the dense block and the layer norm.
                It must not be `None` when `config.train_task_adapters` is truthy, because
                `add_adapter_in_feed_forward` is read from it in that case.

        Raises:
            ValueError: If `config.feed_forward_proj` is neither `relu` nor `gated-gelu`.
        """
        super().__init__()
        if config.feed_forward_proj == "relu":
            self.DenseReluDense = T5DenseReluDense(config, adapter_config=adapter_config)
        elif config.feed_forward_proj == "gated-gelu":
            self.DenseReluDense = T5DenseGatedGeluDense(config, adapter_config=adapter_config)
        else:
            # Use the `config` argument here: this module never stores `self.config`, so
            # reading it would raise AttributeError and hide the intended ValueError.
            raise ValueError(
                f"{config.feed_forward_proj} is not supported. Choose between `relu` and `gated-gelu`"
            )
        self.train_task_adapters = config.train_task_adapters and adapter_config.add_adapter_in_feed_forward
        if self.train_task_adapters:
            # The adapter controller is built with the task-specific reduction factor;
            # note that this writes to the shared `adapter_config` object.
            adapter_config.reduction_factor = adapter_config.task_reduction_factor
            self.adapter_controller = AdapterController(adapter_config)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon, adapter_config=adapter_config)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states, task_block_adapters=None, task=None):
        """
        Args:
            hidden_states: Input tensor; also used as the residual.
            task_block_adapters: Accepted for interface compatibility; unused here.
            task: Passed to the adapter controller when task adapters are enabled.

        Returns:
            `hidden_states` plus the dropped-out feed-forward (and adapter) output.
        """
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        if self.train_task_adapters:
            forwarded_states = self.adapter_controller(forwarded_states, task)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class T5Attention(nn.Module):
    """
    Multi-head attention used both as self-attention and as attention over
    externally supplied key/value states (see `forward`).

    The q/k/v/o projections carry a bias only when `adapter_config.bitfit` is truthy.
    A learned relative-position bias table is created only when
    `has_relative_attention_bias` is set.
    """

    def __init__(self, config: T5Config, has_relative_attention_bias=False, adapter_config=None):
        """
        Args:
            config: Provides `is_decoder`, `relative_attention_num_buckets`, `d_model`, `d_kv`,
                `num_heads`, `dropout_rate` and, optionally, `gradient_checkpointing`.
            has_relative_attention_bias: Whether this module owns a relative-position bias embedding.
            adapter_config: Optional object exposing a `bitfit` attribute; `None` disables biases.
        """
        super().__init__()
        self.bitfit = adapter_config.bitfit if adapter_config is not None else False
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias

        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False if not self.bitfit else True)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False if not self.bitfit else True)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False if not self.bitfit else True)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False if not self.bitfit else True)

        if self.has_relative_attention_bias:
            # One bias value per (bucket, head).
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        # Falls back to False when the config does not define the attribute.
        self.gradient_checkpointing = getattr(config, "gradient_checkpointing", False)

    def prune_heads(self, heads):
        """
        Remove the given attention heads from the q/k/v/o projections and update
        `n_heads`, `inner_dim` and `pruned_heads` accordingly. Does nothing for an
        empty `heads` collection.
        """
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
        )
        # Prune linear layers
        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.inner_dim = self.key_value_proj_dim * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            # Half of the buckets are reserved for positive offsets; the sign is
            # encoded as an offset of `num_buckets` and then dropped via abs().
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            # Positive offsets are clamped to 0; negative ones become their magnitude.
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        # (this expression is evaluated for every position; the result is only used
        # where `is_small` is False, see torch.where below)
        relative_postion_if_large = max_exact + (
                torch.log(relative_position.float() / max_exact)
                / math.log(max_distance / max_exact)
                * (num_buckets - max_exact)
        ).to(torch.long)
        # Cap at the last bucket index.
        relative_postion_if_large = torch.min(
            relative_postion_if_large, torch.full_like(relative_postion_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_postion_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length):
        """Compute binned relative position bias"""
        # The position grids are created without an explicit device; the bucket
        # indices are moved to the embedding's device before the lookup.
        context_position = torch.arange(query_length, dtype=torch.long)[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
        )
        relative_position_bucket = relative_position_bucket.to(self.relative_attention_bias.weight.device)
        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values

    def forward(
            self,
            hidden_states,
            mask=None,
            key_value_states=None,
            position_bias=None,
            past_key_value=None,
            layer_head_mask=None,
            query_length=None,
            use_cache=False,
            output_attentions=False,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).

        Args:
            hidden_states: Query-side input.
            mask: Optional additive mask; added to the position bias when the bias is built here.
            key_value_states: Optional source states for keys/values; `None` selects self-attention.
            position_bias: Optional precomputed bias added to the scores; built here when `None`.
            past_key_value: Optional pair (keys, values) of cached states.
            layer_head_mask: Optional multiplier applied to the attention weights.
            query_length: Optional length added to the sequence length when a cache is given;
                defaults to the cached key length.
            use_cache: Whether to return the key/value states (only honoured when `is_decoder`).
            output_attentions: Whether to append the attention weights to the result.

        Returns:
            Tuple `(attn_output, present_key_value_state, position_bias)` with the attention
            weights appended when `output_attentions` is set. `present_key_value_state` is
            `None` unless `is_decoder` and `use_cache` are both truthy.
        """
        # Input is (batch_size, seq_length, dim)
        # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
        # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)
        batch_size, seq_length = hidden_states.shape[:2]
        int_seq_length = int(seq_length)

        real_seq_length = seq_length

        if past_key_value is not None:
            assert (
                    len(past_key_value) == 2
            ), f"past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states"
            # Account for the positions already held in the cache.
            real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length

        key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]

        def shape(states):
            """Split the last dimension into heads: (batch, n_heads, length, dim_per_head)."""
            return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        def unshape(states):
            """Merge the heads back: (batch, length, inner_dim)."""
            return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)

        def project(hidden_states, proj_layer, key_value_states, past_key_value):
            """projects hidden states correctly to key/query states"""
            if key_value_states is None:
                # self-attn
                # (batch_size, n_heads, seq_length, dim_per_head)
                hidden_states = shape(proj_layer(hidden_states))
            elif past_key_value is None:
                # cross-attn
                # (batch_size, n_heads, seq_length, dim_per_head)
                hidden_states = shape(proj_layer(key_value_states))

            if past_key_value is not None:
                if key_value_states is None:
                    # self-attn
                    # (batch_size, n_heads, key_length, dim_per_head)
                    hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
                else:
                    # cross-attn: the cached projection is reused as-is
                    hidden_states = past_key_value
            return hidden_states

        # get query states
        query_states = shape(self.q(hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)

        # get key/value states
        key_states = project(
            hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None
        )
        value_states = project(
            hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None
        )

        # compute scores
        scores = torch.matmul(
            query_states, key_states.transpose(3, 2)
        )  # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9

        if position_bias is None:
            if not self.has_relative_attention_bias:
                # No learned bias in this module: start from zeros.
                position_bias = torch.zeros(
                    (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype
                )
                if self.training and self.gradient_checkpointing:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(real_seq_length, key_length)

            # if key and values are already calculated
            # we want only the last query position bias
            if past_key_value is not None:
                position_bias = position_bias[:, :, -int_seq_length:, :]

            if mask is not None:
                position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)
        # In-place add on the freshly computed scores tensor.
        scores += position_bias
        # Softmax is computed in float32 and cast back to the dtype of `scores`.
        attn_weights = F.softmax(scores.float(), dim=-1).type_as(
            scores
        )  # (batch_size, n_heads, seq_length, key_length)
        attn_weights = F.dropout(
            attn_weights, p=self.dropout, training=self.training
        )  # (batch_size, n_heads, seq_length, key_length)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = unshape(torch.matmul(attn_weights, value_states))  # (batch_size, seq_length, dim)
        attn_output = self.o(attn_output)

        present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs


class T5LayerSelfAttention(nn.Module):
    """
    Self-attention sub-layer: layer norm, attention, optional task adapter,
    dropout, and a residual connection back onto the input.
    """

    def __init__(self, config, has_relative_attention_bias=False, adapter_config=None):
        """
        Args:
            config: Provides `d_model`, `layer_norm_epsilon`, `dropout_rate`,
                `train_task_adapters` and whatever `T5Attention` reads.
            has_relative_attention_bias: Forwarded to `T5Attention`.
            adapter_config: Adapter settings; `add_adapter_in_self_attention` is read from it
                when `config.train_task_adapters` is truthy.
        """
        super().__init__()
        self.SelfAttention = T5Attention(
            config,
            has_relative_attention_bias=has_relative_attention_bias,
            adapter_config=adapter_config,
        )
        self.layer_norm = T5LayerNorm(
            config.d_model,
            eps=config.layer_norm_epsilon,
            adapter_config=adapter_config,
        )
        self.train_task_adapters = config.train_task_adapters and adapter_config.add_adapter_in_self_attention
        if self.train_task_adapters:
            # The controller is built with the task-specific reduction factor.
            adapter_config.reduction_factor = adapter_config.task_reduction_factor
            self.adapter_controller = AdapterController(adapter_config)

        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
            self,
            hidden_states,
            attention_mask=None,
            position_bias=None,
            layer_head_mask=None,
            past_key_value=None,
            use_cache=False,
            output_attentions=False,
            task_block_adapters=None,
            task=None
    ):
        """
        Run attention on the normalised input and add the result to `hidden_states`.

        Returns:
            Tuple whose first element is the updated hidden states, followed by every
            element of the attention output after the first.
        """
        attn_result = self.SelfAttention(
            self.layer_norm(hidden_states),
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attended = attn_result[0]
        if self.train_task_adapters:
            attended = self.adapter_controller(attended, task)
        residual_sum = hidden_states + self.dropout(attended)
        # Pass the remaining attention outputs through unchanged.
        return (residual_sum,) + attn_result[1:]


class T5LayerCrossAttention(nn.Module):
    """
    Cross-attention sub-layer: layer norm, attention over `key_value_states`,
    dropout, and a residual connection back onto the input.
    """

    def __init__(self, config, adapter_config=None):
        """
        Args:
            config: Provides `d_model`, `layer_norm_epsilon`, `dropout_rate` and whatever
                `T5Attention` reads.
            adapter_config: Forwarded to the attention module and the layer norm.
        """
        super().__init__()
        self.EncDecAttention = T5Attention(
            config,
            has_relative_attention_bias=False,
            adapter_config=adapter_config,
        )
        self.layer_norm = T5LayerNorm(
            config.d_model,
            eps=config.layer_norm_epsilon,
            adapter_config=adapter_config,
        )
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
            self,
            hidden_states,
            key_value_states,
            attention_mask=None,
            position_bias=None,
            layer_head_mask=None,
            past_key_value=None,
            use_cache=False,
            query_length=None,
            output_attentions=False,
    ):
        """
        Attend from the normalised `hidden_states` to `key_value_states` and add the
        result to `hidden_states`.

        Returns:
            Tuple whose first element is the updated hidden states, followed by every
            element of the attention output after the first.
        """
        attn_result = self.EncDecAttention(
            self.layer_norm(hidden_states),
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions,
        )
        residual_sum = hidden_states + self.dropout(attn_result[0])
        # Pass the remaining attention outputs through unchanged.
        return (residual_sum,) + attn_result[1:]


class T5Block(nn.Module):
    """
    One transformer block: self-attention, then (decoder only) cross-attention,
    then the feed-forward sub-layer. Sub-layers live in `self.layer` in that order.
    """

    def __init__(self, config, has_relative_attention_bias=False, adapter_config=None):
        """
        Args:
            config: Provides `is_decoder` plus everything the sub-layers read.
            has_relative_attention_bias: Forwarded to the self-attention sub-layer.
            adapter_config: Forwarded to every sub-layer.
        """
        super().__init__()
        self.is_decoder = config.is_decoder
        sublayers = [
            T5LayerSelfAttention(
                config,
                has_relative_attention_bias=has_relative_attention_bias,
                adapter_config=adapter_config,
            )
        ]
        if self.is_decoder:
            sublayers.append(T5LayerCrossAttention(config, adapter_config=adapter_config))
        sublayers.append(T5LayerFF(config, adapter_config=adapter_config))
        self.layer = nn.ModuleList(sublayers)

    @staticmethod
    def _clamp_fp16_inf(hidden_states):
        """Clamp float16 tensors that contain inf values so fp16 training can continue."""
        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
        return hidden_states

    def forward(
            self,
            hidden_states,
            attention_mask=None,
            position_bias=None,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            encoder_decoder_position_bias=None,
            layer_head_mask=None,
            cross_attn_layer_head_mask=None,
            past_key_value=None,
            use_cache=False,
            output_attentions=False,
            return_dict=True,
            task_block_adapters=None,
            task=None
    ):
        """
        Run the block's sub-layers in order.

        Returns:
            Tuple starting with the hidden states; when `use_cache` is set the present
            key/value state comes next; the remaining elements are the extra outputs of
            the self-attention sub-layer followed by those of the cross-attention sub-layer.

        Raises:
            ValueError: If `past_key_value` does not hold the expected number of states
                (2 without encoder hidden states, 4 with them).
        """
        if past_key_value is None:
            self_attn_past_key_value = None
            cross_attn_past_key_value = None
        else:
            assert self.is_decoder, "Only decoder can use `past_key_values`"
            expected_num_past_key_values = 4 if encoder_hidden_states is not None else 2

            if len(past_key_value) != expected_num_past_key_values:
                raise ValueError(
                    f"There should be {expected_num_past_key_values} past states. "
                    f"{'2 (past / key) for cross attention' if expected_num_past_key_values == 4 else ''}."
                    f"Got {len(past_key_value)} past key / value states"
                )

            # First two entries belong to self-attention, the rest to cross-attention.
            self_attn_past_key_value = past_key_value[:2]
            cross_attn_past_key_value = past_key_value[2:]

        self_attn_result = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=self_attn_past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            task_block_adapters=task_block_adapters,
            task=task
        )
        hidden_states, present_key_value_state = self_attn_result[:2]
        # Keep self-attention outputs and relative position weights
        attention_outputs = self_attn_result[2:]

        hidden_states = self._clamp_fp16_inf(hidden_states)

        if self.is_decoder and encoder_hidden_states is not None:
            # The actual query length is unknown to cross attention when cached
            # key/value states are in use, so it is taken from the self-attention cache.
            query_length = (
                present_key_value_state[0].shape[2] if present_key_value_state is not None else None
            )

            cross_attn_result = self.layer[1](
                hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
                query_length=query_length,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )
            hidden_states = self._clamp_fp16_inf(cross_attn_result[0])

            # Combine self attn and cross attn key value states
            if present_key_value_state is not None:
                present_key_value_state = present_key_value_state + cross_attn_result[1]

            # Keep cross-attention outputs and relative position weights
            attention_outputs = attention_outputs + cross_attn_result[2:]

        # Feed-forward sub-layer is always the last entry of `self.layer`.
        hidden_states = self.layer[-1](
            hidden_states,
            task_block_adapters=task_block_adapters,
            task=task,
        )
        hidden_states = self._clamp_fp16_inf(hidden_states)

        head = (hidden_states,)
        if use_cache:
            head = head + (present_key_value_state,)

        # hidden-states, present_key_value_states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
        return head + attention_outputs


class T5PreTrainedModel(PreTrainedModel):
    """
    Abstract base class that takes care of weight initialization and provides the common interface for
    downloading and loading pretrained models.
    """

    config_class = T5Config
    load_tf_weights = load_tf_weights_in_t5
    base_model_prefix = "transformer"
    is_parallelizable = True

    @property
    def dummy_inputs(self):
        """Return placeholder tensors built from ``DUMMY_INPUTS`` and ``DUMMY_MASK``."""
        token_ids = torch.tensor(DUMMY_INPUTS)
        mask = torch.tensor(DUMMY_MASK)
        return {
            "decoder_input_ids": token_ids,
            "input_ids": token_ids,
            "decoder_attention_mask": mask,
        }

    def _init_weights(self, module):
        """
        Initialize the weights of ``module`` according to its type.

        Every standard deviation is scaled by ``config.initializer_factor``. Module types that match none of
        the cases below are left untouched.
        """
        factor = self.config.initializer_factor  # Used for testing weights initialization

        def init_projection(projection, std):
            # Draw the weight from N(0, std) and clear the bias when the layer carries one.
            projection.weight.data.normal_(mean=0.0, std=std)
            if hasattr(projection, "bias") and projection.bias is not None:
                projection.bias.data.zero_()

        if isinstance(module, T5LayerNorm):
            module.weight.data.fill_(factor * 1.0)
            return

        if isinstance(module, (T5Model, T5ForConditionalGeneration_vote, T5EncoderModel)):
            # Mesh TensorFlow embeddings initialization
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
            # Prefix embeddings are (re)initialized only when the module actually holds a shared prefix.
            if getattr(module, 'prefix_shared', None) is not None:
                self.init_prefix_weights()
            return

        if isinstance(module, T5DenseReluDense):
            # Mesh TensorFlow FF initialization
            # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
            # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
            init_projection(module.wi, factor * ((self.config.d_model) ** -0.5))
            init_projection(module.wo, factor * ((self.config.d_ff) ** -0.5))
            return

        if isinstance(module, T5DenseGatedGeluDense):
            # Same scheme as the ReLU feed-forward, with two input projections.
            init_projection(module.wi_0, factor * ((self.config.d_model) ** -0.5))
            init_projection(module.wi_1, factor * ((self.config.d_model) ** -0.5))
            init_projection(module.wo, factor * ((self.config.d_ff) ** -0.5))
            return

        if isinstance(module, T5Attention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
            hidden_size = self.config.d_model
            head_dim = self.config.d_kv
            num_heads = self.config.num_heads
            module.q.weight.data.normal_(mean=0.0, std=factor * ((hidden_size * head_dim) ** -0.5))
            module.k.weight.data.normal_(mean=0.0, std=factor * (hidden_size ** -0.5))
            module.v.weight.data.normal_(mean=0.0, std=factor * (hidden_size ** -0.5))
            module.o.weight.data.normal_(mean=0.0, std=factor * ((num_heads * head_dim) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((hidden_size) ** -0.5))
            return

        if isinstance(module, nn.Linear):
            # Any remaining plain linear layer (the adapter layers): small-variance weights, zero bias.
            module.weight.data.normal_(mean=0.0, std=0.01)
            if module.bias is not None:
                module.bias.data.zero_()

    def _shift_right(self, input_ids):
        """
        Build decoder inputs by shifting ``input_ids`` one position to the right along the last dimension.

        Position 0 is filled with ``config.decoder_start_token_id`` and any ``-100`` entry (the label ignore
        value) is replaced by ``config.pad_token_id``. Asserts that both ids are configured and that the
        result contains no negative value.
        """
        start_id = self.config.decoder_start_token_id
        pad_id = self.config.pad_token_id

        assert (
                start_id is not None
        ), "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. See T5 docs for more information"

        # Start token first, then the original sequence minus its last element.
        shifted = input_ids.new_zeros(input_ids.shape)
        shifted[..., 0] = start_id
        shifted[..., 1:] = input_ids[..., :-1].clone()

        assert pad_id is not None, "self.model.config.pad_token_id has to be defined."
        # -100 marks ignored label positions; they must become real token ids before embedding.
        ignored_positions = shifted == -100
        shifted.masked_fill_(ignored_positions, pad_id)

        assert torch.all(shifted >= 0).item(), "Verify that `shifted_input_ids` has only positive values"

        return shifted


class T5Stack(T5PreTrainedModel):
    '''
    Change the encode here~~~
    '''
    def __init__(self, config, embed_tokens=None, adapter_config=None, prefix_emb=None, attn_prefix_tuning=False,
                 mul_prefix_emb=None, model_dim=768, attn_method="linear", shared_attn=False, ignore_target=False,
                 temperature=2000):
        """
        Build the stack of T5 blocks together with the optional prompt-attention modules.

        Args:
            config: model configuration; ``is_decoder``, ``prefix_tuning``, ``num_layers``, ``d_model``,
                ``layer_norm_epsilon`` and ``dropout_rate`` are read here.
            embed_tokens: token embedding module stored on the stack (may be ``None``).
            adapter_config: adapter settings handed to every block; must provide ``prefix_dim`` when
                ``config.prefix_tuning`` is set.
            prefix_emb: target prompt embedding(s); stored as ``None`` unless ``ignore_target`` is ``False``.
            attn_prefix_tuning: selects attention over prompts (``append_attn_prefix``) instead of plain
                prompt prepending (``append_prefix``) on the encoder side.
            mul_prefix_emb: source prompt embeddings; the size of its first dimension is used as the number
                of source prompts for the ``"weight"`` method.
            model_dim: width of the attention projection layers.
            attn_method: name of the prompt-attention variant; decides which extra modules are created.
            shared_attn: stored as-is for use by the forward pass.
            ignore_target: when true, the target prompt is left out of the attention candidates.
            temperature: divisor applied to the attention scores in the forward pass.
        """
        super().__init__(config)

        self.embed_tokens = embed_tokens

        self.is_decoder = config.is_decoder
        #######################################
        self.ignore_target = ignore_target
        self.prefix_emb = prefix_emb if self.ignore_target is False else None
        self.prefix_tuning = config.prefix_tuning
        self.attn_prefix_tuning = attn_prefix_tuning
        self.mul_prefix_emb = mul_prefix_emb
        self.attn_method = attn_method
        self.model_dim = model_dim
        self.shared_attn = shared_attn
        self.temperature = temperature
        # Prompts are only ever injected on the encoder side; the two flags are mutually exclusive.
        self.append_prefix = self.prefix_tuning and not self.is_decoder and not self.attn_prefix_tuning
        self.append_attn_prefix = self.prefix_tuning and not self.is_decoder and self.attn_prefix_tuning
        if self.prefix_tuning:
            self.prefix_dim = adapter_config.prefix_dim
        if self.append_attn_prefix:
            if self.attn_method == "linear":
                self.attn_Wa = nn.Linear(self.model_dim, self.model_dim, bias=False)
                self.layer_norm = nn.LayerNorm(self.model_dim)
            elif self.attn_method == "weight":
                # One learned weight per candidate prompt; the target prompt adds one candidate.
                num_candidates = mul_prefix_emb.size()[0] + (0 if self.ignore_target else 1)
                self.attn_W = nn.Linear(num_candidates, 1, bias=False)
                self.layer_norm = nn.LayerNorm(num_candidates)
            elif self.attn_method == "weight_more":
                # NOTE(review): 7 is hard-coded; forward's shape notes suggest 6 source prompts + 1 target
                # -- confirm before using a different number of prompts.
                self.attn_W = nn.Linear(self.prefix_dim, 7, bias=False)
                self.layer_norm = nn.LayerNorm(self.prefix_dim)
            elif "sub" in self.attn_method:
                # Substring match on purpose: forward() enters its down/up-projection branch for every
                # method whose name contains "sub" (e.g. "sub_no_t"), so those variants need these
                # modules too; an exact == "sub" test left them undefined and forward() failed.
                self.attn_W_down = nn.Linear(self.model_dim, 100, bias=False)
                self.attn_W_up = nn.Linear(100, self.model_dim, bias=False)
                self.attn_non_linear = nn.SiLU()
                self.layer_norm = nn.LayerNorm(self.model_dim)
            # Other methods handled by forward() ("dot", "constant", "vote") need no extra parameters.
        #######################################
        self.adapter_config = adapter_config
        # Only the first block owns the relative attention bias.
        self.block = nn.ModuleList(
            [T5Block(self.per_layer_config(config, i, self.adapter_config, self.is_decoder),
                     has_relative_attention_bias=bool(i == 0),
                     adapter_config=adapter_config) for i in range(config.num_layers)]
        )
        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        self.init_weights()
        # Model parallel
        self.model_parallel = False
        self.device_map = None

    ######################################################
    def per_layer_config(self, config, layer_id, adapter_config, is_decoder):
        """
        Return the configuration to use for the block at position ``layer_id``.

        Without an ``adapter_config`` the given ``config`` is returned unchanged. Otherwise a deep copy is
        returned whose ``train_task_adapters`` stays enabled only if it was enabled already, the layer is
        listed for this side of the model (a missing list allows every layer), and, for the decoder,
        ``adapter_config.task_adapter_in_decoder`` is set.
        """
        if adapter_config is None:
            return config

        layer_config = copy.deepcopy(config)

        # Pick the layer whitelist matching the side of the model this stack implements.
        if is_decoder:
            allowed_layers = adapter_config.task_adapter_layers_decoder
        else:
            allowed_layers = adapter_config.task_adapter_layers_encoder
        layer_allowed = allowed_layers is None or layer_id in allowed_layers

        # The encoder always qualifies; the decoder needs an explicit opt-in.
        side_allowed = adapter_config.task_adapter_in_decoder if is_decoder else True

        layer_config.train_task_adapters = (
            layer_config.train_task_adapters and layer_allowed and side_allowed
        )
        return layer_config
        ####################################################

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        # Spread the blocks over several devices. `device_map` maps a device id to the list of block
        # indices placed on it; when omitted, get_device_map builds one over all visible CUDA devices.
        # (Kept as comments: the decorator composes this method's docstring.)
        if device_map is None:
            device_map = get_device_map(len(self.block), range(torch.cuda.device_count()))
        self.device_map = device_map
        assert_device_map(self.device_map, len(self.block))
        self.model_parallel = True

        # The embeddings live on the first device and the final layer norm on the last one.
        device_ids = self.device_map.keys()
        self.first_device = "cpu" if "cpu" in device_ids else "cuda:" + str(min(device_ids))
        self.last_device = "cuda:" + str(max(device_ids))

        # Move every block to the device it was assigned to.
        for device_id, block_indices in self.device_map.items():
            for block_index in block_indices:
                target_device = "cuda:" + str(device_id)
                self.block[block_index] = self.block[block_index].to(target_device)

        self.embed_tokens = self.embed_tokens.to(self.first_device)
        self.final_layer_norm = self.final_layer_norm.to(self.last_device)

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def deparallelize(self):
        # Undo parallelize(): clear the parallel state, bring every module back to the CPU and release
        # cached CUDA memory. (Kept as comments: the decorator composes this method's docstring.)
        cpu = "cpu"
        self.model_parallel = False
        self.device_map = None
        self.first_device = cpu
        self.last_device = cpu
        for block_index in range(len(self.block)):
            self.block[block_index] = self.block[block_index].to(cpu)
        self.embed_tokens = self.embed_tokens.to(cpu)
        self.final_layer_norm = self.final_layer_norm.to(cpu)
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        """Return the token embedding module currently stored in ``embed_tokens``."""
        token_embeddings = self.embed_tokens
        return token_embeddings

    def set_input_embeddings(self, new_embeddings):
        """Store ``new_embeddings`` as the token embedding module (``embed_tokens``)."""
        self.embed_tokens = new_embeddings

    def embed_input(self,
                    input_ids=None,
                    attention_mask=None,
                    decoder_input_ids=None,
                    decoder_attention_mask=None,
                    head_mask=None,
                    decoder_head_mask=None,
                    cross_attn_head_mask=None,
                    encoder_outputs=None,
                    past_key_values=None,
                    inputs_embeds=None,
                    decoder_inputs_embeds=None,
                    labels=None,
                    use_cache=None,
                    output_attentions=None,
                    output_hidden_states=None,
                    return_dict=None,
                    task=None,
                    task_ids=None,
    ):
        """
        Max-pool the input embeddings and the candidate prompt embeddings without running the stack.

        Only ``input_ids``, ``inputs_embeds`` and ``task_ids`` are used; every other argument is accepted
        and ignored.

        Args:
            input_ids: token ids; flattened to 2-D and embedded with ``self.embed_tokens`` when
                ``inputs_embeds`` is not supplied.
            inputs_embeds: precomputed embeddings; assumed to be (batch, seq, dim) -- TODO confirm with
                callers. Mutually exclusive with ``input_ids``.
            task_ids: optional index tensor selecting target prompts from ``self.prefix_emb`` along
                dim 0; when ``None``, ``self.prefix_emb[0]`` is used for every example.

        Returns:
            Tuple ``(avg_mul_prefix_emb, avg_inputs_embeds)``. Despite the ``avg_`` names kept from the
            original code, both are element-wise maxima: the first reduces dim 2 of the stacked prompts
            (the original shape notes give e.g. [32, 7, 100, 768] -> [32, 7, 768]), the second reduces
            dim 1 of the input embeddings.

        Raises:
            ValueError: if both or neither of ``input_ids`` and ``inputs_embeds`` are given.
        """
        # Model parallel
        if self.model_parallel:
            torch.cuda.set_device(self.first_device)
            self.embed_tokens = self.embed_tokens.to(self.first_device)

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}inputs and {err_msg_prefix}inputs_embeds at the same time"
            )
        if input_ids is None and inputs_embeds is None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds")

        if inputs_embeds is None:
            input_ids = input_ids.view(-1, input_ids.size()[-1])
            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
            inputs_embeds = self.embed_tokens(input_ids)

        # The pooling runs for both input forms. It used to sit inside the `inputs_embeds is None`
        # branch, so passing `inputs_embeds` left the return values unbound (UnboundLocalError).
        batch_size = inputs_embeds.shape[0]
        avg_inputs_embeds, _ = torch.max(inputs_embeds, 1)

        if self.ignore_target is False:
            # Append the target prompt as one more candidate after the source prompts.
            if task_ids is not None:
                target_prompts = torch.index_select(self.prefix_emb, 0, task_ids)
            else:
                # assumes prefix_emb is indexed by task along dim 0 -- TODO confirm
                target_prompts = self.prefix_emb[0].repeat(batch_size, 1, 1)
            mul_prefix_emb_added = torch.cat(
                (self.mul_prefix_emb.repeat(batch_size, 1, 1, 1), target_prompts.unsqueeze(1)),
                dim=1)
        else:
            mul_prefix_emb_added = self.mul_prefix_emb.repeat(batch_size, 1, 1, 1)

        avg_mul_prefix_emb, _ = torch.max(mul_prefix_emb_added, 2)

        return avg_mul_prefix_emb, avg_inputs_embeds

    def forward(
            self,
            input_ids=None,
            attention_mask=None,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            inputs_embeds=None,
            head_mask=None,
            cross_attn_head_mask=None,
            past_key_values=None,
            use_cache=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            task_embedding=None,
            task_ids=None,
            task=None,
            vote_idx=5,
    ):

        if self.model_parallel:
            torch.cuda.set_device(self.first_device)
            self.embed_tokens = self.embed_tokens.to(self.first_device)
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}inputs and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds")

        # print('inputs_embeds--->', inputs_embeds)
        if inputs_embeds is None:
            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
            inputs_embeds = self.embed_tokens(input_ids)
            # if inputs_embeds.size(1) > 50:
            #     print('inputs_embeds--->', inputs_embeds[0][70][0])
            ######################################
            # todo: use append_attn_prefix
            if self.append_prefix and self.append_attn_prefix is False:
                print('mul_prefix_emb', self.mul_prefix_emb.size(), inputs_embeds.size())
                # self.mul_prefix_emb[vote_idx].unsqueeze(0).repeat(inputs_embeds.shape[0], 1, 1)
                inputs_embeds = torch.cat(
                    [self.mul_prefix_emb[vote_idx].unsqueeze(0).repeat(inputs_embeds.size(0),1,1), inputs_embeds],
                    dim=1)  # bsz, seqlen, dim
                input_shape = inputs_embeds.size()[:-1]
                # print('inputs_embeds', inputs_embeds.size())

            # print('append_attn_prefix', self.append_attn_prefix, self.attn_method)
            if self.append_attn_prefix:
                avg_inputs_embeds, _ = torch.max(inputs_embeds, 1)

                if self.ignore_target is False:
                    if task_ids is not None:
                        target_prompts = torch.index_select(self.prefix_emb, 0, task_ids)
                    else:
                        target_prompts = self.prefix_emb[0].repeat(inputs_embeds.shape[0], 1, 1)
                    '''
                    self.mul_prefix_emb.repeat(inputs_embeds.shape[0], 1, 1, 1) torch.Size([32, 6, 100, 768])
                    target_prompts.unsqueeze(1) torch.Size([32, 1, 100, 768])
                    mul_prefix_emb_added ---> torch.Size([32, 7, 100, 768])
                    # '''
                    # print('self.mul_prefix_emb.repeat(inputs_embeds.shape[0], 1, 1, 1)', self.mul_prefix_emb.repeat(inputs_embeds.shape[0], 1, 1, 1).size())
                    # print('target_prompts.unsqueeze(1)', target_prompts.unsqueeze(1).size())
                    mul_prefix_emb_added = torch.cat(
                        (self.mul_prefix_emb.repeat(inputs_embeds.shape[0], 1, 1, 1), target_prompts.unsqueeze(1)),
                        dim=1)
                    # print('mul_prefix_emb_added --->', mul_prefix_emb_added.size())

                    avg_mul_prefix_emb, _ = torch.max(mul_prefix_emb_added, 2)
                    # print('avg_mul_prefix_emb', avg_mul_prefix_emb.size())
                else:
                    mul_prefix_emb_added = self.mul_prefix_emb.repeat(inputs_embeds.shape[0], 1, 1, 1)  # torch.Size([32, 6, 100, 768])
                    avg_mul_prefix_emb, _ = torch.max(mul_prefix_emb_added, 2)  # torch.Size([32, 6, 768])


                if self.append_attn_prefix:
                    # 1. dot product
                    if self.attn_method == "dot":
                        avg_inputs_embeds = avg_inputs_embeds.unsqueeze(-1)
                        attn_scores = avg_mul_prefix_emb.bmm(avg_inputs_embeds).squeeze(-1)

                    # 2. Linear (Luong et al., 2015; https://arxiv.org/abs/1508.04025) based attention implementation.
                    elif self.attn_method == "linear":
                        x = self.attn_Wa(avg_inputs_embeds)
                        x = self.layer_norm(x)
                        x = x.unsqueeze(-1)
                        attn_scores = avg_mul_prefix_emb.bmm(x).squeeze(-1) / self.temperature

                    # 3. Down and up projection-based implementation.
                    elif "sub" in self.attn_method:
                        '''
                        sub -> UW baseline; sub_no_t -> no P_target.
                        %
                        % shape info:
                        mul_prefix_emb_added ---> torch.Size([32, 7, 100, 768])
                        attn_scores torch.Size([32, 7])
                        soft_prompts mul_prefix_emb_added ---> torch.Size([32, 7, 100, 768])
                        normalized_attn_scores ---> torch.Size([32, 7])
                        soft_prompts ---> torch.Size([32, 100, 768])
                        avg_inputs_embeds ---> torch.Size([32, 768])
                        avg_mul_prefix_emb -> torch.Size([32, 6, 768])
                        '''
                        # print('avg_inputs_embeds>>>', avg_inputs_embeds.size())
                        x = self.attn_W_down(avg_inputs_embeds)
                        # print('x1', x.size())
                        x = self.attn_non_linear(x)
                        # print('x2', x.size())
                        x = self.attn_W_up(x)
                        # print('x3', x.size())
                        x = self.layer_norm(x)  #torch.Size([32, 768])
                        # print('x4', x.size())
                        x = x.unsqueeze(-1)  # torch.Size([32, 768, 1])
                        # print('x5', x.size())
                        # print('avg_mul_prefix_emb', avg_mul_prefix_emb.size())
                        # torch.Size([32, 6, 768]) bmm torch.Size([32, 768, 1]) -> torch.Size([32, 6])

                        attn_scores = avg_mul_prefix_emb.bmm(x).squeeze(-1) / self.temperature  # torch.Size([32, 6])
                        # print('attn_scores', attn_scores.size())
                    elif self.attn_method == "constant":
                        attn_scores = (torch.ones(mul_prefix_emb_added.size(0),
                                                  mul_prefix_emb_added.size(1)) / mul_prefix_emb_added.size(1)).cuda()
                    elif self.attn_method == "weight":
                        '''
                        attn_scores torch.Size([32, 7])
                        soft_prompts mul_prefix_emb_added ---> torch.Size([32, 7, 100, 768])
                        normalized_attn_scores ---> torch.Size([32, 7])
                        mul_prefix_emb_added ---> torch.Size([32, 7, 100, 768])
                        soft_prompts ---> torch.Size([32, 100, 768])
                        '''
                        # print('weight, mul_prefix_emb_added --->', self.attn_W.weight.size())
                        attn_scores = self.attn_W.weight.repeat(mul_prefix_emb_added.size(0), 1)  # 32 * 7
                        # print('attn_scores size inside', attn_scores.size()) # 7 * 1
                        attn_scores = self.layer_norm(attn_scores)

                    elif self.attn_method == "weight_more":
                        '''
                        attn_scores ---> torch.Size([32, 7, 100])
                        attn_scores torch.Size([32, 7, 100])
                        soft_prompts mul_prefix_emb_added ---> torch.Size([32, 7, 100, 768])
                        normalized_attn_scores ---> torch.Size([32, 7, 100])
                        '''
                        # print('self.attn_W.weight.size() -->', self.attn_W.weight.size(), mul_prefix_emb_added.size())
                        # print('self.attn_W.weight -->', self.attn_W.weight)
                        # print('new_size', self.attn_W.weight.unsqueeze(0).repeat(mul_prefix_emb_added.size(0),1, 1).size())
                        # print(self.attn_W.weight.unsqueeze(0).repeat(mul_prefix_emb_added.size(0), 1, 1))
                        attn_scores = self.attn_W.weight.unsqueeze(0).repeat(mul_prefix_emb_added.size(0), 1,
                                                                             1)  # 100 * 7 -> 32 * 100 * 7
                        # print('attn_scores --->', attn_scores.size())
                        # print('attn_scores000->', attn_scores[0])
                        # print('attn_scores111->', attn_scores[1])
                        attn_scores = self.layer_norm(attn_scores)  #
                    elif "vote" in self.attn_method:
                        attn_scores = (torch.ones(mul_prefix_emb_added.size(0),
                                                  mul_prefix_emb_added.size(1))).cuda()
                    else:
                        raise NotImplementedError

                    normalized_attn_scores = F.softmax(attn_scores, -1)
                    # print('attn_scores', attn_scores.size())
                    # print('attn_scores', attn_scores)
                    # print('soft_prompts mul_prefix_emb_added --->', mul_prefix_emb_added.size())
                    # print('normalized_attn_scores --->', normalized_attn_scores.size())
                    # avg_mul_prefix_emb = avg_mul_prefix_emb + [1]
                    if "vote" in self.attn_method:  # soft_prompts [32 * 7, 100, 768]
                        soft_prompts = []
                        # print(' self.mul_prefix_emb',  self.mul_prefix_emb.size())

                        for b in range(mul_prefix_emb_added.size(0)):
                            # print('mul_prefix_emb_added', mul_prefix_emb_added.size())
                            soft_prompts.append(self.mul_prefix_emb[vote_idx])
                            # print('self.mul_prefix_emb->', vote_idx, self.mul_prefix_emb[vote_idx][0, :5])
                            # print('mul_prefix_emb_added[b][vote_idx]', mul_prefix_emb_added[b][vote_idx])
                        soft_prompts = torch.stack(soft_prompts)
                        # print('soft_prompts', soft_prompts.size())

                    elif self.attn_method != "weight_more":

                        # print('attn_scores', attn_scores.size())
                        # print('soft_prompts mul_prefix_emb_added --->', mul_prefix_emb_added.size())
                        # print('normalized_attn_scores --->', normalized_attn_scores.size())

                        #                     # attn_scores torch.Size([32, 7])
                        # #                     soft_prompts mul_prefix_emb_added ---> torch.Size([32, 7, 100, 768])
                        # #                     normalized_attn_scores ---> torch.Size([32, 7])
                        # #                     mul_prefix_emb_added ---> torch.Size([32, 7, 100, 768])
                        # #                     soft_prompts ---> torch.Size([32, 100, 768])
                        #                     # b -> 32 p ->7 l ->100 d -> 768
                        soft_prompts = torch.einsum('bp, bpld -> bld', normalized_attn_scores, mul_prefix_emb_added)
                    else:
                        soft_prompts = torch.einsum('bpl, bpld -> bld', normalized_attn_scores, mul_prefix_emb_added)

                    #
                    # print('soft_prompts --->00->', soft_prompts[0])
                    # print('soft_prompts --->11->', soft_prompts[1])
                    # print('soft_prompts --->-11->', soft_prompts[-1])
                    # soft_prompts = soft_prompts + [2] # raise bug
                    if self.ignore_target is False:
                        if self.shared_attn is True and task_ids is not None:
                            soft_prompts = soft_prompts + torch.index_select(self.prefix_emb, 0, task_ids)
                        elif self.shared_attn is True and task_ids is None:
                            soft_prompts = soft_prompts + self.prefix_emb[0].unsqueeze(0).repeat(inputs_embeds.shape[0],
                                                                                                 1, 1)
                        else:
                            soft_prompts = soft_prompts + self.prefix_emb.unsqueeze(0).repeat(inputs_embeds.shape[0], 1,1)
                    # print('self.attn_method', self.attn_method)
                    # print('soft_prompts>>>', soft_prompts.size())
                    # print('inputs_embeds', inputs_embeds.size())
                    if "vote" in self.attn_method:
                        # print('inputs_embeds', inputs_embeds.size())
                        # print('soft_prompts', soft_prompts.size())
                        # print(torch.repeat_interleave(inputs_embeds, 6, dim=0))
                        # print(soft_prompts.size(), inputs_embeds.size())
                        inputs_embeds = torch.cat([soft_prompts, inputs_embeds], dim=1)
                        # inputs_embeds = torch.cat([soft_prompts, torch.repeat_interleave(inputs_embeds, 6, dim=0)], dim=1)  # bsz, seqlen, dim
                    else:
                        inputs_embeds = torch.cat([soft_prompts, inputs_embeds], dim=1)  # bsz, seqlen, dim
                    # print('inputs_embeds after>>', inputs_embeds.size())
                    input_shape = inputs_embeds.size()[:-1]
            ######################################

        batch_size, seq_length = input_shape
        # required mask seq length can be calculated via length of past
        mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length

        if use_cache is True:
            assert self.is_decoder, f":obj:`use_cache` can only be set to `True` if {self} is used as a decoder"

        if attention_mask is None:
            if self.attn_method == "vote":
                attention_mask = torch.ones(batch_size, mask_seq_length).to(inputs_embeds.device)
            else:
                attention_mask = torch.ones(batch_size, mask_seq_length).to(inputs_embeds.device)
            # print('attention_mask', attention_mask.size())
        if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
            encoder_seq_length = encoder_hidden_states.shape[1]
            encoder_attention_mask = torch.ones(
                batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long
            )

        # initialize past_key_values with `None` if past does not exist
        if past_key_values is None:
            past_key_values = [None] * len(self.block)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, inputs_embeds.device)

        if self.is_decoder and encoder_attention_mask is not None:
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
        present_key_value_states = () if use_cache else None
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and self.is_decoder) else None
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]
            # Model parallel
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
                # Ensure that attention_mask is always on the same device as hidden_states
                if attention_mask is not None:
                    attention_mask = attention_mask.to(hidden_states.device)
                if position_bias is not None:
                    position_bias = position_bias.to(hidden_states.device)
                if encoder_hidden_states is not None:
                    encoder_hidden_states = encoder_hidden_states.to(hidden_states.device)
                if encoder_extended_attention_mask is not None:
                    encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device)
                if encoder_decoder_position_bias is not None:
                    encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device)
                if layer_head_mask is not None:
                    layer_head_mask = layer_head_mask.to(hidden_states.device)
                if cross_attn_layer_head_mask is not None:
                    cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if getattr(self.config, "gradient_checkpointing", False) and self.training:
                if use_cache:
                    logger.warn(
                        "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
                        "`use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return tuple(module(*inputs, use_cache, output_attentions))

                    return custom_forward

                layer_outputs = checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    extended_attention_mask,
                    position_bias,
                    encoder_hidden_states,
                    encoder_extended_attention_mask,
                    encoder_decoder_position_bias,
                    layer_head_mask,
                    cross_attn_layer_head_mask,
                    None,  # past_key_value is always None with gradient checkpointing
                )
            else:
                # print('extended_attention_mask', extended_attention_mask.size())

                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask=extended_attention_mask,
                    position_bias=position_bias,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_extended_attention_mask,
                    encoder_decoder_position_bias=encoder_decoder_position_bias,
                    layer_head_mask=layer_head_mask,
                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                    past_key_value=past_key_value,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    task=task
                )

            # layer_outputs is a tuple with:
            # hidden-states, key-value-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
            if use_cache is False:
                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]
            hidden_states, present_key_value_state = layer_outputs[:2]

            # We share the position biases between the layers - the first layer store them
            # layer_outputs = hidden-states, key-value-states (self-attention weights),
            # (self-attention position bias), (cross-attention weights), (cross-attention position bias)
            position_bias = layer_outputs[2]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
            # append next layer key value states
            if use_cache:
                present_key_value_states = present_key_value_states + (present_key_value_state,)

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[3],)
                if self.is_decoder:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)

            # Model Parallel: If it's the last layer for that device, put things on the next device
            if self.model_parallel:
                for k, v in self.device_map.items():
                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
                        hidden_states = hidden_states.to("cuda:" + str(k + 1))

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    present_key_value_states,
                    all_hidden_states,
                    all_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=present_key_value_states,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )


# Class-level documentation preamble shared by the T5 model classes in this file.
# It is passed to `add_start_docstrings` on the class definitions (e.g. `T5Model`), so the
# text below ends up in user-facing docs; it is kept as a raw string because it contains
# reST markup.
T5_START_DOCSTRING = r"""

    The T5 model was proposed in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
    <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
    Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a text-to-text
    denoising generative setting.

    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
    pruning heads etc.)

    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
    general usage and behavior.

    Parameters:
        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
            weights.
"""

# Argument documentation for the encoder-decoder `forward` signature (encoder inputs, decoder
# inputs, the three head masks, cached key/values and the output-control flags). It is attached
# to `T5Model.forward` through `add_start_docstrings_to_model_forward`.
T5_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using :class:`~transformers.T5Tokenizer`. See
            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
            detail.

            `What are input IDs? <../glossary.html#input-ids>`__

            To know more on how to prepare :obj:`input_ids` for pretraining take a look a `T5 Training
            <./t5.html#training>`__.
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            `What are attention masks? <../glossary.html#attention-mask>`__
        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`~transformers.T5Tokenizer`. See
            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
            details.

            `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__

            T5 uses the :obj:`pad_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
            :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
            :obj:`past_key_values`).

            To know more on how to prepare :obj:`decoder_input_ids` for pretraining take a look at `T5 Training
            <./t5.html#training>`__.
        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
            also be used by default.
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
            Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in ``[0,
            1]``:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        decoder_head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in ``[0,
            1]``:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
                Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
                ``[0, 1]``:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

        encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
            Tuple consists of (:obj:`last_hidden_state`, :obj:`optional`: `hidden_states`, :obj:`optional`:
            `attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)` is a
            sequence of hidden states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
            vectors than the model's internal embedding lookup matrix.
        decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
            Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
            representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds`
            have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert
            :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.

            If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds`
            takes the value of :obj:`inputs_embeds`.

        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).

        output_attentions (:obj:`bool`, `optional`):
            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
            tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`):
            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
            more detail.
        return_dict (:obj:`bool`, `optional`):
            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
"""

# Argument documentation covering only encoder-side inputs (no decoder arguments, no cache).
# NOTE(review): no use of this constant is visible in this part of the file; presumably it is
# attached to an encoder-only model's `forward` further down — confirm before editing or removing.
T5_ENCODER_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using :class:`~transformers.T5Tokenizer`. See
            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
            detail.

            To know more on how to prepare :obj:`input_ids` for pretraining take a look a `T5 Training
            <./t5.html#training>`__.
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            `What are attention masks? <../glossary.html#attention-mask>`__
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
            vectors than the model's internal embedding lookup matrix.
        output_attentions (:obj:`bool`, `optional`):
            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
            tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`):
            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
            more detail.
        return_dict (:obj:`bool`, `optional`):
            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
"""

# Text of the FutureWarning emitted when a caller passes `head_mask` but no `decoder_head_mask`:
# the single `head_mask` argument was split into `head_mask` and `decoder_head_mask`, and the
# old "one mask for both stacks" behavior is deprecated.
# NOTE(review): the leading double underscore makes this name subject to Python's private-name
# mangling wherever it is written inside a class body (a bare reference from a method of class
# `C` is compiled as `_C__HEAD_MASK_WARNING_MSG`), so methods cannot refer to it by its bare name.
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
"""


@add_start_docstrings(
    "The bare T5 Model transformer outputting raw hidden-states without any specific head on top.",
    T5_START_DOCSTRING,
)
class T5Model(T5PreTrainedModel):
    # Encoder-decoder T5 without a task head: an encoder `T5Stack` and a decoder `T5Stack`
    # built from copies of the same config and sharing one token-embedding table (`self.shared`).
    # (Comments are used instead of a class docstring so the text generated by
    # `add_start_docstrings` is not altered.)

    # The per-stack embedding weights are the shared table, so they may be absent from a checkpoint.
    _keys_to_ignore_on_load_missing = [
        r"encoder\.embed_tokens\.weight",
        r"decoder\.embed_tokens\.weight",
    ]
    _keys_to_ignore_on_load_unexpected = [
        r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight",
    ]

    def __init__(self, config: T5Config):
        """Build the shared embedding plus the encoder and decoder stacks from ``config``.

        Args:
            config: Model configuration; it is deep-copied once per stack so the
                encoder/decoder specific flags set below do not leak into the caller's object.
        """
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        # The decoder depth may differ from the encoder depth.
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = T5Stack(decoder_config, self.shared)

        self.init_weights()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        # When no map is given, one is derived from the number of encoder blocks and the
        # visible CUDA devices; the same map is then applied to both stacks.
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.decoder.parallelize(self.device_map)
        self.model_parallel = True

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        # Undo `parallelize`: move both stacks back to CPU and clear the parallel state.
        self.encoder.deparallelize()
        self.decoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        self.decoder = self.decoder.to("cpu")
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        """Return the token-embedding module shared by the encoder and the decoder."""
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        """Replace the shared token embedding and propagate it to both stacks."""
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        """Return the encoder stack."""
        return self.encoder

    def get_decoder(self):
        """Return the decoder stack."""
        return self.decoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # NOTE(review): the stack's forward pass iterates `self.block`, not `self.layer`;
        # confirm the encoder really exposes `.layer[i].attention`, otherwise this raises
        # AttributeError as soon as head pruning is requested.
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        encoder_outputs=None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Returns:

        Example::

            >>> from transformers import T5Tokenizer, T5Model

            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
            >>> model = T5Model.from_pretrained('t5-small')

            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
            >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
            >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

            >>> last_hidden_states = outputs.last_hidden_state
        """
        # Fall back to the config defaults for flags the caller left unset.
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                # The message constant has a double-underscore name. Writing it bare inside a
                # class body is name-mangled to `_T5Model__HEAD_MASK_WARNING_MSG` and raises
                # NameError, so it is looked up in the module namespace by its literal name.
                warnings.warn(globals()["__HEAD_MASK_WARNING_MSG"], FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            # Caller supplied precomputed encoder outputs as a plain tuple: wrap them so the
            # attribute access used when building the return value works.
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Set device for model parallelism: everything the decoder consumes must live on
        # the decoder's first device.
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            # Tuple mode: decoder outputs first, followed by the encoder outputs.
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING)
class T5ForConditionalGeneration_vote(T5PreTrainedModel):
    # Regex patterns matching the encoder/decoder token-embedding weights and the LM-head weight.
    # NOTE(review): going by the attribute name, the base class tolerates these keys being
    # absent from a loaded checkpoint — confirm against the PreTrainedModel loader in use.
    _keys_to_ignore_on_load_missing = [
        r"encoder\.embed_tokens\.weight",
        r"decoder\.embed_tokens\.weight",
        r"lm_head\.weight",
    ]
    # Regex pattern matching the relative-attention-bias weight of the first decoder block's
    # cross-attention layer.
    # NOTE(review): going by the attribute name, a checkpoint may contain this key without the
    # model using it — confirm against the PreTrainedModel loader in use.
    _keys_to_ignore_on_load_unexpected = [
        r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight",
    ]

    def __init__(self, config, adapter_config=None):
        """Build the shared embedding, prefix parameters, encoder/decoder stacks and LM head.

        Args:
            config: Model configuration. Besides ``d_model``, ``vocab_size`` and
                ``num_decoder_layers`` this reads ``prefix_tuning``,
                ``attn_prefix_tuning``, ``attn_method``, ``ignore_target``,
                ``shared_attn``, ``temperature``, ``prefix_num`` and
                ``train_task_adapters``; ``num_target`` is read only when prefix
                tuning is on and ``shared_attn`` is ``True``.
            adapter_config: Optional adapter settings. It is dereferenced (and so
                must not be ``None``) whenever ``config.prefix_tuning`` or
                ``config.train_task_adapters`` is truthy.

        Raises:
            NotImplementedError: If a shared PHM rule is requested and
                ``adapter_config.phm_c_init`` is neither ``"normal"`` nor
                ``"uniform"``.
        """
        super().__init__(config)
        self.model_dim = config.d_model
        # Token embedding handed to both the encoder and the decoder stack.
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        # ---- Prefix-tuning settings ----
        self.prefix_tuning = config.prefix_tuning
        self.attn_prefix_tuning = config.attn_prefix_tuning
        self.attn_method = config.attn_method
        self.ignore_target = config.ignore_target
        if self.prefix_tuning:
            self.prefix_dim = adapter_config.prefix_dim
            self.init_prefix_from_vocab = adapter_config.init_prefix_from_vocab
        self.shared_attn = config.shared_attn
        self.temperature = config.temperature

        # Zero-initialised prefix shared by encoder and decoder. With
        # `shared_attn is True` it carries an extra leading `num_target` axis.
        if self.prefix_tuning:
            if self.shared_attn is True:
                prefix_shape = (config.num_target, self.prefix_dim, config.d_model)
            else:
                prefix_shape = (self.prefix_dim, config.d_model)
            self.prefix_shared = nn.Parameter(torch.zeros(prefix_shape))
        else:
            self.prefix_shared = None
        self.prefix_num = config.prefix_num

        # `self.prefix_dim` only exists when prefix tuning is enabled, so the
        # stacked prefix embeddings must be guarded the same way as
        # `prefix_shared`; building them unconditionally raised AttributeError
        # for every configuration without prefix tuning.
        if self.prefix_tuning:
            self.mul_prefix_emb = nn.Parameter(
                torch.zeros((self.prefix_num, self.prefix_dim, config.d_model)))
        else:
            self.mul_prefix_emb = None

        # Keyword arguments common to the encoder and decoder stacks.
        stack_kwargs = dict(
            adapter_config=adapter_config,
            prefix_emb=self.prefix_shared,
            attn_prefix_tuning=self.attn_prefix_tuning,
            mul_prefix_emb=self.mul_prefix_emb,
            model_dim=config.d_model,
            attn_method=self.attn_method,
            shared_attn=self.shared_attn,
            ignore_target=self.ignore_target,
            temperature=self.temperature,
        )

        # ---- Encoder ----
        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config, self.shared, **stack_kwargs)

        # ---- Decoder ----
        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        if config.train_task_adapters:
            # Task adapters in the decoder are controlled by the adapter config.
            decoder_config.train_task_adapters = adapter_config.task_adapter_in_decoder
        self.decoder = T5Stack(decoder_config, self.shared, **stack_kwargs)

        # The LM head gets a bias term only when `bitfit` is enabled.
        self.bitfit = adapter_config.bitfit if adapter_config is not None else False
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=bool(self.bitfit))

        # ---- Shared hypercomplex (PHM) adapter parameters ----
        if config.train_task_adapters and adapter_config.hypercomplex_adapters:
            if adapter_config.shared_phm_rule:
                # One PHM rule, optionally factorised into left/right factors,
                # created here and distributed by `set_phm_rule()`.
                phm_dim = adapter_config.hypercomplex_division
                self.factorized_phm_rule = adapter_config.factorized_phm_rule
                if self.factorized_phm_rule:
                    self.phm_rule_left = nn.Parameter(
                        torch.FloatTensor(phm_dim, phm_dim, 1).to(adapter_config.device),
                        requires_grad=adapter_config.learn_phm)
                    self.phm_rule_right = nn.Parameter(
                        torch.FloatTensor(phm_dim, 1, phm_dim).to(adapter_config.device),
                        requires_grad=adapter_config.learn_phm)
                    rule_params = (self.phm_rule_left, self.phm_rule_right)
                else:
                    self.phm_rule = nn.Parameter(
                        torch.FloatTensor(phm_dim, phm_dim, phm_dim).to(adapter_config.device),
                        requires_grad=adapter_config.learn_phm)
                    rule_params = (self.phm_rule,)

                # The tensors above are allocated uninitialised; fill them now.
                if adapter_config.phm_c_init == "normal":
                    for rule_param in rule_params:
                        rule_param.data.normal_(mean=0, std=adapter_config.phm_init_range)
                elif adapter_config.phm_c_init == "uniform":
                    for rule_param in rule_params:
                        rule_param.data.uniform_(-1, 1)
                else:
                    raise NotImplementedError
                self.set_phm_rule()

            if adapter_config.shared_W_phm:
                # Shared down/up projection weights, optionally low-rank
                # factorised, distributed by `set_phm_Ws()`.
                self.w_init = adapter_config.hypercomplex_nonlinearity
                self.phm_dim = adapter_config.hypercomplex_division
                down_sample_size = adapter_config.input_dim // adapter_config.reduction_factor
                in_feats_per_axis = adapter_config.input_dim // self.phm_dim
                out_feats_per_axis = down_sample_size // self.phm_dim
                self.factorized_phm = adapter_config.factorized_phm
                if self.factorized_phm:
                    self.phm_rank = adapter_config.phm_rank
                    self.W_down_left = nn.Parameter(
                        torch.Tensor(size=(self.phm_dim, in_feats_per_axis, self.phm_rank)),
                        requires_grad=True)
                    self.W_down_right = nn.Parameter(
                        torch.Tensor(size=(self.phm_dim, self.phm_rank, out_feats_per_axis)),
                        requires_grad=True)
                    self.W_up_left = nn.Parameter(
                        torch.Tensor(size=(self.phm_dim, out_feats_per_axis, self.phm_rank)),
                        requires_grad=True)
                    self.W_up_right = nn.Parameter(
                        torch.Tensor(size=(self.phm_dim, self.phm_rank, in_feats_per_axis)),
                        requires_grad=True)
                    self.init_W(in_feats_per_axis, out_feats_per_axis, W_left=self.W_down_left,
                                W_right=self.W_down_right)
                    self.init_W(out_feats_per_axis, in_feats_per_axis, W_left=self.W_up_left,
                                W_right=self.W_up_right)
                else:
                    self.W_down = nn.Parameter(
                        torch.Tensor(size=(self.phm_dim, in_feats_per_axis, out_feats_per_axis)),
                        requires_grad=True)
                    self.W_up = nn.Parameter(
                        torch.Tensor(size=(self.phm_dim, out_feats_per_axis, in_feats_per_axis)),
                        requires_grad=True)
                    self.init_W(in_feats_per_axis, out_feats_per_axis, W=self.W_down)
                    self.init_W(out_feats_per_axis, in_feats_per_axis, W=self.W_up)
                self.set_phm_Ws()

        self.init_weights()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    ###############################################
    # def _prepare_decoder_input_ids_for_generation(
    #         self,
    #         batch_size: int,
    #         decoder_start_token_id: int = None,
    #         bos_token_id: int = None,
    #         model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    #         device: torch.device = None,
    #         vote_idx=None,
    # ) -> torch.LongTensor:
    #
    #     if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
    #         return model_kwargs.pop("decoder_input_ids")
    #     else:
    #         decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
    #         if device is None:
    #             device = self.device
    #         return torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id

    # def generate(
    #     self,
    #     inputs: Optional[torch.Tensor] = None,
    #     max_length: Optional[int] = None,
    #     min_length: Optional[int] = None,
    #     do_sample: Optional[bool] = None,
    #     early_stopping: Optional[bool] = None,
    #     num_beams: Optional[int] = None,
    #     temperature: Optional[float] = None,
    #     top_k: Optional[int] = None,
    #     top_p: Optional[float] = None,
    #     typical_p: Optional[float] = None,
    #     repetition_penalty: Optional[float] = None,
    #     bad_words_ids: Optional[Iterable[int]] = None,
    #     force_words_ids: Optional[Union[Iterable[int], Iterable[Iterable[int]]]] = None,
    #     bos_token_id: Optional[int] = None,
    #     pad_token_id: Optional[int] = None,
    #     eos_token_id: Optional[int] = None,
    #     length_penalty: Optional[float] = None,
    #     no_repeat_ngram_size: Optional[int] = None,
    #     encoder_no_repeat_ngram_size: Optional[int] = None,
    #     num_return_sequences: Optional[int] = None,
    #     max_time: Optional[float] = None,
    #     max_new_tokens: Optional[int] = None,
    #     decoder_start_token_id: Optional[int] = None,
    #     use_cache: Optional[bool] = None,
    #     num_beam_groups: Optional[int] = None,
    #     diversity_penalty: Optional[float] = None,
    #     prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
    #     logits_processor: Optional[LogitsProcessorList] = LogitsProcessorList(),
    #     renormalize_logits: Optional[bool] = None,
    #     stopping_criteria: Optional[StoppingCriteriaList] = StoppingCriteriaList(),
    #     constraints: Optional[List[Constraint]] = None,
    #     output_attentions: Optional[bool] = None,
    #     output_hidden_states: Optional[bool] = None,
    #     output_scores: Optional[bool] = None,
    #     return_dict_in_generate: Optional[bool] = None,
    #     forced_bos_token_id: Optional[int] = None,
    #     forced_eos_token_id: Optional[int] = None,
    #     remove_invalid_values: Optional[bool] = None,
    #     synced_gpus: Optional[bool] = False,
    #     exponential_decay_length_penalty: Optional[Tuple[Union[int, float]]] = None,
    #     **model_kwargs,
    #     weights=None,
    # ) -> Union[GreedySearchOutput, SampleOutput, BeamSearchOutput, BeamSampleOutput, torch.LongTensor]:
    #     r"""
    #
    #     Generates sequences of token ids for models with a language modeling head. The method supports the following
    #     generation methods for text-decoder, text-to-text, speech-to-text, and vision-to-text models:
    #
    #         - *greedy decoding* by calling [`~generation_utils.GenerationMixin.greedy_search`] if `num_beams=1` and
    #           `do_sample=False`.
    #         - *multinomial sampling* by calling [`~generation_utils.GenerationMixin.sample`] if `num_beams=1` and
    #           `do_sample=True`.
    #         - *beam-search decoding* by calling [`~generation_utils.GenerationMixin.beam_search`] if `num_beams>1` and
    #           `do_sample=False`.
    #         - *beam-search multinomial sampling* by calling [`~generation_utils.GenerationMixin.beam_sample`] if
    #           `num_beams>1` and `do_sample=True`.
    #         - *diverse beam-search decoding* by calling [`~generation_utils.GenerationMixin.group_beam_search`], if
    #           `num_beams>1` and `num_beam_groups>1`.
    #         - *constrained beam-search decoding* by calling
    #           [`~generation_utils.GenerationMixin.constrained_beam_search`], if `constraints!=None` or
    #           `force_words_ids!=None`.
    #
    #     <Tip warning={true}>
    #
    #     Apart from `inputs`, all the arguments below will default to the value of the attribute of the same name as
    #     defined in the model's config (`config.json`) which in turn defaults to the
    #     [`~modeling_utils.PretrainedConfig`] of the model.
    #
    #     </Tip>
    #
    #     Most of these parameters are explained in more detail in [this blog
    #     post](https://huggingface.co/blog/how-to-generate).
    #
    #     Parameters:
    #         inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
    #             The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
    #             method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
    #             should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
    #             `input_ids`, `input_values`, `input_features`, or `pixel_values`.
    #         max_length (`int`, *optional*, defaults to `model.config.max_length`):
    #             The maximum length of the sequence to be generated.
    #         max_new_tokens (`int`, *optional*, defaults to None):
    #             The maximum numbers of tokens to generate, ignore the current number of tokens. Use either
    #             `max_new_tokens` or `max_length` but not both, they serve the same purpose.
    #         min_length (`int`, *optional*, defaults to 10):
    #             The minimum length of the sequence to be generated.
    #         do_sample (`bool`, *optional*, defaults to `False`):
    #             Whether or not to use sampling ; use greedy decoding otherwise.
    #         early_stopping (`bool`, *optional*, defaults to `False`):
    #             Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
    #         num_beams (`int`, *optional*, defaults to 1):
    #             Number of beams for beam search. 1 means no beam search.
    #         temperature (`float`, *optional*, defaults to 1.0):
    #             The value used to modulate the next token probabilities.
    #         top_k (`int`, *optional*, defaults to 50):
    #             The number of highest probability vocabulary tokens to keep for top-k-filtering.
    #         top_p (`float`, *optional*, defaults to 1.0):
    #             If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or higher
    #             are kept for generation.
    #         repetition_penalty (`float`, *optional*, defaults to 1.0):
    #             The parameter for repetition penalty. 1.0 means no penalty. See [this
    #             paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
    #         pad_token_id (`int`, *optional*):
    #             The id of the *padding* token.
    #         bos_token_id (`int`, *optional*):
    #             The id of the *beginning-of-sequence* token.
    #         eos_token_id (`int`, *optional*):
    #             The id of the *end-of-sequence* token.
    #         length_penalty (`float`, *optional*, defaults to 1.0):
    #              Exponential penalty to the length. 1.0 means that the beam score is penalized by the sequence length.
    #              0.0 means no penalty. Set to values < 0.0 in order to encourage the model to generate longer
    #              sequences, to a value > 0.0 in order to encourage the model to produce shorter sequences.
    #         no_repeat_ngram_size (`int`, *optional*, defaults to 0):
    #             If set to int > 0, all ngrams of that size can only occur once.
    #         encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0):
    #             If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
    #             `decoder_input_ids`.
    #         bad_words_ids(`List[List[int]]`, *optional*):
    #             List of token ids that are not allowed to be generated. In order to get the token ids of the words that
    #             should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True,
    #             add_special_tokens=False).input_ids`.
    #         force_words_ids(`List[List[int]]` or `List[List[List[int]]]`, *optional*):
    #             List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple
    #             list of words that must be included, the opposite to `bad_words_ids`. If given `List[List[List[int]]]`,
    #             this triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081),
    #             where one can allow different forms of each word.
    #         num_return_sequences(`int`, *optional*, defaults to 1):
    #             The number of independently computed returned sequences for each element in the batch.
    #         max_time(`float`, *optional*, defaults to None):
    #             The maximum amount of time you allow the computation to run for in seconds. generation will still
    #             finish the current pass after allocated time has been passed.
    #         attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    #             Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, 1 for tokens
    #             that are not masked, and 0 for masked tokens. If not provided, will default to a tensor the same shape
    #             as `input_ids` that masks the pad token. [What are attention masks?](../glossary#attention-mask)
    #         decoder_start_token_id (`int`, *optional*):
    #             If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
    #         use_cache: (`bool`, *optional*, defaults to `True`):
    #             Whether or not the model should use the past last key/values attentions (if applicable to the model) to
    #             speed up decoding.
    #         num_beam_groups (`int`, *optional*, defaults to 1):
    #             Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
    #             beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
    #         diversity_penalty (`float`, *optional*, defaults to 0.0):
    #             This value is subtracted from a beam's score if it generates a token same as any beam from other group
    #             at a particular time. Note that `diversity_penalty` is only effective if `group beam search` is
    #             enabled.
    #         prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
    #             If provided, this function constraints the beam search to allowed tokens only at each step. If not
    #             provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
    #             `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
    #             on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
    #             for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
    #             Retrieval](https://arxiv.org/abs/2010.00904).
    #         logits_processor (`LogitsProcessorList`, *optional*):
    #              Custom logits processors that complement the default logits processors built from arguments and a
    #              model's config. If a logit processor is passed that is already created with the arguments or a model's
    #              config an error is thrown. This feature is intended for advanced users.
    #         renormalize_logits: (`bool`, *optional*, defaults to `False`):
    #             Whether to renormalize the logits after applying all the logits processors or warpers (including the
    #             custom ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the
    #             score logits are normalized but some logit processors or warpers break the normalization.
    #         stopping_criteria (`StoppingCriteriaList`, *optional*):
    #              Custom stopping criteria that complement the default stopping criteria built from arguments and a
    #              model's config. If a stopping criteria is passed that is already created with the arguments or a
    #              model's config an error is thrown. This feature is intended for advanced users.
    #         constraints (`List[Constraint]`, *optional*):
    #              Custom constraints that can be added to the generation to ensure that the output will contain the use
    #              of certain tokens as defined by `Constraint` objects, in the most sensible way possible.
    #         output_attentions (`bool`, *optional*, defaults to `False`):
    #             Whether or not to return the attentions tensors of all attention layers. See `attentions` under
    #             returned tensors for more details.
    #         output_hidden_states (`bool`, *optional*, defaults to `False`):
    #             Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
    #             for more details.
    #         output_scores (`bool`, *optional*, defaults to `False`):
    #             Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
    #         return_dict_in_generate (`bool`, *optional*, defaults to `False`):
    #             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
    #         forced_bos_token_id (`int`, *optional*):
    #             The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful
    #             for multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be
    #             the target language token.
    #         forced_eos_token_id (`int`, *optional*):
    #             The id of the token to force as the last generated token when `max_length` is reached.
    #         remove_invalid_values (`bool`, *optional*):
    #             Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to
    #             crash. Note that using `remove_invalid_values` can slow down generation.
    #         synced_gpus (`bool`, *optional*, defaults to `False`):
    #             Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
    #         exponential_decay_length_penalty (`tuple(int, float)`, *optional*):
    #             This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been
    #             generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates
    #             where penalty starts and `decay_factor` represents the factor of exponential decay
    #
    #         model_kwargs:
    #             Additional model specific kwargs will be forwarded to the `forward` function of the model. If the model
    #             is an encoder-decoder model, encoder specific kwargs should not be prefixed and decoder specific kwargs
    #             should be prefixed with *decoder_*.
    #
    #     Return:
    #         [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
    #         or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.
    #
    #             If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
    #             [`~utils.ModelOutput`] types are:
    #
    #                 - [`~generation_utils.GreedySearchDecoderOnlyOutput`],
    #                 - [`~generation_utils.SampleDecoderOnlyOutput`],
    #                 - [`~generation_utils.BeamSearchDecoderOnlyOutput`],
    #                 - [`~generation_utils.BeamSampleDecoderOnlyOutput`]
    #
    #             If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
    #             [`~utils.ModelOutput`] types are:
    #
    #                 - [`~generation_utils.GreedySearchEncoderDecoderOutput`],
    #                 - [`~generation_utils.SampleEncoderDecoderOutput`],
    #                 - [`~generation_utils.BeamSearchEncoderDecoderOutput`],
    #                 - [`~generation_utils.BeamSampleEncoderDecoderOutput`]
    #
    #     Examples:
    #
    #     Greedy Decoding:
    #
    #     ```python
    #     >>> from transformers import AutoTokenizer, AutoModelForCausalLM
    #
    #     >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
    #     >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
    #
    #     >>> prompt = "Today I believe we can finally"
    #     >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    #
    #     >>> # generate up to 30 tokens
    #     >>> outputs = model.generate(input_ids, do_sample=False, max_length=30)
    #     >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
    #     ['Today I believe we can finally get to the point where we can make a difference in the lives of the people of the United States of America.\n']
    #     ```
    #
    #     Multinomial Sampling:
    #
    #     ```python
    #     >>> from transformers import AutoTokenizer, AutoModelForCausalLM
    #     >>> import torch
    #
    #     >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
    #     >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
    #
    #     >>> prompt = "Today I believe we can finally"
    #     >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    #
    #     >>> # sample up to 30 tokens
    #     >>> torch.manual_seed(0)  # doctest: +IGNORE_RESULT
    #     >>> outputs = model.generate(input_ids, do_sample=True, max_length=30)
    #     >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
    #     ['Today I believe we can finally get rid of discrimination," said Rep. Mark Pocan (D-Wis.).\n\n"Just look at the']
    #     ```
    #
    #     Beam-search decoding:
    #
    #     ```python
    #     >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    #
    #     >>> tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    #     >>> model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    #
    #     >>> sentence = "Paris is one of the densest populated areas in Europe."
    #     >>> input_ids = tokenizer(sentence, return_tensors="pt").input_ids
    #
    #     >>> outputs = model.generate(input_ids)
    #     >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
    #     ['Paris ist eines der dichtesten besiedelten Gebiete Europas.']
    #     ```"""
    #     # 1. Set generation parameters if not already defined
    #     bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
    #     num_beams = num_beams if num_beams is not None else self.config.num_beams
    #     length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
    #     early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping
    #     num_beam_groups = num_beam_groups if num_beam_groups is not None else self.config.num_beam_groups
    #     do_sample = do_sample if do_sample is not None else self.config.do_sample
    #     num_return_sequences = (
    #         num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
    #     )
    #
    #     pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
    #     eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
    #
    #     if eos_token_id is None and hasattr(self.config, "decoder"):
    #         eos_token_id = self.config.decoder.eos_token_id
    #
    #     if pad_token_id is None and eos_token_id is not None:
    #         # special case if pad_token_id is not defined
    #         logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.")
    #         pad_token_id = eos_token_id
    #
    #     output_scores = output_scores if output_scores is not None else self.config.output_scores
    #     output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    #     output_hidden_states = (
    #         output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    #     )
    #     return_dict_in_generate = (
    #         return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
    #     )
    #
    #     # 2. Define model inputs
    #     # inputs_tensor has to be defined
    #     # model_input_name is defined if model-specific keyword input is passed
    #     # otherwise model_input_name is None
    #     # all model-specific keyword inputs are removed from `model_kwargs`
    #     inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(inputs, bos_token_id, model_kwargs)
    #     batch_size = inputs_tensor.shape[0]
    #
    #     # 3. Define other model kwargs
    #     model_kwargs["output_attentions"] = output_attentions
    #     model_kwargs["output_hidden_states"] = output_hidden_states
    #     model_kwargs["use_cache"] = use_cache
    #
    #     accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys())
    #     requires_attention_mask = "encoder_outputs" not in model_kwargs
    #
    #     if model_kwargs.get("attention_mask", None) is None and requires_attention_mask and accepts_attention_mask:
    #         model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(
    #             inputs_tensor, pad_token_id, eos_token_id
    #         )
    #
    #     model_kwargs_lst = []
    #     for idx in range(6):  # todo: change to # of teachers
    #         if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
    #             # if model is encoder decoder encoder_outputs are created
    #             # and added to `model_kwargs`
    #             model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
    #                 inputs_tensor, model_kwargs, model_input_name, vote_idx=idx
    #             )
    #             model_kwargs_lst.append(model_kwargs)
    #
    #     # 4. Prepare `input_ids` which will be used for auto-regressive generation
    #     input_ids_lst = []
    #     for idx in range(6):  # todo: change to # of teachers
    #         if self.config.is_encoder_decoder:
    #             input_ids = self._prepare_decoder_input_ids_for_generation(
    #                 batch_size,
    #                 decoder_start_token_id=decoder_start_token_id,
    #                 bos_token_id=bos_token_id,
    #                 model_kwargs=model_kwargs_lst[idx],
    #                 device=inputs_tensor.device,
    #             )
    #         else:
    #             # if decoder-only then inputs_tensor has to be `input_ids`
    #             input_ids = inputs_tensor
    #         input_ids_lst.append(input_ids)
    #
    #     input_ids_seq_length = input_ids.shape[-1]
    #
    #     # 5. Prepare `max_length` depending on other stopping criteria
    #     # if `max_new_tokens` is passed, but not `max_length` -> set `max_length = max_new_tokens`
    #     if max_length is None and max_new_tokens is not None:
    #         max_length = max_new_tokens + input_ids_seq_length
    #     elif max_length is not None and max_new_tokens is not None:
    #         # Both are set, this is odd, raise a warning
    #         warnings.warn(
    #             "Both `max_length` and `max_new_tokens` have been set "
    #             f"but they serve the same purpose. `max_length` {max_length} "
    #             f"will take priority over `max_new_tokens` {max_new_tokens}.",
    #             UserWarning,
    #         )
    #     # default to config if still None
    #     max_length = max_length if max_length is not None else self.config.max_length
    #     min_length = min_length if min_length is not None else self.config.min_length
    #
    #     if min_length is not None and min_length > max_length:
    #         raise ValueError(
    #             f"Unfeasable length constraints: the minimum length ({min_length}) is larger than the maximum "
    #             f"length ({max_length})"
    #         )
    #     if input_ids_seq_length >= max_length:
    #         input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
    #         logger.warning(
    #             f"Input length of {input_ids_string} is {input_ids_seq_length}, but ``max_length`` is set to {max_length}. "
    #             "This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``."
    #         )
    #
    #     # 6. determine generation mode
    #     is_constraint_gen_mode = constraints is not None or force_words_ids is not None
    #     is_greedy_gen_mode = (
    #         (num_beams == 1) and (num_beam_groups == 1) and do_sample is False and not is_constraint_gen_mode
    #     )
    #     is_sample_gen_mode = (
    #         (num_beams == 1) and (num_beam_groups == 1) and do_sample is True and not is_constraint_gen_mode
    #     )
    #     is_beam_gen_mode = (
    #         (num_beams > 1) and (num_beam_groups == 1) and do_sample is False and not is_constraint_gen_mode
    #     )
    #     is_beam_sample_gen_mode = (
    #         (num_beams > 1) and (num_beam_groups == 1) and do_sample is True and not is_constraint_gen_mode
    #     )
    #     is_group_beam_gen_mode = (num_beams > 1) and (num_beam_groups > 1) and not is_constraint_gen_mode
    #
    #     if num_beam_groups > num_beams:
    #         raise ValueError("`num_beam_groups` has to be smaller or equal to `num_beams`")
    #     if is_group_beam_gen_mode and do_sample is True:
    #         raise ValueError(
    #             "Diverse beam search cannot be used in sampling mode. Make sure that `do_sample` is set to `False`."
    #         )
    #
    #     # 7. prepare distribution pre_processing samplers
    #     logits_processor = self._get_logits_processor(
    #         repetition_penalty=repetition_penalty,
    #         no_repeat_ngram_size=no_repeat_ngram_size,
    #         encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size,
    #         input_ids_seq_length=input_ids_seq_length,
    #         encoder_input_ids=inputs_tensor,
    #         bad_words_ids=bad_words_ids,
    #         min_length=min_length,
    #         max_length=max_length,
    #         eos_token_id=eos_token_id,
    #         forced_bos_token_id=forced_bos_token_id,
    #         forced_eos_token_id=forced_eos_token_id,
    #         prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
    #         num_beams=num_beams,
    #         num_beam_groups=num_beam_groups,
    #         diversity_penalty=diversity_penalty,
    #         remove_invalid_values=remove_invalid_values,
    #         exponential_decay_length_penalty=exponential_decay_length_penalty,
    #         logits_processor=logits_processor,
    #         renormalize_logits=renormalize_logits,
    #     )
    #
    #     # 8. prepare stopping criteria
    #     stopping_criteria = self._get_stopping_criteria(
    #         max_length=max_length, max_time=max_time, stopping_criteria=stopping_criteria
    #     )
    #
    #     # 9. go into different generation modes
    #     if is_greedy_gen_mode:
    #         if num_return_sequences > 1:
    #             raise ValueError(
    #                 f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search."
    #             )
    #
    #         # 10. run greedy search
    #         return self.greedy_search(
    #             input_ids,
    #             logits_processor=logits_processor,
    #             stopping_criteria=stopping_criteria,
    #             pad_token_id=pad_token_id,
    #             eos_token_id=eos_token_id,
    #             output_scores=output_scores,
    #             return_dict_in_generate=return_dict_in_generate,
    #             synced_gpus=synced_gpus,
    #             **model_kwargs,
    #         )
    #
    #     elif is_sample_gen_mode:
    #         # 10. prepare logits warper
    #         logits_warper = self._get_logits_warper(
    #             top_k=top_k,
    #             top_p=top_p,
    #             typical_p=typical_p,
    #             temperature=temperature,
    #             num_beams=num_beams,
    #             renormalize_logits=renormalize_logits,
    #         )
    #
    #         # 11. expand input_ids with `num_return_sequences` additional sequences per batch
    #         input_ids, model_kwargs = self._expand_inputs_for_generation(
    #             input_ids,
    #             expand_size=num_return_sequences,
    #             is_encoder_decoder=self.config.is_encoder_decoder,
    #             **model_kwargs,
    #         )
    #
    #         # 12. run sample
    #         return self.sample(
    #             input_ids,
    #             logits_processor=logits_processor,
    #             logits_warper=logits_warper,
    #             stopping_criteria=stopping_criteria,
    #             pad_token_id=pad_token_id,
    #             eos_token_id=eos_token_id,
    #             output_scores=output_scores,
    #             return_dict_in_generate=return_dict_in_generate,
    #             synced_gpus=synced_gpus,
    #             **model_kwargs,
    #         )
    #
    #     elif is_beam_gen_mode:
    #         if num_return_sequences > num_beams:
    #             raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.")
    #
    #         if stopping_criteria.max_length is None:
    #             raise ValueError("`max_length` needs to be a stopping_criteria for now.")
    #
    #         # 10. prepare beam search scorer
    #         beam_scorer = BeamSearchScorer(
    #             batch_size=batch_size,
    #             num_beams=num_beams,
    #             device=inputs_tensor.device,
    #             length_penalty=length_penalty,
    #             do_early_stopping=early_stopping,
    #             num_beam_hyps_to_keep=num_return_sequences,
    #         )
    #         # 11. interleave input_ids with `num_beams` additional sequences per batch
    #         input_ids, model_kwargs = self._expand_inputs_for_generation(
    #             input_ids, expand_size=num_beams, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs
    #         )
    #         # 12. run beam search
    #         return self.beam_search(
    #             input_ids,
    #             beam_scorer,
    #             logits_processor=logits_processor,
    #             stopping_criteria=stopping_criteria,
    #             pad_token_id=pad_token_id,
    #             eos_token_id=eos_token_id,
    #             output_scores=output_scores,
    #             return_dict_in_generate=return_dict_in_generate,
    #             synced_gpus=synced_gpus,
    #             **model_kwargs,
    #         )
    #
    #     elif is_beam_sample_gen_mode:
    #         # 10. prepare logits warper
    #         logits_warper = self._get_logits_warper(
    #             top_k=top_k,
    #             top_p=top_p,
    #             typical_p=typical_p,
    #             temperature=temperature,
    #             num_beams=num_beams,
    #             renormalize_logits=renormalize_logits,
    #         )
    #
    #         if stopping_criteria.max_length is None:
    #             raise ValueError("`max_length` needs to be a stopping_criteria for now.")
    #         # 11. prepare beam search scorer
    #         beam_scorer = BeamSearchScorer(
    #             batch_size=batch_size * num_return_sequences,
    #             num_beams=num_beams,
    #             device=inputs_tensor.device,
    #             length_penalty=length_penalty,
    #             do_early_stopping=early_stopping,
    #         )
    #
    #         # 12. interleave input_ids with `num_beams` additional sequences per batch
    #         input_ids, model_kwargs = self._expand_inputs_for_generation(
    #             input_ids,
    #             expand_size=num_beams * num_return_sequences,
    #             is_encoder_decoder=self.config.is_encoder_decoder,
    #             **model_kwargs,
    #         )
    #
    #         # 13. run beam sample
    #         return self.beam_sample(
    #             input_ids,
    #             beam_scorer,
    #             logits_processor=logits_processor,
    #             logits_warper=logits_warper,
    #             stopping_criteria=stopping_criteria,
    #             pad_token_id=pad_token_id,
    #             eos_token_id=eos_token_id,
    #             output_scores=output_scores,
    #             return_dict_in_generate=return_dict_in_generate,
    #             synced_gpus=synced_gpus,
    #             **model_kwargs,
    #         )
    #
    #     elif is_group_beam_gen_mode:
    #         if num_return_sequences > num_beams:
    #             raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.")
    #
    #         if num_beams % num_beam_groups != 0:
    #             raise ValueError("`num_beams` should be divisible by `num_beam_groups` for group beam search.")
    #
    #         if stopping_criteria.max_length is None:
    #             raise ValueError("`max_length` needs to be a stopping_criteria for now.")
    #
    #         # 10. prepare beam search scorer
    #         beam_scorer = BeamSearchScorer(
    #             batch_size=batch_size,
    #             num_beams=num_beams,
    #             max_length=stopping_criteria.max_length,
    #             device=inputs_tensor.device,
    #             length_penalty=length_penalty,
    #             do_early_stopping=early_stopping,
    #             num_beam_hyps_to_keep=num_return_sequences,
    #             num_beam_groups=num_beam_groups,
    #         )
    #         # 11. interleave input_ids with `num_beams` additional sequences per batch
    #         input_ids, model_kwargs = self._expand_inputs_for_generation(
    #             input_ids, expand_size=num_beams, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs
    #         )
    #         # 12. run beam search
    #         return self.group_beam_search(
    #             input_ids,
    #             beam_scorer,
    #             logits_processor=logits_processor,
    #             stopping_criteria=stopping_criteria,
    #             pad_token_id=pad_token_id,
    #             eos_token_id=eos_token_id,
    #             output_scores=output_scores,
    #             return_dict_in_generate=return_dict_in_generate,
    #             synced_gpus=synced_gpus,
    #             **model_kwargs,
    #         )
    #
    #     elif is_constraint_gen_mode:
    #         if num_return_sequences > num_beams:
    #             raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.")
    #
    #         if stopping_criteria.max_length is None:
    #             raise ValueError("`max_length` needs to be a stopping_criteria for now.")
    #
    #         if num_beams <= 1:
    #             raise ValueError("`num_beams` needs to be greater than 1 for constrained genertation.")
    #
    #         if do_sample:
    #             raise ValueError("`do_sample` needs to be false for constrained generation.")
    #
    #         if num_beam_groups is not None and num_beam_groups > 1:
    #             raise ValueError("`num_beam_groups` not supported yet for constrained generation.")
    #
    #         final_constraints = []
    #         if constraints is not None:
    #             final_constraints = constraints
    #
    #         if force_words_ids is not None:
    #
    #             def typeerror():
    #                 raise ValueError(
    #                     "`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]`"
    #                     f"of positive integers, but is {force_words_ids}."
    #                 )
    #
    #             if not isinstance(force_words_ids, list) or len(force_words_ids) == 0:
    #                 typeerror()
    #
    #             for word_ids in force_words_ids:
    #                 if isinstance(word_ids[0], list):
    #                     if not isinstance(word_ids, list) or len(word_ids) == 0:
    #                         typeerror()
    #                     if any(not isinstance(token_ids, list) for token_ids in word_ids):
    #                         typeerror()
    #                     if any(
    #                         any((not isinstance(token_id, int) or token_id < 0) for token_id in token_ids)
    #                         for token_ids in word_ids
    #                     ):
    #                         typeerror()
    #
    #                     constraint = DisjunctiveConstraint(word_ids)
    #                 else:
    #                     if not isinstance(word_ids, list) or len(word_ids) == 0:
    #                         typeerror()
    #                     if any((not isinstance(token_id, int) or token_id < 0) for token_id in word_ids):
    #                         typeerror()
    #
    #                     constraint = PhrasalConstraint(word_ids)
    #                 final_constraints.append(constraint)
    #
    #         # 10. prepare beam search scorer
    #         constrained_beam_scorer = ConstrainedBeamSearchScorer(
    #             constraints=final_constraints,
    #             batch_size=batch_size,
    #             num_beams=num_beams,
    #             device=inputs_tensor.device,
    #             length_penalty=length_penalty,
    #             do_early_stopping=early_stopping,
    #             num_beam_hyps_to_keep=num_return_sequences,
    #         )
    #         # 11. interleave input_ids with `num_beams` additional sequences per batch
    #         input_ids, model_kwargs = self._expand_inputs_for_generation(
    #             input_ids, expand_size=num_beams, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs
    #         )
    #         # 12. run beam search
    #         return self.constrained_beam_search(
    #             input_ids,
    #             constrained_beam_scorer=constrained_beam_scorer,
    #             logits_processor=logits_processor,
    #             stopping_criteria=stopping_criteria,
    #             pad_token_id=pad_token_id,
    #             eos_token_id=eos_token_id,
    #             output_scores=output_scores,
    #             return_dict_in_generate=return_dict_in_generate,
    #             synced_gpus=synced_gpus,
    #             **model_kwargs,
    #         )

    def init_prefix_weights(self):
        """Initialise ``self.prefix_shared`` in place.

        When ``self.init_prefix_from_vocab`` is truthy, ``self.prefix_dim`` row
        indices are drawn from a random permutation of ``range(5000)`` and the
        matching rows of the input-embedding weight are copied into the prefix.
        Otherwise the existing prefix storage is refilled with uniform noise
        drawn between -0.5 and 0.5.
        """
        if not self.init_prefix_from_vocab:
            bound = 0.5
            self.prefix_shared.data.uniform_(-bound, bound)
            return

        # Shuffle the first 5000 ids and keep as many as the prefix has rows.
        sampled_ids = np.random.permutation(range(5000))[:self.prefix_dim]
        vocab_weight = self.get_input_embeddings().state_dict()["weight"]
        # clone + detach: the prefix must not alias or back-propagate into the vocabulary.
        self.prefix_shared.data = vocab_weight[sampled_ids].clone().detach()

    def store_prefix_weights(self, prefix_embeddings):
        """Stack the given prefix embeddings and store them in ``self.mul_prefix_emb``.

        Args:
            prefix_embeddings: iterable of tensors; they are stacked along a new
                leading dimension with ``torch.stack``.

        Each embedding is moved to the current CUDA device when CUDA is available.
        On CPU-only hosts the tensors are left where they are instead of raising,
        which the previous unconditional ``.cuda()`` call did.
        """
        use_cuda = torch.cuda.is_available()
        stacked = torch.stack([emb.cuda() if use_cuda else emb for emb in prefix_embeddings])
        # Store an independent copy that carries no autograd history.
        self.mul_prefix_emb.data = stacked.clone().detach()

    # update attention weights
    def update_attention_weights(self, attention):
        """Assign ``attention`` to ``self.encoder.attn_Wa.data``.

        Args:
            attention: tensor to install. It is moved to the current CUDA device
                when CUDA is available; on CPU-only hosts it is used as-is instead
                of raising, which the previous unconditional ``.cuda()`` call did.
        """
        # NOTE(review): this writes ``.data`` on ``attn_Wa`` itself. If ``attn_Wa`` is a
        # module rather than a tensor/parameter, ``attn_Wa.weight.data`` may have been
        # intended -- confirm against the encoder definition.
        self.encoder.attn_Wa.data = attention.cuda() if torch.cuda.is_available() else attention

    def update_layer_norm_weights(self, layer_norm_dir):
        """Replace the encoder's final layer-norm weight and bias with saved tensors.

        Args:
            layer_norm_dir: directory holding ``layer_norm_weight.pt`` and
                ``layer_norm_bias.pt``; both are read with ``torch.load``.
        """
        layer_norm = self.encoder.layer_norm
        weight_file = os.path.join(layer_norm_dir, "layer_norm_weight.pt")
        bias_file = os.path.join(layer_norm_dir, "layer_norm_bias.pt")
        layer_norm.weight.data = torch.load(weight_file)
        layer_norm.bias.data = torch.load(bias_file)

    def update_attention_weights_sub(self, attention):
        """Load the down/up attention projection weights from two saved tensors.

        Args:
            attention: sequence of exactly two paths. The first must contain
                ``"attn_W_down"`` and the second ``"attn_W_up"``; each file is
                read with ``torch.load`` and installed as the ``weight.data`` of
                ``self.encoder.attn_W_down`` / ``self.encoder.attn_W_up``.

        Raises:
            AssertionError: if ``attention`` does not hold two paths or the paths
                are not named / ordered as described above.
        """
        assert len(attention) == 2, "expected exactly two paths: [attn_W_down, attn_W_up]"
        down_path, up_path = attention[0], attention[1]
        assert "attn_W_down" in down_path, "first path must point at the attn_W_down weights"
        assert "attn_W_up" in up_path, "second path must point at the attn_W_up weights"

        use_cuda = torch.cuda.is_available()
        destinations = ((down_path, self.encoder.attn_W_down), (up_path, self.encoder.attn_W_up))
        for path, layer in destinations:
            # Deserialize onto the CPU first so tensors saved from a GPU still load on
            # CPU-only hosts; move to the current CUDA device only when one exists.
            loaded = torch.load(path, map_location="cpu")
            layer.weight.data = loaded.cuda() if use_cuda else loaded

    def update_prefix_weights_single(self, prefix_embedding):
        """Install ``prefix_embedding`` as the data of ``self.prefix_shared``.

        The tensor is assigned directly: no clone, detach or device move happens here.
        """
        shared_prefix = self.prefix_shared
        shared_prefix.data = prefix_embedding

    def update_prefix_weights_multi(self, prefix_embedding, num_target):
        """Fill ``self.prefix_shared`` with ``num_target`` stacked copies of one prefix.

        Args:
            prefix_embedding: tensor to replicate; every copy is detached and cloned.
            num_target: number of copies stacked along a new leading dimension.
        """
        replicas = []
        for _ in range(num_target):
            replicas.append(prefix_embedding.detach().clone())
        self.prefix_shared.data = torch.stack(replicas)

    def update_prefix_weights(self, prefix_embeddings, target_embedding=None):
        """Set ``self.prefix_shared`` from one or several source prefix embeddings.

        * A single embedding is assigned as-is.
        * Several embeddings with a ``target_embedding``: the result is a weighted
          sum of the sources. Each weight is the source's cosine similarity to the
          target (summed over the similarity vector), divided by the total over
          all sources.
        * Several embeddings without a target: a fresh ``self.W_weighting``
          parameter is drawn with ``torch.rand`` and used, un-normalised, as the
          mixing weights.

        Args:
            prefix_embeddings: sequence of source prefix tensors; the ``'mld,m->ld'``
                contraction below means that, once stacked, they form a 3-D tensor.
            target_embedding: optional tensor the sources are compared against.

        Tensors are moved to the current CUDA device when CUDA is available; on
        CPU-only hosts they are used where they are instead of raising.
        """
        if len(prefix_embeddings) == 1:
            self.prefix_shared.data = prefix_embeddings[0]
            return

        use_cuda = torch.cuda.is_available()

        def on_device(tensor):
            return tensor.cuda() if use_cuda else tensor

        if target_embedding is not None:
            # ``Tensor.cuda()`` is not in-place, so keep the moved tensor explicitly.
            target = on_device(target_embedding)
            # Compute every similarity once, then normalise so the weights sum to 1.
            similarities = torch.stack(
                [torch.sum(F.cosine_similarity(on_device(emb), target)) for emb in prefix_embeddings]
            )
            mixing_weights = (similarities / torch.sum(similarities)).detach()
        else:
            # NOTE(review): the result is written to ``.data`` below, so no gradient
            # reaches ``W_weighting`` through this call -- confirm that is intended.
            self.W_weighting = nn.Parameter(torch.rand(len(prefix_embeddings)))
            mixing_weights = on_device(self.W_weighting)

        stacked_sources = torch.stack([on_device(emb) for emb in prefix_embeddings])
        self.prefix_shared.data = torch.einsum('mld,m->ld', stacked_sources, mixing_weights)

    def set_phm_Ws(self):
        """Hand the model-level PHM weight tensors to every matching ``PHMLinear``.

        Both ``self.encoder`` and ``self.decoder`` are walked. A ``PHMLinear`` whose
        qualified name contains ``"down_sampler"`` receives the down-projection
        weights and one whose name contains ``"up_sampler"`` receives the
        up-projection weights. The factorized pair (``W_*_left`` / ``W_*_right``) is
        passed when ``self.factorized_phm`` is truthy, the single ``W_*`` otherwise.
        """

        def share_weights(stack):
            for qualified_name, layer in stack.named_modules():
                if not isinstance(layer, PHMLinear):
                    continue
                # The two name checks are independent, exactly as two separate ``if``s.
                if "down_sampler" in qualified_name:
                    if self.factorized_phm:
                        layer.set_W(W_left=self.W_down_left, W_right=self.W_down_right)
                    else:
                        layer.set_W(W=self.W_down)
                if "up_sampler" in qualified_name:
                    if self.factorized_phm:
                        layer.set_W(W_left=self.W_up_left, W_right=self.W_up_right)
                    else:
                        layer.set_W(W=self.W_up)

        share_weights(self.encoder)
        share_weights(self.decoder)

    def init_W(self, in_feats_per_axis, out_feats_per_axis, W=None, W_left=None, W_right=None):
        """Initialise the PHM weight tensor(s) slice by slice according to ``self.w_init``.

        Args:
            in_feats_per_axis: unused by this method; kept for interface compatibility.
            out_feats_per_axis: unused by this method; kept for interface compatibility.
            W: tensor initialised when ``self.factorized_phm`` is falsy.
            W_left: first tensor initialised when ``self.factorized_phm`` is truthy.
            W_right: second tensor initialised when ``self.factorized_phm`` is truthy.

        The first ``self.phm_dim`` slices along dimension 0 are initialised with
        ``glorot_normal``, ``glorot_uniform`` or an in-place normal draw with
        ``std=0.01``, selected by ``self.w_init``.

        Raises:
            ValueError: if ``self.w_init`` is not ``"glorot-normal"``,
                ``"glorot-uniform"`` or ``"normal"``.
        """
        if self.w_init == "glorot-normal":
            init_slice = glorot_normal
        elif self.w_init == "glorot-uniform":
            init_slice = glorot_uniform
        elif self.w_init == "normal":
            def init_slice(tensor):
                return tensor.normal_(std=0.01)
        else:
            raise ValueError(
                f"Unsupported w_init {self.w_init!r}; "
                "expected 'glorot-normal', 'glorot-uniform' or 'normal'."
            )

        targets = (W_left, W_right) if self.factorized_phm else (W,)
        # Keep the per-slice order (left before right for each i) so the sequence of
        # random draws matches the previous implementation.
        for i in range(self.phm_dim):
            for weight in targets:
                weight.data[i] = init_slice(weight.data[i])

    def set_phm_rule(self):
        """Share the model-level PHM rule(s) with every ``PHMLinear`` sub-module.

        Both ``self.encoder`` and ``self.decoder`` are walked. When
        ``self.factorized_phm_rule`` is truthy each layer receives
        ``self.phm_rule_right`` and ``self.phm_rule_left``; otherwise it receives
        the single ``self.phm_rule``.
        """

        def apply_rule(stack):
            for _, layer in stack.named_modules():
                if not isinstance(layer, PHMLinear):
                    continue
                if self.factorized_phm_rule:
                    layer.set_phm_rule(phm_rule_right=self.phm_rule_right,
                                       phm_rule_left=self.phm_rule_left)
                else:
                    layer.set_phm_rule(phm_rule=self.phm_rule)

        apply_rule(self.encoder)
        apply_rule(self.decoder)

    ###########################################################

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        # Without an explicit map, spread the encoder blocks over every visible CUDA device.
        if device_map is None:
            device_map = get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
        self.device_map = device_map
        assert_device_map(self.device_map, len(self.encoder.block))

        # The same map is applied to both stacks.
        for stack in (self.encoder, self.decoder):
            stack.parallelize(self.device_map)

        # The LM head is placed on the decoder's first device.
        self.lm_head = self.lm_head.to(self.decoder.first_device)
        self.model_parallel = True

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        # Undo the per-stack placement first ...
        for stack in (self.encoder, self.decoder):
            stack.deparallelize()
        # ... then pull every top-level piece back to the CPU.
        for attr_name in ("encoder", "decoder", "lm_head"):
            setattr(self, attr_name, getattr(self, attr_name).to("cpu"))
        self.model_parallel = False
        self.device_map = None
        # Release cached CUDA memory now that nothing lives on the GPUs.
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        """Return the object stored as ``self.shared``."""
        shared_embeddings = self.shared
        return shared_embeddings

    def set_input_embeddings(self, new_embeddings):
        """Store ``new_embeddings`` as ``self.shared`` and pass it to encoder and decoder."""
        self.shared = new_embeddings
        for stack in (self.encoder, self.decoder):
            stack.set_input_embeddings(new_embeddings)

    def set_output_embeddings(self, new_embeddings):
        """Replace ``self.lm_head`` with ``new_embeddings``."""
        setattr(self, "lm_head", new_embeddings)

    def get_output_embeddings(self):
        """Return the object stored as ``self.lm_head``."""
        output_head = self.lm_head
        return output_head

    def get_encoder(self):
        """Return the object stored as ``self.encoder``."""
        encoder_stack = self.encoder
        return encoder_stack

    def get_decoder(self):
        """Return the object stored as ``self.decoder``."""
        decoder_stack = self.decoder
        return decoder_stack

    @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward_encode(
            self,
            input_ids=None,
            attention_mask=None,
            decoder_input_ids=None,
            decoder_attention_mask=None,
            head_mask=None,
            decoder_head_mask=None,
            cross_attn_head_mask=None,
            encoder_outputs=None,
            past_key_values=None,
            inputs_embeds=None,
            decoder_inputs_embeds=None,
            labels=None,
            use_cache=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            task=None,
            task_ids=None,
            vote_idx=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only computed for
            labels in ``[0, ..., config.vocab_size]``
        task (`optional`):
            Forwarded unchanged to both the encoder and the decoder.
        task_ids (`optional`):
            Forwarded unchanged to the encoder only.
        vote_idx (`optional`):
            Forwarded unchanged to both the encoder and the decoder.

        Returns:

        Examples::

            >>> from transformers import T5Tokenizer, T5ForConditionalGeneration

            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')

            >>> input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
            >>> labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2> </s>', return_tensors='pt').input_ids
            >>> outputs = model(input_ids=input_ids, labels=labels)
            >>> loss = outputs.loss
            >>> logits = outputs.logits

            >>> input_ids = tokenizer("summarize: studies have shown that owning a dog is good for you ", return_tensors="pt").input_ids  # Batch size 1
            >>> outputs = model.generate(input_ids)
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                # A bare ``__HEAD_MASK_WARNING_MSG`` inside a class body is name-mangled to
                # ``_<ClassName>__HEAD_MASK_WARNING_MSG`` and raises NameError, so the
                # module-level constant is looked up through ``globals()`` by its literal
                # name; a short fallback message is used if the constant is missing.
                head_mask_warning = globals().get(
                    "__HEAD_MASK_WARNING_MSG",
                    "`head_mask` was split into `head_mask` and `decoder_head_mask`; "
                    "`decoder_head_mask` currently defaults to a copy of `head_mask`.",
                )
                warnings.warn(head_mask_warning, FutureWarning)
                decoder_head_mask = head_mask

        # With prefix tuning, prepend ``prefix_dim`` ones to the attention mask so the
        # mask has one extra always-attended column per prefix position.
        if self.prefix_tuning:
            if attention_mask is not None:
                attention_mask = torch.cat(
                    [torch.ones((attention_mask.shape[0], self.prefix_dim)).to(attention_mask.device), attention_mask],
                    dim=1)

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            # Convert encoder inputs in embeddings if needed
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                task_embedding=None,
                task=task,
                task_ids=task_ids,
                vote_idx=vote_idx,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            # Wrap a plain tuple of precomputed encoder outputs so attribute access works below.
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # If decoding with past key value states, only the last tokens
        # should be given as an input
        if past_key_values is not None:
            assert labels is None, "Decoder should not use cached key value states when training."
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids[:, -1:]
            if decoder_inputs_embeds is not None:
                decoder_inputs_embeds = decoder_inputs_embeds[:, -1:]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            task_embedding=None,
            task=task,
            vote_idx=vote_idx,
        )

        sequence_output = decoder_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.encoder.first_device)
            self.lm_head = self.lm_head.to(self.encoder.first_device)
            sequence_output = sequence_output.to(self.lm_head.weight.device)

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
            sequence_output = sequence_output * (self.model_dim ** -0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            # Flatten logits to (-1, last_dim) and labels to 1-D; -100 labels are ignored.
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            # Tuple form: (loss?, logits, *decoder extras, *encoder outputs).
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        encoder_outputs=None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        task=None,
        task_ids=None,
        vote_idx=None,
    ):
        r"""
        Run the encoder (unless ``encoder_outputs`` is supplied), then the decoder, then project the decoder
        output onto the vocabulary with ``self.lm_head``.

        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the language modeling loss. Indices should be in :obj:`[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only computed for
            labels in ``[0, ..., config.vocab_size - 1]``. When neither ``decoder_input_ids`` nor
            ``decoder_inputs_embeds`` is given, the decoder inputs are derived from ``labels`` with
            ``self._shift_right``.
        task, task_ids, vote_idx (`optional`):
            Forwarded unchanged: ``task`` and ``vote_idx`` go to both the encoder and the decoder, ``task_ids``
            goes to the encoder only.

        Returns:
            A :obj:`Seq2SeqLMOutput` when ``return_dict`` is truthy; otherwise the tuple
            ``(lm_logits,) + decoder_outputs[1:] + encoder_outputs``, prefixed with the loss when ``labels`` is
            given.

        Examples::

            >>> from transformers import T5Tokenizer, T5ForConditionalGeneration

            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')

            >>> input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
            >>> labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2> </s>', return_tensors='pt').input_ids
            >>> outputs = model(input_ids=input_ids, labels=labels)
            >>> loss = outputs.loss
            >>> logits = outputs.logits

            >>> input_ids = tokenizer("summarize: studies have shown that owning a dog is good for you ", return_tensors="pt").input_ids  # Batch size 1
            >>> outputs = model.generate(input_ids)
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                # A bare reference to a double-underscore global inside a class body is name-mangled
                # by Python (to ``_<ClassName>__HEAD_MASK_WARNING_MSG``) and raises NameError, so the
                # module-level message is looked up by its string name instead.
                head_mask_warning = globals().get(
                    "__HEAD_MASK_WARNING_MSG",
                    "The input argument `head_mask` was split into two arguments `head_mask` and "
                    "`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`.",
                )
                warnings.warn(head_mask_warning, FutureWarning)
                decoder_head_mask = head_mask

        if self.prefix_tuning and attention_mask is not None:
            # Prepend `prefix_dim` always-attended columns. `new_ones` keeps the dtype and device of
            # the incoming mask instead of creating a float CPU tensor and moving it afterwards.
            # NOTE(review): presumably these columns cover prefix positions added inside the
            # encoder - confirm against the encoder implementation.
            prefix_mask = attention_mask.new_ones((attention_mask.shape[0], self.prefix_dim))
            attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            # Convert encoder inputs in embeddings if needed
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                task_embedding=None,
                task=task,
                task_ids=task_ids,
                vote_idx=vote_idx,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            # Wrap a plain tuple so the attribute access at the end of this method works.
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # If decoding with past key value states, only the last tokens
        # should be given as an input
        if past_key_values is not None:
            assert labels is None, "Decoder should not use cached key value states when training."
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids[:, -1:]
            if decoder_inputs_embeds is not None:
                decoder_inputs_embeds = decoder_inputs_embeds[:, -1:]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            task_embedding=None,
            task=task,
            vote_idx=vote_idx,
        )

        sequence_output = decoder_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.encoder.first_device)
            self.lm_head = self.lm_head.to(self.encoder.first_device)
            sequence_output = sequence_output.to(self.lm_head.weight.device)

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
            sequence_output = sequence_output * (self.model_dim ** -0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            # Token-level cross entropy over the flattened (batch * sequence) positions; -100 is skipped.
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def _prepare_encoder_decoder_kwargs_for_generation(
        self, input_ids: torch.LongTensor, model_kwargs, vote_idx=None
    ) -> Dict[str, Any]:
        """
        Run the encoder once and cache its output in ``model_kwargs["encoder_outputs"]``.

        Args:
            input_ids: Passed positionally to the encoder.
            model_kwargs: Generation keyword arguments. Every entry whose key does not start with
                ``"decoder_"`` is forwarded to the encoder. The dict is updated in place.
            vote_idx: Accepted for interface compatibility; not used by this method.

        Returns:
            ``model_kwargs``, with ``"encoder_outputs"`` filled in if it was not already present.
        """
        if "encoder_outputs" in model_kwargs:
            # The encoder has already been run for this generation call.
            return model_kwargs

        # retrieve encoder hidden states
        encoder = self.get_encoder()
        encoder_kwargs = {
            argument: value for argument, value in model_kwargs.items() if not argument.startswith("decoder_")
        }
        if self.prefix_tuning:
            # `.get` so that generation without an explicit attention mask does not raise KeyError.
            attention_mask = encoder_kwargs.get("attention_mask")
            if attention_mask is not None:
                # Prepend `prefix_dim` always-attended columns with the dtype/device of the incoming
                # mask. Only the encoder's copy is widened; `model_kwargs` keeps the original mask.
                prefix_mask = attention_mask.new_ones((attention_mask.shape[0], self.prefix_dim))
                encoder_kwargs["attention_mask"] = torch.cat([prefix_mask, attention_mask], dim=1)
        model_kwargs["encoder_outputs"] = encoder(input_ids, return_dict=True, **encoder_kwargs)
        return model_kwargs

    def prepare_inputs_for_generation(
        self, input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
    ):
        """
        Assemble the keyword arguments for one decoding step.

        Args:
            input_ids: Decoder token ids generated so far.
            past: Cached key/value states; when given, only the last column of ``input_ids`` is kept.
            attention_mask: Returned unchanged under ``"attention_mask"``.
            use_cache: Returned unchanged under ``"use_cache"``.
            encoder_outputs: Returned unchanged under ``"encoder_outputs"``.
            **kwargs: Only ``task`` is read; it defaults to ``None`` when absent.

        Returns:
            A dict with the keys ``decoder_input_ids``, ``past_key_values``, ``encoder_outputs``,
            ``attention_mask``, ``use_cache`` and ``task``.
        """
        # cut decoder_input_ids if past is used
        if past is not None:
            input_ids = input_ids[:, -1:]
        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "use_cache": use_cache,
            # `.get` rather than indexing so that generating without a task does not raise KeyError.
            "task": kwargs.get("task"),
        }

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        """Return the decoder input ids obtained by passing ``labels`` through ``self._shift_right``."""
        decoder_input_ids = self._shift_right(labels)
        return decoder_input_ids

    def _reorder_cache(self, past, beam_idx):
        """
        Reorder every cached state in ``past`` according to ``beam_idx``.

        ``past`` is an iterable with one entry per layer, each entry being an iterable of state
        tensors. Each tensor is re-indexed along dimension 0 with ``beam_idx``. Returns a tuple of
        tuples with the same nesting, or ``past`` itself (after logging a warning) when it is ``None``.
        """
        if past is None:
            # Without cached decoder states there is nothing to reorder.
            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
            return past

        reordered_layers = []
        for layer_states in past:
            # Select the rows named by `beam_idx` from each state tensor of this layer.
            selected = tuple(state.index_select(0, beam_idx) for state in layer_states)

            # Sanity checks: reordering must not change the shape or the number of states.
            assert selected[0].shape == layer_states[0].shape
            assert len(selected) == len(layer_states)

            reordered_layers.append(selected)
        return tuple(reordered_layers)


@add_start_docstrings(
    "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.",
    T5_START_DOCSTRING,
)
class T5EncoderModel(T5PreTrainedModel):
    # Encoder-only wrapper: a shared token embedding plus a single `T5Stack` built from a copy of
    # the config with `use_cache` and `is_encoder_decoder` switched off.
    authorized_missing_keys = [
        r"encoder\.embed_tokens\.weight",
    ]

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        # Work on a copy so the caller's config is not modified.
        encoder_config = copy.deepcopy(config)
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config, self.shared)

        self.init_weights()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        # Default map: spread the encoder blocks over all visible CUDA devices.
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.model_parallel = True

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        # Move the encoder back to CPU and clear the parallel state.
        self.encoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        """Return the shared token embedding module."""
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        """Replace the shared token embedding and propagate it to the encoder."""
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        """Return the encoder stack."""
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            # The encoder stack exposes its layers as `block` (as used in `parallelize`), not `layer`.
            # NOTE(review): assumes the standard T5 block layout, where `layer[0]` is the
            # self-attention sub-layer holding the attention module as `SelfAttention` - confirm
            # against the block definition in this file.
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Returns:

        Example::

            >>> from transformers import T5Tokenizer, T5EncoderModel
            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
            >>> model = T5EncoderModel.from_pretrained('t5-small')
            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
            >>> outputs = model(input_ids=input_ids)
            >>> last_hidden_states = outputs.last_hidden_state
        """
        # Fall back to the config default when the caller does not specify the return format.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return encoder_outputs
