# coding=utf-8
# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch BlenderbotSmall model. """


import math
import random
from typing import Optional, Tuple

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import CrossEntropyLoss

from src.transformers.activations import ACT2FN
from src.transformers.file_utils import (
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)

from .vallia_outputs import (
    BaseModelOutput,
    VAESeq2SeqModelOutput,
    VAESeq2SeqLMOutput,
    CausalVAESeq2SeqModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from src.transformers.modeling_utils import PreTrainedModel
from src.transformers.utils import logging
from .vallina_configuration import BlenderbotSmallConfig
from tools.utils import kl_normal_log

# Module-level logger, created through the project's `logging` wrapper imported above.
logger = logging.get_logger(__name__)

# Class names kept as plain strings (per the `_FOR_DOC` suffix they look like documentation
# placeholders); their use is outside this section -- presumably the docstring decorators
# applied to the model classes further down. TODO confirm.
_CONFIG_FOR_DOC = "BlenderbotSmallConfig"
_TOKENIZER_FOR_DOC = "BlenderbotSmallTokenizer"

# Identifiers of the known pretrained BlenderbotSmall checkpoints.
BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/blenderbot_small-90M",
    # See all BlenderbotSmall models at https://huggingface.co/models?filter=blenderbot_small
]


# Copied from transformers.models.bart.modeling_bart.shift_tokens_right
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.

    Args:
        input_ids: 2-D tensor of token ids, indexed as ``[row, position]`` (typically the labels, which may
            contain the ``-100`` ignore value).
        pad_token_id: id written wherever the shifted sequence contains ``-100``.
        decoder_start_token_id: id placed in the first column of every row.

    Returns:
        A new tensor with the shape and dtype of ``input_ids``; ``input_ids`` itself is not modified. The last
        token of every row is dropped by the shift.

    Raises:
        ValueError: if ``pad_token_id`` is ``None``.
    """
    # Validate before doing any tensor work, and with a real exception: an ``assert`` disappears under
    # ``python -O`` and would let ``None`` reach ``masked_fill_``.
    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")

    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids


# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
    """
    Make causal mask used for bi-directional self-attention.
    """
    bsz, tgt_len = input_ids_shape
    mask = torch.full((tgt_len, tgt_len), float("-inf"))
    mask_cond = torch.arange(mask.size(-1))
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)

    if past_key_values_length > 0:
        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)


# Copied from transformers.models.bart.modeling_bart._expand_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)


# Copied from transformers.models.blenderbot.modeling_blenderbot.BlenderbotLearnedPositionalEmbedding with Blenderbot->BlenderbotSmall
class BlenderbotSmallLearnedPositionalEmbedding(nn.Embedding):
    """
    Learned absolute position embeddings: an ``nn.Embedding`` table with one row per position, up to
    ``num_embeddings`` positions.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
        assert padding_idx is not None, "`padding_idx` should not be None, but of type int"
        super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx)

    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
        """
        Look up the embeddings of the positions covered by the current input.

        `input_ids_shape` is expected to be [bsz x seqlen]. The positions start at ``past_key_values_length``
        and the result has one row per position (no batch dimension).
        """
        _, seq_len = input_ids_shape[:2]
        first = past_key_values_length
        position_ids = torch.arange(first, first + seq_len, dtype=torch.long, device=self.weight.device)
        return super().forward(position_ids)


# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->BlenderbotSmall
class BlenderbotSmallAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper.

    One module covers three uses, selected by the arguments of :meth:`forward`: plain self-attention,
    self-attention that extends a key/value cache, and cross-attention over ``key_value_states``.
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
    ):
        """
        Args:
            embed_dim: size of the input and output features; must be divisible by ``num_heads``.
            num_heads: number of attention heads.
            dropout: dropout probability applied to the attention probabilities.
            is_decoder: when ``True``, :meth:`forward` returns the key/value states it used so that the caller
                can cache them.
            bias: whether the four linear projections have a bias term.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
        self.scaling = self.head_dim ** -0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Split the feature axis into heads: ``(bsz, seq_len, embed_dim)`` -> ``(bsz, num_heads, seq_len, head_dim)``."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Input shape: Batch x Time x Channel

        Args:
            hidden_states: query source of shape ``(bsz, tgt_len, embed_dim)``.
            key_value_states: when given, the layer acts as cross-attention and keys/values are projected from
                this tensor instead of ``hidden_states``.
            past_key_value: cached ``(key, value)`` pair, each of shape ``(bsz, num_heads, cached_len, head_dim)``.
                For cross-attention the pair is reused unchanged; for self-attention the newly projected
                positions are appended to it.
            attention_mask: additive mask of shape ``(bsz, 1, tgt_len, src_len)``.
            output_attentions: whether to also return the attention probabilities.

        Returns:
            A tuple ``(attn_output, attn_weights, past_key_value)``: the projected output of shape
            ``(bsz, tgt_len, embed_dim)``; the pre-dropout attention probabilities of shape
            ``(bsz, num_heads, tgt_len, src_len)`` or ``None`` when ``output_attentions`` is false; and, when
            ``is_decoder`` is set, the ``(key, value)`` pair used in this call (otherwise the ``past_key_value``
            argument is returned unchanged).

        Raises:
            AssertionError: if the attention weights, the mask or the attention output have an unexpected shape.
        """
        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None
        bsz, tgt_len, embed_dim = hidden_states.size()

        # the 1/sqrt(head_dim) factor is folded into the queries once
        query_states = self.q_proj(hidden_states) * self.scaling

        if is_cross_attention and past_key_value is not None:
            # cross-attention with a cache: reuse the stored key/value states as they are
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        else:
            # project from the encoder states (cross-attention) or from the current inputs (self-attention)
            kv_source = key_value_states if is_cross_attention else hidden_states
            key_states = self._shape(self.k_proj(kv_source), -1, bsz)
            value_states = self._shape(self.v_proj(kv_source), -1, bsz)
            if past_key_value is not None:
                # cached self-attention: append the new positions after the cached ones
                key_states = torch.cat([past_key_value[0], key_states], dim=2)
                value_states = torch.cat([past_key_value[1], value_states], dim=2)

        if self.is_decoder:
            # Hand the key/value states back so the caller can cache them: for cross-attention they are
            # reused verbatim by later calls, for uni-directional self-attention later calls concatenate
            # their new positions onto them. In an encoder (`is_decoder=False`) nothing is stored and
            # `past_key_value` is returned as it was passed in.
            past_key_value = (key_states, value_states)

        # fold the head axis into the batch axis so a single bmm handles every head
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        assert attn_weights.size() == (
            bsz * self.num_heads,
            tgt_len,
            src_len,
        ), f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"

        if attention_mask is not None:
            assert attention_mask.size() == (
                bsz,
                1,
                tgt_len,
                src_len,
            ), f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
            # the mask broadcasts over the head axis, so un-fold the heads for the addition
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = F.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to be reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        # the message reports the shape that is actually checked (heads folded into the batch axis)
        assert attn_output.size() == (
            bsz * self.num_heads,
            tgt_len,
            self.head_dim,
        ), f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}"

        # un-fold the heads and merge them back into the feature axis
        attn_output = (
            attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
            .transpose(1, 2)
            .reshape(bsz, tgt_len, embed_dim)
        )

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value


# Copied from transformers.models.bart.modeling_bart.BartEncoderLayer with Bart->BlenderbotSmall
class BlenderbotSmallEncoderLayer(nn.Module):
    """
    One encoder block: self-attention followed by a two-layer feed-forward network. Each sub-block is applied
    as dropout -> residual add -> LayerNorm.
    """

    def __init__(self, config: BlenderbotSmallConfig):
        super().__init__()
        d_model = config.d_model
        self.embed_dim = d_model
        self.self_attn = BlenderbotSmallAttention(
            embed_dim=d_model,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
        )
        self.self_attn_layer_norm = nn.LayerNorm(d_model)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(d_model, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, d_model)
        self.final_layer_norm = nn.LayerNorm(d_model)

    def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False):
        """
        Args:
            hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
                (the attention module unpacks its input as batch x time x channel).
            attention_mask (:obj:`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (:obj:`bool`, `optional`):
                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
                returned tensors for more detail.

        Returns:
            ``(hidden_states,)``, or ``(hidden_states, attn_weights)`` when ``output_attentions`` is set.
        """
        # --- self-attention sub-block ---
        attn_out, attn_weights, _ = self.self_attn(
            hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
        )
        attn_out = F.dropout(attn_out, p=self.dropout, training=self.training)
        hidden_states = self.self_attn_layer_norm(hidden_states + attn_out)

        # --- feed-forward sub-block ---
        ffn_out = self.activation_fn(self.fc1(hidden_states))
        ffn_out = F.dropout(ffn_out, p=self.activation_dropout, training=self.training)
        ffn_out = F.dropout(self.fc2(ffn_out), p=self.dropout, training=self.training)
        hidden_states = self.final_layer_norm(hidden_states + ffn_out)

        # when an inf or NaN shows up, clamp to just inside the representable range of the dtype
        if not torch.isfinite(hidden_states).all():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        return (hidden_states, attn_weights) if output_attentions else (hidden_states,)


# Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->BlenderbotSmall
class BlenderbotSmallDecoderLayer(nn.Module):
    """
    One decoder block: cached self-attention, optional cross-attention over the encoder output, then a
    two-layer feed-forward network. Each sub-block is applied as dropout -> residual add -> LayerNorm.
    """

    def __init__(self, config: BlenderbotSmallConfig):
        super().__init__()
        d_model = config.d_model
        self.embed_dim = d_model

        self.self_attn = BlenderbotSmallAttention(
            embed_dim=d_model,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(d_model)
        self.encoder_attn = BlenderbotSmallAttention(
            d_model,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(d_model)
        self.fc1 = nn.Linear(d_model, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, d_model)
        self.final_layer_norm = nn.LayerNorm(d_model)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ):
        """
        Args:
            hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
                (the attention module unpacks its input as batch x time x channel).
            attention_mask (:obj:`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape
                `(batch, seq_len, embed_dim)`; the cross-attention block is skipped when this is ``None``.
            encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states; the
                first two entries belong to self-attention, the last two to cross-attention.
            output_attentions (:obj:`bool`, `optional`):
                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
                returned tensors for more detail.
            use_cache (:obj:`bool`, `optional`): whether to append the updated key/value cache to the outputs.

        Returns:
            A tuple starting with the new hidden states, followed by ``(self_attn_weights, cross_attn_weights)``
            when ``output_attentions`` is set, followed by the present key/value tuple when ``use_cache`` is set.
        """
        has_cache = past_key_value is not None

        # --- self-attention: its cached key/value pair is the first two cache entries ---
        self_attn_cache = past_key_value[:2] if has_cache else None
        attn_out, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_cache,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        attn_out = F.dropout(attn_out, p=self.dropout, training=self.training)
        hidden_states = self.self_attn_layer_norm(hidden_states + attn_out)

        # --- cross-attention: only when encoder states are supplied ---
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            # the cross-attention key/value pair is the last two cache entries
            cross_attn_cache = past_key_value[-2:] if has_cache else None
            cross_out, cross_attn_weights, cross_present = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                past_key_value=cross_attn_cache,
                output_attentions=output_attentions,
            )
            cross_out = F.dropout(cross_out, p=self.dropout, training=self.training)
            hidden_states = self.encoder_attn_layer_norm(hidden_states + cross_out)

            # present cache layout: (self key, self value, cross key, cross value)
            present_key_value = present_key_value + cross_present

        # --- feed-forward ---
        ffn_out = self.activation_fn(self.fc1(hidden_states))
        ffn_out = F.dropout(ffn_out, p=self.activation_dropout, training=self.training)
        ffn_out = F.dropout(self.fc2(ffn_out), p=self.dropout, training=self.training)
        hidden_states = self.final_layer_norm(hidden_states + ffn_out)

        attentions = (self_attn_weights, cross_attn_weights) if output_attentions else ()
        cache = (present_key_value,) if use_cache else ()
        return (hidden_states,) + attentions + cache


class BlenderbotSmallPreTrainedModel(PreTrainedModel):
    """Common base of the BlenderbotSmall models: config binding, weight initialisation and dummy inputs."""

    config_class = BlenderbotSmallConfig
    base_model_prefix = "model"

    def _init_weights(self, module):
        """
        Initialise one submodule in place.

        Linear and embedding weights are drawn from a normal distribution with standard deviation
        ``config.init_std``; linear biases and the embedding row at ``padding_idx`` are zeroed. Any other
        module type is left untouched.
        """
        init_std = self.config.init_std

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @property
    def dummy_inputs(self):
        """Small fixed batch (two sequences of five tokens, the second one padded) on the model's device."""
        pad = self.config.pad_token_id
        token_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad]], device=self.device)
        return {
            "attention_mask": token_ids.ne(pad),
            "input_ids": token_ids,
            "decoder_input_ids": token_ids,
        }


BLENDERBOT_SMALL_START_DOCSTRING = r"""
    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
    pruning heads, etc.).

    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
    general usage and behavior.

    Parameters:
        config (:class:`~transformers.BlenderbotSmallConfig`):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

# Doctest-style, two-turn conversation example for the conditional-generation model. Its use is outside
# this section -- presumably it is appended to a model docstring via `add_end_docstrings` (imported
# above). TODO confirm.
BLENDERBOT_SMALL_GENERATION_EXAMPLE = r"""
    Conversation example::

        >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration
        >>> mname = 'facebook/blenderbot_small-90M'
        >>> model = BlenderbotSmallForConditionalGeneration.from_pretrained(mname)
        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained(mname)
        >>> UTTERANCE = "My friends are cool but they eat too many carbs."
        >>> print("Human: ", UTTERANCE)
        >>> inputs = tokenizer([UTTERANCE], return_tensors='pt')
        >>> inputs.pop("token_type_ids")
        >>> reply_ids = model.generate(**inputs)
        >>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])
        what kind of carbs do they eat? i don't know much about carbs.

        >>> REPLY = "I'm not sure"
        >>> print("Human: ", REPLY)
        >>> NEXT_UTTERANCE = (
        ... "My friends are cool but they eat too many carbs.</s> "
        ... "<s>what kind of carbs do they eat? i don't know much about carbs.</s> "
        ... "<s>I'm not sure."
        ... )
        >>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors='pt')
        >>> inputs.pop("token_type_ids")
        >>> next_reply_ids = model.generate(**inputs)
        >>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0])
"""

BLENDERBOT_SMALL_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
            details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            `What are attention masks? <../glossary.html#attention-mask>`__
        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
            details.

            `What are input IDs? <../glossary.html#input-ids>`__

            BlenderbotSmall uses the :obj:`bos_token_id` as the starting token for :obj:`decoder_input_ids` generation.
            If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
            :obj:`past_key_values`).
        decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read
            :func:`modeling_blenderbot_small._prepare_decoder_inputs` and modify to your needs. See diagram 1 in `the
            paper <https://arxiv.org/abs/1910.13461>`__ for more information on the default strategy.
        encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`):
            Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
            :obj:`attentions`). :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`
            is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
            cross-attention of the decoder.
        past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.

            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
            vectors than the model's internal embedding lookup matrix.
        decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`):
            Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded
            representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds`
            have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert
            :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.

            If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds`
            takes the value of :obj:`inputs_embeds`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        output_attentions (:obj:`bool`, `optional`):
            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
            tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`):
            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
            more detail.
        return_dict (:obj:`bool`, `optional`):
            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
"""


class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    :class:`BlenderbotSmallEncoderLayer`.

    Args:
        config: BlenderbotSmallConfig
        embed_tokens (torch.nn.Embedding): output embedding
    """

    def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embedding] = None):
        """
        Build the encoder stack plus the CausalHMM head and its emotion / strategy embedding tables.

        Args:
            config: model configuration; supplies sizes, dropout rates and the padding index.
            embed_tokens: optional token-embedding table to reuse; a new one is created when omitted.
        """
        super().__init__(config)

        d_model = config.d_model

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
        self.embed_scale = 1.0 if not config.scale_embedding else math.sqrt(d_model)

        # Reuse the table handed in by the caller, otherwise create a private one.
        self.embed_tokens = (
            nn.Embedding(config.vocab_size, d_model, self.padding_idx) if embed_tokens is None else embed_tokens
        )

        self.embed_positions = BlenderbotSmallLearnedPositionalEmbedding(
            config.max_position_embeddings, d_model, self.padding_idx
        )
        # NOTE: no role / turn embedding tables are created in the encoder.
        encoder_layers = [BlenderbotSmallEncoderLayer(config) for _ in range(config.encoder_layers)]
        self.layers = nn.ModuleList(encoder_layers)
        self.layernorm_embedding = nn.LayerNorm(d_model)

        # Causal HMM head and the label embeddings it consumes.
        self.causalhmm = CausalHMM(config.d_model)
        # 6 emotion categories + 1 padding row (index 6). The attribute name (with its
        # spelling) is kept because it is part of the state-dict keys.
        self.emotion_embeding = nn.Embedding(6 + 1, config.d_model, padding_idx=6)
        # 8 strategies + 1 padding row (index 8).
        self.strategy_embedding = nn.Embedding(8 + 1, config.d_model, padding_idx=8)

        self.init_weights()
    
    # Extract the sentence-level ([CLS]) vectors from the encoder output.
    def extract_cls_vectors(self, input_ids, hidden_states, cls_token_id=54944, pad_token_id=0):
        """
        Gather the hidden state at every [CLS] position and the length of each [CLS]-delimited span.

        Args:
            input_ids: token ids, iterated row by row (one row per sample).
            hidden_states: encoder output indexed as ``hidden_states[row, positions]``.
            cls_token_id: id of the sentence-separator token (default 54944, the value the original
                code marked as verified against the tokenizer).
            pad_token_id: id treated as padding when locating the last real token (default 0).

        Returns:
            A pair ``(cls_vectors, sentence_lengths)`` of per-sample lists:

            - ``cls_vectors[i]`` has one row per [CLS] token found in sample ``i`` (zero rows if none).
            - ``sentence_lengths[i]`` holds, for each [CLS] token, the distance to the next [CLS]
              token; the final entry runs up to and including the last non-padding token.
        """
        hidden_dim = hidden_states.shape[-1]
        cls_vectors = []
        sentence_lengths = []

        for row, token_ids in enumerate(input_ids):
            cls_indices = (token_ids == cls_token_id).nonzero(as_tuple=True)[0]

            if cls_indices.numel() == 0:
                # Allocate the placeholder like ``hidden_states`` (same device and dtype) so it can
                # later be stacked/padded together with the vectors of the other samples.
                cls_vectors.append(hidden_states.new_zeros((0, hidden_dim)))
                sentence_lengths.append([])
                continue

            cls_vectors.append(hidden_states[row, cls_indices])

            # Move the positions to the host once instead of calling ``.item()`` per element.
            cls_positions = cls_indices.tolist()
            lengths = [cur - prev for prev, cur in zip(cls_positions, cls_positions[1:])]

            # Last sentence: from the final [CLS] to the last non-padding token (inclusive).
            non_pad_indices = (token_ids != pad_token_id).nonzero(as_tuple=True)[0]
            if non_pad_indices.numel() > 0:
                lengths.append(non_pad_indices[-1].item() - cls_positions[-1] + 1)

            sentence_lengths.append(lengths)

        return cls_vectors, sentence_lengths
     
    # Split the sentence vectors by speaker role, using role_ids.
    def extract_role_vectors_from_cls(self, role_ids, cls_vectors, sentence_lengths):
        """
        Split each sample's sentence vectors into seeker (role id 0) and supporter (role id 1) groups.

        The role of sentence ``i`` is read from ``role_ids`` at the running offset obtained by
        summing the preceding entries of ``sentence_lengths``. Sentences whose role id is neither
        0 nor 1 are dropped.

        Args:
            role_ids: per-sample sequence of role ids, indexed by token offset.
            cls_vectors: per-sample tensors with one row per sentence
                (as produced by :meth:`extract_cls_vectors`).
            sentence_lengths: per-sample list of sentence lengths.

        Returns:
            ``(seeker_tensors, supporter_tensors, seeker_mask, supporter_mask)``; the tensors are
            zero-padded along the sentence axis with the batch axis first, and the masks are
            ``torch.long`` tensors holding 1 for real sentences and 0 for padding.
        """
        seeker_tensors, supporter_tensors = [], []
        seeker_mask, supporter_mask = [], []

        for cls_vector, roles, lengths in zip(cls_vectors, role_ids, sentence_lengths):
            sample_seeker_vectors = []
            sample_supporter_vectors = []

            start_idx = 0
            for sent_idx, length in enumerate(lengths):
                # The role of a sentence is the role id stored at its first token offset.
                role_id = roles[start_idx]
                if role_id == 0:
                    sample_seeker_vectors.append(cls_vector[sent_idx])
                elif role_id == 1:
                    sample_supporter_vectors.append(cls_vector[sent_idx])
                start_idx = start_idx + length

            for vectors, tensors, masks in (
                (sample_seeker_vectors, seeker_tensors, seeker_mask),
                (sample_supporter_vectors, supporter_tensors, supporter_mask),
            ):
                if vectors:
                    tensors.append(torch.stack(vectors))
                else:
                    # Same device *and* dtype as the real vectors: ``pad_sequence`` takes its
                    # output options from the first element of the list.
                    tensors.append(cls_vector.new_empty((0, cls_vector.size(1))))
                # Explicit dtype: ``torch.tensor([])`` would be float32 and could silently turn
                # the whole padded mask into a float tensor when the first sample is empty.
                # The mask stays on the CPU, as before.
                masks.append(torch.ones(len(vectors), dtype=torch.long))

        # Pad to a rectangular batch (batch dimension first).
        pad_sequence = torch.nn.utils.rnn.pad_sequence
        seeker_tensors = pad_sequence(seeker_tensors, batch_first=True, padding_value=0)
        seeker_mask = pad_sequence(seeker_mask, batch_first=True, padding_value=0)

        supporter_tensors = pad_sequence(supporter_tensors, batch_first=True, padding_value=0)
        supporter_mask = pad_sequence(supporter_mask, batch_first=True, padding_value=0)

        return seeker_tensors, supporter_tensors, seeker_mask, supporter_mask

    def forward(
        self,
        input_ids=None,
        strategy_ids =None,
        emotion_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        role_ids=None,
        turn_ids=None,
        cls_position=None,
        next_strategy_id=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Encode the input and run the CausalHMM head over the sentence-level ([CLS]) vectors.

        Args:
            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Required: the positions of the [CLS] tokens
                are located in :obj:`input_ids` after encoding.

                Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See
                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
                for details.

                `What are input IDs? <../glossary.html#input-ids>`__
            strategy_ids (:obj:`torch.LongTensor`):
                Strategy labels, looked up in :obj:`self.strategy_embedding` (9 rows, row 8 is padding) and passed
                to the CausalHMM head. Required. Shape is not checked here -- presumably one id per supporter
                sentence; confirm against the data pipeline.
            emotion_ids (:obj:`torch.LongTensor`):
                Emotion labels, looked up in :obj:`self.emotion_embeding` (7 rows, row 6 is padding) and passed to
                the CausalHMM head. Required. Shape is not checked here -- presumably one id per seeker sentence;
                confirm against the data pipeline.
            attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                `What are attention masks? <../glossary.html#attention-mask>`__
            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
                Kept for interface compatibility. It cannot be combined with :obj:`input_ids`, and because
                :obj:`input_ids` is required, passing only :obj:`inputs_embeds` raises a :obj:`ValueError`.
            role_ids (:obj:`torch.LongTensor`):
                Per-token role ids used to assign each sentence vector to the seeker (0) or the supporter (1).
                Required. No role embedding is added to the encoder input.
            turn_ids, cls_position, next_strategy_id (`optional`):
                Accepted for interface compatibility; not used by this method.
            output_attentions (:obj:`bool`, `optional`):
                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
                returned tensors for more detail.
            output_hidden_states (:obj:`bool`, `optional`):
                Whether or not to collect the hidden states of all layers. They are only returned in the tuple
                (``return_dict=False``) output.
            return_dict (:obj:`bool`, `optional`):
                Whether or not to return a model-output object instead of a plain tuple.

        Returns:
            With ``return_dict=True`` a :obj:`CausalVAESeq2SeqModelOutput` carrying ``eps_mu``,
            ``eps_all_kl_loss`` and ``po_rec_all_loss`` from the CausalHMM head, ``last_hidden_state``,
            ``attentions``, and -- note -- the per-sample list of [CLS] vectors in the ``hidden_states`` field.
            Otherwise a tuple of the non-``None`` values among ``(last_hidden_state, encoder_states,
            all_attentions)``; the CausalHMM outputs are not part of the tuple.

        Raises:
            ValueError: if both or neither of :obj:`input_ids` / :obj:`inputs_embeds` are given, or if any of
                :obj:`input_ids`, :obj:`role_ids`, :obj:`strategy_ids`, :obj:`emotion_ids` is missing.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # The CausalHMM branch below needs all of these. Fail fast with a clear message instead of
        # running the whole encoder stack and then crashing with an opaque TypeError.
        required_inputs = (
            ("input_ids", input_ids),
            ("role_ids", role_ids),
            ("strategy_ids", strategy_ids),
            ("emotion_ids", emotion_ids),
        )
        missing = [name for name, value in required_inputs if value is None]
        if missing:
            raise ValueError(
                "BlenderbotSmallEncoder requires the following inputs for its CausalHMM head: "
                + ", ".join(missing)
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        embed_pos = self.embed_positions(input_shape)

        # Role / turn embeddings are intentionally not added in the encoder.
        hidden_states = inputs_embeds + embed_pos
        hidden_states = self.layernorm_embedding(hidden_states)
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)

        # expand attention_mask
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                layer_outputs = (None, None)
            else:
                if getattr(self.config, "gradient_checkpointing", False):

                    def create_custom_forward(module):
                        def custom_forward(*inputs):
                            return module(*inputs, output_attentions)

                        return custom_forward

                    layer_outputs = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(encoder_layer),
                        hidden_states,
                        attention_mask,
                    )
                else:
                    layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions)

                hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        # Sentence-level vectors, split by speaker role, feed the CausalHMM head.
        cls_vectors, sentence_lengths = self.extract_cls_vectors(input_ids, hidden_states)
        seeker_tensors, supporter_tensors, seeker_mask, supporter_mask = self.extract_role_vectors_from_cls(
            role_ids, cls_vectors, sentence_lengths
        )
        strategy_embs = self.strategy_embedding(strategy_ids)
        emotion_embs = self.emotion_embeding(emotion_ids)

        eps_mu, eps_all_kl_loss, po_rec_all_loss = self.causalhmm(
            strategy_embs, emotion_embs, seeker_tensors, supporter_tensors, seeker_mask, supporter_mask
        )

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)

        # ``hidden_states`` deliberately carries the [CLS] vectors rather than the per-layer states.
        return CausalVAESeq2SeqModelOutput(
            eps_mu=eps_mu, eps_all_kl_loss=eps_all_kl_loss, po_rec_all_loss=po_rec_all_loss,
            last_hidden_state=hidden_states, hidden_states=cls_vectors, attentions=all_attentions
        )

class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
    :class:`BlenderbotSmallDecoderLayer`

    Args:
        config: BlenderbotSmallConfig
        embed_tokens (torch.nn.Embedding): output embedding
    """

    def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embedding] = None):
        """
        Build the decoder stack together with its role and turn embedding tables.

        Args:
            config: model configuration; supplies sizes, dropout rates and the padding index.
            embed_tokens: optional token-embedding table to reuse; a new one is created when omitted.
        """
        super().__init__(config)

        d_model = config.d_model

        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.embed_scale = 1.0 if not config.scale_embedding else math.sqrt(d_model)

        # Reuse the table handed in by the caller, otherwise create a private one.
        self.embed_tokens = (
            nn.Embedding(config.vocab_size, d_model, self.padding_idx) if embed_tokens is None else embed_tokens
        )

        # Role table (2 rows) and turn table (128 rows).
        # NOTE(review): both reuse the token padding index as their ``padding_idx`` -- confirm
        # this is intended.
        self.wre = nn.Embedding(2, d_model, self.padding_idx)
        self.wte = nn.Embedding(128, d_model, self.padding_idx)

        self.embed_positions = BlenderbotSmallLearnedPositionalEmbedding(
            config.max_position_embeddings, d_model, self.padding_idx
        )
        decoder_layers = [BlenderbotSmallDecoderLayer(config) for _ in range(config.decoder_layers)]
        self.layers = nn.ModuleList(decoder_layers)
        self.layernorm_embedding = nn.LayerNorm(d_model)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        strategy_ids =None,
        emotion_ids=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        use_cache=None,
        role_ids=None,
        turn_ids=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Run the decoder stack over the target tokens, attending to the encoder output.

        Args:
            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
                Indices of the decoder input tokens in the vocabulary. Mutually exclusive with
                :obj:`inputs_embeds`.

                `What are input IDs? <../glossary.html#input-ids>`__
            attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
                Padding mask (1 = **not masked**, 0 = **masked**). It is only combined with the causal mask, so
                it has no effect when the input has a single position.

                `What are attention masks? <../glossary.html#attention-mask>`__
            strategy_ids, emotion_ids (`optional`):
                Accepted for interface compatibility; not used by this method.
            encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
                Output of the last encoder layer, used by the cross-attention of every decoder layer.
            encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
                Padding mask for the encoder positions (1 = **not masked**, 0 = **masked**). Only expanded when
                :obj:`encoder_hidden_states` is given as well.
            past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]`, one entry per decoder layer, `optional`):
                Precomputed key and value states of the attention blocks, used to speed up decoding. The length
                of the cached sequence is read from ``past_key_values[0][0].shape[2]``. When given, only the
                newest decoder inputs need to be passed.
            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
                Embedded inputs to use instead of looking up :obj:`input_ids`.
            use_cache (:obj:`bool`, `optional`):
                Whether to return the per-layer key/value cache. Must be false when gradient checkpointing is
                enabled.
            role_ids (:obj:`torch.LongTensor`, `optional`):
                Role ids; when given, their embedding from :obj:`self.wre` is added to the input.
            turn_ids (:obj:`torch.LongTensor`, `optional`):
                Turn ids; when given, their embedding from :obj:`self.wte` is added to the input.
            output_attentions (:obj:`bool`, `optional`):
                Whether or not to return the self- and cross-attention tensors of all layers.
            output_hidden_states (:obj:`bool`, `optional`):
                Whether or not to return the hidden states of all layers.
            return_dict (:obj:`bool`, `optional`):
                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
        """
        cfg = self.config
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if use_cache is None:
            use_cache = cfg.use_cache
        if return_dict is None:
            return_dict = cfg.use_return_dict

        # Exactly one of input_ids / inputs_embeds has to be supplied.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
        if input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        else:
            input_shape = inputs_embeds.size()[:-1]

        # Number of positions already held in the key/value cache.
        past_key_values_length = 0 if past_key_values is None else past_key_values[0][0].shape[2]

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        tgt_len = input_shape[-1]
        mask_dtype = inputs_embeds.dtype

        # Causal mask, optionally merged with the padding mask.
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        self_attn_mask = None
        if tgt_len > 1:
            self_attn_mask = _make_causal_mask(
                input_shape, mask_dtype, past_key_values_length=past_key_values_length
            ).to(self.device)
            if attention_mask is not None:
                self_attn_mask = self_attn_mask + _expand_mask(attention_mask, mask_dtype, tgt_len=tgt_len)

        # Cross-attention mask over the encoder positions.
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _expand_mask(encoder_attention_mask, mask_dtype, tgt_len=tgt_len)

        position_embeds = self.embed_positions(input_shape, past_key_values_length)
        role_embeds = self.wre(role_ids) if role_ids is not None else 0
        turn_embeds = self.wte(turn_ids) if turn_ids is not None else 0

        # BlenderbotSmall normalises the token embeddings before the other embeddings are added.
        inputs_embeds = self.layernorm_embedding(inputs_embeds)
        hidden_states = inputs_embeds + position_embeds + role_embeds + turn_embeds
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)

        # Accumulators for the optional outputs.
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for layer_idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # LayerDrop (see https://arxiv.org/abs/1909.11556): the draw happens for every layer,
            # the skip only while training.
            layerdrop_draw = random.uniform(0, 1)
            if self.training and layerdrop_draw < self.layerdrop:
                continue

            layer_past = None if past_key_values is None else past_key_values[layer_idx]

            if not getattr(self.config, "gradient_checkpointing", False):
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=self_attn_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    past_key_value=layer_past,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )
            else:
                if use_cache:
                    raise ValueError(
                        "When using `gradient_checkpointing, make sure that `use_cache=False` and `config.use_cache=False`."
                    )

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # the last positional input is None and stands in for past_key_value
                        return module(*inputs, output_attentions, use_cache)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),
                    hidden_states,
                    self_attn_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    None,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache follows the two attention tensors when those are returned.
                cache_slot = 3 if output_attentions else 1
                next_decoder_cache = next_decoder_cache + (layer_outputs[cache_slot],)

            if output_attentions:
                all_self_attns = all_self_attns + (layer_outputs[1],)
                all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # Hidden state coming out of the last decoder layer.
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None

        if return_dict:
            return BaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=hidden_states,
                past_key_values=next_cache,
                hidden_states=all_hidden_states,
                attentions=all_self_attns,
                cross_attentions=all_cross_attentions,
            )

        outputs = (hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions)
        return tuple(item for item in outputs if item is not None)

# Prior network: captures the seeker's emotion information, which flows from one utterance to the next.
class PriorNet(nn.Module):
    """
    Recurrent prior over the seeker-side latent state.

    At every step the current emotion embedding and seeker utterance vector are concatenated,
    projected to ``hidden_dim`` and fed through a ``GRUCell`` together with the previous latent
    state. Two linear heads map the new state to ``mu`` and ``logvar``.

    Args:
        utter_dim: feature size of the seeker utterance vectors.
        emotion_dim: feature size of the emotion embeddings; 512 when ``None``.
        hidden_dim: size of the latent / GRU state.
        norm_adj, SCM_model, dropout_prob, activation: accepted for interface compatibility;
            not used by this module.
        prior_type: recurrent cell type; only ``'GRU'`` is implemented.
        mu_type: only ``'share'`` is implemented.
        var_type: only ``'share'`` is implemented.

    Raises:
        ValueError: if ``prior_type``, ``mu_type`` or ``var_type`` is not a supported value.
    """

    def __init__(self,
                 utter_dim,
                 emotion_dim=None,
                 hidden_dim=64,
                 norm_adj =None,
                 SCM_model = None,
                 prior_type='GRU',
                 dropout_prob = None,
                 mu_type='share',
                 var_type='share',
                 activation = nn.ReLU(inplace=True)
                 ):
        super(PriorNet, self).__init__()

        # Fail at construction time: unsupported options used to build a module whose forward
        # pass crashed later with AttributeError / UnboundLocalError.
        if prior_type != 'GRU':
            raise ValueError(f"Unsupported prior_type {prior_type!r}; only 'GRU' is implemented")
        if mu_type != 'share':
            raise ValueError(f"Unsupported mu_type {mu_type!r}; only 'share' is implemented")
        if var_type != 'share':
            raise ValueError(f"Unsupported var_type {var_type!r}; only 'share' is implemented")

        self.utter_dim = utter_dim
        self.prior_type = prior_type
        self.mu_type = mu_type
        self.var_type = var_type
        # Honour the argument; 512 remains the default so existing callers are unaffected.
        self.emotion_dim = 512 if emotion_dim is None else emotion_dim

        self.emotion_feature_fusion = nn.Linear(self.emotion_dim + self.utter_dim, hidden_dim)
        self.prior_eps_sec_GRU = nn.GRUCell(hidden_dim, hidden_dim)

        self.prior_mu_FC = nn.Linear(hidden_dim, hidden_dim)
        self.prior_logvar_FC = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, emotion_cur, seeker_cur, z_pr_last, mask=None):
        """
        Advance the prior by one step for the rows selected by ``mask``.

        Args:
            emotion_cur: emotion embeddings of the current step, ``(batch, emotion_dim)``.
            seeker_cur: seeker utterance vectors of the current step, ``(batch, utter_dim)``.
            z_pr_last: latent state of the previous step, ``(batch, hidden_dim)``.
            mask: index selecting the active rows; every row is active when ``None``.
                NOTE(review): it is used directly for indexing, so a 0/1 integer tensor would be
                read as row indices -- callers presumably pass a boolean mask; confirm.

        Returns:
            ``(state, mu, logvar)``, each shaped like ``z_pr_last``; rows not selected by ``mask``
            are zero.
        """
        if mask is None:
            mask = torch.ones(emotion_cur.size(0), dtype=torch.bool, device=emotion_cur.device)

        active_emotion_cur = emotion_cur[mask]
        active_seeker_cur = seeker_cur[mask]
        active_z_pr_last = z_pr_last[mask]

        # Fuse emotion and utterance context into the GRU input.
        eps_emotion_input = torch.cat([active_emotion_cur, active_seeker_cur], dim=1)
        eps_emotion_fusion = self.emotion_feature_fusion(eps_emotion_input)

        # GRUCell(input, previous hidden state) -> hidden state of the current step.
        eps_sec_cur = self.prior_eps_sec_GRU(eps_emotion_fusion, active_z_pr_last)

        eps_mu = self.prior_mu_FC(eps_sec_cur)
        eps_logvar = self.prior_logvar_FC(eps_sec_cur)

        # Scatter the active rows back into full-batch tensors.
        full_eps_sec_cur = torch.zeros_like(z_pr_last)
        full_eps_mu = torch.zeros_like(z_pr_last)
        full_eps_logvar = torch.zeros_like(z_pr_last)

        full_eps_sec_cur[mask] = eps_sec_cur
        full_eps_mu[mask] = eps_mu
        full_eps_logvar[mask] = eps_logvar

        return full_eps_sec_cur, full_eps_mu, full_eps_logvar
    
# Posterior network: captures the flow of the supporter's strategy information.
class PosteriorNet(nn.Module):
    """
    Posterior over the supporter-side latent state, with a reconstruction head.

    The supporter utterance vector is encoded to ``hidden_dim``, concatenated with the strategy
    embedding, fused, combined with the previous latent state and mapped to ``mu`` / ``logvar``.
    A latent sample (or ``mu``) is decoded back to the concatenated
    ``[encoded utterance, strategy]`` vector, which gives the reconstruction loss.

    Args:
        utter_dim: feature size of the supporter utterance vectors.
        hidden_dim: size of the latent state; every internal layer is sized from this value.
        norm_adj, SCM_model: accepted for interface compatibility; not used by this module.
        posterior_type: only ``'FC'`` is implemented.
        mu_type: only ``'share'`` is implemented.
        var_type: only ``'share'`` is implemented.
        use_reparameterize: sample the latent with the reparameterisation trick when true,
            otherwise use ``mu`` directly.
        activation: stored on the module; not applied in the forward pass.
        strategy_dim: feature size of the strategy embeddings (default 512).

    Raises:
        ValueError: if ``posterior_type``, ``mu_type`` or ``var_type`` is not a supported value.
    """

    def __init__(self,
                 utter_dim,
                 hidden_dim = 512,
                 norm_adj =None,
                 SCM_model = None,
                 posterior_type ='FC',
                 mu_type = 'share',
                 var_type ='share',
                 use_reparameterize=True,
                 activation = nn.ReLU(inplace=True),
                 strategy_dim=512):
        super(PosteriorNet, self).__init__()

        # Fail at construction time: unsupported options used to build a module whose forward
        # pass crashed later with AttributeError / UnboundLocalError.
        if posterior_type != 'FC':
            raise ValueError(f"Unsupported posterior_type {posterior_type!r}; only 'FC' is implemented")
        if mu_type != 'share':
            raise ValueError(f"Unsupported mu_type {mu_type!r}; only 'share' is implemented")
        if var_type != 'share':
            raise ValueError(f"Unsupported var_type {var_type!r}; only 'share' is implemented")

        self.posterior_type = posterior_type
        self.mu_type = mu_type
        self.var_type = var_type
        self.strategy_dim = strategy_dim
        self.utter_dim = utter_dim

        # Previously hard-coded to 64 while other layers used the ``hidden_dim`` argument, so the
        # module only worked for ``hidden_dim == 64``. All layers now share one value; shapes
        # for ``hidden_dim == 64`` are unchanged.
        self.hidden_dim = hidden_dim

        self.activation = activation
        self.use_reparameterize = use_reparameterize

        self.posterior_supporter_encoder = nn.Sequential(nn.Linear(self.utter_dim, self.hidden_dim))

        self.strategy_feature_fusion = nn.Linear(self.hidden_dim + self.strategy_dim, self.hidden_dim)
        self.posterior_eps_strategy = nn.Sequential(
            nn.Linear(self.hidden_dim * 2, self.hidden_dim),
            nn.ReLU())

        self.posterior_mu_FC = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.posterior_logvar_FC = nn.Linear(self.hidden_dim, self.hidden_dim)

        # Decode the latent back to the fused input. The reconstruction target is
        # [encoded utterance, strategy embedding], hence ``hidden_dim + strategy_dim`` outputs
        # (identical to the previous ``hidden_dim + utter_dim`` whenever utter_dim == strategy_dim,
        # the only configuration in which the loss shapes matched).
        self.posterior_seeker_decoder = nn.Sequential(
            nn.Linear(self.hidden_dim, self.hidden_dim // 4),
            nn.ReLU(),
            nn.Linear(self.hidden_dim // 4, self.hidden_dim + self.strategy_dim),
        )

    def reparameterize(self, mu, logvar, test):
        """Return ``mu + eps * std`` with ``eps ~ N(0, I)``, or ``mu`` itself when ``test`` is true."""
        if test:
            return mu
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(mu)
        return eps.mul(std).add_(mu)

    def rec_loss_func(self, input_emb, rec_emb):
        """Mean Smooth-L1 loss between the fused input and its reconstruction."""
        return nn.SmoothL1Loss(reduction='mean')(input_emb, rec_emb)

    def forward(self, strategy_cur, supporter_cur, z_po_last, mask, test=False):
        """
        Advance the posterior by one step for the rows selected by ``mask``.

        Args:
            strategy_cur: strategy embeddings of the current step, ``(batch, strategy_dim)``.
            supporter_cur: supporter utterance vectors of the current step, ``(batch, utter_dim)``.
            z_po_last: latent state of the previous step, ``(batch, hidden_dim)``.
            mask: index selecting the active rows; every row is active when ``None``.
                NOTE(review): it is used directly for indexing, so a 0/1 integer tensor would be
                read as row indices -- callers presumably pass a boolean mask; confirm.
            test: when true the latent is ``mu`` instead of a random sample.

        Returns:
            ``(mu, logvar, z, rec_loss)``; the first three are shaped like ``z_po_last`` with
            zeros in the rows not selected by ``mask``, ``rec_loss`` is a scalar tensor.
        """
        if mask is None:
            mask = torch.ones(strategy_cur.size(0), dtype=torch.bool, device=strategy_cur.device)

        active_strategy_cur = strategy_cur[mask]
        active_supporter_cur = supporter_cur[mask]
        active_z_po_last = z_po_last[mask]

        supporter_features = self.posterior_supporter_encoder(active_supporter_cur)

        eps_strategy_input = torch.cat([supporter_features, active_strategy_cur], dim=1)
        eps_strategy_fusion = self.strategy_feature_fusion(eps_strategy_input)

        eps = self.posterior_eps_strategy(torch.cat([eps_strategy_fusion, active_z_po_last], dim=1))

        eps_mu = self.posterior_mu_FC(eps)
        eps_logvar = self.posterior_logvar_FC(eps)

        if self.use_reparameterize:
            z_cur = self.reparameterize(eps_mu, eps_logvar, test)
        else:
            z_cur = eps_mu

        supporter_rec = self.posterior_seeker_decoder(z_cur)

        if supporter_rec.size(0) == 0:
            # A mean over zero rows is NaN and would poison the total loss; contribute an
            # exact zero that is still attached to the graph.
            rec_loss = supporter_rec.sum() * 0.0
        else:
            rec_loss = self.rec_loss_func(eps_strategy_input, supporter_rec)

        # Scatter the active rows back into full-batch tensors.
        full_eps_logvar = torch.zeros_like(z_po_last)
        full_eps_mu = torch.zeros_like(z_po_last)
        full_z_cur = torch.zeros_like(z_po_last)

        full_eps_logvar[mask] = eps_logvar
        full_eps_mu[mask] = eps_mu
        full_z_cur[mask] = z_cur

        return full_eps_mu, full_eps_logvar, full_z_cur, rec_loss

class CausalHMM(nn.Module):
    """Step-wise prior/posterior latent module over seeker and supporter turns.

    For every time step the module runs ``PriorNet`` on the seeker side and
    ``PosteriorNet`` on the supporter side, feeds each side's latent mean back in
    as that side's hidden input for the next step, and accumulates the posterior
    reconstruction loss plus a term computed by ``kl_normal_log`` from the
    posterior and prior means / log-variances.

    Args:
        utter_dim: Feature size of the two projection layers applied to the emotion
            and strategy embeddings; also forwarded to both sub-networks.
        hidden_dim: Size of the latent state carried between time steps.
        norm_adj: Forwarded unchanged to ``PriorNet`` and ``PosteriorNet``.
        SCM_type: Accepted but not used; both sub-networks are built with
            ``SCM_model='nonlinear'``.
        nonlinear_activation: Accepted but not used.
        prior_type: Forwarded to ``PriorNet``.
        posterior_type: Forwarded to ``PosteriorNet``.
        dropout_prob: Forwarded to ``PriorNet``.
        mu_type: Forwarded to both sub-networks.
        var_type: Forwarded to both sub-networks.
        use_reparameterize: Forwarded to ``PosteriorNet``.
        activation_type: Forwarded to both sub-networks as ``activation``.
        pred_z_init: Stored on the instance; not read inside this class.
    """

    def __init__(self,
                utter_dim,
                hidden_dim=64,
                norm_adj=None,
                SCM_type='nonlinear',
                nonlinear_activation='relu',
                prior_type='GRU',
                posterior_type='FC',
                dropout_prob=0.3,
                mu_type='share',
                var_type='share',
                use_reparameterize=True,
                activation_type='relu',
                pred_z_init = 'origin',
                ):
        super(CausalHMM, self).__init__()

        self.utter_dim = utter_dim
        self.hidden_dim = hidden_dim
        self.var_type = var_type
        self.norm_adj = norm_adj
        self.pred_z_init = pred_z_init
        self.activation = activation_type

        # NOTE(review): SCM_type and nonlinear_activation are ignored; SCM_model is
        # hard-coded to 'nonlinear' for both sub-networks. Confirm this is intended.
        self.PriorNet = PriorNet(
            utter_dim=self.utter_dim,
            hidden_dim=hidden_dim,
            norm_adj=norm_adj,
            SCM_model='nonlinear',
            prior_type=prior_type,
            dropout_prob=dropout_prob,
            var_type=var_type,
            mu_type=mu_type,
            activation=self.activation,
        )

        self.PosteriorNet = PosteriorNet(
            utter_dim=self.utter_dim,
            hidden_dim=hidden_dim,
            norm_adj=norm_adj,
            SCM_model='nonlinear',
            posterior_type=posterior_type,
            mu_type=mu_type,
            var_type=var_type,
            use_reparameterize=use_reparameterize,
            activation=self.activation,
        )

        # Projections applied to the emotion / strategy embeddings before each step.
        self.prior_emotion_FC = nn.Linear(self.utter_dim, self.utter_dim)
        self.posterior_strategy_FC = nn.Linear(self.utter_dim, self.utter_dim)

    def pad_and_mask(self, tensors, max_len):
        """Zero-pad ``tensors`` along the time axis and build the matching mask.

        Args:
            tensors: Tensor of shape ``(batch, seq_len, hidden)``.
            max_len: Target length of the time axis (expected to be ``>= seq_len``).

        Returns:
            Tuple ``(padded, mask)``: ``padded`` has shape ``(batch, max_len, hidden)``
            with the input copied into the first ``seq_len`` steps; ``mask`` is a
            boolean ``(batch, max_len)`` tensor that is ``True`` on those steps.
        """
        batch_size, seq_len, hidden_size = tensors.shape
        padded_tensors = torch.zeros((batch_size, max_len, hidden_size), device=tensors.device)
        mask = torch.zeros((batch_size, max_len), dtype=torch.bool, device=tensors.device)
        padded_tensors[:, :seq_len, :] = tensors
        mask[:, :seq_len] = True

        return padded_tensors, mask

    def forward(self, strategy_embs, emotion_embs, seeker_tensors, supporter_tensors, seeker_mask, supporter_mask, test=False):
        """Roll the prior and posterior networks over time and collect the losses.

        Args:
            strategy_embs: Strategy embeddings, indexed as ``[:, t, :]`` per step.
            emotion_embs: Emotion embeddings, indexed as ``[:, t, :]`` per step.
            seeker_tensors: Seeker features of shape ``(batch, seeker_len, dim)``.
            supporter_tensors: Supporter features of shape ``(batch, supporter_len, dim)``.
            seeker_mask: Currently overwritten by the mask built in ``pad_and_mask``.
            supporter_mask: Currently overwritten by the mask built in ``pad_and_mask``.
            test: Forwarded to ``PosteriorNet``.

        Returns:
            Tuple ``(final_eps_mu, po_eps_all_kl_loss, po_rec_all_loss)``:
            ``final_eps_mu`` has shape ``(batch, 2 * hidden_dim)`` (prior mean followed
            by posterior mean, each taken at the sample's last active step);
            ``po_eps_all_kl_loss`` is the accumulated ``kl_normal_log`` term divided by
            the number of steps; ``po_rec_all_loss`` is the summed reconstruction loss.
            Both losses have shape ``(1,)``.
        """
        batch, _, _ = seeker_tensors.shape

        max_time_seeker = seeker_tensors.shape[1]
        max_time_supporter = supporter_tensors.shape[1]
        max_time = max(max_time_seeker, max_time_supporter)

        # NOTE(review): the masks passed in by the caller are discarded here and
        # replaced by masks that mark every step below the unpadded length as
        # active for the whole batch. Confirm this is intended.
        seeker_tensors, seeker_mask = self.pad_and_mask(seeker_tensors, max_time)
        supporter_tensors, supporter_mask = self.pad_and_mask(supporter_tensors, max_time)

        device = seeker_tensors.device

        pr_hidden_last = torch.zeros((batch, self.hidden_dim), device=device)
        po_hidden_last = torch.zeros((batch, self.hidden_dim), device=device)

        final_po_eps_mu = torch.zeros((batch, self.hidden_dim), device=device)
        final_pr_eps_mu = torch.zeros((batch, self.hidden_dim), device=device)

        updated_pr_eps_mu = torch.zeros((batch, max_time, self.hidden_dim), device=device)
        updated_po_eps_mu = torch.zeros((batch, max_time, self.hidden_dim), device=device)

        # Track each sample's last active step so every sample ends up with a valid
        # representation. Allocated on the same device as the masks that index them.
        last_active_time_seeker = torch.full((batch,), -1, dtype=torch.long, device=device)
        last_active_time_supporter = torch.full((batch,), -1, dtype=torch.long, device=device)

        po_eps_all_kl_loss = torch.zeros(1, device=device)
        po_rec_all_loss = torch.zeros(1, device=device)

        # Temporary handling (dated 5-29 in the original note): placeholders used
        # until the posterior branch has run at least once.
        rec_loss = torch.zeros(1, device=device)
        po_eps_mu, po_eps_logvar = torch.zeros(1, device=device), torch.zeros(1, device=device)

        for t in range(max_time):
            active_seekers = seeker_mask[:, t, ].bool()
            active_supporters = supporter_mask[:, t, ].bool()

            last_active_time_seeker[active_seekers] = t
            last_active_time_supporter[active_supporters] = t

            # The full batch is passed together with the mask: indexing with
            # seeker_tensors[active_seekers, t, :] would change the batch dimension,
            # since a different number of rows can be active at different steps.
            if active_seekers.any():
                prior_emotion_features = self.prior_emotion_FC(emotion_embs[:, t, :])
                pr_eps_mu_cur, pr_eps_mu, pr_eps_logvar= self.PriorNet(
                    prior_emotion_features, seeker_tensors[:, t, :], pr_hidden_last, mask=active_seekers)

                pr_hidden_last[active_seekers] = pr_eps_mu[active_seekers]
                # Record the prior latent for this step.
                updated_pr_eps_mu[:, t, :][active_seekers] = pr_eps_mu[active_seekers]

            if active_supporters.any():
                posterior_strategy_features = self.posterior_strategy_FC(strategy_embs[:, t, :])
                po_eps_mu, po_eps_logvar, po_z_cur, rec_loss = self.PosteriorNet(
                    posterior_strategy_features, supporter_tensors[:, t, :], po_hidden_last, mask=active_supporters,  test=test)

                po_hidden_last[active_supporters] = po_eps_mu[active_supporters]

                # Record the posterior latent for this step.
                updated_po_eps_mu[:, t, :][active_supporters] = po_eps_mu[active_supporters]

            # NOTE(review): on steps where a side is inactive, the values from its
            # most recent active step are reused (and rec_loss is added again).
            # pr_eps_mu / pr_eps_logvar are unbound if no seeker step has run yet.
            po_rec_all_loss += rec_loss

            po_eps_all_kl_loss += kl_normal_log(po_eps_mu, po_eps_logvar,
                                                pr_eps_mu, pr_eps_logvar).sum()

        # Pick each sample's latent at its last active step, in one vectorized gather
        # instead of a per-sample Python loop. Samples never active keep zeros.
        batch_index = torch.arange(batch, device=device)
        seeker_seen = last_active_time_seeker != -1
        supporter_seen = last_active_time_supporter != -1
        final_pr_eps_mu[seeker_seen] = updated_pr_eps_mu[
            batch_index[seeker_seen], last_active_time_seeker[seeker_seen]
        ]
        final_po_eps_mu[supporter_seen] = updated_po_eps_mu[
            batch_index[supporter_seen], last_active_time_supporter[supporter_seen]
        ]

        final_eps_mu = torch.cat((final_pr_eps_mu, final_po_eps_mu), dim=1)

        po_eps_all_kl_loss = po_eps_all_kl_loss / max_time

        return final_eps_mu, po_eps_all_kl_loss, po_rec_all_loss
               
         
@add_start_docstrings(
    "The bare BlenderbotSmall Model outputting raw hidden-states without any specific head on top.",
    BLENDERBOT_SMALL_START_DOCSTRING,
)
class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
    # Encoder/decoder pair sharing one token embedding. The first element of the
    # encoder output (``eps_mu``) is projected to ``d_model`` and injected into the
    # decoder input at the second position.

    def __init__(self, config: BlenderbotSmallConfig):
        super().__init__(config)

        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
        self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)

        self.encoder = BlenderbotSmallEncoder(config, self.shared)
        self.decoder = BlenderbotSmallDecoder(config, self.shared)

        # Projects the latent mean returned by the encoder to the decoder width.
        # NOTE(review): 128 is hard-coded; presumably 2 * hidden_dim of the latent
        # module used by the encoder (prior mean + posterior mean) - TODO confirm.
        self.transform_for_eps_mu = nn.Linear(128 ,config.d_model)

        self.init_weights()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, value):
        # Keep encoder and decoder pointing at the same embedding module.
        self.shared = value
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        strategy_ids =None,
        emotion_ids=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        encoder_outputs=None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_role_ids=None,
        decoder_turn_ids=None,
        role_ids=None,
        turn_ids=None,
        decoder_inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Returns:

        Example::

            >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallModel

            >>> model = BlenderbotSmallModel.from_pretrained("facebook/blenderbot_small-90M")
            >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M")

            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
            >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
            >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

            >>> last_hidden_states = outputs.last_hidden_state
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                strategy_ids =strategy_ids,
                emotion_ids=emotion_ids,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                role_ids=role_ids,
                turn_ids=turn_ids,
                return_dict=return_dict,
            )

        # If the user passed a tuple for encoder_outputs, wrap it in a
        # CausalVAESeq2SeqModelOutput when return_dict=True. The tuple layout is
        # (eps_mu, eps_all_kl_loss, po_rec_all_loss, last_hidden_state[, hidden_states[, attentions]]).
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = CausalVAESeq2SeqModelOutput(
                eps_mu=encoder_outputs[0],
                eps_all_kl_loss=encoder_outputs[1],
                po_rec_all_loss=encoder_outputs[2],
                last_hidden_state=encoder_outputs[3],
                hidden_states=encoder_outputs[4] if len(encoder_outputs) > 4 else None,
                attentions=encoder_outputs[5] if len(encoder_outputs) > 5 else None,
            )

        # Project the latent mean to the decoder width and give it a length-1 time axis.
        eps_mu = encoder_outputs[0]
        eps_mu = self.transform_for_eps_mu(eps_mu).unsqueeze(1)

        # Length of the decoder input, taken from whichever form the caller supplied.
        if decoder_input_ids is not None:
            decoder_len = decoder_input_ids.shape[1]
        elif decoder_inputs_embeds is not None:
            decoder_len = decoder_inputs_embeds.shape[1]
        else:
            decoder_len = 0

        # Feed the latent representation into the decoder.
        new_decoder_inputs_embeds = None
        if past_key_values is None:
            if decoder_len > 1:  # train: the full target sequence is available
                if decoder_inputs_embeds is None:
                    decoder_inputs_embeds = self.decoder.embed_tokens(decoder_input_ids) * self.decoder.embed_scale
                # Overwrite the embedding at the second position (index 1) with eps_mu.
                new_decoder_inputs_embeds = torch.cat(
                    [decoder_inputs_embeds[:, :1, :], eps_mu, decoder_inputs_embeds[:, 2:, :]], dim=1
                )
        elif (
            decoder_input_ids is not None
            and decoder_len == 1
            and decoder_input_ids[0, 0].item() == 0
            and past_key_values[0][0].shape[2] < 3
        ):  # test: incremental decoding, fewer than 3 cached positions, current token id 0
            # The token embedding is not needed here: eps_mu replaces it entirely.
            new_decoder_inputs_embeds = eps_mu

        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids if new_decoder_inputs_embeds is None else None,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[3],
            encoder_attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds if new_decoder_inputs_embeds is None else new_decoder_inputs_embeds,
            use_cache=use_cache,
            role_ids=decoder_role_ids,
            turn_ids=decoder_turn_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return VAESeq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
            eps_all_kl_loss=encoder_outputs.eps_all_kl_loss,
            po_rec_all_loss=encoder_outputs.po_rec_all_loss,
        )

        # return Seq2SeqModelOutput(
        #     last_hidden_state=decoder_outputs.last_hidden_state,
        #     past_key_values=decoder_outputs.past_key_values,
        #     decoder_hidden_states=decoder_outputs.hidden_states,
        #     decoder_attentions=decoder_outputs.attentions,
        #     cross_attentions=decoder_outputs.cross_attentions,
        #     encoder_last_hidden_state=encoder_outputs.last_hidden_state,
        #     encoder_hidden_states=encoder_outputs.hidden_states,
        #     encoder_attentions=encoder_outputs.attentions,
        # )


@add_start_docstrings(
    "The BlenderbotSmall Model with a language modeling head. Can be used for summarization.",
    BLENDERBOT_SMALL_START_DOCSTRING,
)
class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel):
    base_model_prefix = "model"
    _keys_to_ignore_on_load_missing = [
        r"final_logits_bias",
        r"encoder\.version",
        r"decoder\.version",
        r"lm_head\.weight",
    ]

    def __init__(self, config: BlenderbotSmallConfig):
        super().__init__(config)
        self.model = BlenderbotSmallModel(config)

        # The LM head and its additive bias buffer both span the shared vocabulary.
        vocab_entries = self.model.shared.num_embeddings
        self.register_buffer("final_logits_bias", torch.zeros((1, vocab_entries)))
        self.lm_head = nn.Linear(config.d_model, vocab_entries, bias=False)

        self.init_weights()

    def get_encoder(self):
        """Return the encoder of the wrapped base model."""
        return self.model.get_encoder()

    def get_decoder(self):
        """Return the decoder of the wrapped base model."""
        return self.model.get_decoder()

    def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
        """Resize the token embeddings and keep ``final_logits_bias`` the same width."""
        resized_embeddings = super().resize_token_embeddings(new_num_tokens)
        self._resize_final_logits_bias(new_num_tokens)
        return resized_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        """Truncate or zero-extend ``final_logits_bias`` to ``new_num_tokens`` columns."""
        current_bias = self.final_logits_bias
        old_num_tokens = current_bias.shape[-1]
        if new_num_tokens > old_num_tokens:
            zero_tail = torch.zeros((1, new_num_tokens - old_num_tokens), device=current_bias.device)
            resized_bias = torch.cat([current_bias, zero_tail], dim=1)
        else:
            resized_bias = current_bias[:, :new_num_tokens]
        self.register_buffer("final_logits_bias", resized_bias)

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head."""
        self.lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    @add_end_docstrings(BLENDERBOT_SMALL_GENERATION_EXAMPLE)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        strategy_ids =None,
        emotion_ids=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        encoder_outputs=None,
        past_key_values=None,
        inputs_embeds=None,
        role_ids=None,
        decoder_role_ids=None,
        turn_ids=None,
        decoder_turn_ids=None,
        decoder_inputs_embeds=None,
        labels=None,
        use_cache=None,
        cls_position=None,
        next_strategy_id=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
            config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.

        Returns:
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if labels is not None:
            # With labels present the decoder input is always a right-shifted sequence:
            # the labels themselves when no decoder ids were given, otherwise the given ids.
            shift_source = labels if decoder_input_ids is None else decoder_input_ids
            decoder_input_ids = shift_tokens_right(
                shift_source, self.config.pad_token_id, self.config.decoder_start_token_id
            )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            strategy_ids = strategy_ids,
            emotion_ids = emotion_ids,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            role_ids=role_ids,
            decoder_role_ids=decoder_role_ids,
            turn_ids=turn_ids,
            decoder_turn_ids=decoder_turn_ids,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        decoder_states = outputs[0]
        lm_logits = self.lm_head(decoder_states) + self.final_logits_bias

        masked_lm_loss = None
        if labels is not None:
            flat_logits = lm_logits.view(-1, self.config.vocab_size)
            flat_labels = labels.view(-1)
            masked_lm_loss = CrossEntropyLoss()(flat_logits, flat_labels)

        if not return_dict:
            # Tuple output: logits replace the decoder hidden states; loss leads when present.
            output = (lm_logits,) + outputs[1:]
            if masked_lm_loss is None:
                return output
            return (masked_lm_loss,) + output

        return VAESeq2SeqLMOutput(
            loss=masked_lm_loss,
            eps_all_kl_loss=outputs.eps_all_kl_loss,
            po_rec_all_loss=outputs.po_rec_all_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_inputs_for_generation(
        self, decoder_input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
    ):
        """Assemble the keyword arguments for one decoding step of generation."""
        # Once a cache exists only the most recent token has to be fed to the decoder.
        if past is not None:
            decoder_input_ids = decoder_input_ids[:, -1:]

        step_inputs = {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }
        return step_inputs

    def adjust_logits_during_generation(self, logits, cur_len, max_length):
        """Force EOS on the final generation step when an EOS token id is configured."""
        is_final_step = cur_len == max_length - 1
        if is_final_step and self.config.eos_token_id is not None:
            self._force_token_id_to_be_generated(logits, self.config.eos_token_id)
        return logits

    @staticmethod
    def _force_token_id_to_be_generated(scores, token_id) -> None:
        """force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))"""
        # In-place: every vocabulary column except ``token_id`` is set to -inf.
        other_columns = [col for col in range(scores.shape[1]) if col != token_id]
        scores[:, other_columns] = -float("inf")

    @staticmethod
    def _reorder_cache(past, beam_idx):
        """Reorder the cached self-attention states of every layer to follow ``beam_idx``."""
        reordered_past = ()
        for layer_past in past:
            # cached cross_attention states don't have to be reordered -> they are always the same
            self_attn_states = tuple(state.index_select(0, beam_idx) for state in layer_past[:2])
            reordered_past += (self_attn_states + layer_past[2:],)
        return reordered_past