import torch
from torch import nn
from transformers import BertForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput


class BertNonlinearClassificationHead(nn.Module):
    """Nonlinear head for sentence-level classification.

    Layer stack: dropout -> linear (hidden_size -> hidden_size) -> tanh ->
    dropout -> linear (hidden_size -> num_labels). Modelled on
    RobertaClassificationHead.

    Args:
        config: Object providing ``hidden_size``, ``num_labels``,
            ``classifier_dropout`` and ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        # Submodules are created in the order dense -> dropout -> out_proj.
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)

        # An explicit classifier dropout wins; otherwise reuse the hidden dropout rate.
        drop_prob = config.classifier_dropout
        if drop_prob is None:
            drop_prob = config.hidden_dropout_prob
        self.dropout = nn.Dropout(drop_prob)

        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """Map ``features`` to classification logits.

        Args:
            features: Input representation (the pooled [CLS] token); its last
                dimension must equal ``hidden_size``.
            **kwargs: Accepted and ignored.

        Returns:
            Tensor whose last dimension is ``num_labels``.
        """
        # The same dropout module is applied before each linear layer.
        hidden = torch.tanh(self.dense(self.dropout(features)))
        return self.out_proj(self.dropout(hidden))


class BertForNonlinearSequenceClassification(BertForSequenceClassification):
    """BertForSequenceClassification with a BertNonlinearClassificationHead as ``classifier``."""

    def __init__(self, config):
        super().__init__(config)
        # Override the ``classifier`` attribute with the nonlinear head.
        nonlinear_head = BertNonlinearClassificationHead(config)
        self.classifier = nonlinear_head
        # post_init is invoked after the swap; presumably so the new head goes
        # through the library's weight initialisation -- confirm against
        # transformers' PreTrainedModel.post_init.
        super().post_init()


class BertForFactualCounterfactualSequenceClassification(BertForNonlinearSequenceClassification):
    """Score a factual and a counterfactual input with the same classifier.

    Each forward call runs the inherited sequence classifier twice -- once on
    the ``*_factual`` arguments and once on the un-suffixed (counterfactual)
    arguments -- and stacks the two logit tensors along a new dimension 1,
    counterfactual first.
    """

    def __init__(self, config):
        super().__init__(config)
        super().post_init()

    def forward(
        self,
        input_ids_factual=None,
        input_ids=None,
        attention_mask_factual=None,
        attention_mask=None,
        token_type_ids_factual=None,
        token_type_ids=None,
        position_ids_factual=None,
        position_ids=None,
        head_mask_factual=None,
        head_mask=None,
        inputs_embeds_factual=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the classifier on both inputs and stack their logits.

        Args:
            input_ids_factual, attention_mask_factual, token_type_ids_factual,
            position_ids_factual, head_mask_factual, inputs_embeds_factual:
                Inputs of the factual example, forwarded to the parent
                ``forward`` under their un-suffixed names.
            input_ids, attention_mask, token_type_ids, position_ids,
            head_mask, inputs_embeds:
                Inputs of the counterfactual example, forwarded unchanged.
            labels, output_attentions, output_hidden_states:
                Forwarded to both parent calls.
            return_dict: If ``None``, falls back to
                ``self.config.use_return_dict``. When false, a one-element
                tuple ``(logits,)`` is returned instead of an output object.

        Returns:
            ``SequenceClassifierOutput`` (or ``(logits,)``) where ``logits``
            is the stack of the two parent logit tensors along dimension 1:
            index 0 is the counterfactual input, index 1 the factual one.
            Assuming the parent yields ``(batch, num_labels)`` logits, the
            result is ``(batch, 2, num_labels)``.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # The inner calls must return output objects because ``.logits`` is
        # read below; passing the caller's ``return_dict=False`` through would
        # yield tuples and raise AttributeError.
        # NOTE(review): the loss, hidden states and attentions produced by the
        # parent calls are discarded -- only logits are returned. Confirm
        # whether that is intended before relying on ``labels`` here.
        shared_kwargs = dict(
            labels=labels,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        output_factual = super().forward(
            input_ids=input_ids_factual,
            attention_mask=attention_mask_factual,
            token_type_ids=token_type_ids_factual,
            position_ids=position_ids_factual,
            head_mask=head_mask_factual,
            inputs_embeds=inputs_embeds_factual,
            **shared_kwargs,
        )
        output_counterfactual = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            **shared_kwargs,
        )

        # Counterfactual logits go first, factual second.
        logits = torch.stack(
            (output_counterfactual.logits, output_factual.logits), dim=1
        )

        if not return_dict:
            return (logits,)
        return SequenceClassifierOutput(logits=logits)
