from __future__ import print_function, division
import math
import gc

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data

from ..shared import up_conv 
from .lib.functional import dotproduction2, aggregation

class attention_conv_v(nn.Module):
    """Attention layer: 1x1 embedding followed by attention-weighted aggregation.

    The input is first embedded to ``out_channels`` with a 1x1 conv + BN +
    ReLU.  Query/key features are produced by 1x1 projection stacks, scored
    with the project helper ``dotproduction2``, soft-maxed, and finally handed
    to the project helper ``aggregation`` together with the embedded input.

    NOTE(review): ``dotproduction2`` and ``aggregation`` are defined outside
    this file; the shape remarks in the comments below are carried over from
    the original author's annotations and should be confirmed against them.

    Args:
        in_channels: channel count of the incoming tensor.
        out_channels: channel count produced by the 1x1 embedding.
        kernel_size: forwarded to ``dotproduction2`` / ``aggregation``; the
            padding handed to ``aggregation`` is ``(kernel_size - 1) // 2``.
        att_hidden: length of each head's feature vector.
        att_mh: number of heads.
        att_sm: factor multiplied onto the scores before the softmax.
        att_two_w: when truthy, keys get their own projection stack;
            otherwise the query stack is reused for keys.
        visualization: when truthy, ``forward`` additionally returns detached
            copies of the embedded input and the attention weights.
    """

    def __init__(self, in_channels, out_channels, kernel_size, att_hidden, att_mh, att_sm, att_two_w, visualization):
        super(attention_conv_v, self).__init__()

        # Plain hyper-parameters.
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.padding = (kernel_size - 1) // 2
        self.att_hidden = att_hidden
        self.att_mh = att_mh
        self.att_out = att_hidden * self.att_mh
        self.att_sm = att_sm
        self.att_two_w = att_two_w
        self.visualization = visualization

        proj_width = self.att_out

        def build_projection():
            # conv -> BN -> ReLU -> conv, all 1x1; the last conv is the
            # attention "linear" layer.
            return nn.Sequential(
                nn.Conv2d(out_channels, proj_width, kernel_size=1),
                nn.BatchNorm2d(proj_width),
                nn.ReLU(inplace=True),
                nn.Conv2d(proj_width, proj_width, kernel_size=1),
            )

        # Sub-modules are created in the order x_mlp, att_wq, att_wk so that
        # parameter registration (and seeded initialisation) is unchanged.
        self.x_mlp = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )
        self.att_wq = build_projection()
        # Without a second weight set the key projection aliases the query one.
        self.att_wk = build_projection() if att_two_w else self.att_wq

    def get_att_hidden(self, x, w):
        """Project ``x`` with ``w`` and L2-normalise every head's vector.

        Args:
            x: 4-D tensor (batch, channels, height, width).
            w: projection module whose output has ``att_hidden * att_mh``
                channels.

        Returns:
            Tensor of shape (batch, att_hidden * att_mh, height, width) in
            which, after viewing the channels as (att_hidden, att_mh), each
            vector along the ``att_hidden`` axis has been divided by its
            Euclidean norm plus 1e-6.
        """
        batch, _, height, width = x.shape
        heads = w(x).view(batch, self.att_hidden, self.att_mh, height, width)
        # The epsilon keeps the division finite for all-zero vectors.
        norm = heads.pow(2).sum(dim=1, keepdim=True).sqrt()
        heads = heads / (norm + 1e-6)
        return heads.view(batch, -1, height, width)

    def forward(self, x):
        """Run the attention convolution.

        Returns:
            The result of ``aggregation`` when ``visualization`` is falsy;
            otherwise the tuple ``(aggregated, embedded_x.detach(),
            attention.detach())``.
        """
        x = self.x_mlp(x)  # B, Cout, H, W
        batch, _, height, width = x.shape

        query = self.get_att_hidden(x, self.att_wq)
        # With a single weight set the query features double as keys, so the
        # projection stack is only evaluated once.
        key = self.get_att_hidden(x, self.att_wk) if self.att_two_w else query

        scores = dotproduction2(query, key, self.kernel_size)
        # Author's annotation: B, att_hidden, att_mh, K^2, HW
        scores = scores.view(batch, self.att_hidden, self.att_mh, -1, height * width)

        # Collapse the hidden axis, then normalise over the second-to-last
        # axis (author's annotation: B, att_mh, K^2, HW).
        weights = F.softmax(scores.sum(dim=1) * self.att_sm, dim=-2)

        aggregated = aggregation(x, weights, kernel_size=self.kernel_size, padding=self.padding)
        if not self.visualization:
            return aggregated
        return aggregated, x.detach(), weights.detach()


class attention_conv_block_v(nn.Module):
    """Two stacked ``attention_conv_v`` layers, optionally exposing internals.

    Both layers receive the same attention settings and the same
    ``visualization`` flag; the first maps ``in_ch -> out_ch`` and the second
    ``out_ch -> out_ch``.
    """

    def __init__(self, in_ch, out_ch, kernel_size, att_hidden, att_mh, att_sm, att_two_w, visualization=False):
        super(attention_conv_block_v, self).__init__()
        self.kernel_size = kernel_size
        self.visualization = visualization

        # Positional arguments common to both layers.
        common = (kernel_size, att_hidden, att_mh, att_sm, att_two_w, visualization)
        self.att_conv_1 = attention_conv_v(in_ch, out_ch, *common)
        self.att_conv_2 = attention_conv_v(out_ch, out_ch, *common)

    def forward(self, x):
        """Apply both layers.

        Returns:
            The second layer's output when ``visualization`` is falsy.
            Otherwise a 4-tuple made of: the second layer's primary output;
            the first four channels of the first layer's output (detached);
            the first four channels of the first layer's embedded input; and
            the first layer's attention, limited to index 0..3 on axis 1 and
            viewed as (B, -1, kernel_size ** 2, H, W).
        """
        if not self.visualization:
            return self.att_conv_2(self.att_conv_1(x))

        batch, _, height, width = x.shape

        first_out, embedded, attention = self.att_conv_1(x)
        # In this mode the second layer also returns a tuple; only its
        # primary output is propagated.
        second_out = self.att_conv_2(first_out)[0]

        after_maps = first_out[:, 0:4, :, :].detach()
        before_maps = embedded[:, 0:4, :, :]
        att_maps = attention[:, 0:4, :, :].view(batch, -1, self.kernel_size ** 2, height, width)
        return second_out, after_maps, before_maps, att_maps


class KAUNet_v(nn.Module):
    """
    U-Net shaped encoder/decoder whose convolution stages are
    ``attention_conv_block_v`` units; the last decoder stage runs in
    visualization mode so that its intermediate tensors are returned too.
    U-Net paper : https://arxiv.org/abs/1505.04597
    """

    def __init__(self, in_ch, out_ch, att_mh, att_sm, att_ks, att_two_w):
        super(KAUNet_v, self).__init__()

        base_width = 64
        # Channel widths per level: 64, 128, 256, 512, 1024.
        widths = [base_width * (2 ** level) for level in range(5)]
        att_hidden = 32

        def make_block(channels_in, channels_out, visualization=False):
            # Every stage shares the attention settings of the network.
            return attention_conv_block_v(
                channels_in, channels_out, att_ks,
                att_hidden=att_hidden, att_mh=att_mh, att_sm=att_sm,
                att_two_w=att_two_w, visualization=visualization)

        # NOTE: the attribute creation order below is kept as-is because it
        # fixes the order of registered sub-modules and parameters.
        self.Maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.Maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.Maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.Maxpool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Encoder stages.
        self.Conv1 = make_block(in_ch, widths[0])
        self.Conv2 = make_block(widths[0], widths[1])
        self.Conv3 = make_block(widths[1], widths[2])
        self.Conv4 = make_block(widths[2], widths[3])
        self.Conv5 = make_block(widths[3], widths[4])

        # Decoder stages: each up_conv output is concatenated with the
        # matching encoder feature, hence the doubled input width.
        self.Up5 = up_conv(widths[4], widths[3])
        self.Up_conv5 = make_block(widths[4], widths[3])

        self.Up4 = up_conv(widths[3], widths[2])
        self.Up_conv4 = make_block(widths[3], widths[2])

        self.Up3 = up_conv(widths[2], widths[1])
        self.Up_conv3 = make_block(widths[2], widths[1])

        self.Up2 = up_conv(widths[1], widths[0])
        # Only this final stage exposes its internals.
        self.Up_conv2 = make_block(widths[1], widths[0], visualization=True)

        # 1x1 conv producing the out_ch output channels.
        self.Conv = nn.Conv2d(widths[0], out_ch,
                              kernel_size=1, stride=1, padding=0)

        # Nearest-neighbour resampling of the visualization tensors; the
        # scale factor is 1 here.
        self.Up_v = nn.Upsample(scale_factor=1, mode='nearest')

    def forward(self, x):
        """Run the network.

        Returns:
            A 4-tuple ``(y, x_after, x_before, att)``: the output of the final
            1x1 conv followed by the three visualization tensors of the last
            decoder block, each passed through ``Up_v``.  ``att`` keeps the
            5-D shape it had when returned by that block.
        """
        # Encoder path.
        enc1 = self.Conv1(x)
        enc2 = self.Conv2(self.Maxpool1(enc1))
        enc3 = self.Conv3(self.Maxpool2(enc2))
        enc4 = self.Conv4(self.Maxpool3(enc3))
        deepest = self.Conv5(self.Maxpool4(enc4))

        # Decoder path: upsample, concatenate the skip feature on the channel
        # axis (skip first), then refine.
        dec = self.Up_conv5(torch.cat((enc4, self.Up5(deepest)), dim=1))
        dec = self.Up_conv4(torch.cat((enc3, self.Up4(dec)), dim=1))
        dec = self.Up_conv3(torch.cat((enc2, self.Up3(dec)), dim=1))
        dec, x_after, x_before, att = self.Up_conv2(
            torch.cat((enc1, self.Up2(dec)), dim=1))

        y = self.Conv(dec)

        # Up_v is fed a 4-D tensor, so the two middle attention axes are
        # merged before the call and split again afterwards.
        B, C, K2, H, W = att.shape
        after_v = self.Up_v(x_after)
        before_v = self.Up_v(x_before)
        att_v = self.Up_v(att.view(B, -1, H, W)).view(B, C, K2, H, W)

        # Original author's note for one configuration:
        # x_after/before: B*4*240*240, att: B*1*9*240*240
        return y, after_v, before_v, att_v
