import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import math
import torch.distributed as dist
from util import Pack
from functools import partial
import torchvision.models as torchvision_models
from util import Pack

class SparseCLModel(nn.Module):
    """Momentum-encoder contrastive model with alignment and sparsity losses.

    Two ResNet-18 backbones are built: ``online_encoder``, which receives
    gradients, and ``target_encoder``, whose parameters start as a copy of the
    online ones, are frozen, and are refreshed by an exponential moving
    average (see ``_update_momentum_encoder``).  The ``fc`` layer of each
    backbone is replaced by an MLP projector, and an optional predictor MLP is
    applied on top of the online branch when ``args.use_predictor`` is truthy.

    ``args`` must expose ``mlp_dim``, ``feature_dim``, ``use_predictor``,
    ``threshold``, ``temperature`` and ``alpha``.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args
        # CosineSimilarity owns no parameters or buffers, so it needs no
        # explicit device placement; it runs wherever its inputs live.
        self.criterion = nn.CosineSimilarity(dim=1)
        self.online_encoder = torchvision.models.resnet18(zero_init_residual=True)
        self.target_encoder = torchvision.models.resnet18(zero_init_residual=True)

        # Swap the stock stem for a 3x3 stride-1 convolution and drop the
        # max-pool.  Layers are created online-first, then target, so the
        # sequence of random initialisations is stable under a fixed seed.
        # NOTE(review): padding=2 with a 3x3 kernel grows each spatial dim by
        # 2; small-image ResNet stems usually use padding=1 -- confirm intent.
        for encoder in (self.online_encoder, self.target_encoder):
            encoder.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=2, bias=False)
            encoder.maxpool = nn.Identity()

        # Input width of the stock classifier == backbone feature width.
        hidden_dim = self.online_encoder.fc.weight.shape[1]

        # Projector heads replace the original fc layers
        # (original note: mlp_dim = 2048, feature_dim = 2048).
        self.online_encoder.fc = self._build_projector(args, hidden_dim)
        self.target_encoder.fc = self._build_projector(args, hidden_dim)

        # Optional predictor on the online branch only.
        if args.use_predictor:
            self.predictor = self._build_predictor(args)

        # The target starts as an exact copy of the online encoder and is
        # never updated by the optimizer.
        for param_o, param_t in zip(self.online_encoder.parameters(), self.target_encoder.parameters()):
            param_t.data.copy_(param_o.data)
            param_t.requires_grad = False

    def _build_projector(self, args, hidden_dim):
        """Return a 3-layer MLP: hidden_dim -> mlp_dim -> mlp_dim -> feature_dim."""
        return nn.Sequential(
            nn.Linear(hidden_dim, args.mlp_dim, bias=False),
            nn.BatchNorm1d(args.mlp_dim),
            nn.ReLU(inplace=True),
            nn.Linear(args.mlp_dim, args.mlp_dim, bias=False),
            nn.BatchNorm1d(args.mlp_dim),
            nn.ReLU(inplace=True),
            nn.Linear(args.mlp_dim, args.feature_dim, bias=True),
        )

    def _build_predictor(self, args):
        """Return a 2-layer MLP: feature_dim -> mlp_dim -> feature_dim."""
        return nn.Sequential(
            nn.Linear(args.feature_dim, args.mlp_dim, bias=False),
            nn.BatchNorm1d(args.mlp_dim),
            nn.ReLU(inplace=True),
            nn.Linear(args.mlp_dim, args.feature_dim, bias=True),
        )

    @torch.no_grad()
    def _update_momentum_encoder(self, m):
        """Momentum (EMA) update of the target encoder.

        Each target parameter becomes ``m * target + (1 - m) * online``.
        Only ``parameters()`` are averaged; module buffers are not touched
        here.
        NOTE(review): BatchNorm running statistics are buffers, so the target
        tracks its own -- confirm this is intended.
        """
        for param_o, param_t in zip(self.online_encoder.parameters(), self.target_encoder.parameters()):
            param_t.data = param_t.data * m + param_o.data * (1. - m)

    def off_diagonal(self, x):
        """Return the off-diagonal elements of a square matrix, flattened.

        Elements come out in row-major order with the diagonal skipped.
        """
        n, m = x.shape
        assert n == m, "off_diagonal expects a square matrix"
        # Dropping the last element and viewing as (n-1, n+1) lines every
        # diagonal entry up in column 0, which is then sliced away.
        return x.flatten()[:-1].view(n - 1, n + 1)[:, 1:].flatten()

    def alignment_loss(self, q, k):
        """Return ``sum_i 2 * (1 - cos(q_i, k_i))`` over the rows of q and k."""
        return 2.0 * (1.0 - self.criterion(q, k)).sum()

    def sparsity_loss(self, q, k):
        """Penalise similarity between non-matching (off-diagonal) pairs.

        ``q`` and ``k`` are 2-D tensors with the same number of rows.  Rows
        are L2-normalised, the pairwise cosine-similarity matrix is passed
        through ``sigmoid((sim - threshold) / temperature)``, and the
        off-diagonal entries of the result are summed.

        The mask is created on the same device as the inputs, so the loss
        works on CPU as well as on any CUDA device.
        """
        q = nn.functional.normalize(q, dim=1)
        k = nn.functional.normalize(k, dim=1)

        # [batch_size, batch_size] pairwise cosine similarities.
        similarity = torch.mm(q, k.t().contiguous())
        n = similarity.size(0)
        soft_indicator = torch.sigmoid(
            (similarity - self.args.threshold) / self.args.temperature
        )
        # 1 everywhere except the diagonal (the positive pairs).  Kept in the
        # default float dtype so the reduction is not done in reduced
        # precision when the similarities are half-precision.
        off_diag_mask = 1.0 - torch.eye(n, device=similarity.device)
        return (off_diag_mask * soft_indicator).abs().sum()

    def obtain_representation(self, x):
        """Return ``(z, z_norm)``: online-encoder output and its L2-normalised form."""
        z = self.online_encoder(x)
        z_norm = F.normalize(z, dim=1)
        return z, z_norm

    def forward(self, x1, x2, m):
        """Compute the training loss for two augmented views.

        Input:
            x1: first views of images
            x2: second views of images
            m: momentum coefficient for the target-encoder update
        Output:
            (loss, loss_pack) where ``loss = alignment_loss + alpha *
            sparsity_loss`` and ``loss_pack`` is a ``Pack`` carrying ``loss``,
            ``alignment_loss`` and ``sparsity_loss``.
        """
        # Online branch (with predictor when enabled).
        if self.args.use_predictor:
            q1 = self.predictor(self.online_encoder(x1))
            q2 = self.predictor(self.online_encoder(x2))
        else:
            q1 = self.online_encoder(x1)
            q2 = self.online_encoder(x2)

        with torch.no_grad():  # targets carry no gradient
            # The target is refreshed before it produces this step's targets.
            self._update_momentum_encoder(m)

            # compute momentum features as targets
            k1 = self.target_encoder(x1)
            k2 = self.target_encoder(x2)

        # Symmetrised losses: each view's query against the other view's key.
        alignment_loss = 0.5 * (self.alignment_loss(q1, k2) + self.alignment_loss(q2, k1))
        sparsity_loss = 0.5 * (self.sparsity_loss(q1, k2) + self.sparsity_loss(q2, k1))
        loss = alignment_loss + self.args.alpha * sparsity_loss
        loss_pack = Pack(loss=loss, alignment_loss=alignment_loss, sparsity_loss=sparsity_loss)
        return loss, loss_pack