from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.models.layers import trunc_normal_

#from .swin_transformer import SwinTransformer
#from .vision_transformer import VisionTransformer
from .SwinTransformer import SwinTransformer
from .VisionTransformer import VisionTransformer

class SwinTransformerForSimMIM(SwinTransformer):
    """Swin Transformer encoder adapted for SimMIM masked-image modelling.

    Adds a learnable ``mask_token`` that is blended into the patch embeddings
    at masked positions before the transformer stages run. The encoder must be
    built without a classification head (``num_classes == 0``).
    """

    def __init__(self, **kwargs):
        """Build the parent encoder from ``kwargs`` and create the mask token."""
        super().__init__(**kwargs)

        assert self.num_classes == 0

        # A single embedding vector shared by every masked patch position.
        self.mask_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
        trunc_normal_(self.mask_token, mean=0., std=.02)

    def forward(self, x, mask, colorwise=False):
        """Encode ``x`` with masked patches replaced by the mask token.

        Args:
            x: Input passed to ``self.patch_embed``; the result is unpacked as
                a 3-D ``(B, L, C)`` token tensor.
            mask: Per-patch mask; must not be ``None``. It is flattened from
                dim 1 and used as a blend weight (1 -> mask token, 0 -> patch),
                so it presumably has ``L`` entries per sample -- verify against
                the caller.
            colorwise: Accepted for interface compatibility; not used here.

        Returns:
            Tensor reshaped to ``(B, C, H, W)`` with ``H == W == int(L ** 0.5)``
            where ``L`` is the token count after the last stage.
        """
        tokens = self.patch_embed(x)

        assert mask is not None
        batch, num_tokens, _ = tokens.shape

        # Blend: keep the patch embedding where weight == 0, use the learned
        # token where weight == 1.
        fill = self.mask_token.expand(batch, num_tokens, -1)
        weight = mask.flatten(1).unsqueeze(-1).type_as(fill)
        tokens = tokens * (1. - weight) + fill * weight

        if self.ape:
            tokens = tokens + self.absolute_pos_embed
        tokens = self.pos_drop(tokens)

        for stage in self.layers:
            tokens = stage(tokens)
        tokens = self.norm(tokens)

        # (B, L, C) -> (B, C, L) -> (B, C, H, W); assumes a square token grid.
        feature_map = tokens.transpose(1, 2).contiguous()
        batch, channels, num_tokens = feature_map.shape
        side = int(num_tokens ** 0.5)
        return feature_map.reshape(batch, channels, side, side)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the parent's no-weight-decay names plus ``'mask_token'``."""
        inherited = super().no_weight_decay()
        return inherited | {'mask_token'}


class VisionTransformerForSimMIM(VisionTransformer):
    """Vision Transformer encoder adapted for SimMIM masked-image modelling.

    Adds a learnable ``mask_token`` that is blended into the patch embeddings
    at masked positions, and returns the patch tokens (class token removed)
    as a 2-D feature map. Must be built with ``num_classes == 0``.
    """

    def __init__(self, **kwargs):
        """Build the parent encoder from ``kwargs`` and create the mask token."""
        super().__init__(**kwargs)

        assert self.num_classes == 0

        # A single embedding vector shared by every masked patch position.
        self.mask_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
        self._trunc_normal_(self.mask_token, std=.02)

    def _trunc_normal_(self, tensor, mean=0., std=1.):
        """Initialise ``tensor`` in place, truncated to ``[-std, std]``."""
        trunc_normal_(tensor, mean=mean, std=std, a=-std, b=std)

    def forward(self, x, mask):
        """Encode ``x`` with masked patches replaced by the mask token.

        Args:
            x: Input passed to ``self.patch_embed``; the result is unpacked as
                a 3-D ``(B, L, C)`` token tensor.
            mask: Per-patch mask; must not be ``None``. It is flattened from
                dim 1 and used as a blend weight (1 -> mask token, 0 -> patch),
                so it presumably has ``L`` entries per sample -- verify against
                the caller.

        Returns:
            Tensor of shape ``(B, C, H, W)`` built from the patch tokens only,
            with ``H == W == int(L ** 0.5)``.
        """
        tokens = self.patch_embed(x)

        assert mask is not None
        batch, num_patches, _ = tokens.shape

        # Blend: keep the patch embedding where weight == 0, use the learned
        # token where weight == 1.
        fill = self.mask_token.expand(batch, num_patches, -1)
        weight = mask.flatten(1).unsqueeze(-1).type_as(fill)
        tokens = tokens * (1 - weight) + fill * weight

        # Prepend the class token (cls_tokens impl borrowed from Phil Wang).
        cls = self.cls_token.expand(batch, -1, -1)
        tokens = torch.cat((cls, tokens), dim=1)

        if self.pos_embed is not None:
            tokens = tokens + self.pos_embed
        tokens = self.pos_drop(tokens)

        # The relative position bias is computed once and passed to every block.
        shared_bias = None if self.rel_pos_bias is None else self.rel_pos_bias()
        for block in self.blocks:
            tokens = block(tokens, rel_pos_bias=shared_bias)
        tokens = self.norm(tokens)

        # Drop the class token, then (B, L, C) -> (B, C, H, W); assumes a
        # square patch grid.
        patches = tokens[:, 1:]
        batch, num_patches, channels = patches.shape
        side = int(num_patches ** 0.5)
        return patches.permute(0, 2, 1).reshape(batch, channels, side, side).contiguous()

class SimMIM(nn.Module):
    """Masked-image-modelling wrapper: encoder plus a one-layer pixel decoder.

    The decoder is a 1x1 convolution producing ``encoder_stride ** 2 * 3``
    channels followed by ``PixelShuffle(encoder_stride)``, i.e. a 3-channel
    reconstruction upsampled by ``encoder_stride`` relative to the encoder's
    feature map.
    """

    def __init__(self, encoder, encoder_stride):
        """
        Args:
            encoder: Module called as ``encoder(x, mask)``. It must expose
                ``num_features``, ``in_chans`` and ``patch_size`` attributes.
            encoder_stride: Upsampling factor of the decoder's pixel shuffle.
        """
        super().__init__()
        self.encoder = encoder
        self.encoder_stride = encoder_stride

        # NOTE(review): the output channel count is hard-coded to 3 while the
        # loss is normalised by ``in_chans``; confirm that encoders with
        # ``in_chans != 3`` are not expected here.
        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=self.encoder.num_features,
                out_channels=self.encoder_stride ** 2 * 3, kernel_size=1),
            nn.PixelShuffle(self.encoder_stride),
        )

        self.in_chans = self.encoder.in_chans
        self.patch_size = self.encoder.patch_size

    def forward(self, x, mask, reduction='mean', return_recon=False, return_emb=False, colorwise=False):
        """Reconstruct ``x`` from its masked encoding and compute the L1 loss.

        Args:
            x: Input batch; compared element-wise against the reconstruction.
            mask: Per-patch mask. It is repeated ``patch_size`` times along
                dims 1 and 2 and given a channel axis, so it is presumably
                ``(B, H / patch_size, W / patch_size)`` -- verify against the
                caller.
            reduction: ``'mean'`` returns the masked L1 sum divided by the
                number of masked pixels (plus 1e-5) and by ``in_chans``. Any
                other value returns the unreduced, *unmasked* element-wise L1
                tensor.
            return_recon: If true, return ``(loss, reconstruction)``.
            return_emb: If true (and ``return_recon`` is false), return
                ``(loss, encoder_output)``.
            colorwise: Accepted for interface compatibility; not used here.

        Returns:
            ``loss``, or a ``(loss, tensor)`` pair as selected by the flags.
        """
        z = self.encoder(x, mask)
        x_rec = self.decoder(z)

        # Expand the patch-level mask to pixel resolution: (B, h, w) -> (B, 1, H, W).
        mask = mask.repeat_interleave(self.patch_size, 1).repeat_interleave(self.patch_size, 2).unsqueeze(1).contiguous()
        loss_recon = F.l1_loss(x, x_rec, reduction='none')
        if reduction == 'mean':
            # The 1e-5 guards against division by zero when nothing is masked.
            loss = (loss_recon * mask).sum() / (mask.sum() + 1e-5) / self.in_chans
        else:
            loss = loss_recon

        if return_recon:
            return loss, x_rec
        if return_emb:
            return loss, z
        return loss

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the encoder's no-weight-decay names prefixed with ``'encoder.'``.

        Returns an empty set when the encoder does not define
        ``no_weight_decay``, so the return type is a set on every path.
        """
        if hasattr(self.encoder, 'no_weight_decay'):
            return {'encoder.' + i for i in self.encoder.no_weight_decay()}
        return set()

    @torch.jit.ignore
    def no_weight_decay_keywords(self):
        """Return the encoder's no-weight-decay keywords prefixed with ``'encoder.'``.

        Returns an empty set when the encoder does not define
        ``no_weight_decay_keywords``, so the return type is a set on every path.
        """
        if hasattr(self.encoder, 'no_weight_decay_keywords'):
            return {'encoder.' + i for i in self.encoder.no_weight_decay_keywords()}
        return set()


def build_simmim(config):
    """Build a ``SimMIM`` model from a nested configuration object.

    Args:
        config: Object exposing ``config.model.type`` (``'swin'`` or
            ``'vit'``), ``config.dataset.image_size``, the matching
            ``config.model.swin`` / ``config.model.vit`` sub-config and, for
            Swin, ``config.train.use_checkpoint``.

    Returns:
        A ``SimMIM`` instance wrapping the selected encoder.

    Raises:
        NotImplementedError: If ``config.model.type`` is neither ``'swin'``
            nor ``'vit'``.
    """
    # Bound once so the error message below can reference it.
    model_type = config.model.type

    if model_type == 'swin':
        swin_cfg = config.model.swin
        encoder = SwinTransformerForSimMIM(
            img_size=config.dataset.image_size,
            patch_size=swin_cfg.patch_size,
            in_chans=swin_cfg.in_chans,
            num_classes=0,
            embed_dim=swin_cfg.embed_dim,
            depths=swin_cfg.depths,
            num_heads=swin_cfg.num_heads,
            window_size=swin_cfg.window_size,
            mlp_ratio=swin_cfg.mlp_ratio,
            qkv_bias=swin_cfg.qkv_bias,
            qk_scale=swin_cfg.qk_scale,
            drop_rate=swin_cfg.drop_rate,
            drop_path_rate=swin_cfg.drop_path_rate,
            ape=swin_cfg.ape,
            patch_norm=swin_cfg.patch_norm,
            use_checkpoint=config.train.use_checkpoint)
        # NOTE(review): fixed decoder upsampling factor, independent of the
        # configured patch size/depths -- confirm it matches the encoder's
        # total downsampling.
        encoder_stride = 32
    elif model_type == 'vit':
        vit_cfg = config.model.vit
        encoder = VisionTransformerForSimMIM(
            img_size=config.dataset.image_size,
            patch_size=vit_cfg.patch_size,
            in_chans=vit_cfg.in_chans,
            num_classes=0,
            embed_dim=vit_cfg.embed_dim,
            depth=vit_cfg.depth,
            num_heads=vit_cfg.num_heads,
            mlp_ratio=vit_cfg.mlp_ratio,
            qkv_bias=vit_cfg.qkv_bias,
            drop_rate=vit_cfg.drop_rate,
            drop_path_rate=vit_cfg.drop_path_rate,
            norm_layer=partial(nn.LayerNorm, eps=1e-6),
            init_values=vit_cfg.init_values,
            use_abs_pos_emb=vit_cfg.use_ape,
            use_rel_pos_bias=vit_cfg.use_rpb,
            use_shared_rel_pos_bias=vit_cfg.use_shared_rpb,
            use_mean_pooling=vit_cfg.use_mean_pooling)
        # NOTE(review): fixed decoder upsampling factor, independent of the
        # configured patch size -- confirm it matches ``vit.patch_size``.
        encoder_stride = 16
    else:
        raise NotImplementedError(f"Unknown pre-train model: {model_type}")

    model = SimMIM(encoder=encoder, encoder_stride=encoder_stride)
    return model
