from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import logging

import torch
import torch.nn as nn
import torch.nn.functional as F

from models.pose_resnet import get_pose_net as get_pose_only_net

# Momentum passed to every nn.BatchNorm2d layer built in this file.
BN_MOMENTUM = 0.1


class ImageEncoder(nn.Module):
    """Four-stage convolutional encoder.

    Stage 1 keeps the spatial size and maps ``in_channels`` to 32 channels.
    Stages 2-4 each double the channel count and use one stride-2
    convolution, so the output has 256 channels.

    Args:
        in_channels: number of channels of the input tensor.
    """

    def __init__(self, in_channels):
        super(ImageEncoder, self).__init__()
        stem_width = 32
        self.layer1 = self._make_first(in_channels, stem_width)
        self.layer2 = self._make_layer(stem_width)
        self.layer3 = self._make_layer(stem_width * 2)
        self.layer4 = self._make_layer(stem_width * 4)

    @staticmethod
    def _conv_bn_relu(src, dst, kernel_size, stride, padding):
        """Return ``[conv, batch-norm, relu]`` as a flat list of modules."""
        return [
            nn.Conv2d(src, dst, kernel_size, stride, padding, bias=False),
            nn.BatchNorm2d(dst, momentum=BN_MOMENTUM),
            nn.ReLU(inplace=True),
        ]

    def _make_first(self, in_channels, out_channels):
        """Build the stem: a 7x7 then a 3x3 conv unit, both stride 1."""
        stem = self._conv_bn_relu(in_channels, out_channels, 7, 1, 3)
        stem += self._conv_bn_relu(out_channels, out_channels, 3, 1, 1)
        return nn.Sequential(*stem)

    def _make_layer(self, in_channels):
        """Build a stage that doubles the channels with a stride-2 3x3 conv,
        followed by a stride-1 3x3 conv unit."""
        doubled = in_channels * 2
        stage = self._conv_bn_relu(in_channels, doubled, 3, 2, 1)
        stage += self._conv_bn_relu(doubled, doubled, 3, 1, 1)
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run ``x`` through the four stages in order and return the result."""
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        return x


class ImageRenderer(nn.Module):
    """Four-stage convolutional decoder that ends in a 3-channel output.

    Every stage halves its input channel count with two 3x3 conv-BN-ReLU
    units. The first three stages then upsample by 2 (nearest neighbour);
    the last stage instead applies a biased 3x3 conv producing 3 channels.

    Args:
        in_channels: number of channels of the input feature map.
    """

    def __init__(self, in_channels):
        super(ImageRenderer, self).__init__()
        half = in_channels // 2
        quarter = half // 2
        eighth = quarter // 2

        self.layer1 = self._make_layer(in_channels)
        self.layer2 = self._make_layer(half)
        self.layer3 = self._make_layer(quarter)
        self.layer4 = self._make_layer(eighth, is_last=True)

    def _make_layer(self, in_channels, is_last=False):
        """Build one decoder stage.

        Args:
            in_channels: channels entering the stage; the stage works at
                ``in_channels // 2`` channels internally.
            is_last: when true, finish with a 3-channel conv instead of
                the x2 upsampling.
        """
        width = in_channels // 2

        stage = []
        for src in (in_channels, width):
            stage.append(nn.Conv2d(src, width, 3, 1, 1, bias=False))
            stage.append(nn.BatchNorm2d(width, momentum=BN_MOMENTUM))
            stage.append(nn.ReLU(inplace=True))

        if is_last:
            stage.append(nn.Conv2d(width, 3, 3, 1, 1, bias=True))
        else:
            # Jakab's paper originally uses bilinear upsampling. Nearest may
            # work better empirically (e.g. HRNet); this could use an
            # ablation.
            stage.append(nn.Upsample(scale_factor=2, mode='nearest'))

        return nn.Sequential(*stage)

    def forward(self, x):
        """Run ``x`` through the four stages in order and return the result."""
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        return x


class HumanIMMModel(nn.Module):
    """Pose-conditioned image reconstruction model.

    A reference image and the heatmaps predicted for a second image are
    encoded separately, concatenated along the channel axis, and decoded
    into a 3-channel image.

    Args:
        cfg: config object; ``cfg.MODEL.NUM_JOINTS`` and ``cfg.MODEL.SIGMA``
            are read here, and ``cfg`` is forwarded to the pose network
            builder.
        is_train: forwarded to the pose network builder.
        is_ttt: when true, the pose network ``self.pose_net`` is created.
            ``forward`` requires it.
        freeze_bn: when true (and ``is_ttt`` is false), ``train()`` keeps
            all ``nn.BatchNorm2d`` layers in eval mode.
    """

    def __init__(self, cfg, is_train, is_ttt, freeze_bn):
        super(HumanIMMModel, self).__init__()
        self.image_encoder = ImageEncoder(3)
        self.pose_encoder = ImageEncoder(cfg.MODEL.NUM_JOINTS)
        # Two 256-channel encoder outputs are concatenated -> 512 channels.
        self.image_renderer = ImageRenderer(512)
        # align_corners=False is what bilinear mode already does by default;
        # spelling it out avoids the "default behaviour changed" warning
        # that some PyTorch releases emit.
        self.pose_upsampler = nn.Upsample(
            scale_factor=4, mode='bilinear', align_corners=False)

        self.sigma = cfg.MODEL.SIGMA
        self.is_ttt = is_ttt
        self.freeze_bn = freeze_bn

        if self.is_ttt:
            self.pose_net = get_pose_only_net(cfg, is_train=is_train)

    def forward(self, images, ref_images):
        """Predict heatmaps for ``images`` and render an image from them.

        Args:
            images: input passed to ``self.pose_net``.
            ref_images: input passed to the image encoder. Its encoded
                feature map must match the spatial size of the encoded,
                4x-upsampled heatmaps so the two can be concatenated.

        Returns:
            ``(pose, pred_images)``: the raw output of ``self.pose_net``
            and the output of the renderer.

        Raises:
            AssertionError: if the model was built with ``is_ttt`` false.
        """
        # Fail before doing any work: pose_net only exists in TTT mode.
        assert self.is_ttt, (
            'HumanIMMModel.forward requires is_ttt=True '
            '(pose_net is only built in that mode)')

        image_feature = self.image_encoder(ref_images)
        pose = self.pose_net(images)

        # NOTE(review): this method used to call self.squeeze_heatmap(pose)
        # here and then immediately overwrite the result, so the raw
        # heatmaps have always been what gets upsampled and encoded. The
        # dead call was dropped; outputs are unchanged. If the re-rendered
        # Gaussians were meant to be used instead, note that they are built
        # from argmax coordinates and therefore carry no gradient back to
        # the heatmaps -- confirm intent before switching.
        pose_feature = self.pose_upsampler(pose)
        pose_feature = self.pose_encoder(pose_feature)

        feature = torch.cat([image_feature, pose_feature], dim=1)
        pred_images = self.image_renderer(feature)

        return pose, pred_images

    def squeeze_heatmap(self, pose):
        """Replace each heatmap by a Gaussian centred on its argmax.

        Args:
            pose: tensor of shape ``(n, c, h, w)``.

        Returns:
            Tensor of shape ``(n, c, h, w)``.
        """
        coord_y, coord_x = self.get_coord(pose)
        return self.get_gaussian(pose, coord_y, coord_x)

    def get_coord(self, heatmaps):
        """Return the row and column of the maximum of every heatmap.

        Args:
            heatmaps: tensor of shape ``(n, c, h, w)``.

        Returns:
            ``(coord_y, coord_x)``, each of shape ``(n, c)`` with the dtype
            and device of ``heatmaps``.
        """
        n, c, h, w = heatmaps.size()

        rows = torch.arange(h, device=heatmaps.device).to(heatmaps.dtype)
        cols = torch.arange(w, device=heatmaps.device).to(heatmaps.dtype)

        # Row / column value for every position of the flattened (h * w)
        # map, in row-major order. Built by broadcasting rather than
        # torch.meshgrid, whose call without `indexing` warns on newer
        # PyTorch while passing `indexing` fails on older releases.
        flat_y = rows.reshape(h, 1).expand(h, w).reshape(h * w)
        flat_x = cols.reshape(1, w).expand(h, w).reshape(h * w)

        idx = heatmaps.reshape(n, c, h * w).argmax(dim=2)
        return flat_y[idx], flat_x[idx]

    def get_gaussian(self, heatmaps, coord_y, coord_x):
        """Render one unnormalised Gaussian per ``(n, c)`` centre.

        Args:
            heatmaps: tensor of shape ``(n, c, h, w)``; only its shape,
                dtype and device are used.
            coord_y: row centres, ``n * c`` elements.
            coord_x: column centres, ``n * c`` elements.

        Returns:
            Tensor of shape ``(n, c, h, w)`` with value
            ``exp(-((cy - y)^2 + (cx - x)^2) / (2 * sigma^2))``.
        """
        n, c, h, w = heatmaps.size()

        # Row and column axes are kept separate and combined by
        # broadcasting, which gives the same values as a full grid.
        y = torch.arange(h, device=heatmaps.device).to(heatmaps.dtype)
        x = torch.arange(w, device=heatmaps.device).to(heatmaps.dtype)
        y = y.reshape(1, 1, h, 1)
        x = x.reshape(1, 1, 1, w)

        coord_y = coord_y.reshape(n, c, 1, 1)
        coord_x = coord_x.reshape(n, c, 1, 1)

        sq_dist = (coord_y - y) ** 2 + (coord_x - x) ** 2
        return torch.exp(-sq_dist / (2 * self.sigma ** 2))

    def refresh(self):
        """Reset ``shift_x`` / ``shift_y`` to zero when they exist.

        NOTE(review): this class never creates ``shift_x`` or ``shift_y``;
        presumably they are attached by a subclass or from outside --
        confirm. Missing attributes are skipped instead of raising
        ``AttributeError``.
        """
        for name in ('shift_x', 'shift_y'):
            shift = getattr(self, name, None)
            if shift is not None:
                nn.init.constant_(shift, 0)

    def train(self, mode=True):
        """Set the training mode, with per-configuration overrides.

        * ``mode`` false: the whole model goes to eval mode.
        * ``mode`` true and ``is_ttt``: the whole model is still put in
          eval mode.
        * ``mode`` true otherwise: the model goes to train mode, except
          that ``pose_net`` (if present) stays in eval mode and, when
          ``freeze_bn`` is set, every ``nn.BatchNorm2d`` stays in eval mode.

        Returns:
            ``self``, matching ``nn.Module.train`` so that
            ``model = model.train()`` and ``model = model.eval()`` work.
        """
        super(HumanIMMModel, self).train(mode=mode)
        if not mode:
            return self

        if self.is_ttt:
            super(HumanIMMModel, self).train(mode=False)
            return self

        # pose_net is only created when is_ttt is true, so it is normally
        # absent on this path; look it up defensively.
        pose_net = getattr(self, 'pose_net', None)
        if pose_net is not None:
            pose_net.eval()

        if self.freeze_bn:
            for m in self.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
        return self


def get_pose_net(cfg, is_train, is_ttt=False, freeze_bn=False):
    """Build a ``HumanIMMModel``; all arguments are passed through as-is."""
    model = HumanIMMModel(
        cfg=cfg,
        is_train=is_train,
        is_ttt=is_ttt,
        freeze_bn=freeze_bn,
    )
    return model
