
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.distributions as dist
from layers.VAE_EncDec import Encoder, Decoder
from layers.Transformer_EncDec import Encoder as TransEncoder
from layers.Transformer_EncDec import EncoderLayer as TransEncoderLayer
from layers.SelfAttention_Family import FullAttention, AttentionLayer
from layers.Embed import PatchEmbedding
from layers.FrequencyFilter import FrequencyFilter
import pywt
import numpy as np

class PatchMixerLayer(nn.Module):
    """One mixing layer: residual depthwise convolution followed by a 1x1 convolution.

    Args:
        dim: number of input channels (dimension 1 of the input); the depthwise
            convolution uses one group per channel.
        a: number of output channels produced by the 1x1 convolution.
        kernel_size: kernel width of the depthwise convolution.
    """

    def __init__(self, dim, a, kernel_size=8):
        super().__init__()
        # Depthwise stage: groups == channels, 'same' padding keeps the length.
        depthwise = nn.Conv1d(dim, dim, kernel_size=kernel_size, groups=dim, padding='same')
        self.Resnet = nn.Sequential(depthwise, nn.GELU(), nn.BatchNorm1d(dim))
        # Pointwise stage: mixes channels and maps dim -> a.
        pointwise = nn.Conv1d(dim, a, kernel_size=1)
        self.Conv_1x1 = nn.Sequential(pointwise, nn.GELU(), nn.BatchNorm1d(a))

    def forward(self, x):
        """Map ``x`` of shape ``(N, dim, L)`` to a tensor of shape ``(N, a, L)``."""
        mixed = x + self.Resnet(x)       # residual connection around the depthwise stage
        return self.Conv_1x1(mixed)

class PriorBlock(nn.Module):
    """Conditional prior: maps an integer class index to Normal parameters.

    The index is one-hot encoded, passed through Linear -> BatchNorm1d -> ReLU,
    and two heads produce the location and a strictly positive scale.

    Args:
        d_dim: number of classes (width of the one-hot encoding).
        zd_dim: dimensionality of the latent the prior is defined over.
    """

    def __init__(self, d_dim, zd_dim):
        super(PriorBlock, self).__init__()
        self.d_dim = d_dim

        self.fc1 = nn.Sequential(
            nn.Linear(d_dim, zd_dim, bias=False),
            nn.BatchNorm1d(zd_dim),
            nn.ReLU()
        )

        # Location head.
        self.fc21 = nn.Sequential(
            nn.Linear(zd_dim, zd_dim)
        )

        # Scale head; Softplus keeps the output positive.
        self.fc22 = nn.Sequential(
            nn.Linear(zd_dim, zd_dim),
            nn.Softplus()
        )

        self.init_weights()

    def init_weights(self):
        """Xavier-initialise all linear weights and zero the head biases."""
        torch.nn.init.xavier_uniform_(self.fc1[0].weight)
        torch.nn.init.xavier_uniform_(self.fc21[0].weight)
        self.fc21[0].bias.data.zero_()
        torch.nn.init.xavier_uniform_(self.fc22[0].weight)
        self.fc22[0].bias.data.zero_()

    def forward(self, d):
        """Return ``(loc, scale)`` of the prior for the class indices in ``d``.

        Args:
            d: integer class indices with one entry per sample, shaped ``[N]``
                (``[N, 1]`` is accepted as well).

        Returns:
            Tuple ``(zd_loc, zd_scale)``, each of shape ``[N, zd_dim]``.
        """
        n = d.shape[0]
        # Build the one-hot matrix directly on the input's device with a single
        # indexed assignment instead of a per-sample Python loop with .item(),
        # which forced one host synchronisation per element.
        d_onehot = torch.zeros(n, self.d_dim, device=d.device)
        rows = torch.arange(n, device=d.device)
        d_onehot[rows, d.reshape(n).long()] = 1

        hidden = self.fc1(d_onehot)
        zd_loc = self.fc21(hidden)
        zd_scale = self.fc22(hidden) + 1e-7  # small offset keeps the scale strictly positive

        return zd_loc, zd_scale

# Auxiliary tasks
class Projection(nn.Module):
    """ReLU followed by a single linear layer mapping ``zd_dim`` -> ``d_dim``.

    Args:
        d_dim: number of output units.
        zd_dim: size of the incoming feature vector.
    """

    def __init__(self, d_dim, zd_dim):
        super().__init__()

        layer = nn.Linear(zd_dim, d_dim)
        # Xavier-uniform weights, zero bias.
        torch.nn.init.xavier_uniform_(layer.weight)
        layer.bias.data.zero_()
        self.fc1 = layer

    def forward(self, zd):
        """Return ``fc1(relu(zd))``; no activation is applied to the output."""
        return self.fc1(F.relu(zd))

# PatchMixer DWConv
class TimeStochasticBlock(nn.Module):
    """Patch-based mixer that encodes a ``[batch, n_vars, length]`` series.

    The series is right-padded by one stride, cut into non-overlapping patches
    (stride == patch length), linearly embedded, passed through ``e_layers``
    ``PatchMixerLayer`` blocks and summarised by two heads (a linear shortcut
    and an MLP) whose outputs are summed. The per-variable features are then
    concatenated and projected to ``hidden_size``.

    Args:
        configs: object exposing ``n_features``, ``hidden_size``, ``len_seq``,
            ``patch_len``, ``mixer_kernel_size``, ``d_model``, ``dropout`` and
            ``e_layers``.
    """

    def __init__(self, configs) -> None:
        super(TimeStochasticBlock, self).__init__()
        self.enc_in = configs.n_features
        self.hidden_dim = configs.hidden_size
        self.lookback = configs.len_seq
        self.len_seq = configs.len_seq
        self.patch_size = configs.patch_len
        self.stride = configs.patch_len
        self.kernel_size = configs.mixer_kernel_size
        self.d_model = configs.d_model
        self.head_dropout = configs.dropout
        self.depth = configs.e_layers
        # The extra "+ 1" accounts for the patch created by the replication padding.
        self.patch_num = int((self.lookback - self.patch_size) / self.stride + 1) + 1
        self.a = self.patch_num

        self.PatchMixer_blocks = nn.ModuleList([
            PatchMixerLayer(dim=self.patch_num, a=self.a, kernel_size=self.kernel_size)
            for _ in range(self.depth)
        ])
        self.padding_patch_layer = nn.ReplicationPad1d((0, self.stride))

        self.W_P = nn.Linear(self.patch_size, self.d_model)
        # Linear shortcut head applied to the embedded patches.
        self.head0 = nn.Sequential(
            nn.Flatten(start_dim=-2),
            nn.Linear(self.patch_num * self.d_model, configs.hidden_size),
            nn.Dropout(self.head_dropout)
        )
        # MLP head applied after the mixer blocks.
        self.head1 = nn.Sequential(
            nn.Flatten(start_dim=-2),
            nn.Linear(self.a * self.d_model, int(configs.hidden_size * 2)),
            nn.GELU(),
            nn.Dropout(self.head_dropout),
            nn.Linear(int(configs.hidden_size * 2), configs.hidden_size),
            nn.Dropout(self.head_dropout)
        )

        self.dropout = nn.Dropout(configs.dropout)
        # forward() concatenates the features of all variables before this
        # layer, so its input width must be n_features * hidden_size. For a
        # single variable this is the same layer as Linear(hidden, hidden).
        self.projection = nn.Linear(self.enc_in * configs.hidden_size, configs.hidden_size)

    def forward(self, x):
        """Encode ``x`` of shape ``[batch, n_vars, length]`` into ``[batch, hidden_size]``.

        ``n_vars`` must equal ``configs.n_features`` for the final projection
        to match.
        """
        bs, nvars, _ = x.shape
        x = self.padding_patch_layer(x)
        x = x.unfold(dimension=-1, size=self.patch_size, step=self.stride)  # [batch, n_vars, patch_num, patch_size]
        x = self.W_P(x)                                                     # [batch, n_vars, patch_num, d_model]
        x = torch.reshape(x, (bs * nvars, x.shape[2], x.shape[3]))          # [batch * n_vars, patch_num, d_model]
        x = self.dropout(x)
        u = self.head0(x)                                                   # [batch * n_vars, hidden]

        for block in self.PatchMixer_blocks:
            x = block(x)
        x = self.head1(x)                                                   # [batch * n_vars, hidden]
        x = u + x
        x = torch.reshape(x, (bs, -1))                                      # [batch, n_vars * hidden]
        return self.projection(x)                                           # [batch, hidden]

    
class Model(nn.Module):
    def __init__(self, configs):
        """Build all sub-modules and store the loss hyper-parameters.

        Args:
            configs: object exposing ``hidden_size``, ``n_domains``,
                ``n_classes``, ``n_features``, ``mask_spectrum``, ``len_seq``,
                ``fc_dim``, ``kernel_size``, ``freq_type``,
                ``constraint_type``, ``temperature``, ``stride``,
                ``normalize``, ``beta_d``, ``beta_y``, ``weight_true``,
                ``weight_false``, ``aux_loss_multiplier_y`` and
                ``aux_loss_multiplier_d`` (plus whatever
                ``TimeStochasticBlock`` reads).
        """
        super().__init__()

        # ---- latent sizes -------------------------------------------------
        hidden = configs.hidden_size
        self.zd_dim = hidden
        self.zx_dim = 0  # keep but not used
        self.zy_dim = hidden
        self.z_dim = self.zd_dim + self.zy_dim

        # ---- data / architecture settings ---------------------------------
        self.d_dim = configs.n_domains
        self.y_dim = configs.n_classes
        self.input_channel = configs.n_features
        self.mask_spectrum = configs.mask_spectrum
        self.len_seq = configs.len_seq
        self.fc_dim = configs.fc_dim
        self.kernel_size = configs.kernel_size
        self.freq_type = configs.freq_type
        self.constraint_type = configs.constraint_type
        self.temperature = configs.temperature
        # ``configs.stride`` is read here but not used by this constructor.
        stride = configs.stride

        # ---- sub-modules (creation order is kept stable) -------------------
        self.disentanglement = FrequencyFilter(self.mask_spectrum, self.freq_type)
        self.time_stochastic = TimeStochasticBlock(configs)

        self.decoder = Decoder(
            self.z_dim,
            self.fc_dim,
            self.input_channel,
            kernel_size=self.kernel_size,
            output_channels=[64, 128, 512, 1024],
        )
        self.domain_prior = PriorBlock(self.d_dim, self.zd_dim)
        self.label_prior = PriorBlock(self.y_dim, self.zy_dim)

        self.specific_encoder = Encoder(
            output_dim=self.zd_dim,
            input_dim=self.input_channel,
            fc_dim=self.fc_dim,
            kernel_size=self.kernel_size,
            out_channels=[1024, 512, 128, 64],
        )
        self.invariant_encoder = Encoder(
            output_dim=self.zy_dim,
            input_dim=self.input_channel,
            fc_dim=self.fc_dim,
            kernel_size=self.kernel_size,
            out_channels=[1024, 512, 128, 64],
        )

        self.domain_projection = Projection(self.d_dim, self.zd_dim)
        self.label_projection = Projection(self.y_dim, self.zy_dim)
        self.sto_projection = Projection(self.y_dim, self.zy_dim)

        # Final classifier over the concatenation of two zy_dim-wide features.
        self.classifer = Projection(self.y_dim, self.zy_dim * 2)

        # ---- normalisation switch and loss weights --------------------------
        self.normal = configs.normalize
        self.beta_d = configs.beta_d
        self.beta_y = configs.beta_y

        self.weight_true = configs.weight_true
        self.weight_false = configs.weight_false
        self.aux_loss_multiplier_y = configs.aux_loss_multiplier_y
        self.aux_loss_multiplier_d = configs.aux_loss_multiplier_d

    def normalize(self, x):
        # x_enc: B L C
        # Normalize 
        mean_enc = x.mean(1, keepdim=True).detach() # B x 1 x E
        x = x - mean_enc
        std_enc = torch.sqrt(torch.var(x, dim=1, keepdim=True, unbiased=False) + 1e-5).detach()
        x = x / std_enc 
        return x
    
    def forward(self, x, y, d, mask=None):
        if self.training:
            x_sto, x_det = self.disentanglement(x)
            if self.normal:
                x_sto = self.normalize(x_sto)
                x_det = self.normalize(x_det)

            x_sto = x_sto.permute(0, 2, 1)
            x_det = x_det.permute(0, 2, 1)

            x_sto = self.time_stochastic(x_sto)
            x_sto_label = self.sto_projection(x_sto)
            loss_sto = F.cross_entropy(x_sto_label, y, reduction='sum')

            x_recon, d_hat, y_hat, qzd, pzd, z_spc, _, _, _, qzy, pzy, z_inv = self.forward_train(d, x_det, y)
            
            if x_recon.shape[-1] > x_det.shape[-1]:
                x_recon = x_recon[:, :, :x_det.shape[-1]]
            # reconstruction loss
            reconstruction_loss = F.mse_loss(x_recon, x_det.float())

            # KL divergence between q(z|x) and p(z)
            domain_kl_divergence = torch.sum(pzd.log_prob(z_spc) - qzd.log_prob(z_spc))
            label_kl_divergence = torch.sum(pzy.log_prob(z_inv) - qzy.log_prob(z_inv))
            # Auxiliary losses
            domain_cls = F.cross_entropy(d_hat, d, reduction='sum')
            label_cls = F.cross_entropy(y_hat, y, reduction='sum')

            loss_ELOB = reconstruction_loss - self.beta_d * domain_kl_divergence - self.beta_y * label_kl_divergence
            auxil_loss = self.aux_loss_multiplier_d * domain_cls + self.aux_loss_multiplier_y * label_cls
            cross_loss = None

            if self.constraint_type == 'cross':
                cross_loss = self.cross_label_loss(x_det)
                constraint_loss = auxil_loss + cross_loss
            else:
                contrastive_loss = self.contrastive_loss(z_inv, z_spc, temperature=self.temperature)
                constraint_loss = auxil_loss + contrastive_loss

            loss_det = loss_ELOB + constraint_loss

            latent = torch.concat([x_sto, z_inv],dim=-1)

            y_hat_latent = self.classifer(latent)

            loss_cls = F.cross_entropy(y_hat_latent, y, reduction='sum')
            
            return y_hat, loss_det + loss_sto + loss_cls, cross_loss
        else:
            return self.forward_test(x)

    def forward_train(self, d, x, y):
        # Encode q(z|x)
        zd_mean, zd_scale, _, _ = self.specific_encoder(x)
        zy_mean, zy_scale, idxs_y, sizes_y = self.invariant_encoder(x)

        # Reparameterization trick
        domain_space = dist.Normal(zd_mean, zd_scale) # domain posterior distribution
        domain_feature = domain_space.rsample() # z_spc = mu_spc + sigma_spc * epsilon
        invariant_space = dist.Normal(zy_mean, zy_scale) # invariant posterior distribution
        invariant_feature = invariant_space.rsample() # z_inv = mu_inv + sigma_inv * epsilon

        # Decode p(x|z)
        z = torch.cat((domain_feature, invariant_feature), dim=-1)

        x_recon = self.decoder(z, idxs_y, sizes_y)

        # Prior Distributions learn from label data d, y
        domain_prior_mean, domain_prior_scale = self.domain_prior(d)
        label_prior_mean, label_prior_scale = self.label_prior(y)
        # x prior normal distribution N~(0,1)
        x_prior_mean, x_prior_scale = torch.zeros(zd_mean.size()[0], self.zx_dim).cuda(),\
                                   torch.ones(zd_scale.size()[0], self.zx_dim).cuda()

        domain_prior_distribution = dist.Normal(domain_prior_mean, domain_prior_scale)
        label_prior_distribution = dist.Normal(label_prior_mean, label_prior_scale)
        x_prior_distribution = dist.Normal(x_prior_mean, x_prior_scale)

        # Auxiliary losses
        d_hat = self.domain_projection(domain_feature)
        y_hat = self.label_projection(invariant_feature)

        return x_recon, d_hat, y_hat, domain_space, domain_prior_distribution, domain_feature, None, x_prior_distribution, None, invariant_space, label_prior_distribution, invariant_feature


    def contrastive_loss(self, z_inv, z_spc, temperature=0.07):
        eps = 1e-6
        device = z_inv.device
        z_inv = F.normalize(z_inv, dim=1)
        z_spc = F.normalize(z_spc, dim=1)
        
        features = torch.cat([z_inv.unsqueeze(1), z_spc.unsqueeze(1)], dim=1).to(device)
        labels = torch.cat([torch.ones(z_inv.shape[0]), torch.zeros(z_spc.shape[0])]).to(device)
        labels = labels.contiguous().view(-1, 1)
        mask = torch.eq(labels, labels.T).float().to(device)

        batch_size = features.shape[0]
        contrast_count = features.shape[1]
        contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)
        anchor_feature = contrast_feature
        anchor_count = contrast_count
        
        # compute logits
        anchor_dot_contrast = torch.div(
            torch.matmul(anchor_feature, contrast_feature.T),
            temperature)
        # for numerical stability
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()
        logits_mask = torch.scatter(
            torch.ones_like(mask).to(device),
            1,
            torch.arange(batch_size * anchor_count).view(-1, 1).to(device),
            0
        )
        mask = mask * logits_mask

        # compute log_prob
        exp_logits = torch.exp(logits) * logits_mask
        log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True)+eps)

        # compute mean of log-likelihood over positive
        mean_log_prob_pos = (mask * log_prob).sum(1) / (mask.sum(1)+eps)

        loss = - mean_log_prob_pos
        loss = loss.view(anchor_count, batch_size).mean()
        # print("con:", loss)
        return loss

    def cross_label_loss(self, x):
        with torch.no_grad():
            zd_mean, zd_scale, _, _ = self.specific_encoder(x)
            zd = zd_mean
            domain_prob = F.softmax(self.domain_projection(zd), dim=1)
            # the maximum predicted class probability
            _, ind = torch.topk(domain_prob, 1) 
            # convert the digit(s) to one-hot tensor(s)
            d = x.new_zeros(domain_prob.size()) 
            pred_d = d.scatter_(1, ind, 1.0)


            zy_mean, zy_scale, _, _ = self.invariant_encoder(x)
            zy = zy_mean
            label_prob = F.softmax(self.label_projection(zy), dim=1)
            _, ind = torch.topk(label_prob, 1)
            y = x.new_zeros(label_prob.size())
            pred_y = y.scatter_(1, ind, 1.0)

            # cross projection for constraint loss
            alpha_y2d = F.softmax(self.domain_projection(zy), dim=1)
            _, ind = torch.topk(alpha_y2d, 1)
            d_false = x.new_zeros(alpha_y2d.size())
            pred_d_false = d_false.scatter_(1, ind, 1.0)

            alpha_d2y = F.softmax(self.label_projection(zd), dim=1)
            _, ind = torch.topk(alpha_d2y, 1)
            y_false = x.new_zeros(alpha_d2y.size())
            pred_y_false = y_false.scatter_(1, ind, 1.0)

        loss_classify_true = self.weight_true * (F.cross_entropy(pred_d, d, reduction='sum') + F.cross_entropy(pred_y, y, reduction='sum'))
        loss_classify_false = self.weight_false * (F.cross_entropy(pred_d_false, d, reduction='sum') + F.cross_entropy(pred_y_false, y, reduction='sum'))

        loss = loss_classify_true - loss_classify_false
        loss.requires_grad = True

        return loss

    def forward_test(self, x):
        x_sto, x_det = self.disentanglement(x)
        if self.normal:
            x_sto = self.normalize(x_sto)
            x_det = self.normalize(x_det)

        x_sto = x_sto.permute(0, 2, 1)
        x_det = x_det.permute(0, 2, 1)

        x_sto = self.time_stochastic(x_sto)

        zy_mean, zy_scale, _, _ = self.invariant_encoder(x_det)
        z_inv = zy_mean
        latent = torch.concat([x_sto, z_inv],dim=-1)

        label_prob = F.softmax(self.classifer(latent), dim=1)
        _, ind = torch.topk(label_prob, 1)
        y = x.new_zeros(label_prob.size())
        pred_y = y.scatter_(1, ind, 1.0)
        return pred_y



