# Copyright (c) OpenMMLab. All rights reserved.
import warnings
import os
import json
import copy
import re,tempfile
from typing import Dict, List, Optional, Tuple, Union
from collections import OrderedDict
import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
from mmengine.runner import load_checkpoint, load_state_dict
from mmengine.model import is_model_wrapper
from mmengine import Config
from mmengine.structures import InstanceData
from mmdet.utils import ConfigType, OptConfigType, InstanceList
from mmdet.structures.bbox import bbox2roi
from mmdet.models.utils import multi_apply
from mmdet.registry import MODELS, TASK_UTILS
from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh, bbox_overlaps
from mmdet.models.dense_heads.atss_vlfusion_head import convert_grounding_to_cls_scores
from mmdet.structures import OptSampleList, SampleList
from ..layers import SinePositionalEncoding, CdnQueryGenerator
from ..layers import inverse_sigmoid
from .incre_ddetr import incre_DeformableDETR

@MODELS.register_module()
class incre_incre_DeformableDETR(incre_DeformableDETR):
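    """Incremental Deformable DETR detector paired with a frozen copy of
    the original detector.

    At construction time the original detector is built from
    ``incre_cfg['ori_config_file']``, loaded from
    ``incre_cfg['ori_checkpoint']`` and frozen. During :meth:`loss` its
    outputs are computed without gradients and handed to the bbox head
    together with the outputs of this model.

    NOTE(review): the previous docstring described Grounding DINO
    (<https://arxiv.org/abs/2303.05499>, code adapted from
    <https://github.com/IDEA-Research/GroundingDINO>), which looks like a
    copy-paste leftover -- confirm the intended reference.
    """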

    def __init__(self,
                 incre_cfg,
                 dn_cfg: OptConfigType = None,
                 *args,
                 **kwargs) -> None:
        """Build the detector, then set up the frozen original detector.

        Args:
            incre_cfg: Incremental-learning settings. Stored as
                ``self.incre_cfg`` and later read by
                :meth:`load_base_detector` and :meth:`loss`.
            dn_cfg (:obj:`ConfigDict` or dict, optional): Stored as
                ``self.dn_cfg``; not otherwise used in this method.
                Defaults to None.
            *args: Forwarded unchanged to the parent constructor.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        # Stored ahead of ``super().__init__`` -- presumably the parent
        # constructor already needs it; TODO confirm.
        self.incre_cfg = incre_cfg
        super().__init__(*args, **kwargs)

        # ``trunc_class`` is provided by the bbox head; its first two entries
        # become the class counts used when expanding the checkpoint.
        class_range = self.bbox_head.trunc_class
        self.start = class_range[0]
        self.end = class_range[1]
        self.dn_cfg = dn_cfg

        # The expanded checkpoint is written next to the working directory,
        # keyed by the (start, end) pair.
        self.load_base_detector(
            f'./temp_cheakpoints/{self.start}_{self.end}.pth')

    def init_weights(self) -> None:
        """Do nothing: this override performs no weight initialization.

        NOTE(review): presumably the weights are expected to come from a
        checkpoint instead -- confirm against the training config.
        """
        return None

    def train(self, mode=True):
        """Switch this model's mode while keeping the original model in eval.

        Args:
            mode (bool): ``True`` selects training mode, ``False`` selects
                evaluation mode. Defaults to True.

        Returns:
            This module, as ``torch.nn.Module.train`` does, so that
            ``model.train()`` and ``model.eval()`` keep returning the model.
        """
        # The lookup deliberately starts *after* ``incre_DeformableDETR`` in
        # the MRO, so that class's own ``train`` (if it defines one) is
        # skipped. NOTE(review): presumably intentional -- confirm.
        super(incre_DeformableDETR, self).train(mode)

        # ``ori_model`` is only assigned in ``load_base_detector``; tolerate
        # calls that happen before that point instead of raising.
        ori_model = getattr(self, 'ori_model', None)
        if ori_model is not None:
            ori_model.eval()
        return self

    def load_base_detector(self, ori_checkpoint_state):
        """Build and freeze the original detector, then expand its checkpoint.

        The original detector is built from the config file named by
        ``self.incre_cfg['ori_config_file']``, non-strictly loaded from
        ``self.incre_cfg['ori_checkpoint']``, put in eval mode, stripped of
        gradients and stored as ``self.ori_model``.

        Args:
            ori_checkpoint_state (str): Path the class-expanded copy of the
                original checkpoint is written to.
        """
        ori_cfg = Config.fromfile(self.incre_cfg['ori_config_file'])
        ori_checkpoint = self.incre_cfg['ori_checkpoint']

        frozen_detector = MODELS.build(ori_cfg.model)
        load_checkpoint(frozen_detector, ori_checkpoint, strict=False)
        frozen_detector.eval()
        # The frozen detector must not carry an original model of its own.
        frozen_detector.ori_model = None
        # Disable gradients on every parameter of the frozen detector.
        frozen_detector.requires_grad_(False)
        self.ori_model = frozen_detector

        # NOTE(review): this helper only writes the expanded checkpoint to
        # ``ori_checkpoint_state``; nothing here loads it into ``self``.
        self._load_checkpoint_for_new_model(
            ori_checkpoint, ori_checkpoint_state, strict=False)

    def _load_checkpoint_for_new_model(self, checkpoint_file, checkpoint_state=None,
                                    map_location=None, strict=True, logger=None):
        """Write a copy of a checkpoint with its classifier rows expanded.

        Classification weights whose first dimension equals ``self.start``
        are grown to ``self.end`` rows; every appended row is the mean of the
        existing rows. A matching bias entry, when present, is extended with
        the mean of the existing bias values. The result is written to
        ``checkpoint_state`` through a temporary file followed by an atomic
        rename. Despite the name, nothing is loaded into ``self``.

        Args:
            checkpoint_file (str): Checkpoint to read with ``torch.load``.
            checkpoint_state (str): Destination path of the expanded
                checkpoint. Required, even though it defaults to None.
            map_location: Forwarded to ``torch.load``. Defaults to None.
            strict (bool): Accepted for signature compatibility; unused.
            logger: Optional object with an ``info`` method that is told
                where the file was written. Defaults to None.

        Raises:
            TypeError: If ``checkpoint_state`` is None.
            RuntimeError: If the checkpoint is neither an ``OrderedDict`` nor
                a dict holding a ``'state_dict'`` entry.
            AssertionError: If ``self.end < self.start`` or a bias entry does
                not have ``self.start`` elements.
        """
        # Fail early with a clear message instead of deep inside os.path.
        if checkpoint_state is None:
            raise TypeError(
                'checkpoint_state must be a path to write the expanded '
                'checkpoint to, got None')
        target_path = checkpoint_state

        checkpoint = torch.load(checkpoint_file, map_location=map_location)
        if isinstance(checkpoint, OrderedDict):
            state_dict, has_wrapper = checkpoint, False
        elif isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
            state_dict, has_wrapper = checkpoint['state_dict'], True
        else:
            raise RuntimeError(f'No state_dict found in checkpoint file {checkpoint_file}')

        def strip_pref(d, pref):
            # Only strip when *every* key carries the prefix.
            if d and all(k.startswith(pref) for k in d):
                return {k[len(pref):]: v for k, v in d.items()}
            return d

        state_dict = strip_pref(state_dict, 'module.')
        state_dict = strip_pref(state_dict, 'model.')
        C_old, C_new = self.start, self.end
        assert C_new >= C_old, f'num_classes({C_new}) < ori_num_classes({C_old})'

        # Compiled once; keys matching any of these are classifier weights.
        weight_patterns = [
            re.compile(r'^bbox_head\.cls_branches\.\d+\.weight$'),
            re.compile(r'^bbox_head\.class_embed\.weight$'),
            re.compile(r'^bbox_head\.dn_cls_embed\.weight$'),
        ]

        # (weight key, bias key or None) pairs, in sorted key order.
        cls_keys = []
        for key in sorted(state_dict):
            if any(pattern.match(key) for pattern in weight_patterns):
                bias_key = key[:-len('weight')] + 'bias'
                cls_keys.append(
                    (key, bias_key if bias_key in state_dict else None))

        add = C_new - C_old
        for w_key, b_key in cls_keys:
            w = state_dict[w_key]
            # Leave anything that is not a (C_old, D) matrix untouched.
            if w.dim() != 2 or w.size(0) != C_old:
                continue
            if add <= 0:
                continue
            D = w.size(1)
            mean_row = w[:C_old].mean(dim=0, keepdim=True)
            state_dict[w_key] = torch.cat(
                [w, mean_row.expand(add, D).clone()], dim=0)
            if b_key is not None:
                b = state_dict[b_key]
                assert b.dim() == 1 and b.size(0) == C_old
                mean_b = b.mean().view(1)
                state_dict[b_key] = torch.cat(
                    [b, mean_b.expand(add).clone()], dim=0)

        new_ckpt = (dict(checkpoint, state_dict=state_dict)
                    if has_wrapper else state_dict)

        dir_name = os.path.dirname(target_path) or '.'
        # The destination directory may not exist yet on a fresh checkout.
        os.makedirs(dir_name, exist_ok=True)

        # Reserve a unique name in the destination directory, and close the
        # handle before writing so the file can be reopened by name on every
        # platform.
        with tempfile.NamedTemporaryFile(dir=dir_name, delete=False, suffix='.pth') as tmp:
            tmp_name = tmp.name
        try:
            torch.save(new_ckpt, tmp_name)
            # Atomic rename: readers never observe a half-written file.
            os.replace(tmp_name, target_path)
        except BaseException:
            # Do not leave a stray temporary file behind on failure.
            try:
                os.remove(tmp_name)
            except OSError:
                pass
            raise

        if logger:
            try:
                logger.info(f'Expanded checkpoint saved to: {target_path}')
            except Exception:
                # Logging stays best-effort, but interpreter-exit signals
                # such as KeyboardInterrupt are no longer swallowed.
                pass
    def pre_transformer(
            self,
            mlvl_feats: Tuple[Tensor],
            batch_data_samples: OptSampleList = None) -> Tuple[Dict]:
        """Flatten multi-level features into encoder and decoder inputs.

        This is the first stage of the pipeline
        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'.

        Args:
            mlvl_feats (tuple[Tensor]): One feature map per level, each
                unpacked here as (bs, dim, h_lvl, w_lvl).
            batch_data_samples (list[:obj:`DetDataSample`], optional): Batch
                data samples; ``batch_input_shape`` is read from the first
                one and ``img_shape`` from each. Must not be None even
                though the default is None.

        Returns:
            tuple[dict]: ``(encoder_inputs_dict, decoder_inputs_dict)``.

            - encoder_inputs_dict (dict): Holds 'feat', 'feat_mask',
              'feat_pos', 'spatial_shapes', 'level_start_index' and
              'valid_ratios'.
            - decoder_inputs_dict (dict): Holds 'memory_mask',
              'spatial_shapes', 'level_start_index' and 'valid_ratios'.
        """
        batch_size = mlvl_feats[0].size(0)

        # Padding masks are derived from the per-image shapes.
        assert batch_data_samples is not None
        input_img_h, input_img_w = batch_data_samples[0].batch_input_shape
        img_shape_list = [sample.img_shape for sample in batch_data_samples]
        shape_matches = [
            shape[0] == input_img_h and shape[1] == input_img_w
            for shape in img_shape_list
        ]

        mlvl_masks = []
        mlvl_pos_embeds = []
        if torch.onnx.is_in_onnx_export() or all(shape_matches):
            # No image is padded (or we are exporting): masks are not needed.
            for feat in mlvl_feats:
                mlvl_masks.append(None)
                mlvl_pos_embeds.append(
                    self.positional_encoding(None, input=feat))
        else:
            # Non-zero marks ignored (padded) positions, zero marks valid
            # ones.
            padding_mask = mlvl_feats[0].new_ones(
                (batch_size, input_img_h, input_img_w))
            for img_id in range(batch_size):
                img_h, img_w = img_shape_list[img_id]
                padding_mask[img_id, :img_h, :img_w] = 0

            for feat in mlvl_feats:
                lvl_mask = F.interpolate(
                    padding_mask[None],
                    size=feat.shape[-2:]).to(torch.bool).squeeze(0)
                mlvl_masks.append(lvl_mask)
                mlvl_pos_embeds.append(self.positional_encoding(lvl_mask))

        feat_flatten, lvl_pos_embed_flatten = [], []
        mask_flatten, spatial_shapes = [], []
        for lvl, feat in enumerate(mlvl_feats):
            mask = mlvl_masks[lvl]
            pos_embed = mlvl_pos_embeds[lvl]
            batch_size, c, _, _ = feat.shape

            spatial_shapes.append(
                torch._shape_as_tensor(feat)[2:].to(feat.device))
            # (bs, c, h, w) -> (bs, h*w, c)
            feat_flatten.append(
                feat.view(batch_size, c, -1).permute(0, 2, 1))
            pos_embed = pos_embed.view(batch_size, c, -1).permute(0, 2, 1)
            lvl_pos_embed_flatten.append(
                pos_embed + self.level_embed[lvl].view(1, 1, -1))
            mask_flatten.append(None if mask is None else mask.flatten(1))

        feat_flatten = torch.cat(feat_flatten, 1)
        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
        mask_flatten = (None if mask_flatten[0] is None else torch.cat(
            mask_flatten, 1))

        # (num_level, 2)
        spatial_shapes = torch.cat(spatial_shapes).view(-1, 2)
        # (num_level, ): offset of each level inside the flattened sequence.
        level_start_index = torch.cat((spatial_shapes.new_zeros(
            (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))

        if mlvl_masks[0] is None:
            valid_ratios = mlvl_feats[0].new_ones(batch_size, len(mlvl_feats),
                                                  2)
        else:
            # (bs, num_level, 2)
            valid_ratios = torch.stack(
                [self.get_valid_ratio(m) for m in mlvl_masks], 1)

        shared_inputs = dict(
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            valid_ratios=valid_ratios)
        encoder_inputs_dict = dict(
            feat=feat_flatten,
            feat_mask=mask_flatten,
            feat_pos=lvl_pos_embed_flatten,
            **shared_inputs)
        decoder_inputs_dict = dict(memory_mask=mask_flatten, **shared_inputs)
        return encoder_inputs_dict, decoder_inputs_dict

    def forward_encoder(self, feat: Tensor, feat_mask: Tensor,
                        feat_pos: Tensor, spatial_shapes: Tensor,
                        level_start_index: Tensor,
                        valid_ratios: Tensor) -> Dict:
        """Run the Transformer encoder on the flattened features.

        Second stage of the pipeline
        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'.

        Args:
            feat (Tensor): Flattened features, passed to the encoder as
                ``query``.
            feat_mask (Tensor): Padding mask of the features, passed to the
                encoder as ``key_padding_mask``.
            feat_pos (Tensor): Positional embeddings of the features, passed
                to the encoder as ``query_pos``.
            spatial_shapes (Tensor): Spatial shapes of all feature levels.
            level_start_index (Tensor): Start index of each level in the
                flattened sequence.
            valid_ratios (Tensor): Valid width/height ratios per level.

        Returns:
            dict: Holds 'memory' (the encoder output), 'memory_mask' (the
            unchanged ``feat_mask``) and 'spatial_shapes'.
        """
        encoder_kwargs = dict(
            query=feat,
            query_pos=feat_pos,
            key_padding_mask=feat_mask,  # used by self-attention
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            valid_ratios=valid_ratios)
        return dict(
            memory=self.encoder(**encoder_kwargs),
            memory_mask=feat_mask,
            spatial_shapes=spatial_shapes)

    def pre_decoder(self, memory: Tensor, memory_mask: Tensor,
                    spatial_shapes: Tensor) -> Tuple[Dict, Dict]:
        """Build `query`, `query_pos` and `reference_points` for the decoder.

        Third stage of the pipeline
        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'.

        Args:
            memory (Tensor): Encoder output, unpacked here as
                (bs, num_feat_points, dim).
            memory_mask (Tensor): Padding mask of the memory. Only used when
                ``self.as_two_stage`` is true.
            spatial_shapes (Tensor): Spatial shapes of all feature levels.
                Only used when ``self.as_two_stage`` is true.

        Returns:
            tuple[dict, dict]: ``(decoder_inputs_dict, head_inputs_dict)``.

            - decoder_inputs_dict (dict): Holds 'query', 'query_pos',
              'memory' and 'reference_points'. In the two-stage case the
              reference points are gathered from 4-channel box predictions;
              otherwise they come from ``self.reference_points_fc``.
            - head_inputs_dict (dict): Holds 'enc_outputs_class' and
              'enc_outputs_coord' (both None when not two-stage) while
              training, and is empty otherwise.
        """
        batch_size, _, c = memory.shape
        enc_outputs_class = None
        enc_outputs_coord = None

        if not self.as_two_stage:
            # One-stage: queries are learned embeddings shared by the batch.
            learned_embed = self.query_embedding.weight
            query_pos, query = torch.split(learned_embed, c, dim=1)
            query_pos = query_pos.unsqueeze(0).expand(batch_size, -1, -1)
            query = query.unsqueeze(0).expand(batch_size, -1, -1)
            reference_points = self.reference_points_fc(query_pos).sigmoid()
        else:
            output_memory, output_proposals = \
                self.gen_encoder_output_proposals(
                    memory, memory_mask, spatial_shapes)
            # The branch stored right after the decoder-layer branches is
            # the one applied to the encoder output.
            enc_branch_idx = self.decoder.num_layers
            enc_outputs_class = self.bbox_head.cls_branches[enc_branch_idx](
                output_memory)
            enc_outputs_coord_unact = self.bbox_head.reg_branches[
                enc_branch_idx](output_memory) + output_proposals
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()

            # Proposals are ranked by channel 0 of the encoder
            # classification output only; the other channels do not
            # influence the selection.
            topk_proposals = torch.topk(
                enc_outputs_class[..., 0], self.num_queries, dim=1).indices
            gather_index = topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
            # Detached: the selected boxes act as fixed decoder inputs.
            topk_coords_unact = torch.gather(
                enc_outputs_coord_unact, 1, gather_index).detach()
            reference_points = topk_coords_unact.sigmoid()

            pos_trans_out = self.pos_trans_norm(
                self.pos_trans_fc(
                    self.get_proposal_pos_embed(topk_coords_unact)))
            query_pos, query = torch.split(pos_trans_out, c, dim=2)

        decoder_inputs_dict = dict(
            query=query,
            query_pos=query_pos,
            memory=memory,
            reference_points=reference_points)
        if self.training:
            head_inputs_dict = dict(
                enc_outputs_class=enc_outputs_class,
                enc_outputs_coord=enc_outputs_coord)
        else:
            head_inputs_dict = dict()
        return decoder_inputs_dict, head_inputs_dict

    def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor,
                        memory_mask: Tensor, reference_points: Tensor,
                        spatial_shapes: Tensor, level_start_index: Tensor,
                        valid_ratios: Tensor) -> Dict:
        """Run the Transformer decoder.

        Final stage of the pipeline
        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'.

        Args:
            query (Tensor): Decoder input queries.
            query_pos (Tensor): Positional queries of the decoder input.
            memory (Tensor): Encoder output, passed to the decoder as
                ``value``.
            memory_mask (Tensor): Padding mask of the memory, passed to the
                decoder as ``key_padding_mask``.
            reference_points (Tensor): Initial reference points.
            spatial_shapes (Tensor): Spatial shapes of all feature levels.
            level_start_index (Tensor): Start index of each level in the
                flattened sequence.
            valid_ratios (Tensor): Valid width/height ratios per level.

        Returns:
            dict: Holds 'hidden_states' (first decoder output) and
            'references', a list made of the initial ``reference_points``
            followed by the intermediate references the decoder returned.
        """
        # Regression branches are only handed over when box refinement is on.
        refine_branches = (
            self.bbox_head.reg_branches if self.with_box_refine else None)
        inter_states, inter_references = self.decoder(
            query=query,
            value=memory,
            query_pos=query_pos,
            key_padding_mask=memory_mask,
            reference_points=reference_points,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            valid_ratios=valid_ratios,
            reg_branches=refine_branches)
        return dict(
            hidden_states=inter_states,
            references=[reference_points, *inter_references])

      
    def forward_ori_model(
            self,
            img_feats: Tuple[Tensor],
            batch_data_samples: OptSampleList = None,
        ):
        """Run the transformer of the original model on ``img_feats``.

        Args:
            img_feats (tuple[Tensor]): Features passed to the original
                model's ``pre_transformer``.
            batch_data_samples (list[:obj:`DetDataSample`], optional): Batch
                data samples, forwarded to ``pre_transformer``. Defaults to
                None.

        Returns:
            dict: Whatever the original model's ``pre_decoder`` returned as
            head inputs, plus 'aux_query' and 'aux_reference' (clones of the
            decoder's input query and reference points), plus the decoder
            outputs with 'hidden_states' and 'references' renamed to
            'ori_hidden_states' and 'ori_references'.
        """
        ori_model = self.ori_model
        encoder_inputs_dict, decoder_inputs_dict = ori_model.pre_transformer(
            img_feats, batch_data_samples)
        encoder_outputs_dict = ori_model.forward_encoder(
            **encoder_inputs_dict)

        tmp_dec_in, head_inputs_dict = ori_model.pre_decoder(
            **encoder_outputs_dict)
        decoder_inputs_dict.update(tmp_dec_in)

        # Always recorded, whatever ``incre_cfg.query_incre.type`` is.
        head_inputs_dict['aux_query'] = tmp_dec_in['query'].clone()
        head_inputs_dict['aux_reference'] = \
            tmp_dec_in['reference_points'].clone()

        head_inputs_dict.update(
            ori_model.forward_decoder(**decoder_inputs_dict))

        # Rename the decoder outputs so they cannot collide with the
        # outputs of the model being trained.
        renames = (('hidden_states', 'ori_hidden_states'),
                   ('references', 'ori_references'))
        for old_key, new_key in renames:
            head_inputs_dict[new_key] = head_inputs_dict.pop(old_key)

        return head_inputs_dict

    def loss(self, batch_inputs: Tensor,
             batch_data_samples: SampleList) -> Union[dict, list]:
        """Compute the losses from this model and the frozen original model.

        The original model is run under ``torch.no_grad()``; its decoder
        outputs, head predictions and (optionally) pseudo labels are passed
        to ``self.bbox_head.loss`` alongside this model's transformer
        outputs.

        Args:
            batch_inputs (Tensor): Input batch, passed to ``extract_feat``
                of both the original model and this model.
            batch_data_samples (list[:obj:`DetDataSample`]): Batch data
                samples.

        Returns:
            dict | list: The value returned by ``self.bbox_head.loss``.
        """
        ori_model = self.ori_model
        with torch.no_grad():
            ori_img_features = ori_model.extract_feat(batch_inputs)
            ori_head_inputs_dict = self.forward_ori_model(
                ori_img_features, batch_data_samples)

            ori_cls_scores, ori_bbox_preds = ori_model.bbox_head(
                ori_head_inputs_dict['ori_hidden_states'],
                ori_head_inputs_dict['ori_references'])
            ori_head_inputs_dict['all_layers_ori_cls_scores'] = ori_cls_scores
            ori_head_inputs_dict['all_layers_ori_bbox_preds'] = ori_bbox_preds

            # Pseudo labels are generated only for the two pseudo-label
            # strategies; otherwise all three entries stay None.
            topk_query = None
            batch_pseudo_instances = None
            batch_all_instances = None
            if self.incre_cfg.label_incre.type in ('topk_pseudo',
                                                   'threshold_pseudo'):
                topk_query, batch_pseudo_instances, batch_all_instances = \
                    self.bbox_head.generate_pseudo_label(
                        ori_cls_scores, ori_bbox_preds, batch_data_samples)

            ori_head_inputs_dict['batch_pseudo_instances'] = \
                batch_pseudo_instances
            ori_head_inputs_dict['batch_all_instances'] = batch_all_instances
            ori_head_inputs_dict['ori_topk_query'] = topk_query

        # NOTE(review): ``aux_dict`` is built but never consumed below; the
        # lookups raise KeyError when the original model did not provide
        # the encoder outputs. Confirm whether it should be forwarded.
        aux_dict = None
        if self.incre_cfg.query_incre.type == 'seperate_queryinit':
            aux_dict = dict(
                aux_query=ori_head_inputs_dict['aux_query'],
                aux_enc_coord=ori_head_inputs_dict['enc_outputs_coord'],
                aux_enc_score=ori_head_inputs_dict['enc_outputs_class'],
                batch_pseudo_instances=batch_pseudo_instances)

        img_feats = self.extract_feat(batch_inputs)
        head_inputs_dict = self.forward_transformer(img_feats,
                                                    batch_data_samples)

        # Drop entries of the original model that would clash with the
        # keyword arguments coming from this model's own head inputs.
        ori_head_inputs_dict.pop('dn_meta', None)
        if 'enc_outputs_class' in ori_head_inputs_dict:
            del ori_head_inputs_dict['enc_outputs_class']
            del ori_head_inputs_dict['enc_outputs_coord']

        return self.bbox_head.loss(
            **head_inputs_dict,
            **ori_head_inputs_dict,
            batch_data_samples=batch_data_samples)
