import torch
import numpy as np
from PIL import Image
from omegaconf import OmegaConf
from torchvision import transforms
from autoencoder.ldm.models.dino_decoder import DinoDecoder

class DINO_DECODER:
    """Inference wrapper around ``DinoDecoder``.

    Builds the model from a configuration object, moves it to the GPU in eval
    mode, and exposes helpers to preprocess PIL images, encode image batches
    into latent feature maps and decode latents back into ``uint8`` images.
    """

    def __init__(self, config, img_size=256, horizon_flip=0.5, fp16=True):
        """Store the settings and build the model.

        Args:
            config: Configuration object exposing ``ckpt_path`` and
                ``model.params`` (``embed_dim``, ``ddconfig``, ``dinoconfig``,
                ``lossconfig``, ``extra_vit_config``) through attribute access.
            img_size: Default side length used by ``img_transform``.
            horizon_flip: Horizontal-flip probability. It is only stored;
                ``img_transform`` takes its own ``p_hflip`` argument.
            fp16: Only stored on the instance; nothing in this class applies
                half precision.
        """
        self.config = config
        print(f'self.config: {self.config}')
        self.embed_dim = self.config.model.params.embed_dim
        self.ckpt_path = self.config.ckpt_path
        self.img_size = img_size
        self.horizon_flip = horizon_flip
        self.fp16 = fp16
        self.load()

    def load(self):
        """Instantiate ``DinoDecoder`` on the GPU in eval mode.

        Returns:
            DINO_DECODER: ``self``, so the call can be chained.
        """
        params = self.config.model.params
        self.model = DinoDecoder(
            ddconfig=params.ddconfig,
            dinoconfig=params.dinoconfig,
            lossconfig=params.lossconfig,
            embed_dim=self.embed_dim,
            ckpt_path=self.ckpt_path,
            extra_vit_config=params.extra_vit_config,
        ).cuda().eval()
        # ``encode_images`` reads this flag. Prefer the model's own switch when
        # it has one, otherwise fall back to whether an ``extra_vit`` exists.
        # NOTE(review): ``use_extra_vit`` on the model is assumed -- confirm
        # against DinoDecoder.
        has_extra_vit = getattr(self.model, "extra_vit", None) is not None
        self.use_extra_vit = bool(
            getattr(self.model, "use_extra_vit", has_extra_vit)
        )
        return self

    def img_transform(self, p_hflip=0, img_size=None):
        """Build the PIL-to-tensor preprocessing pipeline.

        The pipeline center-crops to a square, optionally flips horizontally,
        converts to a tensor and normalizes each channel with mean 0.5 and
        std 0.5, i.e. ``ToTensor``'s [0, 1] output becomes [-1, 1].

        Args:
            p_hflip: Probability of a horizontal flip.
            img_size: Target side length; falls back to ``self.img_size``
                when ``None``.

        Returns:
            transforms.Compose: The composed transform.
        """
        img_size = img_size if img_size is not None else self.img_size
        img_transforms = [
            transforms.Lambda(lambda pil_image: center_crop_arr(pil_image, img_size)),
            transforms.RandomHorizontalFlip(p=p_hflip),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
        ]
        return transforms.Compose(img_transforms)

    def encode_images(self, images, mask_ratio=-1):
        """Encode a batch of images into a channel-first latent feature map.

        Args:
            images: Image batch shaped ``[B, 3, H, W]`` (the normalization
                constants are viewed as ``(1, 3, 1, 1)``). Values are rescaled
                with ``(x + 1) / 2``, so the input is presumably in [-1, 1] as
                produced by ``img_transform``. ``H`` and ``W`` are divided by
                the hard-coded patch size 16.
            mask_ratio: Per-sample probability of replacing the whole
                extra-ViT feature with ``self.model.mask_token``. Values
                ``<= 0`` (the default) disable masking. Only used when
                ``self.use_extra_vit`` is true.

        Returns:
            torch.Tensor: Features shaped ``[B, C, H // 16, W // 16]``.
        """
        with torch.no_grad():
            images = (images + 1.0) / 2.0  # Scale to [0, 1]
            # ImageNet channel statistics.
            mean = torch.tensor([0.485, 0.456, 0.406], device=images.device).view(1, 3, 1, 1)
            std = torch.tensor([0.229, 0.224, 0.225], device=images.device).view(1, 3, 1, 1)
            images = (images - mean) / std

            # ``is_training=True`` is passed so the encoder returns its feature
            # dict (presumably a DINOv2-style backbone -- confirm).
            h = self.model.encoder(images, is_training=True)['x_norm_patchtokens']
            # Patch tokens are [B, N, D]; move channels first -> [B, D, N].
            h = h.permute(0, 2, 1)

            if self.use_extra_vit:
                # NOTE(review): assumes the extra ViT takes the same normalized
                # input as the encoder and returns [B, D', N] -- confirm
                # against DinoDecoder.encode.
                h_vit = self.model.extra_vit(images)
                if mask_ratio > 0:
                    B, D, N = h_vit.shape

                    # One flag per sample; 1 means the whole sample is
                    # replaced by the mask token.
                    mask_flags = (torch.rand(B, device=h.device) < mask_ratio).float().view(B, 1, 1)  # [B,1,1]

                    # Broadcast the mask token to the feature shape.
                    mask_token_exp = self.model.mask_token.expand(B, D, N)  # [B,D,N]

                    # Flagged samples take the mask token, others keep their
                    # original features.
                    h_vit = h_vit * (1 - mask_flags) + mask_token_exp * mask_flags

                # Append the extra-ViT channels to the encoder channels.
                h = torch.cat([h, h_vit], dim=1)

            # Unflatten the token axis into the patch grid.
            h = h.view(h.shape[0], -1, images.shape[2] // 16, images.shape[3] // 16).contiguous()

        return h

    def decode_to_images(self, z):
        """Decode latent representations into ``uint8`` images.

        Args:
            z: Latent tensor; it is moved to the GPU before decoding.

        Returns:
            np.ndarray: ``uint8`` array with the channel axis moved last
            (``[B, H, W, C]``, assuming the decoder outputs ``[B, C, H, W]``).
        """
        with torch.no_grad():
            images = self.model.decode(z.cuda())
            # Scale by 127.5 and offset by 128, then clamp to the byte range;
            # the uint8 cast truncates.
            images = torch.clamp(127.5 * images + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()
        return images

def center_crop_arr(pil_image, image_size):
    """
    Resize and center-crop a PIL image to a square (implementation from ADM).

    The image is halved with a box filter while its shorter side is at least
    twice ``image_size``, then resized with bicubic interpolation so the
    shorter side matches ``image_size``, and finally cropped around the center.

    Args:
        pil_image: Source PIL image.
        image_size: Side length of the square output.

    Returns:
        PIL.Image.Image: The ``image_size`` x ``image_size`` crop.
    """
    width, height = pil_image.size
    halving_threshold = 2 * image_size

    # Coarse stage: repeated 2x box downsampling.
    while min(width, height) >= halving_threshold:
        pil_image = pil_image.resize((width // 2, height // 2), resample=Image.BOX)
        width, height = pil_image.size

    # Fine stage: bring the shorter side to exactly image_size.
    ratio = image_size / min(width, height)
    target_size = (round(width * ratio), round(height * ratio))
    pil_image = pil_image.resize(target_size, resample=Image.BICUBIC)

    # Array rows are the vertical axis, columns the horizontal axis.
    pixels = np.array(pil_image)
    top = (pixels.shape[0] - image_size) // 2
    left = (pixels.shape[1] - image_size) // 2
    window = pixels[top: top + image_size, left: left + image_size]
    return Image.fromarray(window)

