import torch
from torch import nn
import numpy as np
from torch_xla.utils.checkpoint import checkpoint
from huggingface_hub import hf_hub_download
import torch.nn.functional as F
from .base_encoder import ProcessorWrapper

from torchvision.transforms.functional import resize, to_pil_image  # type: ignore
from .base_encoder import BaseVisionTower


from .ijepa.vision_transformer import vit_huge, vit_giant

from torchvision import transforms


class IJepaVisionTower(BaseVisionTower):

    def _post_init(self):
        # extract image resolution from model name
        if self.vision_tower_name.startswith("ijepa"):
            self._image_size = 224

        if not self.delay_load:
            self.load_model()


    def load_model(self):
        self.vision_model = "ijepa"

        if self.vision_tower_name.lower()=="ijepa-vit-h-14":
            self.vision_tower = vit_huge()
            ckpt = torch.load("/mnt/disks/storage/vision_ckpts/ijepa/IN22K-vit.h.14-900e.pth.tar", map_location=torch.device('cpu'))
            pretrained_dict = ckpt['encoder']


        elif self.vision_tower_name.lower()=="ijepa-vit-g-16":
            self.vision_tower = vit_giant()
            ckpt = torch.load("/mnt/disks/storage/vision_ckpts/ijepa/IN22K-vit.g.16-600e.pth.tar", map_location=torch.device('cpu'))
            pretrained_dict = ckpt['encoder']

        else:
            raise ValueError(f'Unknown vision tower: {self.vision_tower_name}')

        # Load the pre-trained weights into the vision tower model
        pretrained_dict = {k.replace("module.", ""): v for k, v in pretrained_dict.items()}
        
        self.vision_tower.load_state_dict(pretrained_dict)
        
        self.vision_tower.output_tokens = True

        self._hidden_size = self.vision_tower.embed_dim
        self._image_size = self.vision_tower.patch_embed.img_size
        self._patch_size = self.vision_tower.patch_embed.patch_size
        #print(self._image_size, self._patch_size)
        preprocess = transforms.Compose([
            transforms.Resize(256),            # Resize the image to 256x256 pixels
            transforms.CenterCrop(224),        # Crop the center 224x224 pixels
            transforms.ToTensor(),             # Convert the image to a PyTorch tensor
            transforms.Normalize(mean=[0.485, 0.456, 0.406],  # Normalize the tensor
                                std=[0.229, 0.224, 0.225])
        ])

        self.image_processor = ProcessorWrapper(preprocess, height=self._image_size, width=self._image_size)

        self.vision_tower.requires_grad_(False)
        self.is_loaded = True

    def _forward(self, images):
        with torch.set_grad_enabled(self.unfreeze_mm_vision_tower):
            image_features = self.vision_tower.forward(images.to(device=self.device, dtype=self.dtype))

            return image_features
    @property
    def patch_size(self):
        return self._patch_size 
    
    @property
    def image_size(self):
        return self._image_size
    
    @property
    def image_token_len(self):
        return (self.image_size // self.patch_size) ** 2

