{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Args:\n",
    "    def __init__(self, mm_vision_select_layer, version):\n",
    "        self.mm_vision_select_layer = mm_vision_select_layer\n",
    "        self.version = version\n",
    "\n",
    "args = Args(mm_vision_select_layer=-2, version = '/home/MaTianren/LaVIT-7B-v2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "device_id = 0\n",
    "torch.cuda.set_device(device_id)\n",
    "device = torch.device('cuda')\n",
    "\n",
    "vision_tower_cfg=args\n",
    "\n",
    "vision_tower = '/home/MaTianren/LaVIT-7B-v2'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from clip_encoder_old import CLIPVisionTower\n",
    "clip = CLIPVisionTower(vision_tower, args=vision_tower_cfg)\n",
    "clip.load_model()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clip.vision_tower\n",
    "# clip.forward()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cfg = torch.load('/home/MaTianren/Workspace/LLaVA-15/clipconfig')\n",
    "cfg.hidden_size = 1408\n",
    "print(cfg.hidden_size)\n",
    "torch.save(cfg, '/home/MaTianren/Workspace/LLaVA-15/clipconfig_fake')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image\n",
    "from torchvision import transforms\n",
    "\n",
    "# Define a transformation to preprocess the image\n",
    "# For loading, you might need to resize, convert to tensor, normalize, etc.\n",
    "transform = transforms.Compose([\n",
    "    transforms.Resize((224, 224)),  # Resize the image to (224, 224)\n",
    "    transforms.ToTensor(),          # Convert the image to a PyTorch tensor\n",
    "    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize the image\n",
    "])\n",
    "\n",
    "# Load the image from file\n",
    "image = Image.open('/home/MaTianren/Workspace/LaTokenizer/demo/qa_image.jpg')  # Replace \"example.jpg\" with the path to your image file\n",
    "\n",
    "# Apply the transformation to preprocess the image\n",
    "preprocessed_image = transform(image)\n",
    "\n",
    "# Add batch dimension as PyTorch models usually operate on batches\n",
    "preprocessed_image = preprocessed_image.unsqueeze(0)\n",
    "\n",
    "image_tensor = preprocessed_image.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clip_feat = clip.forward(image_tensors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clip_feat.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----------------------------------------"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Args:\n",
    "    def __init__(self, mm_vision_select_layer, version):\n",
    "        self.mm_vision_select_layer = mm_vision_select_layer\n",
    "        self.version = version\n",
    "\n",
    "args = Args(mm_vision_select_layer=-2, version = '/home/MaTianren/LaVIT-7B-v2')\n",
    "\n",
    "import torch\n",
    "device_id = 0\n",
    "torch.cuda.set_device(device_id)\n",
    "device = torch.device('cuda')\n",
    "\n",
    "vision_tower_cfg=args\n",
    "\n",
    "vision_tower = '/home/MaTianren/LaVIT-7B-v2'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/MaTianren/anaconda3/envs/llava/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Please 'pip install apex'\n",
      "Loading LaVIT Model Weight from /home/MaTianren/LaVIT-7B-v2, model precision: bf16\n",
      "The Visual Vocab Size is 16384\n",
      "Load eva vitG weight from /home/MaTianren/LaVIT-7B-v2/visual_tokenizer/eva_vitg_psz14.bin\n",
      "Load visual tokenizer encoder weight from /home/MaTianren/LaVIT-7B-v2/visual_tokenizer/tokenizer_encoder.bin\n",
      "cpu\n",
      "cuda:0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "D_CLIPVisionTower(\n",
       "  (vision_tower): VQClip(\n",
       "    (visual_tokenizer): DynamicVisualTokenizer(\n",
       "      (encoder): EVAVisionTransformer(\n",
       "        (patch_embed): PatchEmbed(\n",
       "          (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))\n",
       "        )\n",
       "        (pos_drop): Dropout(p=0.0, inplace=False)\n",
       "        (blocks): ModuleList(\n",
       "          (0-39): 40 x Block(\n",
       "            (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)\n",
       "            (attn): Attention(\n",
       "              (qkv): Linear(in_features=1408, out_features=4224, bias=False)\n",
       "              (attn_drop): Dropout(p=0.0, inplace=False)\n",
       "              (inner_attn_ln): Identity()\n",
       "              (proj): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "              (proj_drop): Dropout(p=0.0, inplace=False)\n",
       "            )\n",
       "            (drop_path): Identity()\n",
       "            (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)\n",
       "            (mlp): Mlp(\n",
       "              (fc1): Linear(in_features=1408, out_features=6144, bias=True)\n",
       "              (act): GELU(approximate='none')\n",
       "              (ffn_ln): Identity()\n",
       "              (fc2): Linear(in_features=6144, out_features=1408, bias=True)\n",
       "              (drop): Dropout(p=0.0, inplace=False)\n",
       "            )\n",
       "          )\n",
       "        )\n",
       "        (norm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)\n",
       "        (head): Linear(in_features=1408, out_features=1024, bias=True)\n",
       "        (patch_dropout): Identity()\n",
       "      )\n",
       "      (token_predictor): TokenPredictor(\n",
       "        (in_conv): Sequential(\n",
       "          (0): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "          (1): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "          (2): GELU(approximate='none')\n",
       "        )\n",
       "        (out_conv): Sequential(\n",
       "          (0): Linear(in_features=1408, out_features=704, bias=True)\n",
       "          (1): GELU(approximate='none')\n",
       "          (2): Linear(in_features=704, out_features=352, bias=True)\n",
       "          (3): GELU(approximate='none')\n",
       "          (4): Linear(in_features=352, out_features=2, bias=True)\n",
       "          (5): LogSoftmax(dim=-1)\n",
       "        )\n",
       "      )\n",
       "      (causal_encoder): TokenMerger(\n",
       "        (blocks): ModuleList(\n",
       "          (0-11): 12 x CausalFuserBlock(\n",
       "            (norm0): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "            (token_causal_attn): TokenCausalAttention(\n",
       "              (qkv): Linear(in_features=1408, out_features=4224, bias=True)\n",
       "              (attn_drop): Dropout(p=0.0, inplace=False)\n",
       "              (proj): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "              (proj_drop): Dropout(p=0.0, inplace=False)\n",
       "            )\n",
       "            (norm1): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "            (token_cross_attn): TokenCrossAttention(\n",
       "              (query): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "              (key): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "              (value): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "              (attn_drop): Dropout(p=0.0, inplace=False)\n",
       "              (proj): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "              (proj_drop): Dropout(p=0.0, inplace=False)\n",
       "            )\n",
       "            (norm2): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "            (mlp): Mlp(\n",
       "              (fc1): Linear(in_features=1408, out_features=6144, bias=True)\n",
       "              (act): GELU(approximate='none')\n",
       "              (fc2): Linear(in_features=6144, out_features=1408, bias=True)\n",
       "              (drop): Dropout(p=0.0, inplace=False)\n",
       "            )\n",
       "          )\n",
       "        )\n",
       "        (ln_vision): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "        (norm): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "      )\n",
       "      (quantize): VectorQuantizer(\n",
       "        (embedding): CodebookEmbedding()\n",
       "      )\n",
       "      (encode_task_layer): Sequential(\n",
       "        (0): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "        (1): Tanh()\n",
       "        (2): Linear(in_features=1408, out_features=32, bias=True)\n",
       "      )\n",
       "      (vit_proj): Linear(in_features=1408, out_features=4096, bias=True)\n",
       "    )\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from clip_encoder import D_CLIPVisionTower\n",
    "\n",
    "clip_fake = D_CLIPVisionTower(vision_tower, args=vision_tower_cfg)\n",
    "clip_fake.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_visual_tokenizer = getattr(clip_fake.vision_tower, 'visual_tokenizer', None)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DynamicVisualTokenizer(\n",
       "  (encoder): EVAVisionTransformer(\n",
       "    (patch_embed): PatchEmbed(\n",
       "      (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))\n",
       "    )\n",
       "    (pos_drop): Dropout(p=0.0, inplace=False)\n",
       "    (blocks): ModuleList(\n",
       "      (0-39): 40 x Block(\n",
       "        (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)\n",
       "        (attn): Attention(\n",
       "          (qkv): Linear(in_features=1408, out_features=4224, bias=False)\n",
       "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
       "          (inner_attn_ln): Identity()\n",
       "          (proj): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "          (proj_drop): Dropout(p=0.0, inplace=False)\n",
       "        )\n",
       "        (drop_path): Identity()\n",
       "        (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)\n",
       "        (mlp): Mlp(\n",
       "          (fc1): Linear(in_features=1408, out_features=6144, bias=True)\n",
       "          (act): GELU(approximate='none')\n",
       "          (ffn_ln): Identity()\n",
       "          (fc2): Linear(in_features=6144, out_features=1408, bias=True)\n",
       "          (drop): Dropout(p=0.0, inplace=False)\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (norm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)\n",
       "    (head): Linear(in_features=1408, out_features=1024, bias=True)\n",
       "    (patch_dropout): Identity()\n",
       "  )\n",
       "  (token_predictor): TokenPredictor(\n",
       "    (in_conv): Sequential(\n",
       "      (0): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "      (1): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "      (2): GELU(approximate='none')\n",
       "    )\n",
       "    (out_conv): Sequential(\n",
       "      (0): Linear(in_features=1408, out_features=704, bias=True)\n",
       "      (1): GELU(approximate='none')\n",
       "      (2): Linear(in_features=704, out_features=352, bias=True)\n",
       "      (3): GELU(approximate='none')\n",
       "      (4): Linear(in_features=352, out_features=2, bias=True)\n",
       "      (5): LogSoftmax(dim=-1)\n",
       "    )\n",
       "  )\n",
       "  (causal_encoder): TokenMerger(\n",
       "    (blocks): ModuleList(\n",
       "      (0-11): 12 x CausalFuserBlock(\n",
       "        (norm0): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "        (token_causal_attn): TokenCausalAttention(\n",
       "          (qkv): Linear(in_features=1408, out_features=4224, bias=True)\n",
       "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
       "          (proj): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "          (proj_drop): Dropout(p=0.0, inplace=False)\n",
       "        )\n",
       "        (norm1): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "        (token_cross_attn): TokenCrossAttention(\n",
       "          (query): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "          (key): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "          (value): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
       "          (proj): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "          (proj_drop): Dropout(p=0.0, inplace=False)\n",
       "        )\n",
       "        (norm2): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "        (mlp): Mlp(\n",
       "          (fc1): Linear(in_features=1408, out_features=6144, bias=True)\n",
       "          (act): GELU(approximate='none')\n",
       "          (fc2): Linear(in_features=6144, out_features=1408, bias=True)\n",
       "          (drop): Dropout(p=0.0, inplace=False)\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (ln_vision): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "    (norm): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "  )\n",
       "  (quantize): VectorQuantizer(\n",
       "    (embedding): CodebookEmbedding()\n",
       "  )\n",
       "  (encode_task_layer): Sequential(\n",
       "    (0): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "    (1): Tanh()\n",
       "    (2): Linear(in_features=1408, out_features=32, bias=True)\n",
       "  )\n",
       "  (vit_proj): Linear(in_features=1408, out_features=4096, bias=True)\n",
       ")"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_visual_tokenizer.tokenize_features_mirage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image\n",
    "from torchvision import transforms\n",
    "\n",
    "# Define a transformation to preprocess the image\n",
    "# For loading, you might need to resize, convert to tensor, normalize, etc.\n",
    "transform = transforms.Compose([\n",
    "    transforms.Resize((224, 224)),  # Resize the image to (224, 224)\n",
    "    transforms.ToTensor(),          # Convert the image to a PyTorch tensor\n",
    "    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize the image\n",
    "])\n",
    "\n",
    "# Load the image from file\n",
    "image = Image.open('/home/MaTianren/Workspace/LaTokenizer/demo/qa_image.jpg')  # Replace \"example.jpg\" with the path to your image file\n",
    "\n",
    "# Apply the transformation to preprocess the image\n",
    "preprocessed_image = transform(image)\n",
    "\n",
    "# Add batch dimension as PyTorch models usually operate on batches\n",
    "preprocessed_image = preprocessed_image.unsqueeze(0)\n",
    "\n",
    "image_tensor = preprocessed_image.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.float32"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clip_fake.vision_tower.visual_tokenizer.quantize.embedding.weight.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "with clip_fake.vision_tower.maybe_autocast():\n",
    "    embed_ind_list, quantize = clip_fake.vision_tower.visual_tokenizer.tokenize_image(image_tensor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "with clip_fake.vision_tower.maybe_autocast():\n",
    "    image_feat1,_ = clip_fake.vision_tower.visual_tokenizer.encode_features_noproj(image_tensor)\n",
    "\n",
    "with clip_fake.vision_tower.maybe_autocast():\n",
    "    image_feat2,_ = clip_fake.vision_tower.visual_tokenizer.encode_features_noproj(image_tensor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "image_feat = [image_feat1[0],image_feat2[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp = my_visual_tokenizer.tokenize_features_mirage(image_feat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([90])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tmp[0].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([92])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tmp[1].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([89])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embed_ind_list[0].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([89, 32])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "quantize.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp = torch.tensor([2333]).to(device)\n",
    "tmp_d = clip_fake.vision_tower.visual_tokenizer.quantize.embedding(tmp)  ###[1,32]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1, 32])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tmp_d.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "feat_vq_proj = clip_fake.vision_tower.visual_tokenizer.encode_task_layer(image_feat[0].to(torch.bfloat16))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([32])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feat_vq_proj[0].shape"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "debut",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
