{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "47b7df12176b4d0591155f4a80b10bfa",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Model class and matching processor for PaliGemma.\n",
    "from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor\n",
    "\n",
    "# Despite its name, `model_id` is a local checkpoint directory, not a Hub identifier.\n",
    "model_id = \"/inspire/ssd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/public/yjc/cunxin/pi0_suite/checkpoints/paligemma-3b-ft-nlvr2-224\"\n",
    "\n",
    "# The processor and the model are both loaded from that same directory.\n",
    "processor = PaliGemmaProcessor.from_pretrained(model_id)\n",
    "model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, device_map=\"auto\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Which figure is eggplant?\n",
      "True\n"
     ]
    }
   ],
   "source": [
    "from PIL import Image\n",
    "\n",
    "# Question asked about the image pair; kept free of placeholders so it can be reused as-is.\n",
    "prompt = \"Which figure is eggplant?\"\n",
    "eggplant = Image.open(\"/inspire/ssd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/public/yjc/cunxin/pi0_suite/open-pi-zero/eggplant__conf0.65.jpg\")\n",
    "basket = Image.open(\"/inspire/ssd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/public/yjc/cunxin/pi0_suite/open-pi-zero/yellow_basket__conf0.17.jpg\")\n",
    "\n",
    "# Both images belong to the single text sample, in this order.\n",
    "pair_images = [basket, eggplant]\n",
    "# Prefix one `<image>` placeholder per image, as PaliGemmaProcessor recommends, instead of\n",
    "# letting it infer the count (which logs a warning on every call).\n",
    "prompt_with_images = \"<image>\" * len(pair_images) + prompt\n",
    "\n",
    "# The model was loaded with `device_map` set to auto, so move the processor output to the\n",
    "# model's device before generating.\n",
    "inputs = processor(images=pair_images, text=[prompt_with_images], return_tensors=\"pt\").to(model.device)\n",
    "\n",
    "output = model.generate(**inputs, max_new_tokens=20)\n",
    "# The decoded sequence is the prompt followed by the generated answer.\n",
    "print(processor.decode(output[0], skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3305575/537426921.py:2: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
      "  ckpt = torch.load(\"/inspire/ssd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/public/yjc/cunxin/pi0_suite/checkpoints/bridge_beta_step19296_2024-12-26_22-30_42.pt\")\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "# weights_only=True limits what unpickling may execute, so a tampered checkpoint cannot run\n",
    "# arbitrary code on load; it also removes the FutureWarning torch emits for the old default.\n",
    "# map_location keeps the loaded tensors on the CPU instead of the device they were saved from.\n",
    "ckpt = torch.load(\n",
    "    \"/inspire/ssd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/public/yjc/cunxin/pi0_suite/checkpoints/bridge_beta_step19296_2024-12-26_22-30_42.pt\",\n",
    "    map_location=\"cpu\",\n",
    "    weights_only=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "odict_keys(['embed_tokens.weight', 'vision_tower.vision_model.embeddings.patch_embedding.weight', 'vision_tower.vision_model.embeddings.patch_embedding.bias', 'vision_tower.vision_model.embeddings.position_embedding.weight', 'vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.0.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.0.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.0.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.0.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.1.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.1.layer_norm1.bias', 
'vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.1.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.1.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.2.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.2.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.2.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.2.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias', 
'vision_tower.vision_model.encoder.layers.3.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.3.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.3.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.3.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.4.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.4.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.4.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.4.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias', 
'vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.5.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.5.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.5.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.5.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.6.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.6.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.6.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.6.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias', 
'vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.7.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.7.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.7.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.7.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.8.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.8.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.8.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.8.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias', 
'vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.9.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.9.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.9.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.9.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.10.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.10.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.10.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.10.layer_norm2.bias', 
'vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.11.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.11.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.11.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.11.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.12.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.12.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight', 
'vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.12.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.12.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.13.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.13.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.13.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.13.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.14.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.14.layer_norm1.bias', 
'vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.14.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.14.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.15.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.15.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.15.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.15.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight', 
'vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.16.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.16.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.16.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.16.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.17.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.17.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.17.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.17.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight', 
'vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.18.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.18.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.18.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.18.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.19.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.19.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.19.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.19.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight', 
'vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.20.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.20.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.20.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.20.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.21.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.21.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.21.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.21.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight', 
'vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.22.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.22.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.22.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.22.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.23.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.23.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias', 
'vision_tower.vision_model.encoder.layers.23.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.23.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.24.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.24.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.24.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.24.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.25.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.25.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight', 
'vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.25.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.25.layer_norm2.bias', 'vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight', 'vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias', 'vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight', 'vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias', 'vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight', 'vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias', 'vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight', 'vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias', 'vision_tower.vision_model.encoder.layers.26.layer_norm1.weight', 'vision_tower.vision_model.encoder.layers.26.layer_norm1.bias', 'vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight', 'vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias', 'vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight', 'vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias', 'vision_tower.vision_model.encoder.layers.26.layer_norm2.weight', 'vision_tower.vision_model.encoder.layers.26.layer_norm2.bias', 'vision_tower.vision_model.post_layernorm.weight', 'vision_tower.vision_model.post_layernorm.bias', 'multi_modal_projector.linear.weight', 'multi_modal_projector.linear.bias', 'joint_model.mixtures.vlm.layers.0.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.0.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.0.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.0.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.0.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.0.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.0.mlp.down_proj.weight', 
'joint_model.mixtures.vlm.layers.0.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.0.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.1.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.1.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.1.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.1.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.1.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.1.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.1.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.1.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.1.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.2.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.2.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.2.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.2.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.2.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.2.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.2.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.2.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.2.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.3.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.3.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.3.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.3.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.3.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.3.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.3.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.3.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.3.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.4.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.4.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.4.self_attn.v_proj.weight', 
'joint_model.mixtures.vlm.layers.4.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.4.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.4.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.4.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.4.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.4.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.5.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.5.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.5.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.5.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.5.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.5.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.5.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.5.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.5.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.6.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.6.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.6.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.6.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.6.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.6.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.6.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.6.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.6.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.7.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.7.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.7.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.7.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.7.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.7.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.7.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.7.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.7.post_attention_layernorm.weight', 
'joint_model.mixtures.vlm.layers.8.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.8.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.8.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.8.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.8.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.8.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.8.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.8.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.8.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.9.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.9.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.9.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.9.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.9.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.9.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.9.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.9.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.9.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.10.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.10.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.10.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.10.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.10.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.10.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.10.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.10.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.10.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.11.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.11.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.11.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.11.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.11.mlp.gate_proj.weight', 
'joint_model.mixtures.vlm.layers.11.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.11.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.11.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.11.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.12.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.12.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.12.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.12.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.12.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.12.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.12.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.12.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.12.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.13.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.13.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.13.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.13.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.13.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.13.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.13.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.13.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.13.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.14.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.14.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.14.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.14.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.14.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.14.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.14.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.14.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.14.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.15.self_attn.q_proj.weight', 
'joint_model.mixtures.vlm.layers.15.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.15.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.15.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.15.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.15.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.15.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.15.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.15.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.16.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.16.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.16.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.16.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.16.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.16.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.16.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.16.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.16.post_attention_layernorm.weight', 'joint_model.mixtures.vlm.layers.17.self_attn.q_proj.weight', 'joint_model.mixtures.vlm.layers.17.self_attn.k_proj.weight', 'joint_model.mixtures.vlm.layers.17.self_attn.v_proj.weight', 'joint_model.mixtures.vlm.layers.17.self_attn.o_proj.weight', 'joint_model.mixtures.vlm.layers.17.mlp.gate_proj.weight', 'joint_model.mixtures.vlm.layers.17.mlp.up_proj.weight', 'joint_model.mixtures.vlm.layers.17.mlp.down_proj.weight', 'joint_model.mixtures.vlm.layers.17.input_layernorm.weight', 'joint_model.mixtures.vlm.layers.17.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.0.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.0.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.0.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.0.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.0.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.0.mlp.up_proj.weight', 
'joint_model.mixtures.proprio.layers.0.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.0.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.0.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.1.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.1.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.1.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.1.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.1.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.1.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.1.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.1.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.1.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.2.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.2.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.2.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.2.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.2.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.2.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.2.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.2.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.2.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.3.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.3.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.3.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.3.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.3.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.3.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.3.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.3.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.3.post_attention_layernorm.weight', 
'joint_model.mixtures.proprio.layers.4.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.4.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.4.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.4.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.4.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.4.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.4.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.4.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.4.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.5.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.5.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.5.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.5.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.5.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.5.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.5.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.5.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.5.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.6.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.6.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.6.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.6.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.6.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.6.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.6.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.6.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.6.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.7.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.7.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.7.self_attn.v_proj.weight', 
'joint_model.mixtures.proprio.layers.7.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.7.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.7.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.7.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.7.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.7.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.8.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.8.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.8.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.8.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.8.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.8.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.8.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.8.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.8.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.9.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.9.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.9.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.9.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.9.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.9.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.9.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.9.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.9.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.10.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.10.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.10.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.10.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.10.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.10.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.10.mlp.down_proj.weight', 
'joint_model.mixtures.proprio.layers.10.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.10.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.11.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.11.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.11.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.11.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.11.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.11.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.11.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.11.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.11.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.12.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.12.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.12.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.12.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.12.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.12.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.12.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.12.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.12.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.13.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.13.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.13.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.13.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.13.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.13.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.13.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.13.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.13.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.14.self_attn.q_proj.weight', 
'joint_model.mixtures.proprio.layers.14.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.14.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.14.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.14.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.14.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.14.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.14.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.14.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.15.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.15.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.15.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.15.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.15.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.15.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.15.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.15.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.15.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.16.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.16.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.16.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.16.self_attn.o_proj.weight', 'joint_model.mixtures.proprio.layers.16.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.16.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.16.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.16.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.16.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.layers.17.self_attn.q_proj.weight', 'joint_model.mixtures.proprio.layers.17.self_attn.k_proj.weight', 'joint_model.mixtures.proprio.layers.17.self_attn.v_proj.weight', 'joint_model.mixtures.proprio.layers.17.self_attn.o_proj.weight', 
'joint_model.mixtures.proprio.layers.17.mlp.gate_proj.weight', 'joint_model.mixtures.proprio.layers.17.mlp.up_proj.weight', 'joint_model.mixtures.proprio.layers.17.mlp.down_proj.weight', 'joint_model.mixtures.proprio.layers.17.input_layernorm.weight', 'joint_model.mixtures.proprio.layers.17.post_attention_layernorm.weight', 'joint_model.mixtures.proprio.norm.weight', 'joint_model.mixtures.action.layers.0.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.0.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.0.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.0.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.0.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.0.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.0.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.0.input_layernorm.weight', 'joint_model.mixtures.action.layers.0.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.1.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.1.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.1.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.1.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.1.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.1.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.1.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.1.input_layernorm.weight', 'joint_model.mixtures.action.layers.1.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.2.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.2.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.2.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.2.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.2.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.2.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.2.mlp.down_proj.weight', 
'joint_model.mixtures.action.layers.2.input_layernorm.weight', 'joint_model.mixtures.action.layers.2.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.3.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.3.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.3.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.3.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.3.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.3.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.3.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.3.input_layernorm.weight', 'joint_model.mixtures.action.layers.3.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.4.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.4.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.4.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.4.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.4.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.4.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.4.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.4.input_layernorm.weight', 'joint_model.mixtures.action.layers.4.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.5.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.5.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.5.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.5.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.5.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.5.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.5.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.5.input_layernorm.weight', 'joint_model.mixtures.action.layers.5.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.6.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.6.self_attn.k_proj.weight', 
'joint_model.mixtures.action.layers.6.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.6.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.6.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.6.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.6.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.6.input_layernorm.weight', 'joint_model.mixtures.action.layers.6.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.7.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.7.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.7.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.7.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.7.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.7.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.7.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.7.input_layernorm.weight', 'joint_model.mixtures.action.layers.7.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.8.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.8.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.8.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.8.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.8.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.8.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.8.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.8.input_layernorm.weight', 'joint_model.mixtures.action.layers.8.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.9.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.9.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.9.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.9.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.9.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.9.mlp.up_proj.weight', 
'joint_model.mixtures.action.layers.9.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.9.input_layernorm.weight', 'joint_model.mixtures.action.layers.9.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.10.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.10.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.10.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.10.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.10.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.10.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.10.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.10.input_layernorm.weight', 'joint_model.mixtures.action.layers.10.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.11.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.11.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.11.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.11.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.11.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.11.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.11.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.11.input_layernorm.weight', 'joint_model.mixtures.action.layers.11.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.12.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.12.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.12.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.12.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.12.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.12.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.12.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.12.input_layernorm.weight', 'joint_model.mixtures.action.layers.12.post_attention_layernorm.weight', 
'joint_model.mixtures.action.layers.13.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.13.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.13.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.13.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.13.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.13.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.13.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.13.input_layernorm.weight', 'joint_model.mixtures.action.layers.13.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.14.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.14.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.14.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.14.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.14.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.14.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.14.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.14.input_layernorm.weight', 'joint_model.mixtures.action.layers.14.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.15.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.15.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.15.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.15.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.15.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.15.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.15.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.15.input_layernorm.weight', 'joint_model.mixtures.action.layers.15.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.16.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.16.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.16.self_attn.v_proj.weight', 
'joint_model.mixtures.action.layers.16.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.16.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.16.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.16.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.16.input_layernorm.weight', 'joint_model.mixtures.action.layers.16.post_attention_layernorm.weight', 'joint_model.mixtures.action.layers.17.self_attn.q_proj.weight', 'joint_model.mixtures.action.layers.17.self_attn.k_proj.weight', 'joint_model.mixtures.action.layers.17.self_attn.v_proj.weight', 'joint_model.mixtures.action.layers.17.self_attn.o_proj.weight', 'joint_model.mixtures.action.layers.17.mlp.gate_proj.weight', 'joint_model.mixtures.action.layers.17.mlp.up_proj.weight', 'joint_model.mixtures.action.layers.17.mlp.down_proj.weight', 'joint_model.mixtures.action.layers.17.input_layernorm.weight', 'joint_model.mixtures.action.layers.17.post_attention_layernorm.weight', 'joint_model.mixtures.action.norm.weight', 'action_encoder.linear_1.weight', 'action_encoder.linear_1.bias', 'action_encoder.linear_2.weight', 'action_encoder.linear_2.bias', 'action_encoder.linear_3.weight', 'action_encoder.linear_3.bias', 'proprio_encoder.weight', 'proprio_encoder.bias', 'action_decoder.weight', 'action_decoder.bias'])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ckpt['model'].keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([2048, 1024]), torch.Size([2048, 2048]), torch.Size([2048, 1024]))"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Layer-17 q_proj weight shapes for the proprio, vlm and action mixtures, in that order.\n",
    "tuple(\n",
    "    ckpt['model'][f'joint_model.mixtures.{mixture}.layers.17.self_attn.q_proj.weight'].shape\n",
    "    for mixture in ('proprio', 'vlm', 'action')\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pi0_test",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
