{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4e720133",
   "metadata": {},
   "source": [
    "## Load model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e5a9a9e3-075d-46ef-a7a6-3b4472c24981",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n",
      "2.2.2\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer\n",
    "from torch import nn \n",
    "import pdb\n",
    "from tqdm import tqdm\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "import sys \n",
    "sys.path.append('../')\n",
    "from utils import lowrank_modeling_v2 as lowrank_modeling\n",
    "\n",
    "print(torch.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "33bb1813-e941-4d84-9766-9f28d69f49f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Checkpoint identifier; the same string is handed to both from_pretrained calls\n",
    "# below so the tokenizer and the model come from the same checkpoint.\n",
    "model_name = \"TinyLlama/TinyLlama-1.1B-Chat-v1.0\"\n",
    "\n",
    "# `tokenizer` and `model` are notebook-level names: the forward_pass helper further\n",
    "# down reads `tokenizer` as a global and the timing cell passes `model` to it.\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "model = AutoModelForCausalLM.from_pretrained(model_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "deff586e",
   "metadata": {},
   "source": [
    "## Compare 2 approaches of distilling a model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bf65774e",
   "metadata": {},
   "source": [
    "## Low rank Approximation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2c5f01c4",
   "metadata": {},
   "source": [
    "#### Low-Rank Layer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3b1ae7dc-3ca6-435a-8dc5-7e71ca2902f2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([2, 768, 2086])\n"
     ]
    }
   ],
   "source": [
    "class LowrankLinear1(nn.Module):\n",
    "    \"\"\"Low-rank stand-in for an ``nn.Linear`` weight with a learnable soft mask.\n",
    "\n",
    "    The weight of ``current_layer`` is factorised once with ``torch.svd_lowrank``\n",
    "    at rank ``min(in_features, out_features)``. Every forward pass draws a\n",
    "    relaxed-Bernoulli value per singular value, rebuilds the masked weight\n",
    "    matrix and applies it to the inputs.\n",
    "\n",
    "    NOTE(review): ``current_layer.bias`` is never read, so the output carries no\n",
    "    bias term -- confirm this is intended for layers that have one.\n",
    "\n",
    "    Args:\n",
    "        current_layer: the ``nn.Linear`` whose weight is factorised.\n",
    "        niter: number of subspace iterations handed to ``torch.svd_lowrank``.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if ``current_layer`` is not an ``nn.Linear``.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, current_layer, niter=2):\n",
    "        super().__init__()\n",
    "\n",
    "        if not isinstance(current_layer, nn.Linear):\n",
    "            raise ValueError(f\"Expected input into SVDLayer be of instance nn.Linear, got {type(current_layer)}\")\n",
    "\n",
    "        self.in_features, self.out_features = current_layer.in_features, current_layer.out_features\n",
    "        self.rank = min(current_layer.weight.shape[1], current_layer.weight.shape[0])\n",
    "\n",
    "        # Factorise without autograd tracking: the factors are constants derived from\n",
    "        # the original weight and must not keep a graph back to that weight alive.\n",
    "        with torch.no_grad():\n",
    "            U, E, V = torch.svd_lowrank(current_layer.weight.float(),\n",
    "                                        q=self.rank,\n",
    "                                        niter=niter)\n",
    "\n",
    "        assert len(E.shape) == 1, 'expected singular values to have only one dim'\n",
    "\n",
    "        # Precompute U * E (column j of U scaled by E[j]) so forward only applies the mask.\n",
    "        self.UE = nn.Parameter(U * E.unsqueeze(0))\n",
    "        # contiguous() copies, so training V_t cannot silently rewrite the stored V.\n",
    "        self.V_t = nn.Parameter(V.T.contiguous())\n",
    "\n",
    "        # Mask logits, one per singular value, created on the same device as the factors.\n",
    "        self.E_train = nn.Parameter(torch.ones_like(E))\n",
    "        self.tau = 0.5\n",
    "\n",
    "        # Raw factors stay reachable as self.U / self.E / self.V. Non-persistent buffers\n",
    "        # follow .to()/.cuda() with the module but are left out of state_dict().\n",
    "        self.register_buffer('U', U, persistent=False)\n",
    "        self.register_buffer('E', E, persistent=False)\n",
    "        self.register_buffer('V', V, persistent=False)\n",
    "\n",
    "    def forward(self, inputs):\n",
    "        \"\"\"Apply the masked low-rank weight to ``inputs``.\n",
    "\n",
    "        A fresh relaxed-Bernoulli mask (temperature ``self.tau``, probabilities\n",
    "        ``sigmoid(self.E_train)``) is sampled on every call, so the output is\n",
    "        stochastic. The mask scales the columns of ``UE`` by broadcasting\n",
    "        instead of building a diagonal matrix.\n",
    "\n",
    "        Args:\n",
    "            inputs: tensor whose last dimension is ``in_features``.\n",
    "\n",
    "        Returns:\n",
    "            Tensor with the last dimension replaced by ``out_features``.\n",
    "        \"\"\"\n",
    "        probs = torch.sigmoid(self.E_train)\n",
    "        bernoulli = torch.distributions.relaxed_bernoulli.RelaxedBernoulli(temperature=self.tau, probs=probs)\n",
    "        E_train_mask = bernoulli.rsample()\n",
    "\n",
    "        # Rebuild the (out_features, in_features) weight first, then apply it the way\n",
    "        # nn.Linear does. This full reconstruction is the approach this class measures.\n",
    "        weight_approx = (self.UE * E_train_mask.unsqueeze(0)) @ self.V_t\n",
    "        return inputs @ weight_approx.T\n",
    "\n",
    "    def __str__(self):\n",
    "        # Use the real class name so a printed model shows which variant is installed.\n",
    "        return f\"{type(self).__name__}(in_features={self.in_features}, out_features={self.out_features}, rank={self.rank})\"\n",
    "\n",
    "    def __repr__(self):\n",
    "        return self.__str__()\n",
    "    \n",
    "\n",
    "\n",
    "class LowrankLinear2(torch.nn.Module):\n",
    "    \"\"\"Truncated low-rank stand-in for an ``nn.Linear`` weight with a hard random mask.\n",
    "\n",
    "    The transposed weight of ``current_layer`` is factorised once with\n",
    "    ``torch.svd_lowrank`` at a fraction of the full rank. Every forward pass\n",
    "    samples a 0/1 mask over the kept singular values and multiplies the inputs\n",
    "    through the two factors without rebuilding the full weight matrix.\n",
    "\n",
    "    NOTE(review): ``current_layer.bias`` is never read, so the output carries no\n",
    "    bias term -- confirm this is intended for layers that have one.\n",
    "\n",
    "    Args:\n",
    "        current_layer: the ``nn.Linear`` whose weight is factorised.\n",
    "        niter: number of subspace iterations handed to ``torch.svd_lowrank``.\n",
    "        rank_ratio: fraction of ``min(in_features, out_features)`` to keep;\n",
    "            the resulting rank is clamped to ``[1, full rank]``.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if ``current_layer`` is not an ``nn.Linear``.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, current_layer, niter=1, rank_ratio=0.40):\n",
    "        super().__init__()\n",
    "\n",
    "        if not isinstance(current_layer, torch.nn.Linear):\n",
    "            raise ValueError(f\"Expected input into SVDLayer be of instance nn.Linear, got {type(current_layer)}\")\n",
    "\n",
    "        self.in_features, self.out_features = current_layer.in_features, current_layer.out_features\n",
    "        full_rank = min(current_layer.weight.shape[1], current_layer.weight.shape[0])\n",
    "        # Clamp so tiny layers or ratios above 1 still give svd_lowrank a usable q.\n",
    "        self.rank = max(1, min(full_rank, int(full_rank * rank_ratio)))\n",
    "\n",
    "        # Factorise without autograd tracking: the factors are constants derived from\n",
    "        # the original weight and must not keep a graph back to that weight alive.\n",
    "        with torch.no_grad():\n",
    "            U, E, V = torch.svd_lowrank(current_layer.weight.float().T,\n",
    "                                        q=self.rank,\n",
    "                                        niter=niter)\n",
    "\n",
    "        assert len(E.shape) == 1, 'expected singular values to have only one dim'\n",
    "\n",
    "        # Precompute U * E (column j of U scaled by E[j]) so forward only applies the mask.\n",
    "        self.UE = torch.nn.Parameter(U * E.unsqueeze(0))\n",
    "        # contiguous() copies, so training V_t cannot silently rewrite the stored V.\n",
    "        self.V_t = torch.nn.Parameter(V.T.contiguous())\n",
    "\n",
    "        # Mask logits, one per kept singular value, on the same device as the factors.\n",
    "        self.E_train = torch.nn.Parameter(torch.ones_like(E))\n",
    "        self.tau = 0.5\n",
    "\n",
    "        # Raw factors stay reachable as self.U / self.E / self.V. Non-persistent buffers\n",
    "        # follow .to()/.cuda() with the module but are left out of state_dict().\n",
    "        self.register_buffer('U', U, persistent=False)\n",
    "        self.register_buffer('E', E, persistent=False)\n",
    "        self.register_buffer('V', V, persistent=False)\n",
    "\n",
    "    def forward(self, inputs):\n",
    "        \"\"\"Apply the masked factors to ``inputs``.\n",
    "\n",
    "        A 0/1 mask is sampled with ``torch.bernoulli`` from\n",
    "        ``sigmoid(self.E_train)`` on every call, so the output is stochastic.\n",
    "\n",
    "        NOTE(review): the hard Bernoulli sample is not a relaxed draw, so no\n",
    "        useful gradient is expected to reach ``E_train`` through it -- confirm\n",
    "        whether a relaxed or straight-through sample was intended.\n",
    "\n",
    "        Args:\n",
    "            inputs: tensor whose last dimension is ``in_features``.\n",
    "\n",
    "        Returns:\n",
    "            Tensor with the last dimension replaced by ``out_features``.\n",
    "        \"\"\"\n",
    "        probs = torch.sigmoid(self.E_train)\n",
    "        E_train_mask = torch.bernoulli(probs)\n",
    "\n",
    "        # Left-to-right matmul: inputs are projected down to `rank` columns before\n",
    "        # being expanded to out_features, so the full weight is never materialised.\n",
    "        output = inputs @ (self.UE * E_train_mask.unsqueeze(0)) @ self.V_t\n",
    "\n",
    "        return output\n",
    "\n",
    "    def __str__(self):\n",
    "        # Use the real class name; the label used to read LowrankLinear1.\n",
    "        return f\"{type(self).__name__}(in_features={self.in_features}, out_features={self.out_features}, rank={self.rank})\"\n",
    "\n",
    "    def __repr__(self):\n",
    "        return self.__str__()\n",
    "    \n",
    "\n",
    "in_features, out_features = 768, 2086\n",
    "old_layer = nn.Linear(in_features=in_features, out_features=out_features)\n",
    "\n",
    "# All three variants wrap the same dense layer; the timing cells below call them on `x`.\n",
    "layer1 = LowrankLinear1(old_layer)\n",
    "layer2 = LowrankLinear2(old_layer)\n",
    "layer3 = lowrank_modeling.LowrankLinearTopk(old_layer, 1.)\n",
    "\n",
    "# Single input definition: batch of 2, in_features rows, in_features columns.\n",
    "x = torch.randn(2, in_features, in_features)\n",
    "\n",
    "# Smoke test: the last dimension should become out_features.\n",
    "res = layer1(x)\n",
    "print(res.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "bbb380e6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "99.7 ms ± 7.93 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 10 -r 10\n",
    "res = layer1(x)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4012e60f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "39.6 ms ± 3.66 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 10 -r 10\n",
    "res = layer2(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "5e85f688",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "57.3 ms ± 8.95 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 10 -r 10\n",
    "res = layer3(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "55c1e3c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import lowrank_modeling_v2\n",
    "\n",
    "# This cell rebinds old_layer, x, layer1 and layer2, replacing the objects built in\n",
    "# the cell that defines the in-notebook LowrankLinear classes.\n",
    "in_features, out_features = 768, 2086\n",
    "old_layer = nn.Linear(in_features=in_features, out_features=out_features)\n",
    "x = torch.randn(2, in_features//2, in_features)\n",
    "\n",
    "# NOTE(review): the import cell binds `lowrank_modeling` to utils.lowrank_modeling_v2,\n",
    "# so both lines below construct the same class from the same module -- confirm\n",
    "# whether one of them was meant to come from a different module.\n",
    "layer1 = lowrank_modeling_v2.LowrankLinear2(old_layer, 1, None)\n",
    "layer2 = lowrank_modeling.LowrankLinear2(old_layer, 1, None)\n",
    "res = layer1(x)\n",
    "res2 = layer2(x)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "27d0d12f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "47.9 ms ± 3.44 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 10 -r 10\n",
    "res1 = layer1(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "d8ae2e85",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "47.7 ms ± 2.23 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 10 -r 10\n",
    "res1 = layer2(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "747f4596",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Freeze c_prime by calling requires_grad_(False). Assigning `requires_grad_ = False`\n",
    "# only shadows the method on the object and leaves the gradient flag untouched.\n",
    "# Assumes c_prime is a leaf tensor/Parameter of layer3 -- TODO confirm.\n",
    "layer3.c_prime.requires_grad_(False)\n",
    "\n",
    "# Display the flag so the cell output confirms the freeze.\n",
    "layer3.c_prime.requires_grad"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d6df9c6b",
   "metadata": {},
   "source": [
    "## Test Forward Pass time after Edit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f1b51ab9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def forward_pass(model):\n",
    "    \"\"\"Run one gradient-free forward pass of ``model`` on a fixed two-prompt batch.\n",
    "\n",
    "    Intended as a timing probe, so the body contains only tokenisation and the\n",
    "    model call. Reads the notebook-level ``tokenizer``.\n",
    "\n",
    "    Args:\n",
    "        model: callable accepting ``input_ids`` and ``attention_mask`` keyword arguments.\n",
    "\n",
    "    Returns:\n",
    "        None. The model output is discarded.\n",
    "    \"\"\"\n",
    "    input_text = [\"Hello, how are you?\" * 5, \"Bye now!\" * 2]\n",
    "    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)\n",
    "\n",
    "    # No labels or loss are built here: they were never used and only added\n",
    "    # unrelated work to the measured time.\n",
    "    with torch.no_grad():\n",
    "        model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "48cadeb7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 2 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 2 -r 1\n",
    "forward_pass(model)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9d645c92",
   "metadata": {},
   "source": [
    "## Timings\n",
    "* Keeping 90% of the rank (singular values)\n",
    "    - Wall time went from 1.93 s to 51.2 s\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea197675",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "8790d292",
   "metadata": {},
   "source": [
    "## Other"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "47abb4bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random stand-ins for SVD-style factors, used to compare two multiplication orders.\n",
    "U = torch.randn(100, 20)\n",
    "E = torch.randn(20)\n",
    "V = torch.randn(100, 20)\n",
    "# Despite the name, `weight` plays the role of the batched input here: (4, 16, 100).\n",
    "weight = torch.randn(4, 16, 100)\n",
    "\n",
    "# output1 builds the full (100, 100) product first, then applies it to the batch.\n",
    "# output2 relies on `@` being left-associative: the batch is reduced to 20 columns\n",
    "# before being expanded back to 100.\n",
    "# Both are the same product in exact arithmetic; any mismatch is rounding, which is\n",
    "# presumably why the allclose check in the next cell reports False -- TODO confirm.\n",
    "output1 = weight @ ((U * E.unsqueeze(0)) @ V.T)\n",
    "output2 = weight @ (U * E.unsqueeze(0)) @ V.T\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "5fa51c85",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "torch.allclose(output2, output1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5ee72a34",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "def gumbel_sigmoid_probs(probs, tau=0.5):\n",
    "    \"\"\"Draw a relaxed (soft) Bernoulli sample for each entry of ``probs``.\n",
    "\n",
    "    This is the two-class case of Gumbel-softmax: the logit of ``probs`` is\n",
    "    perturbed with the difference of two independent Gumbel(0, 1) draws and\n",
    "    squashed with a temperature-scaled sigmoid. Using the difference keeps\n",
    "    ``P(sample > 0.5)`` equal to ``probs``; a single draw skews it upward.\n",
    "\n",
    "    Args:\n",
    "        probs: tensor of probabilities in ``[0, 1]``.\n",
    "        tau: temperature; smaller values push samples toward 0 or 1.\n",
    "\n",
    "    Returns:\n",
    "        Tensor shaped like ``probs`` with values in ``[0, 1]``. Entries whose\n",
    "        probability is exactly 0 or 1 map to exactly 0 or 1.\n",
    "    \"\"\"\n",
    "\n",
    "    def sample_gumbel(shape, device, eps=1e-20):\n",
    "        \"\"\"Sample from Gumbel(0, 1)\"\"\"\n",
    "        U = torch.rand(shape, device=device)\n",
    "        return -torch.log(-torch.log(U + eps) + eps)\n",
    "\n",
    "    # Logit as log(p) - log1p(-p): exact at the ends (-inf at p = 0, +inf at p = 1)\n",
    "    # without needing an epsilon in the denominator.\n",
    "    logits = torch.log(probs) - torch.log1p(-probs)\n",
    "\n",
    "    # One Gumbel draw per class; only their difference matters for two classes.\n",
    "    noise = sample_gumbel(logits.shape, logits.device) - sample_gumbel(logits.shape, logits.device)\n",
    "    y_soft = torch.sigmoid((logits + noise) / tau)\n",
    "    return y_soft\n",
    "\n",
    "probs = torch.zeros(1, 20)\n",
    "gumbel_sigmoid_probs(probs, tau=0.5)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "516d897c",
   "metadata": {},
   "source": [
    "## Topk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4f9d4925",
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import lowrank_modeling_v2\n",
    "\n",
    "# This cell rebinds old_layer, x, layer1 and res used by earlier cells.\n",
    "in_features, out_features = 768, 2086\n",
    "old_layer = nn.Linear(in_features=in_features, out_features=out_features)\n",
    "x = torch.randn(2, in_features//2, in_features)\n",
    "\n",
    "# The meaning of 0.80 and of the trailing None is defined by LowrankLinearTopk2 in\n",
    "# utils/lowrank_modeling_v2, which is not visible in this notebook; presumably 0.80\n",
    "# is a keep ratio -- TODO confirm against that module.\n",
    "layer1 = lowrank_modeling_v2.LowrankLinearTopk2(old_layer, 0.80, None)\n",
    "# A later cell indexes res[1], so the call is expected to return an indexable result.\n",
    "res = layer1(x)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "835748c3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([0.8000], grad_fn=<RsubBackward1>)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "1-torch.sigmoid(layer1.E_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "3891be34",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor(0.7996, grad_fn=<DivBackward0>)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res[1].sum()/res[1].numel()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c34de08c",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
