{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45f8efe3",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/zl4063/Documents/github/ZO-LLM/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'transformers.models.qwen2.modeling_qwen2.Qwen2ForCausalLM'>\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModelForCausalLM\n",
    "\n",
    "# Load the checkpoint through the Auto* factory and print the concrete model\n",
    "# class it resolved to.\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
    ")\n",
    "print(type(model))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e8e71937",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "418c2df8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import inspect"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e31ac23c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/home/zl4063/Documents/github/ZO-LLM/.venv/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the path of the installed source file in which Qwen2ForCausalLM is defined.\n",
    "inspect.getfile(Qwen2ForCausalLM)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "717c1a0f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n",
      "[Layer 27] Non-zero activation ratio: 1.0000\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModelForCausalLM\n",
    "import torch\n",
    "\n",
    "# Load the model whose per-layer MLP forwards are replaced further down in this cell.\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    \"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\"\n",
    ")\n",
    "\n",
    "def capture_gate_act_fn(module, input, layer_idx=None):\n",
    "    \"\"\"Compute the gated MLP intermediate and print the share of non-zero gate activations.\n",
    "\n",
    "    Note: the measurement is taken on activation(gate_proj(x)), i.e. the step right\n",
    "    before the result is multiplied with up_proj(x) and fed into down_proj.\n",
    "\n",
    "    Args:\n",
    "        module: MLP block exposing `gate_proj`, `up_proj` and `act_fn`.\n",
    "        input: Tuple whose first element is the MLP input tensor.\n",
    "        layer_idx: Index of the decoder layer; used only to label the printed line.\n",
    "\n",
    "    Returns:\n",
    "        `act_fn(gate_proj(x)) * up_proj(x)`, the tensor the caller passes to `down_proj`.\n",
    "    \"\"\"\n",
    "    x = input[0]\n",
    "    act_output = module.act_fn(module.gate_proj(x))\n",
    "    # NOTE(review): this counts entries that are *exactly* non-zero. For a smooth\n",
    "    # activation (presumably SiLU here -- confirm) the ratio is expected to be ~1.0;\n",
    "    # compare |act_output| against a small threshold to measure near-zero sparsity.\n",
    "    nonzero = (act_output != 0).float().sum()\n",
    "    total = act_output.numel()\n",
    "    ratio = nonzero / total\n",
    "    print(f\"[Layer {layer_idx}] Non-zero activation ratio: {ratio.item():.4f}\")\n",
    "    return act_output * module.up_proj(x)  # manual SwiGLU gating\n",
    "\n",
    "\n",
    "for idx, layer in enumerate(model.model.layers):\n",
    "    mlp = layer.mlp\n",
    "\n",
    "    # Replace forward. `mlp` and `idx` are bound as lambda defaults so every layer\n",
    "    # keeps its own values; a plain closure over the loop variable is resolved at\n",
    "    # call time and would label every layer with the last index.\n",
    "    mlp.forward = lambda x, module=mlp, layer_idx=idx: module.down_proj(\n",
    "        capture_gate_act_fn(module, (x,), layer_idx)\n",
    "    )\n",
    "\n",
    "\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "# Tokenize one short prompt and run a single forward pass. The replaced MLP\n",
    "# forwards print their statistics as a side effect; the model output itself is\n",
    "# not kept.\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B\")\n",
    "inputs = tokenizer(\"The quick brown fox jumps over the lazy dog\", return_tensors=\"pt\")\n",
    "model.eval()\n",
    "with torch.no_grad():\n",
    "    model(**inputs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27bc6475",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Approximate non-zero activation ratio: 1.0000\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "\n",
    "class SmallGELUNet(nn.Module):\n",
    "    \"\"\"Tiny Linear -> activation -> Linear network used to inspect activation sparsity.\n",
    "\n",
    "    After each forward pass the post-activation hidden tensor is kept on the\n",
    "    instance as `_last_activation` so it can be analysed from outside.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, input_dim=16, hidden_dim=64):\n",
    "        super().__init__()\n",
    "        self.fc1 = nn.Linear(input_dim, hidden_dim)\n",
    "        # Swap in SiLU / ReLU / GELU here to compare activations.\n",
    "        self.act = nn.GELU()\n",
    "        self.fc2 = nn.Linear(hidden_dim, 8)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Return the 8-dim output and stash the hidden activation on the instance.\"\"\"\n",
    "        hidden = self.act(self.fc1(x))\n",
    "        # Keep the activation output around for external analysis.\n",
    "        self._last_activation = hidden\n",
    "        return self.fc2(hidden)\n",
    "\n",
    "\n",
    "def measure_activation_sparsity(tensor: torch.Tensor, eps=1e-4):\n",
    "    \"\"\"Print and return the share of entries whose magnitude is not below `eps`.\n",
    "\n",
    "    Args:\n",
    "        tensor: Activation tensor to analyse.\n",
    "        eps: Entries with `abs(value) < eps` are counted as (near) zero.\n",
    "\n",
    "    Returns:\n",
    "        0-dim tensor equal to `1 - near_zero_count / tensor.numel()`.\n",
    "    \"\"\"\n",
    "    near_zero_count = (tensor.abs() < eps).float().sum()\n",
    "    nonzero_ratio = 1.0 - near_zero_count / tensor.numel()\n",
    "    print(f\"Approximate non-zero activation ratio: {nonzero_ratio:.4f}\")\n",
    "    return nonzero_ratio\n",
    "\n",
    "\n",
    "# Fix the RNG seed so the randomly initialised weights and the random input --\n",
    "# and therefore the printed ratio -- are identical on every Restart & Run All.\n",
    "torch.manual_seed(0)\n",
    "\n",
    "model = SmallGELUNet()\n",
    "model.eval()\n",
    "\n",
    "# Random input\n",
    "x = torch.randn(32, 16)  # batch size 32\n",
    "with torch.no_grad():\n",
    "    output = model(x)\n",
    "    # forward() stored the post-activation hidden tensor on the model.\n",
    "    activation = model._last_activation\n",
    "    measure_activation_sparsity(activation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e92a231",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "zo-llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
