{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MNk7IylTv610"
      },
      "source": [
        "# Loading and Analysing Pre-Trained Sparse Autoencoders"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "i_DusoOvwV0M"
      },
      "source": [
        "## Imports & Installs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "yfDUxRx0wSRl"
      },
      "outputs": [],
      "source": [
        "try:\n",
        "    import google.colab # type: ignore\n",
        "    from google.colab import output\n",
        "    COLAB = True\n",
        "    %pip install sae-lens transformer-lens\n",
        "except:\n",
        "    COLAB = False\n",
        "    from IPython import get_ipython # type: ignore\n",
        "    ipython = get_ipython(); assert ipython is not None\n",
        "    ipython.run_line_magic(\"load_ext\", \"autoreload\")\n",
        "    ipython.run_line_magic(\"autoreload\", \"2\")\n",
        "\n",
        "# Standard imports\n",
        "import os\n",
        "import torch\n",
        "from tqdm import tqdm\n",
        "import plotly.express as px\n",
        "\n",
        "# Imports for displaying vis in Colab / notebook\n",
        "import webbrowser\n",
        "import http.server\n",
        "import socketserver\n",
        "import threading\n",
        "PORT = 8000\n",
        "\n",
        "torch.set_grad_enabled(False);"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7aGgWkbav610"
      },
      "source": [
        "## Set Up"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "rQSD7trbv610",
        "outputId": "222a40c4-75d4-46e2-ed3f-991841144926"
      },
      "outputs": [],
      "source": [
        "# For the most part I'll try to import functions and classes near where they are used\n",
        "# to make it clear where they come from.\n",
        "\n",
        "if torch.backends.mps.is_available():\n",
        "    device = \"mps\"\n",
        "else:\n",
        "    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
        "\n",
        "print(f\"Device: {device}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "cPUq_bdW8mcp"
      },
      "outputs": [],
      "source": [
        "def display_vis_inline(filename: str, height: int = 850):\n",
        "    '''\n",
        "    Displays the HTML files in Colab. Uses global `PORT` variable defined in prev cell, so that each\n",
        "    vis has a unique port without having to define a port within the function.\n",
        "    '''\n",
        "    if not(COLAB):\n",
        "        webbrowser.open(filename);\n",
        "\n",
        "    else:\n",
        "        global PORT\n",
        "\n",
        "        def serve(directory):\n",
        "            os.chdir(directory)\n",
        "\n",
        "            # Create a handler for serving files\n",
        "            handler = http.server.SimpleHTTPRequestHandler\n",
        "\n",
        "            # Create a socket server with the handler\n",
        "            with socketserver.TCPServer((\"\", PORT), handler) as httpd:\n",
        "                print(f\"Serving files from {directory} on port {PORT}\")\n",
        "                httpd.serve_forever()\n",
        "\n",
        "        thread = threading.Thread(target=serve, args=(\"/content\",))\n",
        "        thread.start()\n",
        "\n",
        "        output.serve_kernel_port_as_iframe(PORT, path=f\"/{filename}\", height=height, cache_in_notebook=True)\n",
        "\n",
        "        PORT += 1"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "XoMx3VZpv611"
      },
      "source": [
        "# Loading a pretrained Sparse Autoencoder\n",
        "\n",
        "Below we load a Transformerlens model, a pretrained SAE and a dataset from huggingface."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sNSfL80Uv611"
      },
      "outputs": [],
      "source": [
        "from datasets import load_dataset  \n",
        "from transformer_lens import HookedTransformer\n",
        "from sae_lens import SAE\n",
        "\n",
        "model = HookedTransformer.from_pretrained(\"gpt2-small\", device = device)\n",
        "\n",
        "# the cfg dict is returned alongside the SAE since it may contain useful information for analysing the SAE (eg: instantiating an activation store)\n",
        "# Note that this is not the same as the SAEs config dict, rather it is whatever was in the HF repo, from which we can extract the SAE config dict\n",
        "# We also return the feature sparsities which are stored in HF for convenience. \n",
        "sae, cfg_dict, sparsity = SAE.from_pretrained(\n",
        "    release = \"gpt2-small-res-jb\", # see other options in sae_lens/pretrained_saes.yaml\n",
        "    sae_id = \"blocks.8.hook_resid_pre\", # won't always be a hook point\n",
        "    device = device\n",
        ")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from transformer_lens.utils import tokenize_and_concatenate\n",
        "\n",
        "dataset = load_dataset(\n",
        "    path = \"NeelNanda/pile-10k\",\n",
        "    split=\"train\",\n",
        "    streaming=False,\n",
        ")\n",
        "\n",
        "token_dataset = tokenize_and_concatenate(\n",
        "    dataset= dataset,# type: ignore\n",
        "    tokenizer = model.tokenizer, # type: ignore\n",
        "    streaming=True,\n",
        "    max_length=sae.cfg.context_size,\n",
        "    add_bos_token=sae.cfg.prepend_bos,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gy2uUl38v611"
      },
      "source": [
        "## Basic Analysis\n",
        "\n",
        "Let's check some basic stats on this SAE in order to see how some basic functionality in the codebase works.\n",
        "\n",
        "We'll calculate:\n",
        "- L0 (the number of features that fire per activation)\n",
        "- The cross entropy loss when the output of the SAE is used in place of the activations"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xOcubgsRv611"
      },
      "source": [
        "### L0 Test and Reconstruction Test"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "sae.cfg"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "gAUR5CRBv611"
      },
      "outputs": [],
      "source": [
        "sae.eval()  # prevents error if we're expecting a dead neuron mask for who grads\n",
        "\n",
        "with torch.no_grad():\n",
        "    # activation store can give us tokens.\n",
        "    batch_tokens = token_dataset[:32][\"tokens\"]\n",
        "    _, cache = model.run_with_cache(batch_tokens, prepend_bos=True)\n",
        "\n",
        "    # Use the SAE\n",
        "    feature_acts = sae.encode(cache[sae.cfg.hook_name])\n",
        "    sae_out = sae.decode(feature_acts)\n",
        "\n",
        "    # save some room\n",
        "    del cache\n",
        "\n",
        "    # ignore the bos token, get the number of features that activated in each token, averaged accross batch and position\n",
        "    l0 = (feature_acts[:, 1:] > 0).float().sum(-1).detach()\n",
        "    print(\"average l0\", l0.mean().item())\n",
        "    px.histogram(l0.flatten().cpu().numpy()).show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ijoelLtdv611"
      },
      "source": [
        "Note that while the mean L0 is 64, it varies with the specific activation.\n",
        "\n",
        "To estimate reconstruction performance, we calculate the CE loss of the model with and without the SAE being used in place of the activations. This will vary depending on the tokens."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fwrSvREJv612"
      },
      "outputs": [],
      "source": [
        "from transformer_lens import utils\n",
        "from functools import partial\n",
        "\n",
        "# next we want to do a reconstruction test.\n",
        "def reconstr_hook(activation, hook, sae_out):\n",
        "    return sae_out\n",
        "\n",
        "\n",
        "def zero_abl_hook(activation, hook):\n",
        "    return torch.zeros_like(activation)\n",
        "\n",
        "\n",
        "print(\"Orig\", model(batch_tokens, return_type=\"loss\").item())\n",
        "print(\n",
        "    \"reconstr\",\n",
        "    model.run_with_hooks(\n",
        "        batch_tokens,\n",
        "        fwd_hooks=[\n",
        "            (\n",
        "                sae.cfg.hook_name,\n",
        "                partial(reconstr_hook, sae_out=sae_out),\n",
        "            )\n",
        "        ],\n",
        "        return_type=\"loss\",\n",
        "    ).item(),\n",
        ")\n",
        "print(\n",
        "    \"Zero\",\n",
        "    model.run_with_hooks(\n",
        "        batch_tokens,\n",
        "        return_type=\"loss\",\n",
        "        fwd_hooks=[(sae.cfg.hook_name, zero_abl_hook)],\n",
        "    ).item(),\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "B_TRq_lFv612"
      },
      "source": [
        "## Specific Capability Test\n",
        "\n",
        "Validating model performance on specific tasks when using the reconstructed activation is quite important when studying specific tasks."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "npxKip_Qv612"
      },
      "outputs": [],
      "source": [
        "example_prompt = \"When John and Mary went to the shops, John gave the bag to\"\n",
        "example_answer = \" Mary\"\n",
        "utils.test_prompt(example_prompt, example_answer, model, prepend_bos=True)\n",
        "\n",
        "logits, cache = model.run_with_cache(example_prompt, prepend_bos=True)\n",
        "tokens = model.to_tokens(example_prompt)\n",
        "sae_out = sae(cache[sae.cfg.hook_name])\n",
        "\n",
        "\n",
        "def reconstr_hook(activations, hook, sae_out):\n",
        "    return sae_out\n",
        "\n",
        "\n",
        "def zero_abl_hook(mlp_out, hook):\n",
        "    return torch.zeros_like(mlp_out)\n",
        "\n",
        "\n",
        "hook_name = sae.cfg.hook_name\n",
        "\n",
        "print(\"Orig\", model(tokens, return_type=\"loss\").item())\n",
        "print(\n",
        "    \"reconstr\",\n",
        "    model.run_with_hooks(\n",
        "        tokens,\n",
        "        fwd_hooks=[\n",
        "            (\n",
        "                hook_name,\n",
        "                partial(reconstr_hook, sae_out=sae_out),\n",
        "            )\n",
        "        ],\n",
        "        return_type=\"loss\",\n",
        "    ).item(),\n",
        ")\n",
        "print(\n",
        "    \"Zero\",\n",
        "    model.run_with_hooks(\n",
        "        tokens,\n",
        "        return_type=\"loss\",\n",
        "        fwd_hooks=[(hook_name, zero_abl_hook)],\n",
        "    ).item(),\n",
        ")\n",
        "\n",
        "\n",
        "with model.hooks(\n",
        "    fwd_hooks=[\n",
        "        (\n",
        "            hook_name,\n",
        "            partial(reconstr_hook, sae_out=sae_out),\n",
        "        )\n",
        "    ]\n",
        "):\n",
        "    utils.test_prompt(example_prompt, example_answer, model, prepend_bos=True)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1swj9KA7v612"
      },
      "source": [
        "# Generating Feature Interfaces\n",
        "\n",
        "Feature dashboards are an important part of SAE Evaluation. They work by:\n",
        "- 1. Collecting feature activations over a larger number of examples.\n",
        "- 2. Aggregating feature specific statistics (such as max activating examples).\n",
        "- 3. Representing that information in a standardized way"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "edt8ag4fv612"
      },
      "outputs": [],
      "source": [
        "from sae_vis.data_config_classes import SaeVisConfig\n",
        "from sae_vis.data_storing_fns import SaeVisData\n",
        "\n",
        "test_feature_idx_gpt = list(range(10)) + [14057]\n",
        "\n",
        "feature_vis_config_gpt = SaeVisConfig(\n",
        "    hook_point=hook_name,\n",
        "    features=test_feature_idx_gpt,\n",
        "    batch_size=2048,\n",
        "    minibatch_size_tokens=128,\n",
        "    verbose=True,\n",
        ")\n",
        "\n",
        "sae_vis_data_gpt = SaeVisData.create(\n",
        "    encoder=sae,\n",
        "    model=model, # type: ignore\n",
        "    tokens=token_dataset[:10000][\"tokens\"],  # type: ignore\n",
        "    cfg=feature_vis_config_gpt,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "yQ94Frzbv612"
      },
      "outputs": [],
      "source": [
        "for feature in test_feature_idx_gpt:\n",
        "    filename = f\"{feature}_feature_vis_demo_gpt.html\"\n",
        "    sae_vis_data_gpt.save_feature_centric_vis(filename, feature)\n",
        "    display_vis_inline(filename)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AUaD6CFDv612"
      },
      "source": [
        "Now, since generating feature dashboards can be done once per sparse autoencoder, for pre-trained SAEs in the public domain, everyone can use the same dashboards. Neuronpedia hosts dashboards which we can load via the intergration."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "BxluyNRBv612"
      },
      "outputs": [],
      "source": [
        "from sae_lens.analysis.neuronpedia_integration import get_neuronpedia_quick_list\n",
        "\n",
        "# this function should open\n",
        "neuronpedia_quick_list = get_neuronpedia_quick_list(\n",
        "    test_feature_idx_gpt,\n",
        "    layer=sae.cfg.hook_layer,\n",
        "    model=\"gpt2-small\",\n",
        "    dataset=\"res-jb\",\n",
        "    name=\"A quick list we made\",\n",
        ")\n",
        "\n",
        "if COLAB:\n",
        "  # If you're on colab, click the link below\n",
        "  print(neuronpedia_quick_list)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.7"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
