{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e355b99d",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "        <script type=\"text/javascript\">\n",
       "        window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
       "        if (window.MathJax && window.MathJax.Hub && window.MathJax.Hub.Config) {window.MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
       "        if (typeof require !== 'undefined') {\n",
       "        require.undef(\"plotly\");\n",
       "        requirejs.config({\n",
       "            paths: {\n",
       "                'plotly': ['https://cdn.plot.ly/plotly-2.27.0.min']\n",
       "            }\n",
       "        });\n",
       "        require(['plotly'], function(Plotly) {\n",
       "            window._Plotly = Plotly;\n",
       "        });\n",
       "        }\n",
       "        </script>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from lib_project.notebook import setup_notebook\n",
    "setup_notebook(\"../../../\")\n",
    "               \n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "acbd71c6",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ANONYMOUS/.cache/pypoetry/virtualenvs/llm-memorization-nIIioyR--py3.11/lib/python3.11/site-packages/torch/cuda/__init__.py:611: UserWarning:\n",
      "\n",
      "Can't initialize NVML\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2023-12-14 09:37:35,707] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
     ]
    }
   ],
   "source": [
    "from IPython.display import display, Markdown as md\n",
    "\n",
    "from defs import BASE_FIGURE_DIR\n",
    "from lib_project.visualization import with_paper_style\n",
    "from experiments.prefix_length import results as res_util\n",
    "from experiments.prefix_length import rule_extraction as ext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "511b3544-14bd-4c92-affb-1dfe6173fbd1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "model_name_ids = [\n",
    "    (\"Pythia-1B\", \"pyt-1b\"),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "293b2fda-86d8-4a6f-916c-cc287b6c3ab7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "results = {\n",
    "    model_name: res_util.load(f\"{model_id}_sl-32_al-2\", list(range(1)))\n",
    "    for model_name, model_id in model_name_ids\n",
    "}\n",
    "# fig = res_util.plot_overview(\n",
    "#     results,\n",
    "#     \"Model\",\n",
    "# )\n",
    "# fig"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d60f8716-ccbf-4668-a188-1a9a42719859",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>correct_samples</th>\n",
       "      <th>target_prob</th>\n",
       "      <th>top_1_token</th>\n",
       "      <th>top_1_token_prob</th>\n",
       "      <th>top_2_token</th>\n",
       "      <th>top_2_token_prob</th>\n",
       "      <th>entropy</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>token_idx</th>\n",
       "      <th>prefix_length</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <th>1</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.437500</td>\n",
       "      <td>a</td>\n",
       "      <td>1.00</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">2</th>\n",
       "      <th>1</th>\n",
       "      <td>0.80</td>\n",
       "      <td>0.648359</td>\n",
       "      <td>a</td>\n",
       "      <td>0.80</td>\n",
       "      <td>b</td>\n",
       "      <td>0.20</td>\n",
       "      <td>0.500402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.00</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>a</td>\n",
       "      <td>1.00</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">3</th>\n",
       "      <th>1</th>\n",
       "      <td>0.78</td>\n",
       "      <td>0.757212</td>\n",
       "      <td>b</td>\n",
       "      <td>0.78</td>\n",
       "      <td>a</td>\n",
       "      <td>0.22</td>\n",
       "      <td>0.526908</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.62</td>\n",
       "      <td>0.566694</td>\n",
       "      <td>b</td>\n",
       "      <td>0.62</td>\n",
       "      <td>a</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.664064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"5\" valign=\"top\">31</th>\n",
       "      <th>20</th>\n",
       "      <td>0.29</td>\n",
       "      <td>0.353941</td>\n",
       "      <td>a</td>\n",
       "      <td>0.71</td>\n",
       "      <td>b</td>\n",
       "      <td>0.29</td>\n",
       "      <td>0.602152</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>0.59</td>\n",
       "      <td>0.581702</td>\n",
       "      <td>b</td>\n",
       "      <td>0.59</td>\n",
       "      <td>a</td>\n",
       "      <td>0.41</td>\n",
       "      <td>0.676859</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>0.91</td>\n",
       "      <td>0.837979</td>\n",
       "      <td>b</td>\n",
       "      <td>0.91</td>\n",
       "      <td>a</td>\n",
       "      <td>0.09</td>\n",
       "      <td>0.302538</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>1.00</td>\n",
       "      <td>0.936953</td>\n",
       "      <td>b</td>\n",
       "      <td>1.00</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>1.00</td>\n",
       "      <td>0.996094</td>\n",
       "      <td>b</td>\n",
       "      <td>1.00</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>457 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                         correct_samples  target_prob top_1_token  \\\n",
       "token_idx prefix_length                                             \n",
       "1         1                         0.00     0.437500           a   \n",
       "2         1                         0.80     0.648359           a   \n",
       "          2                         1.00     0.500000           a   \n",
       "3         1                         0.78     0.757212           b   \n",
       "          2                         0.62     0.566694           b   \n",
       "...                                  ...          ...         ...   \n",
       "31        20                        0.29     0.353941           a   \n",
       "          22                        0.59     0.581702           b   \n",
       "          25                        0.91     0.837979           b   \n",
       "          30                        1.00     0.936953           b   \n",
       "          31                        1.00     0.996094           b   \n",
       "\n",
       "                         top_1_token_prob top_2_token  top_2_token_prob  \\\n",
       "token_idx prefix_length                                                   \n",
       "1         1                          1.00        None               NaN   \n",
       "2         1                          0.80           b              0.20   \n",
       "          2                          1.00        None               NaN   \n",
       "3         1                          0.78           a              0.22   \n",
       "          2                          0.62           a              0.38   \n",
       "...                                   ...         ...               ...   \n",
       "31        20                         0.71           b              0.29   \n",
       "          22                         0.59           a              0.41   \n",
       "          25                         0.91           a              0.09   \n",
       "          30                         1.00        None               NaN   \n",
       "          31                         1.00        None               NaN   \n",
       "\n",
       "                          entropy  \n",
       "token_idx prefix_length            \n",
       "1         1             -0.000000  \n",
       "2         1              0.500402  \n",
       "          2             -0.000000  \n",
       "3         1              0.526908  \n",
       "          2              0.664064  \n",
       "...                           ...  \n",
       "31        20             0.602152  \n",
       "          22             0.676859  \n",
       "          25             0.302538  \n",
       "          30            -0.000000  \n",
       "          31            -0.000000  \n",
       "\n",
       "[457 rows x 7 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prefix_data = results[\"Pythia-1B\"][0].value.prefix_performance\n",
    "prefix_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "14afa899-a54c-4a51-ae34-c6ae8e72fb8e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'bbabbabbababbabaaabbababaaaababb'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\".join(f\"{t}\" for t in results[\"Pythia-1B\"][0].value.data.tokens[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c66f784-4fe1-445f-8770-8b7d470b1dd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "res_util.plot_probability_curves(results[\"Pythia-1B\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8bab4636-eca0-45f6-9c71-7d85a5efa99e",
   "metadata": {},
   "source": [
    "# Token Mappings\n",
    "\n",
    "Goal: we want to find the mappings that a model that memorized a string uses to recall tokens, and convert them into a (context-free, deterministic) grammar.\n",
    "\n",
    "## Methodology\n",
    "\n",
    "We extract the mappings of prefixes to next tokens.\n",
    "To find prefixes, we use the randomization test, i.e. for the i-th token in string s and a prefix of length k, we keep `s[i - k : i]` fixed and randomize the remaining tokens.\n",
    "Then, for each token we find the shortest prefix that recalls the token correctly, i.e. for which `model(random_context + s[i - k : i]) == s[i]` and there is no $k' < k$ for which this holds, for a plurality of the random contexts.\n",
    "Results are computed over 100 random context samples.\n",
    "\n",
    "## Results\n",
    "\n",
    "Observations:\n",
    "- Token mappings are not always unambiguous. In some cases (prefixed with `2:`), the same prefix might map to two different tokens. E.g. For the 16 token, 26 character alphabet string, p -> w (at the beginning of the string) and p -> e (at the end of the string). So the model might use both information about the prefix, as well as about the token position to recall tokens. So far, I only observed ambiguous token mappings for single token prefixes."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "35847d16-1ef9-41f4-b186-742b4ba228a2",
   "metadata": {},
   "source": [
    "## 16 Token Strings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "553939b9-4cb8-4b77-84d6-243700f7272e",
   "metadata": {},
   "source": [
    "### 2 Character Alphabet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "09232af3-9ce5-479a-b90b-1bdb84f51da8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<style>\n",
       "    .slider-container {\n",
       "        width: 300px;\n",
       "        margin: 20px;\n",
       "    }\n",
       "\n",
       "    .slider {\n",
       "        width: 100%;\n",
       "    }\n",
       "\n",
       "    .highlight {\n",
       "        color: blue;  /* Adjust the color as needed */\n",
       "        font-weight: bold;\n",
       "    }\n",
       "</style>\n",
       "\n",
       "<div class=\"slider-container\">\n",
       "    <input type=\"range\" class=\"slider\" min=\"0\" max=\"2\" value=\"0\" step=\"1\" id=\"mySlider\">\n",
       "    <pre id=\"codeBlock\"></pre>\n",
       "</div>\n",
       "\n",
       "<script>\n",
       "    var slider = document.getElementById(\"mySlider\");\n",
       "    var codeBlock = document.getElementById(\"codeBlock\");\n",
       "    \n",
       "    // Code blocks for each slider position\n",
       "    var codeBlocks = [\n",
       "        `# Code Block 1\n",
       "<span class=\"highlight\">print</span>(\"This is code block 1\")\n",
       "`,\n",
       "        `# Code Block 2\n",
       "for i in range(5):\n",
       "    <span class=\"highlight\">print</span>(i)\n",
       "`,\n",
       "        `# Code Block 3\n",
       "x = 42\n",
       "y = 7\n",
       "<span class=\"highlight\">print</span>(x + y)\n",
       "`\n",
       "    ];\n",
       "\n",
       "    // Initial code block display\n",
       "    codeBlock.innerHTML = codeBlocks[slider.value];\n",
       "\n",
       "    // Update code block when slider value changes\n",
       "    slider.addEventListener(\"input\", function() {\n",
       "        codeBlock.innerHTML = codeBlocks[slider.value];\n",
       "    });\n",
       "</script>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "result = res_util.load(f\"pyt-1b_sl-16_al-2\", list(range(1)))[0].value\n",
    "string = result.data.tokens[0]\n",
    "prefixes = ext.extract_prefixes(\n",
    "    result.prefix_performance,\n",
    "    string,\n",
    ")\n",
    "prefixes.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "355b3829-1e02-4653-8482-8e5d33bb8f63",
   "metadata": {},
   "source": [
    "### 26 Character Alphabet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "82eeb21a-fe17-45c4-bea1-c30a2df183ba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " p w j q s h t u l r c x x l p e\n",
      "\n",
      "Prefixes:\n",
      "-p:w\n",
      "  -w:j\n",
      "    -j:q\n",
      "    -j q:s\n",
      "        -s:h\n",
      "        -s h:t\n",
      "            -t:u\n",
      "            -t u:l\n",
      "                -l:r\n",
      "                -l r:c\n",
      "                    -c:x\n",
      "                      -x:x\n",
      "                      -x x:l\n",
      "                        -x l:p\n",
      "                            -p:e\n",
      "\n",
      "Minimum Necessary Prefix Mappings:\n",
      "p: w (1x), e (1x)\n",
      "w: j (1x)\n",
      "j: q (1x)\n",
      "jq: s (1x)\n",
      "s: h (1x)\n",
      "sh: t (1x)\n",
      "t: u (1x)\n",
      "tu: l (1x)\n",
      "l: r (1x)\n",
      "lr: c (1x)\n",
      "c: x (1x)\n",
      "x: x (1x)\n",
      "xx: l (1x)\n",
      "xl: p (1x)\n",
      "\n",
      "Converged Prefix Mappings:\n",
      "p: w (1x), e (1x)\n",
      "w: j (1x)\n",
      "j: q (1x)\n",
      "jq: s (1x)\n",
      "s: h (1x)\n",
      "sh: t (1x)\n",
      "t: u (1x)\n",
      "tu: l (1x)\n",
      "l: r (1x)\n",
      "lr: c (1x)\n",
      "c: x (1x)\n",
      "x: x (1x)\n",
      "xx: l (1x)\n",
      "xl: p (1x)\n"
     ]
    }
   ],
   "source": [
    "result = res_util.load(f\"pyt-1b_sl-16_al-26\", list(range(1)))[0].value\n",
    "string = result.data.tokens[0]\n",
    "prefixes = ext.extract_prefixes(\n",
    "    result.prefix_performance,\n",
    "    string,\n",
    ")\n",
    "prefixes.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d09a5816-b955-4a7a-84de-fa154fd5a773",
   "metadata": {},
   "source": [
    "## 32 Token Strings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b686d222-ffca-45a0-b418-7a1ad1061227",
   "metadata": {},
   "source": [
    "### 2 Character Alphabet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "242f1d08-2f0d-4c9e-8818-7e947a5cd272",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " b b a b b a b b a b a b b a b a a a b b a b a b a a a a b a b b\n",
      "\n",
      "Prefixes:\n",
      " -:b\n",
      "  -b:a\n",
      "    -a:b\n",
      "    -a b:b\n",
      "        -b:a\n",
      "          -a:b\n",
      "          -a b:b\n",
      "              -b:a\n",
      "             b b-a:b\n",
      "           a b b a-b:a\n",
      "   b a b b a b b a b-a:b\n",
      "                    -a b:b\n",
      "                        -b:a\n",
      "                 a b a b b-a:b\n",
      "                         b a-b:a\n",
      "           a b b a b a b b a-b a:a\n",
      "                            -b a a:a\n",
      "                                  -a:b\n",
      "                                  -a b:b\n",
      "                                      -b:a\n",
      "                               a a a b b-a:b\n",
      "                                          -b:a\n",
      "                     a b b a b a a a b b a b-a:b\n",
      "                                              -b:a\n",
      "                                              -b a:a\n",
      "                                              -b a a:a\n",
      "             b b a b a b b a b a a a b b a b a b a a-a:a\n",
      "                                                      -a:b\n",
      "                 a b a b b a b a a a b b a b a b a a a a-b:a\n",
      "                                                -a a a a b a:b\n",
      "                  -b a b b a b a a a b b a b a b a a a a b a b:b\n",
      "\n",
      "Minimum Necessary Prefix Mappings:\n",
      "b: a (10x)\n",
      "a: b (9x), a (1x)\n",
      "ab: b (4x)\n",
      "ba: a (2x)\n",
      "baa: a (2x)\n",
      "aaaaba: b (1x)\n",
      "babbabaaabbababaaaabab: b (1x)\n",
      "\n",
      "Converged Prefix Mappings:\n",
      "b: a (7x)\n",
      "a: b (4x)\n",
      "ab: b (4x)\n",
      "bba: b (1x)\n",
      "abbab: a (1x)\n",
      "babbabbaba: b (1x)\n",
      "ababba: b (1x)\n",
      "bab: a (1x)\n",
      "abbababbaba: a (1x)\n",
      "baa: a (2x)\n",
      "aaabba: b (1x)\n",
      "abbabaaabbaba: b (1x)\n",
      "ba: a (1x)\n",
      "bbababbabaaabbababaaa: a (1x)\n",
      "ababbabaaabbababaaaab: a (1x)\n",
      "aaaaba: b (1x)\n",
      "babbabaaabbababaaaabab: b (1x)\n"
     ]
    }
   ],
   "source": [
    "result = res_util.load(f\"pyt-1b_sl-32_al-2\", list(range(1)))[0].value\n",
    "string = result.data.tokens[0]\n",
    "prefixes = ext.extract_prefixes(\n",
    "    result.prefix_performance,\n",
    "    string,\n",
    ")\n",
    "prefixes.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2dda080f-d11f-4359-b361-20f430a6d0dc",
   "metadata": {},
   "source": [
    "### 26 Character Alphabet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "d3a503e3-acb8-4857-ba14-8f39c539cc79",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " p w j q s h t u l r c x x l p e g e s s m o g n c h a a t a u v\n",
      "\n",
      "Prefixes:\n",
      "-p:w\n",
      "  -w:j\n",
      "    -j:q\n",
      "      -q:s\n",
      "        -s:h\n",
      "        -s h:t\n",
      "          -h t:u\n",
      "              -u:l\n",
      "              -u l:r\n",
      "                  -r:c\n",
      "                  -r c:x\n",
      "                  -r c x:x\n",
      "                        -x:l\n",
      "                        -x l:p\n",
      "                            -p:e\n",
      "                              -e:g\n",
      "                                -g:e\n",
      "                                -g e:s\n",
      "                                -g e s:s\n",
      "                                    -s s:m\n",
      "                                      -s m:o\n",
      "                                          -o:g\n",
      "                                        -m o g:n\n",
      "                                              -n:c\n",
      "                                                -c:h\n",
      "                                                  -h:a\n",
      "                                                  -h a:a\n",
      "                                                  -h a a:t\n",
      "                                                      -a t:a\n",
      "                                                          -a:u\n",
      "                                                          -a u:v\n",
      "\n",
      "Minimum Necessary Prefix Mappings:\n",
      "p: w (1x), e (1x)\n",
      "w: j (1x)\n",
      "j: q (1x)\n",
      "q: s (1x)\n",
      "s: h (1x)\n",
      "sh: t (1x)\n",
      "ht: u (1x)\n",
      "u: l (1x)\n",
      "ul: r (1x)\n",
      "r: c (1x)\n",
      "rc: x (1x)\n",
      "rcx: x (1x)\n",
      "x: l (1x)\n",
      "xl: p (1x)\n",
      "e: g (1x)\n",
      "g: e (1x)\n",
      "ge: s (1x)\n",
      "ges: s (1x)\n",
      "ss: m (1x)\n",
      "sm: o (1x)\n",
      "o: g (1x)\n",
      "mog: n (1x)\n",
      "n: c (1x)\n",
      "c: h (1x)\n",
      "h: a (1x)\n",
      "ha: a (1x)\n",
      "haa: t (1x)\n",
      "at: a (1x)\n",
      "a: u (1x)\n",
      "au: v (1x)\n",
      "\n",
      "Converged Prefix Mappings:\n",
      "p: w (1x), e (1x)\n",
      "w: j (1x)\n",
      "j: q (1x)\n",
      "q: s (1x)\n",
      "s: h (1x)\n",
      "sh: t (1x)\n",
      "ht: u (1x)\n",
      "u: l (1x)\n",
      "ul: r (1x)\n",
      "r: c (1x)\n",
      "rc: x (1x)\n",
      "rcx: x (1x)\n",
      "x: l (1x)\n",
      "xl: p (1x)\n",
      "e: g (1x)\n",
      "g: e (1x)\n",
      "ge: s (1x)\n",
      "ges: s (1x)\n",
      "ss: m (1x)\n",
      "sm: o (1x)\n",
      "o: g (1x)\n",
      "mog: n (1x)\n",
      "n: c (1x)\n",
      "c: h (1x)\n",
      "h: a (1x)\n",
      "ha: a (1x)\n",
      "haa: t (1x)\n",
      "at: a (1x)\n",
      "a: u (1x)\n",
      "au: v (1x)\n"
     ]
    }
   ],
   "source": [
    "result = res_util.load(f\"pyt-1b_sl-32_al-26\", list(range(1)))[0].value\n",
    "string = result.data.tokens[0]\n",
    "prefixes = ext.extract_prefixes(\n",
    "    result.prefix_performance,\n",
    "    string,\n",
    ")\n",
    "prefixes.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e530b008-2204-4179-8d6e-de70d88f7bba",
   "metadata": {},
   "source": [
    "## 64 Token Strings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd3d5201-98c4-4f07-a551-de853efbbf04",
   "metadata": {},
   "source": [
    "### 2 Character Alphabet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffd52406-00c8-4b96-935f-9783adbca55c",
   "metadata": {},
   "outputs": [],
   "source": [
    "result = res_util.load(f\"pyt-1b_sl-64_al-2\", list(range(1)))[0].value\n",
    "string = result.data.tokens[0]\n",
    "prefixes = ext.extract_prefixes(\n",
    "    result.prefix_performance,\n",
    "    string,\n",
    ")\n",
    "prefixes.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9847ed5f-5b43-4b1f-b0e4-1ac1e95b0e12",
   "metadata": {},
   "source": [
    "### 26 Character Alphabet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "d2a436f5-e053-40d0-9977-5336cb80703e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " p w j q s h t u l r c x x l p e g e s s m o g n c h a a t a u v e l k w l c j s o a a g o b n t s y y v q g v u e k c l n f f c\n",
      "\n",
      "Prefixes:\n",
      "-p:w\n",
      "  -w:j\n",
      "  -w j:q\n",
      "      -q:s\n",
      "        -s:h\n",
      "          -h:t\n",
      "            -t:u\n",
      "              -u:l\n",
      "              -u l:r\n",
      "                  -r:c\n",
      "                  -r c:x\n",
      "                    -c x:x\n",
      "                        -x:l\n",
      "                        -x l:p\n",
      "                          -l p:e\n",
      "                              -e:g\n",
      "                                -g:e\n",
      "                                -g e:s\n",
      "                                  -e s:s\n",
      "                                      -s:m\n",
      "                                      -s m:o\n",
      "                                        -m o:g\n",
      "                                          -o g:n\n",
      "                                              -n:c\n",
      "                                                -c:h\n",
      "                                                  -h:a\n",
      "                                                    -a:a\n",
      "                                                    -a a:t\n",
      "                                                      -a t:a\n",
      "                                                        -t a:u\n",
      "                                                          -a u:v\n",
      "                                                            -u v:e\n",
      "                                                                -e:l\n",
      "                                                                -e l:k\n",
      "                                                                  -l k:w\n",
      "                                                                      -w:l\n",
      "                                                                        -l:c\n",
      "                                                                        -l c:j\n",
      "                                                                            -j:s\n",
      "                                                                          -c j s:o\n",
      "                                                                                -o:a\n",
      "                                                                                  -a:a\n",
      "                                                                                -o a a:g\n",
      "                                                                                -o a a g:o\n",
      "                                                                                    -a g o:b\n",
      "                                                                                          -b:n\n",
      "                                                                                          -b n:t\n",
      "                                                                                            -n t:s\n",
      "                                                                                              -t s:y\n",
      "                                                                                                  -y:y\n",
      "                                                                                                  -y y:v\n",
      "                                                                                                    -y v:q\n",
      "                                                                                                      -v q:g\n",
      "                                                                                                      -v q g:v\n",
      "                                                                                                            -v:u\n",
      "                                                                                                              -u:e\n",
      "                                                                                                              -u e:k\n",
      "                                                                                                                -e k:c\n",
      "                                                                                                                    -c:l\n",
      "                                                                                                                    -c l:n\n",
      "                                                                                                                      -l n:f\n",
      "                                                                                                                          -f:f\n",
      "                                                                                                                          -f f:c\n",
      "\n",
      "Minimum Necessary Prefix Mappings:\n",
      "p: w (1x)\n",
      "w: j (1x), l (1x)\n",
      "wj: q (1x)\n",
      "q: s (1x)\n",
      "s: h (1x), m (1x)\n",
      "h: t (1x), a (1x)\n",
      "t: u (1x)\n",
      "u: l (1x), e (1x)\n",
      "ul: r (1x)\n",
      "r: c (1x)\n",
      "rc: x (1x)\n",
      "cx: x (1x)\n",
      "x: l (1x)\n",
      "xl: p (1x)\n",
      "lp: e (1x)\n",
      "e: g (1x), l (1x)\n",
      "g: e (1x)\n",
      "ge: s (1x)\n",
      "es: s (1x)\n",
      "sm: o (1x)\n",
      "mo: g (1x)\n",
      "og: n (1x)\n",
      "n: c (1x)\n",
      "c: h (1x), l (1x)\n",
      "a: a (2x)\n",
      "aa: t (1x)\n",
      "at: a (1x)\n",
      "ta: u (1x)\n",
      "au: v (1x)\n",
      "uv: e (1x)\n",
      "el: k (1x)\n",
      "lk: w (1x)\n",
      "l: c (1x)\n",
      "lc: j (1x)\n",
      "j: s (1x)\n",
      "cjs: o (1x)\n",
      "o: a (1x)\n",
      "oaa: g (1x)\n",
      "oaag: o (1x)\n",
      "ago: b (1x)\n",
      "b: n (1x)\n",
      "bn: t (1x)\n",
      "nt: s (1x)\n",
      "ts: y (1x)\n",
      "y: y (1x)\n",
      "yy: v (1x)\n",
      "yv: q (1x)\n",
      "vq: g (1x)\n",
      "vqg: v (1x)\n",
      "v: u (1x)\n",
      "ue: k (1x)\n",
      "ek: c (1x)\n",
      "cl: n (1x)\n",
      "ln: f (1x)\n",
      "f: f (1x)\n",
      "ff: c (1x)\n",
      "\n",
      "Converged Prefix Mappings:\n",
      "p: w (1x)\n",
      "w: j (1x), l (1x)\n",
      "wj: q (1x)\n",
      "q: s (1x)\n",
      "s: h (1x), m (1x)\n",
      "h: t (1x), a (1x)\n",
      "t: u (1x)\n",
      "u: l (1x), e (1x)\n",
      "ul: r (1x)\n",
      "r: c (1x)\n",
      "rc: x (1x)\n",
      "cx: x (1x)\n",
      "x: l (1x)\n",
      "xl: p (1x)\n",
      "lp: e (1x)\n",
      "e: g (1x), l (1x)\n",
      "g: e (1x)\n",
      "ge: s (1x)\n",
      "es: s (1x)\n",
      "sm: o (1x)\n",
      "mo: g (1x)\n",
      "og: n (1x)\n",
      "n: c (1x)\n",
      "c: h (1x), l (1x)\n",
      "a: a (2x)\n",
      "aa: t (1x)\n",
      "at: a (1x)\n",
      "ta: u (1x)\n",
      "au: v (1x)\n",
      "uv: e (1x)\n",
      "el: k (1x)\n",
      "lk: w (1x)\n",
      "l: c (1x)\n",
      "lc: j (1x)\n",
      "j: s (1x)\n",
      "cjs: o (1x)\n",
      "o: a (1x)\n",
      "oaa: g (1x)\n",
      "oaag: o (1x)\n",
      "ago: b (1x)\n",
      "b: n (1x)\n",
      "bn: t (1x)\n",
      "nt: s (1x)\n",
      "ts: y (1x)\n",
      "y: y (1x)\n",
      "yy: v (1x)\n",
      "yv: q (1x)\n",
      "vq: g (1x)\n",
      "vqg: v (1x)\n",
      "v: u (1x)\n",
      "ue: k (1x)\n",
      "ek: c (1x)\n",
      "cl: n (1x)\n",
      "ln: f (1x)\n",
      "f: f (1x)\n",
      "ff: c (1x)\n"
     ]
    }
   ],
   "source": [
    "result = res_util.load(f\"pyt-1b_sl-64_al-26\", list(range(1)))[0].value\n",
    "string = result.data.tokens[0]\n",
    "prefixes = ext.extract_prefixes(\n",
    "    result.prefix_performance,\n",
    "    string,\n",
    ")\n",
    "prefixes.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "e3341cec-0464-4638-a04a-ba103bad739c",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "Notebook published to: https://people.ANONYMOUS-ANONYMOUS.org/~ANONYMOUS/results/llm_memorization/experiments/prefix_length/rule_extraction_59032dk.html"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Upload\n",
    "res_util.publish(\"rule_extraction\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55279102-19c2-41eb-9523-d39be1aa0a3d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
