{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys; sys.path.append('..')\n",
    "import torch\n",
    "from dual_attn_blocks import DualAttnEncoderBlock, DualAttnDecoderBlock\n",
    "from symbol_retrieval import PositionRelativeSymbolRetriever, SymbolicAttention"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Documentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;31mInit signature:\u001b[0m\n",
      "\u001b[0mDualAttnEncoderBlock\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m    \u001b[0md_model\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m    \u001b[0mn_heads_sa\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m    \u001b[0mn_heads_ra\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m    \u001b[0mdff\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m    \u001b[0mactivation\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m    \u001b[0mdropout_rate\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m    \u001b[0mnorm_first\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m    \u001b[0mnorm_type\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'layernorm'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m    \u001b[0msa_kwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m    \u001b[0mra_kwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m    \u001b[0mra_type\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'relational_attention'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m    \u001b[0mbias\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m    \u001b[0mcausal\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
      "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mDocstring:\u001b[0m     \n",
      "Base class for all neural network modules.\n",
      "\n",
      "Your models should also subclass this class.\n",
      "\n",
      "Modules can also contain other Modules, allowing to nest them in\n",
      "a tree structure. You can assign the submodules as regular attributes::\n",
      "\n",
      "    import torch.nn as nn\n",
      "    import torch.nn.functional as F\n",
      "\n",
      "    class Model(nn.Module):\n",
      "        def __init__(self):\n",
      "            super().__init__()\n",
      "            self.conv1 = nn.Conv2d(1, 20, 5)\n",
      "            self.conv2 = nn.Conv2d(20, 20, 5)\n",
      "\n",
      "        def forward(self, x):\n",
      "            x = F.relu(self.conv1(x))\n",
      "            return F.relu(self.conv2(x))\n",
      "\n",
      "Submodules assigned in this way will be registered, and will have their\n",
      "parameters converted too when you call :meth:`to`, etc.\n",
      "\n",
      ".. note::\n",
      "    As per the example above, an ``__init__()`` call to the parent class\n",
      "    must be made before assignment on the child.\n",
      "\n",
      ":ivar training: Boolean represents whether this module is in training or\n",
      "                evaluation mode.\n",
      ":vartype training: bool\n",
      "\u001b[0;31mInit docstring:\u001b[0m\n",
      "Dual Attention Encoder Block.\n",
      "\n",
      "A Dual Attention Encoder is a variant of the Transformer Encoder that uses a combination of two distinct types of attention heads.\n",
      "The first type is standard self-attention, which captures object-level (i.e., sensory) features, and\n",
      "the second type is relational attention, which captures relational features.\n",
      "\n",
      "Parameters\n",
      "----------\n",
      "d_model : int\n",
      "    model dimension.\n",
      "n_heads_sa : int\n",
      "    number of standard self-attention heads.\n",
      "n_heads_ra : int\n",
      "    number of relational attention heads.\n",
      "dff : int\n",
      "    intermediate dimension of feed-forward block.\n",
      "activation : str\n",
      "    name of activation function to use in feedforward block.\n",
      "dropout_rate : float\n",
      "    dropout rate.\n",
      "norm_first : bool\n",
      "    whether to apply normalization before or after attention. norm_first=True means pre-norm otherwise post-norm.\n",
      "norm_type : 'layernorm' or 'rmsnorm, optional\n",
      "    type of normalization to use, by default 'layernorm'\n",
      "sa_kwargs : dict, optional\n",
      "    self-attention kwargs, by default None\n",
      "ra_kwargs : dict, optional\n",
      "    relational attention kwargs, by default None\n",
      "ra_type : str, optional\n",
      "    type of relational attention module (e.g., whether to use RCA for an ablation experiment), by default 'relational_attention'\n",
      "bias : bool, optional\n",
      "    whether to use bias in multi-head attention, by default True\n",
      "causal : bool, optional\n",
      "    whether attention operations should be causal, by default False\n",
      "\u001b[0;31mFile:\u001b[0m           ~/projects/abstract_transformer/dual_attn_blocks.py\n",
      "\u001b[0;31mType:\u001b[0m           type\n",
      "\u001b[0;31mSubclasses:\u001b[0m     "
     ]
    }
   ],
   "source": [
    "DualAttnEncoderBlock?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Dual-Attention Encoder Block with Symbolic Attention"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "d, l = 64, 10\n",
    "\n",
    "symbol_retriever = SymbolicAttention(d_model=d, n_heads=4, n_symbols=10)\n",
    "da_enc_block = DualAttnEncoderBlock(d_model=d, n_heads_sa=4, n_heads_ra=4, dff=d*4, activation='swiglu',\n",
    "    dropout_rate=0., norm_first=True, ra_kwargs=dict(symmetric_rels=True, use_relative_positional_symbols=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x: torch.Size([1, 10, 64]), s: torch.Size([1, 10, 64]), y: torch.Size([10, 64])\n"
     ]
    }
   ],
   "source": [
    "x = torch.randn(1, 10, d)\n",
    "s = symbol_retriever(x)\n",
    "y, *_ = da_enc_block(x, s)\n",
    "print(f'x: {x.shape}, s: {s.shape}, y: {y.shape}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Dual-Attention Encoder Block with Position-Relative Symbols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "d, l = 64, 10\n",
    "\n",
    "symbol_retriever = PositionRelativeSymbolRetriever(symbol_dim=d, max_rel_pos=l)\n",
    "da_enc_block = DualAttnEncoderBlock(d_model=d, n_heads_sa=4, n_heads_ra=4, dff=d*4, activation='swiglu',\n",
    "    dropout_rate=0., norm_first=True, ra_kwargs=dict(symmetric_rels=True, use_relative_positional_symbols=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1, 10, 64])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x: torch.Size([1, 10, 64]), s: torch.Size([10, 10, 64]), y: torch.Size([10, 64])\n"
     ]
    }
   ],
   "source": [
    "x = torch.randn(1, 10, d)\n",
    "s = symbol_retriever(x)\n",
    "y, *_ = da_enc_block(x, s)\n",
    "print(f'x: {x.shape}, s: {s.shape}, y: {y.shape}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "abstract_transformer",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
