{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6b2f641b",
   "metadata": {},
   "source": [
    "# Memory Usage: Operator Only"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1aa588e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import torch as T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db6dfb8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sys import path\n",
    "path.insert(0, '..')\n",
    "import fewbit  # noqa"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6558f3d6",
   "metadata": {},
   "source": [
    "Define the quantisation parameters (`BOUNDS` and `LEVELS`) that are passed to the fewbit GELU operator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e017518",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quantisation parameters passed to `T.ops.fewbit.gelu` later in the notebook:\n",
    "# seven boundary values and eight level values, stored as float32 on the GPU.\n",
    "BOUNDS = T.tensor(\n",
    "    [\n",
    "        -2.39798704e+00,\n",
    "        -7.11248159e-01,\n",
    "        -3.26290283e-01,\n",
    "        -1.55338428e-04,\n",
    "        3.26182064e-01,\n",
    "        7.10855860e-01,\n",
    "        2.39811567e+00,\n",
    "    ],\n",
    "    dtype=T.float32,\n",
    "    device='cuda',\n",
    ")\n",
    "\n",
    "LEVELS = T.tensor(\n",
    "    [\n",
    "        -0.00260009,\n",
    "        -0.08883533,\n",
    "        0.1251944,\n",
    "        0.37204148,\n",
    "        0.6277958,\n",
    "        0.87466175,\n",
    "        1.08880716,\n",
    "        1.00259936,\n",
    "    ],\n",
    "    dtype=T.float32,\n",
    "    device='cuda',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2c592632",
   "metadata": {},
   "source": [
    "Preallocate the input tensor and the output gradient."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0db35f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input: 128 * 1024**2 zeros on the GPU, tracked by autograd so that\n",
    "# `backward` populates `xs.grad`.\n",
    "xs = T.zeros(\n",
    "    128 * 1024 ** 2,\n",
    "    requires_grad=True,\n",
    "    device='cuda',\n",
    ")\n",
    "\n",
    "# Upstream gradient handed to `backward`: all ones, shaped like `xs`.\n",
    "gs = T.ones_like(xs)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22d54169",
   "metadata": {},
   "source": [
    "Run the reference GELU and the fewbit GELU under the PyTorch profiler."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae20e1cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Profile the reference GELU and the fewbit GELU with identical profiler\n",
    "# settings; each trace is written to its own TensorBoard log directory.\n",
    "for logdir, op in (\n",
    "    ('../log/op-only', T.nn.functional.gelu),\n",
    "    ('../log/op-only-opt', lambda x: T.ops.fewbit.gelu(x, BOUNDS, LEVELS)),\n",
    "):\n",
    "    with T.profiler.profile(\n",
    "        schedule=T.profiler.schedule(wait=1, warmup=1, active=2, repeat=1),\n",
    "        on_trace_ready=T.profiler.tensorboard_trace_handler(logdir),\n",
    "        record_shapes=True,\n",
    "        profile_memory=True,\n",
    "        with_stack=True,\n",
    "    ) as prof:\n",
    "        prof.step()\n",
    "        for _ in range(2):\n",
    "            # `backward` accumulates into `xs.grad`. Drop the previous\n",
    "            # gradient so that every pass (and every operator) starts from\n",
    "            # the same state and the printed values belong to this pass only.\n",
    "            xs.grad = None\n",
    "            ys = op(xs)\n",
    "            ys.backward(gs)\n",
    "            # Print the first and last gradient elements as a sanity check.\n",
    "            print(xs.grad[0], xs.grad[-1])\n",
    "            prof.step()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5266dcdc",
   "metadata": {},
   "source": [
    "Estimate the computational speed-up.\n",
    "(The values are taken from the `Operator` view of `torch-tb-profiler`.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d40e880",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Elapsed time per pass and implementation, entered by hand.\n",
    "df = pd.DataFrame([\n",
    "    ('forward', 'original', 1.314),\n",
    "    ('forward', 'quantized', 2.862),\n",
    "    ('backward', 'original', 1.906),\n",
    "    ('backward', 'quantized', 1.563),\n",
    "], columns=('pass', 'impl', 'elapsed'))\n",
    "\n",
    "# Sum the forward and backward times per implementation and label the\n",
    "# resulting rows as the `total` pass.\n",
    "df_total = (\n",
    "    df\n",
    "    .groupby('impl', as_index=False)['elapsed']\n",
    "    .sum()\n",
    "    .assign(**{'pass': 'total'})\n",
    ")\n",
    "\n",
    "# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported\n",
    "# replacement and is also available in older pandas versions.\n",
    "df = (\n",
    "    pd.concat([df, df_total], ignore_index=True, sort=False)\n",
    "    .set_index(['impl', 'pass'])\n",
    "    .sort_index()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3ab52fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Divide every elapsed time by the `original` time of the same pass. This\n",
    "# relies on the row order of the sorted index: the first three rows are\n",
    "# `original`, the last three are `quantized`, with the passes in the same\n",
    "# order in both halves.\n",
    "# NOTE: the ratio is quantized / original, so a value above 1 means that the\n",
    "# quantized operator took longer.\n",
    "elapsed = df['elapsed'].to_numpy()\n",
    "baseline = elapsed[:3]\n",
    "\n",
    "speedup = elapsed.copy()\n",
    "speedup[:3] /= baseline\n",
    "speedup[3:] /= baseline\n",
    "\n",
    "df['speedup'] = speedup\n",
    "# Relative difference with respect to the `original` rows (0 for those rows).\n",
    "df['diff'] = df['speedup'] - 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13b8e538",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the summary table with floats rendered to two decimals (with a\n",
    "# thousands separator); the display option only applies inside the context.\n",
    "two_decimals = '{:,.2f}'.format\n",
    "with pd.option_context('display.float_format', two_decimals):\n",
    "    print(df)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
