{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query tensor:\n",
      "    shape: torch.Size([2, 16384, 32, 128])\n",
      "    max value: 5.5, min value: -0.279296875, avg value: 0.20703125\n",
      "\n",
      "key tensor:\n",
      "    shape: torch.Size([2, 16384, 32, 128])\n",
      "    max value: 5.71875, min value: -0.279296875, avg value: 0.20703125\n",
      "\n",
      "value tensor:\n",
      "    shape: torch.Size([2, 16384, 32, 128])\n",
      "    max value: 5.625, min value: -0.279296875, avg value: 0.20703125\n",
      "\n",
      "alpha tensor:\n",
      "    shape: torch.Size([2, 16384, 32])\n",
      "    max value: -3.0989474907983094e-05, min value: -57.585968017578125, avg value: -0.6483831405639648\n",
      "\n",
      "beta tensor:\n",
      "    shape: torch.Size([2, 16384, 32])\n",
      "    max value: 0.9930014610290527, min value: 0.009381243027746677, avg value: 0.5001037120819092\n",
      "\n",
      "gamma tensor:\n",
      "    shape: torch.Size([2, 16384, 32])\n",
      "    max value: 0.9939163327217102, min value: 0.004957016557455063, avg value: 0.500361442565918\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Code for generating example data for attention kernel test\n",
    "\n",
    "import torch\n",
    "import math\n",
    "import os\n",
    "import triton\n",
    "\n",
    "\n",
    "def generate_example_data(\n",
    "    batch_size,\n",
    "    seq_len,\n",
    "    num_heads,\n",
    "    head_dim,\n",
    "    dtype,\n",
    "    device=\"cuda\",\n",
    "    generator=None,\n",
    "):\n",
    "    \"\"\"Generate random q, k, v, alpha, beta and gamma tensors for kernel tests.\n",
    "\n",
    "    Args:\n",
    "        batch_size: Size of the first dimension of every returned tensor.\n",
    "        seq_len: Size of the second dimension of every returned tensor.\n",
    "        num_heads: Size of the third dimension of every returned tensor.\n",
    "        head_dim: Size of the last dimension of q, k and v.\n",
    "        dtype: dtype of q, k and v (alpha, beta and gamma are float32).\n",
    "        device: Device the tensors are created on. Defaults to \"cuda\", the\n",
    "            value that used to be hard-coded, so existing calls are unchanged.\n",
    "        generator: Optional torch.Generator living on `device`, used for\n",
    "            every random draw; pass a seeded one for reproducible data.\n",
    "            None (the default) uses the global RNG, as before.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (q, k, v, alpha, beta, gamma). q, k and v have shape\n",
    "        (batch_size, seq_len, num_heads, head_dim) and hold SiLU-activated\n",
    "        standard-normal noise. alpha, beta and gamma have shape\n",
    "        (batch_size, seq_len, num_heads): alpha is non-positive, beta and\n",
    "        gamma are sigmoids of standard-normal noise.\n",
    "    \"\"\"\n",
    "    qkv_shape = (batch_size, seq_len, num_heads, head_dim)\n",
    "    gate_shape = (batch_size, seq_len, num_heads)\n",
    "\n",
    "    def _randn(shape, out_dtype):\n",
    "        # Single place that decides device and RNG for normal draws.\n",
    "        return torch.randn(\n",
    "            shape, device=device, dtype=out_dtype, generator=generator\n",
    "        )\n",
    "\n",
    "    # Draw order (q, k, v, dt, alpha, beta, gamma) is fixed so that a seeded\n",
    "    # generator always produces the same data.\n",
    "    q = torch.nn.functional.silu(_randn(qkv_shape, dtype))\n",
    "    k = torch.nn.functional.silu(_randn(qkv_shape, dtype))\n",
    "    v = torch.nn.functional.silu(_randn(qkv_shape, dtype))\n",
    "\n",
    "    # exp(A_log) is 1, 2, ..., num_heads: one scale per head.\n",
    "    A_log = torch.arange(1, num_heads + 1, device=device, dtype=torch.float32).log()\n",
    "\n",
    "    # Per-head dt sampled log-uniformly from [dt_min, dt_max].\n",
    "    dt_min = 0.001\n",
    "    dt_max = 0.1\n",
    "    dt = torch.exp(\n",
    "        torch.rand(num_heads, device=device, dtype=torch.float32, generator=generator)\n",
    "        * (math.log(dt_max) - math.log(dt_min))\n",
    "        + math.log(dt_min)\n",
    "    )\n",
    "    # Inverse softplus: dt + log(1 - exp(-dt)) == log(exp(dt) - 1), so\n",
    "    # softplus(dt_bias) == dt.\n",
    "    dt_bias = dt + torch.log(-torch.expm1(-dt))\n",
    "\n",
    "    # A negative per-head scale times softplus (>= 0) keeps alpha <= 0.\n",
    "    alpha = -A_log.exp()[None, None, :] * torch.nn.functional.softplus(\n",
    "        _randn(gate_shape, torch.float32) + dt_bias[None, None, :]\n",
    "    )\n",
    "    beta = torch.sigmoid(_randn(gate_shape, torch.float32))\n",
    "    gamma = torch.sigmoid(_randn(gate_shape, torch.float32))\n",
    "\n",
    "    return q, k, v, alpha, beta, gamma\n",
    "\n",
    "# data example\n",
    "q, k, v, alpha, beta, gamma = generate_example_data(2, 16384, 32, 128, torch.bfloat16)\n",
    "\n",
    "\n",
    "def _print_tensor_stats(**tensors):\n",
    "    \"\"\"Print the shape, max, min and mean of each keyword-named tensor.\n",
    "\n",
    "    Tensors are reported in the order the keyword arguments are given; the\n",
    "    keyword is used as the label in the printed header.\n",
    "    \"\"\"\n",
    "    for name, tensor in tensors.items():\n",
    "        # The continuation lines of the f-string are literal text, so they are\n",
    "        # kept at their original indentation to leave the output unchanged.\n",
    "        print(f\"\"\"{name} tensor:\n",
    "    shape: {tensor.shape}\n",
    "    max value: {tensor.max().item()}, min value: {tensor.min().item()}, avg value: {tensor.mean().item()}\n",
    "\"\"\")\n",
    "\n",
    "\n",
    "_print_tensor_stats(query=q, key=k, value=v, alpha=alpha, beta=beta, gamma=gamma)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Autotuning kernel l2norm_fwd_kernel with config BT: 8, num_warps: 1, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 16, num_warps: 1, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 32, num_warps: 1, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 64, num_warps: 1, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 128, num_warps: 1, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 8, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 16, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 32, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 64, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 128, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 8, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 16, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 32, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 128, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 8, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 16, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 32, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 64, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 128, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 8, num_warps: 16, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 16, num_warps: 16, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 32, num_warps: 16, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 64, num_warps: 16, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel l2norm_fwd_kernel with config BT: 128, num_warps: 16, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Triton autotuning for function l2norm_fwd_kernel,\n",
      "with key as (128, 'torch.bfloat16', 'torch.bfloat16', 'torch.float32'),\n",
      "finished after 14.72s,\n",
      "best config selected: BT: 8, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None;\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 32, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 32, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 32, num_warps: 2, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 32, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 32, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 32, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 64, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 64, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 64, num_warps: 2, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 64, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 64, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 32, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 32, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 32, num_warps: 2, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 32, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 32, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 32, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 64, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 64, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 64, num_warps: 2, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 64, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_h with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 64, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Triton autotuning for function chunk_fwd_kernel_h,\n",
      "with key as (64, 128, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.float32'),\n",
      "finished after 20.53s,\n",
      "best config selected: BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 64, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None;\n",
      "Autotuning kernel chunk_fwd_kernel_o with config BLOCK_SIZE_KD: 128, BLOCK_SIZE_VD: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_o with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_o with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 32, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Triton autotuning for function chunk_fwd_kernel_o,\n",
      "with key as (64, 128, 128, True, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32'),\n",
      "finished after 4.08s,\n",
      "best config selected: BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None;\n",
      "Autotuning kernel chunk_fwd_kernel_sq_sk with config BLOCK_SIZE_KD: 128, BLOCK_SIZE_VD: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_sq_sk with config BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_sq_sk with config BLOCK_SIZE_KD: 32, BLOCK_SIZE_VD: 32, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Triton autotuning for function chunk_fwd_kernel_sq_sk,\n",
      "with key as (64, 128, 128, True, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32'),\n",
      "finished after 5.45s,\n",
      "best config selected: BLOCK_SIZE_KD: 64, BLOCK_SIZE_VD: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None;\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 64, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 64, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 64, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 64, num_warps: 8, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 64, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 128, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 128, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 128, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 128, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 128, num_warps: 8, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 128, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 64, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 64, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 64, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 64, num_warps: 8, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 64, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 128, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 128, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 128, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 128, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 128, num_warps: 8, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _clip_kernel_fwd with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 128, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Triton autotuning for function _clip_kernel_fwd,\n",
      "with key as (False, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16'),\n",
      "finished after 12.33s,\n",
      "best config selected: BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 128, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None;\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 32, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 32, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 32, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 32, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 64, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 64, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 64, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 32, BLOCK_SIZE_D: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 32, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 32, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 32, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 32, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 64, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 64, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 64, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel _merge_fwd_kernel with config BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Triton autotuning for function _merge_fwd_kernel,\n",
      "with key as (128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32'),\n",
      "finished after 8.75s,\n",
      "best config selected: BLOCK_SIZE_N: 64, BLOCK_SIZE_D: 64, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None;\n",
      "o_sgla, shape: torch.Size([2, 16384, 32, 128])\n",
      "tensor([[[[ 4.5166e-03, -2.5024e-03, -2.1515e-03,  ..., -1.3428e-03,\n",
      "            8.8120e-04, -9.2316e-04],\n",
      "          [ 2.6398e-03,  9.3384e-03,  7.1335e-04,  ...,  1.2283e-03,\n",
      "            3.9978e-03,  1.3199e-03],\n",
      "          [ 6.9275e-03, -4.2419e-03,  9.6436e-03,  ...,  5.1575e-03,\n",
      "           -3.2806e-03,  3.3569e-03],\n",
      "          ...,\n",
      "          [ 7.1411e-03,  5.7068e-03, -1.5030e-03,  ...,  2.1667e-03,\n",
      "           -2.1515e-03,  1.3000e-02],\n",
      "          [-1.1778e-04,  5.3406e-04,  4.7684e-05,  ...,  9.0790e-04,\n",
      "           -5.2490e-03,  9.9945e-04],\n",
      "          [-1.6022e-04,  6.9580e-03,  2.1839e-04,  ...,  1.0010e-02,\n",
      "           -1.3428e-03, -1.1292e-03]],\n",
      "\n",
      "         [[ 9.3384e-03, -4.8523e-03, -4.2725e-03,  ..., -2.8839e-03,\n",
      "            7.1106e-03,  8.2397e-03],\n",
      "          [ 4.3678e-04,  6.2256e-03,  7.0190e-03,  ...,  4.8218e-03,\n",
      "            2.8839e-03, -3.2425e-04],\n",
      "          [ 3.8910e-03,  1.9989e-03,  5.2185e-03,  ...,  1.4343e-03,\n",
      "            2.3804e-03,  1.9073e-03],\n",
      "          ...,\n",
      "          [ 1.1292e-03, -6.1798e-04,  4.3106e-04,  ...,  4.2343e-04,\n",
      "           -7.6675e-04,  1.1368e-03],\n",
      "          [ 1.0777e-04,  1.5163e-04,  4.9114e-05,  ..., -2.8534e-03,\n",
      "            1.2939e-02, -4.4250e-04],\n",
      "          [-2.1172e-04,  1.2817e-03, -2.3842e-04,  ...,  2.2583e-03,\n",
      "           -4.7922e-05,  3.5095e-04]],\n",
      "\n",
      "         [[ 5.8594e-03, -5.6458e-03, -1.1826e-03,  ..., -2.7466e-03,\n",
      "            1.8066e-02,  1.3916e-02],\n",
      "          [ 4.9591e-04,  1.5640e-03,  5.9204e-03,  ...,  2.5482e-03,\n",
      "            2.5177e-03,  2.1362e-03],\n",
      "          [ 4.4250e-03,  8.6060e-03,  5.0964e-03,  ...,  2.7618e-03,\n",
      "            3.9978e-03,  6.2866e-03],\n",
      "          ...,\n",
      "          [ 6.8054e-03,  4.8218e-03, -1.6251e-03,  ..., -1.5182e-03,\n",
      "            5.6763e-03,  3.8086e-02],\n",
      "          [-1.3885e-03,  5.1575e-03,  8.0109e-05,  ..., -1.9989e-03,\n",
      "           -1.0300e-03,  8.2397e-03],\n",
      "          [ 2.2705e-02,  1.5076e-02, -1.3275e-03,  ...,  1.6235e-02,\n",
      "           -1.9836e-03,  7.3624e-04]],\n",
      "\n",
      "         ...,\n",
      "\n",
      "         [[ 3.8477e-01,  4.2188e-01,  3.1641e-01,  ...,  3.6523e-01,\n",
      "            4.4727e-01,  3.9648e-01],\n",
      "          [ 1.2695e-02,  2.0630e-02,  2.6550e-03,  ...,  4.8218e-03,\n",
      "            1.1139e-03,  2.1118e-02],\n",
      "          [ 6.7871e-02,  6.9824e-02,  5.6885e-02,  ...,  3.2959e-02,\n",
      "            7.6172e-02,  7.1289e-02],\n",
      "          ...,\n",
      "          [-1.5793e-03,  1.5625e-02,  1.5182e-03,  ...,  1.4572e-03,\n",
      "           -3.3569e-03,  3.8757e-03],\n",
      "          [ 1.6098e-03,  2.2125e-03,  6.1035e-03,  ...,  4.1199e-03,\n",
      "            3.3722e-03,  5.2490e-03],\n",
      "          [-1.1673e-03,  2.8839e-03, -2.3499e-03,  ...,  9.9487e-03,\n",
      "           -9.4986e-04,  1.5030e-03]],\n",
      "\n",
      "         [[ 3.1641e-01,  3.7891e-01,  2.6562e-01,  ...,  3.6133e-01,\n",
      "            3.9453e-01,  3.3398e-01],\n",
      "          [ 1.2878e-02,  2.0386e-02, -1.9455e-04,  ...,  1.5320e-02,\n",
      "           -1.3275e-03,  2.8320e-02],\n",
      "          [ 1.0400e-01,  6.6406e-02,  8.5449e-02,  ...,  6.2988e-02,\n",
      "            1.0693e-01,  9.3750e-02],\n",
      "          ...,\n",
      "          [-1.5335e-03,  7.1411e-03,  5.7373e-03,  ..., -2.1820e-03,\n",
      "           -1.4648e-03,  1.0559e-02],\n",
      "          [ 4.4556e-03,  1.1169e-02,  1.9775e-02,  ...,  1.0925e-02,\n",
      "            7.6599e-03,  1.1536e-02],\n",
      "          [-2.6131e-04,  2.1973e-03, -5.7220e-04,  ...,  2.0294e-03,\n",
      "           -2.8992e-04, -5.3787e-04]],\n",
      "\n",
      "         [[ 4.6680e-01,  5.4297e-01,  3.8281e-01,  ...,  4.4336e-01,\n",
      "            5.3516e-01,  4.5508e-01],\n",
      "          [ 1.4877e-03,  1.2329e-02, -2.1362e-03,  ...,  1.4160e-02,\n",
      "            1.2573e-02,  8.6670e-03],\n",
      "          [ 7.9102e-02,  4.8828e-02,  6.7871e-02,  ...,  5.9082e-02,\n",
      "            5.7373e-02,  4.9072e-02],\n",
      "          ...,\n",
      "          [ 2.0630e-02,  4.8828e-03, -1.4267e-03,  ...,  2.7313e-03,\n",
      "           -4.6082e-03,  8.3618e-03],\n",
      "          [ 2.0905e-03,  1.1536e-02,  1.5869e-02,  ...,  9.0332e-03,\n",
      "            8.7738e-04,  1.1230e-02],\n",
      "          [ 8.1253e-04, -1.3657e-03,  8.9264e-04,  ...,  7.0953e-04,\n",
      "           -4.0770e-05, -2.6321e-04]]],\n",
      "\n",
      "\n",
      "        [[[ 1.8005e-03,  7.1106e-03,  6.0425e-03,  ..., -1.0967e-04,\n",
      "           -7.9346e-04, -1.1597e-03],\n",
      "          [ 6.7139e-03,  4.1504e-03, -9.2506e-05,  ..., -1.7166e-03,\n",
      "            3.0518e-04,  3.7994e-03],\n",
      "          [-1.2817e-03,  6.6833e-03,  3.8910e-03,  ..., -9.5367e-04,\n",
      "           -3.1948e-05,  1.0010e-02],\n",
      "          ...,\n",
      "          [-1.5945e-03,  2.7847e-04,  1.0300e-03,  ...,  2.4261e-03,\n",
      "           -1.4648e-03, -1.0529e-03],\n",
      "          [-1.7624e-03,  1.4114e-03, -1.5488e-03,  ..., -1.5030e-03,\n",
      "           -1.7319e-03,  1.6022e-03],\n",
      "          [ 9.9182e-04,  1.3924e-04,  6.3324e-04,  ...,  9.9182e-04,\n",
      "            2.2030e-04,  9.3460e-04]],\n",
      "\n",
      "         [[ 2.9297e-03,  3.0518e-03,  1.1658e-02,  ...,  9.0332e-03,\n",
      "            3.6469e-03,  6.8359e-03],\n",
      "          [ 1.4343e-02,  4.6692e-03,  1.9836e-03,  ..., -3.0060e-03,\n",
      "            1.2665e-03,  1.2360e-03],\n",
      "          [-3.1433e-03,  6.9885e-03,  2.2583e-03,  ..., -2.0752e-03,\n",
      "            4.0588e-03,  9.0942e-03],\n",
      "          ...,\n",
      "          [-1.5030e-03,  2.6131e-04,  8.8120e-04,  ...,  1.9531e-03,\n",
      "           -1.0910e-03, -8.8120e-04],\n",
      "          [-2.1820e-03,  3.9673e-03,  3.2196e-03,  ..., -3.7384e-03,\n",
      "            1.0437e-02, -1.4973e-04],\n",
      "          [ 8.3008e-03,  1.4877e-03,  4.9744e-03,  ...,  8.5449e-03,\n",
      "            3.3569e-03,  7.9956e-03]],\n",
      "\n",
      "         [[ 2.9907e-03,  5.0659e-03,  9.0942e-03,  ...,  4.9744e-03,\n",
      "            1.6556e-03,  3.7689e-03],\n",
      "          [ 2.9297e-02,  9.7046e-03,  4.1199e-03,  ..., -2.6855e-03,\n",
      "            1.4801e-03,  2.7618e-03],\n",
      "          [-3.2959e-03,  2.2095e-02,  2.8992e-03,  ..., -6.1951e-03,\n",
      "            2.7222e-02,  1.1047e-02],\n",
      "          ...,\n",
      "          [ 2.1973e-03, -6.1417e-04, -7.3242e-04,  ...,  5.1498e-04,\n",
      "           -1.5869e-03,  3.7956e-04],\n",
      "          [ 1.9653e-02, -1.3638e-04, -2.1362e-03,  ...,  1.5793e-03,\n",
      "           -4.0531e-05, -8.2016e-04],\n",
      "          [ 1.8311e-03,  6.9809e-04,  7.2479e-04,  ...,  2.2583e-03,\n",
      "            2.6093e-03,  2.0447e-03]],\n",
      "\n",
      "         ...,\n",
      "\n",
      "         [[ 3.9648e-01,  3.7109e-01,  3.9844e-01,  ...,  5.3516e-01,\n",
      "            4.9609e-01,  3.4570e-01],\n",
      "          [ 2.8320e-02,  1.5259e-02,  1.1841e-02,  ...,  2.1484e-02,\n",
      "            2.1729e-02,  1.5747e-02],\n",
      "          [ 9.5215e-02,  9.7168e-02,  1.0059e-01,  ...,  8.3496e-02,\n",
      "            3.9307e-02,  1.0059e-01],\n",
      "          ...,\n",
      "          [ 2.3041e-03,  6.5613e-04,  2.2583e-03,  ...,  5.3406e-03,\n",
      "            5.4016e-03, -2.0599e-03],\n",
      "          [ 1.8066e-02,  1.3000e-02,  6.7749e-03,  ..., -1.7929e-03,\n",
      "            4.3335e-03,  1.8799e-02],\n",
      "          [ 2.5940e-03,  1.6327e-03, -1.2131e-03,  ...,  1.6708e-03,\n",
      "            5.7678e-03,  5.7602e-04]],\n",
      "\n",
      "         [[ 2.1387e-01,  2.6172e-01,  2.5000e-01,  ...,  2.7539e-01,\n",
      "            3.6914e-01,  2.0898e-01],\n",
      "          [ 2.8442e-02,  3.2959e-02,  6.1646e-03,  ...,  3.3447e-02,\n",
      "            2.1362e-02,  2.6367e-02],\n",
      "          [ 4.7607e-02,  8.8379e-02,  8.6914e-02,  ...,  5.4688e-02,\n",
      "            1.2817e-02,  7.7637e-02],\n",
      "          ...,\n",
      "          [ 1.2684e-04, -1.5335e-03,  2.2736e-03,  ...,  6.2561e-03,\n",
      "            6.1646e-03,  1.8692e-03],\n",
      "          [ 2.1118e-02,  1.9287e-02,  1.2878e-02,  ...,  7.4768e-03,\n",
      "            1.2512e-02,  1.6846e-02],\n",
      "          [-9.8419e-04, -5.8174e-05,  3.8910e-03,  ..., -1.2131e-03,\n",
      "            1.0681e-02,  6.1035e-03]],\n",
      "\n",
      "         [[ 3.3984e-01,  3.7109e-01,  4.2969e-01,  ...,  5.1562e-01,\n",
      "            4.7461e-01,  3.1055e-01],\n",
      "          [ 4.1992e-02,  5.6152e-03,  1.2695e-02,  ...,  1.6113e-02,\n",
      "            2.3071e-02,  2.6001e-02],\n",
      "          [ 7.2266e-02,  9.5215e-02,  1.1084e-01,  ...,  8.8379e-02,\n",
      "            2.8320e-02,  1.1426e-01],\n",
      "          ...,\n",
      "          [ 6.5002e-03, -1.8921e-03,  3.0060e-03,  ...,  1.5640e-03,\n",
      "            5.8594e-03,  1.6098e-03],\n",
      "          [ 2.6245e-02,  1.9653e-02,  1.6479e-02,  ..., -1.9302e-03,\n",
      "            6.1646e-03,  2.0142e-02],\n",
      "          [-6.5994e-04, -1.2054e-03,  2.2430e-03,  ..., -1.8597e-04,\n",
      "            1.5640e-03,  4.6692e-03]]]], device='cuda:0', dtype=torch.bfloat16)\n",
      "o_rla, shape: torch.Size([2, 16384, 32, 128])\n",
      "tensor([[[[ 4.3678e-04, -2.4033e-04, -2.0695e-04,  ..., -1.2970e-04,\n",
      "            8.4877e-05, -8.9169e-05],\n",
      "          [ 1.1749e-03,  1.7471e-03,  3.1853e-04,  ...,  5.4932e-04,\n",
      "            1.7471e-03,  5.8746e-04],\n",
      "          [ 2.1362e-04, -1.3065e-04,  2.9755e-04,  ...,  1.5926e-04,\n",
      "           -1.0109e-04,  1.0347e-04],\n",
      "          ...,\n",
      "          [ 8.0585e-05,  6.4373e-05, -1.6928e-05,  ...,  2.4557e-05,\n",
      "           -2.4319e-05,  8.7261e-05],\n",
      "          [-4.2200e-05,  1.9073e-04,  1.7047e-05,  ...,  3.2425e-04,\n",
      "           -1.3275e-03,  3.5667e-04],\n",
      "          [-1.2589e-04,  3.9978e-03,  1.7166e-04,  ...,  3.9978e-03,\n",
      "           -1.0452e-03, -8.8120e-04]],\n",
      "\n",
      "         [[ 1.1536e-02, -5.9509e-03, -5.2185e-03,  ..., -3.5858e-03,\n",
      "            8.3008e-03,  5.0964e-03],\n",
      "          [ 3.1891e-03,  9.5825e-03,  3.1433e-03,  ...,  3.0212e-03,\n",
      "            5.5237e-03,  1.4114e-03],\n",
      "          [ 3.9673e-03,  1.2970e-03,  5.3101e-03,  ...,  1.5488e-03,\n",
      "            1.8158e-03,  1.9379e-03],\n",
      "          ...,\n",
      "          [ 1.3123e-03, -8.2016e-04,  5.3787e-04,  ...,  4.9591e-04,\n",
      "           -9.1553e-04,  1.1978e-03],\n",
      "          [ 3.0327e-04, -8.8120e-04, -5.1022e-05,  ..., -3.9062e-03,\n",
      "            1.7822e-02, -2.3499e-03],\n",
      "          [-5.0545e-05,  1.7624e-03,  4.5538e-05,  ...,  2.4567e-03,\n",
      "           -3.4523e-04, -2.7466e-04]],\n",
      "\n",
      "         [[ 6.8665e-03, -4.0283e-03, -3.0060e-03,  ..., -2.1973e-03,\n",
      "            4.1199e-03,  2.4109e-03],\n",
      "          [ 2.6131e-04,  3.0823e-03,  5.4932e-03,  ...,  3.5706e-03,\n",
      "            2.1210e-03,  1.1504e-05],\n",
      "          [ 4.7607e-03,  6.5308e-03,  6.0425e-03,  ...,  7.0190e-04,\n",
      "            6.4392e-03,  2.7771e-03],\n",
      "          ...,\n",
      "          [ 6.3477e-03,  7.2632e-03, -2.6703e-03,  ..., -1.0757e-03,\n",
      "            5.0354e-03,  1.8555e-02],\n",
      "          [-2.2583e-03,  1.0315e-02,  5.4169e-04,  ..., -3.2501e-03,\n",
      "           -6.1340e-03,  1.5259e-02],\n",
      "          [ 1.6556e-03,  3.8300e-03, -1.4191e-03,  ...,  6.2561e-03,\n",
      "            5.8889e-05,  2.2278e-03]],\n",
      "\n",
      "         ...,\n",
      "\n",
      "         [[-2.5391e-01, -2.2363e-01, -3.2031e-01,  ..., -2.6953e-01,\n",
      "           -1.9043e-01, -2.4316e-01],\n",
      "          [ 2.9602e-03,  1.8921e-02,  1.9989e-03,  ...,  3.4485e-03,\n",
      "            2.2125e-03,  2.1118e-02],\n",
      "          [-4.3213e-02, -3.6133e-02, -4.8340e-02,  ..., -5.7617e-02,\n",
      "           -2.7222e-02, -3.2959e-02],\n",
      "          ...,\n",
      "          [-2.8381e-03,  1.9165e-02,  4.8065e-04,  ...,  3.2806e-03,\n",
      "           -4.1809e-03,  3.3417e-03],\n",
      "          [ 1.6022e-03,  2.5177e-03,  6.2256e-03,  ...,  4.5776e-03,\n",
      "            4.3335e-03,  5.7068e-03],\n",
      "          [-2.2278e-03,  6.0120e-03, -3.8757e-03,  ...,  8.7280e-03,\n",
      "           -1.4572e-03,  1.2207e-03]],\n",
      "\n",
      "         [[-5.1172e-01, -4.5117e-01, -5.6250e-01,  ..., -4.6875e-01,\n",
      "           -4.3359e-01, -4.8633e-01],\n",
      "          [ 3.9368e-03,  1.2207e-02, -5.4932e-04,  ...,  1.1230e-02,\n",
      "           -9.2163e-03,  1.9775e-02],\n",
      "          [-5.7129e-02, -1.1377e-01, -7.7148e-02,  ..., -7.0312e-02,\n",
      "           -5.0293e-02, -6.5430e-02],\n",
      "          ...,\n",
      "          [-7.2098e-04,  9.6436e-03,  8.9111e-03,  ..., -3.3722e-03,\n",
      "           -2.7313e-03,  1.4282e-02],\n",
      "          [ 5.1270e-03,  1.9165e-02,  2.6123e-02,  ...,  1.5442e-02,\n",
      "            1.3733e-02,  1.5503e-02],\n",
      "          [-5.3024e-04,  2.5024e-03, -1.0834e-03,  ...,  3.3569e-03,\n",
      "           -5.4169e-04, -1.3351e-03]],\n",
      "\n",
      "         [[-7.4219e-01, -6.7969e-01, -8.2422e-01,  ..., -7.6562e-01,\n",
      "           -6.7578e-01, -7.5000e-01],\n",
      "          [-5.7678e-03,  7.0496e-03,  1.4648e-03,  ...,  1.1780e-02,\n",
      "            1.4893e-02, -1.1349e-04],\n",
      "          [-4.3945e-02, -8.0078e-02, -5.2246e-02,  ..., -4.0039e-02,\n",
      "           -7.0312e-02, -7.7637e-02],\n",
      "          ...,\n",
      "          [ 9.2773e-03,  5.8899e-03, -6.4087e-04,  ...,  2.0752e-03,\n",
      "           -3.7689e-03,  6.1951e-03],\n",
      "          [ 2.4719e-03,  1.6357e-02,  1.7334e-02,  ...,  1.0498e-02,\n",
      "            1.6785e-03,  1.0742e-02],\n",
      "          [ 5.9891e-04, -1.7090e-03,  7.5531e-04,  ...,  6.8665e-04,\n",
      "           -1.9908e-05, -1.8597e-04]]],\n",
      "\n",
      "\n",
      "        [[[ 7.5912e-04,  1.7853e-03,  1.7853e-03,  ..., -4.6253e-05,\n",
      "           -3.3569e-04, -4.8828e-04],\n",
      "          [ 7.0496e-03,  4.3335e-03, -9.6798e-05,  ..., -1.8005e-03,\n",
      "            3.2043e-04,  3.9978e-03],\n",
      "          [-1.0204e-04,  5.2643e-04,  3.0899e-04,  ..., -7.5340e-05,\n",
      "           -2.5332e-06,  6.6376e-04],\n",
      "          ...,\n",
      "          [-1.5411e-03,  2.6894e-04,  9.9182e-04,  ...,  2.3499e-03,\n",
      "           -1.4267e-03, -1.0223e-03],\n",
      "          [-1.5030e-03,  1.1978e-03, -1.3199e-03,  ..., -1.2817e-03,\n",
      "           -1.4801e-03,  1.3657e-03],\n",
      "          [ 1.0910e-03,  1.8024e-04,  8.2397e-04,  ...,  1.0910e-03,\n",
      "            2.8610e-04,  1.0910e-03]],\n",
      "\n",
      "         [[ 2.1210e-03,  5.3406e-03,  7.1411e-03,  ...,  1.9531e-03,\n",
      "            4.0817e-04,  9.6130e-04],\n",
      "          [ 1.5564e-02,  7.5684e-03,  1.0681e-03,  ..., -3.7689e-03,\n",
      "            1.1063e-03,  5.1575e-03],\n",
      "          [-1.4420e-03,  7.3853e-03,  4.2725e-03,  ..., -1.0605e-03,\n",
      "            5.0962e-06,  1.0986e-02],\n",
      "          ...,\n",
      "          [-3.0212e-03,  5.2643e-04,  1.8311e-03,  ...,  4.1809e-03,\n",
      "           -2.4109e-03, -1.8616e-03],\n",
      "          [-1.0071e-03,  1.0986e-03, -2.3365e-04,  ..., -1.1063e-03,\n",
      "            6.1798e-04,  6.4468e-04],\n",
      "          [ 1.6724e-02,  3.2043e-03,  1.1108e-02,  ...,  1.7334e-02,\n",
      "            5.7983e-03,  1.6724e-02]],\n",
      "\n",
      "         [[ 7.4463e-03,  8.6670e-03,  1.2390e-02,  ...,  5.9509e-03,\n",
      "            1.7014e-03,  6.9580e-03],\n",
      "          [ 3.8330e-02,  1.0559e-02,  6.7444e-03,  ..., -6.1035e-03,\n",
      "            3.2654e-03, -5.1880e-04],\n",
      "          [-5.4321e-03,  1.3245e-02,  2.3956e-03,  ..., -4.4556e-03,\n",
      "            1.1353e-02,  1.2329e-02],\n",
      "          ...,\n",
      "          [ 3.0670e-03, -5.3787e-04, -6.4468e-04,  ...,  6.2561e-04,\n",
      "           -1.7548e-03,  2.7466e-04],\n",
      "          [ 8.4839e-03,  1.5793e-03, -3.4809e-05,  ..., -8.1253e-04,\n",
      "            3.6316e-03, -2.4605e-04],\n",
      "          [ 3.6621e-03,  1.7548e-03,  1.2817e-03,  ...,  4.8828e-03,\n",
      "            5.4626e-03,  4.4556e-03]],\n",
      "\n",
      "         ...,\n",
      "\n",
      "         [[-1.5078e+00, -1.5391e+00, -1.5078e+00,  ..., -1.3828e+00,\n",
      "           -1.4141e+00, -1.5547e+00],\n",
      "          [ 3.0151e-02,  1.5076e-02,  1.2207e-02,  ...,  2.2827e-02,\n",
      "            2.5391e-02,  1.0986e-02],\n",
      "          [-9.8633e-02, -1.2598e-01, -1.6406e-01,  ..., -1.4746e-01,\n",
      "           -1.6797e-01, -1.4453e-01],\n",
      "          ...,\n",
      "          [ 3.8147e-03,  4.8828e-04,  3.7537e-03,  ...,  7.2937e-03,\n",
      "            8.1177e-03, -3.4180e-03],\n",
      "          [ 2.3804e-02,  1.6357e-02,  1.6251e-03,  ..., -1.0620e-02,\n",
      "            6.2866e-03,  1.7822e-02],\n",
      "          [ 2.1515e-03,  3.8757e-03, -2.9755e-04,  ...,  3.5400e-03,\n",
      "            3.0060e-03,  2.5940e-03]],\n",
      "\n",
      "         [[-3.4180e-01, -2.9688e-01, -3.1836e-01,  ..., -2.8711e-01,\n",
      "           -1.9824e-01, -3.5156e-01],\n",
      "          [ 2.9053e-02,  4.0527e-02,  2.4414e-03,  ...,  4.2236e-02,\n",
      "            2.1240e-02,  2.0264e-02],\n",
      "          [-1.4844e-01, -9.8145e-02, -1.6309e-01,  ..., -1.6309e-01,\n",
      "           -1.7773e-01, -1.3672e-01],\n",
      "          ...,\n",
      "          [ 4.2915e-04, -3.4485e-03,  4.5471e-03,  ...,  9.3994e-03,\n",
      "            1.0437e-02,  3.0670e-03],\n",
      "          [ 2.4292e-02,  2.2705e-02,  8.4839e-03,  ...,  3.1281e-03,\n",
      "            1.5076e-02,  1.4160e-02],\n",
      "          [ 7.2098e-04,  8.7357e-04,  3.4523e-04,  ...,  2.8801e-04,\n",
      "            2.6093e-03,  9.4986e-04]],\n",
      "\n",
      "         [[-4.8633e-01, -4.5312e-01, -3.9648e-01,  ..., -3.1250e-01,\n",
      "           -3.4961e-01, -5.1562e-01],\n",
      "          [ 4.2236e-02,  5.1575e-03,  1.7334e-02,  ...,  1.7822e-02,\n",
      "            2.7832e-02,  2.1362e-02],\n",
      "          [-7.2266e-02, -6.9824e-02, -7.6660e-02,  ..., -6.5430e-02,\n",
      "           -1.2207e-01, -4.9072e-02],\n",
      "          ...,\n",
      "          [ 1.1368e-03, -1.6403e-03,  1.9836e-03,  ...,  2.7008e-03,\n",
      "            7.0496e-03,  2.2736e-03],\n",
      "          [ 4.1260e-02,  3.2959e-02,  1.7334e-02,  ..., -1.1780e-02,\n",
      "            1.3794e-02,  1.5747e-02],\n",
      "          [-5.1498e-04, -6.5994e-04,  1.8387e-03,  ...,  1.7047e-05,\n",
      "            1.1749e-03,  3.9062e-03]]]], device='cuda:0', dtype=torch.bfloat16)\n"
     ]
    }
   ],
   "source": [
    "# Example usage of Residual Linear Attention.\n",
    "# Expect this cell to be slow on its first run, because Triton autotuning\n",
    "# is performed for the kernels it launches.\n",
    "\n",
    "from residual_linear_attention import rla_prefill, sgla_prefill\n",
    "\n",
    "# Have Triton report its autotuning results on stdout.\n",
    "os.environ[\"TRITON_PRINT_AUTOTUNING\"] = \"1\"\n",
    "\n",
    "q, k, v, alpha, beta, gamma = generate_example_data(\n",
    "    batch_size=2,\n",
    "    seq_len=16384,\n",
    "    num_heads=32,\n",
    "    head_dim=128,\n",
    "    dtype=torch.bfloat16,\n",
    ")\n",
    "\n",
    "# Only the first value returned by each prefill call is kept; the remaining\n",
    "# return values are discarded.\n",
    "o_sgla, _ = sgla_prefill(q, k, v, alpha, beta)\n",
    "o_rla, _, _ = rla_prefill(q, k, v, alpha, beta, gamma)\n",
    "\n",
    "# One print per result: a header line with the shape, then the tensor itself.\n",
    "print(f\"o_sgla, shape: {o_sgla.shape}\", o_sgla, sep=\"\\n\")\n",
    "print(f\"o_rla, shape: {o_rla.shape}\", o_rla, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Autotuning kernel chunk_local_cumsum_scalar_kernel with config num_warps: 1, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_local_cumsum_scalar_kernel with config num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_local_cumsum_scalar_kernel with config num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_local_cumsum_scalar_kernel with config num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Triton autotuning for function chunk_local_cumsum_scalar_kernel,\n",
      "with key as (2, 32, 64, False, False, 'torch.float32', 'torch.float32'),\n",
      "finished after 1.92s,\n",
      "best config selected: num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None;\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 32, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 32, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 32, num_warps: 2, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 32, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 32, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 32, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 32, num_warps: 8, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 32, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 32, num_warps: 8, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 64, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 64, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 64, num_warps: 2, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 64, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 64, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 64, num_warps: 8, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 64, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 64, num_warps: 8, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 128, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 128, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 128, num_warps: 2, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 128, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 128, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 128, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 128, num_warps: 8, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 128, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_scaled_dot_kkt_fwd_kernel with config BK: 128, num_warps: 8, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Triton autotuning for function chunk_scaled_dot_kkt_fwd_kernel,\n",
      "with key as (32, 128, 64, False, 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32'),\n",
      "finished after 19.31s,\n",
      "best config selected: BK: 64, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None;\n",
      "Autotuning kernel merge_16x16_to_64x64_inverse_kernel with config num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel merge_16x16_to_64x64_inverse_kernel with config num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel merge_16x16_to_64x64_inverse_kernel with config num_warps: 2, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel merge_16x16_to_64x64_inverse_kernel with config num_warps: 2, num_ctas: 1, num_stages: 5, maxnreg: None\n",
      "Autotuning kernel merge_16x16_to_64x64_inverse_kernel with config num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel merge_16x16_to_64x64_inverse_kernel with config num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel merge_16x16_to_64x64_inverse_kernel with config num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel merge_16x16_to_64x64_inverse_kernel with config num_warps: 4, num_ctas: 1, num_stages: 5, maxnreg: None\n",
      "Autotuning kernel merge_16x16_to_64x64_inverse_kernel with config num_warps: 8, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel merge_16x16_to_64x64_inverse_kernel with config num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel merge_16x16_to_64x64_inverse_kernel with config num_warps: 8, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel merge_16x16_to_64x64_inverse_kernel with config num_warps: 8, num_ctas: 1, num_stages: 5, maxnreg: None\n",
      "Triton autotuning for function merge_16x16_to_64x64_inverse_kernel,\n",
      "with key as (32, 64, False, 'torch.float32', 'torch.bfloat16'),\n",
      "finished after 12.96s,\n",
      "best config selected: num_warps: 2, num_ctas: 1, num_stages: 4, maxnreg: None;\n",
      "Autotuning kernel recompute_w_u_fwd_kernel with config num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel recompute_w_u_fwd_kernel with config num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel recompute_w_u_fwd_kernel with config num_warps: 2, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel recompute_w_u_fwd_kernel with config num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel recompute_w_u_fwd_kernel with config num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel recompute_w_u_fwd_kernel with config num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel recompute_w_u_fwd_kernel with config num_warps: 8, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel recompute_w_u_fwd_kernel with config num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel recompute_w_u_fwd_kernel with config num_warps: 8, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Triton autotuning for function recompute_w_u_fwd_kernel,\n",
      "with key as (32, 128, 128, 64, 64, 64, False, 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32'),\n",
      "finished after 8.28s,\n",
      "best config selected: num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None;\n",
      "Autotuning kernel chunk_gated_delta_rule_fwd_kernel_h_blockdim64 with config BV: 32, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_gated_delta_rule_fwd_kernel_h_blockdim64 with config BV: 64, num_warps: 2, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_gated_delta_rule_fwd_kernel_h_blockdim64 with config BV: 32, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_gated_delta_rule_fwd_kernel_h_blockdim64 with config BV: 64, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_gated_delta_rule_fwd_kernel_h_blockdim64 with config BV: 32, num_warps: 2, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_gated_delta_rule_fwd_kernel_h_blockdim64 with config BV: 64, num_warps: 2, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_gated_delta_rule_fwd_kernel_h_blockdim64 with config BV: 32, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_gated_delta_rule_fwd_kernel_h_blockdim64 with config BV: 64, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None\n",
      "Autotuning kernel chunk_gated_delta_rule_fwd_kernel_h_blockdim64 with config BV: 32, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_gated_delta_rule_fwd_kernel_h_blockdim64 with config BV: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_gated_delta_rule_fwd_kernel_h_blockdim64 with config BV: 32, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Autotuning kernel chunk_gated_delta_rule_fwd_kernel_h_blockdim64 with config BV: 64, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None\n",
      "Triton autotuning for function chunk_gated_delta_rule_fwd_kernel_h_blockdim64,\n",
      "with key as (32, 128, 128, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32'),\n",
      "finished after 17.94s,\n",
      "best config selected: BV: 64, num_warps: 4, num_ctas: 1, num_stages: 4, maxnreg: None;\n",
      "Autotuning kernel chunk_fwd_kernel_o with config BK: 128, BV: 128, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_o with config BK: 64, BV: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_o with config BK: 32, BV: 32, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Triton autotuning for function chunk_fwd_kernel_o,\n",
      "with key as (32, 128, 128, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16'),\n",
      "finished after 3.37s,\n",
      "best config selected: BK: 64, BV: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None;\n",
      "Autotuning kernel chunk_fwd_kernel_sq_sk with config BK: 128, BV: 128, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_sq_sk with config BK: 64, BV: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Autotuning kernel chunk_fwd_kernel_sq_sk with config BK: 32, BV: 32, num_warps: 2, num_ctas: 1, num_stages: 3, maxnreg: None\n",
      "Triton autotuning for function chunk_fwd_kernel_sq_sk,\n",
      "with key as (32, 128, 128, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16'),\n",
      "finished after 4.50s,\n",
      "best config selected: BK: 64, BV: 64, num_warps: 4, num_ctas: 1, num_stages: 3, maxnreg: None;\n",
      "o_gdn, shape: torch.Size([2, 16384, 32, 128])\n",
      "tensor([[[[ 7.0190e-04,  1.1108e-02,  1.5991e-02,  ...,  1.0803e-02,\n",
      "           -3.5095e-04, -2.7008e-03],\n",
      "          [ 2.8992e-03, -2.7618e-03,  5.2185e-03,  ..., -2.6855e-03,\n",
      "           -1.9836e-03, -1.5411e-03],\n",
      "          [ 7.3242e-03, -8.5449e-04,  1.3351e-03,  ...,  2.5177e-03,\n",
      "           -2.1057e-03, -3.7956e-04],\n",
      "          ...,\n",
      "          [-9.7656e-04, -4.7874e-04, -1.1396e-04,  ..., -7.8964e-04,\n",
      "           -1.4191e-03,  4.1809e-03],\n",
      "          [-1.8311e-04,  2.0599e-03, -1.2131e-03,  ...,  6.7520e-04,\n",
      "            1.9455e-03, -7.0953e-04],\n",
      "          [ 1.1027e-05,  1.3733e-04, -2.9755e-04,  ..., -5.3787e-04,\n",
      "            6.2943e-05, -2.0599e-04]],\n",
      "\n",
      "         [[ 5.8365e-04,  7.1716e-03,  1.0620e-02,  ...,  7.2021e-03,\n",
      "           -3.1281e-04, -1.8158e-03],\n",
      "          [ 3.0060e-03, -3.3569e-03,  5.4016e-03,  ...,  5.3406e-03,\n",
      "           -3.5858e-03, -2.6703e-03],\n",
      "          [ 7.5989e-03, -6.9427e-04,  6.1417e-04,  ...,  1.6556e-03,\n",
      "           -2.4567e-03, -5.7220e-04],\n",
      "          ...,\n",
      "          [-4.1809e-03, -1.1139e-03,  4.4250e-03,  ...,  8.1177e-03,\n",
      "           -3.5553e-03,  1.9684e-03],\n",
      "          [-2.0742e-05, -3.9864e-04,  5.7983e-04,  ...,  1.3885e-03,\n",
      "           -8.5449e-04, -7.5912e-04],\n",
      "          [ 8.4839e-03, -4.1199e-03,  1.0010e-02,  ...,  6.8665e-03,\n",
      "           -2.9297e-03,  1.0742e-02]],\n",
      "\n",
      "         [[ 2.4109e-03,  5.4321e-03,  1.1841e-02,  ...,  9.0332e-03,\n",
      "           -1.4114e-03, -2.3956e-03],\n",
      "          [ 8.6670e-03, -5.0049e-03,  4.7913e-03,  ...,  2.7657e-04,\n",
      "           -4.0894e-03, -4.2725e-04],\n",
      "          [ 4.7607e-03,  6.8283e-04, -7.9727e-04,  ...,  5.8289e-03,\n",
      "           -2.5940e-03, -1.9836e-03],\n",
      "          ...,\n",
      "          [-3.8605e-03, -7.0572e-04,  5.6458e-03,  ...,  1.8066e-02,\n",
      "            5.4932e-04, -4.8523e-03],\n",
      "          [ 3.4027e-03, -1.9226e-03, -1.4973e-04,  ..., -9.2697e-04,\n",
      "           -1.8463e-03, -1.8997e-03],\n",
      "          [ 4.1809e-03, -2.0142e-03,  5.4932e-03,  ...,  5.0049e-03,\n",
      "           -1.5106e-03,  5.8594e-03]],\n",
      "\n",
      "         ...,\n",
      "\n",
      "         [[-1.3000e-02,  3.5156e-02,  6.3782e-03,  ..., -1.5320e-02,\n",
      "            4.8828e-03,  1.6968e-02],\n",
      "          [ 1.4587e-02,  5.2795e-03, -1.6846e-02,  ...,  1.3550e-02,\n",
      "            7.9956e-03,  1.2146e-02],\n",
      "          [ 1.2939e-02,  1.8311e-02,  4.8523e-03,  ..., -2.2736e-03,\n",
      "            1.2390e-02,  6.5308e-03],\n",
      "          ...,\n",
      "          [-4.4441e-04,  5.9814e-03, -1.1841e-02,  ...,  2.4048e-02,\n",
      "            3.0975e-03,  9.3994e-03],\n",
      "          [ 1.2939e-02, -1.5488e-03,  5.9204e-03,  ...,  5.4550e-04,\n",
      "            5.9814e-03, -2.3041e-03],\n",
      "          [ 6.0425e-03,  3.4332e-03, -1.2512e-03,  ...,  4.5776e-03,\n",
      "           -7.6294e-04,  1.2589e-03]],\n",
      "\n",
      "         [[ 1.4465e-02, -7.0572e-04,  3.1494e-02,  ...,  3.6469e-03,\n",
      "           -5.9204e-03,  3.9062e-03],\n",
      "          [ 3.8574e-02, -2.4719e-03,  1.4038e-02,  ...,  4.6631e-02,\n",
      "           -1.0376e-02,  1.1475e-02],\n",
      "          [ 1.4709e-02,  1.5747e-02, -2.5368e-04,  ..., -2.5635e-03,\n",
      "            8.1787e-03,  1.5869e-02],\n",
      "          ...,\n",
      "          [ 1.1841e-02,  1.0742e-02,  8.1787e-03,  ...,  1.0742e-02,\n",
      "            8.6060e-03, -9.1171e-04],\n",
      "          [ 1.0967e-04, -1.2894e-03,  4.7493e-04,  ..., -1.5831e-04,\n",
      "           -7.4768e-04, -4.3869e-04],\n",
      "          [ 9.2773e-03,  4.9438e-03, -7.8964e-04,  ...,  7.3242e-03,\n",
      "           -7.9346e-04,  1.3275e-03]],\n",
      "\n",
      "         [[ 8.3008e-03,  1.1292e-02,  2.4780e-02,  ...,  2.9053e-02,\n",
      "            2.8198e-02,  1.7578e-02],\n",
      "          [ 1.4893e-02, -1.3184e-02, -5.7373e-03,  ...,  3.0029e-02,\n",
      "            1.0986e-02, -1.7090e-02],\n",
      "          [ 7.8125e-03,  6.4392e-03, -4.7112e-04,  ..., -7.2098e-04,\n",
      "            1.0681e-02, -1.2589e-03],\n",
      "          ...,\n",
      "          [ 2.4719e-03,  1.9989e-03, -5.5695e-04,  ...,  1.0559e-02,\n",
      "           -1.7624e-03,  1.4191e-03],\n",
      "          [ 4.1199e-03,  1.3062e-02,  7.8583e-04,  ...,  2.7618e-03,\n",
      "            5.8594e-03,  3.6621e-03],\n",
      "          [-6.1417e-04,  3.5858e-03,  7.7438e-04,  ..., -4.7684e-04,\n",
      "           -2.2984e-04, -5.7983e-04]]],\n",
      "\n",
      "\n",
      "        [[[-1.7738e-04,  1.3351e-04,  2.4605e-04,  ..., -1.6117e-04,\n",
      "           -1.7071e-04, -1.5831e-04],\n",
      "          [ 3.0398e-05,  3.8624e-05, -3.6806e-06,  ..., -6.8247e-06,\n",
      "           -7.3016e-06, -8.7023e-06],\n",
      "          [-8.6212e-04, -1.3828e-04,  8.1177e-03,  ...,  2.8076e-03,\n",
      "            3.1586e-03,  8.1635e-04],\n",
      "          ...,\n",
      "          [ 3.5400e-03,  3.2425e-04,  1.0071e-03,  ...,  1.7700e-03,\n",
      "            2.6245e-03, -6.1798e-04],\n",
      "          [-5.0659e-03,  1.2268e-02,  9.2773e-03,  ...,  8.6060e-03,\n",
      "            4.1504e-03, -5.0964e-03],\n",
      "          [-2.3499e-03, -2.3499e-03, -2.4414e-03,  ..., -1.2436e-03,\n",
      "           -8.7738e-05,  5.6839e-04]],\n",
      "\n",
      "         [[ 3.1281e-03,  2.2507e-04,  3.0365e-03,  ..., -1.3046e-03,\n",
      "           -1.3809e-03, -7.5531e-04],\n",
      "          [ 3.9368e-03,  3.4027e-03,  1.7624e-03,  ..., -1.5488e-03,\n",
      "           -1.6708e-03, -6.7520e-04],\n",
      "          [-7.0953e-04,  2.5392e-05,  4.8523e-03,  ...,  1.4648e-03,\n",
      "            1.0605e-03,  5.4550e-04],\n",
      "          ...,\n",
      "          [ 2.1057e-03,  7.5378e-03,  2.2292e-05,  ...,  1.0910e-03,\n",
      "            1.9379e-03, -1.2875e-04],\n",
      "          [-4.1389e-04, -4.6253e-05, -1.7643e-05,  ...,  2.8133e-05,\n",
      "           -2.2984e-04,  6.9141e-05],\n",
      "          [-1.0757e-03,  5.0735e-04,  1.4572e-03,  ...,  7.7057e-04,\n",
      "            1.3809e-03,  1.5030e-03]],\n",
      "\n",
      "         [[ 1.2634e-02, -4.1580e-04,  1.4420e-03,  ..., -8.0872e-04,\n",
      "           -8.7357e-04, -2.8534e-03],\n",
      "          [-4.8218e-03,  7.8583e-04, -3.1471e-04,  ..., -1.3275e-03,\n",
      "           -6.8970e-03,  2.2430e-03],\n",
      "          [-1.2054e-03,  1.3367e-02,  1.9043e-02,  ..., -1.3351e-03,\n",
      "           -1.5640e-03,  9.1553e-03],\n",
      "          ...,\n",
      "          [ 4.1504e-03,  1.6403e-03, -1.8120e-04,  ...,  4.0531e-05,\n",
      "            6.3477e-03,  1.2875e-04],\n",
      "          [ 2.3651e-04,  3.9339e-05,  2.1815e-05,  ..., -1.1146e-05,\n",
      "            1.3351e-04, -4.1008e-05],\n",
      "          [ 6.2866e-03, -2.0599e-03,  1.4587e-02,  ...,  1.1444e-03,\n",
      "           -1.1749e-03,  5.0049e-03]],\n",
      "\n",
      "         ...,\n",
      "\n",
      "         [[ 3.1128e-03,  2.3499e-03,  3.9062e-02,  ..., -1.3428e-02,\n",
      "            1.8677e-02,  4.1504e-03],\n",
      "          [ 1.8311e-02,  7.6294e-03, -1.1520e-03,  ..., -1.3733e-02,\n",
      "            6.8054e-03,  1.6235e-02],\n",
      "          [ 9.3994e-03, -3.2654e-03, -1.4267e-03,  ...,  2.9297e-02,\n",
      "            1.2573e-02, -3.2806e-03],\n",
      "          ...,\n",
      "          [ 1.1902e-02,  4.6387e-03,  4.1809e-03,  ...,  1.3275e-03,\n",
      "            2.9175e-02,  1.4465e-02],\n",
      "          [-9.3937e-05,  1.1253e-04, -1.0788e-05,  ..., -6.0558e-05,\n",
      "           -2.3127e-05, -9.2506e-05],\n",
      "          [-7.5912e-04,  7.3242e-03,  2.1172e-04,  ...,  4.8828e-04,\n",
      "            7.6294e-04, -1.7090e-03]],\n",
      "\n",
      "         [[ 2.6001e-02, -4.8828e-03,  3.4668e-02,  ...,  8.1787e-03,\n",
      "            2.2461e-02, -5.4932e-04],\n",
      "          [ 7.0190e-03,  5.2979e-02,  2.3438e-02,  ...,  3.8086e-02,\n",
      "            2.2705e-02,  3.7598e-02],\n",
      "          [ 1.1658e-02,  2.6001e-02, -1.5259e-03,  ...,  3.0518e-03,\n",
      "            1.2131e-03, -1.4267e-03],\n",
      "          ...,\n",
      "          [ 1.0132e-02,  1.6785e-03, -2.4986e-04,  ..., -4.0894e-03,\n",
      "            1.7822e-02,  6.8054e-03],\n",
      "          [-2.8229e-04,  3.4904e-04, -9.1171e-04,  ..., -1.9302e-03,\n",
      "           -1.6403e-03, -3.9978e-03],\n",
      "          [ 2.4915e-05,  6.2943e-05, -2.0027e-05,  ..., -1.3947e-05,\n",
      "            1.5616e-05, -1.0443e-04]],\n",
      "\n",
      "         [[-1.0071e-03,  8.2397e-03,  2.1240e-02,  ...,  1.6968e-02,\n",
      "            5.4199e-02,  2.4048e-02],\n",
      "          [-4.1199e-03,  2.5146e-02, -8.9722e-03,  ...,  1.4099e-02,\n",
      "            5.4016e-03,  1.0315e-02],\n",
      "          [ 8.8501e-03,  1.3794e-02, -2.9144e-03,  ...,  1.3428e-02,\n",
      "            6.3477e-03, -2.8610e-04],\n",
      "          ...,\n",
      "          [ 4.9744e-03,  2.3499e-03,  1.6174e-03,  ..., -2.7771e-03,\n",
      "            1.3245e-02,  7.9346e-03],\n",
      "          [ 2.5024e-03,  9.2030e-05,  7.3624e-04,  ..., -2.3193e-03,\n",
      "            1.6556e-03, -5.7983e-04],\n",
      "          [ 5.4626e-03,  6.1417e-04, -8.0490e-04,  ..., -7.6675e-04,\n",
      "            3.1586e-03,  3.6469e-03]]]], device='cuda:0', dtype=torch.bfloat16)\n",
      "o_rdn, shape: torch.Size([2, 16384, 32, 128])\n",
      "tensor([[[[ 2.5368e-04,  4.0283e-03,  4.0894e-03,  ...,  3.9368e-03,\n",
      "           -1.2779e-04, -9.8419e-04],\n",
      "          [ 9.4604e-04, -8.9645e-04,  1.7014e-03,  ..., -8.7357e-04,\n",
      "           -6.4468e-04, -4.9973e-04],\n",
      "          [ 2.5940e-03, -3.0136e-04,  4.7493e-04,  ...,  8.8882e-04,\n",
      "           -7.4387e-04, -1.3447e-04],\n",
      "          ...,\n",
      "          [-3.4714e-04, -1.6975e-04, -4.0293e-05,  ..., -2.8038e-04,\n",
      "           -5.0735e-04,  1.4877e-03],\n",
      "          [-2.2793e-04,  2.5787e-03, -1.5106e-03,  ...,  8.3542e-04,\n",
      "            2.4109e-03, -8.8501e-04],\n",
      "          [ 1.5080e-05,  1.8787e-04, -4.0817e-04,  ..., -6.7520e-04,\n",
      "            8.6308e-05, -2.8038e-04]],\n",
      "\n",
      "         [[ 5.5695e-04,  8.6670e-03,  1.1963e-02,  ...,  8.4839e-03,\n",
      "           -2.8229e-04, -2.1210e-03],\n",
      "          [ 4.9133e-03, -5.0049e-03,  8.8501e-03,  ..., -1.7548e-03,\n",
      "           -4.3335e-03, -3.2959e-03],\n",
      "          [ 1.3123e-02, -9.5367e-04,  1.0252e-04,  ...,  1.6708e-03,\n",
      "           -4.5776e-03, -1.2054e-03],\n",
      "          ...,\n",
      "          [-4.2114e-03, -1.3275e-03,  3.4027e-03,  ...,  5.7068e-03,\n",
      "           -4.1504e-03,  5.4016e-03],\n",
      "          [-1.9312e-05, -2.0218e-04,  3.4904e-04,  ...,  9.4223e-04,\n",
      "           -5.0735e-04, -5.2261e-04],\n",
      "          [ 4.1809e-03, -5.5847e-03,  1.2817e-02,  ...,  1.5076e-02,\n",
      "           -3.2501e-03,  1.1169e-02]],\n",
      "\n",
      "         [[ 2.4872e-03,  6.5918e-03,  1.3245e-02,  ...,  9.9487e-03,\n",
      "           -1.5564e-03, -2.6855e-03],\n",
      "          [ 3.9062e-03, -4.0588e-03,  6.7139e-03,  ...,  3.0518e-03,\n",
      "           -3.8910e-03, -2.8534e-03],\n",
      "          [ 9.0942e-03,  4.8399e-05, -6.1417e-04,  ...,  4.4556e-03,\n",
      "           -3.8300e-03, -1.8768e-03],\n",
      "          ...,\n",
      "          [-4.0894e-03, -6.9427e-04,  6.3782e-03,  ...,  1.2878e-02,\n",
      "           -2.3804e-03, -4.7302e-03],\n",
      "          [ 3.1433e-03, -1.7853e-03, -1.4019e-04,  ..., -8.8501e-04,\n",
      "           -1.7014e-03, -1.7395e-03],\n",
      "          [ 5.0659e-03, -3.4943e-03,  8.4229e-03,  ...,  7.7820e-03,\n",
      "           -2.2888e-03,  8.2397e-03]],\n",
      "\n",
      "         ...,\n",
      "\n",
      "         [[-1.9409e-02,  5.9082e-02, -7.5150e-04,  ..., -3.2227e-02,\n",
      "           -6.0425e-03,  1.6724e-02],\n",
      "          [ 8.3008e-03,  4.9133e-03, -2.4414e-02,  ...,  9.4604e-03,\n",
      "            1.0620e-02,  1.6846e-02],\n",
      "          [ 1.3550e-02,  2.0508e-02,  5.4016e-03,  ..., -6.0120e-03,\n",
      "            1.1719e-02,  7.0190e-03],\n",
      "          ...,\n",
      "          [ 3.4485e-03,  2.0294e-03, -7.3547e-03,  ...,  5.2490e-03,\n",
      "            1.0681e-02,  8.4229e-03],\n",
      "          [ 1.3504e-03, -1.8978e-04,  6.8665e-04,  ..., -3.3379e-05,\n",
      "            7.6294e-04, -3.6049e-04],\n",
      "          [ 1.5411e-03,  5.4169e-04, -3.9577e-05,  ...,  5.0354e-04,\n",
      "           -2.7657e-04,  6.2180e-04]],\n",
      "\n",
      "         [[ 1.6479e-02, -7.6599e-03,  3.4668e-02,  ...,  3.2959e-03,\n",
      "           -1.9165e-02, -1.0864e-02],\n",
      "          [ 4.5654e-02,  3.6812e-04,  1.4343e-02,  ...,  5.6396e-02,\n",
      "           -1.2451e-02,  1.5503e-02],\n",
      "          [ 1.6479e-02,  1.7700e-02, -1.4801e-03,  ..., -2.9755e-03,\n",
      "            8.3618e-03,  1.7456e-02],\n",
      "          ...,\n",
      "          [ 1.0437e-02,  1.3733e-02,  1.0010e-02,  ...,  1.6357e-02,\n",
      "            6.1340e-03, -3.5706e-03],\n",
      "          [ 5.6839e-04, -6.6757e-04,  5.0735e-04,  ..., -6.4373e-05,\n",
      "           -1.2779e-04, -3.3760e-04],\n",
      "          [ 1.0986e-02,  6.3477e-03, -1.6403e-03,  ...,  8.9722e-03,\n",
      "           -9.3842e-04,  1.5640e-03]],\n",
      "\n",
      "         [[ 8.6060e-03,  1.4343e-02,  2.3193e-02,  ...,  3.2471e-02,\n",
      "            1.6479e-02,  1.7334e-02],\n",
      "          [ 5.3711e-03, -1.4954e-02, -8.6670e-03,  ...,  3.9062e-02,\n",
      "           -4.3640e-03, -2.0874e-02],\n",
      "          [ 1.0681e-02,  9.8877e-03,  2.5749e-05,  ..., -3.0212e-03,\n",
      "            1.2939e-02, -3.5095e-03],\n",
      "          ...,\n",
      "          [ 3.4790e-03,  3.1433e-03, -1.1902e-03,  ...,  1.2695e-02,\n",
      "           -2.0752e-03,  1.9684e-03],\n",
      "          [ 4.6692e-03,  1.1108e-02,  1.1492e-04,  ...,  2.4719e-03,\n",
      "            7.2021e-03,  3.4790e-03],\n",
      "          [-4.1771e-04,  1.6861e-03,  5.2261e-04,  ..., -3.0518e-04,\n",
      "           -1.7357e-04, -4.1580e-04]]],\n",
      "\n",
      "\n",
      "        [[[-2.0695e-04,  1.5640e-04,  2.8801e-04,  ..., -1.9073e-04,\n",
      "           -2.0123e-04, -1.8501e-04],\n",
      "          [ 6.7055e-06,  6.9737e-06, -8.1584e-07,  ..., -1.5125e-06,\n",
      "           -1.6093e-06, -1.9372e-06],\n",
      "          [-2.4719e-03, -3.9291e-04,  1.5625e-02,  ...,  8.0566e-03,\n",
      "            9.0332e-03,  2.3651e-03],\n",
      "          ...,\n",
      "          [ 9.9182e-04,  1.1683e-04,  3.6240e-04,  ...,  6.3705e-04,\n",
      "            9.4986e-04, -2.2507e-04],\n",
      "          [-6.2561e-04,  1.5182e-03,  1.1368e-03,  ...,  1.0605e-03,\n",
      "            5.1117e-04, -6.2943e-04],\n",
      "          [-2.8381e-03, -2.8381e-03, -2.9449e-03,  ..., -1.4954e-03,\n",
      "           -1.0538e-04,  6.8283e-04]],\n",
      "\n",
      "         [[ 6.2561e-04,  1.0223e-03,  5.3406e-03,  ..., -2.6093e-03,\n",
      "           -2.7466e-03, -1.8539e-03],\n",
      "          [ 3.5095e-03,  4.0283e-03, -2.8372e-05,  ..., -9.0408e-04,\n",
      "           -9.6893e-04, -9.1934e-04],\n",
      "          [-1.3809e-03, -1.9550e-05,  8.6060e-03,  ...,  3.2806e-03,\n",
      "            2.8381e-03,  1.1292e-03],\n",
      "          ...,\n",
      "          [ 2.9297e-03,  1.4832e-02, -2.9373e-04,  ...,  1.9684e-03,\n",
      "            3.6774e-03, -8.4877e-05],\n",
      "          [-1.8787e-04,  1.6809e-05,  2.0027e-05,  ...,  3.6955e-05,\n",
      "           -8.4877e-05,  1.3947e-05],\n",
      "          [-4.0436e-04, -1.6785e-04, -3.4809e-05,  ..., -1.2815e-05,\n",
      "            1.9741e-04,  2.8610e-04]],\n",
      "\n",
      "         [[ 6.7749e-03,  1.0443e-04,  5.3406e-03,  ..., -2.2125e-03,\n",
      "           -2.3499e-03, -1.5564e-03],\n",
      "          [ 1.6937e-03,  2.0790e-04,  2.9144e-03,  ..., -1.4343e-03,\n",
      "           -1.8158e-03,  1.9550e-04],\n",
      "          [-2.8076e-03,  8.6060e-03,  2.4170e-02,  ...,  3.6926e-03,\n",
      "            2.8381e-03,  7.9956e-03],\n",
      "          ...,\n",
      "          [ 3.1891e-03,  3.6163e-03, -2.0027e-04,  ...,  3.2234e-04,\n",
      "            3.1891e-03,  9.4891e-05],\n",
      "          [ 3.2234e-04,  3.8147e-05,  1.6332e-05,  ..., -2.2531e-05,\n",
      "            1.7834e-04, -5.2452e-05],\n",
      "          [ 4.6082e-03, -1.4267e-03,  1.0803e-02,  ...,  1.3199e-03,\n",
      "           -2.4223e-04,  4.8523e-03]],\n",
      "\n",
      "         ...,\n",
      "\n",
      "         [[ 3.1433e-03, -2.1362e-03,  4.4678e-02,  ..., -3.1128e-02,\n",
      "            9.2163e-03,  1.0742e-02],\n",
      "          [ 2.9297e-02, -3.2654e-03, -1.1719e-02,  ..., -9.9487e-03,\n",
      "            7.1411e-03,  2.5024e-02],\n",
      "          [ 1.0559e-02, -1.9684e-03,  9.2316e-04,  ...,  2.5757e-02,\n",
      "            7.3853e-03, -1.6022e-03],\n",
      "          ...,\n",
      "          [ 1.4465e-02,  5.1575e-03,  6.7444e-03,  ...,  3.2349e-03,\n",
      "            3.0640e-02,  2.1484e-02],\n",
      "          [-1.2589e-04,  1.3828e-04, -2.8610e-05,  ..., -8.1062e-05,\n",
      "           -2.4796e-05, -1.6022e-04],\n",
      "          [-1.6174e-03,  1.7944e-02,  1.3885e-03,  ...,  3.0975e-03,\n",
      "            3.0212e-03, -4.4861e-03]],\n",
      "\n",
      "         [[ 2.5757e-02, -9.8877e-03,  4.7119e-02,  ..., -5.6458e-03,\n",
      "            2.3071e-02, -1.1673e-03],\n",
      "          [ 1.5198e-02,  6.6895e-02,  2.6703e-03,  ...,  4.7363e-02,\n",
      "            2.7710e-02,  3.1250e-02],\n",
      "          [ 1.3733e-02,  1.2939e-02, -1.1292e-03,  ...,  2.7313e-03,\n",
      "            5.8594e-03, -3.2501e-03],\n",
      "          ...,\n",
      "          [ 1.4709e-02,  2.5940e-03, -2.2125e-03,  ..., -7.7515e-03,\n",
      "            1.9775e-02,  8.1177e-03],\n",
      "          [-4.7684e-04,  6.8283e-04, -4.1962e-04,  ..., -1.2970e-03,\n",
      "           -1.0757e-03, -1.1215e-03],\n",
      "          [ 3.3140e-05,  1.1826e-04, -4.5776e-05,  ..., -4.8637e-05,\n",
      "            8.7261e-05, -5.2214e-05]],\n",
      "\n",
      "         [[-4.9744e-03,  9.9487e-03,  1.9165e-02,  ...,  1.7090e-02,\n",
      "            6.2256e-02,  2.6978e-02],\n",
      "          [-1.3580e-03,  2.1240e-02, -1.4343e-02,  ...,  1.4465e-02,\n",
      "            6.4087e-03,  1.0620e-02],\n",
      "          [ 9.3384e-03,  1.6235e-02, -4.9438e-03,  ...,  1.8555e-02,\n",
      "            5.9509e-03, -7.0572e-04],\n",
      "          ...,\n",
      "          [ 7.4768e-03,  1.7166e-03,  1.0452e-03,  ..., -4.3945e-03,\n",
      "            1.5503e-02,  1.0864e-02],\n",
      "          [ 1.5640e-03,  2.4605e-04,  3.3140e-05,  ..., -2.5787e-03,\n",
      "            3.0327e-04, -1.9455e-03],\n",
      "          [ 1.6937e-03,  4.4823e-04, -4.5586e-04,  ..., -3.8528e-04,\n",
      "            1.5564e-03,  1.2360e-03]]]], device='cuda:0', dtype=torch.bfloat16)\n"
     ]
    }
   ],
   "source": [
    "# Example usage of Residual Delta Net\n",
    "# Running this cell takes a while because Triton autotunes the kernels.\n",
    "\n",
    "\n",
    "from residual_linear_attention import gdn_prefill, rdn_prefill\n",
    "\n",
    "# Ask Triton to print the autotuning result for each kernel.\n",
    "os.environ[\"TRITON_PRINT_AUTOTUNING\"] = \"1\"\n",
    "\n",
    "# Positional arguments: batch_size=2, seq_len=16384, num_heads=32, head_dim=128.\n",
    "q, k, v, alpha, beta, gamma = generate_example_data(2, 16384, 32, 128, torch.bfloat16)\n",
    "\n",
    "# Both calls return extra values besides the output tensor. Bind them to explicit\n",
    "# names rather than `_`: assigning `_` in a notebook overrides IPython's\n",
    "# last-output variable.\n",
    "o_gdn, gdn_unused = gdn_prefill(q, k, v, alpha, beta)\n",
    "o_rdn, rdn_unused_a, rdn_unused_b = rdn_prefill(q, k, v, alpha, beta, gamma)\n",
    "# Only the outputs are inspected below, so release the extra return values.\n",
    "del gdn_unused, rdn_unused_a, rdn_unused_b\n",
    "\n",
    "print(f\"o_gdn, shape: {o_gdn.shape}\")\n",
    "print(o_gdn)\n",
    "\n",
    "print(f\"o_rdn, shape: {o_rdn.shape}\")\n",
    "print(o_rdn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn Triton's autotuning printout back off for the cells below.\n",
    "os.environ[\"TRITON_PRINT_AUTOTUNING\"] = \"0\"\n",
    "\n",
    "# Test configuration.\n",
    "# Sequence lengths: powers of two from 2**10 (1024) through 2**17 (131072).\n",
    "seq_len_for_test = [2**exponent for exponent in range(10, 18)]\n",
    "batch_size = 1\n",
    "num_heads = 32\n",
    "head_dim = 128\n",
    "dtype = torch.bfloat16\n",
    "\n",
    "# Flash attention serves as the baseline.\n",
    "from flash_attn import flash_attn_func"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAGxCAYAAACEFXd4AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjUsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvWftoOwAAAAlwSFlzAAAPYQAAD2EBqD+naQAAW5JJREFUeJzt3Xd4FOX+/vH37ia76QkEkhBJIr1IKFJCxC5SRI8KNgRBD+oBQUUUEY8NCyAee8EWwPNTrMfyFSsiAZEAgiBVBERBIQGBZEkgdef3x5iFpYaQZDab+3Vde2XKs7ufeZTMnSnP2AzDMBAREREJUHarCxARERGpTgo7IiIiEtAUdkRERCSgKeyIiIhIQFPYERERkYCmsCMiIiIBTWFHREREAprCjoiIiAS0IKsL8Acej4dt27YRGRmJzWazuhwRERGpAMMw2Lt3L4mJidjtRz9+o7ADbNu2jaSkJKvLEBERkUrYunUrjRs3Pup6hR0gMjISMDsrKirK4mpERESkItxuN0lJSd79+NEo7ID31FVUVJTCjoiISC1zvEtQdIGyiIiIBDSFHREREQloCjsiIiIS0HTNTgV5PB6Ki4utLqPOcDqdx7yNUEREpKIUdiqguLiYzZs34/F4rC6lzrDb7TRp0gSn02l1KSIiUssp7ByHYRhs374dh8NBUlKSjjbUgPJBHrdv305ycrIGehQRkZOisHMcpaWl7Nu3j8TERMLCwqwup85o2LAh27Zto7S0lODgYKvLERGRWkyHKY6jrKwMQKdTalh5f5f3v4iISGUp7FSQTqXULPW3iIhUFYUdERERCWgKOyIiIhLQFHYC1PXXX4/NZjvstXHjRq6//nouu+yy437GH3/8gdPppF27dkdcP2/ePM4//3zq169PWFgYLVq0YOjQod7xiDIzM7HZbOTm5h723lNPPZVnnnnmJLZQRESkYvwm7EyePBmbzcbo0aO9ywoLCxk5ciSxsbFEREQwYMAAcnJyfN63ZcsW+vXrR1hYGHFxcYwdO5bS0tIart4/9enTh+3bt/u8mjRpUuH3z5gxg6uuugq3283ixYt91q1du5Y+ffrQpUsX5s+fz6pVq3j++edxOp26qFhERA7YvQy2fWlpCX5x6/kPP/zAK6+8Qvv27X2W33HHHXz22We8//77REdHM2rUKPr378/3338PmHfq9OvXj4SEBBYuXMj27dsZMmQIwcHBTJw40YpN8Ssul4uEhIRKvdcwDKZPn85LL71E48aNycjIIC0tzbv+66+/JiEhgSlTpniXNWvWjD59+px03SIiEiD2boLMi6BoF5z9f3DKRZaUYfmRnfz8fAYNGsRrr71GvXr1vMvz8vLIyMjgqaee4vzzz6dz585Mnz6dhQsXsmjRIsDc4a5du5Y333yTjh070rdvXx555BFefPHF6nu0g2FAaYE1L8Oonm06grlz57Jv3z569uzJ4MGDeeeddygoKPCuT0hIYPv27cyfP7/GahIRkVqkcAfM7WP+jGwJ9TpaVorlR3ZGjhxJv3796NmzJ48++qh3+bJlyygpKaFnz57eZa1btyY5OZmsrCy6d+9OVlYWqampxMfHe9v07t2bESNGsGbNGjp16nTE7ywqKqKoqMg773a7K15w2T54L+IEtrAKXZUPQeEVbj5r1iwiIg7U2rdvX95///0KvTcjI4NrrrkGh8NBu3btaNq0Ke+//z7XX389AFdeeSVfffUV55xzDgkJCXTv3p0LLriAIUOGEBUV5fNZjRs3Puzz9+3bV+HtEBGRWqYkHzIvhvyNEHoKdJ8GYYmWlWNp2HnnnXf48ccf+eGHHw5bl52djdPpJCYmxmd5fHw82dnZ3jYHB53y9eXrjmbSpElMmDDhJKv3f+eddx5Tp071zoeHVywo5ebm8uGHH7JgwQLvssGDB5ORkeENOw6Hg+nTp/Poo4/y7bffsnjxYiZOnMjjjz/OkiVLaNSokfe9
3333HZGRkT7fce6551Z+w0RExH95SmDBVbD7BwiOgbTXoUF3S0uyLOxs3bqV22+/ndmzZxMSElKj3z1+/HjGjBnjnXe73SQlJVXszY4w8wiLFRwn9riK8PBwmjdvfsJfM3PmTAoLC32u0TEMA4/Hwy+//ELLli29y0855RSuu+46rrvuOh555BFatmzJyy+/7BMmmzRpclhoDQqy/KCiiIhUNcOAJcNh+xdgD4Fur0Cj3lZXZV3YWbZsGTt27OD000/3LisrK2P+/Pm88MILfPXVVxQXF5Obm+uzo8zJyfFedJuQkMCSJUt8Prf8bq1jXZjrcrlwuVyVK9xmO6FTSbVRRkYGd955p/coTrlbbrmFadOmMXny5CO+r169ejRq1Mjn2h4REalDVj4Av04D7ND5aUi+wtxvWsyysHPBBRewatUqn2U33HADrVu3Zty4cSQlJREcHMycOXMYMGAAAOvXr2fLli2kp6cDkJ6ezmOPPcaOHTuIi4sDYPbs2URFRdG2bdua3aBaJi8vjxUrVvgsi42NZdeuXfz444+89dZbtG7d2mf9wIEDefjhh3n00UfJyMhgxYoVXH755TRr1ozCwkL++9//smbNGp5//vka3BIREfELG16GNX9fe9v+YWh2I9gsvw8KsDDsREZGHjZYXXh4OLGxsd7lw4YNY8yYMdSvX5+oqChuvfVW0tPT6d7dPPfXq1cv2rZty3XXXceUKVPIzs7mvvvuY+TIkZU/clNHZGZmHnYB97BhwwgNDaVt27aHBR2Ayy+/nFGjRvH555/TrVs3FixYwPDhw9m2bRsRERGcdtppfPzxx5xzzjk1tRkiIuIPtn4MS0ea0y1vhTZ3gt1/LlewGUYN3s98HOeeey4dO3b0jqxbWFjInXfeydtvv01RURG9e/fmpZde8jlF9fvvvzNixAgyMzMJDw9n6NChTJ48+YSuCXG73URHR5OXl3fYnUSFhYVs3ryZJk2a1Pi1RXWZ+l1EpJbY+T182xPKCiH5Kuj2Ojgjj/++KnCs/ffB/CrsWEVhx/+o30VEaoG8tTD7TCjeA/HnQY93IaRhjX19RcOOf5xMExERkdpl35/moIHFeyCmo3lEpwaDzolQ2BEREZETU5wLmX1h31YIP9UcNDCyqdVVHZXCjoiIiFRcWRHMvxxyV4GrIaRlQP0jP7HAXyjsiIiISMUYHsgaAjsyzTHnur0GCedbXdVxKeyIiIjI8RkG/HgnbHkPbMHQ5UVo/A+rq6oQhR0RERE5vp+fhPXPmNMdJ8Opg/1idOSKUNgRERGRY9v8Fiwfa063GQctR4HdYW1NJ0BhR0RERI4u+xtYfIM53eR6SH0AHE5LSzpRCjt1lNvt5v777+e0004jNDSU2NhYunbtypQpU9izZ4+33bnnnsvo0aOP+3lZWVk4HA769etXjVWLiEiN2r3cvPPKUwKJ/eD0pyAozOqqTpjCTh20e/duunfvzvTp07nrrrtYvHgxP/74I4899hjLly9n5syZJ/yZGRkZ3HrrrcyfP59t27ZVQ9UiIlKj8jebY+mU5kNsGnR9GVz1rK6qUvznKV1S5T744AMmTJjAxo0bCQsLo1OnTnzyySfce++9bNmyhV9++YXExERv+5SUFHr16sWJPkEkPz+fd999l6VLl5Kdnc2MGTO49957q3pzRESkphT+BXN7Q2EORLaC7tMhvLHVVVWajuwEqO3btzNw4ED++c9/sm7dOjIzM+nfvz9lZWW8++67DB482CfoHMx2glfXv/fee7Ru3ZpWrVoxePBgpk2bdsKBSURE/ERpAcy7GPZugNBEc3Tk6DZWV3VSdGTnBBkG7NtnzXeHhVX8Lr/t27dTWlpK//79SUlJASA1NZWcnBxyc3Np1aqVT/vOnTuzfv16AC655BLefvvtCteVkZHB4MGDAejTpw95eXnMmzePc889t8KfISIifsBTCguugV2LITjafN5VwzOsruqkKeycoH37ICLCmu/O
z4fw8Iq17dChAxdccAGpqan07t2bXr16ccUVVxy1/UcffURxcTHjxo1j//79Fa5p/fr1LFmyhI8++giAoKAgrr76ajIyMhR2RERqE8OAH4bDtllgd5nX6CT2sbqqKqHTWAHK4XAwe/ZsvvjiC9q2bcvzzz9Pq1atyM/PJyYmxnsUp1xycjLNmzcnMjLyhL4nIyOD0tJSEhMTCQoKIigoiKlTp/K///2PvLy8qtwkERGpTqsmwKYMwG7edZVyVa0ZNPB4dGTnBIWFmUdYrPruE2Gz2ejRowc9evTggQceICUlhU8++YSrrrqKN998kwceeOCo1+1URGlpKf/973958skn6dWrl8+6yy67jLfffpvhw4dX+vNFRKSGbHwVVk8wp1MfhOY3gS1wjoco7Jwgm63ip5KstHjxYubMmUOvXr2Ii4tj8eLF7Ny5kzZt2jB06FAyMzPp1q0bDz/8MF26dCE8PJyVK1eSlZVFu3btfD5r586drFixwmdZo0aNyMrKYs+ePQwbNozo6Gif9QMGDCAjI0NhR0TE3/3xf/DDCHO6xS3Q9m6wB1tbUxVT2AlQUVFRzJ8/n2eeeQa3201KSgpPPvkkffv2BWDJkiU8/vjjPPHEE2zevBm73U6LFi24+uqrDxtEcObMmYeNvfPII4+wePFievbseVjQATPsTJkyhZUrV9K+fftq204RETkJO7Pg+2vMp5knXQEdJoEjxOqqqpzN0D3CuN1uoqOjycvLIyoqymddYWEhmzdvpkmTJoSEBN7/AP5K/S4iUs3yfobZPaB4N8SdCz3ehdA4q6s6Icfafx8scE7IiYiISMXs2waZfcygE9Me0jJqXdA5EQo7IiIidUlxHmReBAW/Q/ip5ujIkU2trqpaKeyIiIjUFWVF8F1/yP0JXA3MQQPrn251VdVOYUdERKQuMDyw6HrI+RYcYdDtVWh0gdVV1QiFHRERkbpg+Vj4/R2wBUGXF6DxZVZXVGMUdkRERALduqfg56fM6Q4TocmQgBkduSIUdkRERALZb+/A8jvN6dZ3QavbwO6wtqYaprAjIiISqLK/hUVDzOlTr4PUh8DhsrQkKyjsiIiIBKI9P8H8y8BTAo36QOdnIbgWPO+oGijsiIiIBJr83yCzL5TuhfrdzDuvXPWsrsoyCjsB6vrrr8dms2Gz2QgODqZJkybcfffdFBYWetvYbDY+/vjj437WpEmTcDgcPPHEE9VYsYiIVImiXeboyPu3Q2RL6D4NwpOsrspSCjsBrE+fPmzfvp1ff/2Vp59+mldeeYUHH3zwhD9n2rRp3H333UybNq0aqhQRkSpTug/mXQLu9RDSyHwMRMxpVldlOUvDztSpU2nfvj1RUVFERUWRnp7OF1984V1/7rnneo9OlL+GDx/u8xlbtmyhX79+hIWFERcXx9ixYyktLa3pTfFLLpeLhIQEkpKSuOyyy+jZsyezZ88+oc+YN28e+/fv5+GHH8btdrNw4cJqqlZERE6KpxS+Hwh/ZUFwFKS9DnFnWl2VXwiy8ssbN27M5MmTadGiBYZh8MYbb3DppZeyfPlyTjvNTKI33XQTDz/8sPc9YWFh3umysjL69etHQkICCxcuZPv27QwZMoTg4GAmTpxY49vjz1avXs3ChQtJSUk5ofdlZGQwcOBAgoODGThwIBkZGZxxxhnVVKWIiFSKYcDSkfDn/4HdCV2nQmJfq6vyG5aGnUsuucRn/rHHHmPq1KksWrTIG3bCwsJISEg44vu//vpr1q5dyzfffEN8fDwdO3bkkUceYdy4cTz00EM4nc4qr9kwDPaV7Kvyz62IsOAwbCcwCNSsWbOIiIigtLSUoqIi7HY7L7zwQoXf73a7+eCDD8jKygJg8ODBnHXWWTz77LNERESccP0iIlJNVj8CG18FbNDpSUi5pk4NGng8loadg5WVlfH+++9TUFBAenq6d/lbb73Fm2++SUJCApdccgn333+/9+hOVlYWqampxMfHe9v37t2b
ESNGsGbNGjp16nTE7yoqKqKoqMg773a7K1znvpJ9REyyZkefPz6fcGfFbxs877zzmDp1KgUFBTz99NMEBQUxYMCACr//7bffplmzZnTo0AGAjh07kpKSwrvvvsuwYcNOuH4REakGmzJg1d/XY7a7H1r8C2y6JPdgloedVatWkZ6eTmFhIREREXz00Ue0bdsWgGuvvZaUlBQSExNZuXIl48aNY/369Xz44YcAZGdn+wQdwDufnZ191O+cNGkSEyZMqKYt8h/h4eE0b94cMC8y7tChAxkZGRUOKhkZGaxZs4agoAP/m3g8HqZNm6awIyLiD/78DJb8y5xuPhza3gP2YGtr8kOWh51WrVqxYsUK8vLy+OCDDxg6dCjz5s2jbdu23Hzzzd52qampNGrUiAsuuIBNmzbRrFmzSn/n+PHjGTNmjHfe7XaTlFSx2/LCgsPIH59f6e8+GWHBYcdvdBR2u517772XMWPGcO211xIaGnrM9qtWrWLp0qVkZmZSv3597/Ldu3dz7rnn8vPPP9O6detK1yMiIifpr8Ww4EowyqBxf+j4OAQd+3d7XWV52HE6nd6jD507d+aHH37g2Wef5ZVXXjmsbVpaGgAbN26kWbNmJCQksGTJEp82OTk5AEe9zgfMu5RcrsoNl22z2U7oVJI/ufLKKxk7diwvvvgid911FwCbN29mxYoVPu1atGhBRkYG3bp14+yzzz7sc7p27UpGRobG3RERsYr7F5jXD8r2Q8OzoeuL4Iyyuiq/5Xcn9Twej8/1NAcr3yk3atQIgPT0dFatWsWOHTu8bWbPnk1UVJT3VJgcEBQUxKhRo5gyZQoFBQUAjBkzhk6dOvm8li1bxptvvnnU63sGDBjAf//7X0pKSmqyfBERAdifDXN7m4MHRreD7hkQevQ/8AVshmEYVn35+PHj6du3L8nJyezdu5eZM2fy+OOP89VXX9G0aVNmzpzJRRddRGxsLCtXruSOO+6gcePGzJs3DzAvau7YsSOJiYlMmTKF7OxsrrvuOm688cYTuvXc7XYTHR1NXl4eUVG+ybiwsJDNmzfTpEkTQkJCqnT75ejU7yIiR1Dihm/OhT3LISwFzvoAYrtYXZVljrX/Ppilp7F27NjBkCFD2L59O9HR0bRv356vvvqKCy+8kK1bt/LNN9/wzDPPUFBQQFJSEgMGDOC+++7zvt/hcDBr1ixGjBhBeno64eHhDB061GdcHhERkYBQVgzfDTCDjjPWHDSwDgedE2HpkR1/oSM7/kf9LiJyEMMDWUPgt7fAEQbp/4Xkig8lEqgqemTH767ZERERkUOsuMcMOrYg6PwsJPW3uqJaRWFHRETEn/38LKz7++7X9o9C0+s1OvIJUtgRERHxV7+/Bz/eYU63HgOtR4Pd8lFjah2FnQrSpU01S/0tInVeTiZkXQcYkDIIUieAo3JjxNV1CjvH4XA4ACguLra4krqlvL/L+19EpE7JXQXzLwNPMST0hi7PQrAewFxZOhZ2HEFBQYSFhbFz506Cg4Ox25UPq5vH42Hnzp2EhYX5PJdLRKROKNgCc/tASR7U7wJpr4Ir1uqqajXtSY7DZrPRqFEjNm/ezO+//251OXWG3W4nOTkZmy7CE5G6pGi3GXT2b4PIFpA2DcKTra6q1lPYqQCn00mLFi10KqsGOZ1OHUUTkbqldD/M/we410FIPHR7HeqlWl1VQFDYqSC73a7B7UREpHp4ymDhtbDzewiOMoNO/OEPYpbK0Z/OIiIiVjIMWHYr/PEx2IOhy0twSj+rqwooCjsiIiJWWjMRNkwFbNDxP5ByjQYNrGIKOyIiIlbZNB1W/v2A69P+DS1HgF1DblQ1hR0RERErbPsCltxkTje7CU671zyNJVVOYUdERKSm7foBvrsCjDI45VLo9AQEhVpdVcBS2BEREalJezdCZj8o2wcNz4RuL4Mz2uqqAprCjoiISE3ZnwNze0PRTog+DbplQGiC1VUFPIUdERGRmlCSD/P6
Qf6vEJZkjo4c3dLqquoEhR0REZHq5imBBVfA7mXgrGcOGtigm9VV1RkKOyIiItXJMGDxjbD9K3CEQtdXIbGX1VXVKQo7IiIi1emne2Hzf8HmgM7PQnJ/qyuqcxR2REREqsv6F2DtZHO6/SPQ9Aawaddb09TjIiIi1WHLB7DsNnO61e3QegzY9fxtKyjsiIiIVLUd82HhYMAwn3XV/lFwuKyuqs5S2BEREalKuWtg3qXgKYL4ntDleQiOsLqqOk1hR0REpKoUbIXMPlCSC/VOh7TXwNXA6qrqPIUdERGRqlC8BzL7wr4/IKIZdJ8OEadaXZWgsCMiInLyygph/mWQtwZccZD2OtRrb3VV8jeFHRERkZPhKTMvRt4xH4IizKATf67VVclBFHZEREQqyzDgx9Gw9X9gD4YuL8EpF1tdlRxCYUdERKSy1j4Ov7wA2KDjFDj1WrDZrK5KDmFp2Jk6dSrt27cnKiqKqKgo0tPT+eKLL7zrCwsLGTlyJLGxsURERDBgwABycnJ8PmPLli3069ePsLAw4uLiGDt2LKWlpTW9KSIiUtf8+l/4abw53XY8tLgF7A5ra5IjsjTsNG7cmMmTJ7Ns2TKWLl3K+eefz6WXXsqaNWsAuOOOO/j00095//33mTdvHtu2baN//wPPFCkrK6Nfv34UFxezcOFC3njjDWbMmMEDDzxg1SaJiEhdsO1LWDzMnG76T2h3Hzic1tYkR2UzDMOwuoiD1a9fnyeeeIIrrriChg0bMnPmTK644goAfv75Z9q0aUNWVhbdu3fniy++4OKLL2bbtm3Ex8cD8PLLLzNu3Dh27tyJ01mx//HcbjfR0dHk5eURFRVVbdsmIiIBYPcy+OYcKC2AUy6B9P+CM8bqquqkiu6//eaanbKyMt555x0KCgpIT09n2bJllJSU0LNnT2+b1q1bk5ycTFZWFgBZWVmkpqZ6gw5A7969cbvd3qNDIiIiVWbvJsi8yAw6Dc6Ari8r6NQClj+RbNWqVaSnp1NYWEhERAQfffQRbdu2ZcWKFTidTmJiYnzax8fHk52dDUB2drZP0ClfX77uaIqKiigqKvLOu93uKtoaEREJWIU7YG4f82dUG0ibBmGJVlclFWD5kZ1WrVqxYsUKFi9ezIgRIxg6dChr166t1u+cNGkS0dHR3ldSUlK1fp+IiNRyJfmQeTHkb4TQU8zRkaNbWV2VVJDlYcfpdNK8eXM6d+7MpEmT6NChA88++ywJCQkUFxeTm5vr0z4nJ4eEhAQAEhISDrs7q3y+vM2RjB8/nry8PO9r69atVbtRIiISODwlsOAq2P0DBMdAWgY0SLO6KjkBloedQ3k8HoqKiujcuTPBwcHMmTPHu279+vVs2bKF9PR0ANLT01m1ahU7duzwtpk9ezZRUVG0bdv2qN/hcrm8t7uXv0RERA5jGLDkZtj+BdhDoNur0KiX1VXJCbL0mp3x48fTt29fkpOT2bt3LzNnziQzM5OvvvqK6Ohohg0bxpgxY6hfvz5RUVHceuutpKen0717dwB69epF27Ztue6665gyZQrZ2dncd999jBw5EpfLZeWmiYhIIFh5P/w6A2wO6PwMJA/QoIG1kKVhZ8eOHQwZMoTt27cTHR1N+/bt+eqrr7jwwgsBePrpp7Hb7QwYMICioiJ69+7NSy+95H2/w+Fg1qxZjBgxgvT0dMLDwxk6dCgPP/ywVZskIiKBYsNUWPOYOZ06AZoNA5vfnRCRCvC7cXasoHF2RETEx9aP4LsBgAEtb4VOU8ARYnVVcohaN86OiIiIX9ixAL4fCBiQfBW0f0xBp5ZT2BERESmXtxbmXQKeIog/H7q8AM5Iq6uSk6SwIyIiArDvT3PQwJJcqNcJur0OIQ2trkqqgMKOiIhIcS5k9oV9WyG8CXSfBpFNrK5KqojCjoiI1G1lRTD/cshdBa6GkPY61OtodVVShRR2RESk7jI8kDUEdmRCUDh0ew0Szre6KqliCjsiIlI3GQb8
OAa2vAe2YOjyIjT+h9VVSTVQ2BERkbpp3X9g/bPmdMfH4dTBGh05QCnsiIhI3bP5TVhxtznd9h5oORLsDmtrkmqjsCMiInXL9tmw6AZzuukN0O5+cDitrUmqlcKOiIjUHbt/hO/6g1EKif2g05MQFGZ1VVLNFHZERKRuyP8VMi+C0nyITYOur4CrntVVSQ1Q2BERkcBX+Jc5OnJhDkS2gu7TIfwUq6uSGqKwIyIiga20AOZdDHs3QOgp5ujI0W2srkpqkMKOiIgELk8pLLgGdi2G4BhIew0anmF1VVLDFHZERCQwGQb8MBy2zQK7C7pOhUZ9rK5KLKCwIyIigWnVQ7ApA7DD6U9DylUaNLCOUtgREZHAs+EVWP2wOZ36EDS/EWza5dVV+i8vIiKB5Y9PYOkt5nSLkdB2LNiDra1JLKWwIyIigWPnQvj+GvNp5klXQoeJ4AixuiqxmMKOiIgEhryfYd4lUFYIcedClxfAGWV1VeIHFHZERKT227cN5vaG4t0Q0x7SMiA0zuqqxE8o7IiISO1WnAeZfWHfFgg/1RwdObKp1VWJH1HYERGR2qusyHywZ+5KcDWAtNeh/ulWVyV+RmFHRERqJ8MDi66HnG/BEQbdXoOEC6yuSvyQwo6IiNROy8fC7++ALci8GLnxpVZXJH5KYUdERGqfdU/Bz0+Z0x0mQZMhGh1ZjkphR0REapff3obld5rTbcZCq9vA7rC2JvFrCjsiIlJ7ZM+BRUPN6SZDoN2D4HBaW5P4PYUdERGpHfasgPmXg6cEGvWF05+B4HCrq5JaQGFHRET8X/5vMLcvlO6F2G7Q7VVw1bO6KqklLA07kyZNomvXrkRGRhIXF8dll13G+vXrfdqce+652Gw2n9fw4cN92mzZsoV+/foRFhZGXFwcY8eOpbS0tCY3RUREqkvRLsjsA4XZENkS0qZBeGOrq5JaJMjKL583bx4jR46ka9eulJaWcu+999KrVy/Wrl1LePiBQ5M33XQTDz/8sHc+LCzMO11WVka/fv1ISEhg4cKFbN++nSFDhhAcHMzEiRNrdHtERKSKle4zn3flXg8hjczHQMScZnVVUstYGna+/PJLn/kZM2YQFxfHsmXLOPvss73Lw8LCSEhIOOJnfP3116xdu5ZvvvmG+Ph4OnbsyCOPPMK4ceN46KGHcDp14ZqISK3kKYXvB8JfWRAcbY6OHHem1VVJLeRX1+zk5eUBUL9+fZ/lb731Fg0aNKBdu3aMHz+effv2eddlZWWRmppKfHy8d1nv3r1xu92sWbPmiN9TVFSE2+32eYmIiB8xDFg6Ev78P7C7oOtLkNjX6qqklrL0yM7BPB4Po0ePpkePHrRr1867/NprryUlJYXExERWrlzJuHHjWL9+PR9++CEA2dnZPkEH8M5nZ2cf8bsmTZrEhAkTqmlLRETkpK1+BDa+Ctih05OQco0GDZRK85uwM3LkSFavXs2CBQt8lt98883e6dTUVBo1asQFF1zApk2baNasWaW+a/z48YwZM8Y773a7SUpKqlzhIiJStTa+DqseNKfb3Q8tbgabX52IkFrGL/7vGTVqFLNmzWLu3Lk0bnzsK+zT0tIA2LhxIwAJCQnk5OT4tCmfP9p1Pi6Xi6ioKJ+XiIj4gT9nwQ9/33HbfDicdg/Yg62tSWo9S8OOYRiMGjWKjz76iG+//ZYmTZoc9z0rVqwAoFGjRgCkp6ezatUqduzY4W0ze/ZsoqKiaNu2bbXULSIi1eCvRbDgKjDKIKk/dHwcHCFWVyUBwNLTWCNHjmTmzJl88sknREZGeq+xiY6OJjQ0lE2bNjFz5kwuuugiYmNjWblyJXfccQdnn3027du3B6BXr160bduW6667jilTppCdnc19993HyJEjcblcVm6eiIhUlHs9ZF4MZfuh4dnQ5SVw6qi7VA2bYRiGZV9+lIvNpk+fzvXXX8/WrVsZPHgwq1evpqCggKSkJC6//HLuu+8+n1NPv//+OyNGjCAz
M5Pw8HCGDh3K5MmTCQqqWJZzu91ER0eTl5enU1oiIjVt/3b4+gwo+A2iU+HsjyCyctdkSt1S0f23pWHHXyjsiIhYpMQN35xjPvcqLAXO+gBiu1hdldQSFd1/+8UFyiIiUgeVFcN3A8yg44w1Bw1U0JFqoLAjIiI1z/DA4n9C9jfgCINur0CjnlZXJQFKYUdERGreinvgt7fAFgSdnzPvvhKpJgo7IiJSs35+FtY9YU53eAyaDtXoyFKtFHZERKTm/P4e/HiHOd36Tmh1O9j9ZjB/CVAKOyIiUjNy5kLWdYABpw6G1IfAofHQpPop7IiISPXbsxLmXwaeYmjUGzo/A8ERVlcldYTCjoiIVK+CLZDZ1xxTp35X6PYquGKtrkrqEIUdERGpPkW7YW4f2L8NIltAWgaEJ1tdldQxCjsiIlI9SvfDvEvAvQ5CEqBbBtRLtboqqYMUdkREpOp5ymDhtfDXQgiOgm6vQfxZVlcldZTCjoiIVC3DgGW3wh8fg91pPsH8lH5WVyV1mMKOiIhUrTUTYcNUwAad/gMp12jQQLGUwo6IiFSdTdNh5X3m9Gn3QYvhYHdYW5PUeQo7IiJSNf78HJbcZE43uxlOGw/2YGtrEkFhR0REqsJfS2DBlWCUQePLoNMTEBRqdVUigMKOiIicLPcGmNcPyvZBwzOh61RwRlldlYiXwo6IiFTe/hyY2xuK/oLo06D7dAhNsLoqER8KOyIiUjkleyHzIijYDGHJkDYNIptbXZXIYRR2RETkxJUVw3dXwJ4fwVkf0l6HBt2srkrkiBR2RETkxBgGLL4Rsr8GRyh0ewUaXWh1VSJHpbAjIiIn5qd74bf/BzYHdH4WkvpbXZHIMSnsiIhIxa1/AdZONqfbPwpNbwCbdiXi3/R/qIiIVMyWD2DZbeZ0q9HQ+g6wB1lakkhFKOyIiMjx7ZgPCwcDBqRcC+0fAYfL6qpEKkRhR0REji13Ncz7B3iKIOFC6PIcBEdYXZVIhSnsiIjI0RVshbl9oCQP6p0O3V4FV6zVVYmckEqFnTfeeIPPPvvMO3/33XcTExPDGWecwe+//15lxYmIiIWK90BmH9j/J0Q0M0dHjjjV6qpETlilws7EiRMJDTUf8JaVlcWLL77IlClTaNCgAXfccUeVFigiIhYoK4R5l0LeWgiJh7QMqNfe6qpEKqVSl9Fv3bqV5s3NIcE//vhjBgwYwM0330yPHj0499xzq7I+ERGpaZ4yWDgIdn4HQZHQ7TWIP8fqqkQqrVJHdiIiIti1axcAX3/9NRdeaI6cGRISwv79+6uuOhERqVmGActuh60fgj0YurwIp1xsdVUiJ6VSYefCCy/kxhtv5MYbb+SXX37hoosuAmDNmjWkpKRU+HMmTZpE165diYyMJC4ujssuu4z169f7tCksLGTkyJHExsYSERHBgAEDyMnJ8WmzZcsW+vXrR1hYGHFxcYwdO5bS0tLKbJqISN229nHY8CJgg45T4NRrwWazuiqRk1KpsPPiiy+Snp7Ozp07+d///kdsrHll/rJly7j22msr/Dnz5s1j5MiRLFq0iNmzZ1NSUkKvXr0oKCjwtrnjjjv49NNPef/995k3bx7btm2jf/8DQ5OXlZXRr18/iouLWbhwIW+88QYzZszggQceqMymiYjUXb/+F34ab063HQ8tbgG7w9qaRKqAzTAMozJvLCwsZOXKlezYsQOPx+Oz7h//+Eelitm5cydxcXHMmzePs88+m7y8PBo2bMjMmTO54oorAPj5559p06YNWVlZdO/enS+++IKLL76Ybdu2ER8fD8DLL7/MuHHj2LlzJ06n87jf63a7iY6OJi8vj6ioqErVLiJSq237EuZdAkYpNB0GXZ6HoFCrqxI5poruvyt1gfKXX37JkCFD2LVrF4dmJZvNRllZWWU+lry8PADq168PmEeKSkpK6Nmzp7dN69atSU5O9oadrKwsUlNTvUEHoHfv3owYMYI1a9bQqVOnw76nqKiI
oqIi77zb7a5UvSIiAWHXUlhwhRl0TrkETv+Pgo4ElEqdxrr11lu58sor2bZtGx6Px+dV2aDj8XgYPXo0PXr0oF27dgBkZ2fjdDqJiYnxaRsfH092dra3zcFBp3x9+bojmTRpEtHR0d5XUlJSpWoWEan19m6Cef2gtAAa9ICuL4MzxuqqRKpUpcJOTk4OY8aMOSxknIyRI0eyevVq3nnnnSr7zKMZP348eXl53tfWrVur/TtFRPxO4Q6Y29v8GdUW0qZBWKLVVYlUuUqFnSuuuILMzMwqK2LUqFHMmjWLuXPn0rhxY+/yhIQEiouLyc3N9Wmfk5NDQkKCt82hd2eVz5e3OZTL5SIqKsrnJSJSp5TkQ2Y/yN8EYY2h+zSIbml1VSLVolLX7LzwwgtceeWVfPfdd6SmphIcHOyz/rbbbqvQ5xiGwa233spHH31EZmYmTZo08VnfuXNngoODmTNnDgMGDABg/fr1bNmyhfT0dADS09N57LHH2LFjB3FxcQDMnj2bqKgo2rZtW5nNExEJbJ4SWHAl7F4KznrQ7XVokGZ1VSLVplJ3Y2VkZDB8+HBCQkKIjY3FdtAYDDabjV9//bVCn3PLLbcwc+ZMPvnkE1q1auVdHh0d7X0cxYgRI/j888+ZMWMGUVFR3HrrrQAsXLgQMG8979ixI4mJiUyZMoXs7Gyuu+46brzxRiZOnFihOnQ3lojUGYYBi26AzW+AIxS6vwEpV1pdlUilVHT/Xamwk5CQwG233cY999yD3V75B6fbjjJQ1fTp07n++usB8xb3O++8k7fffpuioiJ69+7NSy+95HOK6vfff2fEiBFkZmYSHh7O0KFDmTx5MkFBFTtwpbAjInXGT/+GNRPB5oAuL0HzG8FW+d/jIlaq1rBTv359fvjhB5o1a3ZSRfoLhR0RqRN+eQmWjjSn2z8KbceBvVJXM4j4hYruvysV54cOHcq7775b6eJERKSGbf0Ilo4yp1veCm3uUtCROqNS/6eXlZUxZcoUvvrqK9q3b3/YBcpPPfVUlRQnIiJVYMcC+H4gYEDy1dD+MXC4rK5KpMZUKuysWrXKOzLx6tWrfdYd7TocERGxQN5a8zEQniKIP998DIQz0uqqRGpUpcLO3Llzq7oOERGpavv+gLl9oCQX6nUybzEPaWh1VSI1Tpfgi4gEouJcmNsX9m2FiKbQfTpENjnu20QCkcKOiEigKSuE+ZdB3mpwNTSP6NTrYHVVIpZR2BERCSSGB7KGwI55EBQBaa9DwnlWVyViKYUdEZFAYRiw7A7Y8j7YgqHLC3DKJVZXJWI5hR0RkUCx7gn45TlzuuPjcOpg0B2yIgo7IiIBYfObsGKcOd12PLQcCXaHtTWJ+AmFHRGR2m77bPPhngBNb4B294HDaW1NIn5EYUdEpDbb/SN81x+MUki8GE5/CoLCrK5KxK8o7IiI1Fb5v0LmRVCaD7HdoevL4IyxuioRv6OwIyJSGxXuNEdHLsyBqNbmoIHhp1hdlYhfUtgREaltSgtg3sWwdwOEngJp0yC6tdVVifgthR0RkdrEUwoLroZdSyA4xhw0sGG61VWJ+DWFHRGR2sIwYMm/YNtnYA+Bbi9Do95WVyXi9xR2RERqi1UPwq/TALt511XylRo0UKQCFHZERGqDDa/A6kfM6dQJ0PwmsOlXuEhF6F+KiIi/++MTWHqLOd1iFLS9C+xB1tYkUoso7IiI+LOdC+H7a8ynmSddCR0mgiPE6qpEahWFHRERf5X3M8y7BMoKIe488ynmzkirqxKpdRR2RET80b5tMLc3FO+GmA7mLeahcVZXJVIrKeyIiPib4jzI7Av7tkD4qeboyJFNra5KpNZS2BER8SdlRfDd5ZC7ElwNzSM69TtZXZVIraawIyLiLwwPZA2FnLkQFA7dXoWEC6yuSqTWU9gREfEXP94FW94FWzB0fgEaX2p1RSIBQWFHRMQfrHsS1j9tTnecBE2u0+jIIlVEYUdExGq/zYTld5nT
be6GlreC3WFtTSIBRGFHRMRK2XNg0fXmdJOh0O4BcDgtLUkk0FgadubPn88ll1xCYmIiNpuNjz/+2Gf99ddfj81m83n16dPHp83u3bsZNGgQUVFRxMTEMGzYMPLz82twK0REKmnPCph/OXhKoFFfOP1pCA63uiqRgGNp2CkoKKBDhw68+OKLR23Tp08ftm/f7n29/fbbPusHDRrEmjVrmD17NrNmzWL+/PncfPPN1V26iMjJyf8N5vaF0r0Q282888pVz+qqRAKSpU+S69u3L3379j1mG5fLRUJCwhHXrVu3ji+//JIffviBLl26APD8889z0UUX8Z///IfExMQqr1lE5KQV7YLMPlCYDZEtzUEDwxtbXZVIwPL7a3YyMzOJi4ujVatWjBgxgl27dnnXZWVlERMT4w06AD179sRut7N48WIryhURObbSfZB5MbjXQ2gipE2D6LZWVyUS0Cw9snM8ffr0oX///jRp0oRNmzZx77330rdvX7KysnA4HGRnZxMX5/usmKCgIOrXr092dvZRP7eoqIiioiLvvNvtrrZtEBHx8pSaTzDftQiCo6Hb6xDXw+qqRAKeX4eda665xjudmppK+/btadasGZmZmVxwQeVHFZ00aRITJkyoihJFRCrGMOCHW+DPT8Hugq5TIbHP8d8nIifN709jHaxp06Y0aNCAjRs3ApCQkMCOHTt82pSWlrJ79+6jXucDMH78ePLy8ryvrVu3VmvdIiKsfhg2vQbY4fSnIOVqDRooUkNqVdj5448/2LVrF40aNQIgPT2d3Nxcli1b5m3z7bff4vF4SEtLO+rnuFwuoqKifF4iItVm42uw6iFzOvUBaH4T2GrVr1+RWs3S01j5+fneozQAmzdvZsWKFdSvX5/69eszYcIEBgwYQEJCAps2beLuu++mefPm9O7dG4A2bdrQp08fbrrpJl5++WVKSkoYNWoU11xzje7EEhH/8Men8MNwc7rFCGg7DuzB1tYkUsfYDMMwrPryzMxMzjvvvMOWDx06lKlTp3LZZZexfPlycnNzSUxMpFevXjzyyCPEx8d72+7evZtRo0bx6aefYrfbGTBgAM899xwREREVrsPtdhMdHU1eXp6O8ohI1flrEcw5H8r2Q9IA884rp37HiFSViu6/LQ07/kJhR0SqnHs9fN0DindB3DnQ410IjT/++0Skwiq6/9ZJYxGRqrZ/O8ztYwad6FTziI6CjohlFHZERKpSiRsyL4KC3yAsxRwdObKp1VWJ1GkKOyIiVaWsGOb3Nx/w6YyFtAyI7Wx1VSJ1nsKOiEhVMDyw6AbImQOOMPPBno0qP/ipiFQdhR0RkaqwYhz8PhNsQdDleUi63OqKRORvCjsiIifr52dg3X/M6Q6PQZMhGh1ZxI8o7IiInIzf34Uf7zCnW98JrW4Hu18/dlCkzlHYERGprJy5kDXEnD71Okh9CBwuS0sSkcMp7IiIVMaelTD/MvAUQ6Pe0PlpCK74yO0iUnMUdkRETlTBFsjsa46pU7+beeeVK9bqqkTkKBR2RERORNFuc3Tk/dsgsgV0nwbhyVZXJSLHoLAjIlJRpfth3iXgXgchCeaggTGnWV2ViByHwo6ISEV4ymDhtfDXQgiOgrTXIe4sq6sSkQpQ2BEROR7DgKWj4I+Pwe6ELlMh8SKrqxKRClLYERE5njWPwcaXARt0ehJSrtaggSK1iMKOiMixbJoGK+83p9vdDy3+BXaHtTWJyAlR2BEROZo/P4MlN5vTzf8Fbe8Be7C1NYnICVPYERE5kr+WwIKrwCiDxpdDxykQFGp1VSJSCQo7IiKHcv8C8/pB2T5oeBZ0fQmcUVZXJSKVpLAjInKw/TnmoIFFf0F0O3PQwNAEq6sSkZOgsCMiUq5kL2ReBAWbISzZDDqRza2uSkROksKOiAhAWTF8dwXs+RGc9c1BA2O7Wl2ViFQBhR0REcOAxTdC9tfgCIVur0CjC62uSkSqiMKOiMhP4+G3/wc2B3R+
FpL6W12RiFQhhR0RqdvWPw9rHzen2z8KTW8Am341igQS/YsWkbpry/uw7HZzutUd0PoOsAdZW5OIVDmFHRGpm3LmwcLBgAEp10L7h8HhsroqEakGCjsiUvfkroL5l4KnGBJ6QZfnIDjC6qpEpJoo7IhI3VKwFeb2hZI8qN8Z0l4FV6zVVYlINVLYEZG6o3gPZPaB/X9CRHNImw7hKVZXJSLVTGFHROqGskKYdynkrYWQeEjLgHqpVlclIjXA0rAzf/58LrnkEhITE7HZbHz88cc+6w3D4IEHHqBRo0aEhobSs2dPNmzY4NNm9+7dDBo0iKioKGJiYhg2bBj5+fk1uBUi4vc8ZbBwEOz8DoIiodtrEH+21VWJSA2xNOwUFBTQoUMHXnzxxSOunzJlCs899xwvv/wyixcvJjw8nN69e1NYWOhtM2jQINasWcPs2bOZNWsW8+fP5+abb66pTRARf2cY5u3lWz8EezB0eRFOudjqqkSkBtkMwzCsLgLAZrPx0UcfcdlllwHmUZ3ExETuvPNO7rrrLgDy8vKIj49nxowZXHPNNaxbt462bdvyww8/0KVLFwC+/PJLLrroIv744w8SExMr9N1ut5vo6Gjy8vKIioqqlu0TEYusmQQ/3QvY4PSnoeUosDusrkpEqkBF999+e83O5s2byc7OpmfPnt5l0dHRpKWlkZWVBUBWVhYxMTHeoAPQs2dP7HY7ixcvPupnFxUV4Xa7fV4iEoB+fePvoAOcdi+0GKGgI1IH+W3Yyc7OBiA+Pt5neXx8vHdddnY2cXFxPuuDgoKoX7++t82RTJo0iejoaO8rKSmpiqsXEctt+xIWDzOnm90Ip/0bHE5raxIRS/ht2KlO48ePJy8vz/vaunWr1SWJSFUpzoMV98L8y8Aog1P+AZ2egKBQqysTEYv47UNgEhISAMjJyaFRo0be5Tk5OXTs2NHbZseOHT7vKy0tZffu3d73H4nL5cLl0rDwIgGlrBg2vgKrH4aiv8xl8edD16ngjLG0NBGxlt8e2WnSpAkJCQnMmTPHu8ztdrN48WLS09MBSE9PJzc3l2XLlnnbfPvtt3g8HtLS0mq8ZhGxgGGYD/T8rC0su80MOhFNoctLcNbHEFaxGxVEJHBZemQnPz+fjRs3euc3b97MihUrqF+/PsnJyYwePZpHH32UFi1a0KRJE+6//34SExO9d2y1adOGPn36cNNNN/Hyyy9TUlLCqFGjuOaaayp8J5aI1GI7voPlY2HX3zckuBpAy9ug+b8gNO7Y7xWROsPSsLN06VLOO+887/yYMWMAGDp0KDNmzODuu++moKCAm2++mdzcXM4880y+/PJLQkJCvO956623GDVqFBdccAF2u50BAwbw3HPP1fi2iEgNyvsZfroH/vjEnHeEQbNh0Oo2iGxubW0i4nf8ZpwdK2mcHZFaYn82rJoAm14zLz62OSDpCmh9J8R2BpvfnpkXkWpQ0f23316gLCLiVZIPPz8J656A0gJzWfwFZshpdCHY9atMRI5OvyFExH95SuHXabDyQSj8e+ysmPbQ5m5IHgCOkGO/X0QEhR0R8UeGAX/OghXjwL3OXBaWDK3HQNMbwKnTzSJScQo7IuJf/loCK8bCjvnmfHAMtBwJLW7RbeQiUikKOyLiH/ZuMp9jteU9c97ugiZDofVoiGoNNpul5YlI7aWwIyLWKvwL1jwKG14CTwlgg8aXQ5s7oMEZusNKRE6awo6IWKN0P6x/FtZOghK3uazhWdDmLkjsC/Zga+sTkYChsCMiNctTBr+9CSvvg31/mMui2ph3WKVcBUFh1tYnIgFHYUdEas72r2H53ZD7kzkfmgitRkOzG8FVz9LSRCRwKeyISPXbs8IMOdmzzfmgSGgxHFqOgvBkS0sTkcCnsCMi1adgC6y8Hzb/P8AAWzCcOsi8wyqmve6wEpEaobAjIlWvOBfWTDIvQPYUmcsS+5mDAsadA3aHpeWJSN2isCMiVaesCDZMhdWP
QPFuc1n9rtD2bjjlEnC4rK1PROokhR0ROXmGB35/zxwUsGCzuSyiObQZC6deC8ER1tYnInWawo6InJycebB8LOz+wZx3NYRWt0PzmyGkobW1iYigsCMilZW3FpaPg22zzHlHGDS/CVreBpFNra1NROQgCjsicmL2b4eVD8KvGebpK5sDkq+G1ndA/c66w0pE/I7CjohUTMleWPcf81W2z1yWcKF5h1VCT7Dr14mI+Cf9dhKRY/OUwKbXYdVDULjDXBbT0bzDKulycIRYWZ2IyHEp7IjIkRkG/PEJ/HQPuNeby8JToPVd0GQIOKOsrU9EpIIUdkTkcDuzYMVY2Pm9Oe+sbz7aoflwCGtkbW0iIidIYUdEDnBvgJ/Gw9b/mfP2EGh6vXkreXRrS0sTEakshR0RgcKdsPph2PAyGKWAHZL6m3dYNegONrvVFYqIVJrCjkhdVroP1j8DayZD6V5zWcNzoM2dkNgH7MGWliciUhUUdkTqIk8ZbH7DfCL5/m3msqi25h1WyVdBUKi19YmIVCGFHZG6xDBg+5ew/G7IW20uC000x8pp+k9w1bO2PhGRaqCwI1JX7P7RfIZVzrfmfHC0eXdVy5EQnmRtbSIi1UhhRyTQ5f8GK++D394y5+3BcOp10Go0xLTT4x1EJOAp7IgEquI9sGYirH8OPMXmssRLzDus4s4Gu8Pa+kREaojCjkigKSuEX16ENY+ZgQcgtrt58XFiP3A4ra1PRKSGKeyIBArDA7+9DSv/DQW/m8siW0CbsZAyEIIjrK1PRMQifj1S2EMPPYTNZvN5tW59YBTXwsJCRo4cSWxsLBEREQwYMICcnBwLKxaxSPa38GVXyBpsBh1XHLR/DHothOY3KeiISJ3m90d2TjvtNL755hvvfFDQgZLvuOMOPvvsM95//32io6MZNWoU/fv35/vvv7eiVJGal7vavI18+xfmfFA4NLsJWt4KkU2trU1ExE/4fdgJCgoiISHhsOV5eXlkZGQwc+ZMzj//fACmT59OmzZtWLRoEd27d6/pUkVqzr4/YeUDsHmGefrKFgTJV0ObMVCvk+6wEhE5iF+fxgLYsGEDiYmJNG3alEGDBrFlyxYAli1bRklJCT179vS2bd26NcnJyWRlZR3zM4uKinC73T4vkVqhxA0/3QeftoBfp5lBJ6E3nPMZpM+A+qcr6IiIHMKvj+ykpaUxY8YMWrVqxfbt25kwYQJnnXUWq1evJjs7G6fTSUxMjM974uPjyc7OPubnTpo0iQkTJlRj5SJVzFMCG16B1ROg6C9zWb3TzTusGl8KjhBr6xMR8WN+HXb69u3rnW7fvj1paWmkpKTw3nvvERpa+Wf3jB8/njFjxnjn3W43SUkaQVb8kGHA1g/hp/Gwd4O5LPxU8w6rUweDM8rS8kREagO/DjuHiomJoWXLlmzcuJELL7yQ4uJicnNzfY7u5OTkHPEan4O5XC5cLlc1VytyknZ+bz7e4a+/T8s6Y80Lj1v8C0KP/f+4iIgc4PfX7BwsPz+fTZs20ahRIzp37kxwcDBz5szxrl+/fj1btmwhPT3dwipFTpJ7PczvD7PPNIOOIxRa3AIXLoT2DyroiIicIL8+snPXXXdxySWXkJKSwrZt23jwwQdxOBwMHDiQ6Ohohg0bxpgxY6hfvz5RUVHceuutpKen604sqZ3255jX5Gx8FYwywA5JA8w7rGK7ga1W/W0iIuI3/Drs/PHHHwwcOJBdu3bRsGFDzjzzTBYtWkTDhg0BePrpp7Hb7QwYMICioiJ69+7NSy+9ZHHVIieotADWPQXrpkBpvrks7jxocxc0utB8cKeIiFSazTAMw+oirOZ2u4mOjiYvL4+oKF3wKTXEUwq/zoBVD8D+7eay6HbQ5m5IvgKCKn8RvohIXVDR/bdfH9kRCUiGAds+gxXjIG+tuSy0MbQeA02vB1c9S8sTEQk0CjsiNWnXUvMOqx2Z5nxwtHnxcYtbILyxpaWJiAQqhR2RmpC/
GX66F35/x5y3O+HUIdD6dog+TaMei4hUI4UdkepUtAtWPwYbXjBHQcYGp/zDPGXVsAfYHVZXKCIS8BR2RKpD6X745XlYMxFK8sxlDc4wLz5O7AsOp7X1iYjUIQo7IlXJ8MBvb8FP/4Z9W81lka3+frzDNRAUbm19IiJ1kMKOSFXZPhtW3A17VpjzIQnQ6nZofiO4GlhamohIXaawI3Ky9vxk3ka+/StzPigCmv8LWo6EiCbW1iYiIgo7IpVWsBVW3g+b/wsYYAuGlGug9R1Qr6PusBIR8RMKOyInqjgP1k6G9c9AWaG5rFEfaH0nxJ8Ldv2zEhHxJ/qtLFJRZcWw8WVY/bB5SzlAvc7Qdhw0/gc4XNbWJyIiR6SwI3I8hgFb3oefxkP+r+ay8KbmgzpPHQROPU9NRMSfKeyIHMuO72D5XbBriTnvjIVWt5kXIIfGW1ubiIhUiMKOyJHkrYMV98Cf/2fOO0Kh2TBoeRtEtbC2NhEROSEKOyJgDgaYuxKyv4WcOeZt5EYZ2ByQdIX5eIfYLmCzW12piIicIIUdqZsMA9zrIefbv1+ZULzLt038+eZ1OQk9wR5sSZkiInLyFHak7ij4HbLnHAg4+7f7rneEQf0u0KA7NDwLEi6AoFBrahURkSqjsCOBa3825Mw9EG7K76QqZ3dCvU4Q2x0apkPcOeBqqCeRi4gEGIUdCRxFu2HHvAPhJm+t73qbA2LaQ2waNEiH+PMgtJEGARQRCXD6LS+1V0k+7PzODDbZ38Ke5YBxUAMbRLUxT0s16A5x50FEiq6/ERE5CsMwKPGUsL9kP4Wlhcd97S+teLtXL36V+AhrhuxQ2JHao6wQ/sr6+46pb82xb4xS3zYRTc3TUg3SzUc3RLbQyMYiUquUecpOPEwcK5yUVbDd3y/D54/GqvPY+Y8p7IgcxlMCu5YeOC2183vwFPm2CW1snpYqv+Ym+jRdVCwiJ8UwDIrKiioeJip5pONo60o9pccvsoa4HC5cQS5cDhdOh5OQoBCf+fJpl8OFM8jpnQ4JCvGZj3RFkhCeYNl2KOyI/zA8sOenA+Fmx3wozfdt42p44JqbuLPNp4sHR1hSrohUD8MwKPWUnlyYODiclFWw3d+vorKi4xdZQ4LtwUcMFSFBIUcMGa4g3xASEhRy2PtDgkIICQohNDiU0CDzFeYMM6eDQwkPDvdOO+wOHDYHdpvd+7LZbFZ3ywlT2BHrHDbWzVwo3u3bJjjaDDex3SHuTIjtBsFRUAv/sYnUJh7DU/nTJlVwpMNjeKzuAgBs2LzBwhsqDj3ScYSQcawjHSEOM2yEBB8IHGFBYWbQcB4IGmFBYQQ7gnHYzbBRHjpqY9iwmsKO1Kz83w4KN0cb66br32PdnAENzwRnPYUbqXMMw6C4rLhGTpscKcSUeEqs7gIvp+PwkHGyRzpCg0J9j26Uh47gMO/PcGc4ToeTIHuQ96hGefCQ2kVhR6rX/u0HxrrJ/hYKNvuuP9JYNyFxeiyD+IWjnUqp9JGOsoofESksrb4LRU+Uw+Y4EC6OdOSikkc6ygNGSFCIN2SEBJvTB59KCXYEHwgbOrohlaCwI1WraDfsyDxwx5R7ne96WxDEpB4y1k2iBvKTI6rMhaJVeaTDXy8UPfRnZY90lF+v4T2yEXzgdEr5dRxhwWG4HC6foxrlL5HaQmFHTk7JXtjx3YHTUntWoLFuAsuhRzdONmwcd/1Bn+/XF4oe60jHQaHiWEc6vIHj4JBx0GmVQLxQVE6MYYDHA6Wl5qus7PDpIy2rbNvq/KzXXoPERGv6UWFHKs4woDDbHJm4/NTUriXm08EPFtHskLFummusm5Nw6NGNmgwbhaWFlB3639ciJ3qh6MHLjnak40gXipYHjIPvTtGFotXP46l9O++a+Kwy//jnVyV271bYEX9RvAfyN5uvgs2+0wW/mQP7HSos6aDTUudAVNuAG+um/OjGSQeNkv3HvA32SN/hb0c3Dj6iUX7h6LFCiCvI
DBVHO7IREhTic4HowT/Lj3ZEOCMC4kLRqvgrvbbvvI+2zPCPy5NqFYcD7HYICjKnj/UKCvJtWz5d0fcf/Dkn+l0OB4SFWRd0IIDCzosvvsgTTzxBdnY2HTp04Pnnn6dbt25Wl+V/SveZoeXQQFP+syTvOB9gN58nVe9087RU/NnmBcZB4dVa9pHuTDlasDhSIDlS2xN5f204unFoiKjoKZcQR4j39El58PDekfL30Y3w4HBCg0MJsgdV6OiGYVRiB1gCpfsPLNtbCnuqeCdrdRAIpL/Sa1JwsLnzPN4Otbp29JVtW97+4J/Bwb7T5e3Klx/88+B15T8PXm63mzeqHu118Pq6LiDCzrvvvsuYMWN4+eWXSUtL45lnnqF3796sX7+euLg4q8urWZ5S2Lf18CMz+b+a84U5x/8MZyyEnQJhjc0RisOSMEJPoSSiCSWhyRQ5QijywP7yIxQ71x/3FMvJho3CUv+5M+XgoxtOhwuXw4nLEUKw3YXT7vL5GWxzEWRz4bSFEGQLIQhz3mG4CCIEB+Z0MGEEEUKQEYbDCCHICCWIMByeEII8Ydg9YQQZYVAWjMfjwCizU1Zmw1Nkw7PfXqEd7f5SyK/CHfix3ufxjyFSapWT+Su9/HWkZeVtqyooHO27Dt4JH/o62s770J38odPlbY61Qz/0JXIkNsOo/QcP09LS6Nq1Ky+88AIAHo+HpKQkbr31Vu65557jvt/tdhMdHU1eXh5RUVFVVtf/1v6P3MJcAO+OOsoVRcOwhjQMb0jDsIbEhsUSdJSnbpeUlVBQnE/B/p3kF2yjYF8OBft3mPP7d1FQtJuCwlwKi3ZTXPgXxUW7KSnOo9gwKDagxIDiv18lHDztoNjuotjmpMQWTDEOirGby4FiTxklnlKKy4opLiumpKzEr8bcAAgyQnEYLuyGC4cRgt0IweEJwe4Jwe5xYfeEYPO4sJWFYCsLNX+WmvOUhUBJKJSGQGkIRon5oiQMozQEozgUT7Hr75+hlBWFYvz9s6zYhafU7t2pG4Z+u56oY+1gK7MTPtr7jvY5NfFX+pF25AcvO/gvdf2VLlJ5Fd1/1/ojO8XFxSxbtozx48d7l9ntdnr27ElWVpaFlcEN/28ce0M2HbddcEkUIaXR2IBS+35K7YWUOgrx2EurqbIyYN/fr0ry2KH0QGCg1DdAHHgdYdkR251A2zInpdiort6pCjbb4YfMK7KjPt5OuLJ/0Z/oDv7gHfHRduyHHnI/1l/uB7/nSDtv/ZUuItWp1oedv/76i7KyMuLjfZ+kGh8fz88//3zE9xQVFVFUdOCiT7fbXS212Td3A0drMGzA37+5Q3IhbCeE74TQ3WAzKAl2UxJ8jBrKgqAkHIojoDj87+nwA8v+DgAHXsEHpj3B1bPcc+DWcYfD+HsnaDviefFjnTN3OMARDI6Qw3fOFd1xV2Snfuj3Hvzeox12dzqPvK4i59IP3qkf6y910E5dRKS61fqwUxmTJk1iwoQJ1f49H3RthnvXHsCGzW7HZrNTaoRQWBrO/pIw9u1xsbuslN2eYvYYhRgE47RH4rRH4nLUw2mPJdTVgKDgethDQwmKDMYR5CAoyEFQkO2IO/LynbnNdvhf/oe2Kd8hlweEQ8/rH3y04dDPOfj8ut1uO+rhdv2lLiIiVqv1YadBgwY4HA5ycnwvvM3JySEh4ciPkx8/fjxjxozxzrvdbpKSkqq8tp53PlLlnykiIiInpnYNUnEETqeTzp07M2fOHO8yj8fDnDlzSE9PP+J7XC4XUVFRPi8REREJTLX+yA7AmDFjGDp0KF26dKFbt24888wzFBQUcMMNN1hdmoiIiFgsIMLO1Vdfzc6dO3nggQfIzs6mY8eOfPnll4ddtCwiIiJ1T0CMs3OyqmucHREREak+Fd1/1/prdkRERESORWFHREREAprCjoiIiAQ0hR0REREJaAo7IiIiEtAUdkRERCSgKeyIiIhIQFPYERERkYCmsCMiIiIBTWFH
REREAlpAPBvrZJU/McPtdltciYiIiFRU+X77eE++UtgB9u7dC0BSUpLFlYiIiMiJ2rt3L9HR0UddrweBAh6Ph23bthEZGYnNZjupz3K73SQlJbF161Y9VPRv6pPDqU+OTP1yOPXJ4dQnR1YX+8UwDPbu3UtiYiJ2+9GvzNGRHcBut9O4ceMq/cyoqKg68z9bRalPDqc+OTL1y+HUJ4dTnxxZXeuXYx3RKacLlEVERCSgKeyIiIhIQFPYqWIul4sHH3wQl8tldSl+Q31yOPXJkalfDqc+OZz65MjUL0enC5RFREQkoOnIjoiIiAQ0hR0REREJaAo7IiIiEtAUdqrQiy++yKmnnkpISAhpaWksWbLE6pIqZdKkSXTt2pXIyEji4uK47LLLWL9+vU+bwsJCRo4cSWxsLBEREQwYMICcnByfNlu2bKFfv36EhYURFxfH2LFjKS0t9WmTmZnJ6aefjsvlonnz5syYMeOwevyxXydPnozNZmP06NHeZXW1T/78808GDx5MbGwsoaGhpKamsnTpUu96wzB44IEHaNSoEaGhofTs2ZMNGzb4fMbu3bsZNGgQUVFRxMTEMGzYMPLz833arFy5krPOOouQkBCSkpKYMmXKYbW8//77tG7dmpCQEFJTU/n888+rZ6OPoaysjPvvv58mTZoQGhpKs2bNeOSRR3yGs68LfTJ//nwuueQSEhMTsdlsfPzxxz7r/akPKlJLVThWn5SUlDBu3DhSU1MJDw8nMTGRIUOGsG3bNp/PCLQ+qTGGVIl33nnHcDqdxrRp04w1a9YYN910kxETE2Pk5ORYXdoJ6927tzF9+nRj9erVxooVK4yLLrrISE5ONvLz871thg8fbiQlJRlz5swxli5danTv3t0444wzvOtLS0uNdu3aGT179jSWL19ufP7550aDBg2M8ePHe9v8+uuvRlhYmDFmzBhj7dq1xvPPP284HA7jyy+/9Lbxx35dsmSJceqppxrt27c3br/9du/yutgnu3fvNlJSUozrr7/eWLx4sfHrr78aX331lbFx40Zvm8mTJxvR0dHGxx9/bPz000/GP/7xD6NJkybG/v37vW369OljdOjQwVi0aJHx3XffGc2bNzcGDhzoXZ+Xl2fEx8cbgwYNMlavXm28/fbbRmhoqPHKK69423z//feGw+EwpkyZYqxdu9a47777jODgYGPVqlU10xl/e+yxx4zY2Fhj1qxZxubNm43333/fiIiIMJ599llvm7rQJ59//rnx73//2/jwww8NwPjoo4981vtTH1Skluruk9zcXKNnz57Gu+++a/z8889GVlaW0a1bN6Nz584+nxFofVJTFHaqSLdu3YyRI0d658vKyozExERj0qRJFlZVNXbs2GEAxrx58wzDMP9RBgcHG++//763zbp16wzAyMrKMgzD/Edtt9uN7Oxsb5upU6caUVFRRlFRkWEYhnH33Xcbp512ms93XX311Ubv3r298/7Wr3v37jVatGhhzJ492zjnnHO8Yaeu9sm4ceOMM88886jrPR6PkZCQYDzxxBPeZbm5uYbL5TLefvttwzAMY+3atQZg/PDDD942X3zxhWGz2Yw///zTMAzDeOmll4x69ep5+6n8u1u1auWdv+qqq4x+/fr5fH9aWprxr3/96+Q28gT169fP+Oc//+mzrH///sagQYMMw6ibfXLojt2f+qAitVSHIwXAQy1ZssQAjN9//90wjMDvk+qk01hVoLi4mGXLltGzZ0/vMrvdTs+ePcnKyrKwsqqRl5cHQP369QFYtmwZJSUlPtvbunVrkpOTvdublZVFamoq8fHx3ja9e/fG7XazZs0ab5uDP6O8Tfln+GO/jhw5kn79+h1Wd13tk//7v/+jS5cuXHnllcTFxdGpUydee+017/rNmzeTnZ3tU290dDRpaWk+/RITE0OXLl28bXr27Indbmfx4sXeNmeffTZOp9Pbpnfv3qxfv549e/Z42xyr72rKGWecwZw5c/jl
l18A+Omnn1iwYAF9+/YF6mafHMqf+qAitVglLy8Pm81GTEwMoD45GQo7VeCvv/6irKzMZycGEB8fT3Z2tkVVVQ2Px8Po0aPp0aMH7dq1AyA7Oxun0+n9B1ju4O3Nzs4+Yn+UrztWG7fbzf79+/2uX9955x1+/PFHJk2adNi6utonv/76K1OnTqVFixZ89dVXjBgxgttuu4033ngDOLBdx6o3OzubuLg4n/VBQUHUr1+/Svqupvvlnnvu4ZprrqF169YEBwfTqVMnRo8ezaBBg3zqrUt9cih/6oOK1GKFwsJCxo0bx8CBA73PuarrfXIy9CBQOaaRI0eyevVqFixYYHUpltq6dSu33347s2fPJiQkxOpy/IbH46FLly5MnDgRgE6dOrF69Wpefvllhg4danF11njvvfd46623mDlzJqeddhorVqxg9OjRJCYm1tk+kRNTUlLCVVddhWEYTJ061epyAoKO7FSBBg0a4HA4DrvzJicnh4SEBIuqOnmjRo1i1qxZzJ071+ep8AkJCRQXF5Obm+vT/uDtTUhIOGJ/lK87VpuoqChCQ0P9ql+XLVvGjh07OP300wkKCiIoKIh58+bx3HPPERQURHx8fJ3rE4BGjRrRtm1bn2Vt2rRhy5YtwIHtOla9CQkJ7Nixw2d9aWkpu3fvrpK+q+l+GTt2rPfoTmpqKtdddx133HGH94hgXeyTQ/lTH1SklppUHnR+//13Zs+e7fP08rraJ1VBYacKOJ1OOnfuzJw5c7zLPB4Pc+bMIT093cLKKscwDEaNGsVHH33Et99+S5MmTXzWd+7cmeDgYJ/tXb9+PVu2bPFub3p6OqtWrfL5h1n+D7d855ienu7zGeVtyj/Dn/r1ggsuYNWqVaxYscL76tKlC4MGDfJO17U+AejRo8dhwxL88ssvpKSkANCkSRMSEhJ86nW73SxevNinX3Jzc1m2bJm3zbfffovH4yEtLc3bZv78+ZSUlHjbzJ49m1atWlGvXj1vm2P1XU3Zt28fdrvvr1aHw4HH4wHqZp8cyp/6oCK11JTyoLNhwwa++eYbYmNjfdbXxT6pMlZfIR0o3nnnHcPlchkzZsww1q5da9x8881GTEyMz503tcWIESOM6OhoIzMz09i+fbv3tW/fPm+b4cOHG8nJyca3335rLF261EhPTzfS09O968tvs+7Vq5exYsUK48svvzQaNmx4xNusx44da6xbt8548cUXj3ibtb/268F3YxlG3eyTJUuWGEFBQcZjjz1mbNiwwXjrrbeMsLAw48033/S2mTx5shETE2N88sknxsqVK41LL730iLcYd+rUyVi8eLGxYMECo0WLFj630+bm5hrx8fHGddddZ6xevdp45513jLCwsMNupw0KCjL+85//GOvWrTMefPBBS249Hzp0qHHKKad4bz3/8MMPjQYNGhh33323t01d6JO9e/cay5cvN5YvX24AxlNPPWUsX77ce2eRP/VBRWqp7j4pLi42/vGPfxiNGzc2VqxY4fO79+A7qwKtT2qKwk4Vev75543k5GTD6XQa3bp1MxYtWmR1SZUCHPE1ffp0b5v9+/cbt9xyi1GvXj0jLCzMuPzyy43t27f7fM5vv/1m9O3b1wgNDTUaNGhg3HnnnUZJSYlPm7lz5xodO3Y0nE6n0bRpU5/vKOev/Xpo2KmrffLpp58a7dq1M1wul9G6dWvj1Vdf9Vnv8XiM+++/34iPjzdcLpdxwQUXGOvXr/dps2vXLmPgwIFGRESEERUVZdxwww3G3r17fdr89NNPxplnnmm4XC7jlFNOMSZPnnxYLe+9957RsmVLw+l0Gqeddprx2WefVf0GH4fb7TZuv/12Izk52QgJCTGaNm1q/Pvf//bZYdWFPpk7d+4Rf48MHTrUMAz/6oOK1FIVjtUnmzdvPurv3rlz5wZsn9QUPfVcREREApqu2REREZGAprAjIiIiAU1hR0RERAKawo6IiIgENIUdERERCWgKOyIiIhLQ
FHZEREQkoCnsiIiISEBT2BGROikzMxObzXbYw1tFJPAo7IiIiEhAU9gRERGRgKawIyKW+uCDD0hNTSU0NJTY2Fh69uxJQUEBAK+//jpt2rQhJCSE1q1b89JLL/m8d8mSJXTq1ImQkBC6dOnCRx99hM1mY8WKFZWqZcGCBZx11lmEhoaSlJTEbbfd5q0F4NRTT2XixIn885//JDIykuTkZF599dVKb7uI1AyFHRGxzPbt2xk4cCD//Oc/WbduHZmZmfTv3x/DMHjrrbd44IEHeOyxx1i3bh0TJ07k/vvv54033gAgPz+fiy++mLZt27Js2TIeeugh7rrrrkrXsmnTJvr06cOAAQNYuXIl7777LgsWLGDUqFE+7Z588km6dOnC8uXLueWWWxgxYgTr168/qX4QkWpm8VPXRaQOW7ZsmQEYv/3222HrmjVrZsycOdNn2SOPPGKkp6cbhmEYr7zyihEbG2vs37/fu37q1KkGYCxfvvy43z137lwDMPbs2WMYhmEMGzbMuPnmm33afPfdd4bdbvd+R0pKijF48GDveo/HY8TFxRlTp06t0PaKiDWCLM5aIlKHdejQgQsuuIDU1FR69+5Nr169uOKKK3A6nWzatIlhw4Zx0003eduXlpYSHR0NwLp162jfvj0hISHe9enp6ZWu5aeffmLlypW89dZb3mWGYeDxeNi8eTNt2rQBoH379t71NpuNhIQEduzYUenvFZHqp7AjIpZxOBzMnj2bhQsX8vXXX/P888/z73//m08//RSA1157jbS0tMPeUx3y8/P517/+xW233XbYuuTkZO90cHCwzzqbzYbH46mWmkSkaijsiIilbDYbPXr0oEePHjzwwAOkpKTw/fffk5iYyK+//sqgQYOO+L42bdrw//7f/6OwsNB7dGfRokWVruP0009n7dq1NG/evNKfISL+SRcoi4hlFi9ezMSJE1m6dClbtmzhww8/ZOfOnbRp04YJEyYwadIknnvuOX755RdWrVrF9OnTeeqppwC49tprsdls3HTTTaxdu5bPP/+c//znP5WuZdy4cSxcuJBRo0axYsUKNmzYwCeffHLYBcoiUvvoyI6IWCYqKor58+fzzDPP4Ha7SUlJ4cknn6Rv374AhIWF8cQTTzB27FjCw8NJTU1l9OjRAERERPDpp58yfPhwOnXqRNu2bXn88ccZMGBApWpp37498+bN49///jdnnXUWhmHQrFkzrr766qraXBGxiM0wDMPqIkREqsJvv/1GkyZNWL58OR07drS6HBHxEzqNJSIiIgFNYUdEAtLw4cOJiIg44mv48OFWlyciNUinsUQkIO3YsQO3233EdVFRUcTFxdVwRSJiFYUdERERCWg6jSUiIiIBTWFHREREAprCjoiIiAQ0hR0REREJaAo7IiIiEtAUdkRERCSgKeyIiIhIQFPYERERkYD2/wFMZ7XsqDBPYQAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RLA Efficiency (ms):\n",
      "    seq_len       FLASH      sGLA        RLA\n",
      "0    1024.0    0.057952  0.669824   1.400992\n",
      "1    2048.0    0.156320  0.707520   1.412640\n",
      "2    4096.0    0.459392  0.476096   0.717952\n",
      "3    8192.0    1.593856  0.685072   1.466496\n",
      "4   16384.0    5.990848  0.993088   2.220608\n",
      "5   32768.0   23.571120  1.975232   4.397488\n",
      "6   65536.0   98.274178  3.916960   8.724000\n",
      "7  131072.0  403.149841  7.782544  17.350752\n"
     ]
    }
   ],
   "source": [
    "@triton.testing.perf_report(\n",
    "    triton.testing.Benchmark(\n",
    "        x_names=[\"seq_len\"],\n",
    "        x_vals=seq_len_for_test,\n",
    "        line_arg=\"provider\",\n",
    "        line_vals=[\n",
    "            \"flash\",\n",
    "            \"sgla\",\n",
    "            \"rla\",\n",
    "        ],\n",
    "        line_names=[\n",
    "            \"FLASH\",\n",
    "            \"sGLA\",\n",
    "            \"RLA\",\n",
    "        ],\n",
    "        styles=[\n",
    "            (\"orange\", \"-\"),\n",
    "            (\"blue\", \"-\"),\n",
    "            (\"green\", \"-\"),\n",
    "        ],\n",
    "        ylabel=\"ms\",\n",
    "        plot_name=\"RLA Efficiency (ms)\",\n",
    "        args={\n",
    "            \"batch_size\": batch_size,\n",
    "            \"num_heads\": num_heads,\n",
    "            \"head_dim\": head_dim,\n",
    "            \"dtype\": dtype,\n",
    "        },\n",
    "    )\n",
    ")\n",
    "def benchmark_forward(\n",
    "    batch_size,\n",
    "    seq_len,\n",
    "    num_heads,\n",
    "    head_dim,\n",
    "    provider,\n",
    "    dtype,\n",
    "):\n",
    "    q = torch.randn(\n",
    "        (batch_size, seq_len, num_heads, head_dim), device=\"cuda\", dtype=dtype\n",
    "    )\n",
    "    k = torch.randn(\n",
    "        (batch_size, seq_len, num_heads, head_dim), device=\"cuda\", dtype=dtype\n",
    "    )\n",
    "    v = torch.randn(\n",
    "        (batch_size, seq_len, num_heads, head_dim), device=\"cuda\", dtype=dtype\n",
    "    )\n",
    "    a = torch.rand(\n",
    "        (batch_size, seq_len, num_heads), device=\"cuda\", dtype=torch.float32\n",
    "    ).log()\n",
    "    b = torch.rand(\n",
    "        (batch_size, seq_len, num_heads), device=\"cuda\", dtype=torch.float32\n",
    "    )\n",
    "    g = torch.rand(\n",
    "        (batch_size, seq_len, num_heads), device=\"cuda\", dtype=torch.float32\n",
    "    )\n",
    "\n",
    "    quantiles = [0.5, 0.2, 0.8]\n",
    "\n",
    "    if provider == \"flash\":\n",
    "        ms, min_ms, max_ms = triton.testing.do_bench(\n",
    "            lambda: flash_attn_func(q, k, v, causal=True),\n",
    "            quantiles=quantiles,\n",
    "        )\n",
    "    elif provider == \"sgla\":\n",
    "        ms, min_ms, max_ms = triton.testing.do_bench(\n",
    "            lambda: sgla_prefill(\n",
    "                q,\n",
    "                k,\n",
    "                v,\n",
    "                a,\n",
    "                b,\n",
    "            ),\n",
    "            quantiles=quantiles,\n",
    "        )\n",
    "    elif provider == \"rla\":\n",
    "        ms, min_ms, max_ms = triton.testing.do_bench(\n",
    "            lambda: rla_prefill(\n",
    "                q,\n",
    "                k,\n",
    "                v,\n",
    "                a,\n",
    "                b,\n",
    "                g,\n",
    "            ),\n",
    "            quantiles=quantiles,\n",
    "        )\n",
    "    else:\n",
    "        raise ValueError(f\"Invalid provider: {provider}\")\n",
    "    return ms, min_ms, max_ms\n",
    "\n",
    "benchmark_forward.run(show_plots=True, print_data=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAGxCAYAAACEFXd4AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjUsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvWftoOwAAAAlwSFlzAAAPYQAAD2EBqD+naQAAUDdJREFUeJzt3Xt8k+X9//FX0jTpMakttqVSsBzkICAMFCqerRRkTifqVERwTAYWFFBEnKKoHMTNOR2g2xD0O1HHfh4m84QIKFJBGMhRlIOCQlsE6Ql6zP37IzQ20NJS0t5p8n7ukQfNfV9JrvuSkveu+7o/t8UwDAMRERGRIGU1uwMiIiIijUlhR0RERIKawo6IiIgENYUdERERCWoKOyIiIhLUFHZEREQkqCnsiIiISFBT2BEREZGgZjO7A4HA7Xazb98+YmNjsVgsZndHRERE6sEwDAoLC0lJScFqrX3+RmEH2LdvH6mpqWZ3Q0RERBpg7969tGrVqtb9CjtAbGws4Bksp9Npcm9ERESkPgoKCkhNTfV+j9dGYQe8p66cTqfCjoiISDNT1xIULVAWERGRoKawIyIiIkFNYUdERESCmtbs1JPb7aasrMzsboQMu91+0ssIRURE6kthpx7KysrYvXs3brfb7K6EDKvVSlpaGna73eyuiIhIM6ewUwfDMNi/fz9hYWGkpqZqtqEJVBV53L9/P61bt1ahRxEROS0KO3WoqKjgyJEjpKSkEBUVZXZ3QsaZZ57Jvn37qKioIDw83OzuiIhIM6ZpijpUVlYC6HRKE6sa76rxFxERaSiFnXrSqZSmpfEWERF/UdgRERGRoKawIyIiIkFNYSdIDR8+HIvFcsJjx44dDB8+nOuuu67O9/j++++x2+107dq1xv0rVqzgiiuuID4+nqioKDp06MCwYcO89YiWL1+OxWLh8OHDJ7z27LPP5plnnjmNIxQREamfgAk7M2fOxGKxMG7cOO+2kpISsrKySEhIICYmhsGDB5Obm+vzuj179jBo0CCioqJITExk4sSJVFRUNHHvA9OAAQPYv3+/zyMtLa3er1+wYAE33XQTBQUFrF692mff1q1bGTBgAL179+aTTz5h06ZNPPfcc9jtdi0qFhGRnx1aB/veN7ULAXHp+RdffMELL7xA9+7dfbaPHz+e//73vyxatAiXy8WYMWO4/vrr+eyzzwDPlTqDBg0iOTmZVatWsX//fm6//XbCw8OZPn26GYcSUBwOB8nJyQ16rWEYzJ8/nzlz5tCqVSvmzZtHnz59vPs//PBDkpOTmTVrlndbu3btGDBgwGn3W0REgkTRLlh+NZQehEv+A2ddbUo3TJ/ZKSoqYsiQIfz973/njDPO8G7Pz89n3rx5PP3001xxxRX06tWL+fPns2rVKj7//HPA84W7detW/vnPf9KjRw8GDhzI448/zuzZsxvv1g6GARXF5jwMo3GOqQbLli3jyJEjZGRkcNttt/Haa69RXFzs3Z+cnMz+/fv55JNPmqxPIiLSjJQcgGUDoCQPYs+BM3qY1hXTZ3aysrIYNGgQGRkZPPHEE97t69ato7y8nIyMDO+2Tp060bp1a7Kzs+nbty/Z2dl069aNpKQkb5vMzExGjx7Nli1b6Nmzp/87XHkE/hXj//etj5uKwBZd7+aLFy8mJubnvg4cOJBFixbV67Xz5s3j5ptvJiwsjK5du9K2bVsWLVrE8OHDAbjxxhv54IMPuPTSS0lOTqZv375ceeWV3H777TidTp/3atWq1Qnvf+TIkXofh4iINDMVR2DFNVD4DUSeBX3nQVSKad0xNey89tpr/O9//+OLL744YV9OTg52u524uDif7UlJSeTk5HjbVA86Vfur9tWmtLSU0tJS7/OCgoKGHkJAu/zyy5k7d673eXR0/YLS4cOHeeONN1i5cqV322233ca8efO8YScsLIz58+fzxBNP8PHHH7N69WqmT5/Ok08+yZo1a2jZ
sqX3tZ9++imxsbE+n3HZZZc1/MBERCRwuSvgs5vh4GoIj4M+f4cW6aZ2ybSws3fvXu655x6WLFlCREREk372jBkzmDp1asNeHBblmWExQ9ip3a4iOjqa9u3bn/LHLFy4kJKSEp81OoZh4Ha7+frrrznnnHO828866yyGDh3K0KFDefzxxznnnHN4/vnnfcY3LS3thNBqs5k+qSgiIv5mGLA2C354B6wOOH8OtDR/Ladpa3bWrVtHXl4ev/jFL7DZbNhsNlasWMGzzz6LzWYjKSmJsrKyEy5bzs3N9S66TU5OPuHqrKrnJ1uYO3nyZPLz872PvXv31r/jFovnVJIZjyaqKjxv3jzuvfdeNmzY4H18+eWXXHzxxbz44ou1vu6MM86gZcuWPmt7REQkhGyZBjv+Blig55+gzW+a7LvrZEz7v9dXXnklmzZt8tl2xx130KlTJyZNmkRqairh4eEsXbqUwYMHA7B9+3b27NlDerpnOiw9PZ1p06aRl5dHYmIiAEuWLMHpdNKlS5daP9vhcOBwOBrpyJqH/Px8NmzY4LMtISGBgwcP8r///Y9XXnmFTp06+ey/5ZZbeOyxx3jiiSeYN28eGzZs4Ne//jXt2rWjpKSEl19+mS1btvDcc8814ZGIiEhA2LUANj7s+bnrw9BhJFhMvw4KMDHsxMbGnlCsLjo6moSEBO/2ESNGMGHCBOLj43E6nYwdO5b09HT69u0LQP/+/enSpQtDhw5l1qxZ5OTk8NBDD5GVlRXyYaYuy5cvP2EB94gRI4iMjKRLly4nBB2AX//614wZM4Z3332XCy64gJUrVzJq1Cj27dtHTEwM5557Lm+99RaXXnppUx2GiIgEgn3vw+rfeX5uNxK6PADWcHP7VI3FMJrweuY6XHbZZfTo0cNbWbekpIR7772XV199ldLSUjIzM5kzZ47PKarvvvuO0aNHs3z5cqKjoxk2bBgzZ848pTUhBQUFuFwu8vPzT7iSqKSkhN27d5OWltbka4tCmcZdRKSZOLQOPrrUUyLlrF9B+stgdzXJR5/s+7u6gAo7ZlHYCTwadxGRZqBoF3yY7qml0+JCuPjfENmy7tf5SX3DTmCcTBMREZHmpXrRQGdn6PNikwadU6GwIyIiIqempqKBro5m96pWCjsiIiJSfwFYNLAuCjsiIiJSPwFaNLAuCjsiIiJSPwFaNLAuCjsiIiJStwAuGliX5tFLERERMU+AFw2si8KOiIiI1O7QOlh5AxiVnqKBPWeBLdLsXp0ShZ0gl5OTwz333EP79u2JiIggKSmJfv36MXfuXI4cOQLA2WefjcViwWKxEBkZydlnn81NN93Exx9/7PNe3377LRaLhcTERAoLC3329ejRg0cffbSpDktERJpC0S5YfrWnOnKLC+GC55usOrI/KewEsV27dtGzZ08+/PBDpk+fzvr168nOzub+++9n8eLFfPTRR962jz32GPv372f79u28/PLLxMXFkZGRwbRp005438LCQv74xz825aGIiEhTa0ZFA+ti2o1ApfHddddd2Gw21q5dS3R0tHd727Ztufbaa6l+p5DY2FjvPcdat27NJZdcQsuWLZkyZQo33HADHTv+XCxq7NixPP3002RlZXnvNi8iIkGkmRUNrItmdoLUwYMH+fDDD8nKyvIJOtVZ6rhc8J577sEwDN5++22f7bfccgvt27fnscce81t/RUQkQDTDooF10czOKTIMOLbUpclFRdW/nMGOHTswDMNnRgagRYsWlJSUAJCVlcWTTz5Z63vEx8eTmJjIt99+67PdYrEwc+ZMrrnmGsaPH0+7du1O6ThERCRANdOigXVR2DlFR45ATIw5n11UBLVM0tTbmjVrcLvdDBkyhNLS0jrbG4ZR4wxQZmYmF110EQ8//DALFy48vU6JiEhgaKZFA+ui01hBqn379lgsFrZv3+6zvW3btrRv357IyLovGzx48CAHDhwgLS2txv0zZ87k9ddfZ/369X7ps4iI
mKgZFw2si2Z2TlFUlGeGxazPrq+EhASuuuoq/vrXvzJ27Nha1+2czF/+8hesVivXXXddjfsvuOACrr/+eh544IFTfm8REQkgzbxoYF0Udk6RxXL6p5Kaypw5c+jXrx+9e/fm0UcfpXv37litVr744gu++uorevXq5W1bWFhITk4O5eXl7N69m3/+85/84x//YMaMGbRv377Wz5g2bRrnnnsuNpv+KomINEtBUDSwLvqGCmLt2rVj/fr1TJ8+ncmTJ/P999/jcDjo0qUL9913H3fddZe37ZQpU5gyZQp2u53k5GT69u3L0qVLufzyy0/6Geeccw6//e1v+dvf/tbYhyMiIv4WJEUD62IxqhdbCVEFBQW4XC7y8/NxOp0++0pKSti9ezdpaWlERESY1MPQo3EXEWlkJQdgST9PLR1nZ7j4zWZXS+dk39/VBcfKIxEREam/ICsaWBeFHRERkVAShEUD66KwIyIiEiqCtGhgXRR2REREQkWQFg2si8KOiIhIKAjiooF1CY2jFBERCWVBXjSwLgo7IiIiwSwEigbWRWFHREQkWIVI0cC6KOyIiIgEo5IDsGwAlOR5igb2eREiW5rdK1Mo7IiIiASbECsaWBeFnSA1fPhwLBYLFouF8PBw0tLSuP/++ykpKfG2qdpvsViIjo6mQ4cODB8+nHXr1vm81/Lly7FYLJx77rlUVlb67IuLi2PBggVNcUgiIlIfIVg0sC4KO0FswIAB7N+/n127dvHnP/+ZF154gUceecSnzfz589m/fz9btmxh9uzZFBUV0adPH15++eUT3m/Xrl01bhcRkQARokUD62Jq2Jk7dy7du3fH6XTidDpJT0/nvffe8+6/7LLLfGYfLBYLo0aN8nmPPXv2MGjQIKKiokhMTGTixIlUVFQ09aEEJIfDQXJyMqmpqVx33XVkZGSwZMkSnzZxcXEkJydz9tln079/f/79738zZMgQxowZw08//eTTduzYsTzyyCOUlpY25WGIiEh9hWjRwLqYGnZatWrFzJkzWbduHWvXruWKK67g2muvZcuWLd42d955J/v37/c+Zs2a5d1XWVnJoEGDKCsrY9WqVbz00kssWLCAKVOmmHE4AW3z5s2sWrUKu91eZ9vx48dTWFh4QjAaN24cFRUVPPfcc43VTRERaagQLhpYF5uZH37NNdf4PJ82bRpz587l888/59xzzwUgKiqK5OTkGl//4YcfsnXrVj766COSkpLo0aMHjz/+OJMmTeLRRx+t1xf7qTIMgyPlR/z+vvURFR6F5RQS+uLFi4mJiaGiooLS0lKsVit//etf63xdp06dAPj22299Pz8qikceeYQHH3yQO++8E5cr9C5fFBEJSCFeNLAupoad6iorK1m0aBHFxcWkp/+8kOqVV17hn//8J8nJyVxzzTU8/PDDREVFAZCdnU23bt1ISkryts/MzGT06NFs2bKFnj171vhZpaWlPqdiCgoK6t3PI+VHiJkRc6qH5xdFk4uItkfXu/3ll1/O3LlzKS4u5s9//jM2m43BgwfX+TrDMABqDFYjRozgT3/6E08++STTp0+vf+dFRKRxqGhgnUwPO5s2bSI9PZ2SkhJiYmJ488036dKlCwC33norbdq0ISUlhY0bNzJp0iS2b9/OG2+8AUBOTo5P0AG8z3Nycmr9zBkzZjB16tRGOqLAER0dTfv27QF48cUXOe+885g3bx4jRow46eu2bdsGQFpa2gn7bDYb06ZNY/jw4YwZM8b/nRYRkfpT0cB6MT3sdOzYkQ0bNpCfn8+///1vhg0bxooVK+jSpQsjR470tuvWrRstW7bkyiuvZOfOnbRr167Bnzl58mQmTJjgfV5QUEBqamq9XhsVHkXR5KIGf/bpiAqPavBrrVYrDz74IBMmTODWW28lMrL21P/MM8/gdDrJyMiocf+NN97IU089FRKBUUQkYKloYL2ZHnbsdrt39qFXr1588cUX/OUvf+GFF144oW2fPn0A2LFj
B+3atSM5OZk1a9b4tMnNzQWodZ0PeK5ScjgcDeqvxWI5pVNJgeTGG29k4sSJzJ49m/vuuw+Aw4cPk5OTQ2lpKV9//TUvvPACb731Fi+//DJxcXG1vtfMmTPJzMxsop6LiIgPFQ08JQG3TNvtdtd6afOGDRsAaNnSk1zT09PZtGkTeXl53jZLlizB6XR6T4XJz2w2G2PGjGHWrFkUFxcDcMcdd9CyZUs6derE6NGjiYmJYc2aNdx6660nfa8rrriCK664Qpf5i4g0NRUNPGUWo2o1qgkmT57MwIEDad26NYWFhSxcuJAnn3ySDz74gLZt27Jw4UKuvvpqEhIS2LhxI+PHj6dVq1asWLEC8Cxq7tGjBykpKcyaNYucnByGDh3K7373u1NaPFtQUIDL5SI/Px+n0+mzr6SkhN27d5OWlkZERIRfj19qp3EXEamBYcAXozy1dKwO6Dsf2twcsrV0Tvb9XZ2pp7Hy8vK4/fbb2b9/Py6Xi+7du/PBBx9w1VVXsXfvXj766COeeeYZiouLSU1NZfDgwTz00EPe14eFhbF48WJGjx5Neno60dHRDBs2jMcee8zEoxIREWkkKhrYIKbO7AQKzewEHo27iMhxdi2Az+/w/Nx1CnR9KORr6dR3Zifg1uyIiIjIcVQ08LQo7IiIiAQyFQ08bQo7IiIigUpFA/1CYaeetLSpaWm8RSTkqWig3yjs1CEsLAyAsrIyk3sSWqrGu2r8RURCiooG+pXpFZQDnc1mIyoqigMHDhAeHo7VqnzY2NxuNwcOHCAqKgqbTX9FRSTEqGig3+mbpA4Wi4WWLVuye/duvvvuO7O7EzKsViutW7eu8c7rIiJByzBgbRb88I6naOD5c6DlALN71ewp7NSD3W6nQ4cOOpXVhOx2u2bRRCT0qGhgo1DYqSer1aridiIi0nh2LYCND3t+7vowdBgJFv2fPn/QKIqIiJhNRQMblcKOiIiImVQ0sNEp7IiIiJhFRQObhMKOiIiIGVQ0sMko7IiIiDS1E4oGvqiigY1IYUdERKQp1Vg0sK/ZvQpqCjsiIiJNRUUDTaGwIyIi0lRUNNAUCjsiIiJNQUUDTaNRFhERaWz73lPRQBMp7IiIiDSmg2th5Y0qGmgihR0REZHGUrQLVgxS0UCTKeyIiIg0BhUNDBgKOyIiIv6mooEBRWFHRETEn1Q0MOAo7IiIiPiLigYGJIUdERERf1HRwICksCMiIuIPKhoYsPRfQURE5HSpaGBAU9gRERE5HSoaGPAUdkRERBpKRQObBVPDzty5c+nevTtOpxOn00l6ejrvvfeed39JSQlZWVkkJCQQExPD4MGDyc3N9XmPPXv2MGjQIKKiokhMTGTixIlUVFQ09aGIiEioUdHAZsPUsNOqVStmzpzJunXrWLt2LVdccQXXXnstW7ZsAWD8+PG88847LFq0iBUrVrBv3z6uv/567+srKysZNGgQZWVlrFq1ipdeeokFCxYwZcoUsw5JRERCgYoGNisWwzAMsztRXXx8PE899RQ33HADZ555JgsXLuSGG24A4KuvvqJz585kZ2fTt29f3nvvPX75y1+yb98+kpKSAHj++eeZNGkSBw4cwG631+szCwoKcLlc5Ofn43Q6G+3YREQkCLgr4NPrPbV0wuOg30JIGWh2r0JSfb+/A2bNTmVlJa+99hrFxcWkp6ezbt06ysvLycjI8Lbp1KkTrVu3Jjs7G4Ds7Gy6devmDToAmZmZFBQUeGeHRERE/EZFA5slm9kd2LRpE+np6ZSUlBATE8Obb75Jly5d2LBhA3a7nbi4OJ/2SUlJ5OTkAJCTk+MTdKr2V+2rTWlpKaWlpd7nBQUFfjoaEREJaioa2CyZPrPTsWNHNmzYwOrVqxk9ejTDhg1j69atjfqZM2bMwOVyeR+pqamN+nkiIhIEVDSw2TL9v5Ldbqd9+/b06tWLGTNmcN555/GXv/yF5ORkysrKOHz4sE/73NxckpOTAUhOTj7h6qyq
51VtajJ58mTy8/O9j7179/r3oEREJLioaGCzZnrYOZ7b7aa0tJRevXoRHh7O0qVLvfu2b9/Onj17SE9PByA9PZ1NmzaRl5fnbbNkyRKcTiddunSp9TMcDof3cveqh4iISI1UNLDZM3XNzuTJkxk4cCCtW7emsLCQhQsXsnz5cj744ANcLhcjRoxgwoQJxMfH43Q6GTt2LOnp6fTt2xeA/v3706VLF4YOHcqsWbPIycnhoYceIisrC4fDYeahiYhIMFDRwKBgatjJy8vj9ttvZ//+/bhcLrp3784HH3zAVVddBcCf//xnrFYrgwcPprS0lMzMTObMmeN9fVhYGIsXL2b06NGkp6cTHR3NsGHDeOyxx8w6JBERCRYqGhg0Aq7OjhlUZ0dERHxUHIGlV8DB1Z6igRf/G1r0NbtXcpxmV2dHREQkILgr4LObPUEnPA76/F1Bp5lT2BEREamiooFBSWFHRESkiooGBiWFHREREVDRwCCm/4oiIiIqGhjUFHZERCS0qWhg0FPYERGR0KWigSFBYUdEREKTigaGDIUdEREJPRVHYMU1UPiNp2hg3xfB1dHsXkkjUdgREZHQoqKBIUdhR0REQoeKBoYkhR0REQkdKhoYkhR2REQkNKhoYMjSf2UREQl+KhoY0hR2REQkuKloYMhT2BERkeClooGCwo6IiAQrFQ2UYxR2REQk+KhooFSjsCMiIsFFRQPlOAo7IiISPFQ0UGqgsCMiIsFDRQOlBgo7IiISHFQ0UGqhvwUiItL8qWignITCjoiING8qGih1UNgREZHmS0UDpR4UdkREpHlS0UCpJ4UdERFpflQ0UE6Bwo6IiDQvKhoop0hhR0REmg8VDZQGUNgREZHmQ0UDpQEUdkREpHlQ0UBpIFP/lsyYMYPzzz+f2NhYEhMTue6669i+fbtPm8suuwyLxeLzGDVqlE+bPXv2MGjQIKKiokhMTGTixIlUVFQ05aGIiEhjUtFAOQ02Mz98xYoVZGVlcf7551NRUcGDDz5I//792bp1K9HR0d52d955J4899pj3eVRUlPfnyspKBg0aRHJyMqtWrWL//v3cfvvthIeHM3369CY9HhERaQQqGiinyWIYhmF2J6ocOHCAxMREVqxYwSWXXAJ4ZnZ69OjBM888U+Nr3nvvPX75y1+yb98+kpKSAHj++eeZNGkSBw4cwG631/m5BQUFuFwu8vPzcTqdfjseERE5TUW74MN0Ty2dFhfCxf9WLR3xqu/3d0Cd7MzPzwcgPj7eZ/srr7xCixYt6Nq1K5MnT+bIkSPefdnZ2XTr1s0bdAAyMzMpKChgy5YtTdNxERHxPxUNFD8x9TRWdW63m3HjxtGvXz+6du3q3X7rrbfSpk0bUlJS2LhxI5MmTWL79u288cYbAOTk5PgEHcD7PCcnp8bPKi0tpbS01Pu8oKDA34cjIiKnQ0UDxY8CJuxkZWWxefNmVq5c6bN95MiR3p+7detGy5YtufLKK9m5cyft2rVr0GfNmDGDqVOnnlZ/RUSkkahooPhZQJzGGjNmDIsXL2bZsmW0atXqpG379OkDwI4dOwBITk4mNzfXp03V8+Tk5BrfY/LkyeTn53sfe/fuPd1DEBERf1DRQGkEpoYdwzAYM2YMb775Jh9//DFpaWl1vmbDhg0AtGzpOW+bnp7Opk2byMvL87ZZsmQJTqeTLl261PgeDocDp9Pp8xARkQCgooHSCEw9jZWVlcXChQt5++23iY2N9a6xcblcREZGsnPnThYuXMjVV19NQkICGzduZPz48VxyySV0794dgP79+9OlSxeGDh3KrFmzyMnJ4aGHHiIrKwuHw2Hm4YmIyKnYOV9FA6VRmHrpuaWWtD5//nyGDx/O3r17ue2229i8eTPFxcWkpqby61//moceeshnNua7775j9OjRLF++nOjoaIYNG8bMmTOx2eqX5XTpuYiIyfa951mQbFR6igb2eka1dKRO9f3+Dqg6O2ZR2BERMdHBtbD0
MqgohrOuhfSXwO4yu1fSDDTLOjsiIhJiinbBikGeoNPiQrhgroKO+J3CjoiImENFA6WJKOyIiEjTU9FAaUIKOyIi0rRUNFCamMKOiIg0HRUNFBMo7IiISNNR0UAxgcKOiIg0DRUNFJPob5mIiDS+fe/Bmjs9P7cbCV0eAGu4uX2SkKGwIyIijevgWlh5o6c68lnXQs9Zqo4sTUphR0REGo+KBkoAUNgREZHGoaKBEiAUdkRExP9UNFACiMKOiIj4l4oGSoBR2BEREf9R0UAJQAo7IiLiPyoaKAFIYUdERPxDRQMlQOlvoYiInD4VDZQAprAjIiKnR0UDJcAp7IiISMOpaKA0Awo7IiLSMCoaKM2Ewo6IiJw6FQ2UZkRhR0RETo2KBkozo7AjIiL1p6KB0gwp7IiISP2paKA0Qwo7IiJSPyoaKM2U/paKiEjdVDRQmjGFHREROTkVDZRmrkFh56WXXuK///2v9/n9999PXFwcF154Id99953fOiciIiZT0UAJAg0KO9OnTycy0pPqs7OzmT17NrNmzaJFixaMHz/erx0UERGTqGigBAlbQ160d+9e2rdvD8Bbb73F4MGDGTlyJP369eOyyy7zZ/9ERMQMKhooQaRBMzsxMTEcPHgQgA8//JCrrroKgIiICI4ePeq/3omISNNT0UAJMg0KO1dddRW/+93v+N3vfsfXX3/N1VdfDcCWLVto06ZNvd9nxowZnH/++cTGxpKYmMh1113H9u3bfdqUlJSQlZVFQkICMTExDB48mNzcXJ82e/bsYdCgQURFRZGYmMjEiROpqKhoyKGJiIQ2FQ2UINSgsDN79mzS09M5cOAA/+///T8SEhIAWLduHbfeemu932fFihVkZWXx+eefs2TJEsrLy+nfvz/FxcXeNuPHj+edd95h0aJFrFixgn379nH99dd791dWVjJo0CDKyspYtWoVL730EgsWLGDKlCkNOTQRkdCmooEShCyGYRgNeWFJSQkbN24kLy8Pt9vts+9Xv/pVgzpz4MABEhMTWbFiBZdccgn5+fmceeaZLFy4kBtuuAGAr776is6dO5OdnU3fvn157733+OUvf8m+fftISkoC4Pnnn2fSpEkcOHAAu91e5+cWFBTgcrnIz8/H6XQ2qO8iIs3ezvmw+reen7tOga4PqZaOBLT6fn83aIHy+++/z+23387Bgwc5PitZLBYqKysb8rbk5+cDEB8fD3hmisrLy8nIyPC26dSpE61bt/aGnezsbLp16+YNOgCZmZmMHj2aLVu20LNnzxM+p7S0lNLSUu/zgoKCBvVXRCRoqGigBLEGncYaO3YsN954I/v27cPtdvs8Ghp03G4348aNo1+/fnTt2hWAnJwc7HY7cXFxPm2TkpLIycnxtqkedKr2V+2ryYwZM3C5XN5Hampqg/osIhIUVDRQglyDwk5ubi4TJkw4IWScjqysLDZv3sxrr73mt/eszeTJk8nPz/c+9u7d2+ifKSISkFQ0UEJAg8LODTfcwPLly/3WiTFjxrB48WKWLVtGq1atvNuTk5MpKyvj8OHDPu1zc3NJTk72tjn+6qyq51VtjudwOHA6nT4PEZGQo6KBEiIatGbnr3/9KzfeeCOffvop3bp1Izzc97zu3XffXa/3MQyDsWPH8uabb7J8+XLS0tJ89vfq1Yvw8HCWLl3K4MGDAdi+fTt79uwhPT0dgPT0dKZNm0ZeXh6JiYkALFmyBKfTSZcuXRpyeCIiwU9FAyWENOhqrHnz5jFq1CgiIiJISEjAUu2yRIvFwq5du+r1PnfddRcLFy7k7bffpmPHn3/JXC6X93YUo0eP5t1332XBggU4nU7Gjh0LwKpVqwDPpec9evQgJSWFWbNmkZOTw9ChQ/nd737H9OnT69UPXY0lIiHFXQGfXu+ppRMeB/0WQspAs3slcsrq+/3doLCTnJzM3XffzQMPPIDV2vAbp1tqqd0wf/58hg8fDngucb/33nt59dVXKS0tJTMz
kzlz5vicovruu+8YPXo0y5cvJzo6mmHDhjFz5kxstvpNXCnsiEjIMAz4YpSnlo7VAX3nQ5ubVUtHmqVGDTvx8fF88cUXtGvX7rQ6GSgUdkQkZGx+AjY+DFig13NwzmiwNPz/tIqYqb7f3w36Gz5s2DBef/31BndORERMsHP+saADdH0YOoxU0JGQ0KAFypWVlcyaNYsPPviA7t27n7BA+emnn/ZL50RExE9UNFBCWIPCzqZNm7yViTdv3uyzr7Z1OCIiYhIVDZQQ16Cws2zZMn/3Q0REGoOKBoo0bM2OiIg0AyoaKAIo7IiIBCcVDRTxUtgREQk27gr47GY4uNpTNLDP36FFX7N7JWIahR0RkWBiGLA2y1Md2eqA8+dAywFm90rEVAo7IiLBZMs0T3VkLNDzT9DmN6qOLCFPYUdEJFioaKBIjfRbICISDFQ0UKRWCjsiIs2digaKnJTCjohIc6aigSJ1UtgREWmuVDRQpF4UdkREmiMVDRSpN4UdEZHmRkUDRU6Jwo6ISHOiooEip0xhR0SkOVHRQJFTprAjItJcqGigSIPot0REpDlQ0UCRBlPYEREJdCoaKHJaFHZERAKZigaKnDaFHRGRQKWigSJ+obAjIhKIVDRQxG8UdkREAo2KBor4lcKOiEggUdFAEb9T2BERCSQqGijidwo7IiKBQkUDRRqFfotERAKBigaKNBqFHRERs6looEijMjXsfPLJJ1xzzTWkpKRgsVh46623fPYPHz4ci8Xi8xgwwHeh3qFDhxgyZAhOp5O4uDhGjBhBUVFREx6FiMhpUNFAkUZnatgpLi7mvPPOY/bs2bW2GTBgAPv37/c+Xn31VZ/9Q4YMYcuWLSxZsoTFixfzySefMHLkyMbuuojI6VPRQJEmYTPzwwcOHMjAgQNP2sbhcJCcnFzjvm3btvH+++/zxRdf0Lt3bwCee+45rr76av74xz+SkpLi9z6LiPiFigaKNJmAX7OzfPlyEhMT6dixI6NHj+bgwYPefdnZ2cTFxXmDDkBGRgZWq5XVq1eb0V0RkbqpaKBIkzJ1ZqcuAwYM4PrrryctLY2dO3fy4IMPMnDgQLKzswkLCyMnJ4fExESf19hsNuLj48nJyan1fUtLSyktLfU+LygoaLRjEBHxoaKBIk0uoMPOzTff7P25W7dudO/enXbt2rF8+XKuvPLKBr/vjBkzmDp1qj+6KCJyalQ0UKTJBfxprOratm1LixYt2LFjBwDJycnk5eX5tKmoqODQoUO1rvMBmDx5Mvn5+d7H3r17G7XfIiKAigaKmKRZ/ZZ9//33HDx4kJYtPVcrpKenc/jwYdatW+dt8/HHH+N2u+nTp0+t7+NwOHA6nT4PEZFGpaKBIqYx9TRWUVGRd5YGYPfu3WzYsIH4+Hji4+OZOnUqgwcPJjk5mZ07d3L//ffTvn17MjMzAejcuTMDBgzgzjvv5Pnnn6e8vJwxY8Zw880360osEQkcKhooYiqLYRiGWR++fPlyLr/88hO2Dxs2jLlz53Ldddexfv16Dh8+TEpKCv379+fxxx8nKSnJ2/bQoUOMGTOGd955B6vVyuDBg3n22WeJiYmpdz8KCgpwuVzk5+drlkdE/KtoF3yY7qml0+JCuGgRROn/jIn4Q32/v00NO4FCYUdEGkXJAVjSz1NLx9kZLn5TtXRE/Ki+39/Nas2OiEizoaKBIgFDYUdExN9UNFAkoCjsiIj4k4oGigQchR0REX9S0UCRgKOwIyLiLyoaKBKQ9FsoIuIPKhooErAUdkRETpeKBooENIUdEZHTUbQLVgyCimJP0cDz54DdZXavRKQahR0RkYYqOQDLBniqIzs7Q58XVR1ZJAAp7IiINISKBoo0Gwo7IiKnSkUDRZoVhR0RkVOhooEizY7CjojIqVDRQJFmR2FHRKS+VDRQpFnSb6mISH2oaKBIs6WwIyJSFxUNFGnWFHZERE5GRQNFmj2FHRGR2qhooEhQUNgREamJigaKBA2F
HRGR46looEhQUdgREalORQNFgo7CjohIdSoaKBJ0FHZERKqoaKBIUNJvsYgIqGigSBBT2BERUdFAkaCmsCMioU1FA0WCnsKOiIQuFQ0UCQkKOyISmlQ0UCRkKOyISOhR0UCRkKKwIyKhRUUDRUKOwo6IhBYVDRQJOaaGnU8++YRrrrmGlJQULBYLb731ls9+wzCYMmUKLVu2JDIykoyMDL755hufNocOHWLIkCE4nU7i4uIYMWIERUVFTXgUItJsqGigSEgy9be8uLiY8847j9mzZ9e4f9asWTz77LM8//zzrF69mujoaDIzMykpKfG2GTJkCFu2bGHJkiUsXryYTz75hJEjRzbVIYhIc6GigSIhy2IYhmF2JwAsFgtvvvkm1113HeCZ1UlJSeHee+/lvvvuAyA/P5+kpCQWLFjAzTffzLZt2+jSpQtffPEFvXv3BuD999/n6quv5vvvvyclpX6XkBYUFOByucjPz8fpdDbK8YmIiQ6uhaWXeWrpnHUtpL+kWjoiQaC+398BO3+7e/ducnJyyMjI8G5zuVz06dOH7OxsALKzs4mLi/MGHYCMjAysViurV69u8j6LSABS0UCRkGczuwO1ycnJASApKclne1JSkndfTk4OiYmJPvttNhvx8fHeNjUpLS2ltLTU+7ygoMBf3RaRQKKigSJCAM/sNKYZM2bgcrm8j9TUVLO7JCL+pqKBInJMwIad5ORkAHJzc3225+bmevclJyeTl5fns7+iooJDhw5529Rk8uTJ5Ofnex979+71c+9FxFQlefDpDSoaKCJAAIedtLQ0kpOTWbp0qXdbQUEBq1evJj09HYD09HQOHz7MunXrvG0+/vhj3G43ffr0qfW9HQ4HTqfT5yEiQaC8EDY+Cv9pC/vfU9FAEQFMXrNTVFTEjh07vM93797Nhg0biI+Pp3Xr1owbN44nnniCDh06kJaWxsMPP0xKSor3iq3OnTszYMAA7rzzTp5//nnKy8sZM2YMN998c72vxBKRIFBZCjtegM1PQOkBzzZXVzj3D9DmJhUNFAlxpoadtWvXcvnll3ufT5gwAYBhw4axYMEC7r//foqLixk5ciSHDx/moosu4v333yciIsL7mldeeYUxY8Zw5ZVXYrVaGTx4MM8++2yTH4uImMBdCd8thI1ToPhbz7bos6HjeGg7TFddiQgQQHV2zKQ6OyLNjGHAvnfhy8lweJNnm+NMOGcstB8JkUknf72IBIX6fn8H7KXnIiI1OrAKNjwABz71PLfFegJOh7sgtq25fRORgKSwIyLNw+Et8OWD8MN/PM+tDkgbCufcDXFdtS5HRGqlsCMiga14D2x6BHa/DIYbsELqYOh0D7RI1408RaROCjsiEphKD8KW6fD1bHAfq3ie3B86jYPkq8Cqf75EpH70r4WIBJaKYvjqGdg2C8qP3col4QLodB+0+hWEOUztnog0Pwo7IhIY3OWw8x+w6TEoOXZvO2dn6DQB2twM4THm9k9Emi2FHRExl+GGPYvgy4eg6FiR0ahU6DgO2t4BjjNM7Z6INH8KOyJiDsOAnCWwYTL89D/PNnsCnJMF7X+vu5OLiN8o7IhI0zv4hadWTu7Hnue2aGg7AjqOhdj25vZNRIKOwo6INJ2C7Z7TVXv/7XluDYc2t0LHe+CMHqqVIyKNQmFHRBrfkR9g01TY9SIYlYAFWl3nCTlnXgTWMLN7KCJBTGFHRBpP2U+w9UnY/ixUHvVsS7ocOk6AlEzPzI6ISCNT2BER/6s4Cl8/B1tnegIPwBm/gM73Qur1EBZhbv9EJKQo7IiI/7grYNcC2PQoHP3Bsy2mA3SeAG2GgD3WzN6JSIhS2BGR02cY8P2bnht1Fmz3bItM8dyks/3vwJFgbv9EJKQp7IjI6cld5rmM/OAaz/PwOOgwGjrcBdGtTO2aiAgo7IhIQx1aD19Ohv0feJ6HRUHb4XDOWHB21GXkIhIwFHZE5NQU7oSND8N3r3qeW2zQ+jeey8gT
eoHFam7/RESOo7AjIvVzNAc2PwE7XgCjwrMt5Zeee1glXQpW/XMiIoFJ/zqJyMmVF8DWp2D7n6Gi2LPtzIuh072QMhDC7Ob2T0SkDgo7IlKzylL4Zg5smQalBz3b4rpDp/ug9Q1gizS3fyIi9aSwIyK+3JXw7T9h4xQ4ssezLToNOk2AtNvB7jS3fyIip0hhR0Q8DAN+WOyplZO/2bMtIslzdVW7OyEy0dz+iYg0kMKOiEDeSvjyATjwmed5uAvaj/TUyok529SuiYicLoUdkVB2eDNsmAz7FnueWyMgbSh0vBtc56pWjogEBYUdkVBU/J1nTc7u/wMMsIRB6mDoNA4S+qhWjogEFYUdkVBS8qPn6qpv5oC7zLMtOdMTcpIzVCtHRIKS/mUTCQXlRfDVn2HbU1BR6NmW0Bc63wtnXQNhDnP7JyLSiBR2RIJZZRns+BtseRxK8jzbnF2g833Q+kYIjzG3fyIiTUBhRyQYGW747jXPPayKdnm2RbXxnK5KGwaOM0ztnohIU1LYEQkmhuG5C/mXk+GnDZ5tjjM9l5C3/z1EtTS1eyIiZgjoSy4effRRLBaLz6NTp07e/SUlJWRlZZGQkEBMTAyDBw8mNzfXxB6LmOjH1bD0Clg+0BN0bDFwzjjonw3dH1XQEZGQFfAzO+eeey4fffSR97nN9nOXx48fz3//+18WLVqEy+VizJgxXH/99Xz22WdmdFXEHPlfeaoef/+m57nVDm2GQKd7PPeyUq0cEQlxAR92bDYbycnJJ2zPz89n3rx5LFy4kCuuuAKA+fPn07lzZz7//HP69u3b1F0VaVpHvodNj8Ku+Z41Olih1XXQcRyceSFYw8ztn4hIgAjo01gA33zzDSkpKbRt25YhQ4awZ4/nxoTr1q2jvLycjIwMb9tOnTrRunVrsrOzT/qepaWlFBQU+DxEmo3SQ7D+fninA+yc5wk6SVfCpf+Bi16DpIsVdEREqgnomZ0+ffqwYMECOnbsyP79+5k6dSoXX3wxmzdvJicnB7vdTlxcnM9rkpKSyMnJOen7zpgxg6lTpzZiz0UaQcUR2P4sbH0Syg97tsX3hk73Qup1EBZhZu9ERAJWQIedgQMHen/u3r07ffr0oU2bNvzrX/8iMjKywe87efJkJkyY4H1eUFBAamrqafVVpNG4yz2nqjZNhaP7PNtiz4FO90Gbm8Eea27/REQCXECHnePFxcVxzjnnsGPHDq666irKyso4fPiwz+xObm5ujWt8qnM4HDgcqhgrAc4wYO+/4cuHoPBrz7bIs6DjPdDut+BIMLd/IiLNRMCv2amuqKiInTt30rJlS3r16kV4eDhLly717t++fTt79uwhPT3dxF6K+EHOUvjgAlh5kyfo2OOhy4PQ/3PoMlFBR0TkFAT0zM59993HNddcQ5s2bdi3bx+PPPIIYWFh3HLLLbhcLkaMGMGECROIj4/H6XQyduxY0tPTdSWWNF+H/gcbHoCcJZ7nYVHQ9g44525wnWNu30REmqmADjvff/89t9xyCwcPHuTMM8/koosu4vPPP+fMM88E4M9//jNWq5XBgwdTWlpKZmYmc+bMMbnXIg1QuMNzumrP657nlnDPepyOd0P8L8DSrCZhRUQCisUwDMPsTpitoKAAl8tFfn4+TqfT7O5IKDm6HzY9Bjv/AUYFYPHchbzjPZB4qS4hFxE5ifp+fwf0zI5I0CrLh22z4KtnoPKIZ9uZl0LnCdByAITZTe2eiEgwUdgRaUqVJfD1bNgyHcoOebbFnQedJ0Lq9WBreEkFERGpmcKOSFNwV8Lul2HTI3Bkr2dbTDvoNAHOvg3sOn0qItJYFHZEGpNhwA//8dyoM3+rZ1tESzhnDHQYCY4W5vZPRCQEKOyINJa8TzyXkf947F5t4S5o/3vocBfEtDG3byIiIURhR8TfftoIX06Gfe96nlsjIG0YdBwLri5gsZjbPxGREKOwI+IvRbth4xT49hXAAEsYpN4IncZBwvmq
lSMiYhKFHZHTVZIHm6fBjrmem3YCtLzaUysn+Qqw6tdMRMRM+ldYpKHKC2Hbn+CrP0FFkWdbiwuh071w1iAI081mRUQCgcKOyKmqLIUdL8DmJ6D0gGebqyt0vg9a3wi2KHP7JyIiPhR2ROrLcMO3C2Hjw1D8rWdbdBvoOB7aDge7y8zeiYgElEp3JYVlhRSVFVFYWkj7+PaEh4Wb0heFHZG6GAbse89zhdXhjZ5tjjOhQxZ0+D1EJpvbPxERP6hwV3iDSfWQUlhWWOu2ovLa2xytOOrz/jvH7qRtfFtTjk1hR+RkDmTDlw94auYA2GKh3Z1wThbEmvNLKyICnnBSWHosXJwkkJywrZY2JRUljdLPcGs40eHRFJYVNsr714fCjkhN8rd6qh5//7bnudXhua1Dx7shrptq5YjIKasKJ95ZkfqGlFraNGo4sUcTFR5FdHg00fZo759V22LsMUSHH3t+bL/T4STWEYvT4cTlcBEXEUdcRBxR4VGEh4VjN/EGxwo7IuC5d9VP6yH3Y8hZCrkfedboYIXUX0PHcdAiHaxhZvdURJpIeWX5CWGjzpBykjaNHU6ODyRV26LCo7zhpPo2p8PpfbgiXN6AEhUeRZg1DJvVRpglDEsQ/J87hR0JTYYB+Vs84Sb3Y8hdDuX5vm2SMjw36myZAVZzFtWJSP2VV5bXGjZqDSknaVNaWdoo/bSH2U86a1I1c1J91iTafmzmxB7rDSdxjjhcES6iwqM8wcQaFjThxN8UdiQ0GAYU7Tw2c3Ms4FRdNl7FFgsJF0BCH0i8BJIuhbAIc/orEgLKKstOXPB6sgBSR5vGDCe1zZr4zJxUmzWpCidOhxOn3Ykzwkmcw3NaJzI8UuGkiSnsSPAq3gu5y36evTmy13d/WCTE94KEvp5TVEmXgD1et3UQqUVZZVm9Z01qulLn+DZllWWN0k9HmOOEWZHqsybRdt+wUvVz9fUmToeTuIg4XA6XTzixqSJ6s6T/ahI8SvI8p6Oqwk3hN777reEQ1wNa9PGEm8TLICJJ63AkaFWFE3+d2mnMcHL8rEitMynVQkr1NSc1hZOqgCKisCPNV9lhzyXhVeHm8KbjGlghrmu1mZvLIKqV7lUlAau0orTmWiYNPLVTXnWvNj+rHk7qG1Ji7DE+602qZk8UTqQp6F99aT4qiuHAZz+vu/lp3bErpqqJ7egJN2ceCzfRaWDi5Y4S3KrCSa0F144PIHW0aaxwEmGLqDGQnCykxNhjiHXE+oSSuIg4nA4nEbYIhRNpVhR2JHBVlsLB1T8vKD74+c93Fa8SneZZUNwiHZIuB2cHLSqWGhmGQWllaf2qwh43a1Jbm8YOJ6d6aqcqnLgcLp/ZE4UTCXUKOxI43BVwaN2x01LL4MBKqPQtN05ES2jRFxLSPVdLxXXVjTeDWPVLiev1Zw3bqoeUCndFo/Qz0hZ5SrMm0fZoYsJjfBbEuiJ+njlxhDkUTqTZMgyoqICyMigt9fxZVgYtW0K4SVU8FHbEPIbbs86m6rRU3gqoOK6cuD3h2Jqbvp7Lwc/oAeGxqmAcoNyGm+KyYgpKC047pDRmnZNIW2StgaS2kBITHuNTgK36glh7mF3hRBqdYUBlpW+AqO1RVxt/vMfJ2tRkxw5o165px6yKwo40HcOAwq+r1bpZBmUHfduEOyH+Ak+4ObOfJ+jYXQo3jcQwDEoqSvwSTKpmURpD9Ton9f2zauak+imdqj8dNgdhljCFE/GqmokwKxzUd79hmD1SDRMeDsXF5n2+wo40ruLvfl5zk/sxHN3nuz8sEuJ7H1tUfKEn4DgSVOvmJPxxaqf6n5VGpd/7GGYJqz2InCSknHBap4YibKpz0rxUVpobDurbxu2u+1gCUXj4zw+73fd5bdtqa3OytlX7HA7Pzw7HiT9HRPy8rerniAjP86p2ZtG/GuJfR3N8C/kV7fLdb7V7TkVVXQ6eeClEJAZ1rRu34a5xUWygndqptZ7JcX9Wv8dOjD3G
J5xUXbETHR5NeFi499SOKsT6n9sdmKcvjt8fDCGiIcHhZNtqalM9OFQPC9Ufx4cIhwPCwjwT31ar70O/cr4UduT0lB7yrLWpCjf5W333W8LA1c1zxdSZx8JN1FkBfa8pwzA4WnHUb+GkuLxx5m4bemrHWyH2WPl6Z4Rn7Un1dSfWEJ5Zc7uhvLzpA8Spvkel/yfkmoTNdnrB4fj9dbU5PizUFCAcDoiM9J2JCAurOUBYQ/dXo1lT2JH6c1fAke89gSZv2bFaN+uB6ieRLeDsdOxy8AuP1bpp0+i1bo6vd1LXpcUnCydFZUUBdWrn+DsTH39qp7msO6kpRDTkC9/f7Y5vU9E4F2w1uqoQ0dDg0NDTGTUFiOqP409n2Gw1BwiFCGlMCjvyM8OA0h+haLfn9FPxbt+fi/eAUcM3QUy7arVuLoPY9nXWuqlwV5zSzMnx99hpqkqxDTm1E2uPPWFhbFxEHDH2GO+pHX/e/M8wzAkJp9quuYaIsLCmO51R07qI6usgalsbERHhGyKqBwiLRac0RIIm7MyePZunnnqKnJwczjvvPJ577jkuuOACs7sVeCqKjwWYWgJNRc2nXNwGFLmhEBuF9iQKoztQGNORwuj2FNrOoLCijMIfCyn84fUaZ0+OryRbUlHSKIcXYYuod0CpHlSq7krsW5AtDtzhuCtsVJbbqCi3nvxLvdhzVq/q+Q9lsLsJwkR54+S8RldTiDiVUFF9X23tjt9eNbtQ22mNquBQfW1EePiJAUIhQqR5CYqw8/rrrzNhwgSef/55+vTpwzPPPENmZibbt28nMTHR7O41LXc5HNmLUbiL4sNfUVjwDYX5Oygs3ENR8fcUlhym0IBC90kelnAKDRuFhpXCSjeFlRUcqaz6Rq0Afjj2WH7a3Q23hhNliyHSFk1kWDQRYTE4LFFEWGNwWGKwE004MdiNGGxGLOGVMYS5YwivdGGrjMXmdmErdxFW4cRa7qKy3E5FmYWKcivlx8JJVWAoLoOfThIijt/WXEOE1Vq/mYf6zj7UZ0airnURNZ3SCA+vfWGlQoSI+JPFMJrrVfs/69OnD+effz5//etfAXC73aSmpjJ27FgeeOCBOl9fUFCAy+UiPz8fp9Ppt379eORHDh45yMGjB/mx+EdyinI4WHKQ4rJi70xH9bUkle5KLFiwWI49vP9zYzEqsRgVnj/dFcd+9jzKK45SWFZEYflRCisqKHRDkeG7ksZfLEYYNncMNncsYZUxhFXEYK2IxVoRg6UsFkt5LEZpDJTGYpTG4i6JxTgaS8VRJ+5iJxVHYqkoduI+6oSyWKhsPvetsljqt6bhVGcfTrZW4mRXaNR0ZcbxpzOOP62hECEiwaS+39/NfmanrKyMdevWMXnyZO82q9VKRkYG2dnZJvYMzp7eg+LwH0ztA4YFSmM9waLqz7KYE7fV9udxbY2KCMqx4O9Jj8aYdTjZbET19RDVw8TxwaF6uKg6naEQISLSvDT7sPPjjz9SWVlJUlKSz/akpCS++uqrGl9TWlpKaenPtUoKCgoapW8V+S6IKYIjCXA0Ho4mQEkclDqrPWI9f5bFQmU4WAzAqOVPat7nttUaWKzuSOx2C3bvl77l5LMLkXXPVNhsNc9Y1La4sj7rInSFhoiINJZmH3YaYsaMGUydOrXRP+f1ljdx5KfDRMSUEpNYRlREBdawcMqNCCqNMMoMNxWVpZQbhZRXVniuYLJFYQ2PhPAYrI5YsDmxRriw2mIIC7dhDQvDEhbmEw7CwnxnIWpaXKkrNEREJFQ1+7DTokULwsLCyM3N9dmem5tLcnJyja+ZPHkyEyZM8D4vKCggNTXV73279sFH/P6eIiIicmqa/UkCu91Or169WLp0qXeb2+1m6dKlpKen1/gah8OB0+n0eYiIiEhwavYzOwATJkxg2LBh9O7dmwsuuIBnnnmG4uJi7rjjDrO7JiIiIiYLirDzm9/8hgMHDjBlyhRycnLo0aMH77///gmLlkVE
RCT0BEWdndPVWHV2REREpPHU9/u72a/ZERERETkZhR0REREJago7IiIiEtQUdkRERCSoKeyIiIhIUFPYERERkaCmsCMiIiJBTWFHREREgprCjoiIiAQ1hR0REREJakFxb6zTVXXHjIKCApN7IiIiIvVV9b1d152vFHaAwsJCAFJTU03uiYiIiJyqwsJCXC5Xrft1I1DA7Xazb98+YmNjsVgsp/VeBQUFpKamsnfvXt1U9BiNyYk0JjXTuJxIY3IijUnNQnFcDMOgsLCQlJQUrNbaV+ZoZgewWq20atXKr+/pdDpD5i9bfWlMTqQxqZnG5UQakxNpTGoWauNyshmdKlqgLCIiIkFNYUdERESCmsKOnzkcDh555BEcDofZXQkYGpMTaUxqpnE5kcbkRBqTmmlcaqcFyiIiIhLUNLMjIiIiQU1hR0RERIKawo6IiIgENYUdP5o9ezZnn302ERER9OnThzVr1pjdpQaZMWMG559/PrGxsSQmJnLdddexfft2nzYlJSVkZWWRkJBATEwMgwcPJjc316fNnj17GDRoEFFRUSQmJjJx4kQqKip82ixfvpxf/OIXOBwO2rdvz4IFC07oTyCO68yZM7FYLIwbN867LVTH5IcffuC2224jISGByMhIunXrxtq1a737DcNgypQptGzZksjISDIyMvjmm2983uPQoUMMGTIEp9NJXFwcI0aMoKioyKfNxo0bufjii4mIiCA1NZVZs2ad0JdFixbRqVMnIiIi6NatG++++27jHPRJVFZW8vDDD5OWlkZkZCTt2rXj8ccf9ylnHwpj8sknn3DNNdeQkpKCxWLhrbfe8tkfSGNQn774w8nGpLy8nEmTJtGtWzeio6NJSUnh9ttvZ9++fT7vEWxj0mQM8YvXXnvNsNvtxosvvmhs2bLFuPPOO424uDgjNzfX7K6dsszMTGP+/PnG5s2bjQ0bNhhXX3210bp1a6OoqMjbZtSoUUZqaqqxdOlSY+3atUbfvn2NCy+80Lu/oqLC6Nq1q5GRkWGsX7/eePfdd40WLVoYkydP9rbZtWuXERUVZUyYMMHYunWr8dxzzxlhYWHG+++/720TiOO6Zs0a4+yzzza6d+9u3HPPPd7toTgmhw4dMtq0aWMMHz7cWL16tbFr1y7jgw8+MHbs2OFtM3PmTMPlchlvvfWW8eWXXxq/+tWvjLS0NOPo0aPeNgMGDDDOO+884/PPPzc+/fRTo3379sYtt9zi3Z+fn28kJSUZQ4YMMTZv3my8+uqrRmRkpPHCCy9423z22WdGWFiYMWvWLGPr1q3GQw89ZISHhxubNm1qmsE4Ztq0aUZCQoKxePFiY/fu3caiRYuMmJgY4y9/+Yu3TSiMybvvvmv84Q9/MN544w0DMN58802f/YE0BvXpS2OPyeHDh42MjAzj9ddfN7766isjOzvbuOCCC4xevXr5vEewjUlTUdjxkwsuuMDIysryPq+srDRSUlKMGTNmmNgr/8jLyzMAY8WKFYZheH4pw8PDjUWLFnnbbNu2zQCM7OxswzA8v9RWq9XIycnxtpk7d67hdDqN0tJSwzAM4/777zfOPfdcn8/6zW9+Y2RmZnqfB9q4FhYWGh06dDCWLFliXHrppd6wE6pjMmnSJOOiiy6qdb/b7TaSk5ONp556yrvt8OHDhsPhMF599VXDMAxj69atBmB88cUX3jbvvfeeYbFYjB9++MEwDMOYM2eOccYZZ3jHqeqzO3bs6H1+0003GYMGDfL5/D59+hi///3vT+8gT9GgQYOM3/72tz7brr/+emPIkCGGYYTmmBz/xR5IY1CfvjSGmgLg8dasWWMAxnfffWcYRvCPSWPSaSw/KCsrY926dWRkZHi3Wa1WMjIyyM7ONrFn/pGfnw9AfHw8AOvWraO8vNzneDt16kTr1q29x5udnU23bt1ISkrytsnMzKSgoIAtW7Z421R/j6o2Ve8RiOOalZXFoEGDTuh3qI7Jf/7zH3r37s2NN95I
YmIiPXv25O9//7t3/+7du8nJyfHpr8vlok+fPj7jEhcXR+/evb1tMjIysFqtrF692tvmkksuwW63e9tkZmayfft2fvrpJ2+bk41dU7nwwgtZunQpX3/9NQBffvklK1euZODAgUBojsnxAmkM6tMXs+Tn52OxWIiLiwM0JqdDYccPfvzxRyorK32+xACSkpLIyckxqVf+4Xa7GTduHP369aNr164A5OTkYLfbvb+AVaofb05OTo3jUbXvZG0KCgo4evRowI3ra6+9xv/+9z9mzJhxwr5QHZNdu3Yxd+5cOnTowAcffMDo0aO5++67eemll4Cfj+tk/c3JySExMdFnv81mIz4+3i9j19Tj8sADD3DzzTfTqVMnwsPD6dmzJ+PGjWPIkCE+/Q2lMTleII1BffpihpKSEiZNmsQtt9zivc9VqI/J6dCNQOWksrKy2Lx5MytXrjS7K6bau3cv99xzD0uWLCEiIsLs7gQMt9tN7969mT59OgA9e/Zk8+bNPP/88wwbNszk3pnjX//6F6+88goLFy7k3HPPZcOGDYwbN46UlJSQHRM5NeXl5dx0000YhsHcuXPN7k5Q0MyOH7Ro0YKwsLATrrzJzc0lOTnZpF6dvjFjxrB48WKWLVvmc1f45ORkysrKOHz4sE/76sebnJxc43hU7TtZG6fTSWRkZECN67p168jLy+MXv/gFNpsNm83GihUrePbZZ7HZbCQlJYXcmAC0bNmSLl26+Gzr3Lkze/bsAX4+rpP1Nzk5mby8PJ/9FRUVHDp0yC9j19TjMnHiRO/sTrdu3Rg6dCjjx4/3zgiG4pgcL5DGoD59aUpVQee7775jyZIlPncvD9Ux8QeFHT+w2+306tWLpUuXere53W6WLl1Kenq6iT1rGMMwGDNmDG+++SYff/wxaWlpPvt79epFeHi4z/Fu376dPXv2eI83PT2dTZs2+fxiVv3iVn05pqen+7xHVZuq9wikcb3yyivZtGkTGzZs8D569+7NkCFDvD+H2pgA9OvX74SyBF9//TVt2rQBIC0tjeTkZJ/+FhQUsHr1ap9xOXz4MOvWrfO2+fjjj3G73fTp08fb5pNPPqG8vNzbZsmSJXTs2JEzzjjD2+ZkY9dUjhw5gtXq+09rWFgYbrcbCM0xOV4gjUF9+tJUqoLON998w0cffURCQoLP/lAcE78xe4V0sHjttdcMh8NhLFiwwNi6dasxcuRIIy4uzufKm+Zi9OjRhsvlMpYvX27s37/f+zhy5Ii3zahRo4zWrVsbH3/8sbF27VojPT3dSE9P9+6vusy6f//+xoYNG4z333/fOPPMM2u8zHrixInGtm3bjNmzZ9d4mXWgjmv1q7EMIzTHZM2aNYbNZjOmTZtmfPPNN8Yrr7xiREVFGf/85z+9bWbOnGnExcUZb7/9trFx40bj2muvrfES4549exqrV682Vq5caXTo0MHnctrDhw8bSUlJxtChQ43Nmzcbr732mhEVFXXC5bQ2m8344x//aGzbts145JFHTLn0fNiwYcZZZ53lvfT8jTfeMFq0aGHcf//93jahMCaFhYXG+vXrjfXr1xuA8fTTTxvr16/3XlkUSGNQn7409piUlZUZv/rVr4xWrVoZGzZs8Pm3t/qVVcE2Jk1FYcePnnvuOaN169aG3W43LrjgAuPzzz83u0sNAtT4mD9/vrfN0aNHjbvuuss444wzjKioKOPXv/61sX//fp/3+fbbb42BAwcakZGRRosWLYx7773XKC8v92mzbNkyo0ePHobdbjfatm3r8xlVAnVcjw87oTom77zzjtG1a1fD4XAYnTp1Mv72t7/57He73cbDDz9sJCUlGQ6Hw7jyyiuN7du3+7Q5ePCgccsttxgxMTGG0+k07rjjDqOwsNCnzZdffmlcdNFFhsPhMM466yxj5syZJ/TlX//6l3HOOecYdrvdOPfcc43//ve//j/gOhQUFBj33HOP0bp1ayMiIsJo27at8Yc//MHnCysUxmTZ
smU1/jsybNgwwzACawzq0xd/ONmY7N69u9Z/e5ctWxa0Y9JUdNdzERERCWpasyMiIiJBTWFHREREgprCjoiIiAQ1hR0REREJago7IiIiEtQUdkRERCSoKeyIiIhIUFPYERERkaCmsCMiIWn58uVYLJYTbt4qIsFHYUdERESCmsKOiIiIBDWFHREx1b///W+6detGZGQkCQkJZGRkUFxcDMA//vEPOnfuTEREBJ06dWLOnDk+r12zZg09e/YkIiKC3r178+abb2KxWNiwYUOD+rJy5UouvvhiIiMjSU1N5e677/b2BeDss89m+vTp/Pa3vyU2NpbWrVvzt7/9rcHHLiJNQ2FHREyzf/9+brnlFn7729+ybds2li9fzvXXX49hGLzyyitMmTKFadOmsW3bNqZPn87DDz/MSy+9BEBRURG//OUv6dKlC+vWrePRRx/lvvvua3Bfdu7cyYABAxg8eDAbN27k9ddfZ+XKlYwZM8an3Z/+9Cd69+7N+vXrueuuuxg9ejTbt28/rXEQkUZm8l3XRSSErVu3zgCMb7/99oR97dq1MxYuXOiz7fHHHzfS09MNwzCMF154wUhISDCOHj3q3T937lwDMNavX1/nZy9btswAjJ9++skwDMMYMWKEMXLkSJ82n376qWG1Wr2f0aZNG+O2227z7ne73UZiYqIxd+7ceh2viJjDZnLWEpEQdt5553HllVfSrVs3MjMz6d+/PzfccAN2u52dO3cyYsQI7rzzTm/7iooKXC4XANu2baN79+5ERER496enpze4L19++SUbN27klVde8W4zDAO3283u3bvp3LkzAN27d/fut1gsJCcnk5eX1+DPFZHGp7AjIqYJCwtjyZIlrFq1ig8//JDnnnuOP/zhD7zzzjsA/P3vf6dPnz4nvKYxFBUV8fvf/5677777hH2tW7f2/hweHu6zz2Kx4Ha7G6VPIuIfCjsiYiqLxUK/fv3o168fU6ZMoU2bNnz22WekpKSwa9cuhgwZUuPrOnfuzP/93/9RUlLind35/PPPG9yPX/ziF2zdupX27ds3+D1EJDBpgbKImGb16tVMnz6dtWvXsmfPHt544w0OHDhA586dmTp1KjNmzODZZ5/l66+/ZtOmTcyfP5+nn34agFtvvRWLxcKdd97J1q1beffdd/njH//Y4L5MmjSJVatWMWbMGDZs2MA333zD22+/fcICZRFpfjSzIyKmcTqdfPLJJzzzzDMUFBTQpk0b/vSnPzFw4EAAoqKieOqpp5g4cSLR0dF069aNcePGARATE8M777zDqFGj6NmzJ126dOHJJ59k8ODBDepL9+7dWbFiBX/4wx+4+OKLMQyDdu3a8Zvf/MZfhysiJrEYhmGY3QkREX/49ttvSUtLY/369fTo0cPs7ohIgNBpLBEREQlqCjsiEpRGjRpFTExMjY9Ro0aZ3T0RaUI6jSUiQSkvL4+CgoIa9zmdThITE5u4RyJiFoUdERERCWo6jSUiIiJBTWFHREREgprCjoiIiAQ1hR0REREJago7IiIiEtQUdkRERCSoKeyIiIhIUFPYERERkaD2/wH4lxjPT3g61wAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RDN Efficiency (ms):\n",
      "    seq_len       FLASH        GDN        RDN\n",
      "0    1024.0    0.058016   0.631040   1.095776\n",
      "1    2048.0    0.156352   0.817664   1.405936\n",
      "2    4096.0    0.461376   0.689376   1.871584\n",
      "3    8192.0    1.593504   0.850144   1.803392\n",
      "4   16384.0    6.030656   1.658496   3.525696\n",
      "5   32768.0   23.611744   3.310672   6.971104\n",
      "6   65536.0   96.535683   6.560448  13.904832\n",
      "7  131072.0  403.419220  13.072992  27.521248\n"
     ]
    }
   ],
   "source": [
    "@triton.testing.perf_report(\n",
    "    triton.testing.Benchmark(\n",
    "        x_names=[\"seq_len\"],\n",
    "        x_vals=seq_len_for_test,\n",
    "        line_arg=\"provider\",\n",
    "        line_vals=[\n",
    "            \"flash\",\n",
    "            \"gdn\",\n",
    "            \"rdn\",\n",
    "        ],\n",
    "        line_names=[\n",
    "            \"FLASH\",\n",
    "            \"GDN\",\n",
    "            \"RDN\",\n",
    "        ],\n",
    "        styles=[\n",
    "            (\"orange\", \"-\"),\n",
    "            (\"blue\", \"-\"),\n",
    "            (\"green\", \"-\"),\n",
    "        ],\n",
    "        ylabel=\"ms\",\n",
    "        plot_name=\"RDN Efficiency (ms)\",\n",
    "        args={\n",
    "            \"batch_size\": batch_size,\n",
    "            \"num_heads\": num_heads,\n",
    "            \"head_dim\": head_dim,\n",
    "            \"dtype\": dtype,\n",
    "        },\n",
    "    )\n",
    ")\n",
    "def benchmark_forward(\n",
    "    batch_size,\n",
    "    seq_len,\n",
    "    num_heads,\n",
    "    head_dim,\n",
    "    provider,\n",
    "    dtype,\n",
    "):\n",
    "    q = torch.randn(\n",
    "        (batch_size, seq_len, num_heads, head_dim), device=\"cuda\", dtype=dtype\n",
    "    )\n",
    "    k = torch.randn(\n",
    "        (batch_size, seq_len, num_heads, head_dim), device=\"cuda\", dtype=dtype\n",
    "    )\n",
    "    v = torch.randn(\n",
    "        (batch_size, seq_len, num_heads, head_dim), device=\"cuda\", dtype=dtype\n",
    "    )\n",
    "    a = torch.rand(\n",
    "        (batch_size, seq_len, num_heads), device=\"cuda\", dtype=torch.float32\n",
    "    ).log()\n",
    "    b = torch.rand(\n",
    "        (batch_size, seq_len, num_heads), device=\"cuda\", dtype=torch.float32\n",
    "    )\n",
    "    g = torch.rand(\n",
    "        (batch_size, seq_len, num_heads), device=\"cuda\", dtype=torch.float32\n",
    "    )\n",
    "\n",
    "    quantiles = [0.5, 0.2, 0.8]\n",
    "\n",
    "    if provider == \"flash\":\n",
    "        ms, min_ms, max_ms = triton.testing.do_bench(\n",
    "            lambda: flash_attn_func(q, k, v, causal=True),\n",
    "            quantiles=quantiles,\n",
    "        )\n",
    "    elif provider == \"gdn\":\n",
    "        ms, min_ms, max_ms = triton.testing.do_bench(\n",
    "            lambda: gdn_prefill(\n",
    "                q,\n",
    "                k,\n",
    "                v,\n",
    "                a,\n",
    "                b,\n",
    "            ),\n",
    "            quantiles=quantiles,\n",
    "        )\n",
    "    elif provider == \"rdn\":\n",
    "        ms, min_ms, max_ms = triton.testing.do_bench(\n",
    "            lambda: rdn_prefill(\n",
    "                q,\n",
    "                k,\n",
    "                v,\n",
    "                a,\n",
    "                b,\n",
    "                g,\n",
    "            ),\n",
    "            quantiles=quantiles,\n",
    "        )\n",
    "    else:\n",
    "        raise ValueError(f\"Invalid provider: {provider}\")\n",
    "    return ms, min_ms, max_ms\n",
    "\n",
    "benchmark_forward.run(show_plots=True, print_data=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sparse",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
