{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import pandas\n",
    "from tqdm.notebook import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = torch.load(\"sentence_embed.pt\")\n",
    "#unique = torch.load(\"unique.pt\")\n",
    "# df = pandas.read_csv(\"../../mimic_impressions.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([454265, 768]) torch.Size([454265]) 454265 cuda:0\n"
     ]
    }
   ],
   "source": [
    "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
    "norms = torch.stack([torch.norm(data[a]) for a in data]).to(device)\n",
    "values = torch.stack([data[a] for a in sorted(data.keys())]).to(device)\n",
    "cuda_data = [data[a].to(device) for a in data]\n",
    "print(values.size(), norms.size(), len(cuda_data), device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6e00258267e341109bf9de1324f27150",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=454265.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "0.2 10\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1bc3cddcd42d4f57ac8aae5bff4a3a0b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=454265.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "0.4 36\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "af5dd3dabb754e5098b03c8f02dcd9f4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=454265.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "0.6 265\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e9022c3a44b045508a1152d717af2c20",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=454265.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "0.75 1546\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8026984f96a6411b965c9c432036379c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=454265.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "0.85 7510\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "26a0d5e6abfd492f815eac0eb3c8aa27",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=454265.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "0.9 20470\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "df07a42bceb04d1da1b12b03dbd190e5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=454265.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "0.95 61496\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "201bbbb5a9a2489daf5c108948c9df8e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=454265.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "0.99 167272\n"
     ]
    }
   ],
   "source": [
    "sizes = []\n",
    "cutoffs = [0.99, 0.95, 0.9, 0.85, 0.75, 0.6, 0.4, 0.2]\n",
    "# cutoffs = [0.98, 0.95, 0.9, 0.85, 0.65, 0.4, 0.2]\n",
    "for cutoff in cutoffs[::-1]:\n",
    "    x = {}\n",
    "    copy = set()\n",
    "    pbar = tqdm(total=len(data))\n",
    "    idxs = set(range(len(data)))\n",
    "    values_copy = values.detach().clone().to(device)\n",
    "    while len(idxs) != 0:\n",
    "        i = min(idxs)\n",
    "        cos = values_copy @ cuda_data[i] / (norms * norms[i])\n",
    "        same_t = torch.where(cos > cutoff)[0]\n",
    "        same = set(same_t.tolist())\n",
    "        copy.update(same)\n",
    "        x[i] = same\n",
    "        idxs -= same\n",
    "        idxs -= {i}\n",
    "        values_copy[same_t, :] *= 0.\n",
    "        pbar.update(1)\n",
    "    pbar.close()\n",
    "    print(cutoff, len(x))\n",
    "    sizes.append(len(x))\n",
    "    num = int(cutoff*100)\n",
    "    torch.save(x, f\"sentences_{num}.pt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# torch.save(x, \"sentences_95.pt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "167272\n",
      "[0, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n",
      "{10, 238068, 285119}\n",
      "[10, 36, 265, 1546, 7510, 20470, 61496, 167272] [0.99, 0.95, 0.9, 0.85, 0.75, 0.6, 0.4, 0.2]\n"
     ]
    }
   ],
   "source": [
    "print(len(x))\n",
    "print(list(x.keys())[:10])\n",
    "print(x[10])\n",
    "print(sizes, cutoffs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAARUUlEQVR4nO3df2zcd33H8ecbJwVvdLgsZmqchIQpRFRjIp1VijIxptG67R80Q2hLJwRjiEiM7ocGlhIxsa77g45om4TooN1WIdBGAS0z0SjyECmaNI0Sd26bJp3BdIXarkg6cDcJb02z9/64r8PFvYvv4rPP9/HzIZ1838/3c/d539fnl77+fO/7vchMJEm97yXdLkCS1BkGuiQVwkCXpEIY6JJUCANdkgqxqVsDb9myJXfu3Nmt4SWpJz388MPPZuZgo3VdC/SdO3cyMTHRreElqSdFxHebrXPKRZIKYaBLUiEMdEkqhIEuSYUw0CWpEAa6JBXCQJekQhjoklSIZQM9Iu6LiDMR8XiT9RERH4+I6Yh4LCKu7XyZkqTltHKm6KeBTwCfabL+ZmB3dXsj8MnqpySpztjkLEfGp5ibX2DrQD+jI3vYv3eoY8+/7B56Zv4z8INLdLkV+EzWfAMYiIirO1WgJJVgbHKWw0dPMju/QAKz8wscPnqSscnZjo3RiTn0IeDpuuWZqk3qSWOTs+y76zi7Dn2ZfXcd7+gfnDauI+NTLJw7f1HbwrnzHBmf6tgYnbg4VzRoa/hFpRFxEDgIsGPHjg4MLXXW4l7U4h/e4l4U0NF/jbXxzM0vtNV+OTqxhz4DbK9b3gbMNeqYmfdm5nBmDg8ONrz6o9RVa7EXpY1p60B/W+2XoxOBfgx4V/Vpl+uB5zLzmQ48r7Tm1mIvShvT6Mge+jf3XdTWv7mP0ZE9HRtj2SmXiPgc8BZgS0TMAH8EbAbIzE8BDwC3ANPAj4D3dKw6aY1tHehntkF4d3IvShvT4pTdan7KZdlAz8zbllmfwAc6VpHURaMjey6aQ4fO70Vp49q/d2hVj8V07RuLpPVoLfaipNVioEtLrPZelLRavJaLJBXCPfR1ZrVPDZZULgN9HfGkFkkr4ZTLOuJJLZJWwkBfRzypRdJKGOjryFqcGiypXAb6OrIWpwZLKpcHRdcRT2qRtBIG+jrjSS2SLpdTLpJUCANdkgphoEtSIQx0SSrEhj0o6jVTJJVmQwa610yRVKINOeXiNVMklWhDBrrXTJFUog0Z6F4zRVKJNmSge80USSXakAdFvWaKpBJtyEAHr5kiqTwbcspFkkpkoEtSIQx0SSqEgS5JhSjmoKjXZpG00RUR6F6bRZIKmXLx2iySVEige20WSSok0L02iyQVEuhem0WSCjko6rVZJKmQQAevzSJJRUy5SJIMdEkqhoEuSYUw0CWpEC0FekTcFBFTETEdEYcarN8REQ9GxGREPBYRt3S+VEnSpSwb6BHRB9wN3AxcA9wWEdcs6faHwBcycy9wAPjLThcqSbq0VvbQrwOmM/PJzHweuB+4dUmfBH6quv8KYK5zJUqSWtFKoA8BT9ctz1Rt9e4A3hkRM8ADwO80eqKIOBgRExExcfbs2csoV5LUTCuBHg3acsnybcCnM3MbcAvw2Yh40XNn5r2ZOZyZw4ODg+1XK0lqqpVAnwG21y1v48VTKu8FvgCQmf8KvAzY0okCJUmtaSXQTwC7I2JXRFxB7aDnsSV9vgf8CkBEvI5aoDunIklraNlAz8wXgNuBceAJap9mORURd0bE26puHwTeFxGPAp8DfjMzl07LSJJWUUsX58rMB6gd7Kxv+0jd/dPAvs6WJklqh2eKSlIhDHRJKoSBLkmF6MkvuBibnPXbiSRpiZ4L9LHJWQ4fPcnCufMAzM4vcPjoSQBDXdKG1nNTLkfGpy6E+aKFc+c5Mj7VpYokaX3ouUCfm19oq12SNoqeC/StA/1ttUvSRtFzgT46sof+zX0XtfVv7mN0ZE+XKpKk9aHnDoouHvj0Uy6SdLGeC3SohboBLkkX67kpF0lSYwa6JBXCQJekQhjoklQIA12SCmGgS1IhDHRJKoSBLkmFMNAlqRAGuiQVwkCXpEIY6JJUCANdkgphoEtSIQx0SSqEgS5JhTDQJakQBrokFcJAl6RCGOiSVAgDXZIKYaBLUiEMdEkqhIEuSYUw0CWpEAa6JBXCQJekQhjoklSIlgI9Im6KiKmImI6IQ036/FpEnI6IUxHxd50t88fGJmfZd9dxdh36MvvuOs7Y5OxqDSVJPWXTch0iog+4G7gBmAFORMSxzDxd12c3cBjYl5k/jIhXrUaxY5OzHD56koVz5wGYnV/g8NGTAOzfO7QaQ0pSz2hlD/06YDozn8zM54H7gVuX9HkfcHdm/hAgM890tsyaI+NTF8J80cK58xwZn1qN4SSpp7QS6EPA03XLM1VbvdcCr42If4mIb0TETY2eKCIORsREREycPXu27WLn5hfaapekjaSVQI8GbblkeROwG3gLcBvw1xEx8KIHZd6bmcOZOTw4ONhurWwd6G+rXZI2klYCfQbYXre8DZhr0OdLmXkuM/8DmKIW8B01OrKH/s19F7X1b+5jdGRPp4eSpJ7TSqCfAHZHxK6IuAI4ABxb0mcM+GWAiNhCbQrmyU4WCrUDnx99++sZGugngKGBfj769td7QFSSaOFTLpn5QkTcDowDfcB9mXkqIu4EJjLzWLXuxog4DZwHRjPzP1ej4P17hwxwSWogMpdOh6+N4eHhnJiY6MrYktSrIuLhzBxutM4zRSWpEAa6JBXCQJekQhjoklQIA12SCmGgS1IhDHRJKoSBLkmFMNAlqRAGuiQVwkCXpEIY6JJUCANdkgphoEtSIQx0SSqEgS5JhTDQJakQBrokFcJAl6RCGOiSVAgDXZIKYaBLUiEMdEkqhIEuSYUw0CWpEAa6JBXCQJekQhjoklQIA12SCmGgS1IhDHRJKoSBLkmFMNAlqRAGuiQVwkCXpEIY6JJUCANdkgphoEtSIQx0SSpES4EeETdFxFRETEfEoUv0e0dEZEQMd67EHxubnGXfXcfZdejL7LvrOGOTs6sxjCT1pE3LdYiIPuBu4AZgBjgREccy8/SSflcCvws8tBqFjk3OcvjoSRbOnQdgdn6Bw0dPArB/79BqDClJPaWVPfTrgOnMfDIznwfuB25t0O9PgI8B/9PB+i44Mj51IcwXLZw7z5HxqdUYTpJ6TiuBPgQ8Xbc8U7VdEBF7ge2Z+Y+XeqKIOBgRExExcfbs2bYKnZtfaKtdkjaaVgI9GrTlhZURLwH+Avjgck+Umfdm5nBmDg8ODrZeJbB1oL+tdknaaFoJ9Blge93yNmCubvlK4OeAr0fEU8D1wLFOHxgdHdlD/+a+i9r6N/cxOrKnk8NIUs9a9qAocALYHRG7gFngAPAbiysz8zlgy+JyRHwd+FBmTnSy0MUDn0fGp5ibX2DrQD+jI3s8ICpJlWUDPTNfiIjbgXGgD7gvM09FxJ3ARGYeW+0iF+3fO2SAS1ITreyhk5kPAA8saftIk75vWXlZkqR2eaaoJBXCQJekQhjoklQIA12SCmGgS1IhDHRJKoSBLkmFMNAlqRAGuiQVwkCXpEIY6JJUCANdkgphoEtSIQx0SSqEgS5JhTDQJakQBrokFcJAl6RCGOiSVAgDXZIKYaBLUiEMdEkqhIEuSYUw0CWpEAa6JBXCQJekQhjoklQIA12SCmGgS1IhDHRJKoSBLkmFMNAlqRAGuiQVwkCXpEIY6JJUCANdkgphoEtSIQx0SSpES4EeETdFxFRETEfEoQbr/yAiTkfEYxHxtYh4dedLlSRdyqblOkREH3A3cAMwA5yIiGOZebqu2yQwnJk/ioj3Ax8Dfr3TxY5NznJkfIq5+QW2DvQzOrKH/XuHOj2MJPWkVvbQrwOmM/PJzHweuB+4tb5DZj6YmT+qFr8BbOtsmbUwP3z0JLPzCyQwO7/A4aMnGZuc7fRQktSTWgn0IeDpuuWZqq2Z9wJfWUlRjRwZn2Lh3PmL2hbOnefI+FSnh5KknrTslAsQDdqyYceIdwLDwC81WX8QOAiwY8eOFkusmZtfaKtdkjaaVvbQZ4DtdcvbgLmlnSLircCHgbdl5v82eqLMvDczhzNzeHBwsK1Ctw70t9UuSRtNK4F+AtgdEbsi4grgAHCsvkNE7AXuoRbmZzpfJoyO7KF/c99Fbf2b+xgd2bMaw0lSz1l2yiUzX4iI24FxoA+4LzNPRcSdwERmHgOOAC8HvhgRAN/LzLd1stDFT7P4KRdJaiwyG06Hr7rh4eGcmJjoytiS1Ksi4uHMHG60zjNFJakQBrokFcJAl6RCGOiSVAgDXZIKYaBLUiEMdEkqhIEuSYXo2olFEXEW+O5lPnwL8GwHy1kr1r22rHttWffaeHVmNrwYVtcCfSUiYqLZmVLrmXWvLeteW9bdfU65SFIhDHRJKkSvBvq93S7gMln32rLutWXdXdaTc+iSpBfr1T10SdISBrokFaLnAj0iboqIqYiYjohDXarhqYg4GRGPRMRE1fbKiPhqRHy7+nlV1R4R8fGq3sci4tq653l31f/bEfHuuvZfqJ5/unpsoy/qbrXW+yLiTEQ8Xte26rU2G2MFNd8REbPVNn8kIm6pW3e4Gn8qIkbq2hu+V6qvU3yoqu3z1VcrEhEvrZanq/U7W9/SEBHbI+LBiHgiIk5FxO/1yPZuVve63uYR8bKI+GZEPFrV/ceXO1anXk/XZWbP3Kh9Bd53gNcAVwCPAtd0oY6ngC1L2j4GHKruHwL+tLp/C/AVIIDrgYeq9lcCT1Y/r6ruX1Wt+ybwpuoxXwFuXkGtbwauBR5fy1qbjbGCmu8APtSg7zXV++ClwK7q/dF3qfcK8AXgQHX/U8D7q/u/DXyqun8A+Hyb2/pq4Nrq/pXAt6r61vv2blb3ut7m1TZ4eXV/M/BQtR3bGquTr6fbt64X0OYfzJuA8brlw8DhLtTxFC8O9Cng6ur+1cBUdf8e4Lal/YDbgHvq2u+p2q4G/r2u/aJ+l1nvTi4Ox1WvtdkYK6j5DhqHy0XvAWrfffumZu+VKgSeBTYtfU8tPra6v6nqFyvY7l8CbuiF7d2k7p7Z5sBPAP8GvLHdsTr5erp967UplyHg6brlmaptrSXwTxHxcEQcrNp+JjOfAah+vqpqb1bzpdpnGrR30lrU2myMlbi9mpq4r25Kod2afxqYz8wXGtR84THV+ueq/m2r/p3fS22vsWe295K6YZ1v84joi4hHgDPAV6ntUbc7VidfT1f1WqA3mkvuxucu92XmtcDNwAci4s2X6Nus5nbb18J6rvWTwM8CbwCeAf6sau9kzR15PRHxcuDvgd/PzP+6VNcm43Vlezeoe91v88w8n5lvALYB1wGvu4yx1tXvYSV6LdBngO11y9uAubUuIjPnqp9ngH+g9kb6fkRcDVD9PFN1b1bzpdq3NWjvpLWotdkYlyUzv1/98f4f8FfUtvnl1PwsMBARmxrUfOEx1fpXAD9op86I2EwtFP82M49Wzet+ezequ1e2eVXrPPB1anPo7Y7VydfTVb0W6CeA3dUR5iuoHdg4tpYFRMRPRsSVi/eBG4HHqzoWP43wbmrzkFTt76o+0XA98Fz1L/E4cGNEXFX9K3sjtXm4Z4D/jojrq08wvKvuuTplLWptNsZlWQyryq9S2+aL4xyoPsGwC9hN7cBhw/dK1iY9HwTe0eT1L9b8DuB41b/VGgP4G+CJzPzzulXrens3q3u9b/OIGIyIgep+P/BW4InLGKuTr6e7uj2J3+6N2icDvkVtruzDXRj/NdSOdj8KnFqsgdq82teAb1c/X1m1B3B3Ve9JYLjuuX4LmK5u76lrH6b2x/Md4BOs7MDc56j9u3yO2h7He9ei1mZjrKDmz1Y1PUbtD/Dquv4frsafou4TQc3eK9Xv8JvVa/ki8NKq/WXV8nS1/jVtbutfpPav92PAI9Xtlh7Y3s3qXtfbHPh5YLKq73HgI5c7VqdeT7dvnvovSYXotSkXSVITBrokFcJAl6RCGOiSVAgDXZIKYaBLUiEMdEkqxP8DzOsYprkOHEsAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.scatter(sizes + [331007], cutoffs[::-1] + [1.0])\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0.9468)\n"
     ]
    }
   ],
   "source": [
    "a, b = data[0], data[2]\n",
    "print(a @ b / (torch.norm(a) * torch.norm(b)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAfHElEQVR4nO3de5hcVbnn8e+PJkBzjZDIyYWQgBBFBYMtovECRzTA4CEqSqLIRUbGCypyjBJhGGDGecA8MOiRo6AiR46AqDFGARsfCDLjEUiHQELQ1hhu6XAIoM1FGkjCO3/sVaHSVFXvvlRVqvfv8zz11N5rX9a7K516a6+199qKCMzMrLi2aXYAZmbWXE4EZmYF50RgZlZwTgRmZgXnRGBmVnBOBGZmBedEYLaVk3STpJOaHYeNXvJ9BNaqJH0UOBN4LfAMcA/wtYj4f00NzKzF+IzAWpKkM4FLgf8N7AlMAf4VOLaJMW3brLrNhsOJwFqOpN2AC4DPRsTCiPh7RGyIiF9GxDxJ20u6VNK69LpU0vZp28MkrZX0ZUnrJT0qabakoyX9SdJfJX21rK7zJP1U0o8lPSPpbkkHlS1/UNJXJK0A/i5pW0kTJf1M0uOSHpD0+bL1D5HUJelpSY9JuiSV7yDp3yU9KalX0lJJe6Zlt0n6r2l6G0nnSHooxf/D9HkgaaqkkHSSpIclPSHp7Ab8k1iLcyKwVvQ2YAfg51WWnw0cCrwJOAg4BDinbPk/pO0nAecC3wVOAN4MvBM4V9I+ZesfC/wE2B24BlgkaUzZ8rnAfwHGAi8BvwTuTft/D3CGpFlp3W8A34iIXYF9getT+UnAbsBewB7Ap4C+Csd2cnodDuwD7Ax8q9867wCmp7rPlfS6Sh+SWYkTgbWiPYAnImJjleUfAy6IiPUR8ThwPvDxsuUbyPoSNgDXAePIvpyfiYhVwCrgwLL1l0XET9P6l5AlkUPLln8zIh6JiD7gLcD4iLggIl6MiDVkiWZOWd2vkTQuIp6NiDvKyvcAXhMRmyJiWUQ8XeXYLomINRHxLDAfmNOvWer8iOiLiHvJEtJBFfZjtpkTgbWiJ4FxNdrkJwIPlc0/lMo2bx8Rm9J06Vf3Y2XL+8h+aZc8UpqIiJeAtf3290jZ9N7AxNS80yupF/gqWT8GwKnA/sAfU/PPMan8aqATuC41Z32931lHrWPbtmz/AP9ZNv1cv2MxewUnAmtFvweeB2ZXWb6O7Au5ZEoqG6q9ShOStgEm99tf+aV3jwAPRMTYstcuEXE0QET8OSLmAq8GLgJ+Kmmn1MdxfkQcALwdOAY4MeexbWTLRGY2KE4E1nIi4imytv3LUkfvjpLGSDpK0teBa4FzJI2XNC6t++/DqPLNkj6YzkDOAF4A7qiy7l3A06kDuV1Sm6Q3SHoLgKQTJI1PZxa9aZtNkg6X9EZJbcDTZE1Fmyrs/1rgi5KmSdqZ7KqpH9doJjMbkC93s5YUEZdIeoysE/hHZPcRLAO+BtwN7AqsSKv/BPhfw6juF8DxwL8Bq4EPpv6CSnFtkvR+4GLgAWB7oJuXO6uPBC6RtCNZs86ciHhe0j8A3yE723gW+DGVk9eVZM1Dt5P1VXQCnxvGsZn5hjKzWiSdR9aBe0KzYzGrFzcNmZkVnBOBmVnBuWnIzKzgfEZgZlZwLXfV0Lhx42Lq1KnNDsPMrKUsW7bsiYgYX2lZyyWCqVOn0tXV1ewwzMxaiqSHqi1z05CZWcE5EZiZFZwTgZlZwTkRmJkVnBOBmVnBORGYmRWcE4GZWcE5EZiZFVzdEoGkKyWtl3RfleWS9E1JqyWtkHRwvWIxM7Pq6nln8VXAt4AfVll+FLBfer0V+HZ6NzOzMouW97Cgs5t1vX1MHNvOvFnTmT1j0ojtv25nBBFxO/DXGqscC/wwMncAYyVNqFc8ZmataNHyHuYvXElPbx8B9PT2MX/hShYt7xmxOprZRzCJ7EHfJWtTmVlLWrS8h5kX3sq0s25g5oW3juh/VCuuBZ3d9G3Y8vHVfRs2saCze8TqaOagc6pQVvHhCJJOA04DmDJlSj1jMhuS0q+20n/Y0q82YERP4a141vX2Dap8KJp5RrAW2KtsfjKwrtKKEXFFRHRERMf48RVHUTVrqkb8arNimji2fVDlQ9HMRLAYODFdPXQo8FREPNrEeMyGrBG/2qyY5s2aTvuYti3K2se0MW/W9BGro25NQ5KuBQ4DxklaC/wPYAxARHwHuBE4GlgNPAecUq9YzOpt4th2eip86Y/krzYrplLTYj2vGqpbIoiIuQMsD+Cz9arfrJHmzZq+RR8BjPyvNiuu2TMm1bWvqeWeUGa2NWrErzazenEiMBsh9f7VZlYvHmvIzKzgfEYwStT7FnQzG72cCEYB38xkZsPhpqFRwDczmdlwOBGMAr6ZycyGw4lgFGjELehmNno5EYwCjbgF3cxGL3cWjwK+mcnMhsOJYJTwzUxmNlRuGjIzKzgnAjOzgnMiMDMrOCcCM7OCc2fxIHlMHzMbbZwIBsFj+pjZaOSmoUHwmD5mNho5EQyCx/Qxs9HIiWAQPKaPmY1GTgSD4DF9zGw0cmfxIHhMHzMbjZwIBslj+pjZaOOmITOzgnMiMDMrOCcCM7OCcyIwMyu4wncWe+wgMyu6QicCjx1kZlbwpiGPHWRmVvBE4LGDzMwKngg8dpCZWcETgccOMjMreGexxw4yMyt4IgCPHWRmVuimITMzcyIwMyu8qk1Dkj5Ya8OIWDjy4ZiZWaPV6iN4f3p/NfB24NY0fzhwG+BEYGY2ClRtGoqIUyLiFCCAAyLiQxHxIeD1eXcu6UhJ3ZJWSzqrwvIpkpZIWi5phaSjh3QUZmY2ZHn6CKZGxKNl848B+w+0kaQ24DLgKOAAYK6kA/qtdg5wfUTMAOYA/5orajMzGzF5Lh+9TVIncC3Z2cEcYEmO7Q4BVkfEGgBJ1wHHAveXrRPArml6N2BdzrjNzGyEDJgIIuL01HH8zlR0RUT8PMe+JwGPlM2vBd7ab53zgJslfQ7YCTii0o4knQacBjBlypQcVZuZWV65bihLVwgNtnNYlXbVb34ucFVEXCzpbcDVkt4QES/1q/8K4AqAjo6O/vswM7NhqHX56DO88osbsi/4iIhdKywrtxbYq2x+Mq9s+jkVOJJsh7+XtAMwDlg/wL7NzGyE1LpqaJeI2LXCa5ccSQBgKbCfpGmStiPrW1jcb52HgfcASHodsAPw+NAOxczMhiJX05Ckg3i5j+D2iFgx0DYRsVHS6UAn0AZcGRGrJF0AdEXEYuCfge9K+iLZ2cfJEeGmHzOzBhowEUj6AvBJXu4j+JGkKyLiXwbaNiJuBG7sV3Zu2fT9wMxBRWxmZiMqzxnBqcBbI+LvAJIuAn4PDJgIzMxs65fnhjIB5Q/23UTlK4LMzKwF5Tkj+AFwp6TSvQOzge/XLyQzM2ukPDeUXSLpt2Rt+QJOiYjldY+sDhYt7/HTyMzM+sn7hLJ7gEdL60uaEhEP1y2qOli0vIf5C1fStyFr5erp7WP+wpUATgZmVmgD9hGk4R8eA34D/Aq4Ib23lAWd3ZuTQEnfhk0s6OxuUkRmZluHPGcEXwCmR8ST9Q6mntb19g2q3MysKPJcNfQI8FS9A6m3iWPbB1VuZlYUtcYaOjNNriEbivoG4IXS8oi4pM6xjah5s6Zv0UcA0D6mjXmzpjcxKjOz5qvVNLRLen84vbZLr5ZU6hD2VUNmZlvSYIb2kbQNsHNEPF2/kGrr6OiIrq6uZlVvZtaSJC2LiI5Ky/JcNXSNpF0l7UT2dLFuSfNGOkgzM2uOPJ3FB6QzgNlkA8hNAT5e16jMzKxh8iSCMZLGkCWCX0TEBio/sMbMzFpQnkRwOfAg2TOFb5e0N9C0PgIzMxtZecYa+ibwzbKihyQdXr+QzMyskfJ0Fu8p6fuSbkrzBwAn1T0yMzNriDxNQ1eRPW5yYpr/E3BGvQIyM7PGypMIxkXE9cBLkD2LmC0fVGNmZi0sTyL4u6Q9SFcKSTqUUTD2kJmZZfKMPnomsBjYV9LvgPHAcXWNyszMGqZmIkhDSuwAvBuYTvaEsu50L4GZmY0CNRNBRLwk6eKIeBuwqkExmZlZA+XpI7hZ0ockqe7RmJlZw+XtI9gJ2CjpebLmoYiIXesamZmZNUSeO4t3GWgdMzNrXXnOCJA0Cdi7fP2IuL1eQZmZWeMMmAgkXQQcT/YsgtKNZAE4EZiZjQJ5zghmA9Mj4oUB1zQzs5aT56qhNcCYegdiZmbNUfWMQNK/kDUBPQfcI+kWYPNZQUR8vv7hmZlZvdVqGio9IX4Z2RATZmY2CtVKBIdHxMmNCsTMzJqjVh/BgQ2LwszMmqbWGcGOkmaQ3Un8ChFxd31CMjOzRqqVCCYBF1M5EQTwj3WJyMzMGqpWIlgdEf6yNzMb5fLcR2BmZqNYrUTwlYZFYWZmTVM1EUTEzcPduaQjJXVLWi3prCrrfETS/ZJWSbpmuHVWs2h5DzMvvJVpZ93AzAtvZdHynnpVZWbWUnKNPjoUktqAy4D3AmuBpZIWR8T9ZevsB8wHZkbE3yS9uh6xLFrew/yFK+nbkI2Z19Pbx/yFKwGYPWNSPao0M2sZ9ewjOISsw3lNRLwIXAcc22+dTwKXRcTfACJifT0CWdDZvTkJlPRt2MSCzu56VGdm1lKGlAgknZZjtUnAI2Xza1NZuf2B/SX9TtIdko6sVp+kLkldjz/++KDjXdfbN6hyM7MiGeoZQZ7nF1e7/6DctsB+wGHAXOB7ksa+YqOIKyKiIyI6xo8fP9hYmTi2fVDlZmZFMqREEBGX51htLbBX2fxkYF2FdX4RERsi4gGgmywxjKh5s6bTPqZti7L2MW3MmzV9pKsyM2s5eZ5QNhY4EZjKlo+qHGgY6qXAfpKmAT3AHOCj/dZZRHYmcJWkcWRNRWvyBp9XqUN4QWc363r7mDi2nXmzpruj2MyMfFcN3QjcAawEXsq744jYKOl0oBNoA66MiFWSLgC6ImJxWvY+SaXHYM6LiCcHexB5zJ4xyV/8ZmYVKKJ/s32/FaS7I+LgBsUzoI6Ojujq6hp4RTMz20zSsojoqLQsTx/B1ZI+KWmCpN1LrxGO0czMmiRP09CLwALgbF6+6ieAfeoVlJmZNU6eRHAm8JqIeKLewZiZWePlaRpaRfYAezMzG4XynBFsAu6RtAR4oVSY4/JRMzNrAXkSwaL0MjOzUWjARBAR/9aIQMzMrDmqJgJJ10fERySt5JVjBBERB9Y1MjMza4haZwRfSO/HNCIQMzNrjlpPKHs0vT8UEQ8BzwIHA+PSvJmZjQJVE4GkX0l6Q5qeANwHfILsTuMzGhSfmZnVWa37CKZFxH1p+hTgNxHxfuCtZAnBzMxGgVqJYEPZ9HvIRiElIp5hEKOQmpnZ1q1WZ/Ejkj5H9vCYg4FfA0hqB8Y0IDYzM2uAWmcEpwKvB04Gjo+I3lR+KPCDOsdlZmYNUvWMICLWA5+qUL4EWFLPoMzMrHGG+vB6MzMbJZwIzMwKzonAzKzgBkwEkvaXdIuk+9L8gZLOqX9oZmbWCHnOCL4LzCfdVxARK4A59QzKzMwaJ08i2DEi7upXtrEewZiZWePlSQRPSNqXNBS1pOOAR+salZmZNUyeJ5R9FrgCeK2kHuAB4IS6RmVmZg2T5wlla4AjJO0EbJPGGjIzs1FiwEQgaXvgQ8BUYFtJAETEBXWNzMzMGiJP09AvgKeAZcAL9Q3HzMwaLU8imBwRR9Y9EjMza4o8Vw39h6Q31j0SMzNrijxnBO8ATpb0AFnTkICIiAPrGpmZmTVEnkRwVN2jMDOzpqmaCCTtGhFPA75c1MxsFKt1RnANcAzZ1UJB1iRUEsA+dYzLzMwapNYTyo5J79MaF46ZmTVanmGoZ6a7ipF0gqRLJE2pf2hmZtYIeS4f/TbwnKSDgC8DDwFX1zUqMzNrmDyJYGNEBHAs8I2I+AawS33DMjOzRslz+egzkuYDHwfeKakNGFPfsMzMrFHynBEcT3Yj2Sci4j+BScCCukZlZmYNM2AiSF/+PwJ2k3QM8HxE/DDPziUdKalb0mpJZ9VY7zhJIakjd+SDsGh5DzMvvJVpZ93AzAtvZdHynnpUY2bWkvJcNfQR4C7gw8BHgDvTU8oG2q4NuIzszuQDgLmSDqiw3i7A54E7Bxd6PouW9zB/4Up6evsIoKe3j/kLVzoZmJkleZqGzgbeEhEnRcSJwCHAf8+x3SHA6ohYExEvAteRdTj39z+BrwPP54x5UBZ0dtO3YdMWZX0bNrGgs7se1ZmZtZw8iWCbiFhfNv9kzu0mAY+Uza9NZZtJmgHsFRG/qrUjSadJ6pLU9fjjj+eo+mXrevsGVW5mVjR5vtB/LalT0smSTgZuAG7KsZ0qlMXmhdI2wP8B/nmgHUXEFRHREREd48ePz1H1yyaObR9UuZlZ0eTpLJ4HXA4cCBwEXBERX86x77XAXmXzk4F1ZfO7AG8AbpP0IHAosHikO4znzZpO+5i2Lcrax7Qxb9b0kazGzKxl1Rp99DXAnhHxu4hYCCxM5e+StG9E/GWAfS8F9pM0DegB5gAfLS2MiKeAcWX13QZ8KSK6hnowlcyekbVGLejsZl1vHxPHtjNv1vTN5WZmRVfrhrJLga9WKH8uLXt/rR1HxEZJpwOdQBtwZUSsknQB0BURi4cY86DNnjHJX/xmZlXUSgRTI2JF/8KI6JI0Nc/OI+JG4MZ+ZedWWfewPPs0M7ORVauPYIcay9zTamY2StRKBEslfbJ/oaRTyR5WY2Zmo0CtpqEzgJ9L+hgvf/F3ANsBH6h3YGZm1hi1nlD2GPB2SYeTXeYJcENE3NqQyMzMrCEGHIY6IpYASxoQi5mZNUGeO4vNzGwUcyIwMys4JwIzs4JzIjAzKzgnAjOzgnMiMDMrOCcCM7OCcyIwMys4JwIzs4JzIjAzKzgnAjOzgnMiMDMrOCcCM7OCcyIwMys4JwIzs4JzIjAzKzgnAjOzgnMiMDMrOCcCM7OCcyIwMys4JwIzs4JzIjAzKzgnAjOzgnMiMDMrOCcCM7OCcyIwMys4JwIzs4JzIjAzKzgnAjOzgnMiMDMrOCcCM7OCcyIwMys4JwIzs4KrayKQdKSkbkmrJZ1VYfmZku6XtELSLZL2rmc8Zmb2StvWa8eS2oDLgPcCa4GlkhZHxP1lqy0HOiLiOUmfBr4OHD/SsSxa3sOCzm7W9fYxcWw782ZNZ/aMSSNdjZlZS6rnGcEhwOqIWBMRLwLXAceWrxARSyLiuTR7BzB5pINYtLyH+QtX0tPbRwA9vX3MX7iSRct7RroqM7OWVM9EMAl4pGx+bSqr5lTgppEOYkFnN30bNm1R1rdhEws6u0e6KjOzllS3piFAFcqi4orSCUAH8O4qy08DTgOYMmXKoIJY19s3qHIzs6Kp5xnBWmCvsvnJwLr+K0k6Ajgb+KeIeKHSjiLiiojoiIiO8ePHDyqIiWPbB1VuZlY09UwES4H9JE2TtB0wB1hcvoKkGcDlZElgfT2CmDdrOu1j2rYoax/TxrxZ0+tRnZlZy6lb01BEbJR0OtAJtAFXRsQqSRcAXRGxGFgA7Az8RBLAwxHxTyMZR+nqIF81ZGZWmSIqNttvtTo6OqKrq6vZYZiZtRRJyyKio9Iy31lsZlZwTgRmZgXnRGBmVnBOBGZmBedEYGZWcE4EZmYF50RgZlZwTgRmZgXXcjeUSXoceGiIm48DnhjBcBrFcTeW424sx90Ye0dExcHaWi4RDIekrmp31m3NHHdjOe7GctzN56YhM7OCcyIwMyu4oiWCK5odwBA57sZy3I3luJusUH0EZmb2SkU7IzAzs36cCMzMCq4wiUDSkZK6Ja2WdFaTYnhQ0kpJ90jqSmW7S/qNpD+n91elckn6Zop3haSDy/ZzUlr/z5JOKit/c9r/6rSthhHrlZLWS7qvrKzusVarYxgxnyepJ33m90g6umzZ/FR/t6RZZeUV/1bSY1fvTLH9OD2CFUnbp/nVafnU/J80SNpL0hJJf5C0StIXWuTzrhb3Vv2ZS9pB0l2S7k1xnz/UukbqeJouIkb9i+xRmX8B9gG2A+4FDmhCHA8C4/qVfR04K02fBVyUpo8GbgIEHArcmcp3B9ak91el6VelZXcBb0vb3AQcNYxY3wUcDNzXyFir1TGMmM8DvlRh3QPS38H2wLT099FW628FuB6Yk6a/A3w6TX8G+E6angP8eJCf9QTg4DS9C/CnFN/W/nlXi3ur/szTZ7Bzmh4D3Jk+x0HVNZLH0+xX0wNoyEFm/wE6y+bnA/ObEMeDvDIRdAMT0vQEoDtNXw7M7b8eMBe4vKz88lQ2AfhjWfkW6w0x3qls+aVa91ir1TGMmM+j8pfSFn8DZM/Wflu1v5X05fEEsG3/v6nStml627SehvG5/wJ4byt83lXibpnPHNgRuBt462DrGsnjafarKE1Dk4BHyubXprJGC+BmScsknZbK9oyIRwHS+6tTebWYa5WvrVA+khoRa7U6huP01IRyZVnTx2Bj3gPojYiNFWLevE1a/lRaf9BSs8MMsl+pLfN594sbtvLPXFKbpHuA9cBvyH7BD7aukTyepipKIqjUVt6M62ZnRsTBwFHAZyW9q8a61WIebHkjbM2xfhvYF3gT8ChwcSofyZhH5Hgk7Qz8DDgjIp6utWqV+pryeVeIe6v/zCNiU0S8CZgMHAK8bgh1bVX/DsNRlESwFtirbH4ysK7RQUTEuvS+Hvg52R/gY5ImAKT39Wn1ajHXKp9coXwkNSLWanUMSUQ8lv7TvwR8l+wzH0rMTwBjJW1bIebN26TluwF/HUycksaQfZn+KCIWpuKt/vOuFHerfOYp1l7gNrI+gsHWNZLH01RFSQRLgf1Sj/12ZB0+ixsZgKSdJO1SmgbeB9yX4ihd3XESWTsrqfzEdIXIocBT6dS9E3ifpFelU+73kbUzPgo8I+nQdEXIiWX7GimNiLVaHUNS+pJLPkD2mZfqmZOuCJkG7EfWoVrxbyWyRt0lwHFVjr8U83HArWn9vDEK+D7wh4i4pGzRVv15V4t7a//MJY2XNDZNtwNHAH8YQl0jeTzN1exOika9yK60+BNZW+DZTah/H7KrB+4FVpViIGs3vAX4c3rfPZULuCzFuxLoKNvXJ4DV6XVKWXkH2X+6vwDfYngdlteSndZvIPuFc2ojYq1WxzBivjrFtILsP+6EsvXPTvV3U3aFVbW/lfRveFc6lp8A26fyHdL86rR8n0F+1u8gayJYAdyTXke3wOddLe6t+jMHDgSWp/juA84dal0jdTzNfnmICTOzgitK05CZmVXhRGBmVnBOBGZmBedEYGZWcE4EZmYF50RgTSMpJF1cNv8lSeeN0L6vknTcwGsOu54PKxt9c0m/8m2UjfJ5n7JRP5ema82HUsdhkt4+MhGbvZITgTXTC8AHJY1rdiDlJLUNYvVTgc9ExOH9yo8HJgIHRsQbyW6s6h1iSIcBTgRWN04E1kwbyZ77+sX+C/r/opf0bHo/TNJvJV0v6U+SLpT0MWXjy6+UtG/Zbo6Q9H/Tesek7dskLUi/0FdI+m9l+10i6Rqym6H6xzM37f8+SRelsnPJbqr6jqQF/TaZADwa2TALRMTaiPhb2u59kn4v6W5JP1E2Vk/peRXnp/KVkl6rbDC3TwFfVDa2/zvTnbE/S8ewVNLMtP15ygZ5u03SGkmfL4v/xHS890q6OpVV28+79fKzBJYr3RFvo1iz72jzq7gv4FlgV7LhuXcDvgScl5ZdBRxXvm56P4zsl/UEsnHge4Dz07IvAJeWbf9rsh87+5HdabwDcBpwTlpne6CLbCz5w4C/A9MqxDkReBgYTzYM8a3A7LTsNsru7C3bZnI6rnvIBl2bkcrHAbcDO6X5r/Dyna0PAp9L058Bvpemz6NsWGfgGuAdaXoK2RAPpfX+Ix3XOOBJsvH2X0925+u4tN7uA+znl2QDJALsTBo22a/R+yoNfmTWFBHxtKQfAp8H+nJutjTS0MmS/gLcnMpXAuVNNNdH9ov8z5LWAK8lG3/nwLKzjd3IEsWLwF0R8UCF+t4C3BYRj6c6f0T2EJxFNY5rraTpwD+m1y2SPgy0kz3Q5HfZUD1sB/y+bNPSgHPLgA9W2f0RwAF6+QF0u5b9ar8hIl4AXpC0Htgz1f/TiHgixfbXAfbzO+CSdJwLI6J8CGsbhZwIbGtwKdnDQX5QVraR1HSZBjcrf6TfC2XTL5XNv8SWf9P9x08pDQX8uYjoLF8g6TCyM4JKhvTIz/SFfBNwk6THgNlkSes3ETG3ymalY9lE9f+f25A9KGWLxJm+0Ms/m9I+ROXhjivuB7hQ0g1k4+XcIemIiPhjlVhsFHAfgTVd+oV6PVnHa8mDwJvT9LFkTRyD9eF09c6+ZIN9dZON0PlpZcMnI2l/ZaPB1nIn8G5J41JH8lzgt7U2kHSwpIlpehuygc4eAu4AZkp6TVq2o6T9B6j/GbJHQZbcDJxeVtebBtj+FuAjkvZI6+9eaz+S9o2IlRFxEVnT2WsH2L+1OCcC21pcTNauXfJdsi/fu8geI1jt13ot3WRf2DcBn4qI54HvAfcDdyt7yP3lDHBmnJqh5pMNIXwvcHdEDDR88KuBX6Y6VpCd4XwrNS+dDFwraQVZYhjoi/aXwAdKncVkzWgdqfP3frLO5FrxrwK+BvxW0r1Aacjoavs5I3WK30vWXHfTAPFZi/Poo2ZmBeczAjOzgnMiMDMrOCcCM7OCcyIwMys4JwIzs4JzIjAzKzgnAjOzgvv/gvrIfOQhElQAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.scatter(sizes + [331007], cutoffs[::-1] + [1.00])\n",
    "plt.title(\"Compression\")\n",
    "plt.xlabel(\"Number of Sentences\")\n",
    "plt.ylabel(\"Cosine Sim. Threshold\")\n",
    "plt.savefig(\"./compress_sentences.png\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
