{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from minicons import cwe\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "import torch\n",
    "\n",
    "from tqdm import tqdm\n",
    "import csv\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sat Apr 10 22:10:59 2021       \n",
      "+-----------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 460.39       Driver Version: 460.39       CUDA Version: 11.2     |\n",
      "|-------------------------------+----------------------+----------------------+\n",
      "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
      "|                               |                      |               MIG M. |\n",
      "|===============================+======================+======================|\n",
      "|   0  Tesla V100-PCIE...  Off  | 00000000:3B:00.0 Off |                    0 |\n",
      "| N/A   32C    P0    42W / 250W |   2638MiB / 32510MiB |     31%      Default |\n",
      "|                               |                      |                  N/A |\n",
      "+-------------------------------+----------------------+----------------------+\n",
      "|   1  Tesla V100-PCIE...  Off  | 00000000:D8:00.0 Off |                    0 |\n",
      "| N/A   30C    P0    35W / 250W |   1803MiB / 32510MiB |      0%      Default |\n",
      "|                               |                      |                  N/A |\n",
      "+-------------------------------+----------------------+----------------------+\n",
      "                                                                               \n",
      "+-----------------------------------------------------------------------------+\n",
      "| Processes:                                                                  |\n",
      "|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |\n",
      "|        ID   ID                                                   Usage      |\n",
      "|=============================================================================|\n",
      "|    0   N/A  N/A      1760      C   python3                          1313MiB |\n",
      "|    0   N/A  N/A      1994      G   /usr/lib/xorg/Xorg                  4MiB |\n",
      "|    0   N/A  N/A     14029      C   python3                          1317MiB |\n",
      "|    1   N/A  N/A      1760      C   python3                          1795MiB |\n",
      "|    1   N/A  N/A      1994      G   /usr/lib/xorg/Xorg                  4MiB |\n",
      "+-----------------------------------------------------------------------------+\n"
     ]
    }
   ],
   "source": [
    "'''\n",
    "Check if root is (ab)using their root privilege to run stuff on gpu\n",
    "'''\n",
    "\n",
    "!nvidia-smi\n",
    "\n",
    "# !free -m"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick the device for the model. The second GPU (\"cuda:1\") is still preferred,\n",
    "# but the notebook no longer fails on hosts that have a single GPU or none.\n",
    "if torch.cuda.device_count() > 1:\n",
    "    DEVICE = \"cuda:1\"\n",
    "elif torch.cuda.is_available():\n",
    "    DEVICE = \"cuda:0\"\n",
    "else:\n",
    "    DEVICE = \"cpu\"\n",
    "\n",
    "# Load bert-base-uncased through minicons' CWE wrapper on the chosen device.\n",
    "bert = cwe.CWE(\"bert-base-uncased\", DEVICE)\n",
    "\n",
    "# Layer passed to extract_representation; valid range per the original note: [0, 12]\n",
    "LAYER = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read every data row of the sense metadata CSV into a list of string lists.\n",
    "# newline=\"\" lets the csv module do its own newline handling (needed for\n",
    "# quoted fields that contain line breaks).\n",
    "with open(\"../data/sense_metadata.csv\", \"r\", newline=\"\") as f:\n",
    "    reader = csv.reader(f)\n",
    "    # Skip the header through the reader (not the raw file) so it is consumed as\n",
    "    # one CSV record; the None default avoids StopIteration on an empty file.\n",
    "    next(reader, None)\n",
    "    sense_data = list(reader)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Serve the parsed CSV rows in batches of 100.\n",
    "# NOTE(review): the loop that consumes this unpacks each batch into six\n",
    "# column-wise fields, so rows are presumably collated per column - confirm.\n",
    "sense_dl = DataLoader(dataset=sense_data, batch_size=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one (position, word, sense, vector) record per row of the sense data.\n",
    "embedding_data = []\n",
    "for batch in tqdm(sense_dl):\n",
    "    # Each batch carries six fields; the first two are not used here.\n",
    "    _, _, sentence, word, position, sense = batch\n",
    "    position = list(map(int, position))\n",
    "    # Every target is queried as the span (position, position + 1).\n",
    "    position_tuple = [(start, start + 1) for start in position]\n",
    "    representation = bert.extract_representation(\n",
    "        list(zip(sentence, position_tuple)),\n",
    "        layer=LAYER,\n",
    "    )\n",
    "    embedding_data.extend(zip(position, word, sense, representation))\n",
    "\n",
    "# Extraction is done: ask PyTorch to free its cached CUDA memory.\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "# embedding data: [(position, word, sense, vector), ...]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# To unpack the representations into one matrix, say for PCA:\n",
    "# read the vector (4th field) of each record directly rather than transposing\n",
    "# the whole list of records just to pick out one column.\n",
    "for_pca = torch.stack([vector for _, _, _, vector in embedding_data])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the dimensions of the stacked matrix; torch.stack puts one entry per\n",
    "# embedding_data record along the first dimension.\n",
    "for_pca.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
