{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "from minicons import cwe\n",
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "import csv\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "from collections import defaultdict, Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load bert-base-uncased through minicons' CWE wrapper. The second argument is\n",
    "# cuda:0 (presumably the target device, so a CUDA GPU would be required - confirm).\n",
    "# NOTE(review): `bert` is not referenced by any later cell in this notebook.\n",
    "bert = cwe.CWE(\"bert-base-uncased\", \"cuda:0\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect sense-annotated instances from two sources into `sense_data`.\n",
    "# Every record is a tuple: (dataset_id, sent_id, sentence, word, position, sense).\n",
    "sense_data = []\n",
    "\n",
    "# Auto-incrementing id tables: looking up an unseen key assigns it the next integer id.\n",
    "words = defaultdict(lambda: len(words))\n",
    "senses = defaultdict(lambda: len(senses))\n",
    "sents = defaultdict(lambda: len(sents))\n",
    "\n",
    "# NOTE(security): pickle.load can execute arbitrary code while unpickling;\n",
    "# only run this on a file that comes from a trusted source.\n",
    "with open(\"../sent_data/all_sent_data.pkl\", \"rb\") as f:\n",
    "    semeval = pickle.load(f)\n",
    "\n",
    "for dataset_id, orig_sent_id, sentence, word, position, sense in semeval:\n",
    "    # The sent_id stored in the pickle is replaced by our own running id. The sentence\n",
    "    # is registered before the None check so sent_id numbering stays the same as before.\n",
    "    sent_id = sents[sentence]\n",
    "    if sense is None:\n",
    "        # Unlabelled instance: skip it without registering its word/sense, so that\n",
    "        # len(words) and len(senses) only count labelled data (None is not a sense).\n",
    "        continue\n",
    "    words[word]    # lookup registers the word in the id table\n",
    "    senses[sense]  # lookup registers the sense in the id table\n",
    "    sense_data.append((dataset_id, sent_id, sentence, word, position, sense))\n",
    "\n",
    "# newline=\"\" is what the csv module expects for files it reads, so that\n",
    "# quoted fields containing line breaks are parsed correctly.\n",
    "with open(\"../data/multi_sense_nva.csv\", \"r\", newline=\"\") as f:\n",
    "    for row in csv.DictReader(f):\n",
    "        # Rows tagged NNP or RB are excluded.\n",
    "        if row['pos'] in ('NNP', 'RB'):\n",
    "            continue\n",
    "        context = row['context']\n",
    "        # Ensure the sentence ends with a period token. endswith() is also safe\n",
    "        # on an empty context, where indexing [-1] would raise IndexError.\n",
    "        sentence = context if context.endswith(\".\") else context + \" .\"\n",
    "        sent_id = sents[sentence]\n",
    "        words[row['word']]\n",
    "        senses[row['lex_sense']]\n",
    "        sense_data.append((\"semcor\", sent_id, sentence, row['word'], int(row['index']), row['lex_sense']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of instances collected from both sources, before any filtering.\n",
    "len(sense_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sense label of every collected instance (the last field of each record).\n",
    "# Reading the field directly avoids transposing all six columns just to keep one,\n",
    "# and yields an empty list instead of an IndexError when sense_data is empty.\n",
    "senses_total = [record[-1] for record in sense_data]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tally how many collected instances carry each sense label.\n",
    "sense_counts = Counter(senses_total)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Senses that are attested by exactly one instance.\n",
    "single_sense = [\n",
    "    sense_label\n",
    "    for sense_label, occurrences in sense_counts.items()\n",
    "    if occurrences == 1\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Peek at the first record to check the tuple layout:\n",
    "# (dataset_id, sent_id, sentence, word, position, sense).\n",
    "sense_data[:1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the instances whose sense occurs more than once in the collected data.\n",
    "# `single_sense` is a list, so testing membership against it directly rescans the\n",
    "# list for every instance; a set makes each lookup constant-time.\n",
    "singleton_senses = set(single_sense)\n",
    "sense_data_final = [\n",
    "    record for record in sense_data\n",
    "    if record[-1] not in singleton_senses\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summarise the filtered data. The unique counts are derived from sense_data_final\n",
    "# itself so that they describe the same instances as the reported total; the id\n",
    "# tables built during collection also contain entries that were filtered out.\n",
    "# Record layout: (dataset_id, sent_id, sentence, word, position, sense).\n",
    "final_sents = {record[2] for record in sense_data_final}\n",
    "final_words = {record[3] for record in sense_data_final}\n",
    "final_senses = {record[5] for record in sense_data_final}\n",
    "print(\n",
    "    f\"{len(sense_data_final)} total instances with {len(final_senses)} unique senses \"\n",
    "    f\"and {len(final_words)} unique words and {len(final_sents)} unique sentences\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): hard-coded literals. Presumably these were copied from a previous\n",
    "# run's summary output (an instance count divided by a unique-item count) - confirm,\n",
    "# and prefer computing the ratio from the variables so it cannot go stale.\n",
    "181768/16528"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the filtered instances to CSV, one row per instance, with a header row.\n",
    "# newline=\"\" hands line-ending control to the csv module; without it, platforms\n",
    "# that translate newlines on write end up with a blank line after every row.\n",
    "with open(\"../data/sense_metadata.csv\", \"w\", newline=\"\") as f:\n",
    "    writer = csv.writer(f)\n",
    "    writer.writerow([\"dataset_id\", \"sent_id\", \"sentence\", \"word\", \"position\", \"sense\"])\n",
    "    writer.writerows(sense_data_final)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}