{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "eafc20bb",
   "metadata": {},
   "source": [
    "## Create your own lexicons based on user requirements\n",
    "\n",
    "\n",
    "There are two ways to create your own lexicons:\n",
    "\n",
    "- **Train your own word2vec models.** In this case, first run the `train_word2vec_models.py` script.\n",
    "- **Use the pre-trained word2vec models** and choose a specific range of neighbors to determine a score for each word. In this case, run the cells below.\n",
    "\n",
    "Note that the lexicons are created based on the NRC VAD Lexicons V2, so you also need to download the NRC VAD Lexicons from [NRC](https://saifmohammad.com/WebPages/nrc-vad.html).\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "be0c22a8",
   "metadata": {},
   "source": [
    "### Load the pre-trained word2vec models\n",
    "\n",
    "We load all five models, one for each split of 50 time-intervals of Chronoberg.\n",
    "\n",
    "Before running the next cell, set the `path_to_slice_*_model` variables to the locations of the saved models."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b44866c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from gensim.models import Word2Vec\n",
    "\n",
    "# Load the five saved Word2Vec models in order, one per slice.\n",
    "# The path_to_slice_*_model names are not defined in this notebook;\n",
    "# they have to exist in the kernel before this cell is run.\n",
    "slice_one, slice_two, slice_three, slice_four, slice_five = map(\n",
    "    Word2Vec.load,\n",
    "    (\n",
    "        path_to_slice_one_model,\n",
    "        path_to_slice_two_model,\n",
    "        path_to_slice_three_model,\n",
    "        path_to_slice_four_model,\n",
    "        path_to_slice_five_model,\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c61f827",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "# NOTE(review): torch.load unpickles the file, which can execute arbitrary code.\n",
    "# Only load a lexicon file that you created yourself or otherwise trust.\n",
    "# path_to_NRC_VAD_Lexicon is not defined in this notebook and must be set first.\n",
    "data = torch.load(path_to_NRC_VAD_Lexicon)\n",
    "\n",
    "TOP_N_NEIGHBORS = 100      # nearest neighbours requested from each model\n",
    "MAX_SCORED_NEIGHBORS = 20  # stop once this many neighbours had a lexicon entry\n",
    "MISSING = 'None'           # placeholder stored when no score can be computed\n",
    "\n",
    "# Candidate probe-word sets. Only the LAST set is scored: in the previous\n",
    "# version of this cell every assignment to n_w overwrote the one before it.\n",
    "# Change the index below to score a different set.\n",
    "candidate_word_sets = [\n",
    "    ['enjoy', 'happiest', 'hugs', 'success'],\n",
    "    ['abundant', 'enjoy', 'hugs', 'jolly', 'laughter', 'liking', 'lucky', 'marvel', 'merry', 'respectful'],\n",
    "    ['shit', 'afraid', 'angered', 'annihilation','bankruptcy','betray','stabbed','strangulation','suicidal','chaos'],\n",
    "    ['asylum', 'coronary', 'depressive', 'germs', 'heartbreak', 'homeless', 'malfeasance', 'punk', 'sanctimonious', 'senile', 'unustified', 'weird'],\n",
    "    ['bloomers', 'destiny', 'dunk', 'febrile', 'infatuation', 'karma', 'outing','repertoire','sanitation','stockbroker','technology','tweak'],\n",
    "    ['redoubled', 'gayety', 'carelessness'],\n",
    "]\n",
    "n_w = candidate_word_sets[-1]\n",
    "positives = list(n_w)\n",
    "print(\"Number of words:\", len(positives))\n",
    "\n",
    "\n",
    "def neighbor_valence(model, word, lexicon, topn=TOP_N_NEIGHBORS, max_scored=MAX_SCORED_NEIGHBORS):\n",
    "    \"\"\"Score `word` by averaging the lexicon values of its nearest neighbours.\n",
    "\n",
    "    Args:\n",
    "        model: object exposing `wv.most_similar(word, topn=...)`, e.g. a\n",
    "            loaded gensim Word2Vec model.\n",
    "        word: the word whose neighbourhood is scored.\n",
    "        lexicon: mapping from word to a value accepted by `float()`\n",
    "            (presumably the NRC VAD valence score -- confirm against the file).\n",
    "        topn: how many nearest neighbours to request from the model.\n",
    "        max_scored: stop after this many neighbours were found in `lexicon`.\n",
    "\n",
    "    Returns:\n",
    "        The mean of the collected lexicon values rounded to 2 decimals, or the\n",
    "        string 'None' when `word` is unknown to the model or none of its\n",
    "        neighbours has a usable lexicon entry.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        neighbors = model.wv.most_similar(word, topn=topn)\n",
    "    except KeyError:\n",
    "        # gensim raises KeyError when the word is not in the model's vocabulary.\n",
    "        return MISSING\n",
    "\n",
    "    scores = []\n",
    "    for neighbor, _similarity in neighbors:\n",
    "        try:\n",
    "            scores.append(float(lexicon[neighbor]))\n",
    "        except (KeyError, TypeError, ValueError):\n",
    "            # Neighbour is missing from the lexicon or its entry is not numeric.\n",
    "            continue\n",
    "        if len(scores) == max_scored:\n",
    "            break\n",
    "\n",
    "    if not scores:\n",
    "        return MISSING\n",
    "    return round(np.mean(scores), 2)\n",
    "\n",
    "\n",
    "# One score per model for every word, in slice order (one .. five).\n",
    "slice_models = (slice_one, slice_two, slice_three, slice_four, slice_five)\n",
    "valence = {word: [] for word in positives}\n",
    "for word in tqdm(positives):\n",
    "    for model in slice_models:\n",
    "        valence[word].append(neighbor_valence(model, word, data))\n",
    "valence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50161485",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
