{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "from tqdm.notebook import tqdm_notebook as tqdm\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "from embeds import ppmi"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "source": [
    "# Load the thesaurus from JSON file (https://github.com/dariusk/ea-thesaurus)\n",
    "# JSON is UTF-8 by specification, so decode explicitly instead of relying on\n",
    "# the platform's default text encoding.\n",
    "with open('../../data/free_assoc/ea-thesaurus.json', encoding='utf-8') as json_file:\n",
    "    eat_dict = json.load(json_file)\n",
    "\n",
    "# Peek at the entry stored under key \"0\"\n",
    "eat_dict[\"0\"]"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "7c8e8d891ba41340",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Finding all unique cues and responses\n",
    "cues, resps = set(eat_dict), set()\n",
    "for values in tqdm(eat_dict.values()):\n",
    "    # Only the first key of each mapping is read as the response word\n",
    "    # (assumes one-item {response: count} dicts -- TODO confirm).\n",
    "    # update() grows the set in place; union() would copy it on every iteration.\n",
    "    resps.update(next(iter(resp_count)) for resp_count in values)\n",
    "\n",
    "# Initializing empty dataframe with cues and responses\n",
    "# Labels are sorted so the row/column layout is identical on every run\n",
    "# (iterating a set of strings has no stable order across kernel restarts).\n",
    "eat = pd.DataFrame(index=sorted(cues), columns=sorted(resps))\n",
    "eat.shape"
   ],
   "id": "495c583ce9b0e395",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Populating dataframe\n",
    "for key, values in tqdm(eat_dict.items()):\n",
    "    for resp_count in values:\n",
    "        # First (response, count) pair of the mapping\n",
    "        resp, count = next(iter(resp_count.items()))\n",
    "        # .at is the scalar accessor: same assignment as .loc without its per-call overhead\n",
    "        eat.at[key, resp] = count\n",
    "\n",
    "# Cast the filled table to float; cells that were never assigned stay NaN\n",
    "eat = eat.astype(float)\n",
    "eat"
   ],
   "id": "ec1433a56c5bbe71",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Re-weight the association table with the project's ppmi helper\n",
    "# (presumably positive pointwise mutual information -- confirm in embeds.ppmi),\n",
    "# then replace any NaN cells in its result with 0.0\n",
    "ppmi_eat = ppmi(eat).fillna(value=0.0)\n",
    "ppmi_eat"
   ],
   "id": "c9bee1b9c533ac4f",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Number of SVD dimensions to keep, and the seed handed to TruncatedSVD\n",
    "N_COMPONENTS = 300\n",
    "SVD_SEED = 42\n",
    "\n",
    "# A fixed random_state keeps the decomposition reproducible across runs\n",
    "svd = TruncatedSVD(n_components=N_COMPONENTS, algorithm='arpack', random_state=SVD_SEED)\n",
    "# Rows keep the labels of ppmi_eat, lower-cased\n",
    "ppmi_svd_eat = pd.DataFrame(\n",
    "    svd.fit_transform(ppmi_eat.values), index=ppmi_eat.index.str.lower()\n",
    ")\n",
    "ppmi_svd_eat"
   ],
   "id": "365f16ca9357f6e0",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# checking it worked\n",
    "def find_closest_k(word, k=5):\n",
    "    \"\"\"Return the k rows of ppmi_svd_eat scoring highest against the row for `word`.\n",
    "\n",
    "    Scores are raw (unnormalised) dot products, and the row for `word`\n",
    "    itself is among the candidates.\n",
    "    \"\"\"\n",
    "    scores = ppmi_svd_eat @ ppmi_svd_eat.loc[word]\n",
    "    return scores.nlargest(n=k)\n",
    "\n",
    "find_closest_k('dog')"
   ],
   "id": "7cd0f7c813cffea4",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Subsetting to only the words in psychNorms norms\n",
    "# Only the first column of the norms file is needed for the membership test,\n",
    "# so parse just that column instead of the whole table.\n",
    "norm_words = pd.read_csv(\n",
    "    '../../data/psychNorms/psychNorms.zip', usecols=[0], low_memory=False, compression='zip'\n",
    ").iloc[:, 0]\n",
    "to_pull = set(norm_words)\n",
    "ppmi_svd_eat = ppmi_svd_eat.loc[ppmi_svd_eat.index.isin(to_pull)].astype(float)\n",
    "\n",
    "ppmi_svd_eat.to_csv('../../data/embeds/PPMI_SVD_EAT.csv')"
   ],
   "id": "826e982401c4de30",
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
