{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import json\n",
    "from tqdm.notebook import tqdm_notebook as tqdm"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Load the dictionary that lists the embedding names for each dtype\n",
    "with open('../../data/dtype_to_embed.json', 'r') as f:\n",
    "    dtype_to_embed = json.load(f)\n",
    "\n",
    "brain_behav_names = dtype_to_embed['brain'] + dtype_to_embed['behavior']\n",
    "\n",
    "# Accumulate every index label found in any brain or behavior embedding file\n",
    "embeds_path = '../../data/embeds/'\n",
    "brain_behav_union = set()\n",
    "for name in tqdm(brain_behav_names):\n",
    "    vocab_index = pd.read_csv(embeds_path + name + '.csv', index_col=0).index\n",
    "    brain_behav_union.update(vocab_index)\n",
    "\n",
    "len(brain_behav_union)"
   ],
   "id": "a3b60c2427a14f6f",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# For every embedding file, count how many of its index labels fall inside\n",
    "# the brain/behavior vocabulary union.\n",
    "embed_path = '../../data/embeds/'\n",
    "\n",
    "# Keep only '.csv' entries (skips hidden files and sub-directories) and sort\n",
    "# them, because os.listdir returns entries in arbitrary order\n",
    "csv_files = sorted(\n",
    "    f_name for f_name in os.listdir(embed_path)\n",
    "    if os.path.splitext(f_name)[1] == '.csv'\n",
    ")\n",
    "\n",
    "count_rows = []\n",
    "for f_name in tqdm(csv_files):\n",
    "    # Strip only the extension so names containing a dot stay intact\n",
    "    embed_name = os.path.splitext(f_name)[0]\n",
    "    embed = pd.read_csv(embed_path + f_name, index_col=0)\n",
    "\n",
    "    # Number of rows whose index label is in the brain/behavior union\n",
    "    n_in_union = int(embed.index.isin(brain_behav_union).sum())\n",
    "    count_rows.append([embed_name, n_in_union])\n",
    "\n",
    "embed_counts = pd.DataFrame(count_rows, columns=['name', 'count'])\n",
    "embed_counts"
   ],
   "id": "adbf1c21ac762c8b",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Put the counts on a log10 scale and order the rows from largest to smallest.\n",
    "# NOTE: the result is written back to embed_counts, so running this cell a\n",
    "# second time applies the log transform again.\n",
    "embed_counts = (\n",
    "    embed_counts\n",
    "    .assign(count=np.log10(embed_counts['count']))\n",
    "    .sort_values(by='count', ascending=False)\n",
    ")\n",
    "embed_counts"
   ],
   "id": "6ed0d30423f503a9",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Load the dictionary that gives the dtype of each embedding name\n",
    "with open('../../data/embed_to_dtype.json', 'r') as f:\n",
    "    embed_to_dtype = json.load(f)\n",
    "\n",
    "# Look up the dtype first, while the names still match the dictionary keys\n",
    "embed_counts['dtype'] = embed_counts['name'].map(embed_to_dtype)\n",
    "\n",
    "# Expand the abbreviated names, then swap underscores for spaces\n",
    "rename_dict = {'SVD_sim_rel': 'SVD_similarity_relatedness', 'compo_attribs': 'experiential_attributes'}\n",
    "display_names = embed_counts['name'].replace(rename_dict).str.replace('_', ' ')\n",
    "embed_counts['name'] = display_names\n",
    "embed_counts"
   ],
   "id": "f8ce7876510a9632",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Loading psychNorms data and its metadata table\n",
    "norms = pd.read_csv('../../data/psychNorms/psychNorms.zip', compression='zip', index_col=0, low_memory=False)\n",
    "meta = pd.read_csv('../../data/psychNorms/psychNorms_metadata.csv', index_col=0)\n",
    "\n",
    "# Non-missing entries per norm column, largest first\n",
    "norm_counts = norms.count().sort_values(ascending=False).reset_index()\n",
    "norm_counts.columns = ['name', 'count']\n",
    "\n",
    "# Category label for each norm, looked up in the metadata by norm name and\n",
    "# reformatted to title case with spaces\n",
    "norm_counts['category'] = (\n",
    "    norm_counts['name'].map(meta['category'])\n",
    "    .str.replace('_', ' ').str.title()\n",
    "    # Lower-case only the standalone word 'Of'; the word boundaries keep\n",
    "    # words that merely start with 'Of' capitalised\n",
    "    .str.replace(r'\\bOf\\b', 'of', regex=True)\n",
    ")\n",
    "\n",
    "# Log transforming the counts\n",
    "norm_counts['count'] = np.log10(norm_counts['count'])\n",
    "norm_counts"
   ],
   "id": "505b6d71d5bc8b1",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Four-panel figure of log10 vocabulary sizes: one panel per embedding dtype\n",
    "# (text, behavior, brain) and one for the psychNorms categories.\n",
    "with open('../../data/dtype_to_embed.json', 'r') as f:\n",
    "    dtype_to_embed = json.load(f)\n",
    "\n",
    "# Embedding panels in left-to-right order: dtype -> panel title\n",
    "panel_titles = {'text': 'Text', 'behavior': 'Behavior', 'brain': 'Brain'}\n",
    "\n",
    "# Panel widths follow the number of embeddings listed per dtype, plus the\n",
    "# number of distinct norm categories for the last panel\n",
    "width_ratios = [len(dtype_to_embed[dtype]) for dtype in panel_titles] + [len(norm_counts['category'].unique())]\n",
    "\n",
    "# Colors \n",
    "cmap = plt.get_cmap('viridis', 4)\n",
    "embed_type_to_color = {\n",
    "    'brain': cmap(1),\n",
    "    'behavior': cmap(0),\n",
    "    'text': cmap(2)\n",
    "}\n",
    "\n",
    "# The plotted counts are log10 values, so tick i is labelled 10^i\n",
    "ymin, ymax = 2, 6\n",
    "yticks = range(ymin, ymax+1)\n",
    "first_panel_settings = dict(\n",
    "    ylim=(ymin, ymax), yticks=yticks,\n",
    "    # The braces group the exponent so multi-digit exponents also render correctly\n",
    "    yticklabels=[f'$10^{{{i}}}$' for i in yticks],\n",
    "    ylabel='Vocabulary Size'\n",
    ")\n",
    "\n",
    "fig, axs = plt.subplots(1, 4, figsize=(12, 4), width_ratios=width_ratios, sharey=True)\n",
    "\n",
    "# Embedding panels; only the first one carries the y-axis limits, ticks and label\n",
    "for ax, (dtype, title) in zip(axs, panel_titles.items()):\n",
    "    sns.scatterplot(\n",
    "        embed_counts[embed_counts['dtype'] == dtype], x='name', y='count',\n",
    "        color=embed_type_to_color[dtype], ax=ax\n",
    "    )\n",
    "    extra_settings = first_panel_settings if ax is axs[0] else {}\n",
    "    ax.set(**extra_settings, title=title)\n",
    "\n",
    "# Norms\n",
    "sns.scatterplot(\n",
    "    norm_counts, x='category', y='count', hue='category', palette='viridis', legend=False, alpha=0.7,\n",
    "    ax=axs[3]\n",
    ")\n",
    "axs[3].set(title='psychNorms', yticks=yticks, ylabel='Vocabulary Size')\n",
    "\n",
    "for ax in axs:\n",
    "    # rotate x-axis labels\n",
    "    ax.tick_params(axis='x', rotation=90, labelsize=10)\n",
    "    ax.set(xlabel='')\n",
    "\n",
    "sns.despine(offset=5, trim=True)\n",
    "plt.tight_layout()\n",
    "\n",
    "# Create the output directory first so savefig does not fail when it is missing\n",
    "fig_path = '../../figures/vocab_sizes.png'\n",
    "os.makedirs(os.path.dirname(fig_path), exist_ok=True)\n",
    "plt.savefig(fig_path, dpi=300)"
   ],
   "id": "ffbb3c17dc87d5cf",
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
