{
 "cells": [
  {
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "from sklearn.preprocessing import minmax_scale\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "from tqdm.notebook import tqdm_notebook as tqdm"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "96838b3f9b81474f",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "source": [
    "# Read the combined ratings table. Later cells rely on its `data`,\n",
    "# `word_1`, `word_2` and `value` columns.\n",
    "raw_csv = '../../data/raw/sim_rel_combined.csv'\n",
    "sims_long = pd.read_csv(raw_csv)\n",
    "sims_long"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "e6aa9c730960ac95",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Distinct labels in the `data` column (one per source dataset)\n",
    "pd.unique(sims_long['data'])"
   ],
   "id": "2c7d61a7e77ab908",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "source": [
    "# Value range of each dataset, to see which rating scales are in use.\n",
    "# Only `value` is aggregated: min/max of the word columns is merely\n",
    "# lexicographic order and says nothing about the scale.\n",
    "sims_long.groupby('data')['value'].agg(['min', 'max'])"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "7dca59e694eec09f",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "source": [
    "# Rescale the ratings of each dataset separately with minmax_scale\n",
    "# (default output range), so that datasets rated on different scales\n",
    "# become comparable. The scaled values overwrite `value` in place.\n",
    "dataset_names = sims_long['data'].unique()\n",
    "for name in tqdm(dataset_names):\n",
    "    in_dataset = sims_long['data'].eq(name)\n",
    "    raw_scores = sims_long.loc[in_dataset, 'value']\n",
    "    sims_long.loc[in_dataset, 'value'] = minmax_scale(raw_scores)\n",
    "\n",
    "sims_long"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "c982335baea4aad2",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "source": [
    "# Put every pair into a canonical (lexicographic) order so that ratings\n",
    "# for (a, b) and (b, a) fall into the same group when aggregating\n",
    "words = sims_long[['word_1', 'word_2']]\n",
    "sims_pairs = pd.DataFrame({\n",
    "    'word_a': words.min(axis=1),\n",
    "    'word_b': words.max(axis=1),\n",
    "    'value': sims_long['value'],\n",
    "})\n",
    "\n",
    "# Average the scaled ratings of pairs that occur more than once.\n",
    "# The result gets its own name (`sims_pairs`) instead of overwriting\n",
    "# `sims_long`, so re-running this cell no longer fails with a KeyError\n",
    "# on `word_1` / `word_2`.\n",
    "sims_pairs = sims_pairs.groupby(['word_a', 'word_b'], as_index=False).mean()\n",
    "\n",
    "# Vocabulary in order of first appearance; it fixes the row/column order\n",
    "voc = pd.concat([sims_pairs['word_a'], sims_pairs['word_b']]).unique()\n",
    "\n",
    "# Pivot to a symmetric word x word matrix. Mirroring the pairs and\n",
    "# pivoting once replaces two scalar `.loc` writes per row and yields a\n",
    "# float matrix instead of an object-dtype one. Self-pairs (a == b) appear\n",
    "# twice after mirroring, hence the drop_duplicates before the pivot.\n",
    "mirrored = sims_pairs.rename(columns={'word_a': 'word_b', 'word_b': 'word_a'})\n",
    "sims_wide = (\n",
    "    pd.concat([sims_pairs, mirrored], ignore_index=True)\n",
    "    .drop_duplicates(subset=['word_a', 'word_b'])\n",
    "    .pivot(index='word_a', columns='word_b', values='value')\n",
    "    .reindex(index=voc, columns=voc)\n",
    "    .rename_axis(index=None, columns=None)\n",
    ")\n",
    "\n",
    "# Every vocabulary word must be both a row and a column\n",
    "assert sims_wide.shape == (len(voc), len(voc))\n",
    "sims_wide.shape"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "7921963793cb0f54",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "source": [
    "# Minimum number of non-missing values a column needs to be kept\n",
    "MIN_OBSERVED = 5\n",
    "\n",
    "sims_wide = (\n",
    "    sims_wide\n",
    "    # Drop columns with fewer than MIN_OBSERVED values (rows are all kept)\n",
    "    .dropna(thresh=MIN_OBSERVED, axis=1)\n",
    "    # Make the dtype explicitly numeric so that fillna and the SVD work on\n",
    "    # float64 rather than relying on implicit object-dtype conversion\n",
    "    .astype(float)\n",
    "    # Remaining missing pairs are filled with 0.0\n",
    "    .fillna(0.0)\n",
    ")\n",
    "sims_wide.shape"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "f819fd7261d36ebc",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "source": [
    "# Truncated SVD of the filled matrix: every row of `sims_wide` is reduced\n",
    "# to 300 components, and the result keeps the row labels as its index\n",
    "svd = TruncatedSVD(n_components=300, algorithm='arpack', random_state=42)\n",
    "word_vectors = svd.fit_transform(sims_wide)\n",
    "sims = pd.DataFrame(data=word_vectors, index=sims_wide.index)\n",
    "sims"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "e404e0584bacaedd",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "source": [
    "# Write the SVD output frame to disk as a pickle\n",
    "out_path = '../../data/processed/SVD_sim_rel.pkl'\n",
    "sims.to_pickle(out_path)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "261747948707a7ab",
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
