{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.io import loadmat"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Epinions dataset - link: https://www.cse.msu.edu/~tangjili/datasetcode/truststudy.htm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userID</th>\n",
       "      <th>itemID</th>\n",
       "      <th>catID</th>\n",
       "      <th>score</th>\n",
       "      <th>helpfulness</th>\n",
       "      <th>snapshot</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>11</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userID  itemID  catID  score  helpfulness  snapshot\n",
       "0       1       1      3      2            2         1\n",
       "9       1      10      3      1            3         1\n",
       "7       1       8      3      4            1         1\n",
       "6       1       7      3      1            1         1\n",
       "5       1       6     11      4            1         1"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load user-item ratings with timestamps\n",
    "\n",
    "mat = loadmat('rating_with_snapshot.mat')  # load mat-file\n",
    "mdata = mat['rating']  # variable in mat file\n",
    "df_time = pd.DataFrame(mdata)\n",
    "df_time.columns = ['userID','itemID','catID','score','helpfulness', 'snapshot']\n",
    "\n",
    "df_time.score.unique()\n",
    "df_time = df_time.sort_values(by=['userID'])\n",
    "df_time.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user1</th>\n",
       "      <th>user2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15373</td>\n",
       "      <td>9831</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4247</td>\n",
       "      <td>9831</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4644</td>\n",
       "      <td>9831</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>6823</td>\n",
       "      <td>9831</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7479</td>\n",
       "      <td>9831</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user1  user2\n",
       "0  15373   9831\n",
       "1   4247   9831\n",
       "2   4644   9831\n",
       "3   6823   9831\n",
       "4   7479   9831"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load social network\n",
    "\n",
    "mat = loadmat('trustnetwork.mat')  # load mat-file\n",
    "mdata = mat['trustnetwork']  # variable in mat file\n",
    "\n",
    "social_df = pd.DataFrame(mdata)\n",
    "social_df.columns = ['user1','user2']\n",
    "\n",
    "social_df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# ratings  922267\n",
      "# ratings after filter (rating >=3)  800493\n"
     ]
    }
   ],
   "source": [
    "# Include only ratings >= 3\n",
    "print (\"# ratings \", len(df_time))\n",
    "df_time = df_time[df_time['score']>=3]\n",
    "\n",
    "print (\"# ratings after filter (rating >=3) \", len(df_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# items per user count    22158.000000\n",
      "mean        36.126591\n",
      "std         99.650760\n",
      "min          1.000000\n",
      "25%         11.000000\n",
      "50%         17.000000\n",
      "75%         29.000000\n",
      "max       5248.000000\n",
      "Name: itemID, dtype: float64\n",
      "# users per item count    274124.000000\n",
      "mean          2.920186\n",
      "std          10.483347\n",
      "min           1.000000\n",
      "25%           1.000000\n",
      "50%           1.000000\n",
      "75%           2.000000\n",
      "max        1163.000000\n",
      "Name: userID, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "print (\"# items per user\", df_time.groupby('userID').count().itemID.describe())\n",
    "print (\"# users per item\", df_time.groupby('itemID').count().userID.describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(274124, 22158)"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df_time.itemID.unique()),len(df_time.userID.unique())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Intesection to retain only common users. \n",
    "\n",
    "df_time = df_time[df_time['userID'].isin(social_df.user1.unique())]\n",
    "df_time = df_time[df_time['userID'].isin(social_df.user2.unique())]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filter based on user threshold followed by item threshold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "user_per_item = 20\n",
    "item_per_user = 5\n",
    "\n",
    "df_time = df_time.groupby('userID').filter(lambda x: len(x) >= item_per_user)\n",
    "df_time = df_time.groupby('itemID').filter(lambda x: len(x) >= user_per_item)  #only use url which is bookmarked by at least 5 users\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# items 2859 # users 11823\n"
     ]
    }
   ],
   "source": [
    "print (\"# items\", len(df_time.itemID.unique()), \"# users\", len(df_time.userID.unique()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userID</th>\n",
       "      <th>new_userID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userID  new_userID\n",
       "0       2           0\n",
       "1       5           1\n",
       "2       7           2\n",
       "3       8           3\n",
       "4       9           4"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# map user to sequential id\n",
    "user_map = df_time.groupby('userID').count()\n",
    "user_map.insert(0, 'new_userID', range(len(user_map))) # insert a new column. \n",
    "user_map['userID'] = user_map.index\n",
    "user_map = user_map[[\"userID\", \"new_userID\"]]\n",
    "user_map.reset_index(drop=True, inplace=True)\n",
    "user_map.to_csv('user2id.txt',columns=['userID','new_userID'], header=None, index=False, sep='\\t', mode='w')\n",
    "\n",
    "# Add new column to the original ratings set with a join.\n",
    "df_time = df_time.merge(user_map,how='left',on='userID') # left-outer join - preseve keys from left table.\n",
    "user_map.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "cannot insert new_itemID, already exists",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-80-5d60688ead2d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mitem_map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_time\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'itemID'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mitem_map\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minsert\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'new_itemID'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem_map\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m \u001b[0mitem_map\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'itemID'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mitem_map\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0mitem_map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mitem_map\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"itemID\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"new_itemID\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/anaconda2/envs/tf3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36minsert\u001b[0;34m(self, loc, column, value, allow_duplicates)\u001b[0m\n\u001b[1;32m   3217\u001b[0m         \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sanitize_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbroadcast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3218\u001b[0m         self._data.insert(loc, column, value,\n\u001b[0;32m-> 3219\u001b[0;31m                           allow_duplicates=allow_duplicates)\n\u001b[0m\u001b[1;32m   3220\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3221\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0massign\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/anaconda2/envs/tf3/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36minsert\u001b[0;34m(self, loc, item, value, allow_duplicates)\u001b[0m\n\u001b[1;32m   4336\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mallow_duplicates\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mitem\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4337\u001b[0m             \u001b[0;31m# Should this be a different kind of error??\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4338\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'cannot insert {}, already exists'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4339\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4340\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: cannot insert new_itemID, already exists"
     ]
    }
   ],
   "source": [
    "#len(df_time)\n",
    "# map item to sequential id\n",
    "item_map = df_time.groupby('itemID').count()\n",
    "\n",
    "item_map.insert(0, 'new_itemID', range(len(item_map)))\n",
    "item_map['itemID'] = item_map.index\n",
    "item_map = item_map[[\"itemID\", \"new_itemID\"]]\n",
    "item_map.reset_index(drop=True, inplace=True)\n",
    "\n",
    "df_time = df_time.merge(item_map[['itemID','new_itemID']],how='left',on='itemID')\n",
    "\n",
    "#temp = df_time[['itemID','new_itemID']].drop_duplicates()\n",
    "\n",
    "item_map.to_csv('item2id.txt',columns=['itemID','new_itemID'], header=None, index=None, sep='\\t', mode='w')\n",
    "\n",
    "item_map.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user1</th>\n",
       "      <th>user2</th>\n",
       "      <th>new_user1_ID</th>\n",
       "      <th>new_user2_ID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15373</td>\n",
       "      <td>9831</td>\n",
       "      <td>8221</td>\n",
       "      <td>5257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4247</td>\n",
       "      <td>9831</td>\n",
       "      <td>2365</td>\n",
       "      <td>5257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6823</td>\n",
       "      <td>9831</td>\n",
       "      <td>3695</td>\n",
       "      <td>5257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7479</td>\n",
       "      <td>9831</td>\n",
       "      <td>4029</td>\n",
       "      <td>5257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16058</td>\n",
       "      <td>9831</td>\n",
       "      <td>8597</td>\n",
       "      <td>5257</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user1  user2  new_user1_ID  new_user2_ID\n",
       "0  15373   9831          8221          5257\n",
       "1   4247   9831          2365          5257\n",
       "2   6823   9831          3695          5257\n",
       "3   7479   9831          4029          5257\n",
       "4  16058   9831          8597          5257"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Filter social links based on final user set and remap. \n",
    "\n",
    "social_df = social_df[social_df['user1'].isin(df_time.userID)]\n",
    "social_df = social_df[social_df['user2'].isin(df_time.userID)]\n",
    "\n",
    "social_df = social_df.merge(user_map, how='inner',left_on='user1',right_on='userID')\n",
    "social_df[\"new_user1_ID\"] = social_df[\"new_userID\"]\n",
    "social_df = social_df.drop(columns = [\"userID\", \"new_userID\"])\n",
    "\n",
    "social_df = social_df.merge(user_map, how='inner',left_on='user2',right_on='userID')\n",
    "social_df[\"new_user2_ID\"] = social_df[\"new_userID\"]\n",
    "social_df = social_df.drop(columns = [\"userID\", \"new_userID\"])\n",
    "\n",
    "social_df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1,  2,  5,  3,  4,  6,  7,  9, 10,  8, 11])"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#df_time.snapshot.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# train 126030 # val 716 # test 2863\n"
     ]
    }
   ],
   "source": [
    "train_ratings = df_time[df_time.snapshot < 9]\n",
    "eval_ratings = df_time[df_time.snapshot >= 9]\n",
    "val_ratings = eval_ratings.sample(frac=0.2)\n",
    "test_ratings = eval_ratings.drop(val_ratings.index) # remove indices chosen in val\n",
    "print (\"# train\", len(train_ratings), \"# val\", len(val_ratings), \"# test\", len(test_ratings))\n",
    "assert len(train_ratings) + len(val_ratings) +  len(test_ratings) == len(df_time)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Output to csv files.\n",
    "dataset = 'epinions'\n",
    "data_dir = '../../data/{}'.format(dataset)\n",
    "\n",
    "social_df.to_csv('{}/{}.social.csv'.format(data_dir, dataset),columns=['new_user1_ID','new_user2_ID'],header=['user1','user2'],index=False)\n",
    "df_time.to_csv('{}/{}.ratings.all.csv'.format(data_dir,dataset), columns=['new_userID', 'new_itemID', 'snapshot'], header=['user', 'item', 'time'])\n",
    "train_ratings.to_csv('{}/{}.ratings.train.csv'.format(data_dir,dataset), columns=['new_userID', 'new_itemID', 'snapshot'], header=['user', 'item', 'time'])\n",
    "val_ratings.to_csv('{}/{}.ratings.val.csv'.format(data_dir, dataset), columns=['new_userID', 'new_itemID', 'snapshot'], header=['user', 'item', 'time'])\n",
    "test_ratings.to_csv('{}/{}.ratings.test.csv'.format(data_dir, dataset), columns=['new_userID', 'new_itemID', 'snapshot'], header=['user', 'item', 'time'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# url_map['niche'] = 1\n",
    "# url_map['niche'] = url_map.niche.where(url_map['userID']<=5, 0)\n",
    "# url_map.niche.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# df = df.merge(url_map[['urlID','niche']],how='left',on='urlID')\n",
    "# df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# train_test_split\n",
    "# each user has 40% urls in test and 60% urls in train\n",
    "# import numpy as np\n",
    "# train_u, validate_u, test_u = np.split(user_map.sample(frac=1), [int(.6*len(user_map)), int(.8*len(user_map))])\n",
    "# len(train_u),len(validate_u),len(test_u)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# train_csv = df[df['userID'].isin(train_u['userID'])]\n",
    "# train_csv.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# validate_csv = df[df['userID'].isin(validate_u['userID'])]\n",
    "# validate_csv.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# test_csv = df[df['userID'].isin(test_u['userID'])]\n",
    "# test_csv.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# gb = train_csv.groupby('userID')\n",
    "# train_tr = train_csv.groupby('userID').apply(lambda x: x.iloc[:int(x.userID.size/1.66)])\n",
    "# train_te = train_csv.groupby('userID').apply(lambda x: x.iloc[int(x.userID.size/1.66):])\n",
    "# validation_tr = validate_csv.groupby('userID').apply(lambda x: x.iloc[:int(x.userID.size/1.66)])\n",
    "# validation_te = validate_csv.groupby('userID').apply(lambda x: x.iloc[int(x.userID.size/1.66):])\n",
    "# test_tr = test_csv.groupby('userID').apply(lambda x: x.iloc[:int(x.userID.size/1.66)])\n",
    "# test_te = test_csv.groupby('userID').apply(lambda x: x.iloc[int(x.userID.size/1.66):])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# train_niche = train_csv[train_csv['niche'] == 1]\n",
    "# train_popular = train_csv[train_csv['niche'] == 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# train_niche.to_csv('train_GAN_niche.csv',columns=['new_userID','new_URLID'],header=['uid','sid'],index=False)\n",
    "# train_popular.to_csv('train_GAN_popular.csv',columns=['new_userID','new_URLID'],header=['uid','sid'],index=False)\n",
    "# train_csv.to_csv('train_GAN.csv',columns=['new_userID','new_URLID'],header=['uid','sid'],index=False)\n",
    "# validation_tr.to_csv('validation_tr.csv',columns=['new_userID','new_URLID'],header=['uid','sid'],index=False)\n",
    "# validation_te.to_csv('validation_te.csv',columns=['new_userID','new_URLID'],header=['uid','sid'],index=False)\n",
    "# test_tr.to_csv('test_tr.csv',columns=['new_userID','new_URLID'],header=['uid','sid'],index=False)\n",
    "# test_te.to_csv('test_te.csv',columns=['new_userID','new_URLID'],header=['uid','sid'],index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# train_csv.to_csv('item_counts.csv',columns=['userID','bookmarkID'],header=['userID','tagID'],index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# np.savetxt('niche_items.txt', train_niche.bookmarkID.unique(), fmt='%d')\n",
    "# np.savetxt('unique_item_id.txt', df.bookmarkID.unique(), fmt='%d')\n",
    "# np.savetxt('item_list.txt', df.bookmarkID.unique(), fmt='%d')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Change the output of the VAE from an item vector to a user-similarity vector, then multiply it with the user-item matrix to obtain a conventional item recommendation."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "user 1800\n",
    "item 2-3000\n",
    "link 8000\n",
    "\n",
    "common code: sample 200 users 200 items\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
