{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.io import loadmat\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset name; later cells format it into the 'data/{}' directory used for the cache and outputs.\n",
    "dataset = 'epinions'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load user-item ratings with timestamps\n",
    "\n",
    "mat = loadmat('data/epinions/rating_with_snapshot.mat')  # load mat-file\n",
    "mdata = mat['rating']  # variable in mat file\n",
    "df = pd.DataFrame(mdata)\n",
    "df.columns = ['user','item','catID','score','helpfulness', 'snapshot']\n",
    "\n",
    "df = df.sort_values(by=['user'])\n",
    "\n",
    "# Category table: one row per category id with its name\n",
    "cats = pd.read_csv('data/epinions/catalog_epinion.txt', sep='\\t', header=None)\n",
    "cats.columns = ['catID','name']\n",
    "\n",
    "# Include only ratings >= 3\n",
    "print (\"# ratings \", len(df))\n",
    "df = df[df['score']>=3]\n",
    "\n",
    "print (\"# ratings after filter (rating >=3) \", len(df))\n",
    "\n",
    "print (\"# items per user\", df.groupby('user').count().item.describe())\n",
    "print (\"# users per item\", df.groupby('item').count().user.describe())\n",
    "\n",
    "# Keep one row per (user, item) pair. drop_duplicates returns a new frame here instead of\n",
    "# mutating the score-filtered slice in place through an alias; df and df_filter still end up\n",
    "# referring to the same de-duplicated frame, as before.\n",
    "df = df.drop_duplicates(keep='first', subset=['user', 'item'])\n",
    "df_filter = df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter (df, n_core=5):\n",
    "    \"\"\"Reduce `df` to its n-core: every remaining user and item has >= n_core rows.\n",
    "\n",
    "    Duplicate (user, item) rows are dropped first (first occurrence kept). Users and items\n",
    "    with fewer than `n_core` rows are then removed in alternating passes, because removing\n",
    "    items can push a user back below the threshold. The user/item counts are printed before\n",
    "    filtering and after every pass.\n",
    "\n",
    "    Note: the name shadows the builtin `filter`; it is kept because other cells call it.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with at least 'user' and 'item' columns.\n",
    "        n_core: minimum number of rows each surviving user and item must have.\n",
    "\n",
    "    Returns:\n",
    "        The filtered DataFrame with its original row labels; empty if nothing survives.\n",
    "    \"\"\"\n",
    "    df_filter = df.drop_duplicates(keep='first', subset=['user', 'item'])\n",
    "    print('# users:{}, # items:{}'.format(df_filter.user.nunique(), df_filter.item.nunique()))\n",
    "    \n",
    "    while True: \n",
    "        # Per-row group sizes computed vectorised instead of one Python lambda call per group.\n",
    "        user_sizes = df_filter['user'].map(df_filter['user'].value_counts())\n",
    "        df_filter = df_filter[user_sizes >= n_core]\n",
    "        item_sizes = df_filter['item'].map(df_filter['item'].value_counts())\n",
    "        df_filter = df_filter[item_sizes >= n_core]\n",
    "        print('# users:{}, # items:{}'.format(df_filter.user.nunique(), df_filter.item.nunique()))\n",
    "        # An empty frame must stop the loop: min() of an empty Series is NaN and NaN >= n_core\n",
    "        # is never True, so the size check alone would spin forever.\n",
    "        if df_filter.empty:\n",
    "            break\n",
    "        # Items all satisfy the threshold right after the item pass; only users can have dropped.\n",
    "        if df_filter['user'].value_counts().min() >= n_core:\n",
    "            break\n",
    "            \n",
    "    return df_filter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cache of the n-core filtered ratings, so the filtering only runs when no cache exists.\n",
    "raw_path = os.path.join('data/{}'.format(dataset), 'raw.csv')\n",
    "if not os.path.exists(raw_path):\n",
    "    # `df` is the score-filtered, de-duplicated ratings frame built by the loading cell\n",
    "    # (the previously referenced `raw_df` is not defined anywhere in this notebook).\n",
    "    df_filter = filter(df)\n",
    "    df_filter.to_csv(raw_path)\n",
    "else:\n",
    "    # The cache is written with its index, so read the first column back as the index;\n",
    "    # otherwise the cached path carries an extra 'Unnamed: 0' column the fresh path lacks.\n",
    "    df_filter = pd.read_csv(raw_path, index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct users and items left after filtering, in the same format filter() prints.\n",
    "print('# users:{}, # items:{}'.format(df_filter.user.nunique(), df_filter.item.nunique()))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview: the leading rows of every item group (GroupBy.head with its default row count).\n",
    "df_filter.groupby('item').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _build_id_map(df, key, id_col):\n",
    "    \"\"\"Return a frame mapping each distinct `key` value of `df` to a dense 0-based `id_col`.\"\"\"\n",
    "    if id_col in df.columns:\n",
    "        # Refuse to overwrite an existing id column rather than letting merge add suffixes.\n",
    "        raise ValueError('cannot insert {}, already exists'.format(id_col))\n",
    "    # groupby sorts its keys, so ids are assigned in ascending key order.\n",
    "    keys = df.groupby(key).size().index\n",
    "    return pd.DataFrame({key: keys.values, id_col: list(range(len(keys)))}, columns=[key, id_col])\n",
    "\n",
    "\n",
    "def mapping(df_filter):\n",
    "    \"\"\"Attach contiguous integer ids for users and items.\n",
    "\n",
    "    Args:\n",
    "        df_filter: DataFrame with 'user' and 'item' columns and no 'userID'/'itemID' column.\n",
    "\n",
    "    Returns:\n",
    "        (df_mapped, user_map, item_map): the input rows with 'userID' and 'itemID' columns\n",
    "        appended (row order kept, index reset by the merges), plus the two lookup frames with\n",
    "        columns ['user', 'userID'] and ['item', 'itemID'].\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if 'userID' or 'itemID' already exists in `df_filter`.\n",
    "    \"\"\"\n",
    "    user_map = _build_id_map(df_filter, 'user', 'userID')\n",
    "    df_filter = df_filter.merge(user_map, how='left', on='user')\n",
    "    \n",
    "    item_map = _build_id_map(df_filter, 'item', 'itemID')\n",
    "    df_filter = df_filter.merge(item_map, how='left', on='item')\n",
    "    \n",
    "    return df_filter, user_map, item_map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add the userID/itemID columns and keep the two lookup tables that mapping() returns.\n",
    "df_mapped, user_map, item_map = mapping(df_filter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of distinct mapped user and item ids.\n",
    "n_users, n_items = (df_mapped[id_col].nunique() for id_col in ('userID', 'itemID'))\n",
    "n_feature = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seeded generator so the half/half user and item split is identical on every run.\n",
    "SPLIT_SEED = 0\n",
    "split_rng = np.random.RandomState(SPLIT_SEED)\n",
    "# Sample half of the user ids and half of the item ids (without replacement) for training.\n",
    "train_users = split_rng.choice(n_users, n_users//2, replace=False)\n",
    "train_items = split_rng.choice(n_items, n_items//2, replace=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training rows: the user AND the item must both be in the sampled training ids.\n",
    "train_df = df_mapped[\n",
    "    df_mapped['userID'].isin(train_users) & df_mapped['itemID'].isin(train_items)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Number of rows in the training split.\n",
    "len(train_df)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of rows in the full mapped frame, for comparison with the split size.\n",
    "len(df_mapped)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test rows: neither the user nor the item was sampled into the training ids.\n",
    "test_df = df_mapped[~df_mapped['userID'].isin(train_users)]\n",
    "test_df = test_df[~test_df['itemID'].isin(train_items)]\n",
    "test_df.reset_index(drop=True, inplace=True)\n",
    "# NOTE(review): sample_neg is not defined or imported in any cell of this notebook, so this\n",
    "# line raises NameError on a fresh kernel. Presumably it adds sampled negative items to the\n",
    "# test rows -- TODO restore its definition/import and confirm.\n",
    "test_df = sample_neg(test_df, df_mapped)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory that receives the split CSV files.\n",
    "output_path = 'data/{}'.format(dataset)\n",
    "# makedirs also creates a missing parent 'data' directory, and exist_ok avoids the\n",
    "# check-then-create race of an explicit os.path.exists test.\n",
    "os.makedirs(output_path, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write each split to its CSV file under output_path.\n",
    "for file_name, split_df in (('train_half.csv', train_df), ('test_half.csv', test_df)):\n",
    "    split_df.to_csv(os.path.join(output_path, file_name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
