{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dataset_filter(ratings, min_items=5):\n",
    "    \"\"\"Keep only the ratings that are useful for training.\n",
    "\n",
    "    A row survives when both of the following hold:\n",
    "        - its rating is strictly positive\n",
    "        - its user still has at least ``min_items`` rows once the\n",
    "          non-positive ratings have been removed\n",
    "\n",
    "    :param ratings: pd.DataFrame with at least the columns 'uid' and 'rating'\n",
    "    :param min_items: minimum number of positively rated rows a user needs to be kept\n",
    "    :return: filter_ratings: pd.DataFrame, a new frame with a fresh 0..n-1 index;\n",
    "        the frame passed in is not modified\n",
    "    \"\"\"\n",
    "    # Drop rows whose rating is zero or negative.\n",
    "    positive_ratings = ratings[ratings['rating'] > 0]\n",
    "\n",
    "    # Count the remaining rows per user and keep the users that reach the threshold.\n",
    "    user_count = positive_ratings.groupby('uid').size()\n",
    "    active_users = user_count[user_count >= min_items].index\n",
    "\n",
    "    # Series.isin replaces np.in1d, which NumPy has deprecated.\n",
    "    user_subset = positive_ratings['uid'].isin(active_users)\n",
    "    filter_ratings = positive_ratings[user_subset].reset_index(drop=True)\n",
    "\n",
    "    return filter_ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_348946/3566252918.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
      "  ratings = pd.read_csv('./ratings.txt', sep='::', header=None, names=['uid', 'mid', 'rating'])\n"
     ]
    }
   ],
   "source": [
    "# NOTE: pandas (pd) and numpy (np) are imported in the first cell of the notebook.\n",
    "\n",
    "# '::' is a multi-character separator, which only the python parser engine supports.\n",
    "# Naming the engine explicitly avoids the ParserWarning about the silent fallback.\n",
    "ratings = pd.read_csv(\n",
    "    './ratings.txt',\n",
    "    sep='::',\n",
    "    header=None,\n",
    "    names=['uid', 'mid', 'rating'],\n",
    "    engine='python',\n",
    ")\n",
    "\n",
    "# Only uid, mid and rating are read, so build a surrogate 'timestamp': every distinct\n",
    "# item gets the position of its first appearance in the file (0, 1, 2, ...).\n",
    "rank = ratings[['mid']].drop_duplicates().reset_index(drop=True)\n",
    "rank = rank.assign(timestamp=np.arange(len(rank)))\n",
    "\n",
    "# Left merge keeps every rating row and attaches the per-item surrogate timestamp.\n",
    "ratings = pd.merge(ratings, rank, on=['mid'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: this cell consumes the raw 'uid' / 'mid' columns and replaces `ratings` with\n",
    "# the re-indexed frame, so re-run the loading cell before running this one again.\n",
    "ratings = dataset_filter(ratings, min_items=10)\n",
    "\n",
    "# Reindex user id and item id: each distinct raw id is mapped to a dense\n",
    "# 0-based integer, numbered in order of first appearance.\n",
    "user_id = ratings[['uid']].drop_duplicates().reset_index(drop=True)\n",
    "user_id = user_id.assign(userId=np.arange(len(user_id)))\n",
    "ratings = pd.merge(ratings, user_id, on=['uid'], how='left')\n",
    "\n",
    "item_id = ratings[['mid']].drop_duplicates().reset_index(drop=True)\n",
    "item_id = item_id.assign(itemId=np.arange(len(item_id)))\n",
    "ratings = pd.merge(ratings, item_id, on=['mid'], how='left')\n",
    "\n",
    "# Keep only the model-facing columns ('timestamp' was attached in the loading cell).\n",
    "# A stable sort keeps the rows of each user in their original relative order;\n",
    "# the default quicksort gives no such guarantee.\n",
    "ratings = (\n",
    "    ratings[['userId', 'itemId', 'rating', 'timestamp']]\n",
    "    .sort_values(by='userId', ascending=True, kind='stable')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the processed ratings as comma-separated rows, without header or index column.\n",
    "ratings.to_csv(\n",
    "    './ratings.dat',\n",
    "    sep=',',\n",
    "    header=False,\n",
    "    index=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the exported file back, supplying the column names explicitly.\n",
    "df = pd.read_csv(\n",
    "    './ratings.dat',\n",
    "    names=['userId', 'itemId', 'rating', 'timestamp'],\n",
    "    sep=',',\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "fcrec",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
