{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "23181"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the comma-separated ratings file; header=None means the first line is read as data,\n",
    "# so the four column names are supplied explicitly.\n",
    "# NOTE(review): './ratings.dat' is also the path the export cell near the end of this notebook\n",
    "# writes to, so a top-to-bottom run on a fresh checkout presumably needs the file to exist\n",
    "# already -- confirm the intended cell order.\n",
    "df = pd.read_csv(\n",
    "    './ratings.dat',\n",
    "    names=['uid', 'mid', 'rating', 'timestamp'],\n",
    "    header=None,\n",
    "    sep=',',\n",
    ")\n",
    "\n",
    "# Number of rows that were read.\n",
    "df.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dataset_filter(ratings, min_items=5):\n",
    "    \"\"\"Keep only positively rated rows that belong to sufficiently active users.\n",
    "\n",
    "    Rows whose ``rating`` is not strictly greater than 0 are dropped first; a user\n",
    "    is then kept only if at least ``min_items`` of their rows remain. The caller's\n",
    "    frame is left untouched.\n",
    "\n",
    "    :param ratings: pd.DataFrame with at least the columns ``uid`` and ``rating``\n",
    "    :param min_items: minimum number of remaining rows a user needs in order to be kept\n",
    "    :return: pd.DataFrame holding the surviving rows, re-indexed from 0\n",
    "    \"\"\"\n",
    "    positive = ratings[ratings['rating'] > 0]\n",
    "\n",
    "    # Count per user after the rating filter, so dropped rows do not count toward min_items.\n",
    "    user_count = positive.groupby('uid').size()\n",
    "    active_users = user_count[user_count >= min_items].index\n",
    "\n",
    "    # Series.isin replaces np.in1d, which NumPy deprecated in 2.0.\n",
    "    keep = positive['uid'].isin(active_users)\n",
    "    return positive[keep].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the raw comma-separated edge list with pandas' pure-Python parser.\n",
    "# header=None means the first line is read as data, so column names are given here.\n",
    "df = pd.read_csv(\n",
    "    './amazon.edges',\n",
    "    names=['uid', 'mid', 'rating', 'timestamp'],\n",
    "    header=None,\n",
    "    sep=',',\n",
    "    engine='python',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the filter with a threshold of 10 rows per user instead of the function's default of 5.\n",
    "ratings = dataset_filter(df, min_items=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Give every distinct raw user id a consecutive integer, numbered in order of first appearance,\n",
    "# then attach that integer to each rating row.\n",
    "user_id = ratings[['uid']].drop_duplicates().assign(userId=lambda frame: np.arange(len(frame)))\n",
    "ratings = pd.merge(ratings, user_id, how='left', on='uid')\n",
    "\n",
    "# Same remapping for the raw item ids.\n",
    "item_id = ratings[['mid']].drop_duplicates().assign(itemId=lambda frame: np.arange(len(frame)))\n",
    "ratings = pd.merge(ratings, item_id, how='left', on='mid')\n",
    "\n",
    "# Keep only the remapped ids plus rating and timestamp, ordered by user.\n",
    "ratings = ratings[['userId', 'itemId', 'rating', 'timestamp']]\n",
    "ratings = ratings.sort_values('userId')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the remapped ratings as bare comma-separated rows: no header line, no index column.\n",
    "ratings.to_csv(\n",
    "    './ratings.dat',\n",
    "    header=False,\n",
    "    index=False,\n",
    "    sep=',',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7957"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct item ids present in the ratings frame.\n",
    "distinct_items = set(ratings['itemId'])\n",
    "len(distinct_items)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "fcrec",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
