{
 "cells": [
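  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download the data from [Netflix Prize](https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data) and put it in the `data/netflix_raw` directory. This notebook reads the files matching `../data/netflix_raw/combined*.txt`, relative to the notebook's location."
   ]
  },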
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the raw rating files into (movieId, userId) pairs, keeping only\n",
    "# ratings of 4 or higher.\n",
    "#\n",
    "# A line without a comma (`key:`) starts the next movie; every other line is\n",
    "# `user_id,rate,timestamp` for the current movie. Movies are numbered\n",
    "# 0, 1, 2, ... in the order their header lines are read.\n",
    "raw_pattern = '../data/netflix_raw/combined*.txt'\n",
    "paths = sorted(glob.glob(raw_pattern))\n",
    "if not paths:\n",
    "    # Fail loudly instead of silently producing an empty data set.\n",
    "    raise FileNotFoundError('no input files match ' + raw_pattern)\n",
    "\n",
    "movie_id = -1\n",
    "userId_list = []\n",
    "movieId_list = []\n",
    "\n",
    "for path in paths:\n",
    "    with open(path) as f:\n",
    "        # Stream the file line by line; readlines() would load it all at once.\n",
    "        for line in f:\n",
    "            if not line.strip():\n",
    "                # A blank line is neither a movie header nor a rating.\n",
    "                continue\n",
    "            user_id, *fields = line.split(',')\n",
    "            if not fields:\n",
    "                # Header line: move on to the next movie.\n",
    "                movie_id += 1\n",
    "            elif float(fields[0]) >= 4:\n",
    "                movieId_list.append(movie_id)\n",
    "                userId_list.append(int(user_id))\n",
    "\n",
    "df = pd.DataFrame({'movieId': movieId_list, 'userId': userId_list})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-index users: each original user id is replaced by its rank\n",
    "# (0, 1, 2, ...) among the sorted unique ids found in df.\n",
    "user_list = np.sort(df['userId'].unique())\n",
    "user_dic = {org_id: new_id for new_id, org_id in enumerate(user_list)}\n",
    "\n",
    "# Every id in the column is a key of user_dic, so the lookup always succeeds.\n",
    "df['userId'] = df['userId'].map(user_dic)\n",
    "\n",
    "# Keep only the two id columns, ordered by movie and then by user.\n",
    "df = df[['movieId', 'userId']].sort_values(['movieId', 'userId'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the pairs as text: a header line `n d nnz`, then one\n",
    "# `movieId userId` pair per line.\n",
    "d = len(user_dic)            # number of distinct users\n",
    "n = df['movieId'].nunique()  # number of distinct movies present in df\n",
    "nnz = len(df)                # number of (movie, user) pairs\n",
    "\n",
    "out_path = '../data/netflix/B.txt'\n",
    "# Nothing else in this notebook creates the output directory.\n",
    "os.makedirs(os.path.dirname(out_path), exist_ok=True)\n",
    "\n",
    "with open(out_path, 'w') as f:\n",
    "    f.write(f'{n} {d} {nnz}\\n')\n",
    "    # Select the columns explicitly so the output does not depend on df's\n",
    "    # column order, and skip the index, which is not written.\n",
    "    pairs = df[['movieId', 'userId']].itertuples(index=False, name=None)\n",
    "    f.writelines(f'{m} {u}\\n' for m, u in pairs)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
  },
  "kernelspec": {
   "display_name": "Python 3.9.12 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
