{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2022-05-26 21:41:39--  https://files.grouplens.org/datasets/movielens/ml-25m.zip\n",
      "files.grouplens.org (files.grouplens.org) をDNSに問いあわせています... 128.101.65.152\n",
      "files.grouplens.org (files.grouplens.org)|128.101.65.152|:443 に接続しています... 接続しました。\n",
      "HTTP による接続要求を送信しました、応答を待っています... 200 OK\n",
      "長さ: 261978986 (250M) [application/zip]\n",
      "`../data/ml-25m.zip' に保存中\n",
      "\n",
      "ml-25m.zip          100%[===================>] 249.84M  12.8MB/s 時間 24s        \n",
      "\n",
      "2022-05-26 21:42:04 (10.4 MB/s) - `../data/ml-25m.zip' へ保存完了 [261978986/261978986]\n",
      "\n",
      "Archive:  ../data/ml-25m.zip\n",
      "   creating: ../data/ml-25m/\n",
      "  inflating: ../data/ml-25m/tags.csv  \n",
      "  inflating: ../data/ml-25m/links.csv  \n",
      "  inflating: ../data/ml-25m/README.txt  \n",
      "  inflating: ../data/ml-25m/ratings.csv  \n",
      "  inflating: ../data/ml-25m/genome-tags.csv  \n",
      "  inflating: ../data/ml-25m/genome-scores.csv  \n",
      "  inflating: ../data/ml-25m/movies.csv  \n"
     ]
    }
   ],
   "source": [
    "# Fetch and unpack the MovieLens 25M archive into ../data.\n",
    "# -nc (no-clobber): skip the ~250 MB download when ../data/ml-25m.zip already exists,\n",
    "# instead of saving a second copy as ml-25m.zip.1.\n",
    "! wget -nc -P ../data https://files.grouplens.org/datasets/movielens/ml-25m.zip\n",
    "# -n: never overwrite files that are already extracted, so re-running this cell\n",
    "# does not stop at unzip's interactive replace prompt.\n",
    "! unzip -n ../data/ml-25m.zip -d ../data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the ratings without the 'timestamp' column and keep only the rows\n",
    "# rated 4 or higher; the kept rows carry rating == 1.\n",
    "df = pd.read_csv('../data/ml-25m/ratings.csv')\n",
    "df = df.drop(columns='timestamp')\n",
    "df['rating'] = (df['rating'] >= 4).astype('int64')\n",
    "df = df.loc[df['rating'].gt(0)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-number movie ids densely: the k-th smallest original id becomes k (0-based).\n",
    "movie_list = np.sort(df['movieId'].unique())\n",
    "movie_dic = {movie_id: new_id for new_id, movie_id in enumerate(movie_list)}\n",
    "\n",
    "# Same dense re-numbering for user ids.\n",
    "user_list = np.sort(df['userId'].unique())\n",
    "user_dic = {user_id: new_id for new_id, user_id in enumerate(user_list)}\n",
    "\n",
    "# Swap in the new ids, keep only the two id columns, and order rows by movie, then user.\n",
    "# Every id in df is a key of its dictionary (both were built from df above), so the\n",
    "# dict-based map never produces a missing value.\n",
    "df = (\n",
    "    df.assign(\n",
    "        movieId=df['movieId'].map(movie_dic),\n",
    "        userId=df['userId'].map(user_dic),\n",
    "    )[['movieId','userId']]\n",
    "    .sort_values(['movieId','userId'])\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the kept (movie, user) pairs as a space-separated text file (not CSV).\n",
    "# Header line: '<n> <d> <nnz>'; then one '<movieId> <userId>' pair per line.\n",
    "d = len(user_dic)   # number of entries in the user id dictionary\n",
    "n = len(movie_dic)  # number of entries in the movie id dictionary\n",
    "nnz = len(df)       # number of rows (pairs) written below\n",
    "\n",
    "out_path = Path('../data/movie_lens/B.txt')\n",
    "# No cell in this notebook creates ../data/movie_lens, so create it here;\n",
    "# if the folder is missing, open() raises FileNotFoundError.\n",
    "out_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "with open(out_path, 'w') as f:\n",
    "    f.write(f'{n} {d} {nnz}\\n')\n",
    "    # index=False drops the DataFrame index, so each tuple is (movieId, userId).\n",
    "    f.writelines(f'{m} {u}\\n' for m, u in df.itertuples(index=False, name=None))"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "06e75bb08dfe89f79c2990a5579740cd6d2ed8904ec17e16043d8d528483c445"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 ('venv': venv)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
