{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-01T11:16:29.065599Z",
     "start_time": "2025-08-01T11:16:29.056456Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import datetime\n",
    "from tqdm import tqdm\n",
    "np.random.seed(42)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create loss tensor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-01T11:19:16.148734Z",
     "start_time": "2025-08-01T11:16:29.071855Z"
    }
   },
   "outputs": [],
   "source": [
    "# Load the MovieLens tables; the left merge attaches each rated movie's\n",
    "# title and genres to its rating row.\n",
    "movie_pd = pd.read_csv('movies.csv')\n",
    "rating_pd = pd.read_csv('ratings.csv')\n",
    "rating_pd = pd.merge(rating_pd, movie_pd, on=\"movieId\", how=\"left\", sort=False)\n",
    "\n",
    "genres = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',\n",
    "      'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical',\n",
    "      'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', '(no genres listed)'\n",
    "      ]\n",
    "genre_index = {name: pos for pos, name in enumerate(genres)}\n",
    "\n",
    "max_num_user = 3969  # stop selecting once this many users have been kept\n",
    "keep_prob = 0.3      # each eligible user is kept with this probability\n",
    "\n",
    "users = 0  # number of users kept so far\n",
    "T = 0      # largest per-genre rating count among the kept users\n",
    "length = len(rating_pd.index)\n",
    "\n",
    "# Pull the columns out once instead of rebuilding the arrays on every row.\n",
    "user_ids = rating_pd['userId'].to_numpy()\n",
    "# Genres come from the merged column, i.e. from the movie that was actually\n",
    "# rated. movieId is an identifier, not a row position in movies.csv, so it\n",
    "# must not be used to index movie_pd positionally.\n",
    "row_genres = rating_pd['genres'].to_numpy()\n",
    "\n",
    "# Every run of consecutive rows with the same userId is treated as one user\n",
    "# (ratings.csv is expected to be grouped by user). Working on runs also means\n",
    "# the first row of each user is counted and the last user is evaluated.\n",
    "run_starts = np.flatnonzero(user_ids[1:] != user_ids[:-1]) + 1\n",
    "starts = np.concatenate(([0], run_starts))\n",
    "stops = np.concatenate((run_starts, [length]))\n",
    "\n",
    "# Boolean row mask; the merge result has a fresh 0..n-1 index, so positions\n",
    "# and index labels coincide.\n",
    "keep_rows = np.zeros(length, dtype=bool)\n",
    "\n",
    "for start, stop in tqdm(zip(starts, stops), total=len(starts)):\n",
    "    if users >= max_num_user:\n",
    "        break  # quota reached: rows of all remaining users stay unselected\n",
    "    genre_counts = np.zeros(len(genres))\n",
    "    for entry in row_genres[start:stop]:\n",
    "        if not isinstance(entry, str):\n",
    "            continue  # rated movie absent from movies.csv: no genre info\n",
    "        for g in entry.split('|'):\n",
    "            genre_counts[genre_index[g]] += 1\n",
    "    # A user is eligible only with at least one rating in every genre; the\n",
    "    # random draw is made for eligible users only.\n",
    "    if genre_counts.all() and np.random.binomial(1, keep_prob):\n",
    "        T = max(genre_counts.max(), T)\n",
    "        users += 1\n",
    "        keep_rows[start:stop] = True\n",
    "\n",
    "print(users)\n",
    "print(T)\n",
    "rating_pd = rating_pd[keep_rows]\n",
    "rating_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-01T11:19:17.985305Z",
     "start_time": "2025-08-01T11:19:16.388212Z"
    }
   },
   "outputs": [],
   "source": [
    "# Convert the epoch-seconds timestamps to 'YYYY-MM-DD' strings in UTC.\n",
    "# datetime.fromtimestamp() converts to the machine's local time zone, which\n",
    "# makes the day a rating falls on depend on where the notebook is run.\n",
    "rating_pd['date'] = (\n",
    "    pd.to_datetime(rating_pd['timestamp'], unit='s')\n",
    "    .dt.strftime('%Y-%m-%d')\n",
    ")\n",
    "rating_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-01T11:19:20.703801Z",
     "start_time": "2025-08-01T11:19:18.048963Z"
    }
   },
   "outputs": [],
   "source": [
    "# Split the '|'-separated genre string into one row per (rating, genre).\n",
    "# The split is applied to a derived frame so rating_pd is left untouched and\n",
    "# this cell can be re-run without failing on already-split lists.\n",
    "all_pd = (\n",
    "    rating_pd\n",
    "    .assign(genres=rating_pd['genres'].str.split('|'))\n",
    "    .explode('genres', ignore_index=True)\n",
    "    .sort_values(['date', 'userId', 'genres'])\n",
    ")\n",
    "all_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-01T11:19:21.683915Z",
     "start_time": "2025-08-01T11:19:20.782279Z"
    }
   },
   "outputs": [],
   "source": [
    "# Genre with the highest mean rating over all exploded rating rows.\n",
    "mean_rating_by_genre = all_pd.groupby('genres')['rating'].mean()\n",
    "best_genre = mean_rating_by_genre.idxmax()\n",
    "best_genre"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-01T11:19:21.760964Z",
     "start_time": "2025-08-01T11:19:21.716894Z"
    }
   },
   "outputs": [],
   "source": [
    "# Map ratings to losses: the highest rating gets the smallest loss\n",
    "# (min / max) and the lowest rating gets a loss of exactly 1.\n",
    "rating_max = all_pd['rating'].max()\n",
    "rating_min = all_pd['rating'].min()\n",
    "loss = (rating_max + rating_min - all_pd['rating']) / rating_max\n",
    "all_pd['loss'] = loss\n",
    "all_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-01T11:19:23.373076Z",
     "start_time": "2025-08-01T11:19:21.840784Z"
    }
   },
   "outputs": [],
   "source": [
    "# Average loss for every (date, userId, genres) combination.\n",
    "group_keys = ['date', 'userId', 'genres']\n",
    "rec_loss = all_pd.groupby(group_keys)[['loss']].mean()\n",
    "rec_loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-01T11:19:23.592026Z",
     "start_time": "2025-08-01T11:19:23.437699Z"
    }
   },
   "outputs": [],
   "source": [
    "# Sanity checks on the per-genre mean loss (computed once, checked thrice).\n",
    "genre_mean_loss = rec_loss.groupby('genres').mean()['loss']\n",
    "assert genre_mean_loss.idxmin() == 'Film-Noir'\n",
    "assert genre_mean_loss.max() <= 1\n",
    "assert genre_mean_loss.min() >= 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-01T11:19:23.639421Z",
     "start_time": "2025-08-01T11:19:23.624563Z"
    }
   },
   "outputs": [],
   "source": [
    "# Persist the rec_loss table next to the notebook.\n",
    "output_path = './MovieLens_loss.pkl'\n",
    "rec_loss.to_pickle(output_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
