{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Context Aware Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For data manipulation and analysis\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# For text preprocessing\n",
    "import re\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import datetime\n",
    "import string\n",
    "\n",
    "# For multilabel classification\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.multiclass import OneVsRestClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "# For neural networks\n",
    "\n",
    "\n",
    "\n",
    "# For model evaluation\n",
    "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Context Vectors (For CB Model)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. Explicit context: extract context features from the MovieLens dataframes or from the IMDb links.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Movies: \n",
    "Movie Ids\n",
    "---------\n",
    "\n",
    "Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL <https://movielens.org/movies/1>). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files).\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Movies Data File Structure (movies.csv)\n",
    "---------------------------------------\n",
    "\n",
    "Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format:\n",
    "\n",
    "    movieId,title,genres\n",
    "\n",
    "Movie titles are entered manually or imported from <https://www.themoviedb.org/>, and include the year of release in parentheses. Errors and inconsistencies may exist in these titles.\n",
    "\n",
    "Genres are a pipe-separated list, and are selected from the following:\n",
    "\n",
    "* Action\n",
    "* Adventure\n",
    "* Animation\n",
    "* Children's (stored as `Children` in this dataset's `genres` column)\n",
    "* Comedy\n",
    "* Crime\n",
    "* Documentary\n",
    "* Drama\n",
    "* Fantasy\n",
    "* Film-Noir\n",
    "* Horror\n",
    "* Musical\n",
    "* Mystery\n",
    "* Romance\n",
    "* Sci-Fi\n",
    "* Thriller\n",
    "* War\n",
    "* Western\n",
    "* (no genres listed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the filtered movies table from the local ml-20m dataset folder\n",
    "movies_csv_path = \"../dataset/ml-20m/filtered_movies.csv\"\n",
    "movies = pd.read_csv(movies_csv_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Jumanji (1995)</td>\n",
       "      <td>Adventure|Children|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4</td>\n",
       "      <td>Waiting to Exhale (1995)</td>\n",
       "      <td>Comedy|Drama|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5</td>\n",
       "      <td>Father of the Bride Part II (1995)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6</td>\n",
       "      <td>Heat (1995)</td>\n",
       "      <td>Action|Crime|Thriller</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6232</th>\n",
       "      <td>130856</td>\n",
       "      <td>Severe Clear (2010)</td>\n",
       "      <td>Comedy|Documentary</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6233</th>\n",
       "      <td>130958</td>\n",
       "      <td>Killer Crocodile (1989)</td>\n",
       "      <td>Horror</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6234</th>\n",
       "      <td>130984</td>\n",
       "      <td>Santo vs. las lobas (1976)</td>\n",
       "      <td>Action|Fantasy|Horror</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6235</th>\n",
       "      <td>131011</td>\n",
       "      <td>Execution Squad (1972)</td>\n",
       "      <td>Crime|Drama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6236</th>\n",
       "      <td>131015</td>\n",
       "      <td>Hellgate (2011)</td>\n",
       "      <td>Horror|Thriller</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6237 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      movieId                               title  \\\n",
       "0           1                    Toy Story (1995)   \n",
       "1           2                      Jumanji (1995)   \n",
       "2           4            Waiting to Exhale (1995)   \n",
       "3           5  Father of the Bride Part II (1995)   \n",
       "4           6                         Heat (1995)   \n",
       "...       ...                                 ...   \n",
       "6232   130856                 Severe Clear (2010)   \n",
       "6233   130958             Killer Crocodile (1989)   \n",
       "6234   130984          Santo vs. las lobas (1976)   \n",
       "6235   131011              Execution Squad (1972)   \n",
       "6236   131015                     Hellgate (2011)   \n",
       "\n",
       "                                           genres  \n",
       "0     Adventure|Animation|Children|Comedy|Fantasy  \n",
       "1                      Adventure|Children|Fantasy  \n",
       "2                            Comedy|Drama|Romance  \n",
       "3                                          Comedy  \n",
       "4                           Action|Crime|Thriller  \n",
       "...                                           ...  \n",
       "6232                           Comedy|Documentary  \n",
       "6233                                       Horror  \n",
       "6234                        Action|Fantasy|Horror  \n",
       "6235                                  Crime|Drama  \n",
       "6236                              Horror|Thriller  \n",
       "\n",
       "[6237 rows x 3 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies.drop(columns=['Unnamed: 0'], inplace=True)\n",
    "\n",
    "movies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Jumanji (1995)</td>\n",
       "      <td>Adventure|Children|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4</td>\n",
       "      <td>Waiting to Exhale (1995)</td>\n",
       "      <td>Comedy|Drama|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5</td>\n",
       "      <td>Father of the Bride Part II (1995)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6</td>\n",
       "      <td>Heat (1995)</td>\n",
       "      <td>Action|Crime|Thriller</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>7</td>\n",
       "      <td>Sabrina (1995)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>10</td>\n",
       "      <td>GoldenEye (1995)</td>\n",
       "      <td>Action|Adventure|Thriller</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>11</td>\n",
       "      <td>American President, The (1995)</td>\n",
       "      <td>Comedy|Drama|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>12</td>\n",
       "      <td>Dracula: Dead and Loving It (1995)</td>\n",
       "      <td>Comedy|Horror</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>14</td>\n",
       "      <td>Nixon (1995)</td>\n",
       "      <td>Drama</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId                               title  \\\n",
       "0        1                    Toy Story (1995)   \n",
       "1        2                      Jumanji (1995)   \n",
       "2        4            Waiting to Exhale (1995)   \n",
       "3        5  Father of the Bride Part II (1995)   \n",
       "4        6                         Heat (1995)   \n",
       "5        7                      Sabrina (1995)   \n",
       "6       10                    GoldenEye (1995)   \n",
       "7       11      American President, The (1995)   \n",
       "8       12  Dracula: Dead and Loving It (1995)   \n",
       "9       14                        Nixon (1995)   \n",
       "\n",
       "                                        genres  \n",
       "0  Adventure|Animation|Children|Comedy|Fantasy  \n",
       "1                   Adventure|Children|Fantasy  \n",
       "2                         Comedy|Drama|Romance  \n",
       "3                                       Comedy  \n",
       "4                        Action|Crime|Thriller  \n",
       "5                               Comedy|Romance  \n",
       "6                    Action|Adventure|Thriller  \n",
       "7                         Comedy|Drama|Romance  \n",
       "8                                Comedy|Horror  \n",
       "9                                        Drama  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Jumanji (1995)</td>\n",
       "      <td>Adventure|Children|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4</td>\n",
       "      <td>Waiting to Exhale (1995)</td>\n",
       "      <td>Comedy|Drama|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5</td>\n",
       "      <td>Father of the Bride Part II (1995)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6</td>\n",
       "      <td>Heat (1995)</td>\n",
       "      <td>Action|Crime|Thriller</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6232</th>\n",
       "      <td>130856</td>\n",
       "      <td>Severe Clear (2010)</td>\n",
       "      <td>Comedy|Documentary</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6233</th>\n",
       "      <td>130958</td>\n",
       "      <td>Killer Crocodile (1989)</td>\n",
       "      <td>Horror</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6234</th>\n",
       "      <td>130984</td>\n",
       "      <td>Santo vs. las lobas (1976)</td>\n",
       "      <td>Action|Fantasy|Horror</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6235</th>\n",
       "      <td>131011</td>\n",
       "      <td>Execution Squad (1972)</td>\n",
       "      <td>Crime|Drama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6236</th>\n",
       "      <td>131015</td>\n",
       "      <td>Hellgate (2011)</td>\n",
       "      <td>Horror|Thriller</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6237 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      movieId                               title  \\\n",
       "0           1                    Toy Story (1995)   \n",
       "1           2                      Jumanji (1995)   \n",
       "2           4            Waiting to Exhale (1995)   \n",
       "3           5  Father of the Bride Part II (1995)   \n",
       "4           6                         Heat (1995)   \n",
       "...       ...                                 ...   \n",
       "6232   130856                 Severe Clear (2010)   \n",
       "6233   130958             Killer Crocodile (1989)   \n",
       "6234   130984          Santo vs. las lobas (1976)   \n",
       "6235   131011              Execution Squad (1972)   \n",
       "6236   131015                     Hellgate (2011)   \n",
       "\n",
       "                                           genres  \n",
       "0     Adventure|Animation|Children|Comedy|Fantasy  \n",
       "1                      Adventure|Children|Fantasy  \n",
       "2                            Comedy|Drama|Romance  \n",
       "3                                          Comedy  \n",
       "4                           Action|Crime|Thriller  \n",
       "...                                           ...  \n",
       "6232                           Comedy|Documentary  \n",
       "6233                                       Horror  \n",
       "6234                        Action|Fantasy|Horror  \n",
       "6235                                  Crime|Drama  \n",
       "6236                              Horror|Thriller  \n",
       "\n",
       "[6237 rows x 3 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
    ],
   "source": [
    "# Show the movies table again after the column drop (pandas truncates the display)\n",
    "movies"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "Links Data File Structure (links.csv)\n",
    "---------------------------------------\n",
    "\n",
    "Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format:\n",
    "\n",
    "    movieId,imdbId,tmdbId\n",
    "\n",
    "movieId is an identifier for movies used by <https://movielens.org>. E.g., the movie Toy Story has the link <https://movielens.org/movies/1>.\n",
    "\n",
    "imdbId is an identifier for movies used by <http://www.imdb.com>. E.g., the movie Toy Story has the link <http://www.imdb.com/title/tt0114709/>.\n",
    "\n",
    "tmdbId is an identifier for movies used by <https://www.themoviedb.org>. E.g., the movie Toy Story has the link <https://www.themoviedb.org/movie/862>.\n",
    "\n",
    "Use of the resources listed above is subject to the terms of each provider."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the movieId -> imdbId / tmdbId lookup table\n",
    "links_csv_path = \"../dataset/ml-20m/links.csv\"\n",
    "links = pd.read_csv(links_csv_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>imdbId</th>\n",
       "      <th>tmdbId</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>114709</td>\n",
       "      <td>862.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>113497</td>\n",
       "      <td>8844.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>113228</td>\n",
       "      <td>15602.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>114885</td>\n",
       "      <td>31357.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>113041</td>\n",
       "      <td>11862.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>113277</td>\n",
       "      <td>949.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>114319</td>\n",
       "      <td>11860.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>112302</td>\n",
       "      <td>45325.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>114576</td>\n",
       "      <td>9091.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>113189</td>\n",
       "      <td>710.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId  imdbId   tmdbId\n",
       "0        1  114709    862.0\n",
       "1        2  113497   8844.0\n",
       "2        3  113228  15602.0\n",
       "3        4  114885  31357.0\n",
       "4        5  113041  11862.0\n",
       "5        6  113277    949.0\n",
       "6        7  114319  11860.0\n",
       "7        8  112302  45325.0\n",
       "8        9  114576   9091.0\n",
       "9       10  113189    710.0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "links.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the external ids to each filtered movie; the inner join keeps only\n",
    "# movieIds present in both tables\n",
    "df_movies = links.merge(movies, on='movieId', how='inner')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>imdbId</th>\n",
       "      <th>tmdbId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>114709</td>\n",
       "      <td>862.0</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>113497</td>\n",
       "      <td>8844.0</td>\n",
       "      <td>Jumanji (1995)</td>\n",
       "      <td>Adventure|Children|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4</td>\n",
       "      <td>114885</td>\n",
       "      <td>31357.0</td>\n",
       "      <td>Waiting to Exhale (1995)</td>\n",
       "      <td>Comedy|Drama|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5</td>\n",
       "      <td>113041</td>\n",
       "      <td>11862.0</td>\n",
       "      <td>Father of the Bride Part II (1995)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6</td>\n",
       "      <td>113277</td>\n",
       "      <td>949.0</td>\n",
       "      <td>Heat (1995)</td>\n",
       "      <td>Action|Crime|Thriller</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6232</th>\n",
       "      <td>130856</td>\n",
       "      <td>494826</td>\n",
       "      <td>48376.0</td>\n",
       "      <td>Severe Clear (2010)</td>\n",
       "      <td>Comedy|Documentary</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6233</th>\n",
       "      <td>130958</td>\n",
       "      <td>143338</td>\n",
       "      <td>78402.0</td>\n",
       "      <td>Killer Crocodile (1989)</td>\n",
       "      <td>Horror</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6234</th>\n",
       "      <td>130984</td>\n",
       "      <td>208423</td>\n",
       "      <td>317168.0</td>\n",
       "      <td>Santo vs. las lobas (1976)</td>\n",
       "      <td>Action|Fantasy|Horror</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6235</th>\n",
       "      <td>131011</td>\n",
       "      <td>69109</td>\n",
       "      <td>79572.0</td>\n",
       "      <td>Execution Squad (1972)</td>\n",
       "      <td>Crime|Drama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6236</th>\n",
       "      <td>131015</td>\n",
       "      <td>1430116</td>\n",
       "      <td>143928.0</td>\n",
       "      <td>Hellgate (2011)</td>\n",
       "      <td>Horror|Thriller</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6237 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      movieId   imdbId    tmdbId                               title  \\\n",
       "0           1   114709     862.0                    Toy Story (1995)   \n",
       "1           2   113497    8844.0                      Jumanji (1995)   \n",
       "2           4   114885   31357.0            Waiting to Exhale (1995)   \n",
       "3           5   113041   11862.0  Father of the Bride Part II (1995)   \n",
       "4           6   113277     949.0                         Heat (1995)   \n",
       "...       ...      ...       ...                                 ...   \n",
       "6232   130856   494826   48376.0                 Severe Clear (2010)   \n",
       "6233   130958   143338   78402.0             Killer Crocodile (1989)   \n",
       "6234   130984   208423  317168.0          Santo vs. las lobas (1976)   \n",
       "6235   131011    69109   79572.0              Execution Squad (1972)   \n",
       "6236   131015  1430116  143928.0                     Hellgate (2011)   \n",
       "\n",
       "                                           genres  \n",
       "0     Adventure|Animation|Children|Comedy|Fantasy  \n",
       "1                      Adventure|Children|Fantasy  \n",
       "2                            Comedy|Drama|Romance  \n",
       "3                                          Comedy  \n",
       "4                           Action|Crime|Thriller  \n",
       "...                                           ...  \n",
       "6232                           Comedy|Documentary  \n",
       "6233                                       Horror  \n",
       "6234                        Action|Fantasy|Horror  \n",
       "6235                                  Crime|Drama  \n",
       "6236                              Horror|Thriller  \n",
       "\n",
       "[6237 rows x 5 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
    ],
   "source": [
    "# Inspect the merged links + movies table\n",
    "df_movies"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Converting to correct data types\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "movieId     int64\n",
      "imdbId     object\n",
      "tmdbId     object\n",
      "title      object\n",
      "genres     object\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "dtype_dict = {col: 'str' for col in df_movies.columns}\n",
    "dtype_dict['movieId'] = 'int'\n",
    "df_movies = df_movies.astype(dtype_dict)\n",
    "\n",
    "# Verify the conversion\n",
    "print(df_movies.dtypes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Accessing the links \n",
    "\n",
    "- Using web scraping to access the storyline/synopsis from each link\n",
    "- Stored in separate columns - \"imdb_doc\", \"tmdb_doc\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-time setup: uncomment to install the package that provides the `imdb` module\n",
    "# %pip install IMDbPY"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A little boy named Andy loves to be in his room, playing with his toys, especially his doll named \"Woody\". But, what do the toys do when Andy is not with them, they come to life. Woody believes that his life (as a toy) is good. However, he must worry about Andy's family moving, and what Woody does not know is about Andy's birthday party. Woody does not realize that Andy's mother gave him an action figure known as Buzz Lightyear, who does not believe that he is a toy, and quickly becomes Andy's new favorite toy. Woody, who is now consumed with jealousy, tries to get rid of Buzz. Then, both Woody and Buzz are now lost. They must find a way to get back to Andy before he moves without them, but they will have to pass through a ruthless toy killer, Sid Phillips.\n"
     ]
    }
   ],
   "source": [
    "from imdb import IMDb\n",
    "\n",
    "ia = IMDb()\n",
    "movie = ia.get_movie('0114709')  # Use IMDb ID without \"tt\"\n",
    "print(movie.get('plot outline'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>imdbId</th>\n",
       "      <th>tmdbId</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>114709</td>\n",
       "      <td>862.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>113497</td>\n",
       "      <td>8844.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>113228</td>\n",
       "      <td>15602.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>114885</td>\n",
       "      <td>31357.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>113041</td>\n",
       "      <td>11862.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27273</th>\n",
       "      <td>131254</td>\n",
       "      <td>466713</td>\n",
       "      <td>4436.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27274</th>\n",
       "      <td>131256</td>\n",
       "      <td>277703</td>\n",
       "      <td>9274.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27275</th>\n",
       "      <td>131258</td>\n",
       "      <td>3485166</td>\n",
       "      <td>285213.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27276</th>\n",
       "      <td>131260</td>\n",
       "      <td>249110</td>\n",
       "      <td>32099.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27277</th>\n",
       "      <td>131262</td>\n",
       "      <td>1724965</td>\n",
       "      <td>286971.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>27278 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       movieId   imdbId    tmdbId\n",
       "0            1   114709     862.0\n",
       "1            2   113497    8844.0\n",
       "2            3   113228   15602.0\n",
       "3            4   114885   31357.0\n",
       "4            5   113041   11862.0\n",
       "...        ...      ...       ...\n",
       "27273   131254   466713    4436.0\n",
       "27274   131256   277703    9274.0\n",
       "27275   131258  3485166  285213.0\n",
       "27276   131260   249110   32099.0\n",
       "27277   131262  1724965  286971.0\n",
       "\n",
       "[27278 rows x 3 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the links table (columns: movieId, imdbId, tmdbId)\n",
    "links"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fetching plot outlines from IMDb takes a long time, so for now we only use TMDb; the IMDb synopses can be added later if necessary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import pandas as pd\n",
    "# from imdb import IMDb\n",
    "\n",
    "# # Assuming df_movies is your DataFrame and it has a column named 'imdbId'\n",
    "# # Initialize IMDb object\n",
    "# ia = IMDb()\n",
    "\n",
    "# # Create a new column 'imdb_syn' initialized with None only if it does not exist\n",
    "# if 'imdb_syn' not in df_movies.columns:\n",
    "#     df_movies['imdb_syn'] = None\n",
    "\n",
    "# #\n",
    "# # Loop through the first 100 IMDb IDs in the DataFrame using integer-based indexing\n",
    "# for index in range(0, 100):\n",
    "#     try:\n",
    "#         imdb_id = str(df_movies.loc[index, 'imdbId']).zfill(7)  # Ensure the IMDb ID has leading zeros up to 7 digits\n",
    "#         movie = ia.get_movie(imdb_id)  # Use IMDb ID without \"tt\"\n",
    "#         plot_outline = movie.get('plot outline')\n",
    "        \n",
    "#         # Assign the plot outline to the corresponding entry in 'imdb_syn' column\n",
    "#         df_movies.loc[index, 'imdb_syn'] = plot_outline\n",
    "#     except Exception as e:\n",
    "#         print(f\"An error occurred for index {index}, IMDb ID {imdb_id}: {e}\")\n",
    "\n",
    "# # Now df_movies['imdb_syn'] will contain the plot outlines for the first 100 IMDb IDs.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "movieId                                                     4\n",
       "imdbId                                                 114885\n",
       "tmdbId                                                31357.0\n",
       "title                                Waiting to Exhale (1995)\n",
       "genres                                   Comedy|Drama|Romance\n",
       "imdb_syn    This story based on the best selling novel by ...\n",
       "Name: 2, dtype: object"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# df_movies.loc[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_movies.to_csv(\"../dataset/context_imdb.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For TMDb, we use the Python package `tmdbv3api`:\n",
    "https://github.com/AnthonyBloomer/tmdbv3api"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install tmdbv3api"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
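    "TMDb API key: keep it out of the notebook and load it from the `TMDB_API_KEY` environment variable.\n",
    "\n",
    "**Note:** the keys previously pasted in this cell should be treated as compromised and rotated."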
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import time\n",
    "\n",
    "# import pandas as pd\n",
    "# from concurrent.futures import ThreadPoolExecutor\n",
    "# from tmdbv3api import TMDb, Movie\n",
    "\n",
    "# # Initialize TMDb and Movie objects\n",
    "# tmdb = TMDb()\n",
    "# movie = Movie()\n",
    "\n",
    "# # Your TMDb API key: read it from the TMDB_API_KEY environment variable, never hardcode it\n",
    "# import os\n",
    "# tmdb.api_key = os.environ['TMDB_API_KEY']\n",
    "\n",
    "# # Function to fetch movie overview\n",
    "# def fetch_overview(tmdb_id):\n",
    "#     try:\n",
    "#         if tmdb_id:\n",
    "#             details = movie.details(tmdb_id)\n",
    "#             if details:\n",
    "#                 return details.overview\n",
    "#             else:\n",
    "#                 print(f\"Resource with ID {tmdb_id} could not be found.\")\n",
    "#                 return 'N/A'\n",
    "#         else:\n",
    "#             return 'N/A'\n",
    "#     except Exception as e:\n",
    "#         print(f\"An error occurred: {e}\")\n",
    "#         return 'N/A'\n",
    "#     finally:\n",
    "#         # Sleep briefly to avoid overwhelming the API and hitting rate limits\n",
    "#         time.sleep(0.2)  \n",
    "\n",
    "# # Function to handle each batch\n",
    "# def process_batch(batch):\n",
    "#     max_workers = 5\n",
    "#     with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
    "#         return list(executor.map(fetch_overview, batch))\n",
    "\n",
    "# # Batch size (adjust based on your requirements)\n",
    "# batch_size = 50\n",
    "\n",
    "# # Initialize an empty list to hold the results\n",
    "# all_results = []\n",
    "\n",
    "# # Process each batch\n",
    "# for i in range(0, len(df_movies['tmdbId']), batch_size):\n",
    "#     batch = df_movies['tmdbId'][i:i + batch_size]\n",
    "#     batch_results = process_batch(batch)\n",
    "#     all_results.extend(batch_results)\n",
    "\n",
    "# # Add the results back into the DataFrame\n",
    "# df_movies['tmdb_syn'] = all_results\n",
    "\n",
    "# # Display the updated DataFrame\n",
    "# print(df_movies)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>imdbId</th>\n",
       "      <th>tmdbId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>imdb_syn</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>74</td>\n",
       "      <td>115644</td>\n",
       "      <td>20927.0</td>\n",
       "      <td>Bed of Roses (1996)</td>\n",
       "      <td>Drama|Romance</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>86</th>\n",
       "      <td>166</td>\n",
       "      <td>112887</td>\n",
       "      <td>13552.0</td>\n",
       "      <td>Doom Generation, The (1995)</td>\n",
       "      <td>Comedy|Crime|Drama</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94</th>\n",
       "      <td>176</td>\n",
       "      <td>113677</td>\n",
       "      <td>9071.0</td>\n",
       "      <td>Living in Oblivion (1995)</td>\n",
       "      <td>Comedy</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>186</td>\n",
       "      <td>113986</td>\n",
       "      <td>11472.0</td>\n",
       "      <td>Nine Months (1995)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>187</td>\n",
       "      <td>114095</td>\n",
       "      <td>36196.0</td>\n",
       "      <td>Party Girl (1995)</td>\n",
       "      <td>Comedy</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6232</th>\n",
       "      <td>130856</td>\n",
       "      <td>494826</td>\n",
       "      <td>48376.0</td>\n",
       "      <td>Severe Clear (2010)</td>\n",
       "      <td>Comedy|Documentary</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6233</th>\n",
       "      <td>130958</td>\n",
       "      <td>143338</td>\n",
       "      <td>78402.0</td>\n",
       "      <td>Killer Crocodile (1989)</td>\n",
       "      <td>Horror</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6234</th>\n",
       "      <td>130984</td>\n",
       "      <td>208423</td>\n",
       "      <td>317168.0</td>\n",
       "      <td>Santo vs. las lobas (1976)</td>\n",
       "      <td>Action|Fantasy|Horror</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6235</th>\n",
       "      <td>131011</td>\n",
       "      <td>69109</td>\n",
       "      <td>79572.0</td>\n",
       "      <td>Execution Squad (1972)</td>\n",
       "      <td>Crime|Drama</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6236</th>\n",
       "      <td>131015</td>\n",
       "      <td>1430116</td>\n",
       "      <td>143928.0</td>\n",
       "      <td>Hellgate (2011)</td>\n",
       "      <td>Horror|Thriller</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6140 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      movieId   imdbId    tmdbId                        title  \\\n",
       "46         74   115644   20927.0          Bed of Roses (1996)   \n",
       "86        166   112887   13552.0  Doom Generation, The (1995)   \n",
       "94        176   113677    9071.0    Living in Oblivion (1995)   \n",
       "100       186   113986   11472.0           Nine Months (1995)   \n",
       "101       187   114095   36196.0            Party Girl (1995)   \n",
       "...       ...      ...       ...                          ...   \n",
       "6232   130856   494826   48376.0          Severe Clear (2010)   \n",
       "6233   130958   143338   78402.0      Killer Crocodile (1989)   \n",
       "6234   130984   208423  317168.0   Santo vs. las lobas (1976)   \n",
       "6235   131011    69109   79572.0       Execution Squad (1972)   \n",
       "6236   131015  1430116  143928.0              Hellgate (2011)   \n",
       "\n",
       "                     genres imdb_syn  \n",
       "46            Drama|Romance     None  \n",
       "86       Comedy|Crime|Drama     None  \n",
       "94                   Comedy     None  \n",
       "100          Comedy|Romance     None  \n",
       "101                  Comedy     None  \n",
       "...                     ...      ...  \n",
       "6232     Comedy|Documentary     None  \n",
       "6233                 Horror     None  \n",
       "6234  Action|Fantasy|Horror     None  \n",
       "6235            Crime|Drama     None  \n",
       "6236        Horror|Thriller     None  \n",
       "\n",
       "[6140 rows x 6 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Subset of movies that have no IMDb synopsis (imdb_syn is null)\n",
    "df_movies_null = df_movies[df_movies['imdb_syn'].isna()]\n",
    "# Display the subset so the cell reproduces its saved output on a fresh run\n",
    "df_movies_null"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "An error occurred: The resource you requested could not be found.\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/23706.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by SSLError(OSError(24, 'Too many open files')))Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/39488.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by SSLError(OSError(24, 'Too many open files')))\n",
      "\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/23966.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bff1a90>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/16523.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bff3810>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/38702.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bff0310>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/19837.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bff0950>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/12572.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bff3c50>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/19918.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf63c90>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/31189.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf72090>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/8926.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf83e90>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/16398.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf800d0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/2973.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf82e90>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/22947.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf66610>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/44338.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf65910>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/22804.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf4b410>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/24418.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf4bdd0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/22821.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf4b650>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/24804.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf4bb90>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/29150.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf4a850>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/22820.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf4aad0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/10315.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf4acd0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/36278.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf49310>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/17979.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf4a590>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/105077.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf66250>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "An error occurred: The resource you requested could not be found.\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/24480.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf66e10>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/14161.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bfdc910>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/22825.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf82210>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/8088.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf82590>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/22824.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf2ef10>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/25793.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38d688b90>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/11699.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bfded10>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/18239.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38d68b390>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/28089.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38d6882d0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/14859.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38d6882d0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/22832.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38d68b390>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/33273.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38d68b390>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/7445.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf69290>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/53516.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bfa3f90>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/22881.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bfa3810>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/22640.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bfa1bd0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n",
      "Network error occurred: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/22949.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38a846c90>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))\n"
     ]
    },
    {
     "ename": "ZMQError",
     "evalue": "Too many open files",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mgaierror\u001b[0m                                  Traceback (most recent call last)",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/urllib3/connection.py:174\u001b[0m, in \u001b[0;36mHTTPConnection._new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    173\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 174\u001b[0m     conn \u001b[38;5;241m=\u001b[39m connection\u001b[38;5;241m.\u001b[39mcreate_connection(\n\u001b[1;32m    175\u001b[0m         (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dns_host, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mport), \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtimeout, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mextra_kw\n\u001b[1;32m    176\u001b[0m     )\n\u001b[1;32m    178\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m SocketTimeout:\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/urllib3/util/connection.py:72\u001b[0m, in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m     68\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m six\u001b[38;5;241m.\u001b[39mraise_from(\n\u001b[1;32m     69\u001b[0m         LocationParseError(\u001b[38;5;124mu\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, label empty or too long\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m host), \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m     70\u001b[0m     )\n\u001b[0;32m---> 72\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m res \u001b[38;5;129;01min\u001b[39;00m socket\u001b[38;5;241m.\u001b[39mgetaddrinfo(host, port, family, socket\u001b[38;5;241m.\u001b[39mSOCK_STREAM):\n\u001b[1;32m     73\u001b[0m     af, socktype, proto, canonname, sa \u001b[38;5;241m=\u001b[39m res\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/socket.py:962\u001b[0m, in \u001b[0;36mgetaddrinfo\u001b[0;34m(host, port, family, type, proto, flags)\u001b[0m\n\u001b[1;32m    961\u001b[0m addrlist \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m--> 962\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m res \u001b[38;5;129;01min\u001b[39;00m _socket\u001b[38;5;241m.\u001b[39mgetaddrinfo(host, port, family, \u001b[38;5;28mtype\u001b[39m, proto, flags):\n\u001b[1;32m    963\u001b[0m     af, socktype, proto, canonname, sa \u001b[38;5;241m=\u001b[39m res\n",
      "\u001b[0;31mgaierror\u001b[0m: [Errno 8] nodename nor servname provided, or not known",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mNewConnectionError\u001b[0m                        Traceback (most recent call last)",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:714\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m    713\u001b[0m \u001b[38;5;66;03m# Make the request on the httplib connection object.\u001b[39;00m\n\u001b[0;32m--> 714\u001b[0m httplib_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_request(\n\u001b[1;32m    715\u001b[0m     conn,\n\u001b[1;32m    716\u001b[0m     method,\n\u001b[1;32m    717\u001b[0m     url,\n\u001b[1;32m    718\u001b[0m     timeout\u001b[38;5;241m=\u001b[39mtimeout_obj,\n\u001b[1;32m    719\u001b[0m     body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m    720\u001b[0m     headers\u001b[38;5;241m=\u001b[39mheaders,\n\u001b[1;32m    721\u001b[0m     chunked\u001b[38;5;241m=\u001b[39mchunked,\n\u001b[1;32m    722\u001b[0m )\n\u001b[1;32m    724\u001b[0m \u001b[38;5;66;03m# If we're going to release the connection in ``finally:``, then\u001b[39;00m\n\u001b[1;32m    725\u001b[0m \u001b[38;5;66;03m# the response doesn't need to know about the connection. Otherwise\u001b[39;00m\n\u001b[1;32m    726\u001b[0m \u001b[38;5;66;03m# it will also try to release it and we'll have a double-release\u001b[39;00m\n\u001b[1;32m    727\u001b[0m \u001b[38;5;66;03m# mess.\u001b[39;00m\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:403\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m    402\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 403\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_conn(conn)\n\u001b[1;32m    404\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (SocketTimeout, BaseSSLError) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m    405\u001b[0m     \u001b[38;5;66;03m# Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout.\u001b[39;00m\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:1053\u001b[0m, in \u001b[0;36mHTTPSConnectionPool._validate_conn\u001b[0;34m(self, conn)\u001b[0m\n\u001b[1;32m   1052\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(conn, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msock\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m):  \u001b[38;5;66;03m# AppEngine might not have  `.sock`\u001b[39;00m\n\u001b[0;32m-> 1053\u001b[0m     conn\u001b[38;5;241m.\u001b[39mconnect()\n\u001b[1;32m   1055\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m conn\u001b[38;5;241m.\u001b[39mis_verified:\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/urllib3/connection.py:363\u001b[0m, in \u001b[0;36mHTTPSConnection.connect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    361\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mconnect\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m    362\u001b[0m     \u001b[38;5;66;03m# Add certificate verification\u001b[39;00m\n\u001b[0;32m--> 363\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msock \u001b[38;5;241m=\u001b[39m conn \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_new_conn()\n\u001b[1;32m    364\u001b[0m     hostname \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhost\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/urllib3/connection.py:186\u001b[0m, in \u001b[0;36mHTTPConnection._new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    185\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m SocketError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m--> 186\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m NewConnectionError(\n\u001b[1;32m    187\u001b[0m         \u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed to establish a new connection: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m e\n\u001b[1;32m    188\u001b[0m     )\n\u001b[1;32m    190\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m conn\n",
      "\u001b[0;31mNewConnectionError\u001b[0m: <urllib3.connection.HTTPSConnection object at 0x38bf63c90>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mMaxRetryError\u001b[0m                             Traceback (most recent call last)",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/requests/adapters.py:486\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m    485\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 486\u001b[0m     resp \u001b[38;5;241m=\u001b[39m conn\u001b[38;5;241m.\u001b[39murlopen(\n\u001b[1;32m    487\u001b[0m         method\u001b[38;5;241m=\u001b[39mrequest\u001b[38;5;241m.\u001b[39mmethod,\n\u001b[1;32m    488\u001b[0m         url\u001b[38;5;241m=\u001b[39murl,\n\u001b[1;32m    489\u001b[0m         body\u001b[38;5;241m=\u001b[39mrequest\u001b[38;5;241m.\u001b[39mbody,\n\u001b[1;32m    490\u001b[0m         headers\u001b[38;5;241m=\u001b[39mrequest\u001b[38;5;241m.\u001b[39mheaders,\n\u001b[1;32m    491\u001b[0m         redirect\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m    492\u001b[0m         assert_same_host\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m    493\u001b[0m         preload_content\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m    494\u001b[0m         decode_content\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m    495\u001b[0m         retries\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmax_retries,\n\u001b[1;32m    496\u001b[0m         timeout\u001b[38;5;241m=\u001b[39mtimeout,\n\u001b[1;32m    497\u001b[0m         chunked\u001b[38;5;241m=\u001b[39mchunked,\n\u001b[1;32m    498\u001b[0m     )\n\u001b[1;32m    500\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (ProtocolError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:798\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m    796\u001b[0m     e \u001b[38;5;241m=\u001b[39m ProtocolError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mConnection aborted.\u001b[39m\u001b[38;5;124m\"\u001b[39m, e)\n\u001b[0;32m--> 798\u001b[0m retries \u001b[38;5;241m=\u001b[39m retries\u001b[38;5;241m.\u001b[39mincrement(\n\u001b[1;32m    799\u001b[0m     method, url, error\u001b[38;5;241m=\u001b[39me, _pool\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m, _stacktrace\u001b[38;5;241m=\u001b[39msys\u001b[38;5;241m.\u001b[39mexc_info()[\u001b[38;5;241m2\u001b[39m]\n\u001b[1;32m    800\u001b[0m )\n\u001b[1;32m    801\u001b[0m retries\u001b[38;5;241m.\u001b[39msleep()\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/urllib3/util/retry.py:592\u001b[0m, in \u001b[0;36mRetry.increment\u001b[0;34m(self, method, url, response, error, _pool, _stacktrace)\u001b[0m\n\u001b[1;32m    591\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m new_retry\u001b[38;5;241m.\u001b[39mis_exhausted():\n\u001b[0;32m--> 592\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m MaxRetryError(_pool, url, error \u001b[38;5;129;01mor\u001b[39;00m ResponseError(cause))\n\u001b[1;32m    594\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIncremented Retry for (url=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m): \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, url, new_retry)\n",
      "\u001b[0;31mMaxRetryError\u001b[0m: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/19918.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf63c90>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mConnectionError\u001b[0m                           Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[19], line 18\u001b[0m, in \u001b[0;36mfetch_overview\u001b[0;34m(tmdb_id)\u001b[0m\n\u001b[1;32m     17\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m tmdb_id:\n\u001b[0;32m---> 18\u001b[0m     details \u001b[38;5;241m=\u001b[39m movie\u001b[38;5;241m.\u001b[39mdetails(tmdb_id)\n\u001b[1;32m     19\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m details:\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/tmdbv3api/objs/movie.py:41\u001b[0m, in \u001b[0;36mMovie.details\u001b[0;34m(self, movie_id, append_to_response)\u001b[0m\n\u001b[1;32m     35\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m     36\u001b[0m \u001b[38;5;124;03mGet the primary information about a movie.\u001b[39;00m\n\u001b[1;32m     37\u001b[0m \u001b[38;5;124;03m:param movie_id: int\u001b[39;00m\n\u001b[1;32m     38\u001b[0m \u001b[38;5;124;03m:param append_to_response: str\u001b[39;00m\n\u001b[1;32m     39\u001b[0m \u001b[38;5;124;03m:return:\u001b[39;00m\n\u001b[1;32m     40\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m---> 41\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request_obj(\n\u001b[1;32m     42\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_urls[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdetails\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m%\u001b[39m movie_id,\n\u001b[1;32m     43\u001b[0m     params\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mappend_to_response=\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m append_to_response\n\u001b[1;32m     44\u001b[0m )\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/tmdbv3api/tmdb.py:150\u001b[0m, in \u001b[0;36mTMDb._request_obj\u001b[0;34m(self, action, params, call_cached, method, data, json, key)\u001b[0m\n\u001b[1;32m    149\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcache \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj_cached \u001b[38;5;129;01mand\u001b[39;00m call_cached \u001b[38;5;129;01mand\u001b[39;00m method \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPOST\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 150\u001b[0m     req \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcached_request(method, url, data, json, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mproxies)\n\u001b[1;32m    151\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/tmdbv3api/tmdb.py:132\u001b[0m, in \u001b[0;36mTMDb.cached_request\u001b[0;34m(method, url, data, json, proxies)\u001b[0m\n\u001b[1;32m    129\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[1;32m    130\u001b[0m \u001b[38;5;129m@lru_cache\u001b[39m(maxsize\u001b[38;5;241m=\u001b[39mREQUEST_CACHE_MAXSIZE)\n\u001b[1;32m    131\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcached_request\u001b[39m(method, url, data, json, proxies):\n\u001b[0;32m--> 132\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m requests\u001b[38;5;241m.\u001b[39mrequest(method, url, data\u001b[38;5;241m=\u001b[39mdata, json\u001b[38;5;241m=\u001b[39mjson, proxies\u001b[38;5;241m=\u001b[39mproxies)\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/requests/api.py:59\u001b[0m, in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m     58\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m sessions\u001b[38;5;241m.\u001b[39mSession() \u001b[38;5;28;01mas\u001b[39;00m session:\n\u001b[0;32m---> 59\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m session\u001b[38;5;241m.\u001b[39mrequest(method\u001b[38;5;241m=\u001b[39mmethod, url\u001b[38;5;241m=\u001b[39murl, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/requests/sessions.py:589\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m    588\u001b[0m send_kwargs\u001b[38;5;241m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 589\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msend(prep, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39msend_kwargs)\n\u001b[1;32m    591\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/requests/sessions.py:703\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m    702\u001b[0m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[0;32m--> 703\u001b[0m r \u001b[38;5;241m=\u001b[39m adapter\u001b[38;5;241m.\u001b[39msend(request, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m    705\u001b[0m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/requests/adapters.py:519\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m    517\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m SSLError(e, request\u001b[38;5;241m=\u001b[39mrequest)\n\u001b[0;32m--> 519\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m(e, request\u001b[38;5;241m=\u001b[39mrequest)\n\u001b[1;32m    521\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ClosedPoolError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
      "\u001b[0;31mConnectionError\u001b[0m: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/19918.0?api_key=a95e7426cf907141b0b558fef03000ab&append_to_response=videos,trailers,images,casts,translations,keywords,release_dates&language=en-US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x38bf63c90>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/ipykernel/iostream.py:131\u001b[0m, in \u001b[0;36mIOPubThread._event_pipe\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    130\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 131\u001b[0m     event_pipe \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_local\u001b[38;5;241m.\u001b[39mevent_pipe\n\u001b[1;32m    132\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m:\n\u001b[1;32m    133\u001b[0m     \u001b[38;5;66;03m# new thread, new event pipe\u001b[39;00m\n",
      "\u001b[0;31mAttributeError\u001b[0m: '_thread._local' object has no attribute 'event_pipe'",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mZMQError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[19], line 53\u001b[0m\n\u001b[1;32m     51\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m0\u001b[39m, \u001b[38;5;28mlen\u001b[39m(df_movies[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtmdbId\u001b[39m\u001b[38;5;124m'\u001b[39m]), batch_size):\n\u001b[1;32m     52\u001b[0m     batch \u001b[38;5;241m=\u001b[39m df_movies[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtmdbId\u001b[39m\u001b[38;5;124m'\u001b[39m][i:i \u001b[38;5;241m+\u001b[39m batch_size]\n\u001b[0;32m---> 53\u001b[0m     batch_results \u001b[38;5;241m=\u001b[39m process_batch(batch)\n\u001b[1;32m     54\u001b[0m     all_results\u001b[38;5;241m.\u001b[39mextend(batch_results)\n\u001b[1;32m     56\u001b[0m \u001b[38;5;66;03m# Add the results back into the DataFrame\u001b[39;00m\n",
      "Cell \u001b[0;32mIn[19], line 40\u001b[0m, in \u001b[0;36mprocess_batch\u001b[0;34m(batch)\u001b[0m\n\u001b[1;32m     38\u001b[0m max_workers \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m5\u001b[39m  \u001b[38;5;66;03m# Adjust to a smaller value to reduce concurrency\u001b[39;00m\n\u001b[1;32m     39\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ThreadPoolExecutor(max_workers\u001b[38;5;241m=\u001b[39mmax_workers) \u001b[38;5;28;01mas\u001b[39;00m executor:\n\u001b[0;32m---> 40\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(executor\u001b[38;5;241m.\u001b[39mmap(fetch_overview, batch))\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/concurrent/futures/_base.py:619\u001b[0m, in \u001b[0;36mExecutor.map.<locals>.result_iterator\u001b[0;34m()\u001b[0m\n\u001b[1;32m    616\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m fs:\n\u001b[1;32m    617\u001b[0m     \u001b[38;5;66;03m# Careful not to keep a reference to the popped future\u001b[39;00m\n\u001b[1;32m    618\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 619\u001b[0m         \u001b[38;5;28;01myield\u001b[39;00m _result_or_cancel(fs\u001b[38;5;241m.\u001b[39mpop())\n\u001b[1;32m    620\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    621\u001b[0m         \u001b[38;5;28;01myield\u001b[39;00m _result_or_cancel(fs\u001b[38;5;241m.\u001b[39mpop(), end_time \u001b[38;5;241m-\u001b[39m time\u001b[38;5;241m.\u001b[39mmonotonic())\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/concurrent/futures/_base.py:317\u001b[0m, in \u001b[0;36m_result_or_cancel\u001b[0;34m(***failed resolving arguments***)\u001b[0m\n\u001b[1;32m    315\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m    316\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 317\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m fut\u001b[38;5;241m.\u001b[39mresult(timeout)\n\u001b[1;32m    318\u001b[0m     \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m    319\u001b[0m         fut\u001b[38;5;241m.\u001b[39mcancel()\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/concurrent/futures/_base.py:449\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    447\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n\u001b[1;32m    448\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[0;32m--> 449\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__get_result()\n\u001b[1;32m    451\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_condition\u001b[38;5;241m.\u001b[39mwait(timeout)\n\u001b[1;32m    453\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/concurrent/futures/_base.py:401\u001b[0m, in \u001b[0;36mFuture.__get_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    399\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception:\n\u001b[1;32m    400\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 401\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception\n\u001b[1;32m    402\u001b[0m     \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m    403\u001b[0m         \u001b[38;5;66;03m# Break a reference cycle with the exception in self._exception\u001b[39;00m\n\u001b[1;32m    404\u001b[0m         \u001b[38;5;28mself\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/concurrent/futures/thread.py:58\u001b[0m, in \u001b[0;36m_WorkItem.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     55\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m     57\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfn(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkwargs)\n\u001b[1;32m     59\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m     60\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfuture\u001b[38;5;241m.\u001b[39mset_exception(exc)\n",
      "Cell \u001b[0;32mIn[19], line 27\u001b[0m, in \u001b[0;36mfetch_overview\u001b[0;34m(tmdb_id)\u001b[0m\n\u001b[1;32m     25\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mN/A\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m     26\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m RequestException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m---> 27\u001b[0m     \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNetwork error occurred: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     28\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mN/A\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m     29\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/ipykernel/iostream.py:648\u001b[0m, in \u001b[0;36mOutStream.write\u001b[0;34m(self, string)\u001b[0m\n\u001b[1;32m    646\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpub_thread\u001b[38;5;241m.\u001b[39mschedule(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_flush)\n\u001b[1;32m    647\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 648\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_schedule_flush()\n\u001b[1;32m    650\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(string)\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/ipykernel/iostream.py:545\u001b[0m, in \u001b[0;36mOutStream._schedule_flush\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    542\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_schedule_in_thread\u001b[39m():\n\u001b[1;32m    543\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_io_loop\u001b[38;5;241m.\u001b[39mcall_later(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mflush_interval, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_flush)\n\u001b[0;32m--> 545\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpub_thread\u001b[38;5;241m.\u001b[39mschedule(_schedule_in_thread)\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/ipykernel/iostream.py:251\u001b[0m, in \u001b[0;36mIOPubThread.schedule\u001b[0;34m(self, f)\u001b[0m\n\u001b[1;32m    249\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_events\u001b[38;5;241m.\u001b[39mappend(f)\n\u001b[1;32m    250\u001b[0m     \u001b[38;5;66;03m# wake event thread (message content is ignored)\u001b[39;00m\n\u001b[0;32m--> 251\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_event_pipe\u001b[38;5;241m.\u001b[39msend(\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    252\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    253\u001b[0m     f()\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/ipykernel/iostream.py:135\u001b[0m, in \u001b[0;36mIOPubThread._event_pipe\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    132\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m:\n\u001b[1;32m    133\u001b[0m     \u001b[38;5;66;03m# new thread, new event pipe\u001b[39;00m\n\u001b[1;32m    134\u001b[0m     ctx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msocket\u001b[38;5;241m.\u001b[39mcontext\n\u001b[0;32m--> 135\u001b[0m     event_pipe \u001b[38;5;241m=\u001b[39m ctx\u001b[38;5;241m.\u001b[39msocket(zmq\u001b[38;5;241m.\u001b[39mPUSH)\n\u001b[1;32m    136\u001b[0m     event_pipe\u001b[38;5;241m.\u001b[39mlinger \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m    137\u001b[0m     event_pipe\u001b[38;5;241m.\u001b[39mconnect(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_event_interface)\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/zmq/sugar/context.py:259\u001b[0m, in \u001b[0;36mContext.socket\u001b[0;34m(self, socket_type, **kwargs)\u001b[0m\n\u001b[1;32m    257\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclosed:\n\u001b[1;32m    258\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m ZMQError(Errno\u001b[38;5;241m.\u001b[39mENOTSUP)\n\u001b[0;32m--> 259\u001b[0m s: ST \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_socket_class(  \u001b[38;5;66;03m# set PYTHONTRACEMALLOC=2 to get the calling frame\u001b[39;00m\n\u001b[1;32m    260\u001b[0m     \u001b[38;5;28mself\u001b[39m, socket_type, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[1;32m    261\u001b[0m )\n\u001b[1;32m    262\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m opt, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msockopts\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m    263\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/zmq/sugar/socket.py:93\u001b[0m, in \u001b[0;36mSocket.__init__\u001b[0;34m(self, *a, **kw)\u001b[0m\n\u001b[1;32m     92\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSocket[bytes]\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m*\u001b[39ma, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkw):\n\u001b[0;32m---> 93\u001b[0m     \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;241m*\u001b[39ma, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkw)\n\u001b[1;32m     94\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mshadow\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;129;01min\u001b[39;00m kw:\n\u001b[1;32m     95\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_shadow \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
      "File \u001b[0;32mzmq/backend/cython/socket.pyx:330\u001b[0m, in \u001b[0;36mzmq.backend.cython.socket.Socket.__init__\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mZMQError\u001b[0m: Too many open files"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "import pandas as pd\n",
    "from concurrent.futures import ThreadPoolExecutor\n",
    "from tmdbv3api import TMDb, Movie\n",
    "\n",
    "# Initialize TMDb and Movie objects\n",
    "tmdb = TMDb()\n",
    "movie = Movie()\n",
    "\n",
    "# Your TMDb API key\n",
    "tmdb.api_key = 'a95e7426cf907141b0b558fef03000ab'\n",
    "\n",
    "# Function to fetch movie overview\n",
    "def fetch_overview(tmdb_id):\n",
    "    try:\n",
    "        if tmdb_id:\n",
    "            details = movie.details(tmdb_id)\n",
    "            if details:\n",
    "                return details.overview\n",
    "            else:\n",
    "                print(f\"Resource with ID {tmdb_id} could not be found.\")\n",
    "                return 'N/A'\n",
    "        else:\n",
    "            return 'N/A'\n",
    "    except RequestException as e:\n",
    "        print(f\"Network error occurred: {e}\")\n",
    "        return 'N/A'\n",
    "    except Exception as e:\n",
    "        print(f\"An error occurred: {e}\")\n",
    "        return 'N/A'\n",
    "    finally:\n",
    "        # Sleep briefly to avoid overwhelming the API and hitting rate limits\n",
    "        time.sleep(0.1)\n",
    "\n",
    "# Function to handle each batch\n",
    "def process_batch(batch):\n",
    "    max_workers = 5  # Adjust to a smaller value to reduce concurrency\n",
    "    with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
    "        return list(executor.map(fetch_overview, batch))\n",
    "\n",
    "    \n",
    "    \n",
    "# Batch size (adjust based on your requirements)\n",
    "batch_size = 50  # Adjust to a smaller value to reduce batch size\n",
    "\n",
    "# Initialize an empty list to hold the results\n",
    "all_results = []\n",
    "\n",
    "# Process each batch\n",
    "for i in range(0, len(df_movies_null['tmdbId']), batch_size):\n",
    "    batch = df_movies_null['tmdbId'][i:i + batch_size]\n",
    "    batch_results = process_batch(batch)\n",
    "    all_results.extend(batch_results)\n",
    "\n",
    "# Add the results back into the DataFrame\n",
    "df_movies_null['tmdb_syn'] = all_results\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       False\n",
       "1       False\n",
       "2       False\n",
       "3       False\n",
       "4       False\n",
       "        ...  \n",
       "6232     True\n",
       "6233     True\n",
       "6234     True\n",
       "6235     True\n",
       "6236     True\n",
       "Name: imdb_syn, Length: 6237, dtype: bool"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "non_null_tmdb_syn = df_movies_null[df_movies_null['tmdb_syn'].notna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_movies.to_csv(\"../dataset/tmdb_syn_labelled.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Combining the imdb and tmdb synopsis into one column - 2 Oct: Summarise once imdb is done - if used\n",
    "- Combine and summarise the synopses using the gensim package\n",
    "- ratio (float, optional) – Number between 0 and 1 that determines the proportion of the number of sentences of the original text to be chosen for the summary.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # from gensim.summarization import summarize\n",
    "\n",
    "# # Combine and summarize the synopses\n",
    "# def combine_and_summarize(imdb_syn, tmdb_syn):\n",
    "#     combined_syn = imdb_syn + \" \" + tmdb_syn\n",
    "#     summarized_syn = summarize(combined_syn, ratio=0.7)  # Adjust the ratio as needed\n",
    "#     return summarized_syn if summarized_syn else combined_syn  # Use original if summarization fails\n",
    "\n",
    "# # Apply the function to your DataFrame\n",
    "# df_movies['summarized_syn'] = df_movies.apply(lambda x: combine_and_summarize(x['imdb_syn'], x['tmdb_syn']), axis=1)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Reading in the tmdb column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmdb = pd.read_csv(\"../dataset/tmdb_syn_labelled.csv\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmdb['movieId'] = tmdb['movieId'].astype('int')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmdb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge df_movies with tmdb based on 'movieId'\n",
    "# This will add the 'tmdb_syn' column from tmdb to df_movies\n",
    "df_movies = pd.merge(df_movies, tmdb[['movieId', 'tmdb_syn']], on='movieId', how='left')\n",
    "\n",
    "# Display the updated DataFrame\n",
    "#df_movies[[\"movieId\", \"title\", \"summarized_syn\"]].head(10)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_movies"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Investigating movies with NO tmdb_syn:\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Create a DataFrame with only the rows where 'tmdb_syn' is NaN\n",
    "df_movies_missing_tmdb_syn = df_movies[pd.isna(df_movies['tmdb_syn'])]\n",
    "\n",
    "# Display or analyze the DataFrame\n",
    "print(df_movies_missing_tmdb_syn)\n",
    "\n",
    "# If you want to know the number of such rows\n",
    "print(\"Number of rows with missing tmdb_syn:\", len(df_movies_missing_tmdb_syn))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For these 56 missing - will try to get from imdb API:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_movies_missing_tmdb_syn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from imdb import IMDb\n",
    "\n",
    "# # Initialize IMDb object\n",
    "# ia = IMDb()\n",
    "\n",
    "# # Function to fetch the plot outline\n",
    "# def get_imdb_syn(imdbId):\n",
    "#     try:\n",
    "#         movie = ia.get_movie(str(imdbId))\n",
    "#         return movie.get('plot outline', 'N/A')\n",
    "#     except Exception as e:\n",
    "#         print(f\"An error occurred: {e}\")\n",
    "#         return 'N/A'\n",
    "\n",
    "# # Find rows where 'tmdb_syn' is NaN\n",
    "# missing_tmdb_syn_idx = df_movies[pd.isna(df_movies['tmdb_syn'])].index.tolist()\n",
    "\n",
    "# # Initialize an empty list to keep track of updated rows\n",
    "# updated_rows = []\n",
    "\n",
    "# # Fetch 'imdb_syn' for these rows\n",
    "# for idx in missing_tmdb_syn_idx:\n",
    "#     imdbId = df_movies.loc[idx, 'imdbId']\n",
    "#     new_syn = get_imdb_syn(imdbId)\n",
    "    \n",
    "#     if new_syn != 'N/A':\n",
    "#         df_movies.loc[idx, 'imdb_syn'] = new_syn\n",
    "#         updated_rows.append(idx)\n",
    "\n",
    "# # Show the rows that were updated\n",
    "# print(\"Updated rows:\")\n",
    "# print(df_movies.loc[updated_rows])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_movies"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Just for now, until the imdb API works -> we are removing these movieIds:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "missing_tmdb_syn_idx = df_movies[pd.isna(df_movies['tmdb_syn'])].index.tolist()\n",
    "missing_tmdb_syn_idx # for \n",
    "\n",
    "# Remove rows with indices in missing_tmdb_syn_idx from df_movies\n",
    "df_movies.drop(missing_tmdb_syn_idx, inplace=True)\n",
    "\n",
    "# Reset index if needed\n",
    "df_movies.reset_index(drop=True, inplace=True)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Text Pre-processing \n",
    "- Need to apply to the following colunns: summarized_syn, title, genre\n",
    "\n",
    "\n",
    "1. Removal of special characters  \n",
    "2. Uniform size of letters - lowercase\n",
    "3. Remove punctuation and quotation marks\n",
    "4. Remove possessive pronouns \n",
    "5. Lemmatisation \n",
    "6. Removal of “stop words”"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. Removal of special characters  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace non-alphabetical characters with empty string, leaving spaces intact\n",
    "df_movies['summarized_syn_cleaned'] = df_movies['summarized_syn'].str.replace(r'[^a-zA-Z\\s]', '', regex=True)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. Uniform size of letters - lowercase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_movies['summarized_syn'] = df_movies['summarized_syn'].str.lower() #lowercase\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "3. Remove punctuation and quotation marks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_movies['summarized_syn'] = df_movies['summarized_syn'].str.replace(r'[^\\w\\s]', '', regex=True)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "4. Remove possessive pronouns "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "\n",
    "\n",
    "# Define a regular expression to match possessive pronouns, with word boundaries\n",
    "possessive_pronouns = r'\\b(my|your|his|her|its|our|their)\\b'\n",
    "\n",
    "# Replace possessive pronouns with empty strings\n",
    "df_movies['summarized_syn'] = df_movies['summarized_syn'].str.replace(possessive_pronouns, '', regex=True)\n",
    "\n",
    "# Remove extra spaces (since the possessive pronouns might leave extra spaces when removed)\n",
    "df_movies['summarized_syn'] = df_movies['summarized_syn'].str.replace(r'\\s+', ' ', regex=True).str.strip()\n",
    "\n",
    "print(df_movies)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "5. Lemmatisation "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from nltk.corpus import wordnet\n",
    "\n",
    "# Download necessary NLTK data\n",
    "nltk.download('averaged_perceptron_tagger')\n",
    "nltk.download('punkt')  # for word_tokenize\n",
    "\n",
    "# Initialize the WordNetLemmatizer\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "\n",
    "# Function to map NLTK's POS tags to the first character used by WordNetLemmatizer\n",
    "def pos_tagger(nltk_tag):\n",
    "    if nltk_tag.startswith('J'):\n",
    "        return wordnet.ADJ\n",
    "    elif nltk_tag.startswith('V'):\n",
    "        return wordnet.VERB\n",
    "    elif nltk_tag.startswith('N'):\n",
    "        return wordnet.NOUN\n",
    "    elif nltk_tag.startswith('R'):\n",
    "        return wordnet.ADV\n",
    "    else:         \n",
    "        return None\n",
    "\n",
    "# Function to lemmatize a single word (removed the keep check)\n",
    "def lemmatize_word(word):\n",
    "    pos = nltk.pos_tag([word])[0][1]  # POS tagging\n",
    "    wordnet_pos = pos_tagger(pos)     # Map POS tag to first character used by WordNetLemmatizer\n",
    "    if wordnet_pos is None:\n",
    "        return word\n",
    "    else:\n",
    "        return lemmatizer.lemmatize(word, wordnet_pos)\n",
    "\n",
    "# Tokenize and then lemmatize\n",
    "df_movies['summarized_syn'] = df_movies['summarized_syn'].apply(\n",
    "    lambda text: ' '.join([lemmatize_word(word) for word in word_tokenize(text)])\n",
    ")\n",
    "\n",
    "# Display the updated DataFrame\n",
    "print(df_movies)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "6. Removal of “stop words”"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "from spacy.lang.en import STOP_WORDS\n",
    "\n",
    "# Function to remove stop words from a list of words\n",
    "def remove_stop_words(words_list):\n",
    "    return [word for word in words_list if word.lower() not in STOP_WORDS]\n",
    "\n",
    "# First, tokenize the sentences into words\n",
    "df_movies['summarized_syn_tokens'] = df_movies['summarized_syn'].apply(\n",
    "    lambda element: word_tokenize(element) if isinstance(element, str) else element\n",
    ")\n",
    "\n",
    "# Now remove stop words from the 'summarized_syn_tokens' column\n",
    "df_movies['summarized_syn_cleaned'] = df_movies['summarized_syn_tokens'].apply(\n",
    "    lambda element: remove_stop_words(element) if isinstance(element, list) else element\n",
    ")\n",
    "\n",
    "# Display the updated DataFrame\n",
    "df_movies\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Encoding genre column using GloVe vectors\n",
    "\n",
    "- Using pre-trained word vectors (wikipedia) - 200d\n",
    "ref: https://github.com/stanfordnlp/GloVe \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_movies['genres'] = df_movies['genres'].apply(lambda x: x.split('|'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get a distinct list of genres\n",
    "\n",
    "\n",
    "# Assuming df_movies['genres'] has been split into lists of strings\n",
    "unique_genres = set()\n",
    "\n",
    "# Iterate over the 'genres' column to populate the unique_genres set\n",
    "for genre_list in df_movies['genres']:\n",
    "    unique_genres.update(genre_list)\n",
    "\n",
    "# Convert the set to a list, if needed\n",
    "unique_genres = list(unique_genres)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_genres = [genre.lower() for genre in unique_genres]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_genres"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\"film-noir\" has been replaced to \"noir\"\n",
    "- As film-noir is not detected in the glove vectors \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_genres = [genre.replace(\"film-noir\", \"noir\") for genre in unique_genres]\n",
    "unique_genres"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Getting the glove vectors for each unique genre "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ref: https://keras.io/examples/nlp/pretrained_word_embeddings/\n",
    "\n",
    "def get_glove(file_path):\n",
    "\n",
    "    embeddings_index = {}\n",
    "    with open(path_to_glove_file, 'r', encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            word, coefs = line.split(maxsplit=1)\n",
    "            coefs = np.fromstring(coefs, \"f\", sep=\" \")\n",
    "            embeddings_index[word] = coefs\n",
    "    return embeddings_index\n",
    "\n",
    "path_to_glove_file = \"/Users/aleishamanalo/Desktop/Thesis/glove.6B/glove.6B.200d.txt\"\n",
    "glove_vec = get_glove(path_to_glove_file)\n",
    "\n",
    "print(\"Found %s word vectors.\" % len(glove_vec))\n",
    "\n",
    " # 200 zero vec is assigned when the word is not found in the GloVe index "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize an empty dictionary to hold the GloVe vectors for each unique genre\n",
    "genre_glove_vec = {}\n",
    "\n",
    "# Populate the genre_glove_vec dictionary\n",
    "for genre in unique_genres:\n",
    "    if genre in glove_vec:  # Check if the genre name is available in the GloVe vocab\n",
    "        genre_glove_vec[genre] = glove_vec[genre]\n",
    "    else:\n",
    "        print(f\"No GloVe representation found for genre: {genre}\")\n",
    "        genre_glove_vec[genre] = None  # You can also populate with a default vector if needed\n",
    "\n",
    "# Now, genre_glove_vec contains the GloVe representation for each unique genre\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of movies with  no genres: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the number of movies with no genres\n",
    "count_no_genres = df_movies['genres'].apply(lambda x: len(x) == 0).sum()\n",
    "\n",
    "print(f'Number of movies with no genres: {count_no_genres}')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "All movies have an assigned genre!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# passing the genre_glove_vec to file - which will be read in the CB model \n",
    "import pickle\n",
    "\n",
    "# Writing to file\n",
    "with open('/Users/aleishamanalo/Desktop/Thesis/genre_glove_vec.pkl', 'wb') as f:\n",
    "    pickle.dump(genre_glove_vec, f)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# put movieId and genres into one dataframe too"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_output= df_movies[[\"movieId\", \"genres\"]]\n",
    "df_output.to_csv(\"/Users/aleishamanalo/Desktop/Thesis/movie_genre.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_movies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# Initialize MultiLabelBinarizer\n",
    "mlb = MultiLabelBinarizer()\n",
    "\n",
    "# Transform the 'genres' column to a binary matrix\n",
    "binary_genres = mlb.fit_transform(df_movies['genres'])\n",
    "\n",
    "# Create a new DataFrame from the binary matrix\n",
    "df_genres = pd.DataFrame(binary_genres, columns=mlb.classes_)\n",
    "\n",
    "# Concatenate the original DataFrame and the new DataFrame\n",
    "df_movies = pd.concat([df_movies, df_genres], axis=1)\n",
    "\n",
    "df_movies\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_movies[[\"movieId\", \"genres\"]].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_genres"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CBOW TF-IDF Method to generate Context Vector "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. tf-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "vectorizer = TfidfVectorizer()\n",
    "tfidf_matrix = vectorizer.fit_transform(df_movies['summarized_syn_cleaned'].apply(' '.join))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# Assuming df_movies['summarized_syn_cleaned'] is a Series of lists of words\n",
    "# We'll join the lists into single strings for each document\n",
    "tfidf_matrix = vectorizer.fit_transform(df_movies['summarized_syn_cleaned'].apply(' '.join))\n",
    "\n",
    "# Get the words corresponding to the features of the TF-IDF matrix\n",
    "feature_names = vectorizer.get_feature_names_out()\n",
    "\n",
    "# Convert the tfidf_matrix to a dense matrix, then to a DataFrame\n",
    "tfidf_dataframe = pd.DataFrame(tfidf_matrix.todense(), columns=feature_names)\n",
    "\n",
    "# Show the head of the DataFrame for a preview\n",
    "tfidf_matrix\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf_matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. CBOW Representation with weight with TF-IDF"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Need to use 200-dimensional pre-trained word vectors\n",
    "Why? Because this matches the dimensionality of the vectors that we use for the GloVe vectors\n",
    "- This is what we need to compare in the WSD operation. They need to have the same dimensionality.\n",
    "- **Put this in thesis.**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using GloVe vectors in your Continuous Bag-of-Words (CBOW) model is a common practice. The idea is to replace the word vectors you generate in the CBOW model with pre-trained GloVe vectors. This way, you can leverage the semantic information captured during the GloVe pre-training process."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def get_glove(file_path):\n",
    "\n",
    "    \n",
    "    embeddings_index = {}\n",
    "    with open(path_to_glove_file, 'r', encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            word, coefs = line.split(maxsplit=1)\n",
    "            coefs = np.fromstring(coefs, \"f\", sep=\" \")\n",
    "            embeddings_index[word] = coefs\n",
    "    return embeddings_index\n",
    "\n",
    "path_to_glove_file = \"/Users/aleishamanalo/Desktop/Thesis/glove.6B/glove.6B.200d.txt\"\n",
    "glove_vec = get_glove(path_to_glove_file)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_weighted_cbow_vector(tokens, tfidf_scores, vocab, model):\n",
    "    weighted_vector = np.zeros((200,))  # Updated dimensionality to match the GloVe vectors\n",
    "    for token, score in zip(tokens, tfidf_scores):\n",
    "        if token in vocab:\n",
    "            try:\n",
    "                weighted_vector += model[token] * score  # Used 'model' parameter\n",
    "            except KeyError:  # Token might not be in the model vocabulary\n",
    "                continue\n",
    "    if len(tokens) > 0:\n",
    "        return weighted_vector / len(tokens)\n",
    "    else:\n",
    "        return np.zeros((200,))\n",
    "\n",
    "\n",
    "# Convert sparse tfidf_matrix to dense form and iterate to compute weighted CBOW vectors, WHY: because not all algorithms work well with sparse form. Helps with element-wise operations. \n",
    "dense_tfidf = tfidf_matrix.todense()\n",
    "vocab = set(vectorizer.get_feature_names_out())\n",
    "df_movies['weighted_cbow_synopsis'] = [get_weighted_cbow_vector(tokens, dense_tfidf[i].tolist()[0], vocab, glove_vec) for i, tokens in enumerate(df_movies['summarized_syn_cleaned'])]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How is tf-IDF used here? With Filtering (vocab): You ensure that the token is not only in the GloVe vocabulary but also relevant in your specific corpus according to TF-IDF. This could be more precise for your use-case but may miss out on some broader semantic relationships captured by GloVe.\n",
    "- Gives more importance to context words that are relevant within your specific corpus. \n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Transforming genre matrix and weighted cbow for each movie in df_movies\n",
    "\n",
    "These will be separate columns\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Convert the 'context_vector_synopsis' and 'binary_genres' to NumPy arrays if they are not already\n",
    "df_movies['final_context_vector'] = df_movies['weighted_cbow_synopsis'].apply(np.array)\n",
    "df_genres_np = df_genres.to_numpy()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_genres_np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_movies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(binary_genres)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Passing to a file ->"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json \n",
    "\n",
    "# df_movies.to_json(\"/Users/aleishamanalo/Desktop/Thesis/df_context_vec.json\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sparse Users Method \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Reading in the recommendations from CF model - empty ones\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reading in the tags (userId, movieId, tag)\n",
    "tags = pd.read_csv(\"/Users/aleishamanalo/thesis/tags_contentbased.csv\")\n",
    "tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reading in the movie context vectors\n",
    "\n",
    "df_movies = pd.read_json(\"/Users/aleishamanalo/Desktop/Thesis/df_context_vec.json\")\n",
    "df_movies\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_rec = pd.read_json('/Users/aleishamanalo/Desktop/Thesis/df_rec.json', orient='split')\n",
    "\n",
    "df_rec_sparse = df_rec[df_rec['recommendations'].str.len() == 0]\n",
    "\n",
    "df_rec_sparse\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "# One-hot encode the genres for each movie\n",
    "df_movies['genres'] = df_movies['genres'].apply(lambda x: x if isinstance(x, list) else [])\n",
    "all_genres = set([genre for sublist in df_movies['genres'].tolist() for genre in sublist])\n",
    "for genre in all_genres:\n",
    "    df_movies[genre] = df_movies['genres'].apply(lambda x: 1 if genre in x else 0)\n",
    "\n",
    "# Extracting genre vectors\n",
    "df_movies['genre_vector'] = df_movies[all_genres].values.tolist()\n",
    "\n",
    "def get_similar_movies_genre_vector(user_movie_vector, threshold=0.98):\n",
    "    similarities = cosine_similarity([user_movie_vector], df_movies['genre_vector'].tolist())[0]\n",
    "    df_movies['cosine_similarity'] = similarities\n",
    "    similar_movies = df_movies[df_movies['cosine_similarity'] >= threshold]\n",
    "    return similar_movies['movieId'].tolist()\n",
    "\n",
    "def get_movie_tags(similar_movie_ids, user):\n",
    "    movie_tags_df = tags[tags['movieId'].isin(similar_movie_ids)][['movieId', 'tag']].groupby('movieId')['tag'].apply(list).reset_index()\n",
    "    movie_tags_df['userId'] = user\n",
    "    return movie_tags_df\n",
    "\n",
    "# Main function using genre vectors\n",
    "def context_aware_model_for_sparse_users_genre_vector():\n",
    "    sparse_users = df_rec_sparse['userId'].unique()\n",
    "    all_tags_for_similar_movies = []\n",
    "    \n",
    "    for user in sparse_users:\n",
    "        # Getting movies tagged by the user\n",
    "        user_movies = tags[tags['userId'] == user]['movieId'].unique()\n",
    "        \n",
    "        for movie in user_movies:\n",
    "            if df_movies[df_movies['movieId'] == movie]['genre_vector'].shape[0] > 0:\n",
    "                user_movie_vector = np.array(df_movies[df_movies['movieId'] == movie]['genre_vector'].iloc[0])\n",
    "            else:\n",
    "                continue\n",
    "\n",
    "            similar_movie_ids = get_similar_movies_genre_vector(user_movie_vector)\n",
    "            movie_tags_df = get_movie_tags(similar_movie_ids, user)\n",
    "            \n",
    "            all_tags_for_similar_movies.append(movie_tags_df)\n",
    "    \n",
    "    return pd.concat(all_tags_for_similar_movies)\n",
    "\n",
    "result_df = context_aware_model_for_sparse_users_genre_vector()\n",
    "print(result_df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_df.to_csv(\"/Users/aleishamanalo/Desktop/Thesis/df_sparse_similar_movies.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_df[result_df['userId'] == 122143]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags[tags['userId'] == 122143]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
