{
 "cells": [
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# For data manipulation and analysis\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# For text preprocessing\n",
    "import re\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import datetime\n",
    "\n",
    "\n",
    "# For multilabel classification\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.multiclass import OneVsRestClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "# For model evaluation\n",
    "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:29:14.858675Z",
     "start_time": "2024-09-14T03:29:14.854484Z"
    }
   },
   "id": "2b814ff84a7d5256",
   "execution_count": 28
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to /Users/jiayi/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package wordnet to /Users/jiayi/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": "True"
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nltk.download('stopwords')\n",
    "nltk.download('wordnet')"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:29:14.904648Z",
     "start_time": "2024-09-14T03:29:14.901775Z"
    }
   },
   "id": "5412b48d1892e391",
   "execution_count": 29
  },
  {
   "cell_type": "markdown",
   "source": [
    "### List of commonly used movie/tv shorthand notations\n",
    "Including: notations, country codes (only including countries where top movies are created), ratings\n",
    "Don't remove these"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "20ae622fc3206308"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "keep = []\n",
    "\n",
    "# Country Codes for prominent film industries (ISO 3166-1 alpha-2 and alpha-3)\n",
    "country_codes = [\n",
    "    \"US\", \"USA\",  # United States\n",
    "    \"IN\", \"IND\",  # India\n",
    "    \"GB\", \"GBR\",  # United Kingdom\n",
    "    \"FR\", \"FRA\",  # France\n",
    "    \"DE\", \"DEU\",  # Germany\n",
    "    \"CN\", \"CHN\",  # China\n",
    "    \"IT\", \"ITA\",  # Italy\n",
    "    \"JP\", \"JPN\",  # Japan\n",
    "    \"KR\", \"KOR\",  # South Korea\n",
    "    \"RU\", \"RUS\",  # Russia\n",
    "    \"AU\", \"AUS\",  # Australia\n",
    "    \"CA\", \"CAN\",  # Canada\n",
    "    \"ES\", \"ESP\",  # Spain\n",
    "    \"BR\", \"BRA\",  # Brazil\n",
    "    \"MX\", \"MEX\"   # Mexico\n",
    "]\n",
    "\n",
    "keep = [\n",
    "    \"BBC\", \"CNN\", \"HBO\", \"FX\", \"MTV\", \"ESPN\", \"AMC\", \"TNT\", \"TBS\", \"VH1\",\n",
    "    \"HD\", \"SD\", \"4K\", \"HDR\", \"UHD\", \"IMAX\", \"DV\",\n",
    "    \"DD\", \"DTS\", \"THX\",\n",
    "    \"OTT\", \"VOD\", \"DVR\", \"PPV\", \"FTA\"\n",
    "]\n",
    "\n",
    "keep = keep + country_codes + [\n",
    "    \"G\", \"PG\", \"PG-13\", \"R\", \"NC-17\", \"U\", \"UA\", \"A\", \"S\",\n",
    "    \"MA\", \"TV-Y\", \"TV-Y7\", \"TV-G\", \"TV-PG\", \"TV-14\", \"TV-MA\"\n",
    "]\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:29:14.912874Z",
     "start_time": "2024-09-14T03:29:14.910337Z"
    }
   },
   "id": "819adbf62beda96d",
   "execution_count": 30
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Importing Dataset - MovieLens 20M"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "8181baed3d75eb88"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# reading in the csv files as dataframes\n",
    "genome_scores = pd.read_csv(\"../dataset/ml-20m/genome-scores.csv\")\n",
    "movies = pd.read_csv(\"../dataset/ml-20m/movies.csv\")\n",
    "genome_tags = pd.read_csv(\"../dataset/ml-20m/genome-tags.csv\")\n",
    "links = pd.read_csv(\"../dataset/ml-20m/links.csv\")\n",
    "tags = pd.read_csv(\"../dataset/ml-20m/tags.csv\")\n",
    "ratings = pd.read_csv(\"../dataset/ml-20m/ratings.csv\")\n",
    "\n",
    "dataframes = [(\"genome_scores\",genome_scores),(\"movies\",movies),(\"genome_tags\",genome_tags),(\"links\",links),(\"tags\",tags),(\"ratings\",ratings)]"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:29:19.222118Z",
     "start_time": "2024-09-14T03:29:14.913652Z"
    }
   },
   "id": "8685ecb9891a56cd",
   "execution_count": 31
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:29:19.242474Z",
     "start_time": "2024-09-14T03:29:19.223522Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "genome_scores dimensions: (11709768, 3)\n",
      "Index(['movieId', 'tagId', 'relevance'], dtype='object')\n",
      "----------------------------------------\n",
      "movies dimensions: (27278, 3)\n",
      "Index(['movieId', 'title', 'genres'], dtype='object')\n",
      "----------------------------------------\n",
      "genome_tags dimensions: (1128, 2)\n",
      "Index(['tagId', 'tag'], dtype='object')\n",
      "----------------------------------------\n",
      "links dimensions: (27278, 3)\n",
      "Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')\n",
      "----------------------------------------\n",
      "tags dimensions: (465564, 4)\n",
      "Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')\n",
      "----------------------------------------\n",
      "ratings dimensions: (20000263, 4)\n",
      "Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')\n",
      "----------------------------------------\n"
     ]
    }
   ],
   "source": [
    "# check basic infomation of dataset\n",
    "for name, df in dataframes:\n",
    "    print(f\"{name} dimensions: {df.shape}\")\n",
    "    print(df.columns)\n",
    "    print(\"-\"*40)"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Pre-processing each dataframe\n",
    "- Data conversions\n",
    "- Investigating missing data proportions\n",
    "- Naming conventions\n",
    "- Identifying relationships between dataframes"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "87a87c3ba2d7af8e"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# check missing data\n",
    "def calculate_missing_data(df):\n",
    "    missing_data = df.isnull().sum() / len(df)\n",
    "    return missing_data"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:29:19.244551Z",
     "start_time": "2024-09-14T03:29:19.243050Z"
    }
   },
   "id": "670ee5d63512b4e1",
   "execution_count": 33
  },
  {
   "cell_type": "markdown",
   "source": [
    "### (1) tags DF\n",
    "- userId\n",
    "- movieId\n",
    "- tag: user assigned tags to movie items. Contains tags in different languages.\n",
    "- timestamp: Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970."
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "b64ce4dbf55326a3"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Frequency of short tags in descending order:\n",
      "R: 756\n",
      "3D: 112\n",
      "PG: 80\n",
      "3d: 67\n",
      "TV: 64\n",
      "3: 57\n",
      "2: 35\n",
      "SF: 28\n",
      "G: 26\n",
      "4: 26\n",
      "DC: 24\n",
      "1: 24\n",
      "sg: 20\n",
      "NR: 20\n",
      "FX: 14\n",
      "ok: 12\n",
      "f: 10\n",
      "NE: 8\n",
      " : 7\n",
      "AI: 7\n",
      "MT: 6\n",
      "HD: 5\n",
      "bg: 5\n",
      "MF: 5\n",
      "2D: 4\n",
      "Oz: 4\n",
      "hs: 4\n",
      "BD: 4\n",
      "BP: 4\n",
      "tv: 4\n",
      "UK: 4\n",
      "no: 4\n",
      "dj: 3\n",
      "SS: 3\n",
      "s: 3\n",
      "CG: 3\n",
      "bc: 2\n",
      "US: 2\n",
      "DA: 2\n",
      "X: 2\n",
      "Id: 2\n",
      "oz: 2\n",
      "<3: 2\n",
      "We: 1\n",
      "ra: 1\n",
      "ss: 1\n",
      "GM: 1\n",
      "fx: 1\n",
      "hd: 1\n",
      "vw: 1\n",
      "ms: 1\n",
      "cb: 1\n",
      "OJ: 1\n",
      "dc: 1\n",
      "es: 1\n",
      "wy: 1\n",
      "Eh: 1\n",
      "SC: 1\n",
      "JR: 1\n",
      "\\: 1\n",
      "89: 1\n",
      ".: 1\n",
      "il: 1\n",
      "7: 1\n",
      "Ok: 1\n",
      "e: 1\n",
      "c: 1\n",
      "b: 1\n",
      "a: 1\n",
      "d: 1\n",
      "M: 1\n",
      "Q: 1\n",
      "13: 1\n",
      "TX: 1\n",
      "10: 1\n",
      "25: 1\n",
      "85: 1\n",
      "eh: 1\n",
      "4d: 1\n",
      "?: 1\n",
      "dd: 1\n",
      "Na: 1\n",
      "l: 1\n",
      "go: 1\n",
      "Mu: 1\n",
      "ex: 1\n",
      "ds: 1\n",
      "sd: 1\n",
      "uk: 1\n",
      "b5: 1\n",
      "UR: 1\n"
     ]
    }
   ],
   "source": [
    "# tags exploration\n",
    "tags['tag'] = tags['tag'].astype('str')\n",
    "\n",
    "from collections import Counter\n",
    "# Initialize Counter\n",
    "tag_counter = Counter(tags['tag'])\n",
    "\n",
    "# Filter tags based on length and count frequency of short tags\n",
    "short_tags_counter = {k: v for k, v in tag_counter.items() if len(k) < 3}\n",
    "\n",
    "# Sort by frequency in descending order\n",
    "sorted_short_tags = {k: v for k, v in sorted(short_tags_counter.items(), key=lambda item: item[1], reverse=True)}\n",
    "\n",
    "# Output frequencies of short tags\n",
    "print(\"Frequency of short tags in descending order:\")\n",
    "for tag, freq in sorted_short_tags.items():\n",
    "    print(f\"{tag}: {freq}\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:29:19.287397Z",
     "start_time": "2024-09-14T03:29:19.246159Z"
    }
   },
   "id": "13f2b3d5599206b2",
   "execution_count": 34
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len of tags: 465564\n",
      "Missing data in tags\n",
      "userId       0.0\n",
      "movieId      0.0\n",
      "tag          0.0\n",
      "timestamp    0.0\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# data conversions\n",
    "dt_dict = {'userId' : 'int', 'movieId' : 'int', 'tag' : 'str', 'timestamp' : 'int'} # timestamp to int -> then convert to date \n",
    "tags = tags.astype(dt_dict)\n",
    "tags['timestamp'] = pd.to_datetime(tags['timestamp'], unit='s')\n",
    "\n",
    "\n",
    "# len \n",
    "print(\"len of tags: \" + str(len(tags)))\n",
    "# missing data\n",
    "print(\"Missing data in tags\")\n",
    "print(calculate_missing_data(tags))"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:29:19.329588Z",
     "start_time": "2024-09-14T03:29:19.288118Z"
    }
   },
   "id": "696211b73f1dfbe7",
   "execution_count": 35
  },
  {
   "cell_type": "markdown",
   "source": [
    "#### Subset Data -> Tags (only 70% of users are included)\n",
    "- Randomly generated random.seed(0)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "b24eb1781322b24a"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# import random\n",
    "# random.seed()\n",
    "\n",
    "tags_full = tags\n",
    "user_frac = 0.7\n",
    "# Get a random sample of unique userIds\n",
    "unique_user_ids = tags['userId'].unique()\n",
    "subset_user_ids = np.random.choice(unique_user_ids, size=int(len(unique_user_ids) * user_frac), replace=False)\n",
    "tags_sub = tags[tags['userId'].isin(subset_user_ids)]"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:29:19.349582Z",
     "start_time": "2024-09-14T03:29:19.330241Z"
    }
   },
   "id": "1b6bcaffd50a883e",
   "execution_count": 36
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Subset len of tags(all languages): 26184\n",
      "Full set len of tags(all languages): 31748\n"
     ]
    }
   ],
   "source": [
    "# No. of tags (all languages)\n",
    "len_all_sub = len(tags_sub['tag'].unique())\n",
    "print(\"Subset len of tags(all languages): \" + str(len_all_sub))\n",
    "\n",
    "len_all_full = len(tags_full['tag'].unique())\n",
    "print(\"Full set len of tags(all languages): \" + str(len_all_full))"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:47:25.881933Z",
     "start_time": "2024-09-14T03:47:25.836567Z"
    }
   },
   "id": "c9e24266b361c102",
   "execution_count": 49
  },
  {
   "cell_type": "markdown",
   "source": [
    "#### Filtering tags -> English only, Model: FastText"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "90ba071ac48315c9"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# Fasttext import\n",
    "# import sys\n",
    "# !{sys.executable} -m pip install fasttext\n",
    "\n",
    "import urllib.request\n",
    "\n",
    "url = \"https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin\"\n",
    "filename = \"../pretrain_model/lid.176.bin\"\n",
    "# for fastText model\n",
    "urllib.request.urlretrieve(url, filename)\n",
    "\n",
    "import fasttext\n",
    "\n",
    "# Load the model\n",
    "language_model = fasttext.load_model(\"lid.176.bin\")\n",
    "\n",
    "# Define a function to detect language\n",
    "def is_english(text):\n",
    "    try:\n",
    "        predictions = language_model.predict(text, k=1)\n",
    "        return predictions[0][0] == '__label__en'\n",
    "    except:\n",
    "        return False"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:29:39.455039Z",
     "start_time": "2024-09-14T03:29:19.385654Z"
    }
   },
   "id": "d79a98ec16ddad4f",
   "execution_count": 38
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_36496/1987696283.py:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags_sub['is_english'] = tags_sub['tag'].apply(is_english)\n",
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_36496/1987696283.py:21: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags_full.drop(columns=['is_english'], inplace=True)\n"
     ]
    }
   ],
   "source": [
    "############# For tags (Subset) ############################# \n",
    "# Assuming `tags` is your DataFrame and it has a `tag` column\n",
    "tags_sub['is_english'] = tags_sub['tag'].apply(is_english)\n",
    "\n",
    "# Filter rows where the tag is in English\n",
    "tags_sub = tags_sub[tags_sub['is_english']]\n",
    "\n",
    "# Drop the 'is_english' column as it's no longer needed\n",
    "tags_sub.drop(columns=['is_english'], inplace=True)\n",
    "\n",
    "\n",
    "############# For tags_full ############################# \n",
    "# This is only for the POS tag section in CB model\n",
    "# Assuming `tags` is your DataFrame and it has a `tag` column\n",
    "tags_full['is_english'] = tags_full['tag'].apply(is_english)\n",
    "\n",
    "# Filter rows where the tag is in English\n",
    "tags_full = tags_full[tags_full['is_english']]\n",
    "\n",
    "# Drop the 'is_english' column as it's no longer needed\n",
    "tags_full.drop(columns=['is_english'], inplace=True)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:29:41.159317Z",
     "start_time": "2024-09-14T03:29:39.455885Z"
    }
   },
   "id": "503461bf904c0276",
   "execution_count": 39
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "82.31891348088531 % For tag subset\n",
      "82.15505641237966 % For tag full\n"
     ]
    }
   ],
   "source": [
    "# No. of subset tags (ENGLISH)\n",
    "len_eng = len(tags_sub['tag'].unique())\n",
    "# calculate the percentage of English tags:\n",
    "per = len_eng/len_all_sub * 100\n",
    "print(str(per) + \" %\" + \" For tag subset\")\n",
    "\n",
    "# No. of full set tags (ENGLISH)\n",
    "len_eng = len(tags_full['tag'].unique())\n",
    "# calculate the percentage of English tags:\n",
    "per = len_eng/len_all_full * 100\n",
    "print(str(per) + \" %\" + \" For tag full\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:29:41.195881Z",
     "start_time": "2024-09-14T03:29:41.161032Z"
    }
   },
   "id": "2e336a64782d93d3",
   "execution_count": 40
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "296698"
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tags_sub)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:32:39.661455Z",
     "start_time": "2024-09-14T03:32:39.658908Z"
    }
   },
   "id": "be75a5612549ed90",
   "execution_count": 41
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# Save files\n",
    "# tags_sub.to_csv(\"../dataset/subset.csv\",index=False)\n",
    "tags_full.to_csv(\"../dataset/tags_full.csv\",index=False)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:29:41.197723Z",
     "start_time": "2024-09-14T03:29:41.196431Z"
    }
   },
   "id": "ca3f016dc38d5322",
   "execution_count": 40
  },
  {
   "cell_type": "markdown",
   "source": [
    "### (2) genome_scores DF"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "3897a67cd295c76a"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "dt_dict = {'movieId' : 'int', 'tagId' : 'int', 'relevance' : 'float'}\n",
    "genome_scores = genome_scores.astype(dt_dict)\n",
    "print(\"len of genome_scores: \" + str(len(genome_scores)))\n",
    "\n",
    "# missing data\n",
    "print(\"Missing data in genome_scores\")\n",
    "print(calculate_missing_data(genome_scores))"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "ed5a8aa4056236c9"
  },
  {
   "cell_type": "markdown",
   "source": [
    "### (3) genome_tags DF"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "5037188c917e9855"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "df_dict = {'tagId' : 'int', 'tag' : 'str'}\n",
    "genome_tags = genome_tags.astype(df_dict)\n",
    "print(\"len of genome_tags: \" + str(len(genome_tags)))\n",
    "# missing data\n",
    "print(\"Missing data in genome_tags\")\n",
    "print(calculate_missing_data(genome_tags))\n"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "e9697c05e1b15dbc"
  },
  {
   "cell_type": "markdown",
   "source": [
    "### (4) movies DF"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "2531c628d1ce5f9d"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "df_dict = {'movieId' : 'int', 'title' : 'str', 'genres' : 'str'}\n",
    "movies = movies.astype(df_dict)\n",
    "print(\"len of movies: \" + str(len(movies)))\n",
    "# missing data\n",
    "print(\"Missing data in movies\")\n",
    "print(calculate_missing_data(movies))"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "e3a84b4c1879ba8f"
  },
  {
   "cell_type": "markdown",
   "source": [
    "### (5) links DF\n",
    "- movieId\n",
    "- imdbId\n",
    "- tmdbId"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "685c8bc8121716d5"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Missing data in links\n",
      "movieId    0.000000\n",
      "imdbId     0.000000\n",
      "tmdbId     0.009238\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# missing data\n",
    "print(\"Missing data in links\")\n",
    "print(calculate_missing_data(links)) # very small missing data for tmdbId -> remove"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:39:42.480360Z",
     "start_time": "2024-09-14T03:39:42.473741Z"
    }
   },
   "id": "e8c5684024798d9e",
   "execution_count": 42
  },
  {
   "cell_type": "markdown",
   "source": [
    "- Use links to extract textual data about the movie - descriptions\n",
    "- Use this to identify unreliable tags"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "8ffe2add243d0625"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len of links: 27026\n"
     ]
    }
   ],
   "source": [
    "links = links.dropna()\n",
    "df_dict = {'movieId' : 'int', 'imdbId' : 'int', 'tmdbId' : 'int'}\n",
    "links = links.astype(df_dict)\n",
    "print(\"len of links: \" + str(len(links)))"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-14T03:39:44.277729Z",
     "start_time": "2024-09-14T03:39:44.272390Z"
    }
   },
   "id": "60f23768c108767b",
   "execution_count": 43
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
