{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tags: Processing user-generated tags "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T22:59:37.120169Z",
     "start_time": "2024-08-27T22:59:36.442361Z"
    }
   },
   "outputs": [],
   "source": [
    "# For data manipulation and analysis\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# For text preprocessing\n",
    "import re\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import datetime\n",
    "import string\n",
    "\n",
    "# For multilabel classification\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.multiclass import OneVsRestClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "\n",
    "# For model evaluation\n",
    "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Only re-run the cell below (uncomment it first if needed) when a new subset of the data is required, since the subset is drawn from the data loaded here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:26:16.375966Z",
     "start_time": "2024-08-27T23:26:16.176272Z"
    }
   },
   "outputs": [],
   "source": [
    "# Full tag set: used for POS tagging in the CB model and as the pool the user subset is sampled from.\n",
    "# It is read once (the raw ml-20m read that used to precede it was overwritten immediately),\n",
    "# and `tags_full` is kept so the subset-sampling recipe can refer to it.\n",
    "tags_full = pd.read_csv(\"../dataset/tags_full.csv\")\n",
    "tags = tags_full\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### List of commonly used movie/tv shorthand notations\n",
    "Including: notations, country codes (only including countries where top movies are created), ratings\n",
    "Don't remove these"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:26:27.347162Z",
     "start_time": "2024-08-27T23:26:27.343346Z"
    }
   },
   "outputs": [],
   "source": [
    "keep = []\n",
    "\n",
    "# Country Codes for prominent film industries (ISO 3166-1 alpha-2 and alpha-3)\n",
    "country_codes = [\n",
    "    \"US\", \"USA\",  # United States\n",
    "    \"IN\", \"IND\",  # India\n",
    "    \"GB\", \"GBR\",  # United Kingdom\n",
    "    \"FR\", \"FRA\",  # France\n",
    "    \"DE\", \"DEU\",  # Germany\n",
    "    \"CN\", \"CHN\",  # China\n",
    "    \"IT\", \"ITA\",  # Italy\n",
    "    \"JP\", \"JPN\",  # Japan\n",
    "    \"KR\", \"KOR\",  # South Korea\n",
    "    \"RU\", \"RUS\",  # Russia\n",
    "    \"AU\", \"AUS\",  # Australia\n",
    "    \"CA\", \"CAN\",  # Canada\n",
    "    \"ES\", \"ESP\",  # Spain\n",
    "    \"BR\", \"BRA\",  # Brazil\n",
    "    \"MX\", \"MEX\"   # Mexico\n",
    "]\n",
    "\n",
    "keep = [\n",
    "    \"BBC\", \"CNN\", \"HBO\", \"FX\", \"MTV\", \"ESPN\", \"AMC\", \"TNT\", \"TBS\", \"VH1\",\n",
    "    \"HD\", \"SD\", \"4K\", \"HDR\", \"UHD\", \"IMAX\", \"DV\",\n",
    "    \"DD\", \"DTS\", \"THX\",\n",
    "    \"OTT\", \"VOD\", \"DVR\", \"PPV\", \"FTA\"\n",
    "]\n",
    "\n",
    "keep = keep + country_codes + [\n",
    "    \"G\", \"PG\", \"PG-13\", \"R\", \"NC-17\", \"U\", \"UA\", \"A\", \"S\",\n",
    "    \"MA\", \"TV-Y\", \"TV-Y7\", \"TV-G\", \"TV-PG\", \"TV-14\", \"TV-MA\"\n",
    "]\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Subset Data (30% of users )** - Save in subset_file.csv (this is the subset that is tested on)\n",
    "- Static subset \n",
    "- Run the below code once, then comment out. Otherwise, this changes the subset. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:28:04.960679Z",
     "start_time": "2024-08-27T23:28:04.728293Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "109313"
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# user_frac = 0.3\n",
    "# # Get a random sample of unique userIds\n",
    "# tags = tags_full\n",
    "# unique_user_ids = tags['userId'].unique()\n",
    "# subset_user_ids = np.random.choice(unique_user_ids, size=int(len(unique_user_ids) * user_frac), replace=False)\n",
    "# tags = tags[tags['userId'].isin(subset_user_ids)]\n",
    "# tags.to_csv('../dataset/subset.csv',index=False) # run once then never run again unless testing/increasing users\n",
    "\n",
    "# Read the static subset that the rest of the notebook works on\n",
    "tags = pd.read_csv('../dataset/subset.csv')\n",
    "\n",
    "# Drop rows with a missing tag BEFORE casting: astype('str') would otherwise turn NaN into the\n",
    "# literal string 'nan', which later filters cannot tell apart from a genuine tag\n",
    "tags = tags.dropna(subset=['tag'])\n",
    "\n",
    "# Data conversions\n",
    "dt_dict = {'userId' : 'int', 'movieId' : 'int', 'tag' : 'str'}\n",
    "tags = tags.astype(dt_dict)\n",
    "\n",
    "len(tags)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:28:36.205367Z",
     "start_time": "2024-08-27T23:28:36.202350Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The number of distinct userIds is 2225\n"
     ]
    }
   ],
   "source": [
    "# Count how many different users appear in the subset\n",
    "distinct_userIds = tags['userId'].nunique(dropna=True)\n",
    "print(f\"The number of distinct userIds is {distinct_userIds}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:29:00.250721Z",
     "start_time": "2024-08-27T23:29:00.229676Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "54219"
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "# Keep only single-word tags, i.e. rows whose tag splits on whitespace into exactly one token\n",
    "word_counts = tags['tag'].str.split().str.len()\n",
    "tags = tags[word_counts == 1]\n",
    "\n",
    "len(tags)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:29:00.391423Z",
     "start_time": "2024-08-27T23:29:00.383870Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "        Unnamed: 0.1  userId  movieId          tag            timestamp\n7                266     318      260        1970s  2015-02-20 22:42:49\n8                267     318   115149       Action  2015-02-21 15:58:30\n15               274     320     2762        twist  2006-04-25 11:33:52\n16               275     320     2959        twist  2006-04-25 11:30:58\n17               276     320     3996    overrated  2006-04-25 11:32:28\n...              ...     ...      ...          ...                  ...\n109306        390955  138280   116797      history  2015-01-30 23:07:25\n109307        390956  138280   116797  informatics  2015-01-30 23:07:35\n109308        390957  138280   116797  mathematics  2015-01-30 23:07:17\n109310        390959  138280   117871        image  2015-01-30 23:09:16\n109311        390960  138280   117871        story  2015-01-30 23:09:25\n\n[54219 rows x 5 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0.1</th>\n      <th>userId</th>\n      <th>movieId</th>\n      <th>tag</th>\n      <th>timestamp</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>7</th>\n      <td>266</td>\n      <td>318</td>\n      <td>260</td>\n      <td>1970s</td>\n      <td>2015-02-20 22:42:49</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>267</td>\n      <td>318</td>\n      <td>115149</td>\n      <td>Action</td>\n      <td>2015-02-21 15:58:30</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>274</td>\n      <td>320</td>\n      <td>2762</td>\n      <td>twist</td>\n      <td>2006-04-25 11:33:52</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>275</td>\n      <td>320</td>\n      <td>2959</td>\n      <td>twist</td>\n      <td>2006-04-25 11:30:58</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>276</td>\n      <td>320</td>\n      <td>3996</td>\n      <td>overrated</td>\n      <td>2006-04-25 11:32:28</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>109306</th>\n      <td>390955</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>history</td>\n      <td>2015-01-30 23:07:25</td>\n    </tr>\n    <tr>\n      <th>109307</th>\n      <td>390956</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>informatics</td>\n      <td>2015-01-30 23:07:35</td>\n    </tr>\n    <tr>\n      <th>109308</th>\n      <td>390957</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>mathematics</td>\n      <td>2015-01-30 23:07:17</td>\n    
</tr>\n    <tr>\n      <th>109310</th>\n      <td>390959</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>image</td>\n      <td>2015-01-30 23:09:16</td>\n    </tr>\n    <tr>\n      <th>109311</th>\n      <td>390960</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>story</td>\n      <td>2015-01-30 23:09:25</td>\n    </tr>\n  </tbody>\n</table>\n<p>54219 rows × 5 columns</p>\n</div>"
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'Unnamed: 0' / 'Unnamed: 0.1' look like index columns left over from saving the CSV with its index\n",
    "# (assumption - confirm). drop() returns a new frame, so the result has to be assigned back;\n",
    "# errors='ignore' keeps the cell re-runnable when a column is already gone.\n",
    "tags = tags.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')\n",
    "tags.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:29:00.399951Z",
     "start_time": "2024-08-27T23:29:00.392172Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The average number of tags that a userId has given a movie is 2.105265201522094\n"
     ]
    }
   ],
   "source": [
    "# Count the tags each user attached to each movie, then average those counts\n",
    "tags_per_user_movie = tags.groupby(['userId', 'movieId']).size()\n",
    "grouped_data = tags_per_user_movie.reset_index(name='num_tags')\n",
    "average_tags_per_movie_per_user = grouped_data['num_tags'].mean()\n",
    "print(f\"The average number of tags that a userId has given a movie is {average_tags_per_movie_per_user}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:29:00.409180Z",
     "start_time": "2024-08-27T23:29:00.407315Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The number of distinct userIds is 1699\n"
     ]
    }
   ],
   "source": [
    "# Re-count the distinct users that remain after the single-word filter\n",
    "distinct_userIds = tags.loc[:, 'userId'].nunique()\n",
    "print(f\"The number of distinct userIds is {distinct_userIds}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Removing empty string tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:29:00.428213Z",
     "start_time": "2024-08-27T23:29:00.419522Z"
    }
   },
   "outputs": [],
   "source": [
    "# `tags['tag'] != None` is evaluated element-wise and is True for every row, so it never removed\n",
    "# anything; notna() is the check that actually drops missing values. Blank / whitespace-only\n",
    "# strings are removed in the same pass.\n",
    "has_tag = tags['tag'].notna() & (tags['tag'].astype(str).str.strip() != '')\n",
    "tags = tags[has_tag]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "English Language Only\n",
    "\n",
    "- Using FastText model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:29:01.457432Z",
     "start_time": "2024-08-27T23:29:00.471636Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: fasttext in /Users/jiayi/anaconda3/lib/python3.11/site-packages (0.9.3)\r\n",
      "Requirement already satisfied: pybind11>=2.2 in /Users/jiayi/anaconda3/lib/python3.11/site-packages (from fasttext) (2.13.4)\r\n",
      "Requirement already satisfied: setuptools>=0.7.0 in /Users/jiayi/anaconda3/lib/python3.11/site-packages (from fasttext) (68.0.0)\r\n",
      "Requirement already satisfied: numpy in /Users/jiayi/anaconda3/lib/python3.11/site-packages (from fasttext) (1.24.3)\r\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "\n",
    "# %pip installs into the environment of the running kernel; the version is pinned so a fresh\n",
    "# environment resolves to the same release this notebook was last run with\n",
    "%pip install -q fasttext==0.9.3\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:29:01.462800Z",
     "start_time": "2024-08-27T23:29:01.458432Z"
    }
   },
   "outputs": [],
   "source": [
    "# Number of distinct tags before the language filter (all languages)\n",
    "distinct_tags_all = tags['tag'].unique()\n",
    "len_all = len(distinct_tags_all)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- Check whether this should be done before or after lemma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:29:36.993011Z",
     "start_time": "2024-08-27T23:29:36.740412Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100.0 %\n"
     ]
    }
   ],
   "source": [
    "import fasttext\n",
    "\n",
    "# Load the pretrained fastText language-identification model\n",
    "language_model = fasttext.load_model(\"../pretrain_model/lid.176.bin\")\n",
    "\n",
    "\n",
    "def is_english(text):\n",
    "    \"\"\"Return True when the model's top-1 language label for `text` is English.\n",
    "\n",
    "    Any ordinary failure while predicting (including an empty prediction) is treated as\n",
    "    'not English', so one problematic tag cannot abort the whole apply().\n",
    "    \"\"\"\n",
    "    try:\n",
    "        predictions = language_model.predict(text, k=1)\n",
    "        return predictions[0][0] == '__label__en'\n",
    "    except Exception:\n",
    "        # Best-effort on purpose, but `Exception` rather than a bare `except:` so that\n",
    "        # KeyboardInterrupt / SystemExit can still stop a long run\n",
    "        return False\n",
    "\n",
    "\n",
    "# Keep a tag when it is detected as English OR it is one of the `keep` shorthand notations\n",
    "# (`A | (~A & B)` is logically the same as `A | B`)\n",
    "english_mask = tags['tag'].apply(is_english).astype(bool)\n",
    "keep_mask = tags['tag'].isin(keep)\n",
    "\n",
    "# Filtering with masks avoids adding and then dropping a temporary column in place;\n",
    "# .copy() makes the result an independent frame\n",
    "tags = tags.loc[english_mask | keep_mask].copy()\n",
    "\n",
    "# No. of tags (ENGLISH)\n",
    "len_eng = len(tags['tag'].unique())\n",
    "\n",
    "# Percentage of distinct tags that survived the filter (guarded against an empty frame)\n",
    "per = len_eng / len_all * 100 if len_all else 0.0\n",
    "print(str(per) + \" %\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:29:45.874232Z",
     "start_time": "2024-08-27T23:29:45.868462Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Frequency of short tags in descending order:\n",
      "R: 326\n",
      "DVD: 188\n",
      "own: 142\n",
      "get: 53\n",
      "cgi: 47\n",
      "2.5: 43\n",
      "War: 41\n",
      "3.5: 34\n",
      "buy: 29\n",
      "art: 28\n",
      "80s: 27\n",
      "3d: 24\n",
      "Gay: 21\n",
      "1: 21\n",
      "G: 20\n",
      "dog: 19\n",
      "CGI: 18\n",
      "wry: 16\n",
      "SF: 15\n",
      "DC: 14\n",
      "FBI: 14\n",
      "60s: 10\n",
      "odd: 10\n",
      "90s: 10\n",
      "f: 10\n",
      "law: 8\n",
      "oil: 8\n",
      "NE: 8\n",
      "Old: 7\n",
      "70s: 7\n",
      "Art: 6\n",
      "cat: 6\n",
      "MT: 6\n",
      "old: 5\n",
      "NYC: 5\n",
      "MMA: 4\n",
      "pub: 4\n",
      "hs: 4\n",
      "bad: 4\n",
      "30s: 4\n",
      "ok: 3\n",
      "meh: 3\n",
      "ice: 3\n",
      "DIY: 3\n",
      "UK: 3\n",
      "wtf: 3\n",
      "WWI: 2\n",
      "zoo: 2\n",
      "bio: 2\n",
      "box: 2\n",
      "gun: 2\n",
      "men: 2\n",
      "dry: 2\n",
      "fbb: 2\n",
      "Sea: 2\n",
      "tea: 2\n",
      "eye: 2\n",
      "bus: 2\n",
      "all: 2\n",
      "s: 2\n",
      "Cat: 2\n",
      "POW: 2\n",
      "tps: 1\n",
      "ss: 1\n",
      "bc: 1\n",
      "pig: 1\n",
      "WHO: 1\n",
      "Gun: 1\n",
      "Boy: 1\n",
      "toy: 1\n",
      "run: 1\n",
      "egg: 1\n",
      "cb: 1\n",
      "orc: 1\n",
      "red: 1\n",
      "wit: 1\n",
      "UAV: 1\n",
      "WTF: 1\n",
      "Doc: 1\n",
      "wy: 1\n",
      "cue: 1\n",
      "koo: 1\n",
      "beh: 1\n",
      "4.5: 1\n",
      "phd: 1\n",
      "JR: 1\n",
      "SNL: 1\n",
      "bjj: 1\n",
      "PKD: 1\n",
      "Rap: 1\n",
      "X: 1\n",
      "nyc: 1\n",
      "wig: 1\n",
      "bed: 1\n",
      "ABA: 1\n",
      "daf: 1\n",
      "DPD: 1\n",
      "Sad: 1\n",
      "edw: 1\n",
      "fox: 1\n",
      "c: 1\n",
      "a: 1\n",
      "SS: 1\n",
      "USN: 1\n",
      "GDR: 1\n",
      "AI: 1\n",
      "Mob: 1\n",
      "WTO: 1\n",
      "Hal: 1\n",
      "car: 1\n",
      "McG: 1\n",
      "SAS: 1\n",
      "ect: 1\n",
      "paz: 1\n",
      "age: 1\n",
      "see: 1\n",
      "RAF: 1\n",
      "Law: 1\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "\n",
    "# Tally how often each tag occurs\n",
    "tag_counter = Counter(tags['tag'])\n",
    "\n",
    "# Restrict the tally to short tags (fewer than 4 characters)\n",
    "short_tags_counter = dict(filter(lambda item: len(item[0]) < 4, tag_counter.items()))\n",
    "\n",
    "# Most frequent first; the sort is stable, so ties keep their existing relative order\n",
    "sorted_short_tags = dict(sorted(short_tags_counter.items(), key=lambda item: item[1], reverse=True))\n",
    "\n",
    "# Report one line per short tag\n",
    "print(\"Frequency of short tags in descending order:\")\n",
    "for tag, freq in sorted_short_tags.items():\n",
    "    print(f\"{tag}: {freq}\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Exploring the tags\n",
    "- Length\n",
    "- Topic\n",
    "- Semantics\n",
    "- Unreliable tags: based on NER and semantic\n",
    "Consider -> using a pre-trained model for KNOWN words, and then writing an algorithm to process UNKNOWN words\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:29:46.001Z",
     "start_time": "2024-08-27T23:29:45.997011Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "        Unnamed: 0.1  Unnamed: 0  userId  movieId          tag  \\\n7                266         301     318      260        1970s   \n8                267         304     318   115149       Action   \n15               274         312     320     2762        twist   \n16               275         313     320     2959        twist   \n17               276         314     320     3996    overrated   \n...              ...         ...     ...      ...          ...   \n109306        390955      464426  138280   116797      history   \n109307        390956      464427  138280   116797  informatics   \n109308        390957      464428  138280   116797  mathematics   \n109310        390959      464430  138280   117871        image   \n109311        390960      464432  138280   117871        story   \n\n                  timestamp  \n7       2015-02-20 22:42:49  \n8       2015-02-21 15:58:30  \n15      2006-04-25 11:33:52  \n16      2006-04-25 11:30:58  \n17      2006-04-25 11:32:28  \n...                     ...  \n109306  2015-01-30 23:07:25  \n109307  2015-01-30 23:07:35  \n109308  2015-01-30 23:07:17  \n109310  2015-01-30 23:09:16  \n109311  2015-01-30 23:09:25  \n\n[54219 rows x 6 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0.1</th>\n      <th>Unnamed: 0</th>\n      <th>userId</th>\n      <th>movieId</th>\n      <th>tag</th>\n      <th>timestamp</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>7</th>\n      <td>266</td>\n      <td>301</td>\n      <td>318</td>\n      <td>260</td>\n      <td>1970s</td>\n      <td>2015-02-20 22:42:49</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>267</td>\n      <td>304</td>\n      <td>318</td>\n      <td>115149</td>\n      <td>Action</td>\n      <td>2015-02-21 15:58:30</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>274</td>\n      <td>312</td>\n      <td>320</td>\n      <td>2762</td>\n      <td>twist</td>\n      <td>2006-04-25 11:33:52</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>275</td>\n      <td>313</td>\n      <td>320</td>\n      <td>2959</td>\n      <td>twist</td>\n      <td>2006-04-25 11:30:58</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>276</td>\n      <td>314</td>\n      <td>320</td>\n      <td>3996</td>\n      <td>overrated</td>\n      <td>2006-04-25 11:32:28</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>109306</th>\n      <td>390955</td>\n      <td>464426</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>history</td>\n      <td>2015-01-30 23:07:25</td>\n    </tr>\n    <tr>\n      <th>109307</th>\n      <td>390956</td>\n      <td>464427</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>informatics</td>\n      <td>2015-01-30 
23:07:35</td>\n    </tr>\n    <tr>\n      <th>109308</th>\n      <td>390957</td>\n      <td>464428</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>mathematics</td>\n      <td>2015-01-30 23:07:17</td>\n    </tr>\n    <tr>\n      <th>109310</th>\n      <td>390959</td>\n      <td>464430</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>image</td>\n      <td>2015-01-30 23:09:16</td>\n    </tr>\n    <tr>\n      <th>109311</th>\n      <td>390960</td>\n      <td>464432</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>story</td>\n      <td>2015-01-30 23:09:25</td>\n    </tr>\n  </tbody>\n</table>\n<p>54219 rows × 6 columns</p>\n</div>"
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the tags frame as it stands after the language filter\n",
    "tags"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pre-processing\n",
    "\n",
    "Order of the preprocessing steps is important. \n",
    "\n",
    "1. Lowercase\n",
    "\n",
    "2. Remove punctuation, symbols\n",
    "\n",
    "- English restriction \n",
    "\n",
    "3. Spellchecking:\n",
    "- This is applied before stemming and tokenisation. \n",
    "- Ensures valid words are considered before lemmatisation\n",
    "\n",
    "\n",
    "4. Remove stop words\n",
    "\n",
    "5. Lemmatisation\n",
    "\n",
    "Justification of order:\n",
    "- Spellcheck before tokenisation allows this process to be contextually informed based on the order of words\n",
    "- "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 0) Remove conjoined words \n",
    "- some tags may be multiple words in one -> remove these\n"
   ]
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package words to /Users/jiayi/nltk_data...\n",
      "[nltk_data]   Unzipping corpora/words.zip.\n"
     ]
    },
    {
     "data": {
      "text/plain": "True"
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "\n",
    "# The conjoined-word check uses both the 'words' word list and WordNet (wordnet.synsets),\n",
    "# so fetch both corpora; packages that are already up to date are not downloaded again\n",
    "for corpus_name in ('words', 'wordnet'):\n",
    "    nltk.download(corpus_name, quiet=True)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-27T23:31:30.310365Z",
     "start_time": "2024-08-27T23:31:29.338710Z"
    }
   },
   "execution_count": 26
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:31:34.820408Z",
     "start_time": "2024-08-27T23:31:34.077694Z"
    }
   },
   "outputs": [],
   "source": [
    "from nltk.corpus import words as words_corpus, wordnet\n",
    "\n",
    "# Vocabulary of known words. The corpus reader is imported under an alias so that the name\n",
    "# `words` is only ever bound to this set (and never shadows its own import).\n",
    "words = set(words_corpus.words())\n",
    "\n",
    "\n",
    "def check_conjoined(tag, words):\n",
    "    \"\"\"Return True when `tag` reads as two or more dictionary words stuck together.\n",
    "\n",
    "    The tag is scanned left to right. At each position the longest substring that is in\n",
    "    `words` or has at least one WordNet synset is taken as a word; positions where no word\n",
    "    starts are skipped one character at a time.\n",
    "\n",
    "    Args:\n",
    "        tag: the tag text to inspect.\n",
    "        words: container of known words that supports `in` (a set here).\n",
    "\n",
    "    Returns:\n",
    "        bool: True if more than one word was matched, otherwise False.\n",
    "    \"\"\"\n",
    "    i = 0\n",
    "    conjoined_ls = []\n",
    "    while i < len(tag):\n",
    "        # Try the longest candidate starting at i first, then shorten it one character at a time\n",
    "        for x in range(len(tag), i, -1):\n",
    "            subword = tag[i:x]\n",
    "            if subword in words or bool(wordnet.synsets(subword)):\n",
    "                conjoined_ls.append(subword)\n",
    "                i = x\n",
    "                break\n",
    "        else:\n",
    "            # No known word starts at position i: move on by one character\n",
    "            i += 1\n",
    "    return len(conjoined_ls) > 1\n",
    "\n",
    "\n",
    "# Evaluate each distinct tag once and map the result back onto the rows, so duplicate tags\n",
    "# do not repeat the dictionary / WordNet lookups\n",
    "conjoined_by_tag = {tag: check_conjoined(tag, words) for tag in tags['tag'].unique()}\n",
    "tags = tags.assign(is_conjoined=tags['tag'].map(conjoined_by_tag).astype(bool))\n",
    "\n",
    "# Drop conjoined tags, but never the shorthand notations in `keep`, which are meant to be\n",
    "# preserved. .copy() makes the result an independent frame, so later column assignments\n",
    "# do not raise SettingWithCopyWarning.\n",
    "tags = tags[~tags['is_conjoined'] | tags['tag'].isin(keep)].copy()\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 1) Lowercase: Convert to lowercase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:31:38.876834Z",
     "start_time": "2024-08-27T23:31:38.869613Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_5869/3056136044.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags['tag'] = tags['tag'].str.lower() #lowercase\n"
     ]
    }
   ],
   "source": [
    "# assign() builds a new frame instead of writing into what may be a slice of an earlier one\n",
    "# (the in-place column write is what raised SettingWithCopyWarning)\n",
    "tags = tags.assign(tag=tags['tag'].str.lower())  # lowercase"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 2) Remove punctuation, symbols, numbers\n",
    "- only removing from words that are not in the keep list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:31:39.047278Z",
     "start_time": "2024-08-27T23:31:39.005206Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_5869/553236838.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags['tag'] = tags['tag'].apply(lambda x: x if x in keep else ''.join(c for c in x if c.isalpha()))\n"
     ]
    }
   ],
   "source": [
    "# `keep` is upper-case while the tags have been lower-cased by this point, so the membership\n",
    "# test is done case-insensitively; a plain `x in keep` never matches, and notations such as\n",
    "# 'pg-13' or '4k' would lose their hyphens/digits like any other tag\n",
    "keep_lower = {entry.lower() for entry in keep}\n",
    "\n",
    "\n",
    "def strip_non_alpha(tag):\n",
    "    \"\"\"Return `tag` unchanged if it is a kept notation, otherwise only its alphabetic characters.\"\"\"\n",
    "    if tag.lower() in keep_lower:\n",
    "        return tag\n",
    "    return ''.join(ch for ch in tag if ch.isalpha())\n",
    "\n",
    "\n",
    "# assign() returns a new frame, which avoids the SettingWithCopyWarning of an in-place column write\n",
    "tags = tags.assign(tag=tags['tag'].map(strip_non_alpha))\n",
    "\n",
    "# Tags made up only of digits/symbols are now empty strings: drop them\n",
    "tags = tags[tags['tag'] != ''].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:31:39.066475Z",
     "start_time": "2024-08-27T23:31:39.048103Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Series([], Name: tag, dtype: object)\n",
      "Series([], Name: tag, dtype: object)\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: list any tags that still contain a hyphen ...\n",
    "contains_hyphen = tags['tag'].str.contains('-', na=False)\n",
    "hyphen_tags = tags.loc[contains_hyphen]\n",
    "print(hyphen_tags['tag'])\n",
    "\n",
    "# ... and any that still contain an exclamation mark\n",
    "contains_exclam = tags['tag'].str.contains('!', na=False)\n",
    "exclam_tags = tags.loc[contains_exclam]\n",
    "print(exclam_tags['tag'])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 3) Spellchecking:\n",
    "\n",
    "Current method: spello python package (ref: https://pypi.org/project/spello/)\n",
    "- Output is placed in a separate column\n",
    "\n",
    "Need to consider: \n",
    "- slang\n",
    "- abbreviations\n",
    "\n",
    "Limitations:\n",
    "Future Scope / Limitations\n",
    "One of the limitations of the current model is, it does not suggest corrections for any grammatical mistakes or for words in the vocabulary of the model. For example, in a sentence “I want to by Apple”, it will not suggest any correction for “by” as it is a valid English word but the correct replacement should be \"buy\".\n",
    "\n",
    "- **Potential Solution**: Can consider training the data on actors names or slang, manually impute this in\n",
    "\n",
    "These cases are difficult to handle with the current spell checker. Possible solutions: use a more advanced spell checker that is informed by context, or create a custom solution\n",
    "--> https://huggingface.co/facebook/bart-base\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:31:39.070908Z",
     "start_time": "2024-08-27T23:31:39.066951Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "        Unnamed: 0.1  Unnamed: 0  userId  movieId          tag  \\\n7                266         301     318      260            s   \n8                267         304     318   115149       action   \n15               274         312     320     2762        twist   \n16               275         313     320     2959        twist   \n17               276         314     320     3996    overrated   \n...              ...         ...     ...      ...          ...   \n109306        390955      464426  138280   116797      history   \n109307        390956      464427  138280   116797  informatics   \n109308        390957      464428  138280   116797  mathematics   \n109310        390959      464430  138280   117871        image   \n109311        390960      464432  138280   117871        story   \n\n                  timestamp  is_conjoined  \n7       2015-02-20 22:42:49         False  \n8       2015-02-21 15:58:30         False  \n15      2006-04-25 11:33:52         False  \n16      2006-04-25 11:30:58         False  \n17      2006-04-25 11:32:28         False  \n...                     ...           ...  \n109306  2015-01-30 23:07:25         False  \n109307  2015-01-30 23:07:35         False  \n109308  2015-01-30 23:07:17         False  \n109310  2015-01-30 23:09:16         False  \n109311  2015-01-30 23:09:25         False  \n\n[50191 rows x 7 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0.1</th>\n      <th>Unnamed: 0</th>\n      <th>userId</th>\n      <th>movieId</th>\n      <th>tag</th>\n      <th>timestamp</th>\n      <th>is_conjoined</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>7</th>\n      <td>266</td>\n      <td>301</td>\n      <td>318</td>\n      <td>260</td>\n      <td>s</td>\n      <td>2015-02-20 22:42:49</td>\n      <td>False</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>267</td>\n      <td>304</td>\n      <td>318</td>\n      <td>115149</td>\n      <td>action</td>\n      <td>2015-02-21 15:58:30</td>\n      <td>False</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>274</td>\n      <td>312</td>\n      <td>320</td>\n      <td>2762</td>\n      <td>twist</td>\n      <td>2006-04-25 11:33:52</td>\n      <td>False</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>275</td>\n      <td>313</td>\n      <td>320</td>\n      <td>2959</td>\n      <td>twist</td>\n      <td>2006-04-25 11:30:58</td>\n      <td>False</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>276</td>\n      <td>314</td>\n      <td>320</td>\n      <td>3996</td>\n      <td>overrated</td>\n      <td>2006-04-25 11:32:28</td>\n      <td>False</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>109306</th>\n      <td>390955</td>\n      <td>464426</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>history</td>\n      <td>2015-01-30 23:07:25</td>\n      <td>False</td>\n    </tr>\n    
<tr>\n      <th>109307</th>\n      <td>390956</td>\n      <td>464427</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>informatics</td>\n      <td>2015-01-30 23:07:35</td>\n      <td>False</td>\n    </tr>\n    <tr>\n      <th>109308</th>\n      <td>390957</td>\n      <td>464428</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>mathematics</td>\n      <td>2015-01-30 23:07:17</td>\n      <td>False</td>\n    </tr>\n    <tr>\n      <th>109310</th>\n      <td>390959</td>\n      <td>464430</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>image</td>\n      <td>2015-01-30 23:09:16</td>\n      <td>False</td>\n    </tr>\n    <tr>\n      <th>109311</th>\n      <td>390960</td>\n      <td>464432</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>story</td>\n      <td>2015-01-30 23:09:25</td>\n      <td>False</td>\n    </tr>\n  </tbody>\n</table>\n<p>50191 rows × 7 columns</p>\n</div>"
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Need to load pretrained English model file for spello:\n",
    "Path: located in Desktop/Thesis/... path "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:02.072297Z",
     "start_time": "2024-08-27T23:34:48.529852Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/jiayi/anaconda3/lib/python3.11/site-packages/spello/model.py:301: UserWarning: This model was saved on spell<1.3.0. As such due to a bug in previous versions, none of customisations made to the config at the time of training were saved along with the model. It is recommended to load the model, apply all required customizations to config and save it again. E.g.\n",
      "\n",
      "from spello.model import SpellCorrectionModel \n",
      "sp = SpellCorrectionModel(language='en')  \n",
      "sp.load('/home/ubuntu/model.pkl')\n",
      "sp.config.min_length_for_spellcorrection = 4 # default is 3\n",
      "sp.config.max_length_for_spellcorrection = 12 # default is 15\n",
      "sp.save(model_save_dir='/home/ubuntu/')\n",
      "\n",
      "After this the model will load without any warnings\n",
      "\n",
      "  warnings.warn(\"This model was saved on spell<1.3.0. As such due to a bug in previous versions, \"\n"
     ]
    },
    {
     "data": {
      "text/plain": "'/Users/jiayi/Desktop/Courses/Research pathway/Maryam/Code/pretrain_model/spello/model.pkl'"
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# from spello.model import SpellCorrectionModel\n",
    "# sp = SpellCorrectionModel(language='en')\n",
    "# sp.load('../pretrain_model/spello/en.pkl') # Need to download this locally\n",
    "\n",
    "from spello.model import SpellCorrectionModel\n",
    "\n",
    "# Step 1: Load the model\n",
    "sp = SpellCorrectionModel(language='en')\n",
    "sp.load('../pretrain_model/spello/en.pkl')\n",
    "\n",
    "# Step 2: Apply configurations\n",
    "sp.config.min_length_for_spellcorrection = 4  # You can adjust this based on your needs\n",
    "sp.config.max_length_for_spellcorrection = 12  # You can adjust this based on your needs\n",
    "\n",
    "# Step 3: Save the model again\n",
    "sp.save(model_save_dir='../pretrain_model/spello')\n",
    "\n",
    "# Now, the model should load without any warnings in the future\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:56.542825Z",
     "start_time": "2024-08-27T23:35:55.087668Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_5869/729322253.py:11: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags['spellCheckSuggestDoc'] = tags['tag'].apply(spellCheckSuggest)\n"
     ]
    }
   ],
   "source": [
    "def spellCheckSuggest(tag):\n",
    "    '''Apply spello python package to the tag column'''\n",
    "    if tag in keep:\n",
    "        return tag  # return the original tag if it's in the 'keep' list\n",
    "    else:\n",
    "        spell_check = sp.spell_correct(tag)\n",
    "        suggest = spell_check['spell_corrected_text']\n",
    "        return suggest  # return the spell-corrected tag otherwise\n",
    "\n",
    "# Applying the function to the DataFrame\n",
    "tags['spellCheckSuggestDoc'] = tags['tag'].apply(spellCheckSuggest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:56.548139Z",
     "start_time": "2024-08-27T23:35:56.543734Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "        Unnamed: 0.1  Unnamed: 0  userId  movieId          tag  \\\n7                266         301     318      260            s   \n8                267         304     318   115149       action   \n15               274         312     320     2762        twist   \n16               275         313     320     2959        twist   \n17               276         314     320     3996    overrated   \n...              ...         ...     ...      ...          ...   \n109306        390955      464426  138280   116797      history   \n109307        390956      464427  138280   116797  informatics   \n109308        390957      464428  138280   116797  mathematics   \n109310        390959      464430  138280   117871        image   \n109311        390960      464432  138280   117871        story   \n\n                  timestamp  is_conjoined spellCheckSuggestDoc  \n7       2015-02-20 22:42:49         False                    s  \n8       2015-02-21 15:58:30         False               action  \n15      2006-04-25 11:33:52         False                twist  \n16      2006-04-25 11:30:58         False                twist  \n17      2006-04-25 11:32:28         False            overrated  \n...                     ...           ...                  ...  \n109306  2015-01-30 23:07:25         False              history  \n109307  2015-01-30 23:07:35         False          informatics  \n109308  2015-01-30 23:07:17         False          mathematics  \n109310  2015-01-30 23:09:16         False                image  \n109311  2015-01-30 23:09:25         False                story  \n\n[50191 rows x 8 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0.1</th>\n      <th>Unnamed: 0</th>\n      <th>userId</th>\n      <th>movieId</th>\n      <th>tag</th>\n      <th>timestamp</th>\n      <th>is_conjoined</th>\n      <th>spellCheckSuggestDoc</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>7</th>\n      <td>266</td>\n      <td>301</td>\n      <td>318</td>\n      <td>260</td>\n      <td>s</td>\n      <td>2015-02-20 22:42:49</td>\n      <td>False</td>\n      <td>s</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>267</td>\n      <td>304</td>\n      <td>318</td>\n      <td>115149</td>\n      <td>action</td>\n      <td>2015-02-21 15:58:30</td>\n      <td>False</td>\n      <td>action</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>274</td>\n      <td>312</td>\n      <td>320</td>\n      <td>2762</td>\n      <td>twist</td>\n      <td>2006-04-25 11:33:52</td>\n      <td>False</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>275</td>\n      <td>313</td>\n      <td>320</td>\n      <td>2959</td>\n      <td>twist</td>\n      <td>2006-04-25 11:30:58</td>\n      <td>False</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>276</td>\n      <td>314</td>\n      <td>320</td>\n      <td>3996</td>\n      <td>overrated</td>\n      <td>2006-04-25 11:32:28</td>\n      <td>False</td>\n      <td>overrated</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>109306</th>\n      
<td>390955</td>\n      <td>464426</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>history</td>\n      <td>2015-01-30 23:07:25</td>\n      <td>False</td>\n      <td>history</td>\n    </tr>\n    <tr>\n      <th>109307</th>\n      <td>390956</td>\n      <td>464427</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>informatics</td>\n      <td>2015-01-30 23:07:35</td>\n      <td>False</td>\n      <td>informatics</td>\n    </tr>\n    <tr>\n      <th>109308</th>\n      <td>390957</td>\n      <td>464428</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>mathematics</td>\n      <td>2015-01-30 23:07:17</td>\n      <td>False</td>\n      <td>mathematics</td>\n    </tr>\n    <tr>\n      <th>109310</th>\n      <td>390959</td>\n      <td>464430</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>image</td>\n      <td>2015-01-30 23:09:16</td>\n      <td>False</td>\n      <td>image</td>\n    </tr>\n    <tr>\n      <th>109311</th>\n      <td>390960</td>\n      <td>464432</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>story</td>\n      <td>2015-01-30 23:09:25</td>\n      <td>False</td>\n      <td>story</td>\n    </tr>\n  </tbody>\n</table>\n<p>50191 rows × 8 columns</p>\n</div>"
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Heuristics to ensure incorrect spellcheck results are not replacements of correct words**\n",
    "- Length Difference: If the difference in length between the original word and the corrected word is significant, it could be an incorrect correction.\n",
    "\n",
    "- Edit Distance: Utilize the Levenshtein distance (or another string distance metric) to check how many changes are required to transform the original word into the corrected word. A high number of changes might signify an incorrect correction.\n",
    "\n",
    "- First and Last Characters: Check if both the first and last characters are different between the original and corrected word.\n",
    "\n",
    "- Frequency of Correction: If a correction occurs very frequently, it might be a systematic error rather than a true correction.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:56.682666Z",
     "start_time": "2024-08-27T23:35:56.548680Z"
    }
   },
   "outputs": [],
   "source": [
    "from Levenshtein import distance\n",
    "\n",
    "def check_differences(row):\n",
    "    original_tag = row['tag']\n",
    "    corrected_tag = row['spellCheckSuggestDoc']\n",
    "    if original_tag == None or original_tag == '':\n",
    "        return False\n",
    "    \n",
    "    # Check if the first three letters are different\n",
    "    if original_tag[:3] != corrected_tag[:3]:\n",
    "        return True\n",
    "\n",
    "    # Check if the length difference is significant (e.g., more than 3 characters)\n",
    "    if abs(len(original_tag) - len(corrected_tag)) > 3:\n",
    "        return True\n",
    "\n",
    "    # Check if the edit distance is significant (e.g., more than 3 changes)\n",
    "    if distance(original_tag, corrected_tag) > 3:\n",
    "        return True\n",
    "\n",
    "    # Check if both the first and last characters are different\n",
    "    if original_tag[0] != corrected_tag[0] and original_tag[-1] != corrected_tag[-1]:\n",
    "        return True\n",
    "    \n",
    "\n",
    "\n",
    "    return False\n",
    "\n",
    "# Create a mask where the condition is True\n",
    "mask = tags.apply(check_differences, axis=1)\n",
    "\n",
    "# Where the mask is True, replace 'spellCheckSuggestDoc' with the original 'tag'\n",
    "tags.loc[mask, 'spellCheckSuggestDoc'] = tags.loc[mask, 'tag']\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:56.687886Z",
     "start_time": "2024-08-27T23:35:56.683943Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "        Unnamed: 0.1  Unnamed: 0  userId  movieId          tag  \\\n7                266         301     318      260            s   \n8                267         304     318   115149       action   \n15               274         312     320     2762        twist   \n16               275         313     320     2959        twist   \n17               276         314     320     3996    overrated   \n...              ...         ...     ...      ...          ...   \n109306        390955      464426  138280   116797      history   \n109307        390956      464427  138280   116797  informatics   \n109308        390957      464428  138280   116797  mathematics   \n109310        390959      464430  138280   117871        image   \n109311        390960      464432  138280   117871        story   \n\n                  timestamp  is_conjoined spellCheckSuggestDoc  \n7       2015-02-20 22:42:49         False                    s  \n8       2015-02-21 15:58:30         False               action  \n15      2006-04-25 11:33:52         False                twist  \n16      2006-04-25 11:30:58         False                twist  \n17      2006-04-25 11:32:28         False            overrated  \n...                     ...           ...                  ...  \n109306  2015-01-30 23:07:25         False              history  \n109307  2015-01-30 23:07:35         False          informatics  \n109308  2015-01-30 23:07:17         False          mathematics  \n109310  2015-01-30 23:09:16         False                image  \n109311  2015-01-30 23:09:25         False                story  \n\n[50191 rows x 8 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0.1</th>\n      <th>Unnamed: 0</th>\n      <th>userId</th>\n      <th>movieId</th>\n      <th>tag</th>\n      <th>timestamp</th>\n      <th>is_conjoined</th>\n      <th>spellCheckSuggestDoc</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>7</th>\n      <td>266</td>\n      <td>301</td>\n      <td>318</td>\n      <td>260</td>\n      <td>s</td>\n      <td>2015-02-20 22:42:49</td>\n      <td>False</td>\n      <td>s</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>267</td>\n      <td>304</td>\n      <td>318</td>\n      <td>115149</td>\n      <td>action</td>\n      <td>2015-02-21 15:58:30</td>\n      <td>False</td>\n      <td>action</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>274</td>\n      <td>312</td>\n      <td>320</td>\n      <td>2762</td>\n      <td>twist</td>\n      <td>2006-04-25 11:33:52</td>\n      <td>False</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>275</td>\n      <td>313</td>\n      <td>320</td>\n      <td>2959</td>\n      <td>twist</td>\n      <td>2006-04-25 11:30:58</td>\n      <td>False</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>276</td>\n      <td>314</td>\n      <td>320</td>\n      <td>3996</td>\n      <td>overrated</td>\n      <td>2006-04-25 11:32:28</td>\n      <td>False</td>\n      <td>overrated</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>109306</th>\n      
<td>390955</td>\n      <td>464426</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>history</td>\n      <td>2015-01-30 23:07:25</td>\n      <td>False</td>\n      <td>history</td>\n    </tr>\n    <tr>\n      <th>109307</th>\n      <td>390956</td>\n      <td>464427</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>informatics</td>\n      <td>2015-01-30 23:07:35</td>\n      <td>False</td>\n      <td>informatics</td>\n    </tr>\n    <tr>\n      <th>109308</th>\n      <td>390957</td>\n      <td>464428</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>mathematics</td>\n      <td>2015-01-30 23:07:17</td>\n      <td>False</td>\n      <td>mathematics</td>\n    </tr>\n    <tr>\n      <th>109310</th>\n      <td>390959</td>\n      <td>464430</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>image</td>\n      <td>2015-01-30 23:09:16</td>\n      <td>False</td>\n      <td>image</td>\n    </tr>\n    <tr>\n      <th>109311</th>\n      <td>390960</td>\n      <td>464432</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>story</td>\n      <td>2015-01-30 23:09:25</td>\n      <td>False</td>\n      <td>story</td>\n    </tr>\n  </tbody>\n</table>\n<p>50191 rows × 8 columns</p>\n</div>"
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Correct column now: spellCheckSuggestDoc -> using this column as the 'tag' moving forward"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:56.690654Z",
     "start_time": "2024-08-27T23:35:56.688520Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_5869/947390388.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags['NewTag'] = tags['spellCheckSuggestDoc']\n"
     ]
    }
   ],
   "source": [
    "tags['NewTag'] = tags['spellCheckSuggestDoc']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:56.695218Z",
     "start_time": "2024-08-27T23:35:56.691343Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "        Unnamed: 0.1  Unnamed: 0  userId  movieId          tag  \\\n7                266         301     318      260            s   \n8                267         304     318   115149       action   \n15               274         312     320     2762        twist   \n16               275         313     320     2959        twist   \n17               276         314     320     3996    overrated   \n...              ...         ...     ...      ...          ...   \n109306        390955      464426  138280   116797      history   \n109307        390956      464427  138280   116797  informatics   \n109308        390957      464428  138280   116797  mathematics   \n109310        390959      464430  138280   117871        image   \n109311        390960      464432  138280   117871        story   \n\n                  timestamp  is_conjoined spellCheckSuggestDoc       NewTag  \n7       2015-02-20 22:42:49         False                    s            s  \n8       2015-02-21 15:58:30         False               action       action  \n15      2006-04-25 11:33:52         False                twist        twist  \n16      2006-04-25 11:30:58         False                twist        twist  \n17      2006-04-25 11:32:28         False            overrated    overrated  \n...                     ...           ...                  ...          ...  \n109306  2015-01-30 23:07:25         False              history      history  \n109307  2015-01-30 23:07:35         False          informatics  informatics  \n109308  2015-01-30 23:07:17         False          mathematics  mathematics  \n109310  2015-01-30 23:09:16         False                image        image  \n109311  2015-01-30 23:09:25         False                story        story  \n\n[50191 rows x 9 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0.1</th>\n      <th>Unnamed: 0</th>\n      <th>userId</th>\n      <th>movieId</th>\n      <th>tag</th>\n      <th>timestamp</th>\n      <th>is_conjoined</th>\n      <th>spellCheckSuggestDoc</th>\n      <th>NewTag</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>7</th>\n      <td>266</td>\n      <td>301</td>\n      <td>318</td>\n      <td>260</td>\n      <td>s</td>\n      <td>2015-02-20 22:42:49</td>\n      <td>False</td>\n      <td>s</td>\n      <td>s</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>267</td>\n      <td>304</td>\n      <td>318</td>\n      <td>115149</td>\n      <td>action</td>\n      <td>2015-02-21 15:58:30</td>\n      <td>False</td>\n      <td>action</td>\n      <td>action</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>274</td>\n      <td>312</td>\n      <td>320</td>\n      <td>2762</td>\n      <td>twist</td>\n      <td>2006-04-25 11:33:52</td>\n      <td>False</td>\n      <td>twist</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>275</td>\n      <td>313</td>\n      <td>320</td>\n      <td>2959</td>\n      <td>twist</td>\n      <td>2006-04-25 11:30:58</td>\n      <td>False</td>\n      <td>twist</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>276</td>\n      <td>314</td>\n      <td>320</td>\n      <td>3996</td>\n      <td>overrated</td>\n      <td>2006-04-25 11:32:28</td>\n      <td>False</td>\n      <td>overrated</td>\n      <td>overrated</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      
<td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>109306</th>\n      <td>390955</td>\n      <td>464426</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>history</td>\n      <td>2015-01-30 23:07:25</td>\n      <td>False</td>\n      <td>history</td>\n      <td>history</td>\n    </tr>\n    <tr>\n      <th>109307</th>\n      <td>390956</td>\n      <td>464427</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>informatics</td>\n      <td>2015-01-30 23:07:35</td>\n      <td>False</td>\n      <td>informatics</td>\n      <td>informatics</td>\n    </tr>\n    <tr>\n      <th>109308</th>\n      <td>390957</td>\n      <td>464428</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>mathematics</td>\n      <td>2015-01-30 23:07:17</td>\n      <td>False</td>\n      <td>mathematics</td>\n      <td>mathematics</td>\n    </tr>\n    <tr>\n      <th>109310</th>\n      <td>390959</td>\n      <td>464430</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>image</td>\n      <td>2015-01-30 23:09:16</td>\n      <td>False</td>\n      <td>image</td>\n      <td>image</td>\n    </tr>\n    <tr>\n      <th>109311</th>\n      <td>390960</td>\n      <td>464432</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>story</td>\n      <td>2015-01-30 23:09:25</td>\n      <td>False</td>\n      <td>story</td>\n      <td>story</td>\n    </tr>\n  </tbody>\n</table>\n<p>50191 rows × 9 columns</p>\n</div>"
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 4) Remove stop words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:56.716552Z",
     "start_time": "2024-08-27T23:35:56.695869Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_5869/620679354.py:8: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags['NewTag'] = tags['NewTag'].apply(remove_stopwords)\n"
     ]
    }
   ],
   "source": [
    "from spacy.lang.en import STOP_WORDS\n",
    "  \n",
    "\n",
    "\n",
    "def remove_stopwords(tag):\n",
    "    return tag if tag in keep or tag not in STOP_WORDS else ''\n",
    "\n",
    "tags['NewTag'] = tags['NewTag'].apply(remove_stopwords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:56.720957Z",
     "start_time": "2024-08-27T23:35:56.717157Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "        Unnamed: 0.1  Unnamed: 0  userId  movieId          tag  \\\n7                266         301     318      260            s   \n8                267         304     318   115149       action   \n15               274         312     320     2762        twist   \n16               275         313     320     2959        twist   \n17               276         314     320     3996    overrated   \n...              ...         ...     ...      ...          ...   \n109306        390955      464426  138280   116797      history   \n109307        390956      464427  138280   116797  informatics   \n109308        390957      464428  138280   116797  mathematics   \n109310        390959      464430  138280   117871        image   \n109311        390960      464432  138280   117871        story   \n\n                  timestamp  is_conjoined spellCheckSuggestDoc       NewTag  \n7       2015-02-20 22:42:49         False                    s            s  \n8       2015-02-21 15:58:30         False               action       action  \n15      2006-04-25 11:33:52         False                twist        twist  \n16      2006-04-25 11:30:58         False                twist        twist  \n17      2006-04-25 11:32:28         False            overrated    overrated  \n...                     ...           ...                  ...          ...  \n109306  2015-01-30 23:07:25         False              history      history  \n109307  2015-01-30 23:07:35         False          informatics  informatics  \n109308  2015-01-30 23:07:17         False          mathematics  mathematics  \n109310  2015-01-30 23:09:16         False                image        image  \n109311  2015-01-30 23:09:25         False                story        story  \n\n[50191 rows x 9 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0.1</th>\n      <th>Unnamed: 0</th>\n      <th>userId</th>\n      <th>movieId</th>\n      <th>tag</th>\n      <th>timestamp</th>\n      <th>is_conjoined</th>\n      <th>spellCheckSuggestDoc</th>\n      <th>NewTag</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>7</th>\n      <td>266</td>\n      <td>301</td>\n      <td>318</td>\n      <td>260</td>\n      <td>s</td>\n      <td>2015-02-20 22:42:49</td>\n      <td>False</td>\n      <td>s</td>\n      <td>s</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>267</td>\n      <td>304</td>\n      <td>318</td>\n      <td>115149</td>\n      <td>action</td>\n      <td>2015-02-21 15:58:30</td>\n      <td>False</td>\n      <td>action</td>\n      <td>action</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>274</td>\n      <td>312</td>\n      <td>320</td>\n      <td>2762</td>\n      <td>twist</td>\n      <td>2006-04-25 11:33:52</td>\n      <td>False</td>\n      <td>twist</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>275</td>\n      <td>313</td>\n      <td>320</td>\n      <td>2959</td>\n      <td>twist</td>\n      <td>2006-04-25 11:30:58</td>\n      <td>False</td>\n      <td>twist</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>276</td>\n      <td>314</td>\n      <td>320</td>\n      <td>3996</td>\n      <td>overrated</td>\n      <td>2006-04-25 11:32:28</td>\n      <td>False</td>\n      <td>overrated</td>\n      <td>overrated</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      
<td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>109306</th>\n      <td>390955</td>\n      <td>464426</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>history</td>\n      <td>2015-01-30 23:07:25</td>\n      <td>False</td>\n      <td>history</td>\n      <td>history</td>\n    </tr>\n    <tr>\n      <th>109307</th>\n      <td>390956</td>\n      <td>464427</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>informatics</td>\n      <td>2015-01-30 23:07:35</td>\n      <td>False</td>\n      <td>informatics</td>\n      <td>informatics</td>\n    </tr>\n    <tr>\n      <th>109308</th>\n      <td>390957</td>\n      <td>464428</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>mathematics</td>\n      <td>2015-01-30 23:07:17</td>\n      <td>False</td>\n      <td>mathematics</td>\n      <td>mathematics</td>\n    </tr>\n    <tr>\n      <th>109310</th>\n      <td>390959</td>\n      <td>464430</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>image</td>\n      <td>2015-01-30 23:09:16</td>\n      <td>False</td>\n      <td>image</td>\n      <td>image</td>\n    </tr>\n    <tr>\n      <th>109311</th>\n      <td>390960</td>\n      <td>464432</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>story</td>\n      <td>2015-01-30 23:09:25</td>\n      <td>False</td>\n      <td>story</td>\n      <td>story</td>\n    </tr>\n  </tbody>\n</table>\n<p>50191 rows × 9 columns</p>\n</div>"
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5) Lemmatisation (WordNet Lemmatizer with POS Tag)\n",
    "- Removing stop words before lemmatisation may speed up this process\n",
    "- Need to use POS tags - this is because without POS, lemmatisation doesn't work effectively. E.g leaves certain tags the same"
   ]
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:56.722851Z",
     "start_time": "2024-08-27T23:35:56.721615Z"
    }
   },
   "execution_count": 45
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:59.959045Z",
     "start_time": "2024-08-27T23:35:56.724714Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
      "[nltk_data]     /Users/jiayi/nltk_data...\n",
      "[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        Unnamed: 0.1  Unnamed: 0  userId  movieId          tag  \\\n",
      "7                266         301     318      260            s   \n",
      "8                267         304     318   115149       action   \n",
      "15               274         312     320     2762        twist   \n",
      "16               275         313     320     2959        twist   \n",
      "17               276         314     320     3996    overrated   \n",
      "...              ...         ...     ...      ...          ...   \n",
      "109306        390955      464426  138280   116797      history   \n",
      "109307        390956      464427  138280   116797  informatics   \n",
      "109308        390957      464428  138280   116797  mathematics   \n",
      "109310        390959      464430  138280   117871        image   \n",
      "109311        390960      464432  138280   117871        story   \n",
      "\n",
      "                  timestamp  is_conjoined spellCheckSuggestDoc       NewTag  \\\n",
      "7       2015-02-20 22:42:49         False                    s            s   \n",
      "8       2015-02-21 15:58:30         False               action       action   \n",
      "15      2006-04-25 11:33:52         False                twist        twist   \n",
      "16      2006-04-25 11:30:58         False                twist        twist   \n",
      "17      2006-04-25 11:32:28         False            overrated    overrated   \n",
      "...                     ...           ...                  ...          ...   \n",
      "109306  2015-01-30 23:07:25         False              history      history   \n",
      "109307  2015-01-30 23:07:35         False          informatics  informatics   \n",
      "109308  2015-01-30 23:07:17         False          mathematics  mathematics   \n",
      "109310  2015-01-30 23:09:16         False                image        image   \n",
      "109311  2015-01-30 23:09:25         False                story        story   \n",
      "\n",
      "       lemmatized_text  \n",
      "7                    s  \n",
      "8               action  \n",
      "15               twist  \n",
      "16               twist  \n",
      "17            overrate  \n",
      "...                ...  \n",
      "109306         history  \n",
      "109307     informatics  \n",
      "109308     mathematics  \n",
      "109310           image  \n",
      "109311           story  \n",
      "\n",
      "[50191 rows x 10 columns]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_5869/1249690397.py:39: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags['lemmatized_text'] = tags['NewTag'].apply(lambda word: conditional_lemmatize(word, keep))\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from nltk.corpus import wordnet\n",
    "import pandas as pd\n",
    "\n",
    "# Download necessary NLTK data\n",
    "nltk.download('averaged_perceptron_tagger')\n",
    "\n",
    "# Initialize the WordNetLemmatizer\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "\n",
    "# Function to map NLTK's POS tags to the first character used by WordNetLemmatizer\n",
    "def pos_tagger(nltk_tag):\n",
    "    if nltk_tag.startswith('J'):\n",
    "        return wordnet.ADJ\n",
    "    elif nltk_tag.startswith('V'):\n",
    "        return wordnet.VERB\n",
    "    elif nltk_tag.startswith('N'):\n",
    "        return wordnet.NOUN\n",
    "    elif nltk_tag.startswith('R'):\n",
    "        return wordnet.ADV\n",
    "    else:         \n",
    "        return None\n",
    "\n",
    "# Function to conditionally lemmatize a single word\n",
    "def conditional_lemmatize(word, keep):\n",
    "    if word in keep:\n",
    "        return word\n",
    "    else:\n",
    "        pos = nltk.pos_tag([word])[0][1]  # POS tagging\n",
    "        wordnet_pos = pos_tagger(pos)     # Map POS tag to first character used by WordNetLemmatizer\n",
    "        if wordnet_pos is None:\n",
    "            return word\n",
    "        else:\n",
    "            return lemmatizer.lemmatize(word, wordnet_pos)\n",
    "\n",
    "\n",
    "# Apply the function to the 'NewTag' column\n",
    "tags['lemmatized_text'] = tags['NewTag'].apply(lambda word: conditional_lemmatize(word, keep))\n",
    "\n",
    "print(tags)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:59.961609Z",
     "start_time": "2024-08-27T23:35:59.959713Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_5869/3467209262.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags['un-lemmatised'] = tags['NewTag']\n"
     ]
    }
   ],
   "source": [
    "tags['un-lemmatised'] = tags['NewTag']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Assigning NewTag column to lemmatized_text\n",
    "- Removing the brackets from lemmatized_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:59.963953Z",
     "start_time": "2024-08-27T23:35:59.962038Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_5869/1793468211.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags['NewTag'] = tags['lemmatized_text']\n"
     ]
    }
   ],
   "source": [
    "tags['NewTag'] = tags['lemmatized_text']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:59.968460Z",
     "start_time": "2024-08-27T23:35:59.964401Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "        Unnamed: 0.1  Unnamed: 0  userId  movieId          tag  \\\n7                266         301     318      260            s   \n8                267         304     318   115149       action   \n15               274         312     320     2762        twist   \n16               275         313     320     2959        twist   \n17               276         314     320     3996    overrated   \n...              ...         ...     ...      ...          ...   \n109306        390955      464426  138280   116797      history   \n109307        390956      464427  138280   116797  informatics   \n109308        390957      464428  138280   116797  mathematics   \n109310        390959      464430  138280   117871        image   \n109311        390960      464432  138280   117871        story   \n\n                  timestamp  is_conjoined spellCheckSuggestDoc       NewTag  \\\n7       2015-02-20 22:42:49         False                    s            s   \n8       2015-02-21 15:58:30         False               action       action   \n15      2006-04-25 11:33:52         False                twist        twist   \n16      2006-04-25 11:30:58         False                twist        twist   \n17      2006-04-25 11:32:28         False            overrated     overrate   \n...                     ...           ...                  ...          ...   
\n109306  2015-01-30 23:07:25         False              history      history   \n109307  2015-01-30 23:07:35         False          informatics  informatics   \n109308  2015-01-30 23:07:17         False          mathematics  mathematics   \n109310  2015-01-30 23:09:16         False                image        image   \n109311  2015-01-30 23:09:25         False                story        story   \n\n       lemmatized_text un-lemmatised  \n7                    s             s  \n8               action        action  \n15               twist         twist  \n16               twist         twist  \n17            overrate     overrated  \n...                ...           ...  \n109306         history       history  \n109307     informatics   informatics  \n109308     mathematics   mathematics  \n109310           image         image  \n109311           story         story  \n\n[50191 rows x 11 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0.1</th>\n      <th>Unnamed: 0</th>\n      <th>userId</th>\n      <th>movieId</th>\n      <th>tag</th>\n      <th>timestamp</th>\n      <th>is_conjoined</th>\n      <th>spellCheckSuggestDoc</th>\n      <th>NewTag</th>\n      <th>lemmatized_text</th>\n      <th>un-lemmatised</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>7</th>\n      <td>266</td>\n      <td>301</td>\n      <td>318</td>\n      <td>260</td>\n      <td>s</td>\n      <td>2015-02-20 22:42:49</td>\n      <td>False</td>\n      <td>s</td>\n      <td>s</td>\n      <td>s</td>\n      <td>s</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>267</td>\n      <td>304</td>\n      <td>318</td>\n      <td>115149</td>\n      <td>action</td>\n      <td>2015-02-21 15:58:30</td>\n      <td>False</td>\n      <td>action</td>\n      <td>action</td>\n      <td>action</td>\n      <td>action</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>274</td>\n      <td>312</td>\n      <td>320</td>\n      <td>2762</td>\n      <td>twist</td>\n      <td>2006-04-25 11:33:52</td>\n      <td>False</td>\n      <td>twist</td>\n      <td>twist</td>\n      <td>twist</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>275</td>\n      <td>313</td>\n      <td>320</td>\n      <td>2959</td>\n      <td>twist</td>\n      <td>2006-04-25 11:30:58</td>\n      <td>False</td>\n      <td>twist</td>\n      <td>twist</td>\n      <td>twist</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>276</td>\n      <td>314</td>\n      <td>320</td>\n      <td>3996</td>\n      <td>overrated</td>\n      
<td>2006-04-25 11:32:28</td>\n      <td>False</td>\n      <td>overrated</td>\n      <td>overrate</td>\n      <td>overrate</td>\n      <td>overrated</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>109306</th>\n      <td>390955</td>\n      <td>464426</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>history</td>\n      <td>2015-01-30 23:07:25</td>\n      <td>False</td>\n      <td>history</td>\n      <td>history</td>\n      <td>history</td>\n      <td>history</td>\n    </tr>\n    <tr>\n      <th>109307</th>\n      <td>390956</td>\n      <td>464427</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>informatics</td>\n      <td>2015-01-30 23:07:35</td>\n      <td>False</td>\n      <td>informatics</td>\n      <td>informatics</td>\n      <td>informatics</td>\n      <td>informatics</td>\n    </tr>\n    <tr>\n      <th>109308</th>\n      <td>390957</td>\n      <td>464428</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>mathematics</td>\n      <td>2015-01-30 23:07:17</td>\n      <td>False</td>\n      <td>mathematics</td>\n      <td>mathematics</td>\n      <td>mathematics</td>\n      <td>mathematics</td>\n    </tr>\n    <tr>\n      <th>109310</th>\n      <td>390959</td>\n      <td>464430</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>image</td>\n      <td>2015-01-30 23:09:16</td>\n      <td>False</td>\n      <td>image</td>\n      <td>image</td>\n      <td>image</td>\n      <td>image</td>\n    </tr>\n    <tr>\n      <th>109311</th>\n      <td>390960</td>\n      <td>464432</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>story</td>\n      <td>2015-01-30 23:09:25</td>\n      <td>False</td>\n      <td>story</td>\n      <td>story</td>\n      <td>story</td>\n      
<td>story</td>\n    </tr>\n  </tbody>\n</table>\n<p>50191 rows × 11 columns</p>\n</div>"
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:59.972848Z",
     "start_time": "2024-08-27T23:35:59.968856Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "3133"
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len((tags['tag']).drop_duplicates())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:59.976617Z",
     "start_time": "2024-08-27T23:35:59.973263Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "3084"
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len((tags['un-lemmatised']).drop_duplicates())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Removing single character tags\n",
    "- If they are NOT in the keep list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:35:59.995094Z",
     "start_time": "2024-08-27T23:35:59.977067Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_5869/912675763.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags['NewTag'] = tags['NewTag'].apply(lambda x: x if (len(x) > 1 or x in keep) else None)\n",
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_5869/912675763.py:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags['un-lemmatised'] = tags['un-lemmatised'].apply(lambda x: x if (len(x) > 1 or x in keep) else None)\n",
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_5869/912675763.py:5: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags.dropna(subset=['tag'], inplace=True)\n"
     ]
    }
   ],
   "source": [
    "# Remove single-character entries unless they are in the 'keep' list\n",
    "tags['NewTag'] = tags['NewTag'].apply(lambda x: x if (len(x) > 1 or x in keep) else None)\n",
    "tags['un-lemmatised'] = tags['un-lemmatised'].apply(lambda x: x if (len(x) > 1 or x in keep) else None)\n",
    "# Drop the rows where 'tag' is None\n",
    "tags.dropna(subset=['tag'], inplace=True)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Checking whether tag is valid English word\n",
    "- if not -> remove, as sentiment, semantic analysis will be valid on actual English dictionary words"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Renaming tag column -> assign NewTag, etc\n",
    "\n",
    "- Column to access is 'tag' now"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:36:00.001048Z",
     "start_time": "2024-08-27T23:35:59.995627Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_5869/4212363832.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags['NewTag'] = tags['lemmatized_text']\n",
      "/var/folders/tk/x3sjpph95kz4ghcssq1py2zc0000gn/T/ipykernel_5869/4212363832.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  tags['tag'] = tags['NewTag']\n"
     ]
    },
    {
     "data": {
      "text/plain": "        Unnamed: 0.1  Unnamed: 0  userId  movieId          tag  \\\n7                266         301     318      260            s   \n8                267         304     318   115149       action   \n15               274         312     320     2762        twist   \n16               275         313     320     2959        twist   \n17               276         314     320     3996     overrate   \n...              ...         ...     ...      ...          ...   \n109306        390955      464426  138280   116797      history   \n109307        390956      464427  138280   116797  informatics   \n109308        390957      464428  138280   116797  mathematics   \n109310        390959      464430  138280   117871        image   \n109311        390960      464432  138280   117871        story   \n\n                  timestamp  is_conjoined spellCheckSuggestDoc       NewTag  \\\n7       2015-02-20 22:42:49         False                    s            s   \n8       2015-02-21 15:58:30         False               action       action   \n15      2006-04-25 11:33:52         False                twist        twist   \n16      2006-04-25 11:30:58         False                twist        twist   \n17      2006-04-25 11:32:28         False            overrated     overrate   \n...                     ...           ...                  ...          ...   
\n109306  2015-01-30 23:07:25         False              history      history   \n109307  2015-01-30 23:07:35         False          informatics  informatics   \n109308  2015-01-30 23:07:17         False          mathematics  mathematics   \n109310  2015-01-30 23:09:16         False                image        image   \n109311  2015-01-30 23:09:25         False                story        story   \n\n       lemmatized_text un-lemmatised  \n7                    s          None  \n8               action        action  \n15               twist         twist  \n16               twist         twist  \n17            overrate     overrated  \n...                ...           ...  \n109306         history       history  \n109307     informatics   informatics  \n109308     mathematics   mathematics  \n109310           image         image  \n109311           story         story  \n\n[50191 rows x 11 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0.1</th>\n      <th>Unnamed: 0</th>\n      <th>userId</th>\n      <th>movieId</th>\n      <th>tag</th>\n      <th>timestamp</th>\n      <th>is_conjoined</th>\n      <th>spellCheckSuggestDoc</th>\n      <th>NewTag</th>\n      <th>lemmatized_text</th>\n      <th>un-lemmatised</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>7</th>\n      <td>266</td>\n      <td>301</td>\n      <td>318</td>\n      <td>260</td>\n      <td>s</td>\n      <td>2015-02-20 22:42:49</td>\n      <td>False</td>\n      <td>s</td>\n      <td>s</td>\n      <td>s</td>\n      <td>None</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>267</td>\n      <td>304</td>\n      <td>318</td>\n      <td>115149</td>\n      <td>action</td>\n      <td>2015-02-21 15:58:30</td>\n      <td>False</td>\n      <td>action</td>\n      <td>action</td>\n      <td>action</td>\n      <td>action</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>274</td>\n      <td>312</td>\n      <td>320</td>\n      <td>2762</td>\n      <td>twist</td>\n      <td>2006-04-25 11:33:52</td>\n      <td>False</td>\n      <td>twist</td>\n      <td>twist</td>\n      <td>twist</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>275</td>\n      <td>313</td>\n      <td>320</td>\n      <td>2959</td>\n      <td>twist</td>\n      <td>2006-04-25 11:30:58</td>\n      <td>False</td>\n      <td>twist</td>\n      <td>twist</td>\n      <td>twist</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>276</td>\n      <td>314</td>\n      <td>320</td>\n      <td>3996</td>\n      <td>overrate</td>\n      
<td>2006-04-25 11:32:28</td>\n      <td>False</td>\n      <td>overrated</td>\n      <td>overrate</td>\n      <td>overrate</td>\n      <td>overrated</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>109306</th>\n      <td>390955</td>\n      <td>464426</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>history</td>\n      <td>2015-01-30 23:07:25</td>\n      <td>False</td>\n      <td>history</td>\n      <td>history</td>\n      <td>history</td>\n      <td>history</td>\n    </tr>\n    <tr>\n      <th>109307</th>\n      <td>390956</td>\n      <td>464427</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>informatics</td>\n      <td>2015-01-30 23:07:35</td>\n      <td>False</td>\n      <td>informatics</td>\n      <td>informatics</td>\n      <td>informatics</td>\n      <td>informatics</td>\n    </tr>\n    <tr>\n      <th>109308</th>\n      <td>390957</td>\n      <td>464428</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>mathematics</td>\n      <td>2015-01-30 23:07:17</td>\n      <td>False</td>\n      <td>mathematics</td>\n      <td>mathematics</td>\n      <td>mathematics</td>\n      <td>mathematics</td>\n    </tr>\n    <tr>\n      <th>109310</th>\n      <td>390959</td>\n      <td>464430</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>image</td>\n      <td>2015-01-30 23:09:16</td>\n      <td>False</td>\n      <td>image</td>\n      <td>image</td>\n      <td>image</td>\n      <td>image</td>\n    </tr>\n    <tr>\n      <th>109311</th>\n      <td>390960</td>\n      <td>464432</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>story</td>\n      <td>2015-01-30 23:09:25</td>\n      <td>False</td>\n      <td>story</td>\n      <td>story</td>\n      <td>story</td>\n      
<td>story</td>\n    </tr>\n  </tbody>\n</table>\n<p>50191 rows × 11 columns</p>\n</div>"
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags['NewTag'] = tags['lemmatized_text']\n",
    "tags['tag'] = tags['NewTag']\n",
    "tags"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Placing dataframe to separate file -> then pipeline to new notebook for Sentiment Analysis models\n",
    "file: \"sentiment_df.csv\"\n",
    "Location: in same repo as this"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:36:00.009009Z",
     "start_time": "2024-08-27T23:36:00.004793Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "        Unnamed: 0.1  Unnamed: 0  userId  movieId          tag  \\\n7                266         301     318      260            s   \n8                267         304     318   115149       action   \n15               274         312     320     2762        twist   \n16               275         313     320     2959        twist   \n17               276         314     320     3996     overrate   \n...              ...         ...     ...      ...          ...   \n109306        390955      464426  138280   116797      history   \n109307        390956      464427  138280   116797  informatics   \n109308        390957      464428  138280   116797  mathematics   \n109310        390959      464430  138280   117871        image   \n109311        390960      464432  138280   117871        story   \n\n                  timestamp  is_conjoined spellCheckSuggestDoc       NewTag  \\\n7       2015-02-20 22:42:49         False                    s            s   \n8       2015-02-21 15:58:30         False               action       action   \n15      2006-04-25 11:33:52         False                twist        twist   \n16      2006-04-25 11:30:58         False                twist        twist   \n17      2006-04-25 11:32:28         False            overrated     overrate   \n...                     ...           ...                  ...          ...   
\n109306  2015-01-30 23:07:25         False              history      history   \n109307  2015-01-30 23:07:35         False          informatics  informatics   \n109308  2015-01-30 23:07:17         False          mathematics  mathematics   \n109310  2015-01-30 23:09:16         False                image        image   \n109311  2015-01-30 23:09:25         False                story        story   \n\n       lemmatized_text un-lemmatised  \n7                    s          None  \n8               action        action  \n15               twist         twist  \n16               twist         twist  \n17            overrate     overrated  \n...                ...           ...  \n109306         history       history  \n109307     informatics   informatics  \n109308     mathematics   mathematics  \n109310           image         image  \n109311           story         story  \n\n[50191 rows x 11 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0.1</th>\n      <th>Unnamed: 0</th>\n      <th>userId</th>\n      <th>movieId</th>\n      <th>tag</th>\n      <th>timestamp</th>\n      <th>is_conjoined</th>\n      <th>spellCheckSuggestDoc</th>\n      <th>NewTag</th>\n      <th>lemmatized_text</th>\n      <th>un-lemmatised</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>7</th>\n      <td>266</td>\n      <td>301</td>\n      <td>318</td>\n      <td>260</td>\n      <td>s</td>\n      <td>2015-02-20 22:42:49</td>\n      <td>False</td>\n      <td>s</td>\n      <td>s</td>\n      <td>s</td>\n      <td>None</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>267</td>\n      <td>304</td>\n      <td>318</td>\n      <td>115149</td>\n      <td>action</td>\n      <td>2015-02-21 15:58:30</td>\n      <td>False</td>\n      <td>action</td>\n      <td>action</td>\n      <td>action</td>\n      <td>action</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>274</td>\n      <td>312</td>\n      <td>320</td>\n      <td>2762</td>\n      <td>twist</td>\n      <td>2006-04-25 11:33:52</td>\n      <td>False</td>\n      <td>twist</td>\n      <td>twist</td>\n      <td>twist</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>275</td>\n      <td>313</td>\n      <td>320</td>\n      <td>2959</td>\n      <td>twist</td>\n      <td>2006-04-25 11:30:58</td>\n      <td>False</td>\n      <td>twist</td>\n      <td>twist</td>\n      <td>twist</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>276</td>\n      <td>314</td>\n      <td>320</td>\n      <td>3996</td>\n      <td>overrate</td>\n      
<td>2006-04-25 11:32:28</td>\n      <td>False</td>\n      <td>overrated</td>\n      <td>overrate</td>\n      <td>overrate</td>\n      <td>overrated</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>109306</th>\n      <td>390955</td>\n      <td>464426</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>history</td>\n      <td>2015-01-30 23:07:25</td>\n      <td>False</td>\n      <td>history</td>\n      <td>history</td>\n      <td>history</td>\n      <td>history</td>\n    </tr>\n    <tr>\n      <th>109307</th>\n      <td>390956</td>\n      <td>464427</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>informatics</td>\n      <td>2015-01-30 23:07:35</td>\n      <td>False</td>\n      <td>informatics</td>\n      <td>informatics</td>\n      <td>informatics</td>\n      <td>informatics</td>\n    </tr>\n    <tr>\n      <th>109308</th>\n      <td>390957</td>\n      <td>464428</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>mathematics</td>\n      <td>2015-01-30 23:07:17</td>\n      <td>False</td>\n      <td>mathematics</td>\n      <td>mathematics</td>\n      <td>mathematics</td>\n      <td>mathematics</td>\n    </tr>\n    <tr>\n      <th>109310</th>\n      <td>390959</td>\n      <td>464430</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>image</td>\n      <td>2015-01-30 23:09:16</td>\n      <td>False</td>\n      <td>image</td>\n      <td>image</td>\n      <td>image</td>\n      <td>image</td>\n    </tr>\n    <tr>\n      <th>109311</th>\n      <td>390960</td>\n      <td>464432</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>story</td>\n      <td>2015-01-30 23:09:25</td>\n      <td>False</td>\n      <td>story</td>\n      <td>story</td>\n      <td>story</td>\n      
<td>story</td>\n    </tr>\n  </tbody>\n</table>\n<p>50191 rows × 11 columns</p>\n</div>"
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:36:00.012572Z",
     "start_time": "2024-08-27T23:36:00.009712Z"
    }
   },
   "outputs": [],
   "source": [
    "# Drop the columns listed below and rebind `tags` to the trimmed frame.\n",
    "# NOTE(review): 'Unnamed: 0.1' is not listed, so it stays in the frame - confirm that is intended.\n",
    "tags = tags.drop(\n",
    "    columns=[\n",
    "        'Unnamed: 0',\n",
    "        'is_conjoined',\n",
    "        'spellCheckSuggestDoc',\n",
    "        'NewTag',\n",
    "        'lemmatized_text',\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:36:00.016719Z",
     "start_time": "2024-08-27T23:36:00.013305Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "        Unnamed: 0.1  userId  movieId          tag            timestamp  \\\n7                266     318      260            s  2015-02-20 22:42:49   \n8                267     318   115149       action  2015-02-21 15:58:30   \n15               274     320     2762        twist  2006-04-25 11:33:52   \n16               275     320     2959        twist  2006-04-25 11:30:58   \n17               276     320     3996     overrate  2006-04-25 11:32:28   \n...              ...     ...      ...          ...                  ...   \n109306        390955  138280   116797      history  2015-01-30 23:07:25   \n109307        390956  138280   116797  informatics  2015-01-30 23:07:35   \n109308        390957  138280   116797  mathematics  2015-01-30 23:07:17   \n109310        390959  138280   117871        image  2015-01-30 23:09:16   \n109311        390960  138280   117871        story  2015-01-30 23:09:25   \n\n       un-lemmatised  \n7               None  \n8             action  \n15             twist  \n16             twist  \n17         overrated  \n...              ...  \n109306       history  \n109307   informatics  \n109308   mathematics  \n109310         image  \n109311         story  \n\n[50191 rows x 6 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0.1</th>\n      <th>userId</th>\n      <th>movieId</th>\n      <th>tag</th>\n      <th>timestamp</th>\n      <th>un-lemmatised</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>7</th>\n      <td>266</td>\n      <td>318</td>\n      <td>260</td>\n      <td>s</td>\n      <td>2015-02-20 22:42:49</td>\n      <td>None</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>267</td>\n      <td>318</td>\n      <td>115149</td>\n      <td>action</td>\n      <td>2015-02-21 15:58:30</td>\n      <td>action</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>274</td>\n      <td>320</td>\n      <td>2762</td>\n      <td>twist</td>\n      <td>2006-04-25 11:33:52</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>275</td>\n      <td>320</td>\n      <td>2959</td>\n      <td>twist</td>\n      <td>2006-04-25 11:30:58</td>\n      <td>twist</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>276</td>\n      <td>320</td>\n      <td>3996</td>\n      <td>overrate</td>\n      <td>2006-04-25 11:32:28</td>\n      <td>overrated</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>109306</th>\n      <td>390955</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>history</td>\n      <td>2015-01-30 23:07:25</td>\n      <td>history</td>\n    </tr>\n    <tr>\n      <th>109307</th>\n      <td>390956</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>informatics</td>\n      <td>2015-01-30 23:07:35</td>\n    
  <td>informatics</td>\n    </tr>\n    <tr>\n      <th>109308</th>\n      <td>390957</td>\n      <td>138280</td>\n      <td>116797</td>\n      <td>mathematics</td>\n      <td>2015-01-30 23:07:17</td>\n      <td>mathematics</td>\n    </tr>\n    <tr>\n      <th>109310</th>\n      <td>390959</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>image</td>\n      <td>2015-01-30 23:09:16</td>\n      <td>image</td>\n    </tr>\n    <tr>\n      <th>109311</th>\n      <td>390960</td>\n      <td>138280</td>\n      <td>117871</td>\n      <td>story</td>\n      <td>2015-01-30 23:09:25</td>\n      <td>story</td>\n    </tr>\n  </tbody>\n</table>\n<p>50191 rows × 6 columns</p>\n</div>"
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the frame to check which columns remain after the drop.\n",
    "tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T23:46:28.940776Z",
     "start_time": "2024-08-27T23:46:28.786398Z"
    }
   },
   "outputs": [],
   "source": [
    "# Write the trimmed frame, without its index, to each output file in turn.\n",
    "# NOTE(review): both files receive identical content - confirm both are still needed.\n",
    "for out_path in (\n",
    "    \"../dataset/sentiment_df.csv\",\n",
    "    \"../dataset/tag_full_preprocessed.csv\",\n",
    "):\n",
    "    tags.to_csv(out_path, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.7 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.7"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
