{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tag Clustering "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For data manipulation and analysis\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# For text preprocessing\n",
    "import re\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import datetime\n",
    "import string\n",
    "\n",
    "# For multilabel classification\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.multiclass import OneVsRestClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "import os\n",
    "\n",
    "# For neural networks\n",
    "\n",
    "\n",
    "\n",
    "# For model evaluation\n",
    "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags = pd.read_csv(\"../dataset/sentiment_df.csv\")\n",
    "\n",
    "tags['tag'] = tags['tag'].astype('str')\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Semantic Analysis\n",
    "#### GloVe\n",
    "- Using pre-trained word vectors (wikipedia) - 200d\n",
    "ref: https://github.com/stanfordnlp/GloVe\n",
    "\n",
    "- Explain why the dimensionality has been chosen"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Loading pretrained word embeddings\n",
    "- Loading in file\n",
    "- Create a dict mapping of words to NumPy vector representation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ref: https://keras.io/examples/nlp/pretrained_word_embeddings/\n",
    "\n",
    "def get_glove(file_path):\n",
    "    \"\"\"Load pre-trained GloVe vectors from a text file.\n",
    "\n",
    "    Every line is split once on whitespace: the first field is the token and\n",
    "    the remainder is parsed as space-separated float32 coefficients.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    file_path : str or path-like\n",
    "        Location of the GloVe text file (opened as UTF-8).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Maps each token to a 1-D NumPy array holding its coefficients.\n",
    "    \"\"\"\n",
    "    embeddings_index = {}\n",
    "    # Open the path passed in by the caller; the previous version ignored the\n",
    "    # argument and read the global `path_to_glove_file` instead.\n",
    "    with open(file_path, 'r', encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            word, coefs = line.split(maxsplit=1)\n",
    "            coefs = np.fromstring(coefs, \"f\", sep=\" \")\n",
    "            embeddings_index[word] = coefs\n",
    "    return embeddings_index\n",
    "\n",
    "path_to_glove_file = \"../pretrain_model/glove.6B/glove.6B.200d.txt\"\n",
    "glove_vec = get_glove(path_to_glove_file)\n",
    "\n",
    "print(\"Found %s word vectors.\" % len(glove_vec))\n",
    "\n",
    "# applying to the tag df:\n",
    "tags['glove_vec'] = tags['tag'].apply(lambda x: glove_vec.get(x, np.zeros((200,))))\n",
    " # 200 zero vec is assigned when the word is not found in the GloVe index "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Assuming 'tags' is a DataFrame with a column named 'tag'\n",
    "# You've already executed the code for populating 'glove_vec' and adding the 'glove_vec' column to 'tags'\n",
    "\n",
    "# Check for each tag if its corresponding vector is all zeros (indicating no GloVe representation)\n",
    "tags['has_glove_vec'] = tags['glove_vec'].apply(lambda x: not np.all(x == 0))\n",
    "\n",
    "# Count the number of tags that have a GloVe vector and those that don't\n",
    "num_tags_with_glove_vec = tags['has_glove_vec'].sum()\n",
    "num_tags_without_glove_vec = len(tags) - num_tags_with_glove_vec\n",
    "\n",
    "# Calculate percentages\n",
    "percentage_with_glove_vec = (num_tags_with_glove_vec / len(tags)) * 100\n",
    "percentage_without_glove_vec = (num_tags_without_glove_vec / len(tags)) * 100\n",
    "\n",
    "print(f\"Percentage of tags with GloVe vectors: {percentage_with_glove_vec:.2f}%\")\n",
    "print(f\"Percentage of tags without GloVe vectors: {percentage_without_glove_vec:.2f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_tags = tags\n",
    "\n",
    "\n",
    "output_tags.to_csv(\"../dataset/tags_withglovevec.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# output file for content-based model\n",
    "tags.to_csv(\"../dataset/tags_contentbased.csv\",index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CLUSTERING\n",
    "Semantic cluster based on glove vec"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finding the optimal K before applying k-means++ clustering\n",
    "Method:\n",
    "1. Pre-process GloVe vectors: normalisation (why? because distance based clustering like k-means is sensitive) \n",
    "2. Elbow method\n",
    "3. Gap Statistic\n",
    "4. Silhouette method"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Pre-process gloVe vectors: Normalization**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per distinct tag, without the per-user / per-movie bookkeeping columns\n",
    "tags_c = (\n",
    "    tags\n",
    "    .drop(columns=['Unnamed: 0', 'userId', 'movieId', 'timestamp'])\n",
    "    .drop_duplicates(subset=['tag'])\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Assuming tags['glove_vec'] is a Series of numpy arrays or lists\n",
    "correct_shape = (200,)  # Replace with the correct shape of your GloVe vectors\n",
    "tags_filtered = tags_c[tags_c['glove_vec'].apply(lambda x: np.shape(x) == correct_shape)]\n",
    "\n",
    "# Convert Series of arrays to 2D numpy array\n",
    "glove_matrix = np.stack(tags_filtered['glove_vec'].to_numpy())\n",
    "\n",
    "# Perform scaling\n",
    "scaler = StandardScaler()\n",
    "scaled_features = scaler.fit_transform(glove_matrix)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Elbow Method**\n",
    "Sum of squares at each K is calculated and graphed. There should be a variation of slope from steep to shallow (like an elbow bent) to show the optimal number of clusters (K)\n",
    "\n",
    "Inertia = within-cluster sum of squares \n",
    "\n",
    "\n",
    "- We calculate the inertia at each K-value\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Gap Statistic**\n",
    "Idea = choose number of K where a major jump in WK (within-cluster) distance occurs\n",
    "Dynamically identify suggested no. of clusters in D\n",
    "\n",
    "Ref for code here: https://github.com/milesgranger/gap_statistic/blob/master/Example.ipynb\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install wheel\n",
    "# !pip install gap-stat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "# from gap_statistic import OptimalK\n",
    "\n",
    "from sklearn.cluster import KMeans"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model 1) K-Means++ with dimensionality reduction\n",
    "Using K that is found from previous methods\n",
    "\n",
    "- Removal of duplicate tags for this clustering\n",
    "- This model is a centroid-based clustering algorithm\n",
    "\n",
    "Similarity Metrics:;\n",
    "- Cohesion (Sum of Squared Distances)\n",
    "    - Sum of squared euclidean distances from each point to its cluster center.\n",
    "    - K-means++ aims to minimise this \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score\n",
    "from sklearn.metrics.pairwise import euclidean_distances\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "# Data Preprocessing\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(glove_matrix)\n",
    "\n",
    "# KMeans\n",
    "kmeans = KMeans(n_clusters=188, init='k-means++', n_init='auto', random_state=42)\n",
    "kmeans.fit(X_scaled)\n",
    "\n",
    "# Evaluation Metrics\n",
    "labels = kmeans.labels_\n",
    "print(\"Silhouette Score:\", silhouette_score(X_scaled, labels))\n",
    "print(\"Davies-Bouldin Score:\", davies_bouldin_score(X_scaled, labels))\n",
    "print(\"Calinski-Harabasz Score:\", calinski_harabasz_score(X_scaled, labels))\n",
    "\n",
    "# Cohesion\n",
    "cohesion = kmeans.inertia_\n",
    "print(f\"Cohesion (Sum of Squared Distances): {cohesion}\")\n",
    "\n",
    "# Separation\n",
    "cluster_centers = kmeans.cluster_centers_\n",
    "distance_matrix = euclidean_distances(cluster_centers, cluster_centers)\n",
    "\n",
    "# Set diagonal to infinity to ignore self-to-self cluster distance\n",
    "np.fill_diagonal(distance_matrix, np.inf)\n",
    "\n",
    "# Minimum separation between closest clusters\n",
    "min_separation = np.min(distance_matrix)\n",
    "\n",
    "# Average separation between all clusters\n",
    "# Since the matrix is symmetric, we consider upper triangle excluding the diagonal\n",
    "avg_separation = np.sum(np.triu(distance_matrix, k=1)) / (cluster_centers.shape[0] * (cluster_centers.shape[0] - 1) / 2)\n",
    "\n",
    "print(f\"Minimum Separation (between closest clusters): {min_separation}\")\n",
    "print(f\"Average Separation (between all clusters): {avg_separation}\")\n",
    "\n",
    "\n",
    "# To keep cluster labels\n",
    "# `labels` has one entry per row of `glove_matrix`, which was stacked from\n",
    "# `tags_filtered`, so copy the whole frame instead of slicing to a hard-coded\n",
    "# row count (2899) that stops matching as soon as the input data changes.\n",
    "tags_filtered_unique = tags_filtered.copy()\n",
    "assert len(tags_filtered_unique) == len(labels), \"cluster labels are out of sync with tags_filtered\"\n",
    "tags_filtered_unique['cluster'] = labels\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags_filtered_unique[tags_filtered_unique['cluster'] == 180]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags_filtered_unique[tags_filtered_unique['cluster'] == 10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Qualitative Inspection - KMEans++\n",
    "\n",
    "\n",
    "- Visualising clusters - Hierarhical method to cluster the centroids of the KMeans clusters\n",
    "    - Provides a cluster of clusters effect which is easier to visualise and interpret\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.cluster.hierarchy import dendrogram, linkage\n",
    "import matplotlib.pyplot as plt\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the 'ward' linkage method to perform hierarchical clustering\n",
    "linked = linkage(cluster_centers, 'ward')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate the dendrogram\n",
    "plt.figure(figsize=(20, 10))\n",
    "dendrogram(linked,\n",
    "           orientation='top',\n",
    "           distance_sort='descending',\n",
    "           show_leaf_counts=True)\n",
    "plt.title('Hierarchical Clustering Dendrogram of KMeans Centroids')\n",
    "plt.xlabel('Cluster Label')\n",
    "plt.ylabel('Euclidean Distance')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# orange 150 and 45\n",
    "tags_filtered_unique[tags_filtered_unique['cluster'] == 150]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags_filtered_unique[tags_filtered_unique['cluster'] == 45]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Comparing clusters at Euclidean distance < 10 - ITERATIVE PROGRAM\n",
    "\n",
    "- ONLY RUN IF NEED TO DO THIS AGAIN - otherwsie the file is saved in 'final_clusters.csv'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
    "# import matplotlib.pyplot as plt\n",
    "# import numpy as np\n",
    "\n",
    "# from IPython.display import display\n",
    "# import pandas as pd\n",
    "\n",
    "\n",
    "# # Your imports and previous setup code\n",
    "# # ...\n",
    "\n",
    "# # Perform hierarchical clustering\n",
    "# linked = linkage(cluster_centers, 'ward')\n",
    "# # Generate the dendrogram\n",
    "# plt.figure(figsize=(20, 10))\n",
    "# dendrogram(linked,\n",
    "#            orientation='top',\n",
    "#            distance_sort='descending',\n",
    "#            show_leaf_counts=True)\n",
    "# plt.title('Original Hierarchical Clustering Dendrogram of KMeans Centroids')\n",
    "# plt.xlabel('Cluster Label')\n",
    "# plt.ylabel('Euclidean Distance')\n",
    "# plt.show()\n",
    "\n",
    "# # Get the linkage distances from the 'linked' array\n",
    "# distances = linked[:, 2]\n",
    "\n",
    "# # Mapping for merged clusters; initially each cluster maps to itself\n",
    "# mapping = {i: i for i in range(len(cluster_centers))}\n",
    "\n",
    "\n",
    "\n",
    "# # Set of active clusters\n",
    "# active_clusters = set(range(len(cluster_centers)))\n",
    "\n",
    "# # Loop through distances\n",
    "# for i, d in enumerate(distances):\n",
    "#     if d < 10:\n",
    "#         cluster_1 = int(linked[i, 0])\n",
    "#         cluster_2 = int(linked[i, 1])\n",
    "\n",
    "#         # Skip this pair if either cluster is not active\n",
    "#         if cluster_1 not in active_clusters or cluster_2 not in active_clusters:\n",
    "#             continue\n",
    "\n",
    "#         # Get the \"root\" clusters (if they were already merged)\n",
    "#         while mapping[cluster_1] != cluster_1:\n",
    "#             cluster_1 = mapping[cluster_1]\n",
    "#         while mapping[cluster_2] != cluster_2:\n",
    "#             cluster_2 = mapping[cluster_2]\n",
    "\n",
    "#         # Display clusters in a table\n",
    "#         print(f\"Tags for Cluster {cluster_1}:\")\n",
    "#         display(tags_filtered_unique[tags_filtered_unique['cluster'] == cluster_1])\n",
    "#         print(f\"Tags for Cluster {cluster_2}:\")\n",
    "#         display(tags_filtered_unique[tags_filtered_unique['cluster'] == cluster_2])\n",
    "\n",
    "#         # User input for merging\n",
    "# #         answer = input(f\"Do you want to merge cluster {cluster_1} and cluster {cluster_2}? (Distance: {d}) Type 'merge' to merge: \")\n",
    "#         answer = 'merge'\n",
    "\n",
    "\n",
    "#         if answer.lower() == 'merge':\n",
    "#             # Remove these clusters from the active set\n",
    "#             active_clusters.remove(cluster_1)\n",
    "#             active_clusters.remove(cluster_2)\n",
    "\n",
    "#             # Merge by setting the mapping for the larger cluster index to the smaller one\n",
    "#             if cluster_1 < cluster_2:\n",
    "#                 mapping[cluster_2] = cluster_1\n",
    "#             else:\n",
    "#                 mapping[cluster_1] = cluster_2\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Creating new dataset that reflects these merges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import matplotlib.colors as mcolors\n",
    "# # Update cluster labels\n",
    "# tags_filtered_unique['new_cluster'] = tags_filtered_unique['cluster'].map(lambda x: mapping.get(x, x))\n",
    "# tags_filtered_unique.groupby('new_cluster')\n",
    "# tags_filtered_unique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print(\"reduced the number of clusters from 188 to\",len(tags_filtered_unique['new_cluster'].unique()))\n",
    "# new_label=tags_filtered_unique['new_cluster']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Recompute cluster centers based on new labels\n",
    "\n",
    "# data_with_labels = np.column_stack((X_scaled, new_label))\n",
    "# unique_labels = np.unique(new_label)\n",
    "# new_cluster_centers = np.array([X_scaled[new_label == lbl].mean(axis=0) for lbl in unique_labels])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# import matplotlib.colors as mcolors\n",
    "# # # Update cluster labels\n",
    "# # tags_filtered_unique['new_cluster'] = tags_filtered_unique['cluster'].map(lambda x: mapping.get(x, x))\n",
    "\n",
    "# # # Recompute cluster centers based on new labels\n",
    "# # new_cluster_centers = tags_filtered_unique.groupby('new_cluster').mean().values\n",
    "\n",
    "# # Perform hierarchical clustering on updated cluster centers\n",
    "# new_linked = linkage(new_cluster_centers, 'ward')\n",
    "\n",
    "# # Generate the new dendrogram\n",
    "# plt.figure(figsize=(20, 10))\n",
    "# dendrogram(new_linked,\n",
    "#            orientation='top',\n",
    "#            distance_sort='descending',\n",
    "#            show_leaf_counts=True)\n",
    "# plt.title('Updated Hierarchical Clustering Dendrogram of Merged KMeans Centroids')\n",
    "# plt.xlabel('New Cluster Label')\n",
    "# plt.ylabel('Euclidean Distance')\n",
    "# plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 'new_cluster' is only created by the optional (commented-out) merge cells above.\n",
    "# Fall back to the raw K-Means labels so this cell does not raise a KeyError on a\n",
    "# fresh Restart & Run All; in that case the count is the un-merged cluster count.\n",
    "cluster_col = 'new_cluster' if 'new_cluster' in tags_filtered_unique.columns else 'cluster'\n",
    "new_cluster_count = tags_filtered_unique[cluster_col].nunique()\n",
    "print(f\"New number of clusters: {new_cluster_count}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags_filtered_unique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Using new_cluster as the cluster label \n",
    "\n",
    "\n",
    "# tags_filtered_unique['cluster'] = tags_filtered_unique['new_cluster']\n",
    "# \n",
    "# tags_c = tags_filtered_unique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tags_filtered_unique.to_csv('../dataset/final_clusters.csv') # output file  - only once"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reading the cluster file back in for CF model\n",
    "\n",
    "tags_c = pd.read_csv('../dataset/final_clusters.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Renaming the column names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags_c['cluster'] = tags_c['new_cluster']\n",
    "\n",
    "tags_c.drop(columns='new_cluster', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags_c"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sentiment - assigning the average to each cluster"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading in the sentiment file from \"sentiment_label.ipynb\" file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-tag sentiment produced by the sentiment-labelling notebook, minus the\n",
    "# leftover index columns written by earlier to_csv calls\n",
    "df_s = (\n",
    "    pd.read_csv('../dataset/df_tag_sentiment.csv')\n",
    "    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])\n",
    ")\n",
    "\n",
    "# Make sure the join key has the same dtype on both sides\n",
    "df_s['tag'] = df_s['tag'].astype('str')\n",
    "tags_c['tag'] = tags_c['tag'].astype('str')\n",
    "\n",
    "# Attach sentiment to the clustered tags, then keep a single row per tag so that\n",
    "# repeated tags do not bias the per-cluster average\n",
    "df_comb = tags_c.merge(df_s, on='tag', how='left').drop_duplicates(subset='tag')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_comb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Output file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_comb.to_csv(\"../dataset/clustercheck.csv\",index=False)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Assigning the average sentiment value to each movie\n",
    "\n",
    "- Drop duplicate tags first -> otherwise the movie sentiment average will be inflated by multiples of the same tag\n",
    "- Should be unique \n",
    "- Performed on scaled_sentiment_value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_comb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags_c"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Invesgtigating the average sentiment by cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "avg_sentiment_by_cluster = df_comb.groupby('cluster')['scaled_sentiment_value'].mean().reset_index()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "avg_sentiment_by_cluster # can use this in content-based model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_comb = pd.merge(df_comb, avg_sentiment_by_cluster, on='cluster', suffixes=('', '_cluster_avg'))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_comb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Collaborative Filtering"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Creating the user - item (movie) matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assign cluster information to user-level information \n",
    "tags\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_comb = df_comb.drop(columns=['glove_vec'])\n",
    "\n",
    "df_mat = tags.merge(df_comb, on=['tag'], how='inner')\n",
    "# df_mat\n",
    "# \n",
    "df_sent = df_mat # take copy for later on \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using the average sentiment per cluster as the values in the matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Duplicate (userId, movieId) pairs\n",
    "This occurs because a user is able to tag the same movie multiple times. \n",
    "For these entries, we will aggregate the sentiment_value_avg for the entire movie and this becomes the entry for the user-movie interaction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the 'scaled_sentiment_value_avg' column to float\n",
    "df_mat['scaled_sentiment_value_cluster_avg'] = df_mat['scaled_sentiment_value_cluster_avg'].astype('float')\n",
    "\n",
    "# Keep only relevant columns\n",
    "df_mat = df_mat[[\"userId\", \"movieId\", \"scaled_sentiment_value_cluster_avg\"]]\n",
    "\n",
    "# Check for duplicates and print them\n",
    "duplicates = df_mat[df_mat.duplicated(subset=['userId', 'movieId'], keep=False)]\n",
    "\n",
    "\n",
    "# Group by 'userId' and 'movieId' to get the average 'scaled_sentiment_value_avg'\n",
    "df_mat = df_mat.groupby(['userId', 'movieId'])['scaled_sentiment_value_cluster_avg'].mean().reset_index()\n",
    "\n",
    "# Create the pivot table\n",
    "mat = pd.pivot_table(df_mat, values='scaled_sentiment_value_cluster_avg', index=['userId'], columns=['movieId'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_mat"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "taking the average sentiment across duplicate user-movie pairs\n",
    "\n",
    "- Because above -> a subset was made on userId, movieId, scaled_sentiment_value_avg (where each row represents a user's tag, despite tag not in the subset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_mat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# creating matrix again -> proper display\n",
    "mat = pd.pivot_table(df_mat, values='scaled_sentiment_value_cluster_avg', index=['userId'], columns=['movieId'])\n",
    "# mat.loc[96][106696] # uses the scaled_sentiment_value_avg as the sentiment value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mat"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Investigating the data sparseness\n",
    "- Sparsity: no. of missing vals / total no. of values \n",
    "- Histograms of Interactions: the distribution of user interactions (based on whether they interacted with the same movie)\n",
    "- Missing values per user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Assuming 'mat' is already your matrix (rows = users, columns = movies)\n",
    "\n",
    "# 1. Calculate the Sparsity\n",
    "total_values = mat.size\n",
    "missing_values = mat.isna().sum().sum()\n",
    "sparsity = missing_values / total_values\n",
    "print(f\"Sparsity of the matrix is: {sparsity * 100:.2f}%\")\n",
    "\n",
    "\n",
    "# 2. Matrix Density Plot\n",
    "# Plot the presence mask rather than the raw values: spy() only marks entries\n",
    "# whose absolute value exceeds the precision (0), so a recorded sentiment of\n",
    "# exactly 0 would otherwise look like a missing interaction.\n",
    "plt.figure(figsize=(10, 10))\n",
    "plt.spy(mat.notna(), markersize=0.5)\n",
    "plt.title('Matrix Density Plot')\n",
    "plt.xlabel('Movies')\n",
    "plt.ylabel('Users')\n",
    "plt.show()\n",
    "\n",
    "\n",
    "# Calculate the number of tagged entries per user and per movie\n",
    "# (the original assigned the un-called `.sum` method and swapped the two axes)\n",
    "tags_per_user = mat.notna().sum(axis=1)   # count across columns -> one value per user\n",
    "tags_per_movie = mat.notna().sum(axis=0)  # count across rows -> one value per movie\n",
    "print(\"tags per user: \" + str(tags_per_user))\n",
    "print(\"tags per movie: \" + str(tags_per_movie))\n",
    "\n",
    "# Calculate the number of interactions per user and per movie\n",
    "interactions_per_user = mat.notna().sum(axis=1)  # Sum across columns for each user\n",
    "interactions_per_movie = mat.notna().sum()       # Sum across rows for each movie\n",
    "\n",
    "# Plotting\n",
    "\n",
    "fig, axes = plt.subplots(1, 2, figsize=(15, 6))\n",
    "\n",
    "# Histogram for Interactions per User\n",
    "axes[0].hist(interactions_per_user, bins=50, color='blue', alpha=0.7)\n",
    "axes[0].set_title('Distribution of Interactions Per User')\n",
    "axes[0].set_xlabel('Number of Movies Rated')\n",
    "axes[0].set_ylabel('Number of Users')\n",
    "\n",
    "# Histogram for Interactions per Movie\n",
    "axes[1].hist(interactions_per_movie, bins=50, color='green', alpha=0.7)\n",
    "axes[1].set_title('Distribution of Interactions Per Movie')\n",
    "axes[1].set_xlabel('Number of Ratings Received')\n",
    "axes[1].set_ylabel('Number of Movies')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Knn to impute missing values in the matrix "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will return movieIds that correspond to the users' sentiment attitudes towards other movies and also similar users\n",
    "    So for instance (movieId 4, negative) etc\n",
    "    - This sentiment is based on how likely a user is going to feel about that movie - through their tagging behaviour\n",
    "- E.g. if a user is consistently rating movies with positive sentiment, return the movieIds of SIMILAR users that are also rate movies with positive sentiment\n",
    "- AKA finding users similar to target user based on their sentiment history \n",
    "- For similar users: the intersection between movies AND sentiment is taken (for example for a user to be similar, the user needs to have tagged the same movies with similar sentiments)\n",
    "Steps:\n",
    "1. Get similar users (using cosine similarity)\n",
    "2. Recommendation\n",
    "    - For target user, use similarity matrix and get similar users - should set a threshold for similarity.\n",
    "    - Return movieIds and sentiment from the movies of similar users\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Calculate the similarity of users**\n",
    "Based on movies they have tagged and the sentiments applied\n",
    "\n",
    "Similarity score threshold = 0.8"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Issue with mapping - user ids are not in a continuous mapping, this leads to out of bounds errors (need to fix this)\n",
    "\n",
    "Method to fix:\n",
    "- need to create a mapping between user id to continuous scale"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**UserId mapping**\n",
    "- Reset the index of the matrix 'mat' to get the original userId into a column. \n",
    "- Then create a mapping from these original userIds to a new continuous index (from 0 to N).\n",
    "- Use this new index as the userId when performing computations and lookups in mat\n",
    "- movieId here is still in its original form "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Assuming 'mat' is your matrix\n",
    "# Resetting the index will add a column 'userId' with the original values\n",
    "mat = mat.reset_index()\n",
    "\n",
    "# creating a dictionary with original userId and new continuous index\n",
    "original_to_new_index = {old_id: new_id for new_id, old_id in enumerate(mat['userId'])}\n",
    "\n",
    "# assign new continous index to the userId column \n",
    "mat['userId'] = mat['userId'].map(original_to_new_index)\n",
    "\n",
    "# Now, set 'userId' as the index again\n",
    "mat.set_index('userId', inplace=True)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you want to get the original userId back then use the id lookup in original_to_new_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mat"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 1) User-based method - KNN\n",
    "\n",
    "1. Basic functions \n",
    "- cosine similarity\n",
    "- dominant sentiment detection \n",
    "\n",
    "2. KNN computation\n",
    "- Get the k nearest neighbours\n",
    "- get recommendations based on dominant sentiment, or distributed sentiment \n",
    "\n",
    "3. Recommendations\n",
    "- based on K, provides K different recommendations\n",
    "- Format of recommendation: (movieId, sentiment)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**1) Basic Functions**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from collections import defaultdict\n",
    "\n",
    "def cosine_similarity(v1, v2):\n",
    "    '''Cosine similarity function only computes similarity for NON-NAN values'''\n",
    "    # Indices where both v1 and v2 are not NaN\n",
    "    shared_idx = np.where(~np.isnan(v1) & ~np.isnan(v2))\n",
    "    \n",
    "    # If no shared indices, return 0\n",
    "    if len(shared_idx[0]) == 0:\n",
    "        return 0\n",
    "    \n",
    "    # Extract shared values\n",
    "    v1_shared = v1[shared_idx]\n",
    "    v2_shared = v2[shared_idx]\n",
    "\n",
    "    # Compute the dot product and norms only on shared values\n",
    "    dot_product = np.dot(v1_shared, v2_shared)\n",
    "    norm_v1 = np.linalg.norm(v1_shared)\n",
    "    norm_v2 = np.linalg.norm(v2_shared)\n",
    "\n",
    "    # Prevent division by zero\n",
    "    if norm_v1 == 0 or norm_v2 == 0:\n",
    "        return 0\n",
    "\n",
    "    return dot_product / (norm_v1 * norm_v2)\n",
    "\n",
    "\n",
    "def get_sentiment_label(sentiment):\n",
    "    if sentiment > 0.5:\n",
    "        return \"positive\"\n",
    "    elif sentiment < -0.5:\n",
    "        return \"negative\"\n",
    "    else:\n",
    "        return \"neutral\"\n",
    "    \n",
    "\n",
    "def get_dominant_sentiment(user_sentiments):\n",
    "    pos_count = sum(1 for s in user_sentiments if s > 0.5)\n",
    "    neg_count = sum(1 for s in user_sentiments if s < -0.5)\n",
    "    neutral_count = len(user_sentiments) - pos_count - neg_count\n",
    "\n",
    "    if pos_count > neg_count and pos_count > neutral_count:\n",
    "        return \"positive\"\n",
    "    elif neg_count > pos_count and neg_count > neutral_count:\n",
    "        return \"negative\"\n",
    "    else:\n",
    "        return \"neutral\"\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**2) KNN**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_k_nearest_neighbors(target_user, matrix, k):\n",
    "    similarities = {}\n",
    "    \n",
    "    for user in matrix.index:\n",
    "        if user == target_user:\n",
    "            continue\n",
    "        common_movies = matrix.loc[target_user].dropna().index.intersection(matrix.loc[user].dropna().index)\n",
    "        if len(common_movies) > 0:\n",
    "            sim = cosine_similarity(matrix.loc[target_user][common_movies].values, matrix.loc[user][common_movies].values)\n",
    "            similarities[user] = sim\n",
    "            \n",
    "    # Sort by similarity\n",
    "    sorted_neighbors = sorted(similarities.items(), key=lambda x: x[1], reverse=True)\n",
    "    return sorted_neighbors[:k]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**3) Recommendations**\n",
    "\n",
    "If the target user has a neutral dominant sentiment (meaning they don't strongly lean positive or negative in their ratings), then the recommendation system will suggest a mix of movies: 3 positive, 3 neutral, and 3 negative (or as many as available in each category).\n",
    "\n",
    "If the user has a clear dominant sentiment (either strongly positive or negative), then the system will recommend movies accordingly.\n",
    "\n",
    "- Note: limit the number of recommendations to k\n",
    "\n",
    "- Prediction: (movieId, sentiment)\n",
    "    - Sentiment: 2 forms returned based on a similar user's sentiment towards a movie\n",
    "        - 1) Clustered sentiment value (based on the semantic cluster previously)\n",
    "        - 2) Raw sentiment value (without clustering)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_indices = mat.index[mat.apply(lambda x: (x.count() == 1) and (x.dropna().iloc[0] > 0.5), axis=1)]\n",
    "\n",
    "print(user_indices)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mat.loc[39].dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_distributed_recommendations(target_user, matrix, new_to_original_index, df_sent, k):\n",
    "    # Get the sentiments for movies rated by the target user\n",
    "    user_sentiments = matrix.loc[target_user].dropna()\n",
    "    \n",
    "    # Determine the dominant sentiment of the target user\n",
    "    dominant_sentiment = get_dominant_sentiment(user_sentiments)\n",
    "    \n",
    "    # Get the k-most similar users\n",
    "    similar_users = get_k_nearest_neighbors(target_user, matrix, k)\n",
    "    sentiment_buckets = defaultdict(list)\n",
    "    # For each similar user, gather movies they've rated\n",
    "\n",
    "    for user, similarity in similar_users:\n",
    "        # Map the user index to its original userId\n",
    "        original_user_id = new_to_original_index[user]\n",
    "        \n",
    "        for movie, sentiment in matrix.loc[user].items():\n",
    "            if not np.isnan(sentiment):\n",
    "                \n",
    "                # Filter df_sent to get rows for the current user and movie\n",
    "                user_movie_df = df_sent[(df_sent['userId'] == original_user_id) & (df_sent['movieId'] == movie)]\n",
    "                \n",
    "                # If there are no sentiments for this user-movie pair, continue\n",
    "                if user_movie_df.empty:\n",
    "                    continue\n",
    "                \n",
    "                # Aggregate the scaled_sentiment_values. Here, I'm using mean, but you can adjust this\n",
    "                movie_sentiment = user_movie_df['scaled_sentiment_value'].mean()\n",
    "                \n",
    "                sentiment_direction = get_dominant_sentiment([movie_sentiment])\n",
    "                sentiment_buckets[sentiment_direction].append((movie, movie_sentiment))\n",
    "\n",
    "    return sentiment_buckets[dominant_sentiment][:k] # returns movieId and sentiment value - scaled sentiment value from df_sent\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_to_original_index = {v: k for k, v in original_to_new_index.items()} # this exists outside the function to ensure the new_to_original_index exists for mapping\n",
    "\n",
    "target_user = 2 # Random target user index - this index is based on the new mapping, not the actual user ids - even if we pass this id into the function, we can store somewhere else the actual userId this recommendation is for \n",
    "recommendations = get_distributed_recommendations(target_user, mat, new_to_original_index, df_sent, 10)\n",
    "recommendations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_to_original_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Testing different K values\n",
    "- Alongside cosine similarity value\n",
    "- For each recommended movie for the test user, you'd like to know the cosine similarity between the test user and the users who recommended that movie."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Evaluation of KNN "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Linking the indexes back to the true userId and visualising recommendations - provides K recommendations\n",
    "- Need to link these back to their true values because then we will use it for content-based and context-aware methods "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(new_to_original_index)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Testing:printing the target user average sentiment scores, tags, and tags of movies recommended"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_sent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Storing the recommendations in a new dataframe\n",
    "df_rec = df_sent[[\"userId\"]].drop_duplicates().reset_index(drop=True)\n",
    "df_rec['recommendations'] = None\n",
    "\n",
    "df_rec"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Function that gets the recommendations and stores in df_rec**\n",
    "\n",
    "- This takes 302 minutes to run \n",
    "\n",
    "\n",
    "- Speeds up computation (from 302 minutes -> to 20m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_and_store_recommendations(df_rec, mat, new_to_original_index, df_sent, k=10):\n",
    "    for idx, row in df_rec.iterrows():\n",
    "        original_user_id = row['userId']\n",
    "        \n",
    "        # Map the original userId to the new continuous index\n",
    "        target_user = original_to_new_index.get(original_user_id, None)\n",
    "        \n",
    "        # If the user exists in the matrix\n",
    "        if target_user is not None:\n",
    "            # Get recommendations\n",
    "            recommendations = get_distributed_recommendations(target_user, mat, new_to_original_index, df_sent, k)\n",
    "            \n",
    "            # Store recommendations in df_rec under the original userId\n",
    "            df_rec.at[idx, 'recommendations'] = recommendations\n",
    "\n",
    "# Assume original_to_new_index exists and is the reverse mapping of new_to_original_index\n",
    "original_to_new_index = {v: k for k, v in new_to_original_index.items()}\n",
    "\n",
    "# Initialize recommendations column in df_rec\n",
    "df_rec['recommendations'] = None\n",
    "\n",
    "# Generate and store recommendations\n",
    "generate_and_store_recommendations(df_rec, mat, new_to_original_index, df_sent)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_rec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_rec['recommendations'] = df_rec['recommendations'].apply(lambda x: [(int(a), round(b, 4)) for a, b in x])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The recommendations file output\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "df_rec"
   ],
   "metadata": {
    "collapsed": false
   },
   "execution_count": 0
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags[tags['userId'] == 96]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_comb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_comb_t = pd.merge(tags, df_comb, on='tag', how='left')\n",
    "df_comb_t = df_comb_t[[\"movieId\", \"tag\", \"scaled_sentiment_value\", \"scaled_sentiment_value_cluster_avg\"]]\n",
    "\n",
    "df_comb_t[df_comb_t['movieId'] == 29].drop_duplicates(subset=['tag'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Given this,\n",
    "\n",
    "ideally we would like to recommend \"surreal\", \"dreamlike\", \"atmospheric\", \"child\" to this user\n",
    "\n",
    "Another example for thesis: \n",
    "- If we don't use the cluster sentiment average to recommend -> then we would miss the \"dreamlike\" example \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Generating an initial set of tags from the movieId, sentiment recommendation\n",
    "\n",
    "- This will be stored in a separate column in the df_rec dataframe\n",
    "\n",
    "Process:\n",
    "- Access the movieId, tags from \"df_comb_t\" dataframe and then find the similarity on the scaled_sentiment_value_avg column\n",
    "    - similarity calculated by comparing the difference of atomic values (single values) ==> Euclidean distance"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finding a good similarity range \n",
    "- Plotting the distribution of the data\n",
    "    - Sentiment value is between -1 and 1 \n",
    "        - Similar values should be within the SAME sentiment range \n",
    "        For example: if a sentiment is -0.2 - this is neutral so the similar one should be neutral as well \n",
    "\n",
    "        Ranges:\n",
    "        - Negative: [-1, -0.5)\n",
    "        - Neutral: [-0.5, 0.5]\n",
    "        - Positive: (0.5, 1]\n",
    "\n",
    "New column in the dataframe will have this structure: \n",
    "[ [movieId, [tags,......]], ....],   [movieId, [tags,......]], ....] ] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_comb_t\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Looping through df_rec and getting the similar tags for each user into a new column and put them in the same dataframe\n",
    "\n",
    "# note: tags for each movieId in the tags_movies column is distinct \n",
    "# Looping through df_rec and getting the similar tags for each user into a new column and put them in the same dataframe\n",
    "\n",
    "# Define sentiment category function\n",
    "def determine_sentiment_category(sentiment_value):\n",
    "    if -1 <= sentiment_value < -0.5:\n",
    "        return 'Negative'\n",
    "    elif -0.5 <= sentiment_value <= 0.5:\n",
    "        return 'Neutral'\n",
    "    else:\n",
    "        return 'Positive'\n",
    "\n",
    "# Extract tags function\n",
    "def get_tags_for_movie(movieId, sentiment_value):\n",
    "    sentiment_category = determine_sentiment_category(sentiment_value)\n",
    "    \n",
    "    # Filter rows with matching movieId and sentiment category\n",
    "    tags = df_comb_t[(df_comb_t['movieId'] == movieId) & \n",
    "                     (df_comb_t['scaled_sentiment_value_cluster_avg'].apply(determine_sentiment_category) == sentiment_category)]['tag'].tolist()\n",
    "    \n",
    "    # Convert tags list to set and back to list to ensure distinct tags\n",
    "    distinct_tags = list(set(tags))\n",
    "    \n",
    "    return (movieId, distinct_tags)\n",
    "\n",
    "# Apply function on recommendations column\n",
    "df_rec['tags_movies'] = df_rec['recommendations'].apply(lambda recs: [get_tags_for_movie(movieId, sentiment_value) for movieId, sentiment_value in recs])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Recommendations file output\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_rec.to_json('../dataset/df_rec.json', orient='split')\n",
    "\n",
    "df_rec\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Removing users that have <5 tags**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_rec = pd.read_json('../dataset/df_rec.json', orient='split')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_rec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the number of tags for each userId\n",
    "tag_counts = tags.groupby('userId')['tag'].count().reset_index()\n",
    "tag_counts.columns = ['userId', 'tag_count']\n",
    "\n",
    "users_sparse = tag_counts[tag_counts['tag_count'] <= 5]\n",
    "\n",
    "\n",
    "user_ids_to_remove = users_sparse['userId'].tolist()\n",
    "\n",
    "no_rec_and_5_tags = df_rec[(df_rec['userId'].isin(user_ids_to_remove)) & (df_rec['recommendations'].str.len() == 0)]\n",
    "\n",
    "user_ids_to_remove = no_rec_and_5_tags['userId'].tolist()\n",
    "\n",
    "df_rec = df_rec[~df_rec['userId'].isin(user_ids_to_remove)]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_rec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# store df_rec in file \n",
    "import json \n",
    "df_rec.to_json('../dataset/df_rec.json', orient='split', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tag_clusters = df_comb[[\"tag\", \"cluster\"]]\n",
    "\n",
    "tag_clusters.to_csv(\"../dataset/tag_clusters.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finding how many tags the sparse users have tagged\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the number of tags for each userId\n",
    "tag_counts = tags.groupby('userId')['tag'].count().reset_index()\n",
    "tag_counts.columns = ['userId', 'tag_count']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tag_counts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualisations of tag distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "tag_distribution = tag_counts['tag_count'].value_counts().sort_index()\n",
    "\n",
    "plt.figure(figsize=(10,5))\n",
    "plt.hist(tag_counts['tag_count'], bins=50, range=(1, 50), edgecolor='black', alpha=0.7)\n",
    "plt.title('Distribution of Number of Tags per User')\n",
    "plt.xlabel('Number of Tags')\n",
    "plt.ylabel('Number of Users')\n",
    "plt.grid(axis='y')\n",
    "plt.xlim(1, 50)  # set x-axis limits\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluation\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Sample data from the user's description\n",
    "\n",
    "\n",
    "# Creating a DataFrame\n",
    "df = tags\n",
    "\n",
    "# Calculating average number of tags per user\n",
    "avg_tags_per_user = df.groupby('userId')['tag'].count().mean()\n",
    "\n",
    "# For calculating the average number of movies rated per user,\n",
    "# we need to count unique movieIds per user and then find the average\n",
    "avg_movies_rated_per_user = df.groupby('userId')['movieId'].nunique().mean()\n",
    "\n",
    "# Preparing data for the graph\n",
    "user_counts = df.groupby('userId').agg({'tag': 'count', 'movieId': lambda x: x.nunique()}).reset_index()\n",
    "user_counts.columns = ['userId', 'num_tags', 'num_movies']\n",
    "\n",
    "# Plotting\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.scatter(user_counts['num_movies'], user_counts['num_tags'], color='blue')\n",
    "plt.title('Number of Tags vs Number of Movies Rated per User')\n",
    "plt.xlabel('Number of Movies Rated')\n",
    "plt.ylabel('Number of Tags')\n",
    "plt.grid(True)\n",
    "plt.show()\n",
    "\n",
    "avg_tags_per_user, avg_movies_rated_per_user\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Counting the number of tags per user\n",
    "tags_count_per_user = df.groupby('userId')['tag'].count()\n",
    "\n",
    "# Counting the number of users with less than 10 tags\n",
    "users_less_than_10_tags = tags_count_per_user[tags_count_per_user < 10].count()\n",
    "\n",
    "users_less_than_10_tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recalculating to get distinct count of tags and distinct count of userIds for each tag count\n",
    "tag_count_with_user_count = df.groupby('userId')['tag'].nunique().reset_index()\n",
    "tag_count_with_user_count = tag_count_with_user_count.groupby('tag').count()\n",
    "tag_count_with_user_count = tag_count_with_user_count[tag_count_with_user_count.index < 10]\n",
    "tag_count_with_user_count.columns = ['user_count']\n",
    "tag_count_with_user_count.reset_index(inplace=True)\n",
    "tag_count_with_user_count.columns = ['tag_count', 'user_count']\n",
    "tag_count_with_user_count\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Dataset Visualization"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "tags = pd.read_csv(\"../dataset/sentiment_df.csv\")\n",
    "tags['tag'] = tags['tag'].astype('str')\n",
    "df_comb = pd.read_csv(\"../dataset/clustercheck.csv\")"
   ],
   "metadata": {
    "collapsed": false
   },
   "execution_count": null
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# df_comb.to_csv(\"../dataset/clustercheck.csv\")\n",
    "avg_sentiment_by_cluster = df_comb.groupby('cluster')['scaled_sentiment_value'].mean().reset_index()\n",
    "df_comb = pd.merge(df_comb, avg_sentiment_by_cluster, on='cluster', suffixes=('', '_cluster_avg'))\n"
   ],
   "metadata": {
    "collapsed": false
   },
   "execution_count": null
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "df_comb = df_comb.drop(columns=['glove_vec'])\n",
    "\n",
    "df_mat = tags.merge(df_comb, on=['tag'], how='inner')\n",
    "# df_mat\n",
    "\n",
    "# df_sent = df_mat # take copy for later on \n",
    "\n",
    "\n",
    "# Convert the 'scaled_sentiment_value_avg' column to float\n",
    "df_mat['scaled_sentiment_value_cluster_avg'] = df_mat['scaled_sentiment_value_cluster_avg'].astype('float')\n",
    "\n",
    "# Keep only relevant columns\n",
    "df_mat = df_mat[[\"userId\", \"movieId\", \"scaled_sentiment_value_cluster_avg\"]]\n",
    "\n",
    "# Check for duplicates and print them\n",
    "duplicates = df_mat[df_mat.duplicated(subset=['userId', 'movieId'], keep=False)]\n",
    "\n",
    "\n",
    "# Group by 'userId' and 'movieId' to get the average 'scaled_sentiment_value_avg'\n",
    "df_mat = df_mat.groupby(['userId', 'movieId'])['scaled_sentiment_value_cluster_avg'].mean().reset_index()\n",
    "\n",
    "# Create the pivot table\n",
    "mat = pd.pivot_table(df_mat, values='scaled_sentiment_value_cluster_avg', index=['userId'], columns=['movieId'])\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false
   },
   "execution_count": null
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "\n",
    "# tags.drop(columns=['Unnamed: 0','Unnamed: 0.1'], inplace=True)\n",
    "tags"
   ],
   "metadata": {
    "collapsed": false
   },
   "execution_count": null
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "df_mat"
   ],
   "metadata": {
    "collapsed": false
   },
   "execution_count": null
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# creating matrix again -> proper display\n",
    "mat = pd.pivot_table(df_mat, values='scaled_sentiment_value_cluster_avg', index=['userId'], columns=['movieId'])\n",
    "# mat.loc[96][106696] # uses the scaled_sentiment_value_avg as the sentiment value"
   ],
   "metadata": {
    "collapsed": false
   },
   "execution_count": null
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "tag_counts = tags.groupby('userId')['tag'].count().reset_index()\n",
    "tag_counts.columns = ['userId', 'tag_count']\n",
    "tag_counts"
   ],
   "metadata": {
    "collapsed": false
   },
   "execution_count": null
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# 1. Matrix Density Plot\n",
    "plt.figure(figsize=(25, 3))\n",
    "plt.spy(mat, markersize=0.5)\n",
    "plt.title('Matrix Density Plot')\n",
    "plt.xlabel('Movies')\n",
    "plt.ylabel('Users')\n",
    "plt.show()"
   ],
   "metadata": {
    "collapsed": false
   },
   "execution_count": null
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.gridspec as gridspec\n",
    "\n",
    "# Create a 1x4 grid layout\n",
    "fig, axes = plt.subplots(1, 3, figsize=(15, 3))\n",
    "\n",
    "# Histogram for Interactions per User\n",
    "axes[0].hist(interactions_per_user, bins=50, color='blue', alpha=0.7)\n",
    "axes[0].set_title('Distribution of Interactions Per User')\n",
    "axes[0].set_xlabel('Number of Movies Rated')\n",
    "axes[0].set_ylabel('Number of Users')\n",
    "\n",
    "# Histogram for Interactions per Movie\n",
    "axes[1].hist(interactions_per_movie, bins=50, color='green', alpha=0.7)\n",
    "axes[1].set_title('Distribution of Interactions Per Movie')\n",
    "axes[1].set_xlabel('Number of Ratings Received')\n",
    "axes[1].set_ylabel('Number of Movies')\n",
    "\n",
    "# Histogram for Number of Tags per User\n",
    "axes[2].hist(tag_counts['tag_count'], bins=50, range=(1, 50), edgecolor='black', alpha=0.7)\n",
    "axes[2].set_title('Distribution of Number of Tags per User')\n",
    "axes[2].set_xlabel('Number of Tags')\n",
    "axes[2].set_ylabel('Number of Users')\n",
    "# axes[2].grid(axis='y')\n",
    "axes[2].set_xlim(1, 50)  # set x-axis limits\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ],
   "metadata": {
    "collapsed": false
   },
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
