{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import sklearn.linear_model as lm\n",
    "import pandas as pd\n",
    "import tqdm\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# MovieLens item catalogue: pipe-separated, no header row, Latin-1 encoded.\n",
    "# Without a header the columns keep their default integer labels.\n",
    "movies = pd.read_csv(\n",
    "    \"./data/benchmark/movies/u.item\",\n",
    "    sep=\"|\",\n",
    "    header=None,\n",
    "    encoding=\"latin-1\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Fetch movie summaries from the TMDB (themoviedb.org) search API\n",
    "import json\n",
    "import os\n",
    "import re\n",
    "\n",
    "import requests\n",
    "\n",
    "TMDB_SEARCH_URL = \"https://api.themoviedb.org/3/search/movie\"\n",
    "# Environment variable holding the TMDB API read-access token. The token must\n",
    "# never be written into the notebook; a token that was ever committed should\n",
    "# be considered leaked and rotated.\n",
    "TMDB_TOKEN_ENV = \"TMDB_API_TOKEN\"\n",
    "# Four-digit group in parentheses, e.g. the \"(1995)\" in \"Toy Story (1995)\".\n",
    "_YEAR_PATTERN = re.compile(r\"\\((\\d{4})\\)\")\n",
    "\n",
    "\n",
    "def get_title_year(title):\n",
    "    \"\"\"Split a title such as \"Toy Story (1995)\" into name and release year.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    title : str\n",
    "        Raw title, expected to carry the release year in parentheses.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        ``(name, year)``. ``name`` is the stripped text before the first \"(\".\n",
    "        ``year`` is the last parenthesised four-digit group, as a string, or\n",
    "        ``None`` when there is no such group. When the year is missing, or\n",
    "        ``title`` is not a string, the offending value is printed.\n",
    "    \"\"\"\n",
    "    if not isinstance(title, str):\n",
    "        print(title)\n",
    "        return title, None\n",
    "\n",
    "    name = title.split(\"(\")[0].strip()\n",
    "    # Use the *last* \"(dddd)\" group: titles may carry an alternative title in\n",
    "    # parentheses before the year, which must not be mistaken for the year.\n",
    "    years = _YEAR_PATTERN.findall(title)\n",
    "    if not years:\n",
    "        print(name)\n",
    "        return name, None\n",
    "    return name, years[-1]\n",
    "\n",
    "\n",
    "def get_movie_summary(title):\n",
    "    \"\"\"Search TMDB for a title such as \"Toy Story (1995)\".\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    title : str\n",
    "        Raw title; it is split into name and year with ``get_title_year``.\n",
    "        The year filter is omitted when no year could be parsed.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        The decoded JSON body of the search response.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    RuntimeError\n",
    "        If the ``TMDB_API_TOKEN`` environment variable is not set.\n",
    "    requests.HTTPError\n",
    "        If the API answers with an error status.\n",
    "    \"\"\"\n",
    "    token = os.environ.get(TMDB_TOKEN_ENV)\n",
    "    if not token:\n",
    "        raise RuntimeError(\n",
    "            f\"Set the {TMDB_TOKEN_ENV} environment variable to a TMDB API read access token.\"\n",
    "        )\n",
    "\n",
    "    name, year = get_title_year(title)\n",
    "    # Let requests build the query string so that characters such as \"&\", \"#\"\n",
    "    # or spaces inside a title are escaped instead of corrupting the URL.\n",
    "    params = {\n",
    "        \"query\": name,\n",
    "        \"include_adult\": \"false\",\n",
    "        \"language\": \"en-US\",\n",
    "        \"page\": 1,\n",
    "    }\n",
    "    if year is not None:\n",
    "        params[\"primary_release_year\"] = year\n",
    "\n",
    "    headers = {\n",
    "        \"accept\": \"application/json\",\n",
    "        \"Authorization\": f\"Bearer {token}\",\n",
    "    }\n",
    "\n",
    "    # A timeout keeps a stalled connection from hanging the whole notebook.\n",
    "    response = requests.get(TMDB_SEARCH_URL, headers=headers, params=params, timeout=30)\n",
    "    # Fail here with the HTTP status rather than later on a payload without results.\n",
    "    response.raise_for_status()\n",
    "    return response.json()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Discard the rows whose title (column 1) is the literal \"unknown\" and renumber.\n",
    "movies = movies.loc[movies[1] != \"unknown\"].reset_index(drop=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One API search per title (column 1); the raw JSON response is stored as-is.\n",
    "movies['description'] = movies[1].map(get_movie_summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _first_overview(response):\n",
    "    \"\"\"Return the overview text of the first search result, or None.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    response : dict or other\n",
    "        Decoded search response. Anything that is not a dict (for example a\n",
    "        value already extracted by a previous run of this cell) is returned\n",
    "        unchanged, which keeps the cell safe to re-run.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str or None\n",
    "        The first result's overview; None when the payload has no results\n",
    "        (including error payloads without a \"results\" key) or when the\n",
    "        overview is empty.\n",
    "    \"\"\"\n",
    "    if not isinstance(response, dict):\n",
    "        return response\n",
    "    results = response.get(\"results\") or []\n",
    "    if not results:\n",
    "        return None\n",
    "    # An empty overview has no text to embed, so it is treated as missing.\n",
    "    return results[0].get(\"overview\") or None\n",
    "\n",
    "\n",
    "movies['description'] = movies['description'].map(_first_overview)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the movies that have a description, then renumber the rows.\n",
    "movies = movies.dropna(subset=['description']).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "\n",
    "# Pretrained sentence encoder, loaded by its model identifier.\n",
    "model = SentenceTransformer(\n",
    "    \"sentence-transformers/all-MiniLM-L12-v1\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# encode() is documented for a string or a list of strings. A pandas Series is\n",
    "# looked up by label, so pass a plain list instead of relying on the index\n",
    "# happening to be 0..n-1.\n",
    "description_embeddings = model.encode(\n",
    "    movies['description'].tolist(),\n",
    "    show_progress_bar=True,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: display the dimensions of the embedding array.\n",
    "np.shape(description_embeddings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store every embedding row as a plain Python list in its own DataFrame cell.\n",
    "movies['description_embeddings'] = [vector.tolist() for vector in description_embeddings]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ratings table: tab-separated, no header row; the four columns are named here.\n",
    "recommendations = pd.read_csv(\n",
    "    \"./data/benchmark/movies/u.data\",\n",
    "    sep=\"\\t\",\n",
    "    header=None,\n",
    ").set_axis([\"user_id\", \"movie_id\", \"rating\", \"timestamp\"], axis=\"columns\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Give the id column (integer label 0) the same name as in the ratings table.\n",
    "movies = movies.rename(mapper={0: \"movie_id\"}, axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inner join on movie_id: every rating gets its movie's columns, and ratings of\n",
    "# movies that are no longer in `movies` are dropped.\n",
    "recommendations = pd.merge(\n",
    "    recommendations,\n",
    "    movies,\n",
    "    how=\"inner\",\n",
    "    on=\"movie_id\",\n",
    ").reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "N_users = 10\n",
    "# The N_users users with the most ratings.\n",
    "users = recommendations.groupby(\"user_id\").count().sort_values(\"rating\", ascending=False).index[:N_users]\n",
    "\n",
    "movies_ratings = []\n",
    "movies_embeddings = []\n",
    "for user_index in tqdm.tqdm(users):\n",
    "    # Select the user's rows once and reuse them for both arrays.\n",
    "    user_rows = recommendations[recommendations[\"user_id\"] == user_index]\n",
    "    movies_ratings.append(user_rows['rating'].values)\n",
    "    # Stack the per-movie embedding lists into one 2-D array per user.\n",
    "    movies_embeddings.append(np.vstack(user_rows['description_embeddings'].values))\n",
    "\n",
    "# Context managers guarantee the files are flushed and closed after writing.\n",
    "with open(\"./data/benchmark/movies/movies_ratings.pkl\", \"wb\") as ratings_file:\n",
    "    pickle.dump(movies_ratings, ratings_file)\n",
    "with open(\"./data/benchmark/movies/movies_embeddings.pkl\", \"wb\") as embeddings_file:\n",
    "    pickle.dump(movies_embeddings, embeddings_file)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Offline regret comparison: Lasso-selected features + least squares,\n",
    "### least squares on all features, and uniformly random picks.\n",
    "from sklearn.linear_model import Lasso,LinearRegression,LassoCV\n",
    "\n",
    "N_random = 50   # ratings sampled at random to fit the models\n",
    "top_k = 200     # recommendations made after the random phase\n",
    "N_mc = 100      # repetitions per user\n",
    "N_users = 10\n",
    "SEED = 0        # fixed seed so that a re-run reproduces the same curves\n",
    "rng = np.random.default_rng(SEED)\n",
    "users = recommendations.groupby(\"user_id\").count().sort_values(\"rating\", ascending=False).index[:N_users]\n",
    "\n",
    "\n",
    "def _top_k_unseen_ratings(scores, ratings, seen_indices, k):\n",
    "    \"\"\"Ratings of the k highest-scored items, skipping the items in seen_indices.\n",
    "\n",
    "    scores and ratings are aligned 1-D arrays (one entry per item);\n",
    "    seen_indices holds the item indices that must not be recommended.\n",
    "    \"\"\"\n",
    "    ranking = np.argsort(scores)[::-1]\n",
    "    # Filter the ranking itself (it contains item indices). Indexing the already\n",
    "    # re-ordered ratings with item indices would drop arbitrary ranks instead of\n",
    "    # the training items and let training items be recommended again.\n",
    "    ranking = ranking[~np.isin(ranking, seen_indices)]\n",
    "    return ratings[ranking[:k]]\n",
    "\n",
    "\n",
    "horizon = N_random + top_k\n",
    "regret_lasso = np.zeros((N_users, N_mc, horizon))\n",
    "regret_random = np.zeros((N_users, N_mc, horizon))\n",
    "regret_lr = np.zeros((N_users, N_mc, horizon))\n",
    "\n",
    "for j, user_index in tqdm.tqdm(enumerate(users), total=len(users)):\n",
    "    user_rows = recommendations[recommendations[\"user_id\"] == user_index]\n",
    "    ratings = user_rows['rating'].values\n",
    "    embeddings = np.vstack(user_rows['description_embeddings'].values)\n",
    "    n_items = ratings.shape[0]\n",
    "    if n_items < horizon:\n",
    "        raise ValueError(\n",
    "            f\"user {user_index} has only {n_items} rated movies; need at least N_random + top_k = {horizon}\"\n",
    "        )\n",
    "    # Reference sequence: the user's highest ratings in decreasing order.\n",
    "    sorted_true = np.sort(ratings)[::-1][:horizon]\n",
    "\n",
    "    for i in range(N_mc):\n",
    "        # Random phase: N_random distinct items whose ratings are used for fitting.\n",
    "        train_indices = rng.choice(n_items, size=N_random, replace=False)\n",
    "        embeddings_random = embeddings[train_indices]\n",
    "        labels_random = ratings[train_indices]\n",
    "\n",
    "        # Lasso picks the features (non-zero coefficients); least squares is then\n",
    "        # refit on those features only. Named `lasso` so that the sentence\n",
    "        # encoder stored in `model` is not overwritten.\n",
    "        lasso = Lasso(alpha=0.001, max_iter=10000)\n",
    "        lasso.fit(embeddings_random, labels_random)\n",
    "        non_zero_coef = lasso.coef_ != 0\n",
    "        refit_lr = LinearRegression()\n",
    "        refit_lr.fit(embeddings_random[:, non_zero_coef], labels_random)\n",
    "        scores_lasso = refit_lr.predict(embeddings[:, non_zero_coef])\n",
    "        ratings_pred = np.concatenate(\n",
    "            [labels_random, _top_k_unseen_ratings(scores_lasso, ratings, train_indices, top_k)]\n",
    "        )\n",
    "\n",
    "        # Baseline: horizon distinct items drawn uniformly at random.\n",
    "        baseline_indices = rng.choice(n_items, size=horizon, replace=False)\n",
    "        ratings_random = ratings[baseline_indices]\n",
    "\n",
    "        # Least squares on all embedding dimensions, same training items.\n",
    "        full_lr = LinearRegression()\n",
    "        full_lr.fit(embeddings_random, labels_random)\n",
    "        scores_lr = full_lr.predict(embeddings)\n",
    "        ratings_pred_lr = np.concatenate(\n",
    "            [labels_random, _top_k_unseen_ratings(scores_lr, ratings, train_indices, top_k)]\n",
    "        )\n",
    "\n",
    "        # Cumulative gap to the reference sequence, round by round.\n",
    "        regret_lasso[j, i, :] = np.cumsum(sorted_true - ratings_pred)\n",
    "        regret_random[j, i, :] = np.cumsum(sorted_true - ratings_random)\n",
    "        regret_lr[j, i, :] = np.cumsum(sorted_true - ratings_pred_lr)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Curves: mean over repetitions (axis 1), then over users (axis 0).\n",
    "# Bands: standard deviation over repetitions, averaged over users.\n",
    "regret_lasso_mean = regret_lasso.mean(axis=1).mean(axis=0)\n",
    "regret_lasso_std = regret_lasso.std(axis=1).mean(axis=0)\n",
    "regret_random_mean = regret_random.mean(axis=1).mean(axis=0)\n",
    "regret_random_std = regret_random.std(axis=1).mean(axis=0)\n",
    "\n",
    "plt.plot(regret_lasso_mean, label=\"Lasso\")\n",
    "plt.plot(regret_random_mean, label=\"Random\")\n",
    "plt.plot(regret_lr.mean(axis=1).mean(axis=0), label=\"LR\")\n",
    "plt.legend(fontsize=15)\n",
    "\n",
    "# Shaded mean +/- std bands for Lasso and Random only (LR is drawn without one).\n",
    "for band_mean, band_std in (\n",
    "    (regret_lasso_mean, regret_lasso_std),\n",
    "    (regret_random_mean, regret_random_std),\n",
    "):\n",
    "    plt.fill_between(np.arange(N_random+top_k), band_mean-band_std, band_mean+band_std, alpha=0.2)\n",
    "\n",
    "plt.xlabel(\"Number of Rounds\", fontsize=15)\n",
    "plt.ylabel(\"Regret\", fontsize=15)\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
