{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import numpy.linalg as la\n",
    "from fancyimpute import SoftImpute"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def collect_movielens():\n",
    "    ratings = pd.read_csv('./Datasets/Movielens/ratings.dat',header = None, delimiter = '::', encoding='latin1', engine = 'python')\n",
    "    ratings = np.asarray(ratings)\n",
    "    ratings = np.delete(ratings,3,1)\n",
    "    num_ratings = ratings.shape[0]\n",
    "    #Is a 1000209x3 array;(user-id,movie-id,rating) ; timestamp was removed  \n",
    "    users = pd.read_csv('./Datasets/Movielens/users.dat', header = None, delimiter = '::', encoding='latin1', engine = 'python')\n",
    "    users = np.asarray(users)\n",
    "    users = np.delete(users,4,1)\n",
    "    num_users = users.shape[0]\n",
    "    #Is a 6040x4 array; (user-id,gender,age,occupation); zipcode was removed\n",
    "    genres = pd.read_csv('./Datasets/Movielens/movies.dat', header = None, delimiter = '::', encoding='latin1', engine = 'python')\n",
    "    genres = np.asarray(genres)\n",
    "    genres = np.delete(genres,1,1)\n",
    "    num_movies = 3952\n",
    "    #Is a 3883x2 array; (movie-id,genre); title was removed; there are actually 3952 movies.\n",
    "\n",
    "   \n",
    "    #REWARD MATRIX:\n",
    "    rew = np.zeros((num_users,num_movies))\n",
    "    rat = rew\n",
    "    for k in range(num_ratings):\n",
    "        user, movie = ratings[k,0]-1, ratings[k,1]-1\n",
    "        rew[user,movie] = 1 \n",
    "        rat[user,movie] = (ratings[k,2]-1)/4\n",
    "    col = []\n",
    "    for i in range(num_movies):\n",
    "        if sum(rat[:,i]) <= 10:\n",
    "            col+= [i]\n",
    "    rat = np.delete(rat,col,1)\n",
    "    \n",
    "    return ratings,users,genres,rew,rat\n",
    "\n",
    "\n",
    "def complete_rating_matrix():\n",
    "    ratings,users,genres,rew,rat = collect_movielens()\n",
    "    count = 0\n",
    "    for i in range(rat.shape[0]):\n",
    "        for j in range(rat.shape[1]):\n",
    "            if rat[i,j]==0:\n",
    "                rat[i,j] = np.nan\n",
    "                count += 1\n",
    "    print('Number of entries to be filled:  ',count)\n",
    "    ratt = SoftImpute().fit_transform(rat)\n",
    "    return ratt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = complete_rating_matrix()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "with open('./Datasets/Movielens/data_new.p','wb') as file:\n",
    "    pickle.dump(x, file, protocol = pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('./Datasets/Movielens/data_new.p','rb') as file:\n",
    "    rating_matrix = pickle.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "users = pd.read_csv('./Datasets/Movielens/users.dat', header = None, delimiter = '::', engine = 'python')\n",
    "users = np.asarray(users)\n",
    "users = np.delete(users,4,1)\n",
    "groups = {1: 'young', 18: 'teen', 25: 'adult', 35: 'adult', 45: 'adult', 50: 'old', 56: 'old'}\n",
    "for i in range(users.shape[0]):\n",
    "    users[i,2] = groups[users[i,2]]\n",
    "groups = dict()\n",
    "for i in range(21):\n",
    "    if i in [0,16,19,9,8]:\n",
    "        groups[i] = 'other'\n",
    "    elif i in [4,10]:\n",
    "        groups[i] = 'student'\n",
    "    elif i in [1,15]:\n",
    "        groups[i] = 'academic'\n",
    "    elif i in [17,12,2]:\n",
    "        groups[i] = 'sci'\n",
    "    elif i in [3,5,7,14]:\n",
    "        groups[i] = 'office'\n",
    "    elif i in [2,20,18]:\n",
    "        groups[i] = 'art'\n",
    "    elif i ==11:\n",
    "        groups[i] = 'law'\n",
    "    else:\n",
    "        groups[i] = 'retired'\n",
    "for i in range(users.shape[0]):\n",
    "    users[i,3] = groups[users[i,3]]\n",
    "print(users.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "l = []\n",
    "for i in range(rating_matrix.shape[0]):\n",
    "    if sum(rating_matrix[i,:])<=1500:\n",
    "        l += [i]\n",
    "rating_matrix = np.delete(rating_matrix, l, 0)\n",
    "users = np.delete(users, l , 0)\n",
    "print(rating_matrix.shape, users.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gen_rews = dict()\n",
    "for i in range(users.shape[0]):\n",
    "    if users[i,1] in gen_rews: gen_rews[users[i,1]] = np.vstack((gen_rews[users[i,1]], rating_matrix[i,:]))\n",
    "    else: gen_rews[users[i,1]] = rating_matrix[i,:]\n",
    "for key in gen_rews.keys():\n",
    "    print(key)\n",
    "    print(gen_rews[key].shape)\n",
    "    temp = np.mean(gen_rews[key], 0)\n",
    "    print(min(temp),max(temp))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "age_rews = dict()\n",
    "for i in range(users.shape[0]):\n",
    "    if users[i,2] in age_rews: age_rews[users[i,2]] = np.vstack((age_rews[users[i,2]], rating_matrix[i,:]))\n",
    "    else: age_rews[users[i,2]] = rating_matrix[i,:]\n",
    "for key in age_rews.keys():\n",
    "    print(key)\n",
    "    print(age_rews[key].shape)\n",
    "    temp = np.mean(age_rews[key], 0)\n",
    "    print(min(temp),max(temp))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "occ_rews = dict()\n",
    "for i in range(users.shape[0]):\n",
    "    if users[i,3] in occ_rews: occ_rews[users[i,3]] = np.vstack((occ_rews[users[i,3]], rating_matrix[i,:]))\n",
    "    else: occ_rews[users[i,3]] = rating_matrix[i,:]\n",
    "for key in occ_rews.keys():\n",
    "    print(key)\n",
    "    print(occ_rews[key].shape)\n",
    "    temp = np.mean(occ_rews[key], 0)\n",
    "    print(min(temp),max(temp))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def produce_tables(rew_mat, user_mat, feature):\n",
    "    keys = set(user_mat[:,feature])\n",
    "    if feature == 1: others = [2,3]\n",
    "    elif feature == 2: others = [1,3]\n",
    "    else: others = [1,2]\n",
    "    count_dict = dict()\n",
    "    for i in range(user_mat.shape[0]):\n",
    "        if (user_mat[i,others[0]], user_mat[i,others[1]]) not in count_dict: \n",
    "            count_dict[user_mat[i,others[0]], user_mat[i,others[1]]] = 1\n",
    "        else: count_dict[user_mat[i,others[0]], user_mat[i,others[1]]] += 1\n",
    "    label_dict = dict()\n",
    "    ind = 0\n",
    "    for k in count_dict.keys():\n",
    "        label_dict[k] = ind\n",
    "        ind += 1\n",
    "    data = dict()\n",
    "    for key in keys:\n",
    "        data[key] = [np.zeros((len(label_dict.keys()),rew_mat.shape[1])), np.zeros(len(label_dict.keys()))]\n",
    "    for i in range(rew_mat.shape[0]):\n",
    "        k = label_dict[user_mat[i,others[0]], user_mat[i,others[1]]]\n",
    "        f = user_mat[i,feature]\n",
    "        temp = data[f][0][k,:]\n",
    "        count = data[f][1][k]\n",
    "        temp = (1/(count+1))*np.add(temp*count, rew_mat[i,:])\n",
    "        data[f][0][k,:] = temp\n",
    "        data[f][1][k] += 1\n",
    "    for k in keys:\n",
    "        print(k, len(data[k][1]), data[k][1])\n",
    "        \n",
    "        \n",
    "        \n",
    "    with open('./Files/Movielens_data_'+str(feature)+'_hidden.p','wb') as f:\n",
    "        pickle.dump(data,f,protocol = pickle.HIGHEST_PROTOCOL)\n",
    "              \n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = produce_tables(rating_matrix, users,  3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for k in x.keys():\n",
    "    temp = np.zeros(x[k][0].shape[1])\n",
    "    for i in range(len(temp)):\n",
    "        temp[i] = np.average(x[k][0][:,i], weights = x[k][1])\n",
    "    print(k, min(temp),  max(temp))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
