{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import scipy.sparse as sp\n",
    "import networkx as nx\n",
    "import pickle\n",
    "import yelp_parsing_utilities as ypu\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%matplotlib inline\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the raw Yelp JSON files (users, reviews, businesses, categories).\n",
    "\n",
    "users_json_path = '../data/Yelp/user.json'\n",
    "reviews_json_path = '../data/Yelp/review.json'\n",
    "businesses_json_path = '../data/Yelp/business.json'\n",
    "categories_json_path = '../data/Yelp/categories.json'\n",
    "\n",
    "all_users, all_reviews, all_businesses, categories = ypu.parse_dataset(\n",
    "    users_json_path,\n",
    "    reviews_json_path,\n",
    "    businesses_json_path,\n",
    "    categories_json_path,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Active Life' 'Arts & Entertainment' 'Automotive' 'Beauty & Spas'\n",
      " 'Bicycles' 'Education' 'Event Planning & Services' 'Financial Services'\n",
      " 'Food' 'Health & Medical' 'Home Services' 'Hotels & Travel'\n",
      " 'Local Flavor' 'Local Services' 'Mass Media' 'Nightlife' 'Pets'\n",
      " 'Professional Services' 'Public Services & Government'\n",
      " 'Religious Organizations' 'Restaurants' 'Shopping']\n"
     ]
    }
   ],
   "source": [
    "# extract reference categories for network games\n",
    "# (derived from the parsed `categories`; the cells below pass `ref_categories`\n",
    "# to the helpers that build the per-business and per-user category data)\n",
    "ref_categories = ypu.extract_reference_categories(categories)\n",
    "\n",
    "# Print the selected categories so they can be checked by eye.\n",
    "print(ref_categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: every business that lists categories must match at least one\n",
    "# reference category. Businesses whose 'categories' field is None have nothing\n",
    "# to match and are skipped.\n",
    "for b in all_businesses:\n",
    "    if b['categories'] is None:\n",
    "        continue\n",
    "\n",
    "    # 'categories' is a comma-separated string; strip the padding around each entry.\n",
    "    found_reference_category = any(\n",
    "        b_c.strip() in ref_categories for b_c in b['categories'].split(',')\n",
    "    )\n",
    "\n",
    "    # Report the offending categories so a failure can be diagnosed.\n",
    "    assert found_reference_category, (\n",
    "        'no reference category among: {!r}'.format(b['categories'])\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Completed 0.0 (it took 3.2901763916015625e-05 sec.)\n",
      "Found 116040973 edges and 14304298 edges with IoU > 0.\n"
     ]
    }
   ],
   "source": [
    "# build auxiliary data\n",
    "\n",
    "# keys: business ids, values: categories of each business\n",
    "business_to_category_dict = ypu.build_business_to_categories_dict(\n",
    "    all_businesses, ref_categories\n",
    ")\n",
    "\n",
    "# keys: user ids, values: average score per reference category\n",
    "user_category_scores_dict = ypu.build_user_categories_scores_dict(\n",
    "    all_reviews, business_to_category_dict, ref_categories\n",
    ")\n",
    "\n",
    "# keys: user ids, values: list of friends\n",
    "user_friends_dict = ypu.build_users_friends_dict(all_users)\n",
    "\n",
    "# Two edge collections come back; going by the verbose output, presumably all\n",
    "# edges and the subset with IoU > 0 (TODO confirm against ypu.build_social_graph).\n",
    "all_possible_edges, all_weighted_edges = ypu.build_social_graph(\n",
    "    user_friends_dict,\n",
    "    user_category_scores_dict,\n",
    "    ref_categories,\n",
    "    verbose=True,\n",
    "    print_status_every_percentage=0.05,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Weighted graph: each (node, node, value) triple becomes an edge whose\n",
    "# 'weight' attribute holds the value.\n",
    "G_weighted_edges = nx.Graph()\n",
    "G_weighted_edges.add_weighted_edges_from(all_weighted_edges, weight='weight')\n",
    "\n",
    "# Unweighted graph built from the full edge list.\n",
    "G_all_edges = nx.Graph()\n",
    "G_all_edges.add_edges_from(all_possible_edges)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original number of nodes:  945857\n",
      "Number of nodes after last Graclus iteration:  584622\n",
      "Number of nodes after last Graclus iteration:  360274\n",
      "Number of nodes after last Graclus iteration:  218498\n",
      "Number of nodes after last Graclus iteration:  133472\n",
      "Number of nodes after last Graclus iteration:  85389\n",
      "Number of nodes after last Graclus iteration:  58871\n",
      "Number of nodes after last Graclus iteration:  44295\n",
      "Number of nodes after last Graclus iteration:  36160\n",
      "Number of nodes after last Graclus iteration:  31511\n",
      "Number of nodes after last Graclus iteration:  28708\n",
      "Best clustering found with 4 iterations. Overall number of clusters: 133472\n"
     ]
    }
   ],
   "source": [
    "# cluster the original graph\n",
    "\n",
    "# Fix one node ordering; the adjacency matrix is built with this same ordering.\n",
    "all_graph_users = list(G_weighted_edges)\n",
    "num_graph_users = len(all_graph_users)\n",
    "A = nx.adjacency_matrix(G_weighted_edges, nodelist=all_graph_users)\n",
    "\n",
    "n_steps = 10\n",
    "min_num_nodes_per_cluster = 10\n",
    "\n",
    "# parents is a list of lists representing different coarsening levels of the\n",
    "# input graph: parents[i][j] contains the cluster ID of node j in graph i.\n",
    "parents = ypu.graclus(A, n_steps, verbose=True)\n",
    "\n",
    "# clusters -- key: cluster id, value: list of users indexes (i.e. clusters[0][1]=k\n",
    "# is the index of user all_graph_users[k] in graph G_weighted_edges according to\n",
    "# the ordering defined by all_graph_users).\n",
    "clusters = ypu.compute_best_clustering(\n",
    "    parents,\n",
    "    num_graph_users,\n",
    "    min_acceptable_size=min_num_nodes_per_cluster,\n",
    "    verbose=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of extracted graphs:  52575\n"
     ]
    }
   ],
   "source": [
    "# extract clusters subgraphs\n",
    "\n",
    "# key: cluster id, value: list of users ids\n",
    "clusters_users_names_dict = ypu.map_users_idx_to_users_ids(\n",
    "    clusters, all_graph_users\n",
    ")\n",
    "\n",
    "# list of valid clusters, extracted from the graph holding all edges\n",
    "all_selected_graphs = ypu.extract_clusters_subgraphs(\n",
    "    G_all_edges,\n",
    "    clusters_users_names_dict,\n",
    "    min_num_nodes_per_cluster,\n",
    "    max_fraction_nodes_unitary_degree=1.0,\n",
    "    verbose=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Final number of graphs:  22152\n",
      "Density of X scores in top 5000 graphs: 0.25664732127720763 +/- 0.03570140374896308\n",
      "Density of A in top 5000 graphs: 0.17421653299228795 +/- 0.02566230871455056\n"
     ]
    }
   ],
   "source": [
    "# build the dataset discarding non valid clusters\n",
    "\n",
    "minimum_fraction_of_valid_nodes = 0.25\n",
    "\n",
    "# A quarter of the reference categories, rounded down (np.floor returns a float).\n",
    "num_ref_categories = len(ref_categories)\n",
    "min_num_valid_categories = np.floor(0.25 * num_ref_categories)\n",
    "\n",
    "dataset, perm = ypu.build_network_games_dataset(\n",
    "    all_selected_graphs,\n",
    "    user_category_scores_dict,\n",
    "    minimum_fraction_of_valid_nodes,\n",
    "    min_num_valid_categories,\n",
    "    verbose=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save (dataset, perm) to disk as a single pickled tuple.\n",
    "dump_path = '../data/Yelp/dumps/category_dataset_top_category_fixed2_inv_sorting.pickle'\n",
    "\n",
    "with open(dump_path, 'wb') as dump_file:\n",
    "    pickle.dump((dataset, perm), dump_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
