{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import gzip\n",
    "# import pandas as pd\n",
    "from urllib.request import urlopen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the gzipped Kindle Store metadata dump (read line by line as JSON below).\n",
    "# Set the KINDLE_META_PATH environment variable to use a local copy; the fallback is\n",
    "# the path the notebook was originally run with.\n",
    "datafile = os.environ.get('KINDLE_META_PATH', '/home/xiaoxue/data/Amazon/meta_Kindle_Store.json.gz')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "491670\n",
      "{'category': ['Kindle Store', 'Kindle eBooks', 'Science Fiction & Fantasy'], 'tech1': '', 'description': [], 'fit': '', 'title': '', 'also_buy': ['B007NLCJBC', 'B01FARODH8'], 'tech2': '', 'brand': 'Arthur K. Barnes', 'feature': [], 'rank': '1,716,849 Paid in Kindle Store (', 'also_view': ['B000FBF81K', 'B00PBDMER8'], 'details': {'File Size:': '295 KB', 'Print Length:': '113 pages', 'Publisher:': 'FuturesPast Editions (October 14, 2001)', 'Publication Date:': 'October 14, 2001', 'Language:': 'English', 'ASIN:': 'B000FA5KKA', 'Word Wise:': 'Enabled', 'Lending:': 'Enabled'}, 'main_cat': 'Buy a Kindle', 'similar_item': '', 'date': '', 'price': '', 'asin': 'B000FA5KKA', 'imageURL': [], 'imageURLHighRes': []}\n"
     ]
    }
   ],
   "source": [
    "### Load the product metadata: one JSON record per line of the gzip file\n",
    "\n",
    "with gzip.open(datafile) as f:\n",
    "    data = [json.loads(l.strip()) for l in f]\n",
    "\n",
    "# Number of records read, i.e. the total number of products\n",
    "print(len(data))\n",
    "\n",
    "# First record, as a sample of the available fields\n",
    "print(data[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count how often each lower-cased token occurs across all non-empty titles\n",
    "token_dic = defaultdict(int)\n",
    "for book in data:\n",
    "    title = book['title']\n",
    "    if title == '':\n",
    "        continue\n",
    "    for token in word_tokenize(title):\n",
    "        token_dic[token.lower()] += 1\n",
    "\n",
    "# Keep only tokens whose count lies strictly between 1000 and 30000 and number\n",
    "# them consecutively in first-seen order\n",
    "token_idx = {}\n",
    "for token, count in token_dic.items():\n",
    "    if 1000 < count < 30000:\n",
    "        token_idx[token] = len(token_idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tally category occurrences over books that have a title and a non-empty category list\n",
    "category_dic = defaultdict(int)\n",
    "for book in data:\n",
    "    if book['title'] == '' or book['category'] == []:\n",
    "        continue\n",
    "    for category in book['category']:\n",
    "        category_dic[category] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assign consecutive label ids to categories counted more than 15000 and fewer than 100000 times\n",
    "category_idx = {}\n",
    "for category, count in category_dic.items():\n",
    "    if 15000 < count < 100000:\n",
    "        category_idx[category] = len(category_idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Religion & Spirituality': 0,\n",
       " \"Children's eBooks\": 1,\n",
       " 'Health, Fitness & Dieting': 2,\n",
       " 'Science Fiction & Fantasy': 3,\n",
       " 'Business & Money': 4,\n",
       " 'Romance': 5}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the category name -> label id mapping\n",
    "category_idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_years = [2012, 2013, 2014, 2015, 2016]\n",
    "start_year = target_years[0]\n",
    "\n",
    "# book_category_years[y][c]: number of titled books published in target_years[y]\n",
    "# whose first category found in category_idx has label c\n",
    "book_category_years = [[0] * len(category_idx) for _ in target_years]\n",
    "for book in data:\n",
    "    if book['title'] == '':\n",
    "        continue\n",
    "    details = book['details']\n",
    "    if 'Publication Date:' not in details or not details['Publication Date:']:\n",
    "        continue\n",
    "    # The year is whatever follows the last ', ' in the date string\n",
    "    year = int(details['Publication Date:'].split(', ')[-1])\n",
    "    if year not in target_years:\n",
    "        continue\n",
    "    year_idx = year - start_year\n",
    "    for category in book['category']:\n",
    "        if category in category_idx:\n",
    "            # Only the first matching category is counted for a book\n",
    "            book_category_years[year_idx][category_idx[category]] += 1\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Output directory for the files written by this notebook; created on first use\n",
    "datapath = '../data/Amazon-Kindle/'\n",
    "if not os.path.exists(datapath):\n",
    "    os.makedirs(datapath)\n",
    "\n",
    "# Persist the number of tasks (one per target year) and of classes (one per kept category)\n",
    "num_task = len(target_years)\n",
    "num_class = len(category_idx)\n",
    "with open(datapath + 'statistics', 'wb') as file:\n",
    "    pickle.dump((num_task, num_class), file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_feature(text):\n",
    "    \"\"\"Return the token-count vector of `text` over the `token_idx` vocabulary.\n",
    "\n",
    "    The text is split with `word_tokenize`; each token is lower-cased and, if it\n",
    "    is a key of `token_idx`, the count at its index is incremented. The result\n",
    "    is a list of ints of length `len(token_idx)`.\n",
    "    \"\"\"\n",
    "    counts = [0] * len(token_idx)\n",
    "    for token in word_tokenize(text):\n",
    "        position = token_idx.get(token.lower())\n",
    "        if position is not None:\n",
    "            counts[position] += 1\n",
    "    return counts\n",
    "    \n",
    "\n",
    "# all_books: asin -> year, title feature vector, label and related asins\n",
    "# book_year_id[t]: asins assigned to time slot t (books from before start_year go to slot 0)\n",
    "all_books = {}\n",
    "book_year_id = [set() for _ in range(6)]\n",
    "for book in data:\n",
    "    if book['title'] == '':\n",
    "        continue\n",
    "    details = book['details']\n",
    "    if 'Publication Date:' not in details or not details['Publication Date:']:\n",
    "        continue\n",
    "    year = int(details['Publication Date:'].split(', ')[-1])\n",
    "    year_idx = max(year - start_year, 0)\n",
    "    if year_idx >= num_task:\n",
    "        continue\n",
    "    # A book is labelled with the first of its categories present in category_idx;\n",
    "    # books with no such category are not recorded\n",
    "    for category in book['category']:\n",
    "        if category not in category_idx:\n",
    "            continue\n",
    "        asin = book['asin']\n",
    "        all_books[asin] = {\n",
    "            'year': year,\n",
    "            'feature': get_feature(book['title']),\n",
    "            'label': category_idx[category],\n",
    "            'related_books': set(book['also_view']) | set(book['also_buy']),\n",
    "        }\n",
    "        book_year_id[year_idx].add(asin)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "id2idx = {}\n",
    "for book_id in all_books:\n",
    "    id2idx[book_id] = len(id2idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Graph(num_nodes=42705, num_edges=60616,\n",
      "      ndata_schemes={'new_node_mask': Scheme(shape=(), dtype=torch.bool), 'x': Scheme(shape=(457,), dtype=torch.int64), 'y': Scheme(shape=(), dtype=torch.int64), 'node_idxs': Scheme(shape=(), dtype=torch.int64)}\n",
      "      edata_schemes={})\n",
      "Graph(num_nodes=40075, num_edges=58846,\n",
      "      ndata_schemes={'new_node_mask': Scheme(shape=(), dtype=torch.bool), 'x': Scheme(shape=(457,), dtype=torch.int64), 'y': Scheme(shape=(), dtype=torch.int64), 'node_idxs': Scheme(shape=(), dtype=torch.int64)}\n",
      "      edata_schemes={})\n",
      "Graph(num_nodes=51206, num_edges=121078,\n",
      "      ndata_schemes={'new_node_mask': Scheme(shape=(), dtype=torch.bool), 'x': Scheme(shape=(457,), dtype=torch.int64), 'y': Scheme(shape=(), dtype=torch.int64), 'node_idxs': Scheme(shape=(), dtype=torch.int64)}\n",
      "      edata_schemes={})\n",
      "Graph(num_nodes=54092, num_edges=197812,\n",
      "      ndata_schemes={'new_node_mask': Scheme(shape=(), dtype=torch.bool), 'x': Scheme(shape=(457,), dtype=torch.int64), 'y': Scheme(shape=(), dtype=torch.int64), 'node_idxs': Scheme(shape=(), dtype=torch.int64)}\n",
      "      edata_schemes={})\n",
      "Graph(num_nodes=34332, num_edges=125474,\n",
      "      ndata_schemes={'new_node_mask': Scheme(shape=(), dtype=torch.bool), 'x': Scheme(shape=(457,), dtype=torch.int64), 'y': Scheme(shape=(), dtype=torch.int64), 'node_idxs': Scheme(shape=(), dtype=torch.int64)}\n",
      "      edata_schemes={})\n"
     ]
    },
    {
     "ename": "IndexError",
     "evalue": "tensors used as indices must be long, byte or bool tensors",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mIndexError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[0;32mIn [14], line 40\u001b[0m\n\u001b[1;32m     37\u001b[0m \u001b[38;5;66;03m#     node_idxs[idx] = id2idx[book_id]\u001b[39;00m\n\u001b[1;32m     38\u001b[0m \u001b[38;5;66;03m#     g.ndata['num_new_nodes'] = torch.tensor([num_new_books for i in range(len(id2idx_t))])\u001b[39;00m\n\u001b[1;32m     39\u001b[0m     node_mask \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor([\u001b[38;5;28;01mFalse\u001b[39;00m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(id2idx_t))])\n\u001b[0;32m---> 40\u001b[0m     \u001b[43mnode_mask\u001b[49m\u001b[43m[\u001b[49m\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mrange\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mnum_new_books\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m     41\u001b[0m     g\u001b[38;5;241m.\u001b[39mndata[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnew_node_mask\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m node_mask\n\u001b[1;32m     42\u001b[0m     g\u001b[38;5;241m.\u001b[39mndata[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m node_features\n",
      "\u001b[0;31mIndexError\u001b[0m: tensors used as indices must be long, byte or bool tensors"
     ]
    }
   ],
   "source": [
    "import dgl\n",
    "import torch\n",
    "\n",
    "\n",
    "g_list = []\n",
    "# Build and save one graph per time slot. Only num_task slots can contain books\n",
    "# (later indices are skipped when book_year_id is filled), so iterating further\n",
    "# would only produce an empty graph.\n",
    "for year_idx in range(num_task):\n",
    "    # Local node numbering: the books of this slot come first (0..num_new_books-1),\n",
    "    # followed by related books published in this year or earlier.\n",
    "    id2idx_t = {}\n",
    "    for book_id in book_year_id[year_idx]:\n",
    "        id2idx_t[book_id] = len(id2idx_t)\n",
    "    num_new_books = len(id2idx_t)\n",
    "\n",
    "    # Collect every (book, related book) pair once; the list is reused for the edges.\n",
    "    max_year = year_idx + start_year\n",
    "    pairs = []\n",
    "    for book_id in book_year_id[year_idx]:\n",
    "        for rel_id in all_books[book_id]['related_books']:\n",
    "            if rel_id in all_books and all_books[rel_id]['year'] <= max_year:\n",
    "                if rel_id not in id2idx_t:\n",
    "                    id2idx_t[rel_id] = len(id2idx_t)\n",
    "                pairs.append((id2idx_t[book_id], id2idx_t[rel_id]))\n",
    "\n",
    "    num_nodes = len(id2idx_t)\n",
    "    node_features = [[] for _ in range(num_nodes)]\n",
    "    node_idxs = [-1] * num_nodes\n",
    "    class_label = [-1] * num_nodes\n",
    "    for book_id, idx in id2idx_t.items():\n",
    "        node_idxs[idx] = id2idx[book_id]  # position of the book in the global id2idx mapping\n",
    "        node_features[idx] = all_books[book_id]['feature']\n",
    "        class_label[idx] = all_books[book_id]['label']\n",
    "\n",
    "    g = dgl.DGLGraph()\n",
    "    g.add_nodes(num_nodes)\n",
    "    for src, dst in pairs:\n",
    "        # add both directions so the relation is symmetric\n",
    "        g.add_edges(src, dst)\n",
    "        g.add_edges(dst, src)\n",
    "\n",
    "    # Boolean mask selecting the books of the current slot. Slice assignment also\n",
    "    # works when the slot is empty, unlike indexing with torch.tensor([...]), which\n",
    "    # yields a float tensor for an empty list and raises IndexError.\n",
    "    node_mask = torch.zeros(num_nodes, dtype=torch.bool)\n",
    "    node_mask[:num_new_books] = True\n",
    "\n",
    "    node_features = torch.tensor(node_features)\n",
    "    class_label = torch.tensor(class_label)\n",
    "    node_idxs = torch.tensor(node_idxs)\n",
    "    g.ndata['num_new_nodes'] = torch.tensor([num_new_books] * num_nodes)\n",
    "    g.ndata['new_node_mask'] = node_mask\n",
    "    g.ndata['x'] = node_features\n",
    "    g.ndata['y'] = class_label\n",
    "    g.ndata['node_idxs'] = node_idxs\n",
    "    print(g)\n",
    "    g_list.append(g)\n",
    "    with open(datapath + f'graph_{year_idx}_by_edges', 'wb') as file:\n",
    "        pickle.dump(g, file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# f = open('statistics.csv', 'w')\n",
    "# writer = csv.writer(f)\n",
    "\n",
    "# for i in range (len(book_category_years)):\n",
    "#     row = ['{} ({:.2f})'.format(x, 100*x/sum(book_category_years[i])) for x in book_category_years[i]]\n",
    "#     writer.writerow(row)\n",
    "# f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([182985, 457])\n"
     ]
    }
   ],
   "source": [
    "# import dgl\n",
    "# import torch\n",
    "# import pickle\n",
    "# # create graph for all books from 2011-2016\n",
    "# id2idx = {}\n",
    "\n",
    "# for time_slot in range(6):\n",
    "#     for book_id in books_year[time_slot]:\n",
    "#         if book_id not in id2idx:\n",
    "#             id2idx[book_id] = len(id2idx)\n",
    "\n",
    "# print (len(id2idx))\n",
    "\n",
    "# Build one graph over every book in id2idx, with title features and labels as node data\n",
    "num_nodes = len(id2idx)\n",
    "node_features = [[] for _ in range(num_nodes)]\n",
    "class_label = [-1] * num_nodes\n",
    "g = dgl.DGLGraph()\n",
    "g.add_nodes(num_nodes)\n",
    "for time_slot in range(6):\n",
    "    for book_id in book_year_id[time_slot]:\n",
    "        entry = all_books[book_id]\n",
    "        idx = id2idx[book_id]\n",
    "        node_features[idx] = entry['feature']\n",
    "        class_label[idx] = entry['label']\n",
    "        for rel_id in entry['related_books']:\n",
    "            # link only to known books published no later than this one\n",
    "            if rel_id not in id2idx or all_books[rel_id]['year'] > entry['year']:\n",
    "                continue\n",
    "            rel_idx = id2idx[rel_id]\n",
    "            g.add_edges(idx, rel_idx)\n",
    "            g.add_edges(rel_idx, idx)\n",
    "\n",
    "node_features = torch.tensor(node_features)\n",
    "class_label = torch.tensor(class_label)\n",
    "print(node_features.size())\n",
    "g.ndata['x'] = node_features\n",
    "g.ndata['y'] = class_label\n",
    "with open(datapath + 'full_graph', 'wb') as file:\n",
    "    pickle.dump(g, file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pickle the graph currently bound to `g` under the name 'graph_whole'\n",
    "with open(datapath + 'graph_whole', 'wb') as out_file:\n",
    "    pickle.dump(g, out_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
