{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle as pkl\n",
    "import math\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from scipy.sparse import csr_matrix\n",
    "import time\n",
    "import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# some helper functions to load raw data\n",
    "\n",
    "def read_user_rating_records(dir_path, rating_file):\n",
    "    col_names = ['user_id', 'item_id', 'rating', 'timestamp']\n",
    "    data_records = pd.read_csv(dir_path + rating_file, sep=',', names=col_names, engine='python')\n",
    "    return data_records\n",
    "\n",
    "def remove_infrequent_node(df, node_type, min_counts=5):\n",
    "    n_node_type = len(df[node_type].unique())\n",
    "    counts = df[node_type].value_counts()\n",
    "    df = df[df[node_type].isin(counts[counts >= min_counts].index)]\n",
    "    n_removed = n_node_type - len(df[node_type].unique())\n",
    "    return df, n_removed\n",
    "\n",
    "def save_obj(obj, name):\n",
    "    with open(name + '.pkl', 'wb') as f:\n",
    "        pkl.dump(obj, f)\n",
    "        \n",
    "def load_obj(path, name):\n",
    "    with open(path + name, 'rb') as f:\n",
    "        return pkl.load(f, encoding='latin1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dataset(dataset_name):\n",
    "    # Amazon_CDs\n",
    "    if dataset_name in ['Amazon_Books', 'Amazon_CDs']:\n",
    "        dir_path = 'Amazon_Books/' if dataset_name == 'Amazon_Books' else 'Amazon_CDs/'\n",
    "        rating_file = 'ratings_Books.csv' if dataset_name == 'Amazon_Books' else 'ratings_CDs_and_Vinyl.csv'\n",
    "\n",
    "        data_records = read_user_rating_records(dir_path, rating_file)\n",
    "        data_records.loc[data_records.rating <= 3, 'rating'] = 0\n",
    "        data_records.loc[data_records.rating > 3, 'rating'] = 1\n",
    "        data_records = data_records[data_records.rating > 0]\n",
    "        \n",
    "    # Amazon_Electrionics\n",
    "    elif dataset_name == 'Amazon_Electronics':\n",
    "        data_records = read_user_rating_records('Amazon_Electronics/', 'ratings_Electronics-original.csv')\n",
    "        data_records.loc[data_records.rating <= 3, 'rating'] = 0\n",
    "        data_records.loc[data_records.rating > 3, 'rating'] = 1\n",
    "        data_records = data_records[data_records.rating > 0]\n",
    "        \n",
    "    elif dataset_name == 'Gowalla':\n",
    "        rating_file = './gowalla/Gowalla_totalCheckins.txt'\n",
    "        \n",
    "        dtypes = {'0': np.int64, '4': np.int64, '1': np.float64}\n",
    "        data_records = pd.read_csv(rating_file, sep=r'\\t', engine='python', encoding='latin-1',\n",
    "                                   names=['user_id','timestamp','item_id'], usecols=[0,1,4], parse_dates=[1], dtype=dtypes)\n",
    "    \n",
    "    elif dataset_name == 'moivelens':\n",
    "        rating_file = './ml-10M/ratings.dat'\n",
    "\n",
    "        dtypes = {'user_id': np.int64, 'item_id': np.int64, 'ratings': np.float32, 'timestamp': np.float64}\n",
    "        data_records = pd.read_csv(rating_file, sep=r'\\:\\:', engine='python', encoding='latin-1',\n",
    "                           header=None, names=['user_id', 'item_id', 'ratings', 'timestamp'], dtype=dtypes)\n",
    "        \n",
    "        data_records.loc[data_records.ratings <= 3, 'ratings'] = 0\n",
    "        data_records.loc[data_records.ratings > 3, 'ratings'] = 1\n",
    "        data_records = data_records[data_records.ratings > 0]\n",
    "    \n",
    "    elif dataset_name == 'lastfm-2k':\n",
    "        rating_file = './lastfm-2k/user_taggedartists-timestamps.dat'\n",
    "\n",
    "        dtypes = {'user_id': np.int64, 'item_id': np.int64}\n",
    "        data_records = pd.read_csv(rating_file, sep=r'\\t', engine='python', encoding='latin-1', \n",
    "                           header=0, names=['user_id', 'item_id', 'tag_id', 'timestamp'], dtype=dtypes)\n",
    "    \n",
    "    elif dataset_name == 'tb2014':\n",
    "        rating_file = './tb2014/tianchi_mobile_recommend_train_user.csv'\n",
    "        \n",
    "        col_names = ['user_id', 'item_id', 'behavior_type', 'time']\n",
    "        data_records = pd.read_csv(rating_file, sep=',', usecols=col_names, engine='python')\n",
    "        data_records = data_records.rename(columns={\"time\": \"timestamp\"})\n",
    "        \n",
    "    elif dataset_name == 'tb2015':\n",
    "        rating_file = './tb2015/ijcai2016_taobao.csv'\n",
    "        \n",
    "        col_names = ['use_ID', 'sel_ID', 'act_ID', 'time']\n",
    "        data_records = pd.read_csv(rating_file, sep=',', usecols=col_names, engine='python', chunksize=1e7)\n",
    "        data_records = pd.concat(data_records)\n",
    "        data_records = data_records.rename(columns={\"use_ID\": \"user_id\",\n",
    "                                                    \"sel_ID\": \"item_id\",\n",
    "                                                    \"act_ID\" : \"behavior_type\",\n",
    "                                                    \"time\": \"timestamp\"})\n",
    "    elif dataset_name == 'alimama':\n",
    "        rating_file = './alimama/UserBehavior/UserBehavior.csv'\n",
    "        \n",
    "        col_names = ['user_id', 'item_id', 'behavior_type', 'timestamp']\n",
    "        data_records = pd.read_csv(rating_file, sep=',', usecols=[0,1,3,4], names=col_names, engine='python')\n",
    "    \n",
    "    elif dataset_name == 'yelp':\n",
    "        rating_file = './yelp_reviews_RV.csv'\n",
    "        col_names = ['user_id', 'business_id', 'rating', 'date']\n",
    "        data_records = pd.read_csv(rating_file, engine='python')\n",
    "        data_records = data_records.rename(columns={\"business_id\": \"item_id\",\n",
    "                                                    \"date\": \"timestamp\"})\n",
    "        \n",
    "        data_records.loc[data_records.rating <= 3, 'rating'] = 0\n",
    "        data_records.loc[data_records.rating > 3, 'rating'] = 1\n",
    "        data_records = data_records[data_records.rating > 0]\n",
    "    else:\n",
    "        print(\"Error!!!!\")\n",
    "\n",
    "    return data_records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# filter out user node with degree less than u_thre\n",
    "# filter out item node with degree less than i_thre\n",
    "# sort filtered data by timestamp\n",
    "\n",
    "def filter_dataset(u_thre, i_thre, data):\n",
    "\n",
    "    filtered_data = data.copy()\n",
    "    filtered_data, u_removed = remove_infrequent_node(filtered_data, 'user_id', u_thre)\n",
    "    filtered_data, i_removed = remove_infrequent_node(filtered_data, 'item_id', i_thre)\n",
    "\n",
    "    while(u_removed != 0 or i_removed != 0):\n",
    "        filtered_data, u_removed = remove_infrequent_node(filtered_data, 'user_id', u_thre)\n",
    "        filtered_data, i_removed = remove_infrequent_node(filtered_data, 'item_id', i_thre)\n",
    "\n",
    "    print('user with < {} and and items with < {} interactions are removed'.format(u_thre, i_thre))\n",
    "    print('num of users:{}, num of items:{}'.format(len(filtered_data['user_id'].unique()), len(filtered_data['item_id'].unique())))\n",
    "\n",
    "    filtered_data = filtered_data.sort_values('timestamp')\n",
    "    return filtered_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# example to pre-process yelp dataset\n",
    "\n",
    "DATASET = \"yelp\"\n",
    "U_FILTER, I_FILTER = 10,10\n",
    "SAVEFILE = DATASET+'-nodup-'+str(U_FILTER)+'-'+str(I_FILTER)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"=========================================================\")\n",
    "print(\"Dataset: \", DATASET)\n",
    "\n",
    "data = load_dataset(DATASET)\n",
    "print(\"Raw data length: \", len(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No dup data length:  5116476\n"
     ]
    }
   ],
   "source": [
    "# for taobao datasets only\n",
    "# to remove unwanted edges\n",
    "\n",
    "# data = data.loc[data['behavior_type'] != 'pv'] # alimama\n",
    "# data = data.loc[data['behavior_type'] == 'buy'] # tb2015"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "user with < 10 and and items with < 10 interactions are removed\n",
      "Filtered data length:  1712236\n",
      "First and last timestamps:  2004-10-19 02:46:40 2019-12-13 15:45:49\n",
      "#users, #items :  68423 39327\n"
     ]
    }
   ],
   "source": [
    "# remove duplicates\n",
    "\n",
    "data = data.groupby(['user_id', 'item_id'], sort=False).last().reset_index().sort_values('timestamp')\n",
    "print(\"No dup data length: \", len(data))\n",
    "\n",
    "# filter low degree nodes\n",
    "\n",
    "filtered_data = filter_dataset(U_FILTER, I_FILTER, data)\n",
    "n_data = len(filtered_data)\n",
    "print(\"Filtered data length: \", n_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1712236"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# print time range and dataset stats after pre-processing\n",
    "\n",
    "print(\"First and last timestamps: \", filtered_data.iloc[0]['timestamp'], filtered_data.iloc[-1]['timestamp'])\n",
    "n_user, n_item = len(filtered_data['user_id'].unique()), len(filtered_data['item_id'].unique())\n",
    "print(\"#users, #items : \", n_user, n_item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save pre-processed data\n",
    "\n",
    "save_obj(filtered_data, SAVEFILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Below is some functions to get some insight of data blocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_inc_blocks_info(filtered_data, base_user_set, base_item_set):\n",
    "    n_records = len(filtered_data)\n",
    "    n_split = 20\n",
    "    n_newedge = n_records/n_split\n",
    "\n",
    "    n_edge_on_new_node_list = []\n",
    "    n_edge_btw_new_node_list = []\n",
    "    n_day_list = []\n",
    "    n_new_user_list, n_new_item_list = [], []\n",
    "    node_base_overlap_list = []\n",
    "    acc_user = base_user_set\n",
    "    acc_item = base_item_set\n",
    "    \n",
    "    base_data = filtered_data[0:int(n_records*0.5)]\n",
    "    base_user = set(base_data['user_id'].unique())\n",
    "    base_item = set(base_data['item_id'].unique())\n",
    "    \n",
    "    \n",
    "    for i in range(10, n_split):\n",
    "        sub_data = filtered_data[int(n_records*i/n_split):int(n_records*(i+1)/n_split)]\n",
    "        acc_data = filtered_data[0:int(n_records*(i+1)/n_split)]\n",
    "\n",
    "        cur_acc_user = set(acc_data['user_id'].unique())\n",
    "        new_user = cur_acc_user - acc_user\n",
    "        acc_user = cur_acc_user\n",
    "        cur_user = set(sub_data['user_id'].unique())\n",
    "        \n",
    "        cur_acc_item = set(acc_data['item_id'].unique())\n",
    "        new_item = cur_acc_item - acc_item\n",
    "        acc_item = cur_acc_item\n",
    "        cur_item = set(sub_data['item_id'].unique())\n",
    "        \n",
    "        n_edge_on_new_node = len(sub_data[sub_data['user_id'].isin(new_user) | sub_data['item_id'].isin(new_item)])\n",
    "        n_edge_btw_new_node = len(sub_data[sub_data['user_id'].isin(new_user) & sub_data['item_id'].isin(new_item)])\n",
    "        n_edge_on_new_node_list.append(n_edge_on_new_node)\n",
    "        n_edge_btw_new_node_list.append(n_edge_btw_new_node)\n",
    "        \n",
    "        start_day = sub_data.iloc[0]['timestamp']\n",
    "        end_day = sub_data.iloc[-1]['timestamp']\n",
    "        # Gowalla\n",
    "        diff_day = (end_day - start_day).days \n",
    "        \n",
    "#         # Tb2015 date format\n",
    "#         start_day = time.mktime(datetime.datetime.strptime(str(start_day), \"%Y%m%d\").timetuple()) \n",
    "#         end_day = time.mktime(datetime.datetime.strptime(str(end_day), \"%Y%m%d\").timetuple())\n",
    "#         # Movie-lens date format\n",
    "#         diff_day = int((end_day - start_day) / 3600 / 24) # movie-lens\n",
    "#         # Some other dataset date format\n",
    "#         diff_day = int((end_day - start_day) / 3600 / 24 / 1000)\n",
    "\n",
    "        n_day_list.append(diff_day)\n",
    "        \n",
    "        node_base_overlap = (len((cur_user & base_user_set)) + len((cur_item & base_item_set))) \\\n",
    "                            / (len((cur_user | base_user_set)) + len((cur_item | base_item_set)))\n",
    "        \n",
    "        node_base_overlap_2 = (len((cur_user & base_user)) + len((cur_item & base_item))) \\\n",
    "                            / (len(base_user_set) + len(base_item_set))\n",
    "        \n",
    "        n_new_user, n_new_item = len(new_user), len(new_item)\n",
    "        n_new_user_list.append(n_new_user)\n",
    "        n_new_item_list.append(n_new_item)\n",
    "        node_base_overlap_list.append(node_base_overlap)\n",
    "        \n",
    "    return n_edge_on_new_node_list, n_edge_btw_new_node_list, n_day_list, n_new_user_list, n_new_item_list, node_base_overlap_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "base_block = filtered_data[:int(n_data * 0.5)]\n",
    "inc_data = filtered_data[int(n_data * 0.5):]\n",
    "\n",
    "print(\"% of user in base block: \", len(base_block['user_id'].unique()) / n_user)\n",
    "print(\"% of item in base block: \", len(base_block['item_id'].unique()) / n_item)\n",
    "\n",
    "n_edge_on_new_node_list, \\\n",
    "n_edge_btw_new_node_list, \\\n",
    "n_day_list, \\\n",
    "n_new_user_list, \\\n",
    "n_new_item_list, \\\n",
    "node_base_overlap_list = get_inc_blocks_info(inc_data, set(base_block['user_id'].unique()), set(base_block['item_id'].unique()))\n",
    "\n",
    "print(\"#edge for 1 inc block: \", int(len(inc_data)/10))\n",
    "print(\"edge_on_new_node_list: \", n_edge_on_new_node_list)\n",
    "print(\"edge_btw_new_node_list: \", n_edge_btw_new_node_list)\n",
    "print(\"timespan(mean, std): \", np.mean(n_day_list), np.std(n_day_list))\n",
    "print(\"new_user_node(mean, std): \", np.mean(n_new_user_list), np.std(n_new_user_list))\n",
    "print(\"new_item_node(mean, std): \", np.mean(n_new_item_list), np.std(n_new_item_list))\n",
    "print(\"node_base_overlap_list: \", node_base_overlap_list)\n",
    "\n",
    "print(\"Done.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
