{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from datetime import datetime\n",
    "from tqdm import tqdm\n",
    "import preprocessing as pre"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Yoochoose"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def agg_sessions(events: pd.DataFrame) -> pd.DataFrame:\n",
    "    sessions_dict = dict()\n",
    "    for row in tqdm(events.itertuples()):\n",
    "        if row.session_id in sessions_dict:\n",
    "            sessions_dict[row.session_id][\"ts\"].append(row.ts)\n",
    "            sessions_dict[row.session_id][\"touchpoint\"].append(row.touchpoint)\n",
    "            sessions_dict[row.session_id][\"category_id\"].append(row.category_id)\n",
    "        else:\n",
    "            sessions_dict[row.session_id] = {\n",
    "                \"session_id\": row.session_id,\n",
    "                \"first_ts\": row.ts,\n",
    "                \"month\": row.month,\n",
    "                \"ts\": [row.ts],\n",
    "                \"touchpoint\": [row.touchpoint],\n",
    "                \"category_id\": [row.category_id],\n",
    "            }\n",
    "    return pd.DataFrame.from_records(list(sessions_dict.values()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# download data from: https://www.kaggle.com/datasets/chadgostopp/recsys-challenge-2015\n",
    "path = \"datasets/yoochoose/raw\"\n",
    "data = pd.read_csv(f\"{path}/yoochoose-clicks.dat\", names=[\"session_id\", \"ts\", \"touchpoint\", \"category_id\"])\n",
    "data = data.astype({\"session_id\": \"UInt32\", \"touchpoint\": \"UInt32\", \"category_id\": \"category\"})\n",
    "# transform to timestamp (in seconds)\n",
    "data.ts = data.ts.apply(lambda x: int(datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp()))\n",
    "# data.ts = pd.to_datetime(data.ts)\n",
    "data.sort_values(by=\"ts\", inplace=True)\n",
    "\n",
    "data_buys = pd.read_csv(\"{path}/yoochoose-buys.dat\", names=[\"session_id\", \"ts\", \"touchpoint\", \"price\", \"quantity\"])\n",
    "data_buys = data_buys.astype({\"session_id\": \"UInt32\", \"touchpoint\": \"UInt32\",})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 33003944 entries, 1163646 to 32235699\n",
      "Data columns (total 4 columns):\n",
      " #   Column       Dtype   \n",
      "---  ------       -----   \n",
      " 0   session_id   UInt32  \n",
      " 1   ts           int64   \n",
      " 2   touchpoint   UInt32  \n",
      " 3   category_id  category\n",
      "dtypes: UInt32(2), category(1), int64(1)\n",
      "memory usage: 881.3 MB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train_data: 27727630 (84.01%)\n",
      "test_data: 5276314 (15.99%)\n",
      "\n",
      "CPU times: total: 5.41 s\n",
      "Wall time: 5.42 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# split the data such that the two last months are used for evaluation.\n",
    "data[\"month\"] = pd.to_datetime(data.ts, unit='s').dt.month\n",
    "train_data = data[data.month < data.month.max()].reset_index(drop=True)\n",
    "test_data = data[data.month >= data.month.max()].reset_index(drop=True)\n",
    "print(f\"\"\"train_data: {len(train_data)} ({round(len(train_data)/len(data)*100, 2)}%)\n",
    "test_data: {len(test_data)} ({round(len(test_data)/len(data)*100, 2)}%)\\n\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Remaining events for training: 20725961. 7001669 of 27727630 events removed!\n",
      "Remaining events for testing: 3902077. 1374237 of 5276314 events removed!\n",
      "\n",
      "CPU times: total: 3.89 s\n",
      "Wall time: 3.9 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# remove sessions with less than three interactions\n",
    "# train data\n",
    "len_data_old = len(train_data)\n",
    "train_data = train_data[train_data.session_id.map(train_data.session_id.value_counts()) > 2].reset_index(drop=True)\n",
    "print(f\"Remaining events for training: {len(train_data)}. {len_data_old - len(train_data)} of {len_data_old} events removed!\")\n",
    "\n",
    "# test data\n",
    "len_data_old = len(test_data)\n",
    "test_data = test_data[test_data.session_id.map(test_data.session_id.value_counts()) > 2].reset_index(drop=True)\n",
    "print(f\"Remaining events for testing: {len(test_data)}. {len_data_old - len(test_data)} of {len_data_old} events removed!\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "20725961it [00:49, 422674.78it/s]\n",
      "3902077it [00:09, 406763.00it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3752161 user sequences for training created and 679773 for testing.\n",
      "\n",
      "CPU times: total: 1min 32s\n",
      "Wall time: 1min 32s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Aggregate user history for each user and split sessions by week \n",
    "# Split by weeks is only necessary for datasets without session_id\n",
    "train_data = agg_sessions(train_data)\n",
    "train_data[\"len\"] = [len(t) for t in train_data.touchpoint.to_numpy()]\n",
    "\n",
    "test_data = agg_sessions(test_data)\n",
    "test_data[\"len\"] = [len(t) for t in test_data.touchpoint.to_numpy()]\n",
    "print(f\"{len(train_data)} user sequences for training created and {len(test_data)} for testing.\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train data:\n",
      "Sequences with more than 30.0 (99.50% quantile) events removed.\n",
      "Remaining sessions: 3731708. 20453 of 3752161 sequences removed!\n",
      "test data:\n",
      "Sequences with more than 32.0 (99.50% quantile) events removed.\n",
      "Remaining sessions: 676286. 3487 of 679773 sequences removed!\n",
      "CPU times: total: 1.98 s\n",
      "Wall time: 1.99 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# remove sessions with less than three interactions\n",
    "# train data\n",
    "print(\"train data:\")\n",
    "train_data = pre.remove_sessions_by_length(train_data, 2, 0.995)\n",
    "\n",
    "#test data\n",
    "print(\"test data:\")\n",
    "test_data = pre.remove_sessions_by_length(test_data, 2, 0.995)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data.len.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train data:\n",
      "            count  percentage\n",
      "purchase                     \n",
      "0         3420857       91.67\n",
      "1          310851        8.33\n",
      "test data:\n",
      "           count  percentage\n",
      "purchase                    \n",
      "0         616611       91.18\n",
      "1          59675        8.82\n",
      "CPU times: total: 1.75 s\n",
      "Wall time: 1.77 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# map if sequences have a purchase or not \n",
    "# train data\n",
    "seq_with_purchase_set = set(data_buys.session_id.unique())\n",
    "train_data[\"purchase\"] = [1 if i in seq_with_purchase_set else 0 for i in train_data.session_id.to_numpy()]\n",
    "print(f\"train data:\\n{pd.concat([train_data.purchase.value_counts(), train_data.purchase.value_counts(normalize=True).mul(100).round(2)], axis=1, keys=['count', 'percentage'])}\")\n",
    "\n",
    "# test_data\n",
    "test_data[\"purchase\"] = [1 if i in seq_with_purchase_set else 0 for i in test_data.session_id.to_numpy()]\n",
    "print(f\"test data:\\n{pd.concat([test_data.purchase.value_counts(), test_data.purchase.value_counts(normalize=True).mul(100).round(2)], axis=1, keys=['count', 'percentage'])}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data.to_pickle(\"datasets/yoochoose/preprocessed/train_sequences.pkl\")\n",
    "test_data.to_pickle(\"datasets/yoochoose/preprocessed/test_sequences.pkl\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### N-Gram Creation for Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8565030 ngrams created.\n",
      "44435 different tokens exist for embedding Training.\n",
      "\n",
      "            \n",
      "CPU times: total: 2min 13s\n",
      "Wall time: 2min 13s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "trigrams = pre.create_ngram_dataset(train_data, 2, add_event_type=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1719739 ngrams created.\n",
      "27016 different tokens exist for embedding Training.\n",
      "\n",
      "            \n",
      "\n",
      "    Number of unknown tokens for testing\n",
      "    2511\n",
      "    \n"
     ]
    }
   ],
   "source": [
    "testdata_trigrams = pre.create_ngram_dataset(test_data, 2, add_event_type=False)\n",
    "print(f\"\"\"\n",
    "    Number of unknown tokens for testing\n",
    "    {len(set(testdata_trigrams[\"vocab_map\"].keys() - set(trigrams[\"vocab_map\"].keys())))}\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# store ngrams and vocabulary map\n",
    "with open(\"datasets/yoochoose/preprocessed/trigrams.pkl\", \"wb\") as f:\n",
    "    pickle.dump(trigrams, f, protocol=pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Postprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "train_data, test_data = pd.read_pickle(\"datasets/yoochoose/preprocessed/train_sequences.pkl\"), pd.read_pickle(\"datasets/yoochoose/preprocessed/test_sequences.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_mem_efficiant(data: pd.DataFrame) -> pd.DataFrame:\n",
    "    data.ts = data.ts.apply(tuple)\n",
    "    data.touchpoint = data.touchpoint.apply(tuple)\n",
    "    data.category_id = data.category_id.apply(tuple)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: total: 4.58 s\n",
      "Wall time: 4.57 s\n"
     ]
    }
   ],
   "source": [
    "%%time \n",
    "train_data = make_mem_efficiant(train_data)\n",
    "test_data = make_mem_efficiant(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data.to_pickle(\"datasets/yoochoose/preprocessed/train_sequences.pkl\")\n",
    "test_data.to_pickle(\"datasets/yoochoose/preprocessed/test_sequences.pkl\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
