{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bee90647-6d2d-455f-889e-6b8a0fdc9cb0",
   "metadata": {},
   "source": [
    "# Temporal Point Process Data Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99d86017-6b15-431e-999e-092423ce4d87",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import json\n",
    "import pickle\n",
    "import requests\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from tqdm.auto import tqdm\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2625b6f-79d5-418c-bd3b-7891eb6c3780",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.set_option('display.max_rows', 100)\n",
    "pd.set_option('display.max_columns', 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fd7625e-2662-4085-b0bc-9f095a21b244",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_folder = os.path.join('..', 'data')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "ff4ad4d8-8621-4657-9fd9-7e5922060c78",
   "metadata": {},
   "source": [
    "## Stack Overflow Badges\n",
    "\n",
    "Download the `stackoverflow.com-Badges.7z` data from [Stack Exchange Data Dump](https://archive.org/details/stackexchange) and unzip it to `data/raw/stack_overflow/Badges.xml`. The data schema can be found from [here](https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede), and the badge types can be found from [there](https://meta.stackexchange.com/questions/67397/what-are-the-badges-i-can-earn-on-each-site-and-what-are-the-exact-criteria-for)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0188c1f9-7793-4055-a0ae-6bac10ebaa4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "!head ../data/raw/stack_overflow/Badges.xml"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19e98286-da4a-496e-a1f5-860a71d3202c",
   "metadata": {},
   "source": [
    "### Transforming the XML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9b2ee4e-733e-440a-9519-e85792024a8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import xml.etree.ElementTree as ET\n",
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce6dbab9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def xml_to_csv_chunked(xml_file, csv_file, chunk_size=1000):\n",
    "    context = ET.iterparse(xml_file, events=('start', 'end'))\n",
    "    context = iter(context)\n",
    "    event, root = next(context)\n",
    "    \n",
    "    with open(csv_file, mode='w', newline='') as file:\n",
    "        writer = csv.writer(file)\n",
    "        header_written = False\n",
    "        rows = []\n",
    "\n",
    "        for i, (event, elem) in tqdm(enumerate(context)):\n",
    "            if event == 'end' and elem.tag == 'row':\n",
    "                if not header_written:\n",
    "                    header = list(elem.attrib.keys())\n",
    "                    writer.writerow(header)\n",
    "                    header_written = True\n",
    "                \n",
    "                rows.append(list(elem.attrib.values()))\n",
    "                \n",
    "                if len(rows) >= chunk_size:\n",
    "                    writer.writerows(rows)\n",
    "                    rows = []\n",
    "\n",
    "                root.clear()\n",
    "\n",
    "        if rows:\n",
    "            writer.writerows(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c147883c-56f3-46f5-b1a6-ae2538a75903",
   "metadata": {},
   "outputs": [],
   "source": [
    "xml_file = f'{data_folder}/raw/stack_overflow/Badges.xml'\n",
    "csv_file = f'{data_folder}/raw/stack_overflow/badges.csv'\n",
    "xml_to_csv_chunked(xml_file, csv_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f858429a-587e-478f-9331-62f4d44aad79",
   "metadata": {},
   "source": [
    "### Loading Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "319f0299",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_badges = pd.read_csv(f\"{data_folder}/raw/stack_overflow/badges.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03709d32-a777-4c6a-98bc-7a927563ace6",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_badges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffabd1bd-d819-4a4e-b515-d3493b3dc003",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_badges.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d9cf3919-3d4c-49c2-bb1d-3683caaeaa10",
   "metadata": {},
   "source": [
    "### Preprocessing Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42ad68b8-46c7-4119-a350-e021a2a11bd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop NaNs and duplicates\n",
    "df_badges = df_badges.dropna(subset=['UserId', 'Date', 'Name'])\\\n",
    "    .drop_duplicates(subset=['UserId', 'Date', 'Name'], keep='first')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "918bd3b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop tag-affiliated badges\n",
    "df_badges = df_badges[~df_badges['TagBased']].copy()\n",
    "# Transform the date times\n",
    "df_badges['Date'] = pd.to_datetime(df_badges['Date'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "997585a2-9dff-4d49-9d22-b79221f10cc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_badges['Date'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f28b4146-d75b-480e-b38d-c399e2842f3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_badges['Name'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b17144ea-2714-4f44-8843-d5600557c63b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select 2-year data\n",
    "df_badges = df_badges[('2022-01-01' <= df_badges['Date']) & (df_badges['Date'] < '2024-01-01')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d99ebf54-52f2-41b3-9d6a-a59b421c0ca9",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_badges['Date'].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "70396337-4d14-43b7-be78-631b6a23a3e7",
   "metadata": {},
   "source": [
    "### Selecting Badges\n",
    "\n",
    "Badges:\n",
    "\n",
    "* Awarded multiple times: Buzz, Socratic, Enlightened, Guru, Lifejacket, Lifeboat, Nice Answer, Good Answer, Great Answer, Populist, Reversal (retired), Revival, Necromancer, Activist, Campaigner, Founder, Good Question, Great Question, Grassroots, Movement, Nice Question, Promoter, Revolution, Steward, Caucus, Constituent, Yearling, Not a Robot\n",
    "\n",
    "* Awarded once per question: Favorite Question, Stellar Question, Nice Question, Good Question, Great Question, Popular Question, Notable Question, Famous Question\n",
    "\n",
    "* Awarded once per answer: Favorite Answer, Stellar Answer\n",
    "\n",
    "* Awarded once per review queue: Custodian, Reviewer\n",
    "\n",
    "* Awarded once per post: Announcer, Booster, Publicist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "adcff8c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "badge_list = [\n",
    "    \"Buzz\", \"Socratic\", \"Enlightened\", \"Guru\", \"Lifejacket\", \"Lifeboat\",\n",
    "    \"Nice Answer\", \"Good Answer\", \"Great Answer\", \"Populist\", \"Revival\",\n",
    "    \"Necromancer\", \"Activist\", \"Campaigner\", \"Founder\", \"Good Question\",\n",
    "    \"Great Question\", \"Grassroots\", \"Movement\", \"Nice Question\", \"Promoter\",\n",
    "    \"Revolution\", \"Steward\", \"Caucus\", \"Constituent\", \"Yearling\", \"Not a Robot\",\n",
    "    \"Favorite Question\", \"Stellar Question\", \"Nice Question\", \"Good Question\",\n",
    "    \"Great Question\", \"Popular Question\", \"Notable Question\", \"Famous Question\",\n",
    "    \"Favorite Answer\", \"Stellar Answer\", \"Custodian\", \"Reviewer\", \"Announcer\",\n",
    "    \"Booster\", \"Publicist\",\n",
    "]\n",
    "badge_list = set(badge_list)\n",
    "len(badge_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "178f50ff-42f5-4e5c-ba47-6a87845c9ada",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select badges that can be awarded multiple times\n",
    "df_badges = df_badges[df_badges[\"Name\"].isin(badge_list)]\n",
    "len(df_badges)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "232bb924-63ca-4166-976a-f29e23f45de1",
   "metadata": {},
   "source": [
    "### Selecting Users"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bebd002c",
   "metadata": {},
   "outputs": [],
   "source": [
    "user_badge_counts = df_badges.groupby('UserId')['Name'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a500011-ce47-45ec-ab67-059f562867a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "user_badge_counts.describe().astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13f6777a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select users who have earned at 40-100 badges\n",
    "user_list = user_badge_counts[(user_badge_counts >= 40) & (user_badge_counts <= 100)].index\n",
    "len(user_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81a1bafb",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_badges = df_badges[df_badges[\"UserId\"].isin(user_list)]\n",
    "len(df_badges)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de6e1f3e-ac66-4b59-9052-7c3db5a96009",
   "metadata": {},
   "source": [
    "### Selecting Badges Again"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cfebeac8-012c-47b8-a70d-053c74a04d01",
   "metadata": {},
   "outputs": [],
   "source": [
    "badge_type_counts = df_badges['Name'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de6caf57-e00b-4fe0-8793-44a3fb378d43",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select badges which have been awarded at least 200 times\n",
    "badge_type_list = badge_type_counts[badge_type_counts >= 200].index\n",
    "len(badge_type_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf5199dd-e296-414b-9085-47cea9c58e63",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_badges = df_badges[df_badges[\"Name\"].isin(badge_type_list)]\n",
    "len(df_badges)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d547ada4-54ea-4cc0-9671-40785f62dcb5",
   "metadata": {},
   "source": [
    "### Splitting Sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cc02207",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_badges.groupby('UserId')['Date'].count().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "728e0cdc-fde6-4786-94a6-b86059309e7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_badges['Name'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "058301f7-b7fa-4351-bdb6-240aaff637e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_seq_splits(df, seq_col):\n",
    "    seq_ids = df[seq_col].unique().tolist()\n",
    "    seq_ids_train, seq_ids_val_test = train_test_split(seq_ids, train_size=0.8, random_state=0)\n",
    "    seq_ids_val, seq_ids_test = train_test_split(seq_ids_val_test, train_size=0.5, random_state=0)\n",
    "    seq_splits = {seq_id: 'train' for seq_id in seq_ids_train}\n",
    "    seq_splits.update({seq_id: 'dev' for seq_id in seq_ids_val})\n",
    "    seq_splits.update({seq_id: 'test' for seq_id in seq_ids_test})\n",
    "    print(f'train: {len(seq_ids_train)} seqs, val: {len(seq_ids_val)} seqs, test: {len(seq_ids_test)} seqs')\n",
    "    return seq_splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7f4cd52-9b8a-4e16-9a5a-2ba25adde46e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "badge_seq_splits = get_seq_splits(df=df_badges, seq_col='UserId')\n",
    "len(badge_seq_splits)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8ce01823-0c64-4f9f-a595-90d50d348a1f",
   "metadata": {},
   "source": [
    "### Saving Sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8af8ebaa",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_seqs(\n",
    "    df: pd.DataFrame, seq_col: str, seq_splits: dict,\n",
    "    time_col: str, time_unit: float, type_col: str, seq_folder: str):\n",
    "    \"\"\"\n",
    "    Save event sequences\n",
    "    \"\"\"\n",
    "    dim_process = df[type_col].nunique()\n",
    "    type_text2id = {type_text: type_id for type_id, type_text in enumerate(df[type_col].unique())}\n",
    "    type_id2text = {type_id: type_text for type_text, type_id in type_text2id.items()}\n",
    "    type_id_col = f'{type_col}_id'\n",
    "    df[type_id_col] = df[type_col].map(type_text2id)\n",
    "    data = {'train': [], 'dev': [], 'test': []}\n",
    "    print(f'type_id2text: {type_id2text}')\n",
    "    \n",
    "    for seq_id, group in tqdm(df.groupby(seq_col)):\n",
    "        group = group.sort_values(by=time_col).reset_index()\n",
    "        split = seq_splits[seq_id]\n",
    "        init_time = group[time_col].min()\n",
    "        pre_event_time = init_time\n",
    "        event_seq = {\n",
    "            'dim_process': dim_process,\n",
    "            'seq_idx': len(data[split]),\n",
    "            'seq_len': len(group),\n",
    "            'time_since_start': [],\n",
    "            'time_since_last_event': [],\n",
    "            'type_event': [],\n",
    "            'type_text': [],\n",
    "        }\n",
    "        \n",
    "        for index, row in group.iterrows():\n",
    "            event_time = pd.to_datetime(row[time_col])\n",
    "            time_since_start = (event_time - init_time).total_seconds() / time_unit\n",
    "            time_since_last_event = (event_time - pre_event_time).total_seconds() / time_unit\n",
    "            event_seq['time_since_start'].append(time_since_start)\n",
    "            event_seq['time_since_last_event'].append(time_since_last_event)\n",
    "            event_seq['type_event'].append(row[type_id_col])\n",
    "            event_seq['type_text'].append(row[type_col])\n",
    "            pre_event_time = event_time\n",
    "        \n",
    "        data[split].append(event_seq)\n",
    "\n",
    "    os.makedirs(seq_folder, exist_ok=True)\n",
    "    for split in ['train', 'dev', 'test']:\n",
    "        json_path = f'{seq_folder}/{split}.json'\n",
    "        with open(json_path, 'w') as file:\n",
    "            json.dump(data[split], file, indent=4)\n",
    "        print(f'{split} saved to {json_path}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cedbe839-379b-4e78-92f6-ee568aa1287f",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_seqs(\n",
    "    df=df_badges, seq_col='UserId', seq_splits=badge_seq_splits,\n",
    "    time_col='Date', time_unit=60*60*24*30, type_col='Name',\n",
    "    seq_folder=f'{data_folder}/stack_overflow',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "81874335-34d6-4792-b86c-aab8451b4df9",
   "metadata": {},
   "source": [
    "## Chicago Crimes\n",
    "\n",
    "Download the data from [Crimes - 2001 to Present](https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-Present/ijzp-q8t2/about_data) to `data/raw/chicago_crime/Crimes_-_2001_to_Present.csv`."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d00c4709-158b-4386-b6fb-e1ecd4bcd84e",
   "metadata": {},
   "source": [
    "### Loading Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d3862af",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_crimes = pd.read_csv(f'{data_folder}/raw/chicago_crime/Crimes_-_2001_to_Present.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2587562-a7e3-4696-8f90-431a844ec6f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_crimes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc4f9527-9fdf-494e-aee8-f158e34d17b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_crimes.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2a6e9671-24cf-463f-bde8-47376734926a",
   "metadata": {},
   "source": [
    "### Preprocessing Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab00bc7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_crimes['Date'] = pd.to_datetime(df_crimes['Date'])\n",
    "df_crimes['Primary Type'] = df_crimes['Primary Type'].str.title()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff808672",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_crimes = df_crimes.dropna(subset=['Date', 'Block', 'Primary Type'])\\\n",
    "    .drop_duplicates(subset=['Date', 'Block', 'Primary Type'], keep='first')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4b90492",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_crimes = df_crimes[('2022-01-01' <= df_crimes['Date']) & (df_crimes['Date'] < '2024-01-01')]\n",
    "len(df_crimes)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f014dbe-920a-44eb-9725-a20bb1c6f214",
   "metadata": {},
   "source": [
    "### Selecting Crimes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f5fea7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "crime_counts = df_crimes['Primary Type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f6d60e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "crime_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db4baedf-c023-4eb0-9286-cbeb723bf910",
   "metadata": {},
   "outputs": [],
   "source": [
    "crime_list = crime_counts[crime_counts >= 500].index\n",
    "len(crime_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d93e8f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_crimes = df_crimes[df_crimes['Primary Type'].isin(crime_list)]\n",
    "len(df_crimes)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9757d671-844b-4e96-94ff-2a67df08922c",
   "metadata": {},
   "source": [
    "### Selecting Blocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a1da519",
   "metadata": {},
   "outputs": [],
   "source": [
    "block_counts = df_crimes['Block'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ac14bf6",
   "metadata": {},
   "outputs": [],
   "source": [
    "block_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f7e228e",
   "metadata": {},
   "outputs": [],
   "source": [
    "block_list = block_counts[(30 <= block_counts) & (block_counts <= 120)].index\n",
    "len(block_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f09d9a59",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_crimes = df_crimes[df_crimes['Block'].isin(block_list)]\n",
    "len(df_crimes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67028d74-aa1a-450c-b883-a291cd3f729f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_crimes['Primary Type'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "965a99d4-1fc9-4a72-9a6c-2fd8949d0819",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_crimes['Primary Type'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "722a6536-fc1d-4f6c-85ef-d63b809849bb",
   "metadata": {},
   "source": [
    "### Saving Sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18b3d1cf-e446-4d74-8acc-3dbe4467a2c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_crimes.groupby('Block')['Date'].count().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36e474ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "crime_seq_splits = get_seq_splits(df=df_crimes, seq_col='Block')\n",
    "len(crime_seq_splits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d05d96e9-b8ac-4ff0-94ad-665ec3c2a867",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_seqs(\n",
    "    df=df_crimes, seq_col='Block', seq_splits=crime_seq_splits,\n",
    "    time_col='Date', time_unit=60*60*24*30, type_col='Primary Type',\n",
    "    seq_folder=f'{data_folder}/chicago_crime',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f86cee2",
   "metadata": {},
   "source": [
    "## NYC Taxi Trips\n",
    "\n",
    "Download the [NYC Taxi Trips](https://www.andresmh.com/nyctaxitrips/) to `data/raw/nyc_taxi/` and [NYC Borough Boundaries](https://data.cityofnewyork.us/City-Government/Borough-Boundaries/tqmj-j8zm) (\"Export\" then \"Original\") to `data/raw/nyc_taxi/nybb_24c/`."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "63d1857a-55eb-4e20-ac38-2c4389a1a14a",
   "metadata": {},
   "source": [
    "### Loading Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba1d9977",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips = pd.read_csv(f'{data_folder}/raw/nyc_taxi/trip_data_5.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8aef14b7-cb36-49e9-8c39-c63e1f289abe",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bcad786c-798b-40e4-a869-19503030f937",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "be34de82-9b45-4622-9c0b-a78e479aec71",
   "metadata": {},
   "source": [
    "### Preprocessing Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7db8e0aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81fa85e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips.columns = df_trips.columns.str.strip()\n",
    "df_trips.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b39aa196-9b21-4255-8f13-b33d60a37d4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips = df_trips.dropna(subset=[\n",
    "    'hack_license', 'pickup_datetime', 'dropoff_datetime', \n",
    "    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',\n",
    "]).drop_duplicates(subset=[\n",
    "    'hack_license', 'pickup_datetime', 'dropoff_datetime', \n",
    "    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',\n",
    "], keep='first')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29762d46-a2e1-4d44-862a-54285760f90f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips = df_trips[\n",
    "    (df_trips['pickup_longitude'] != 0) & (df_trips['pickup_latitude'] != 0)\n",
    "    & (df_trips['dropoff_longitude'] != 0) & (df_trips['dropoff_latitude'] != 0)\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6bb49c9b-0c75-4492-b586-49ddeba05db4",
   "metadata": {},
   "source": [
    "### Selecting Pickup Times"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e30d076",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips[\"pickup_datetime\"] = pd.to_datetime(df_trips[\"pickup_datetime\"])\n",
    "df_trips[\"dropoff_datetime\"] = pd.to_datetime(df_trips[\"dropoff_datetime\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c2a9733",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips.pickup_datetime.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1f81990-7cd8-4b57-b228-e13bbfdf437b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips.dropoff_datetime.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64f817e6-d041-4b25-8f10-27e41990967a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips = df_trips[(df_trips.pickup_datetime >= \"2013-05-01\") & (df_trips.pickup_datetime < \"2013-05-08\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a421f35-3c54-4841-9d9f-1682b11aefeb",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips.pickup_datetime.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6e9a5784-5a86-4be6-bbc4-81ba487b0463",
   "metadata": {},
   "source": [
    "### Loading Boroughs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "122cc6d1-9a18-49de-b89d-d7609707ce62",
   "metadata": {},
   "outputs": [],
   "source": [
    "import geopandas as gpd\n",
    "from shapely.geometry import Point"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0a0de97-bfd6-40b9-bde8-53b22fd9fd7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "gdf_boroughs = gpd.read_file(f'{data_folder}/raw/nyc_taxi/nybb_24c/')\n",
    "print(\"Boroughs CRS:\", gdf_boroughs.crs)\n",
    "gdf_boroughs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "53f58610-e93c-4462-bf54-e068ab818136",
   "metadata": {},
   "source": [
    "### Getting Boroughs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0826c8bd-ca0e-4ef4-8b00-2c9076dc1b8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips['pickup_geometry'] = df_trips.apply(lambda x: Point((x['pickup_longitude'], x['pickup_latitude'])), axis=1)\n",
    "df_trips['dropoff_geometry'] = df_trips.apply(lambda x: Point((x['dropoff_longitude'], x['dropoff_latitude'])), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29848648-6439-4f4e-8753-f0cd958250b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the DataFrame to a GeoDataFrame, with the correct initial CRS (EPSG:4326)\n",
    "gdf_pickups = gpd.GeoDataFrame(df_trips, geometry='pickup_geometry', crs='EPSG:4326')\n",
    "gdf_dropoffs = gpd.GeoDataFrame(df_trips, geometry='dropoff_geometry', crs='EPSG:4326')\n",
    "# Reproject the GeoDataFrame to the CRS of the boroughs shapefile (EPSG:2263)\n",
    "gdf_pickups = gdf_pickups.to_crs(gdf_boroughs.crs)\n",
    "gdf_dropoffs = gdf_dropoffs.to_crs(gdf_boroughs.crs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66b7a3cd-8026-47dc-a419-fc1a817265d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Perform spatial join to get the boroughs\n",
    "gdf_pickups = gpd.sjoin(gdf_pickups, gdf_boroughs, how='left', predicate='intersects')\n",
    "gdf_dropoffs = gpd.sjoin(gdf_dropoffs, gdf_boroughs, how='left', predicate='intersects')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd3820b8-7f0d-41a0-a17a-b067a26662cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips['pickup_borough'] = gdf_pickups['BoroName']\n",
    "df_trips['dropoff_borough'] = gdf_dropoffs['BoroName']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "43b0173f-2ae7-4fb9-b10a-30aaf4f66436",
   "metadata": {},
   "source": [
    "### Merging Events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed0d66e5-0c66-4e48-a5af-3a15576f2c3a",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_trips = df_trips[\n",
    "    df_trips['pickup_borough'].notna() & (df_trips['pickup_borough'] != 'Staten Island') \n",
    "    & df_trips['dropoff_borough'].notna() & (df_trips['dropoff_borough'] != 'Staten Island')].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1620b9c0-f556-4b91-81f3-a233257c36c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trips['pickup_type'] = df_trips['pickup_borough'] + ' Pickup'\n",
    "df_trips['dropoff_type'] = df_trips['dropoff_borough'] + ' Dropoff'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19ffcee7-2201-49b5-96dd-cec2a6576a8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_pickups = df_trips[['hack_license', 'pickup_datetime', 'pickup_type']]\\\n",
    "    .rename(columns={'pickup_datetime': 'datetime', 'pickup_type': 'type'})\n",
    "df_dropoffs = df_trips[['hack_license', 'dropoff_datetime', 'dropoff_type']]\\\n",
    "    .rename(columns={'dropoff_datetime': 'datetime', 'dropoff_type': 'type'})\n",
    "df_all_trips = pd.concat([df_pickups, df_dropoffs], ignore_index=True)\\\n",
    "    .sort_values(by=['hack_license', 'datetime'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca07437d-9abf-40da-8caf-3d165f8992d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_all_trips.head(6)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "246d7ae4-1ae5-43f3-80ba-91c47d05d9ee",
   "metadata": {},
   "source": [
    "### Getting Sequence IDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf3bbd03",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_all_trips = df_all_trips.sort_values(by=['hack_license', 'datetime'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a60dc8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "seq_ids = []\n",
    "seq_count = 0\n",
    "last_time = df_all_trips.loc[0, 'datetime']\n",
    "last_license = df_all_trips.loc[0, 'hack_license']\n",
    "max_hours = 12\n",
    "\n",
    "for index, row in tqdm(df_all_trips.iterrows(), total=len(df_all_trips)):\n",
    "    if row[\"hack_license\"] != last_license:\n",
    "        seq_count += 1\n",
    "    elif (row[\"datetime\"] - last_time).total_seconds() / 3600 > max_hours:\n",
    "        seq_count += 1\n",
    "    \n",
    "    seq_ids.append(seq_count)\n",
    "    last_time = row[\"datetime\"]\n",
    "    last_license = row[\"hack_license\"]\n",
    "\n",
    "df_all_trips['seq_id'] = seq_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7ae34cb-cd4d-47f8-8fa9-7a7373ed458f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_all_trips['seq_id'].value_counts().describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8d92f716-bc5b-4e24-93e6-9f94196bac52",
   "metadata": {},
   "source": [
    "### Selecting Sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a563ea4-be98-434f-b3c1-6b20baf75869",
   "metadata": {},
   "outputs": [],
   "source": [
    "seq_counts = df_all_trips['seq_id'].value_counts()\n",
    "seq_counts.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72ff1cad-2a4e-4ad5-8d61-39c96bda752d",
   "metadata": {},
   "outputs": [],
   "source": [
    "seq_list = seq_counts[(seq_counts >= 100) & (seq_counts <= 160)]\n",
    "seq_list.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c71cfc3-888e-44bc-b113-b79f3bd92277",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_all_trips = df_all_trips[df_all_trips['seq_id'].isin(seq_list.index)]\n",
    "len(df_all_trips)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0c8f77c-d458-46d3-8a2c-decea397313f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_all_trips['type'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dee3bb1b-b7b3-4baf-9bc2-381bdf9b7862",
   "metadata": {},
   "source": [
    "### Saving Sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f63b065d-c44e-4ef5-96b5-5a5ba7af3656",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_all_trips.groupby('seq_id')['datetime'].count().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3f004c7-18ea-4a8c-876e-90c92289595b",
   "metadata": {},
   "outputs": [],
   "source": [
    "trip_seq_splits = get_seq_splits(df=df_all_trips, seq_col='seq_id')\n",
    "len(trip_seq_splits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57312cfc-e32d-4250-b7c5-8b3a14399de9",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_seqs(\n",
    "    df=df_all_trips, seq_col='seq_id', seq_splits=trip_seq_splits,\n",
    "    time_col='datetime', time_unit=60*60, type_col='type',\n",
    "    seq_folder=f'{data_folder}/nyc_taxi',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cd5bf9ab-aa51-473f-8342-2b618733722f",
   "metadata": {},
   "source": [
    "## US Earthquakes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "181e8804-9816-4481-b224-d39075500c2f",
   "metadata": {},
   "source": [
    "### Downloading Data\n",
    "\n",
    "Download the US earthquake data from 2020-01-01 (inclusive) to 2024-01-01 (exclusive) to `data/raw/us_earthquake`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15826ffa-fccb-4501-beba-b1e9fa2a3afc",
   "metadata": {},
   "outputs": [],
   "source": [
    "from io import StringIO\n",
    "from datetime import datetime, timedelta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "168fe438-771d-4109-9393-71cfc024edda",
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_earthquake_data_chunk(start_time, end_time, region):\n",
    "    \"\"\"Download earthquake data for a given time chunk.\"\"\"\n",
    "    \n",
    "    # USGS Earthquake API endpoint\n",
    "    url = \"https://earthquake.usgs.gov/fdsnws/event/1/query\"\n",
    "    \n",
    "    # Query parameters\n",
    "    params = {\n",
    "        \"format\": \"csv\",           # Output format\n",
    "        \"starttime\": start_time,    # Start date (YYYY-MM-DD)\n",
    "        \"endtime\": end_time,        # End date (YYYY-MM-DD)\n",
    "        # \"minmagnitude\": min_magnitude,  # Minimum magnitude\n",
    "        # \"maxmagnitude\": max_magnitude,  # Maximum magnitude\n",
    "        \"minlatitude\": region[\"minlatitude\"],  # Min latitude of region\n",
    "        \"maxlatitude\": region[\"maxlatitude\"],  # Max latitude\n",
    "        \"minlongitude\": region[\"minlongitude\"],  # Min longitude of region\n",
    "        \"maxlongitude\": region[\"maxlongitude\"],  # Max longitude of region\n",
    "    }\n",
    "    \n",
    "    # Send the request\n",
    "    response = requests.get(url, params=params)\n",
    "    \n",
    "    if response.status_code == 200:\n",
    "        print(f\"Data downloaded successfully for {start_time} to {end_time}.\")\n",
    "        return response.content\n",
    "    else:\n",
    "        print(f\"Failed to download data for {start_time} to {end_time}. HTTP Status Code: {response.status_code}.\")\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1694544e-d809-4d61-8765-549dbe98e938",
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_earthquake_data(start_time, end_time, region, output_file, chunk_size):\n",
    "    \"\"\"Download earthquake data by splitting the request into smaller chunks.\"\"\"\n",
    "    \n",
    "    # Convert start and end times to datetime objects\n",
    "    start_date = datetime.strptime(start_time, \"%Y-%m-%d\")\n",
    "    end_date = datetime.strptime(end_time, \"%Y-%m-%d\")\n",
    "    \n",
    "    # Initialize an empty DataFrame to store all results\n",
    "    all_data = pd.DataFrame()\n",
    "    \n",
    "    # Loop through each month in the date range\n",
    "    current_start = start_date\n",
    "    while current_start < end_date:\n",
    "        # Define the end of the current month\n",
    "        current_end = (current_start + timedelta(days=chunk_size))\n",
    "        if current_end > end_date:\n",
    "            current_end = end_date\n",
    "        \n",
    "        # Download data for the current month\n",
    "        data_chunk = download_earthquake_data_chunk(\n",
    "            current_start.strftime(\"%Y-%m-%d\"), current_end.strftime(\"%Y-%m-%d\"), region)\n",
    "        \n",
    "        # If data is returned, append it to the main DataFrame\n",
    "        if data_chunk:\n",
    "            chunk_df = pd.read_csv(StringIO(data_chunk.decode('utf-8')))\n",
    "            all_data = pd.concat([all_data, chunk_df], ignore_index=True)\n",
    "        \n",
    "        # Move to the next month\n",
    "        current_start = current_end\n",
    "    \n",
    "    # Save the complete dataset to a CSV file\n",
    "    all_data.to_csv(output_file, index=False)\n",
    "    print(f\"Data downloaded successfully and saved to {output_file}.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96099672-2c5f-4b17-a16b-a7b92f3ac9d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parameters for the earthquake search\n",
    "start_time = \"2020-01-01\"    # Start date\n",
    "end_time = \"2024-01-01\"      # End date\n",
    "region = {\n",
    "    \"minlatitude\": 24.6,     # Min latitude of the region\n",
    "    \"maxlatitude\": 50.0,     # Max latitude\n",
    "    \"minlongitude\": -125.0,  # Min longitude\n",
    "    \"maxlongitude\": -65.0    # Max longitude\n",
    "}\n",
    "output_file = f\"{data_folder}/raw/us_earthquake/us_earthquakes.csv\"\n",
    "\n",
    "# Download the earthquake data\n",
    "download_earthquake_data(start_time, end_time, region, output_file, chunk_size=30)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3b13dc09-89d2-4c11-afbe-6b8760248fb2",
   "metadata": {},
   "source": [
    "### Loading Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43990aee-da26-43f2-b8c7-fdf042d610de",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_earthquakes = pd.read_csv(f'{data_folder}/raw/us_earthquake/us_earthquakes.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82e62125-2727-428d-abb5-f81bc98056e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_earthquakes.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32fb4262-f4ce-4588-839e-01d8a40ff075",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.to_datetime(df_earthquakes.time).describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "75984a68-e085-4f77-9de8-4cbcaa7b6771",
   "metadata": {},
   "source": [
    "### Preprocessing Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e3c7d46-8f0d-4663-b83b-88a29c4584d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_earthquakes = df_earthquakes[\n",
    "    (df_earthquakes[\"type\"] == \"earthquake\") & (df_earthquakes[\"status\"] == \"reviewed\") \n",
    "    & (df_earthquakes['magType'] == 'ml')]\n",
    "df_earthquakes = df_earthquakes.dropna(subset=['time', 'latitude', 'longitude', 'mag'])\\\n",
    "    .drop_duplicates(subset=['time', 'latitude', 'longitude', 'mag'], keep='first')\n",
    "df_earthquakes[\"time\"] = pd.to_datetime(df_earthquakes[\"time\"])\n",
    "df_earthquakes['coordinate'] = df_earthquakes.apply(\n",
    "    lambda row: (round(row['latitude']), round(row['longitude'])), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10a9a58f-77cb-4469-a885-2a3ca1a11840",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_earthquakes['coordinate'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1690b382-7f0d-460e-bdc5-8e537edcfbd4",
   "metadata": {},
   "source": [
    "### Getting Sequence IDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0681981-4dff-4887-a905-1a882d96c93e",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_earthquakes = df_earthquakes.sort_values(by=[\"coordinate\", \"time\"]).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50cb91e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "seq_ids = []\n",
    "seq_count = 0\n",
    "last_time = df_earthquakes.loc[0, 'time']\n",
    "last_coord = df_earthquakes.loc[0, 'coordinate']\n",
    "max_hours = 24\n",
    "\n",
    "for index, row in tqdm(df_earthquakes.iterrows(), total=len(df_earthquakes)):\n",
    "    if row[\"coordinate\"] != last_coord:\n",
    "        seq_count += 1\n",
    "    elif (row[\"time\"] - last_time).total_seconds() / 3600 > max_hours:\n",
    "        seq_count += 1\n",
    "    \n",
    "    seq_ids.append(seq_count)\n",
    "    last_time = row[\"time\"]\n",
    "    last_coord = row[\"coordinate\"]\n",
    "\n",
    "df_earthquakes[\"seq_id\"] = seq_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "529731a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_earthquakes.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "89af4e38-2b86-439d-b278-5608f859c1fe",
   "metadata": {},
   "source": [
    "### Selecting Sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91cd0296-f6ff-43c0-8a6c-4a0e0a9ae196",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_earthquakes[\"seq_id\"].value_counts().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dea271ab-6b93-429d-9311-eccdb5ef39b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "earthquake_counts = df_earthquakes[\"seq_id\"].value_counts()\n",
    "earthquake_list = earthquake_counts[(earthquake_counts >= 5) & (earthquake_counts <= 30)].index\n",
    "len(earthquake_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f17826d-dd7b-4b2d-befd-a07ae3fa344a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_earthquakes = df_earthquakes[df_earthquakes[\"seq_id\"].isin(earthquake_list)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "feea5b64-bec6-47a8-b666-ecd1ff484f31",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_earthquakes.groupby('seq_id')['time'].count().describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e149b885-2c4f-416e-b914-fb349ba16680",
   "metadata": {},
   "source": [
    "### Setting Event Types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44418eca",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_earthquakes['mag'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32e8657d-5be4-4ecd-82ab-6a6540c39532",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_earthquakes[\"type\"] = \"Small\"\n",
    "df_earthquakes.loc[df_earthquakes[\"mag\"] >= 1, \"type\"] = \"Medium\"\n",
    "df_earthquakes.loc[df_earthquakes[\"mag\"] >= 2, \"type\"] = \"Large\"\n",
    "df_earthquakes[\"type\"].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d5c679a3-823a-46a2-9ad0-89facc72fb4b",
   "metadata": {},
   "source": [
    "### Saving Sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28bc713a-4292-4ea1-b054-a7d565645e88",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_earthquakes.groupby('seq_id')['time'].count().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd3c23dd-d86f-4ca0-a86b-9ff3dfeff9ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "earthquake_seq_splits = get_seq_splits(df=df_earthquakes, seq_col='seq_id')\n",
    "len(earthquake_seq_splits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae13d8f2-8128-4241-9ec8-1cec396e226f",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_seqs(\n",
    "    df=df_earthquakes, seq_col='seq_id', seq_splits=earthquake_seq_splits,\n",
    "    time_col='time', time_unit=60*60*24, type_col='type',\n",
    "    seq_folder=f'{data_folder}/us_earthquake',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7b1d20e",
   "metadata": {},
   "source": [
    "## Amazon Reviews\n",
    "\n",
    "Download the 29 small subsets (ratings only) of [Amazon Review Data](https://nijianmo.github.io/amazon/) to the folder `data/raw/amazon_review/`. Make sure to remave `AMAZON_FASHION.csv` to `Amazon_Fashion.csv`."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "526aeaf0-37dc-4198-9911-7fcfb8ced377",
   "metadata": {},
   "source": [
    "### Loading Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7b665f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "amazon_review_folder = f\"{data_folder}/raw/amazon_review\"\n",
    "dfs_reviews = []\n",
    "\n",
    "for file_name in tqdm(os.listdir(amazon_review_folder)):\n",
    "    if file_name.endswith('.csv'):\n",
    "        file_path = os.path.join(amazon_review_folder, file_name)\n",
    "        df_reviews_each = pd.read_csv(file_path, names=[\"item\", \"user\", \"rating\", \"timestamp\"])\n",
    "        df_reviews_each['category'] = file_name.replace('.csv', '').replace('_', ' ')\n",
    "        dfs_reviews.append(df_reviews_each)\n",
    "\n",
    "df_reviews = pd.concat(dfs_reviews, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a359b531-ceff-47d0-8ccc-d473c77d11de",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_reviews"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "947736ed-657e-4677-a254-99b9068b6e91",
   "metadata": {},
   "source": [
    "### Preprocessing Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "77cbddfb-607a-4d9d-aa0e-3e0c1f7abf50",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_reviews['date'] = pd.to_datetime(df_reviews['timestamp'], unit='s')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41eda87d-f86e-47d7-8c8a-4d3c1fbfde6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_reviews['date'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9104d795-2305-4bc7-aa2b-502cfa4c5146",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_reviews = df_reviews[(\"2018-01-01\" <= df_reviews[\"date\"]) & (df_reviews[\"date\"] <= \"2018-06-30\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70d65db5-1a20-496d-9c76-f89f199f7666",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_reviews = df_reviews.dropna(subset=['date', 'user', 'category'])\\\n",
    "    .drop_duplicates(subset=['date', 'user', 'category'], keep='first')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7833a193-01b7-4421-a31f-83f5fa26240c",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(df_reviews)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bcb25b46-9b14-4f82-ba1c-7880c6f64827",
   "metadata": {},
   "source": [
    "### Selecting Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "294823bd-b00b-495d-bf2e-c78459ea88c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "category_review_counts = df_reviews[\"category\"].value_counts()\n",
    "category_review_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35c86747-c86b-44fb-a848-3e78cc72d8ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(category_review_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2e0b3d8-d1f3-40cf-8abd-9e62f469f7e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "category_list = category_review_counts[category_review_counts >= 100000].index\n",
    "len(category_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f601394-5638-4fd6-a29b-99a3517a490f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_reviews = df_reviews[df_reviews[\"category\"].isin(category_list)]\n",
    "df_reviews.loc[~df_reviews[\"category\"].isin(category_list), \"category\"] = \"Other\"\n",
    "len(df_reviews)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed1db980-1ac9-40b2-9e0c-ddfb81466963",
   "metadata": {},
   "source": [
    "### Selecting Users"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb2d0fa3-b48a-4424-a613-b1c65a921835",
   "metadata": {},
   "outputs": [],
   "source": [
    "user_review_counts = df_reviews[\"user\"].value_counts()\n",
    "user_review_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5dd3f573-a3c4-454d-babe-b8eb48e5e343",
   "metadata": {},
   "outputs": [],
   "source": [
    "user_list = user_review_counts[(user_review_counts >= 40) & (user_review_counts <= 200)].index\n",
    "len(user_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76659342-0ccd-496b-bd88-b3c1a4eb0bf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_reviews[df_reviews[\"user\"].isin(user_list)][\"category\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d72e7ae-01f4-4a9b-8d5b-927cd42c689a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_reviews = df_reviews[df_reviews[\"user\"].isin(user_list)]\n",
    "len(df_reviews)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf897865-7d2b-48ef-86bd-797a20f20bf2",
   "metadata": {},
   "source": [
    "### Saving Sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ba89cff-1b91-4d44-bc70-43f94fcf3c36",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_reviews[\"user\"].nunique(), df_reviews[\"category\"].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "deed81a8-5396-4991-b7aa-5f2fb4597300",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_reviews[\"category\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39acef78-6593-409f-b749-a1950ce48598",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_reviews.groupby('user')['date'].count().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aaa0fd6e-8e38-4c9e-9359-f66c78fad776",
   "metadata": {},
   "outputs": [],
   "source": [
    "review_seq_splits = get_seq_splits(df=df_reviews, seq_col='user')\n",
    "len(review_seq_splits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "947cdd89-f8cb-4bdb-960e-e2d7b9b22325",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_seqs(\n",
    "    df=df_reviews, seq_col='user', seq_splits=review_seq_splits,\n",
    "    time_col='date', time_unit=60*60*24*7, type_col='category',\n",
    "    seq_folder=f'{data_folder}/amazon_review',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc8344cb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
