{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b2040f3-499a-4688-84db-3d73cd5187e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| default_exp data.character_trajectories\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c330376a-2593-4d98-96d8-f29784210458",
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# declare a list of tasks whose products you want to use as inputs\n",
    "upstream = ['core']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7b3f023-f219-430a-b36a-2e48bd4b579c",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| hide\n",
    "from nbdev.showdoc import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17db9f89-a8d1-4491-be8a-e366c6772bf2",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "from vitmtsc import *\n",
    "from vitmtsc.core import *\n",
    "import dask_cudf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98dc782b-77fe-4c15-b31e-08aab3f9350a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# |export\n",
    "upstream = {\n",
    "    \"core\": {\n",
    "        \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/00_core.html\",\n",
    "        \"FaceDetection_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts\",\n",
    "        \"FaceDetection_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts\",\n",
    "        \"InsectWingbeat_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts\",\n",
    "        \"InsectWingbeat_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts\",\n",
    "        \"PenDigits_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts\",\n",
    "        \"PenDigits_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts\",\n",
    "        \"SpokenArabicDigits_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts\",\n",
    "        \"SpokenArabicDigits_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts\",\n",
    "        \"CharacterTrajectories_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts\",\n",
    "        \"CharacterTrajectories_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts\",\n",
    "    }\n",
    "}\n",
    "product = {\n",
    "    \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/105_data.character_trajectories.html\",\n",
    "    \"CharacterTrajectories_TRAIN_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/train\",\n",
    "    \"CharacterTrajectories_VALID_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/valid\",\n",
    "    \"CharacterTrajectories_TEST_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/test\",\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e626d9df-d6dc-4ab2-a303-fb02a2df4423",
   "metadata": {},
   "source": [
    "# <center> Data Download and Conversion </center>\n",
    "\n",
    "## <center> [CharacterTrajectories](https://www.timeseriesclassification.com/description.php?Dataset=CharacterTrajectories) dataset </center>\n",
    "\n",
    "> Convert dataset to parquet format to run target encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13a9c838-1b4e-40d1-a099-738697016be2",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "DATASET_NAME = 'CharacterTrajectories'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "216da507-b883-4803-b813-b61f31103dff",
   "metadata": {},
   "source": [
    "### Download and Convert dataset in tabular format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92b19b5b-9cff-4816-9368-4cea4a762656",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "train = get_mtsc_data_tabular_from_ts(upstream['core']['CharacterTrajectories_TRAIN_TS'])\n",
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddf42fca-94fb-4159-b2a7-1ec472e666ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6678703-96bb-470d-96cf-dd0c3aff7dd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "train['reading_id'].min(), train['reading_id'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c244697-950a-4afe-bcec-b8a7df05e24d",
   "metadata": {},
   "outputs": [],
   "source": [
    "train['class_vals'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b8d2d85-c5ca-4b33-81b9-9a1809f29b56",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "test = get_mtsc_data_tabular_from_ts(upstream['core']['CharacterTrajectories_TEST_TS'])\n",
    "test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98513e92-5e8b-461b-9f05-6137c6bf644f",
   "metadata": {},
   "outputs": [],
   "source": [
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b3faa10-6391-4bc8-a6af-59de4642c849",
   "metadata": {},
   "outputs": [],
   "source": [
    "test['reading_id'].min(), test['reading_id'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2348cd4-e770-41fa-b002-b75018fdcad1",
   "metadata": {},
   "outputs": [],
   "source": [
    "test['class_vals'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bf3545c-e254-407f-aab3-7b3ae5b2bd81",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "# One row per (case_id, class_vals) pair: whole cases, not individual rows of `train`,\n",
    "# are assigned to either the training or the validation side.\n",
    "X = train[['case_id', 'class_vals']].drop_duplicates()\n",
    "X_train, X_val, y_train, y_val = train_test_split(\n",
    "    X,\n",
    "    X['class_vals'],\n",
    "    train_size=0.8,    # 80% of the cases go to train, the remainder to validation\n",
    "    random_state=42,   # fixed seed so the split is reproducible across runs\n",
    ")\n",
    "(X_train['case_id'].nunique(), X_val['case_id'].nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe834cf1-e0bd-4e1d-be03-52ad882a9677",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train.groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1ebf57b-e23c-4f34-ae50-a30779c3f49f",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_val.groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53263e2e-05db-4592-bf7d-392de73662a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "test[['case_id', 'class_vals']].drop_duplicates().groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3d6d8e3-677b-44ec-b333-b1ab9c609758",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows of `train` that belong to the validation cases. Both frames carry `class_vals`,\n",
    "# so the merge yields `class_vals_x` (from `train`) and `class_vals_y` (from `X_val`);\n",
    "# the `train` copy is kept under the plain name and both suffixed columns are dropped.\n",
    "valid = (\n",
    "    train.merge(X_val, on=['case_id'], how='inner')\n",
    "         .assign(class_vals=lambda merged: merged['class_vals_x'])\n",
    "         .drop(columns=['class_vals_x', 'class_vals_y'])\n",
    ")\n",
    "valid['case_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8bbc8365-d730-451d-9b60-327c4eaf3e70",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restrict `train` to the training cases. This rebinds `train`, so the cell that builds\n",
    "# `valid` from the full frame has to run before this one.\n",
    "train = (\n",
    "    train.merge(X_train, on=['case_id'], how='inner')\n",
    "         .assign(class_vals=lambda merged: merged['class_vals_x'])\n",
    "         .drop(columns=['class_vals_x', 'class_vals_y'])\n",
    ")\n",
    "train['case_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d208c89-8201-408d-a7d4-24ca97517840",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.case_id.nunique(), valid.case_id.nunique(), test.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "517eb8a3-6ee9-407e-8663-8fee36a2df3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9f507e9e-d50a-4d13-bf58-abb5aa939958",
   "metadata": {},
   "source": [
    "### Write data in parquet format for future processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce9474cc-4e38-4ba6-83ae-1adaa1da42ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "import cudf\n",
    "def write_parquet(pandas_df, output_dir, npartitions = 2):\n",
    "    \"\"\"Label-encode `class_vals` and write the frame as a partitioned parquet dataset.\n",
    "\n",
    "    The string labels '1'..'20' in `class_vals` are replaced by the integers 0..19,\n",
    "    the frame is converted with `cudf.from_pandas`, a `case_id_seq` column is added\n",
    "    as a copy of `case_id`, and the result is written to `output_dir` via dask_cudf.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    pandas_df : pandas DataFrame with at least `class_vals` and `case_id` columns.\n",
    "        It is not modified by this function.\n",
    "    output_dir : destination passed to `to_parquet`.\n",
    "    npartitions : number of dask partitions the frame is split into before writing.\n",
    "    \"\"\"\n",
    "    labels = [str(label) for label in range(1, 21)]\n",
    "    codes = list(range(20))\n",
    "    # Encode on a new frame so the caller's DataFrame is left untouched; a chained\n",
    "    # `inplace=True` replace on a column selection also does not reach the parent\n",
    "    # frame under pandas Copy-on-Write.\n",
    "    encoded_df = pandas_df.assign(class_vals = pandas_df['class_vals'].replace(labels, codes))\n",
    "    gdf = cudf.from_pandas(encoded_df)\n",
    "    gdf['case_id_seq'] = gdf['case_id']\n",
    "    dask_gdf = dask_cudf.from_cudf(gdf, npartitions = npartitions)\n",
    "    dask_gdf.to_parquet(output_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7705b9de-39a5-4a96-82f2-4b0e16fe549d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.distributed import Client\n",
    "from dask_cuda import LocalCUDACluster\n",
    "\n",
    "cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.2, rmm_pool_size='20GB', rmm_managed_memory=True)\n",
    "client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb49ccab-f1c6-485f-8884-ee2fa002d8a6",
   "metadata": {},
   "source": [
    "__Train Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02e90067-0745-4a6d-80ff-bbca2b569d71",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "write_parquet(train, product['CharacterTrajectories_TRAIN_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0b0c2e9-551c-463b-8972-bfdf6742a6ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf = dask_cudf.read_parquet(product['CharacterTrajectories_TRAIN_RAW'])\n",
    "train_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "730fbc96-1ebb-4291-a6b7-75ef422d58c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf['reading_id'].min().compute(), train_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cfb42ee-c577-42cd-9ba4-1be52b92fcd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "616db4b3-eee6-49a3-a296-8ac98a5019ab",
   "metadata": {},
   "source": [
    "__Valid Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38fdac7a-1e75-492f-b0e3-250a6309b43b",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "write_parquet(valid, product['CharacterTrajectories_VALID_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1982785f-1489-4e94-89dd-b92fa83631e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf = dask_cudf.read_parquet(product['CharacterTrajectories_VALID_RAW'])\n",
    "valid_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0632ffee-c3c2-4810-b615-6b4482dcf793",
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf['reading_id'].min().compute(), valid_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "223673bc-a46b-4488-baa9-4c96827056a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1dd5344c-cb03-4844-bb24-6a12c2a6961e",
   "metadata": {},
   "source": [
    "__Test Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91f773d7-b8d7-4a00-8cdc-60c43e76795f",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "write_parquet(test, product['CharacterTrajectories_TEST_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "995171de-0a48-47a6-aa45-7ab989acc023",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf = dask_cudf.read_parquet(product['CharacterTrajectories_TEST_RAW'])\n",
    "test_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9846bb7-2193-4d63-b88e-b6e5c9980848",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf['reading_id'].min().compute(), test_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6aa1aceb-ce0d-4a70-b130-5629e272b936",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9a8b8f3-1906-474a-b6b5-e40f16dfc8c9",
   "metadata": {},
   "source": [
    "__We reset the kernel!!!__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3c8381c-f105-49e7-bac3-416e0d32aade",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "client.shutdown()\n",
    "client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fcb38cfb-52ed-4a48-8214-9d6b6ef6f6be",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nbdev import nbdev_export\n",
    "nbdev_export()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rapids-22.08_ploomber",
   "language": "python",
   "name": "rapids-22.08_ploomber"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "papermill": {
   "environment_variables": {},
   "parameters": {
    "product": {
     "CharacterTrajectories_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/test",
     "CharacterTrajectories_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/train",
     "CharacterTrajectories_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/valid",
     "nb": "/home/ubuntu/vitmtsc_nbdev/output/105_data.character_trajectories.html"
    },
    "upstream": {
     "core": {
      "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
      "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
      "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
      "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
      "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
      "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
      "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
      "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
      "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
      "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
      "nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html"
     }
    }
   },
   "version": null
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
