{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "550144d8-85ec-47b8-a59f-daf126108559",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| default_exp data.pen_digits\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9b50950-8d7d-4325-8a16-fdc8607cdab8",
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# declare a list of tasks whose products you want to use as inputs\n",
    "upstream = ['core']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "536ee6e4-dbb5-4b33-a980-cbba70974f61",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| hide\n",
    "from nbdev.showdoc import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "413eae4c-40d3-4143-91fe-4d915e3c2654",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "from vitmtsc import *\n",
    "from vitmtsc.core import *\n",
    "import dask_cudf\n",
    "import gc   #garbage collector interface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb446081-5f76-4e34-9f35-9de9bce942f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "upstream = {\n",
    "    \"core\": {\n",
    "        \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/00_core.html\",\n",
    "        \"FaceDetection_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts\",\n",
    "        \"FaceDetection_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts\",\n",
    "        \"InsectWingbeat_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts\",\n",
    "        \"InsectWingbeat_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts\",\n",
    "        \"PenDigits_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts\",\n",
    "        \"PenDigits_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts\",\n",
    "        \"SpokenArabicDigits_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts\",\n",
    "        \"SpokenArabicDigits_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts\",\n",
    "        \"CharacterTrajectories_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts\",\n",
    "        \"CharacterTrajectories_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts\",\n",
    "    }\n",
    "}\n",
    "product = {\n",
    "    \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/103_data.pen_digits.html\",\n",
    "    \"PenDigits_TRAIN_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/train\",\n",
    "    \"PenDigits_VALID_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/valid\",\n",
    "    \"PenDigits_TEST_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/test\",\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e626d9df-d6dc-4ab2-a303-fb02a2df4423",
   "metadata": {},
   "source": [
    "# <center> Data Download and Conversion </center>\n",
    "\n",
    "## <center> [PenDigits](https://www.timeseriesclassification.com/description.php?Dataset=PenDigits) dataset </center>\n",
    "\n",
    "> Convert dataset to parquet format to run target encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50056655-969d-47ee-8781-21042bf3dcb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "DATASET_NAME = 'PenDigits'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "216da507-b883-4803-b813-b61f31103dff",
   "metadata": {},
   "source": [
    "### Download and convert the dataset to tabular format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92b19b5b-9cff-4816-9368-4cea4a762656",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "train = get_mtsc_data_tabular_from_ts(upstream['core']['PenDigits_TRAIN_TS'])\n",
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddf42fca-94fb-4159-b2a7-1ec472e666ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6678703-96bb-470d-96cf-dd0c3aff7dd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "train['reading_id'].min(), train['reading_id'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c244697-950a-4afe-bcec-b8a7df05e24d",
   "metadata": {},
   "outputs": [],
   "source": [
    "train['class_vals'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b8d2d85-c5ca-4b33-81b9-9a1809f29b56",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "test = get_mtsc_data_tabular_from_ts(upstream['core']['PenDigits_TEST_TS'])\n",
    "test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98513e92-5e8b-461b-9f05-6137c6bf644f",
   "metadata": {},
   "outputs": [],
   "source": [
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b3faa10-6391-4bc8-a6af-59de4642c849",
   "metadata": {},
   "outputs": [],
   "source": [
    "test['reading_id'].min(), test['reading_id'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2348cd4-e770-41fa-b002-b75018fdcad1",
   "metadata": {},
   "outputs": [],
   "source": [
    "test['class_vals'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2d5a988-488a-4693-a334-535840b5f1fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split on cases rather than on individual readings: X holds one row per\n",
    "# distinct (case_id, class_vals) pair taken from the tabular training frame.\n",
    "from sklearn.model_selection import train_test_split\n",
    "X = train[['case_id', 'class_vals']].drop_duplicates()\n",
    "# 80% of the rows of X go to X_train and the rest to X_val; random_state fixes the shuffle.\n",
    "# NOTE(review): no `stratify=` argument is passed, so class proportions are not enforced.\n",
    "X_train, X_val, y_train, y_val = train_test_split(X, X['class_vals'], train_size=0.8, random_state = 42)\n",
    "# Number of distinct cases on each side of the split.\n",
    "X_train.case_id.nunique(), X_val.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbb8dbbc-90b9-418f-a8a6-dc0dda8a0bca",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train.groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfa5a80b-f004-4599-8302-46d773bb8dd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_val.groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfdf16c1-00fc-4ff7-86fe-1f75e08f0fb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "test[['case_id', 'class_vals']].drop_duplicates().groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf9b1087-7c82-4577-9f66-8ce81b41535f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the readings whose case_id appears in X_val (the validation cases).\n",
    "# Run this before the next cell, which rebinds `train` to the training subset.\n",
    "valid = train.merge(X_val, on=['case_id'], how='inner')\n",
    "# Both frames carry `class_vals`, so the merge yields `class_vals_x` (from train) and\n",
    "# `class_vals_y` (from X_val); restore a single `class_vals` column and drop the suffixed pair.\n",
    "valid['class_vals'] = valid['class_vals_x']\n",
    "valid = valid.drop(columns=['class_vals_x','class_vals_y'])\n",
    "valid.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1dc8122e-5f30-4093-abe4-b04f2598cd88",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebind `train` to the readings whose case_id appears in X_train (the training cases).\n",
    "train = train.merge(X_train, on=['case_id'], how='inner')\n",
    "# Both frames carry `class_vals`, so the merge yields `class_vals_x` (from train) and\n",
    "# `class_vals_y` (from X_train); restore a single `class_vals` column and drop the suffixed pair.\n",
    "train['class_vals'] = train['class_vals_x']\n",
    "train = train.drop(columns=['class_vals_x','class_vals_y'])\n",
    "train.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba507f07-53f6-403f-9a7d-42f5b7edacef",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.case_id.nunique(), valid.case_id.nunique(), test.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c96210f2-7786-4750-bf43-88c6031aab2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9f507e9e-d50a-4d13-bf58-abb5aa939958",
   "metadata": {},
   "source": [
    "### Write data in parquet format for future processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce9474cc-4e38-4ba6-83ae-1adaa1da42ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "import cudf\n",
    "import dask_cudf\n",
    "import pandas as pd\n",
    "\n",
    "def write_parquet(pandas_df, output_dir, npartitions = 2):\n",
    "    \"\"\"Write a pandas DataFrame to `output_dir` as a parquet dataset via dask_cudf.\n",
    "\n",
    "    The strings '0'..'9' in `class_vals` are replaced by the integers 0..9 (any\n",
    "    other value is left unchanged) and a `case_id_seq` column is added as a copy\n",
    "    of `case_id` before writing.\n",
    "\n",
    "    Args:\n",
    "        pandas_df: pandas DataFrame with `class_vals` and `case_id` columns.\n",
    "            It is left unmodified; the encoding is applied to a new frame.\n",
    "        output_dir: destination passed to `to_parquet`.\n",
    "        npartitions: number of dask_cudf partitions the frame is split into.\n",
    "    \"\"\"\n",
    "    digit_labels = [str(digit) for digit in range(10)]\n",
    "    # Build a new column instead of `replace(..., inplace=True)` on a column\n",
    "    # selection: that mutated the caller's DataFrame and does not update it at\n",
    "    # all once pandas copy-on-write is enabled.\n",
    "    # `infer_objects` keeps the integer dtype on pandas versions where\n",
    "    # `replace` no longer downcasts object columns by itself.\n",
    "    encoded_labels = pandas_df['class_vals'].replace(digit_labels, list(range(10))).infer_objects()\n",
    "    gdf = cudf.from_pandas(pandas_df.assign(class_vals = encoded_labels))\n",
    "    gdf['case_id_seq'] = gdf['case_id']\n",
    "    dask_gdf = dask_cudf.from_cudf(gdf, npartitions = npartitions)\n",
    "    dask_gdf.to_parquet(output_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7705b9de-39a5-4a96-82f2-4b0e16fe549d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): `time` is not referenced by any other cell in this notebook — confirm it is still needed.\n",
    "import time\n",
    "\n",
    "from dask.distributed import Client\n",
    "from dask_cuda import LocalCUDACluster\n",
    "\n",
    "# Start a local Dask-CUDA cluster and connect a client to it.\n",
    "# NOTE(review): rmm_pool_size='20GB' is hard-coded — confirm it fits the GPU this notebook runs on.\n",
    "cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.2, rmm_pool_size='20GB', rmm_managed_memory=True)\n",
    "client = Client(cluster)\n",
    "# Display the client summary as the cell output.\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb49ccab-f1c6-485f-8884-ee2fa002d8a6",
   "metadata": {},
   "source": [
    "__Train Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02e90067-0745-4a6d-80ff-bbca2b569d71",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "write_parquet(train, product['PenDigits_TRAIN_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0b0c2e9-551c-463b-8972-bfdf6742a6ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf = dask_cudf.read_parquet(product['PenDigits_TRAIN_RAW'])\n",
    "train_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "730fbc96-1ebb-4291-a6b7-75ef422d58c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf['reading_id'].min().compute(), train_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cfb42ee-c577-42cd-9ba4-1be52b92fcd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "594261f8-a42e-48c3-adb4-060f597714a8",
   "metadata": {},
   "source": [
    "__Valid Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b063a82-58f7-40a5-8498-f321c0d3ab6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "write_parquet(valid, product['PenDigits_VALID_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ced6867-8090-4678-a4a0-71f4f76ddc26",
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf = dask_cudf.read_parquet(product['PenDigits_VALID_RAW'])\n",
    "valid_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d700667-b015-46e4-bd51-c637352b95b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf['reading_id'].min().compute(), valid_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79ad52ea-bc32-4e09-a3ed-2ca956f28ff6",
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9d456019-6937-4aee-a3af-b341125dc873",
   "metadata": {},
   "source": [
    "__Test Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91f773d7-b8d7-4a00-8cdc-60c43e76795f",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "write_parquet(test, product['PenDigits_TEST_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "995171de-0a48-47a6-aa45-7ab989acc023",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf = dask_cudf.read_parquet(product['PenDigits_TEST_RAW'])\n",
    "test_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9846bb7-2193-4d63-b88e-b6e5c9980848",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf['reading_id'].min().compute(), test_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6aa1aceb-ce0d-4a70-b130-5629e272b936",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9a8b8f3-1906-474a-b6b5-e40f16dfc8c9",
   "metadata": {},
   "source": [
    "__We reset the kernel!!!__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3c8381c-f105-49e7-bac3-416e0d32aade",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Shut down the Dask cluster the client is connected to, then close the client itself.\n",
    "client.shutdown()\n",
    "client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fcb38cfb-52ed-4a48-8214-9d6b6ef6f6be",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nbdev import nbdev_export\n",
    "nbdev_export()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rapids-22.08_ploomber",
   "language": "python",
   "name": "rapids-22.08_ploomber"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "papermill": {
   "environment_variables": {},
   "parameters": {
    "product": {
     "PenDigits_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/test",
     "PenDigits_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/train",
     "PenDigits_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/valid",
     "nb": "/home/ubuntu/vitmtsc_nbdev/output/103_data.pen_digits.html"
    },
    "upstream": {
     "core": {
      "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
      "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
      "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
      "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
      "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
      "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
      "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
      "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
      "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
      "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
      "nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html"
     }
    }
   },
   "version": null
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
