{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c641d90-9609-444d-91e0-05b67e0fe655",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| default_exp data.insect_wingbeat\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf021e84-85d3-49f1-8295-9237da4bbaca",
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# declare a list of tasks whose products you want to use as inputs\n",
    "upstream = ['core']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "536ee6e4-dbb5-4b33-a980-cbba70974f61",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| hide\n",
    "from nbdev.showdoc import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "972c26e5-2b50-4f4a-b99d-a80eb68438a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "from vitmtsc import *\n",
    "from vitmtsc.core import *\n",
    "import dask_cudf\n",
    "import gc   #garbage collector interface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e52744f0-bfd2-4318-9f8b-da4f6984af8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "upstream = {\n",
    "    \"core\": {\n",
    "        \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/00_core.html\",\n",
    "        \"FaceDetection_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts\",\n",
    "        \"FaceDetection_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts\",\n",
    "        \"InsectWingbeat_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts\",\n",
    "        \"InsectWingbeat_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts\",\n",
    "        \"PenDigits_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts\",\n",
    "        \"PenDigits_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts\",\n",
    "        \"SpokenArabicDigits_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts\",\n",
    "        \"SpokenArabicDigits_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts\",\n",
    "        \"CharacterTrajectories_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts\",\n",
    "        \"CharacterTrajectories_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts\",\n",
    "    }\n",
    "}\n",
    "product = {\n",
    "    \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/102_data.insect_wingbeat.html\",\n",
    "    \"InsectWingbeat_TRAIN_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/train\",\n",
    "    \"InsectWingbeat_VALID_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/valid\",\n",
    "    \"InsectWingbeat_TEST_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/test\",\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e626d9df-d6dc-4ab2-a303-fb02a2df4423",
   "metadata": {},
   "source": [
    "# <center> Data Download and Conversion </center>\n",
    "\n",
    "## <center> [InsectWingbeat](https://www.timeseriesclassification.com/description.php?Dataset=InsectWingbeat) dataset </center>\n",
    "\n",
    "> Convert dataset to parquet format to run target encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50056655-969d-47ee-8781-21042bf3dcb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "DATASET_NAME = 'InsectWingbeat'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "216da507-b883-4803-b813-b61f31103dff",
   "metadata": {},
   "source": [
    "### Download and Convert dataset in tabular format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92b19b5b-9cff-4816-9368-4cea4a762656",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "train = get_mtsc_data_tabular_from_ts(upstream['core']['InsectWingbeat_TRAIN_TS'])\n",
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddf42fca-94fb-4159-b2a7-1ec472e666ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6678703-96bb-470d-96cf-dd0c3aff7dd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "train['reading_id'].min(), train['reading_id'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c244697-950a-4afe-bcec-b8a7df05e24d",
   "metadata": {},
   "outputs": [],
   "source": [
    "train['class_vals'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b8d2d85-c5ca-4b33-81b9-9a1809f29b56",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "test = get_mtsc_data_tabular_from_ts(upstream['core']['InsectWingbeat_TEST_TS'])\n",
    "test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98513e92-5e8b-461b-9f05-6137c6bf644f",
   "metadata": {},
   "outputs": [],
   "source": [
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b3faa10-6391-4bc8-a6af-59de4642c849",
   "metadata": {},
   "outputs": [],
   "source": [
    "test['reading_id'].min(), test['reading_id'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2348cd4-e770-41fa-b002-b75018fdcad1",
   "metadata": {},
   "outputs": [],
   "source": [
    "test['class_vals'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54ecfbcf-c48e-4180-8d0b-831557435d1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# One row per distinct (case_id, class_vals) pair, so the split is made on whole cases\n",
    "# rather than on the individual rows of the tabular frame.\n",
    "X = train.loc[:, ['case_id', 'class_vals']].drop_duplicates()\n",
    "\n",
    "# 80/20 split; the fixed random_state keeps the shuffle reproducible across runs.\n",
    "X_train, X_val, y_train, y_val = train_test_split(\n",
    "    X,\n",
    "    X['class_vals'],\n",
    "    train_size=0.8,\n",
    "    random_state=42,\n",
    ")\n",
    "X_train.case_id.nunique(), X_val.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7771d633-8961-497c-bb8e-9ea8839cdf87",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train.groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05255e74-d140-4604-9d4d-e516d7f75770",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_val.groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40f1b133-473c-4f6a-8c46-ff8f23baefd7",
   "metadata": {},
   "outputs": [],
   "source": [
    "test[['case_id', 'class_vals']].drop_duplicates().groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3a717cf-e3cd-44d1-9db8-e5db245985ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose case_id was drawn into the validation split.\n",
    "valid = train.merge(X_val, on=['case_id'], how='inner')\n",
    "# Both frames carry class_vals, so the merge suffixes it with _x / _y:\n",
    "# move the left-hand copy back under its original name, then discard the right-hand one.\n",
    "valid['class_vals'] = valid.pop('class_vals_x')\n",
    "valid = valid.drop(columns=['class_vals_y'])\n",
    "valid.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac3201b8-6ad9-429f-9d18-7a80a4835718",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restrict train to the cases drawn into the training split.\n",
    "# This overwrites `train`, so the validation cell (which reads the unfiltered frame) must run first.\n",
    "train = train.merge(X_train, on=['case_id'], how='inner')\n",
    "# Both frames carry class_vals, so the merge suffixes it with _x / _y:\n",
    "# move the left-hand copy back under its original name, then discard the right-hand one.\n",
    "train['class_vals'] = train.pop('class_vals_x')\n",
    "train = train.drop(columns=['class_vals_y'])\n",
    "train.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7862267-536c-45c2-9954-53fad6d4a0c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.case_id.nunique(), valid.case_id.nunique(), test.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b64f3658-0b55-4c13-8318-1873769c8a84",
   "metadata": {},
   "outputs": [],
   "source": [
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9f507e9e-d50a-4d13-bf58-abb5aa939958",
   "metadata": {},
   "source": [
    "### Write data in parquet format for future processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce9474cc-4e38-4ba6-83ae-1adaa1da42ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "import cudf\n",
    "import dask_cudf\n",
    "import pandas as pd\n",
    "\n",
    "# Integer id written to parquet for each class label string.\n",
    "_CLASS_VALS_TO_ID = {\n",
    "    'aedes_female': 0,\n",
    "    'aedes_male': 1,\n",
    "    'fruit_flies': 2,\n",
    "    'house_flies': 3,\n",
    "    'quinx_female': 4,\n",
    "    'quinx_male': 5,\n",
    "    'stigma_female': 6,\n",
    "    'stigma_male': 7,\n",
    "    'tarsalis_female': 8,\n",
    "    'tarsalis_male': 9,\n",
    "}\n",
    "\n",
    "def write_parquet(pandas_df, output_dir, npartitions = 2):\n",
    "    \"\"\"Integer-encode `class_vals` and write the frame to `output_dir` as parquet.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    pandas_df : pandas.DataFrame\n",
    "        Frame holding at least the `class_vals` and `case_id` columns.\n",
    "        It is left untouched; the encoding is applied to a copy.\n",
    "    output_dir : str\n",
    "        Destination handed to the dask_cudf `to_parquet` call.\n",
    "    npartitions : int, default 2\n",
    "        Number of dask partitions the GPU frame is split into before writing.\n",
    "\n",
    "    Notes\n",
    "    -----\n",
    "    Values of `class_vals` that are not keys of `_CLASS_VALS_TO_ID` are kept as they are.\n",
    "    A `case_id_seq` column duplicating `case_id` is added before writing.\n",
    "    \"\"\"\n",
    "    # Build the encoded column on a new frame instead of calling replace(inplace=True) on\n",
    "    # pandas_df['class_vals']: that chained in-place update mutates the caller's data and\n",
    "    # does not reach the parent frame at all when pandas Copy-on-Write is active.\n",
    "    encoded_df = pandas_df.assign(\n",
    "        class_vals = pandas_df['class_vals'].replace(_CLASS_VALS_TO_ID)\n",
    "    )\n",
    "    gdf = cudf.from_pandas(encoded_df)\n",
    "    gdf['case_id_seq'] = gdf['case_id']\n",
    "    dask_gdf = dask_cudf.from_cudf(gdf, npartitions = npartitions)\n",
    "    dask_gdf.to_parquet(output_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7705b9de-39a5-4a96-82f2-4b0e16fe549d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "from dask.distributed import Client\n",
    "from dask_cuda import LocalCUDACluster\n",
    "\n",
    "# Start a local Dask CUDA cluster and connect a client to it.\n",
    "# NOTE(review): the 20GB RMM pool looks tuned for the original machine - confirm it fits the target GPU.\n",
    "cluster = LocalCUDACluster(\n",
    "    memory_limit='auto',\n",
    "    device_memory_limit=0.2,\n",
    "    rmm_pool_size='20GB',\n",
    "    rmm_managed_memory=True,\n",
    ")\n",
    "client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb49ccab-f1c6-485f-8884-ee2fa002d8a6",
   "metadata": {},
   "source": [
    "__Train Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02e90067-0745-4a6d-80ff-bbca2b569d71",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "write_parquet(train, product['InsectWingbeat_TRAIN_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0b0c2e9-551c-463b-8972-bfdf6742a6ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf = dask_cudf.read_parquet(product['InsectWingbeat_TRAIN_RAW'])\n",
    "train_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "730fbc96-1ebb-4291-a6b7-75ef422d58c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf['reading_id'].min().compute(), train_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cfb42ee-c577-42cd-9ba4-1be52b92fcd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c079a27d-9865-4cd1-8185-90ce5fbb1c2b",
   "metadata": {},
   "source": [
    "__Valid Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9611b3b-d46a-46b2-b2e9-bef941fcb930",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "write_parquet(valid, product['InsectWingbeat_VALID_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be61baa1-5166-4a9b-86ec-eb92e35072de",
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf = dask_cudf.read_parquet(product['InsectWingbeat_VALID_RAW'])\n",
    "valid_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06f74c9a-53b8-43e4-8bd3-14bddefe2b28",
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf['reading_id'].min().compute(), valid_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d83b7a9c-8ffc-47aa-8112-9dc4af7265d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2a8525f8-7e0c-410d-ad9f-0ae2a10146d8",
   "metadata": {},
   "source": [
    "__Test Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91f773d7-b8d7-4a00-8cdc-60c43e76795f",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "write_parquet(test, product['InsectWingbeat_TEST_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "995171de-0a48-47a6-aa45-7ab989acc023",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf = dask_cudf.read_parquet(product['InsectWingbeat_TEST_RAW'])\n",
    "test_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9846bb7-2193-4d63-b88e-b6e5c9980848",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf['reading_id'].min().compute(), test_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6aa1aceb-ce0d-4a70-b130-5629e272b936",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9a8b8f3-1906-474a-b6b5-e40f16dfc8c9",
   "metadata": {},
   "source": [
    "__Shut down the Dask cluster and client__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3c8381c-f105-49e7-bac3-416e0d32aade",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "client.shutdown()\n",
    "client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fcb38cfb-52ed-4a48-8214-9d6b6ef6f6be",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nbdev import nbdev_export\n",
    "nbdev_export()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a6f1013-3590-4c7f-8751-7422d850500d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rapids-22.08_ploomber",
   "language": "python",
   "name": "rapids-22.08_ploomber"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "papermill": {
   "environment_variables": {},
   "parameters": {
    "product": {
     "InsectWingbeat_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/test",
     "InsectWingbeat_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/train",
     "InsectWingbeat_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/valid",
     "nb": "/home/ubuntu/vitmtsc_nbdev/output/102_data.insect_wingbeat.html"
    },
    "upstream": {
     "core": {
      "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
      "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
      "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
      "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
      "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
      "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
      "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
      "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
      "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
      "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
      "nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html"
     }
    }
   },
   "version": null
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
