{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0855d5f0-2732-40ff-be9f-fba104668d7a",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| default_exp data.face_detection\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "969a8044-4f6a-42d1-90a8-ff9b0bb3586e",
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# declare a list tasks whose products you want to use as inputs\n",
    "upstream = ['core']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "536ee6e4-dbb5-4b33-a980-cbba70974f61",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| hide\n",
    "from nbdev.showdoc import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc67dac1-f8fe-4055-84f9-683ce38a1c63",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "from vitmtsc import *\n",
    "from vitmtsc.core import *\n",
    "import dask_cudf\n",
    "import gc   #garbage collector interface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe5b9e8d-ef7b-4afd-ba3b-82f9d1382b2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# |export\n",
    "upstream = {\n",
    "    \"core\": {\n",
    "        \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/00_core.html\",\n",
    "        \"FaceDetection_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts\",\n",
    "        \"FaceDetection_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts\",\n",
    "        \"InsectWingbeat_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts\",\n",
    "        \"InsectWingbeat_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts\",\n",
    "        \"PenDigits_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts\",\n",
    "        \"PenDigits_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts\",\n",
    "        \"SpokenArabicDigits_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts\",\n",
    "        \"SpokenArabicDigits_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts\",\n",
    "        \"CharacterTrajectories_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts\",\n",
    "        \"CharacterTrajectories_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts\",\n",
    "    }\n",
    "}\n",
    "product = {\n",
    "    \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/101_data.face_detection.html\",\n",
    "    \"FaceDetection_TRAIN_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/train\",\n",
    "    \"FaceDetection_VALID_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/valid\",\n",
    "    \"FaceDetection_TEST_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/test\",\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e626d9df-d6dc-4ab2-a303-fb02a2df4423",
   "metadata": {},
   "source": [
    "# <center> Data Download and Conversion </center>\n",
    "\n",
    "## <center> [FaceDetection](https://www.timeseriesclassification.com/description.php?Dataset=FaceDetection) dataset </center>\n",
    "\n",
    "> Convert dataset to parquet format to run target encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50056655-969d-47ee-8781-21042bf3dcb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "DATASET_NAME = 'FaceDetection'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "216da507-b883-4803-b813-b61f31103dff",
   "metadata": {},
   "source": [
    "### Download and Convert dataset in tabular format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92b19b5b-9cff-4816-9368-4cea4a762656",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "train = get_mtsc_data_tabular_from_ts(upstream['core']['FaceDetection_TRAIN_TS'])\n",
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddf42fca-94fb-4159-b2a7-1ec472e666ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6678703-96bb-470d-96cf-dd0c3aff7dd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "train['reading_id'].min(), train['reading_id'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c244697-950a-4afe-bcec-b8a7df05e24d",
   "metadata": {},
   "outputs": [],
   "source": [
    "train['class_vals'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b8d2d85-c5ca-4b33-81b9-9a1809f29b56",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "test = get_mtsc_data_tabular_from_ts(upstream['core']['FaceDetection_TEST_TS'])\n",
    "test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98513e92-5e8b-461b-9f05-6137c6bf644f",
   "metadata": {},
   "outputs": [],
   "source": [
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b3faa10-6391-4bc8-a6af-59de4642c849",
   "metadata": {},
   "outputs": [],
   "source": [
    "test['reading_id'].min(), test['reading_id'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2348cd4-e770-41fa-b002-b75018fdcad1",
   "metadata": {},
   "outputs": [],
   "source": [
    "test['class_vals'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ed25f2d-ae2d-40d1-89f2-f61a3dccd52f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X = train[['case_id', 'class_vals']].drop_duplicates()\n",
    "X_train, X_val, y_train, y_val = train_test_split(X, X['class_vals'], train_size=0.8, random_state = 42)\n",
    "X_train.case_id.nunique(), X_val.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3f0d7fb-b36d-4b51-b77e-2c8b2a7dc925",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train.groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "390270c5-ff8a-4873-a95a-a2aa6081adf1",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_val.groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2924a290-dec7-457e-8739-f8e2aa951051",
   "metadata": {},
   "outputs": [],
   "source": [
    "test[['case_id', 'class_vals']].drop_duplicates().groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "017cf10a-859c-47fd-9f18-5fab2ee6a14b",
   "metadata": {},
   "outputs": [],
   "source": [
    "valid = train.merge(X_val, on=['case_id'], how='inner')\n",
    "valid['class_vals'] = valid['class_vals_x']\n",
    "valid = valid.drop(columns=['class_vals_x','class_vals_y'])\n",
    "valid.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b8f7acd-55ed-417d-aca9-4f57d17ae2e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = train.merge(X_train, on=['case_id'], how='inner')\n",
    "train['class_vals'] = train['class_vals_x']\n",
    "train = train.drop(columns=['class_vals_x','class_vals_y'])\n",
    "train.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10b67626-1023-42e6-b41b-2ee240e202fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.case_id.nunique(), valid.case_id.nunique(), test.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "894abc20-c613-495d-9060-9eed5eaf5460",
   "metadata": {},
   "outputs": [],
   "source": [
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9f507e9e-d50a-4d13-bf58-abb5aa939958",
   "metadata": {},
   "source": [
    "### Write data in parquet format for future processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce9474cc-4e38-4ba6-83ae-1adaa1da42ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "import cudf\n",
    "import dask_cudf\n",
    "import pandas as pd\n",
    "\n",
    "def write_parquet(pandas_df, output_dir, npartitions = 2):\n",
    "    pandas_df['class_vals'].replace(['0', '1'],\n",
    "                                    [0, 1], inplace = True)\n",
    "    gdf = cudf.from_pandas(pandas_df)\n",
    "    gdf['case_id_seq'] = gdf['case_id']\n",
    "    dask_gdf = dask_cudf.from_cudf(gdf, npartitions = npartitions)\n",
    "    dask_gdf.to_parquet(output_dir)   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7705b9de-39a5-4a96-82f2-4b0e16fe549d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "from dask.distributed import Client\n",
    "from dask_cuda import LocalCUDACluster\n",
    "\n",
    "cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.2, rmm_pool_size='20GB', rmm_managed_memory=True)\n",
    "client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb49ccab-f1c6-485f-8884-ee2fa002d8a6",
   "metadata": {},
   "source": [
    "__Train Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02e90067-0745-4a6d-80ff-bbca2b569d71",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "write_parquet(train, product['FaceDetection_TRAIN_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0b0c2e9-551c-463b-8972-bfdf6742a6ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf = dask_cudf.read_parquet(product['FaceDetection_TRAIN_RAW'])\n",
    "train_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "730fbc96-1ebb-4291-a6b7-75ef422d58c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf['reading_id'].min().compute(), train_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cfb42ee-c577-42cd-9ba4-1be52b92fcd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dca35186-80a6-463e-9ad1-f09deda35def",
   "metadata": {},
   "source": [
    "__Valid Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3146d178-683f-4c88-b1e1-cd711dcbed52",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "write_parquet(valid, product['FaceDetection_VALID_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4fff9427-5a5a-4b06-b77c-992fb1296a93",
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf = dask_cudf.read_parquet(product['FaceDetection_VALID_RAW'])\n",
    "valid_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d991ae6a-1eed-49ef-93cc-29de8bcdf60e",
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf['reading_id'].min().compute(), valid_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89b5dd1f-b0d9-4c37-be34-33a4f5be480d",
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c557f00-c80d-411f-9199-1a2f6e69becd",
   "metadata": {},
   "source": [
    "__Test Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91f773d7-b8d7-4a00-8cdc-60c43e76795f",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "write_parquet(test, product['FaceDetection_TEST_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "995171de-0a48-47a6-aa45-7ab989acc023",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf = dask_cudf.read_parquet(product['FaceDetection_TEST_RAW'])\n",
    "test_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9846bb7-2193-4d63-b88e-b6e5c9980848",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf['reading_id'].min().compute(), test_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6aa1aceb-ce0d-4a70-b130-5629e272b936",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9a8b8f3-1906-474a-b6b5-e40f16dfc8c9",
   "metadata": {},
   "source": [
    "__We reset the kernel!!!__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3c8381c-f105-49e7-bac3-416e0d32aade",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "client.shutdown()\n",
    "client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fcb38cfb-52ed-4a48-8214-9d6b6ef6f6be",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nbdev import nbdev_export\n",
    "nbdev_export()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rapids-22.08_ploomber",
   "language": "python",
   "name": "rapids-22.08_ploomber"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "papermill": {
   "environment_variables": {},
   "parameters": {
    "product": {
     "FaceDetection_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/test",
     "FaceDetection_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/train",
     "FaceDetection_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/valid",
     "nb": "/home/ubuntu/vitmtsc_nbdev/output/101_data.face_detection.html"
    },
    "upstream": {
     "core": {
      "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
      "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
      "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
      "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
      "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
      "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
      "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
      "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
      "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
      "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
      "nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html"
     }
    }
   },
   "version": null
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
