{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40766638-249f-4526-82fe-b20711e32d10",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| default_exp data.spoken_arabic_digits\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51538003-c99a-4b01-ac14-e94411dc4b12",
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Declare the list of upstream tasks whose products this notebook uses as inputs.\n",
    "upstream = ['core']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "536ee6e4-dbb5-4b33-a980-cbba70974f61",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| hide\n",
    "from nbdev.showdoc import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62a556b4-3702-417f-b482-2d1c0c445028",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "from vitmtsc import *\n",
    "from vitmtsc.core import *\n",
    "import dask_cudf\n",
    "import gc   #garbage collector interface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c683578-97e9-43e3-b585-e0039a8af74f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "# Concrete input/output locations for this step. This rebinds `upstream`, which the\n",
    "# parameters cell declared as a plain list, to a dict keyed by task name. The same\n",
    "# values are recorded under `papermill.parameters` in this notebook's metadata.\n",
    "# Only the two SpokenArabicDigits_*_TS entries are read in this notebook.\n",
    "upstream = {\n",
    "    \"core\": {\n",
    "        \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/00_core.html\",\n",
    "        \"FaceDetection_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts\",\n",
    "        \"FaceDetection_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts\",\n",
    "        \"InsectWingbeat_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts\",\n",
    "        \"InsectWingbeat_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts\",\n",
    "        \"PenDigits_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts\",\n",
    "        \"PenDigits_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts\",\n",
    "        \"SpokenArabicDigits_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts\",\n",
    "        \"SpokenArabicDigits_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts\",\n",
    "        \"CharacterTrajectories_TRAIN_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts\",\n",
    "        \"CharacterTrajectories_TEST_TS\": \"/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts\",\n",
    "    }\n",
    "}\n",
    "# Outputs of this step: the `nb` path plus the directories that the train, valid\n",
    "# and test parquet data are written to further down.\n",
    "product = {\n",
    "    \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/104_data.spoken_arabic_digits.html\",\n",
    "    \"SpokenArabicDigits_TRAIN_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/train\",\n",
    "    \"SpokenArabicDigits_VALID_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/valid\",\n",
    "    \"SpokenArabicDigits_TEST_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/test\",\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e626d9df-d6dc-4ab2-a303-fb02a2df4423",
   "metadata": {},
   "source": [
    "# <center> Data Download and Conversion </center>\n",
    "\n",
    "## <center> [SpokenArabicDigits](https://www.timeseriesclassification.com/description.php?Dataset=SpokenArabicDigits) dataset </center>\n",
    "\n",
    "> Convert dataset to parquet format to run target encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50056655-969d-47ee-8781-21042bf3dcb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "# Name of the dataset this module handles; not referenced again in this notebook.\n",
    "DATASET_NAME = 'SpokenArabicDigits'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "216da507-b883-4803-b813-b61f31103dff",
   "metadata": {},
   "source": [
    "### Download and Convert dataset in tabular format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92b19b5b-9cff-4816-9368-4cea4a762656",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Read the upstream training .ts file into a tabular frame and show its shape.\n",
    "# get_mtsc_data_tabular_from_ts is not defined in this notebook; presumably it\n",
    "# arrives through the vitmtsc star imports (TODO confirm).\n",
    "train = get_mtsc_data_tabular_from_ts(upstream['core']['SpokenArabicDigits_TRAIN_TS'])\n",
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddf42fca-94fb-4159-b2a7-1ec472e666ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first rows of the training table.\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6678703-96bb-470d-96cf-dd0c3aff7dd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smallest and largest reading_id in the training table.\n",
    "train['reading_id'].min(), train['reading_id'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c244697-950a-4afe-bcec-b8a7df05e24d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct class labels present in the training table.\n",
    "train['class_vals'].unique()"
   ]
  },
   "cell_type": "code",
   "execution_count": null,
   "id": "5b8d2d85-c5ca-4b33-81b9-9a1809f29b56",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "test = get_mtsc_data_tabular_from_ts(upstream['core']['SpokenArabicDigits_TEST_TS'])\n",
    "test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98513e92-5e8b-461b-9f05-6137c6bf644f",
   "metadata": {},
   "outputs": [],
   "source": [
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b3faa10-6391-4bc8-a6af-59de4642c849",
   "metadata": {},
   "outputs": [],
   "source": [
    "test['reading_id'].min(), test['reading_id'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2348cd4-e770-41fa-b002-b75018fdcad1",
   "metadata": {},
   "outputs": [],
   "source": [
    "test['class_vals'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bf3545c-e254-407f-aab3-7b3ae5b2bd81",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Split on distinct (case_id, class_vals) pairs rather than on individual rows of\n",
    "# `train`; 80% of them go to the training side, with a fixed seed.\n",
    "X = train.loc[:, ['case_id', 'class_vals']].drop_duplicates()\n",
    "X_train, X_val, y_train, y_val = train_test_split(\n",
    "    X,\n",
    "    X['class_vals'],\n",
    "    train_size=0.8,\n",
    "    random_state=42,\n",
    ")\n",
    "# Number of distinct cases on each side of the split.\n",
    "X_train['case_id'].nunique(), X_val['case_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dab03352-f337-4022-a9c2-958c499a4f20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-class counts on the training side of the split (missing labels kept as a group).\n",
    "X_train.groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "017fc3d7-98e9-47cc-b85e-bd99b74365e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-class counts on the validation side of the split.\n",
    "X_val.groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7cf636f-6b85-4510-8594-ec2d4a5a5226",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-class counts of distinct (case_id, class_vals) pairs in the test table.\n",
    "test[['case_id', 'class_vals']].drop_duplicates().groupby(by = ['class_vals'], dropna = False).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "433c7e5e-d70e-41bc-b8cd-e9d2271a42cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the rows of `train` whose case_id is in the validation split. Both frames\n",
    "# carry `class_vals`, so the merge produces `_x`/`_y` copies; the `_x` copy (from\n",
    "# `train`) is re-attached under the original name as the last column.\n",
    "valid = train.merge(X_val, how='inner', on=['case_id'])\n",
    "valid = (\n",
    "    valid\n",
    "    .assign(class_vals=valid['class_vals_x'])\n",
    "    .drop(columns=['class_vals_x', 'class_vals_y'])\n",
    ")\n",
    "valid['case_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7ad81ed-9e18-466c-a28f-21af4a93f0fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restrict `train` to the cases on the training side of the split.\n",
    "# NOTE: this rebinds `train`, so any cell that needs the full table (such as the\n",
    "# one that builds `valid` from `train` and `X_val`) has to run before this one.\n",
    "train = train.merge(X_train, how='inner', on=['case_id'])\n",
    "train = (\n",
    "    train\n",
    "    .assign(class_vals=train['class_vals_x'])\n",
    "    .drop(columns=['class_vals_x', 'class_vals_y'])\n",
    ")\n",
    "train['case_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d355ee2-90fd-43f1-b410-dd9cf12208fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct case counts for the train, validation and test tables.\n",
    "train.case_id.nunique(), valid.case_id.nunique(), test.case_id.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af8bdaba-815a-4378-b38e-19bc4710cdac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the training table as it stands after the split.\n",
    "train"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9f507e9e-d50a-4d13-bf58-abb5aa939958",
   "metadata": {},
   "source": [
    "### Write data in parquet format for future processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce9474cc-4e38-4ba6-83ae-1adaa1da42ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "import cudf\n",
    "import dask_cudf\n",
    "import pandas as pd\n",
    "\n",
    "def write_parquet(pandas_df, output_dir, npartitions = 2):\n",
    "    \"\"\"Write `pandas_df` to `output_dir` as a parquet dataset using dask_cudf.\n",
    "\n",
    "    The string labels '1'..'10' in `class_vals` are re-coded to the integers 0..9\n",
    "    (any other value is left as it is), and `case_id` is copied into an extra\n",
    "    `case_id_seq` column. `pandas_df` itself is not modified.\n",
    "\n",
    "    Args:\n",
    "        pandas_df: pandas DataFrame containing `class_vals` and `case_id` columns.\n",
    "        output_dir: destination handed to `to_parquet`.\n",
    "        npartitions: number of dask_cudf partitions to split the frame into.\n",
    "    \"\"\"\n",
    "    labels = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']\n",
    "    codes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]\n",
    "    # Build the re-coded column on a new frame instead of calling\n",
    "    # replace(..., inplace=True) on `pandas_df['class_vals']`: that chained in-place\n",
    "    # form rewrote the caller's frame as a side effect, and it is the pattern pandas\n",
    "    # warns about because it stops updating the parent under Copy-on-Write.\n",
    "    # infer_objects() keeps the integer dtype on pandas versions where replace()\n",
    "    # no longer down-casts an object column by itself.\n",
    "    recoded = pandas_df['class_vals'].replace(labels, codes).infer_objects()\n",
    "    encoded_df = pandas_df.assign(class_vals = recoded)\n",
    "    gdf = cudf.from_pandas(encoded_df)\n",
    "    gdf['case_id_seq'] = gdf['case_id']\n",
    "    dask_gdf = dask_cudf.from_cudf(gdf, npartitions = npartitions)\n",
    "    dask_gdf.to_parquet(output_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7705b9de-39a5-4a96-82f2-4b0e16fe549d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "from dask.distributed import Client\n",
    "from dask_cuda import LocalCUDACluster\n",
    "\n",
    "# Start a local Dask CUDA cluster and connect a client to it before the\n",
    "# dask_cudf parquet writes and reads below.\n",
    "cluster = LocalCUDACluster(\n",
    "    memory_limit='auto',\n",
    "    device_memory_limit=0.2,\n",
    "    rmm_pool_size='20GB',\n",
    "    rmm_managed_memory=True,\n",
    ")\n",
    "client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb49ccab-f1c6-485f-8884-ee2fa002d8a6",
   "metadata": {},
   "source": [
    "__Train Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02e90067-0745-4a6d-80ff-bbca2b569d71",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Write the training table to the TRAIN_RAW parquet directory.\n",
    "write_parquet(train, product['SpokenArabicDigits_TRAIN_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0b0c2e9-551c-463b-8972-bfdf6742a6ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the written parquet data back with dask_cudf and preview it.\n",
    "train_gdf = dask_cudf.read_parquet(product['SpokenArabicDigits_TRAIN_RAW'])\n",
    "train_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "730fbc96-1ebb-4291-a6b7-75ef422d58c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# reading_id range of the data read back from parquet.\n",
    "train_gdf['reading_id'].min().compute(), train_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e04a10f-feb4-4778-8eda-584c62b8ef2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct case count of the data read back from parquet.\n",
    "train_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21a01873-beb0-4d7c-9051-335e5aa8d8ac",
   "metadata": {},
   "source": [
    "__Valid Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "487b9f72-6a60-416c-bf54-7a8cd2a54c59",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Write the validation table to the VALID_RAW parquet directory.\n",
    "write_parquet(valid, product['SpokenArabicDigits_VALID_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e41a881b-3aeb-4dde-8d1f-3fe9935c0409",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the written parquet data back with dask_cudf and preview it.\n",
    "valid_gdf = dask_cudf.read_parquet(product['SpokenArabicDigits_VALID_RAW'])\n",
    "valid_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d1de4e6f-1046-4fe8-b98b-22773fcf9568",
   "metadata": {},
   "outputs": [],
   "source": [
    "# reading_id range of the data read back from parquet.\n",
    "valid_gdf['reading_id'].min().compute(), valid_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6dd5232b-a641-4925-b79d-69fbd4cf0323",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct case count of the data read back from parquet.\n",
    "valid_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c1691cf1-9bd9-48be-af12-483ced75dd01",
   "metadata": {},
   "source": [
    "__Test Dataset__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91f773d7-b8d7-4a00-8cdc-60c43e76795f",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Write the test table to the TEST_RAW parquet directory.\n",
    "write_parquet(test, product['SpokenArabicDigits_TEST_RAW'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "995171de-0a48-47a6-aa45-7ab989acc023",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the written parquet data back with dask_cudf and preview it.\n",
    "test_gdf = dask_cudf.read_parquet(product['SpokenArabicDigits_TEST_RAW'])\n",
    "test_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9846bb7-2193-4d63-b88e-b6e5c9980848",
   "metadata": {},
   "outputs": [],
   "source": [
    "# reading_id range of the data read back from parquet.\n",
    "test_gdf['reading_id'].min().compute(), test_gdf['reading_id'].max().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6aa1aceb-ce0d-4a70-b130-5629e272b936",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct case count of the data read back from parquet.\n",
    "test_gdf.case_id.nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9a8b8f3-1906-474a-b6b5-e40f16dfc8c9",
   "metadata": {},
   "source": [
    "__Shut down the Dask cluster and client to release their resources__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3c8381c-f105-49e7-bac3-416e0d32aade",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Tear down the Dask resources created earlier: shutdown() is expected to stop the\n",
    "# scheduler and workers behind `client` (per the dask.distributed docs - confirm),\n",
    "# then close() drops the client connection.\n",
    "client.shutdown()\n",
    "client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fcb38cfb-52ed-4a48-8214-9d6b6ef6f6be",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nbdev import nbdev_export\n",
    "# Run nbdev's export; presumably this writes the cells marked `#| export` to the\n",
    "# module named by the `#| default_exp` directive in the first cell (confirm).\n",
    "nbdev_export()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rapids-22.08_ploomber",
   "language": "python",
   "name": "rapids-22.08_ploomber"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "papermill": {
   "environment_variables": {},
   "parameters": {
    "product": {
     "SpokenArabicDigits_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/test",
     "SpokenArabicDigits_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/train",
     "SpokenArabicDigits_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/valid",
     "nb": "/home/ubuntu/vitmtsc_nbdev/output/104_data.spoken_arabic_digits.html"
    },
    "upstream": {
     "core": {
      "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
      "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
      "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
      "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
      "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
      "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
      "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
      "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
      "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
      "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
      "nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html"
     }
    }
   },
   "version": null
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
