{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| default_exp feature_preprocessing.spoken_arabic_digits.tabular_to_timeseries\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# declare a list of tasks whose products you want to use as inputs\n",
    "upstream = ['feature_preprocessing_spoken_arabic_digits']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| hide\n",
    "from nbdev.showdoc import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "from vitmtsc import *\n",
    "from vitmtsc.core import *\n",
    "from vitmtsc.data.spoken_arabic_digits import *\n",
    "from vitmtsc.feature_preprocessing.spoken_arabic_digits.target_encoding import *\n",
    "import os\n",
    "import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "upstream = {\n",
    "    \"feature_preprocessing_spoken_arabic_digits\": {\n",
    "        \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/204_feature_preprocessing.spoken_arabic_digits.target_encoding.html\",\n",
    "        \"SpokenArabicDigits_TRAIN_TE\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/train\",\n",
    "        \"SpokenArabicDigits_VALID_TE\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/valid\",\n",
    "        \"SpokenArabicDigits_TEST_TE\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/test\",\n",
    "        \"SpokenArabicDigits_workflow_dir\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/nvtabular_workflow\",\n",
    "    }\n",
    "}\n",
    "product = {\n",
    "    \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/304_feature_preprocessing.spoken_arabic_digits.tabular_to_timeseries.html\",\n",
    "    \"SpokenArabicDigits_TRAIN_MODEL_INPUT\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/train\",\n",
    "    \"SpokenArabicDigits_VALID_MODEL_INPUT\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/valid\",\n",
    "    \"SpokenArabicDigits_TEST_MODEL_INPUT\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/test\",\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Preprocessing for Neural Networks - III\n",
    "\n",
    "> Convert target-encoded data from tabular to time-series format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.distributed import Client\n",
    "from dask_cuda import LocalCUDACluster\n",
    "\n",
    "# Start a local Dask-CUDA cluster and attach a client to it;\n",
    "# the client is the cell's last expression so its summary is displayed.\n",
    "cluster = LocalCUDACluster(\n",
    "    memory_limit='auto',\n",
    "    device_memory_limit=0.5,\n",
    "    rmm_pool_size='20GB',\n",
    "    rmm_managed_memory=True,\n",
    ")\n",
    "client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "DATASET_NAME = 'SpokenArabicDigits'\n",
    "SEQUENCE_LENGTH = 93\n",
    "NUMBER_OF_FEATURES = 13\n",
    "NUM_TARGET = 10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Convert from Tabular to Time-Series Format__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "# Feature column names 'dim_0' ... 'dim_<NUMBER_OF_FEATURES - 1>'.\n",
    "# Derived from NUMBER_OF_FEATURES so the column list and the feature count\n",
    "# passed to the conversion step cannot drift apart.\n",
    "MTSC_COLUMN_NAMES = [f'dim_{i}' for i in range(NUMBER_OF_FEATURES)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "# Full column list: identifier columns, then the feature columns, then the label column.\n",
    "ALL_COLUMNS = [\n",
    "    'case_id',\n",
    "    'case_id_seq',\n",
    "    'reading_id',\n",
    "    *MTSC_COLUMN_NAMES,\n",
    "    'class_vals',\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Input Data Location__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve the train/valid/test input directories from the upstream task's products.\n",
    "target_encoded_train_dir, target_encoded_valid_dir, target_encoded_test_dir = (\n",
    "    os.path.join(\"./\", upstream['feature_preprocessing_spoken_arabic_digits'][split_key])\n",
    "    for split_key in (\n",
    "        'SpokenArabicDigits_TRAIN_TE',\n",
    "        'SpokenArabicDigits_VALID_TE',\n",
    "        'SpokenArabicDigits_TEST_TE',\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Output Data Location__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve the train/valid/test output directories from this task's products.\n",
    "output_train_dir, output_valid_dir, output_test_dir = (\n",
    "    os.path.join(\"./\", product[split_key])\n",
    "    for split_key in (\n",
    "        'SpokenArabicDigits_TRAIN_MODEL_INPUT',\n",
    "        'SpokenArabicDigits_VALID_MODEL_INPUT',\n",
    "        'SpokenArabicDigits_TEST_MODEL_INPUT',\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output directories (parents included; no error if they already exist).\n",
    "# Done in-process rather than via `!mkdir -p $var`, which hands the path to the\n",
    "# shell unquoted (breaks on spaces / shell metacharacters) and whose failures\n",
    "# do not stop the notebook.\n",
    "for out_dir in (output_train_dir, output_valid_dir, output_test_dir):\n",
    "    os.makedirs(out_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Train Dataset Conversion </center>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Tabular to Time-Series format conversion__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Convert the target-encoded training split from tabular rows to time-series records.\n",
    "convert_from_tabular_to_timeseries_format(\n",
    "    input_dir=target_encoded_train_dir,\n",
    "    output_dir=output_train_dir,\n",
    "    all_columns=ALL_COLUMNS,\n",
    "    mtsc_column_names=MTSC_COLUMN_NAMES,\n",
    "    number_of_features=NUMBER_OF_FEATURES,\n",
    "    seq_len=SEQUENCE_LENGTH,\n",
    "    chunk_size_processing=50000,\n",
    "    chunk_size_file=10000,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Valid Dataset Conversion </center>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Tabular to Time-Series format conversion__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Convert the target-encoded validation split from tabular rows to time-series records.\n",
    "convert_from_tabular_to_timeseries_format(\n",
    "    input_dir=target_encoded_valid_dir,\n",
    "    output_dir=output_valid_dir,\n",
    "    all_columns=ALL_COLUMNS,\n",
    "    mtsc_column_names=MTSC_COLUMN_NAMES,\n",
    "    number_of_features=NUMBER_OF_FEATURES,\n",
    "    seq_len=SEQUENCE_LENGTH,\n",
    "    chunk_size_processing=50000,\n",
    "    chunk_size_file=10000,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Test Dataset Conversion </center>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Tabular to Time-Series format conversion__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Convert the target-encoded test split from tabular rows to time-series records.\n",
    "convert_from_tabular_to_timeseries_format(\n",
    "    input_dir=target_encoded_test_dir,\n",
    "    output_dir=output_test_dir,\n",
    "    all_columns=ALL_COLUMNS,\n",
    "    mtsc_column_names=MTSC_COLUMN_NAMES,\n",
    "    number_of_features=NUMBER_OF_FEATURES,\n",
    "    seq_len=SEQUENCE_LENGTH,\n",
    "    chunk_size_processing=50000,\n",
    "    chunk_size_file=10000,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Verify Datasets </center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "import dask_cudf\n",
    "train_gdf = dask_cudf.read_parquet(output_train_dir)\n",
    "train_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# (distinct case ids, distinct class labels) in the converted training set\n",
    "tuple(\n",
    "    train_gdf[column].nunique().compute()\n",
    "    for column in ('case_id', 'class_vals')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "import dask_cudf\n",
    "valid_gdf = dask_cudf.read_parquet(output_valid_dir)\n",
    "valid_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# (distinct case ids, distinct class labels) in the converted validation set\n",
    "tuple(\n",
    "    valid_gdf[column].nunique().compute()\n",
    "    for column in ('case_id', 'class_vals')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "import dask_cudf\n",
    "test_gdf = dask_cudf.read_parquet(output_test_dir)\n",
    "test_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# (distinct case ids, distinct class labels) in the converted test set\n",
    "tuple(\n",
    "    test_gdf[column].nunique().compute()\n",
    "    for column in ('case_id', 'class_vals')\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Shut down the Dask cluster and close the client__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "client.shutdown()\n",
    "client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nbdev import nbdev_export\n",
    "nbdev_export()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rapids-22.08_ploomber",
   "language": "python",
   "name": "rapids-22.08_ploomber"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "papermill": {
   "environment_variables": {},
   "parameters": {
    "product": {
     "SpokenArabicDigits_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/test",
     "SpokenArabicDigits_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/train",
     "SpokenArabicDigits_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/valid",
     "nb": "/home/ubuntu/vitmtsc_nbdev/output/304_feature_preprocessing.spoken_arabic_digits.tabular_to_timeseries.html"
    },
    "upstream": {
     "feature_preprocessing_spoken_arabic_digits": {
      "SpokenArabicDigits_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/test",
      "SpokenArabicDigits_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/train",
      "SpokenArabicDigits_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/valid",
      "SpokenArabicDigits_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/nvtabular_workflow",
      "nb": "/home/ubuntu/vitmtsc_nbdev/output/204_feature_preprocessing.spoken_arabic_digits.target_encoding.html"
     }
    }
   },
   "version": null
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
