{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| default_exp feature_preprocessing.insect_wingbeat.tabular_to_timeseries\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# declare a list of tasks whose products you want to use as inputs\n",
    "upstream = ['feature_preprocessing_insect_wingbeat']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| hide\n",
    "from nbdev.showdoc import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "from vitmtsc import *\n",
    "from vitmtsc.core import *\n",
    "from vitmtsc.data.insect_wingbeat import *\n",
    "from vitmtsc.feature_preprocessing.insect_wingbeat.target_encoding import *\n",
    "import os\n",
    "import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "upstream = {\n",
    "    \"feature_preprocessing_insect_wingbeat\": {\n",
    "        \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/202_feature_preprocessing.insect_wingbeat.target_encoding.html\",\n",
    "        \"InsectWingbeat_TRAIN_TE\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/train\",\n",
    "        \"InsectWingbeat_VALID_TE\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/valid\",\n",
    "        \"InsectWingbeat_TEST_TE\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/test\",\n",
    "        \"InsectWingbeat_workflow_dir\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/nvtabular_workflow\",\n",
    "    }\n",
    "}\n",
    "product = {\n",
    "    \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/302_feature_preprocessing.insect_wingbeat.tabular_to_timeseries.html\",\n",
    "    \"InsectWingbeat_TRAIN_MODEL_INPUT\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/train\",\n",
    "    \"InsectWingbeat_VALID_MODEL_INPUT\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/valid\",\n",
    "    \"InsectWingbeat_TEST_MODEL_INPUT\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/test\",\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Preprocessing for Neural Networks - III\n",
    "\n",
    "> Convert Target Encoding data from tabular to time-series format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.distributed import Client\n",
    "from dask_cuda import LocalCUDACluster\n",
    "\n",
    "# Start a local Dask-CUDA cluster, connect a client to it, and display the client.\n",
    "cluster = LocalCUDACluster(\n",
    "    memory_limit='auto',\n",
    "    device_memory_limit=0.5,\n",
    "    rmm_pool_size='20GB',\n",
    "    rmm_managed_memory=True,\n",
    ")\n",
    "client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "DATASET_NAME = 'InsectWingbeat'\n",
    "SEQUENCE_LENGTH = 22\n",
    "NUMBER_OF_FEATURES = 200\n",
    "NUM_TARGET = 10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Convert from Tabular to Time-Series Format__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "# One column name per feature dimension, in numeric order: 'dim_0', 'dim_1', ...\n",
    "# Derived from NUMBER_OF_FEATURES (200 here, so the last name is 'dim_199') so the\n",
    "# list cannot drift from the declared feature count.\n",
    "MTSC_COLUMN_NAMES = [f'dim_{i}' for i in range(NUMBER_OF_FEATURES)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "# Column order: the three leading columns, every MTSC feature column, then 'class_vals' last.\n",
    "ALL_COLUMNS = ['case_id', 'case_id_seq', 'reading_id', *MTSC_COLUMN_NAMES, 'class_vals']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Input Data Location__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input directories: the train/valid/test products of the upstream target-encoding task.\n",
    "te_products = upstream['feature_preprocessing_insect_wingbeat']\n",
    "target_encoded_train_dir = os.path.join(\"./\", te_products['InsectWingbeat_TRAIN_TE'])\n",
    "target_encoded_valid_dir = os.path.join(\"./\", te_products['InsectWingbeat_VALID_TE'])\n",
    "target_encoded_test_dir = os.path.join(\"./\", te_products['InsectWingbeat_TEST_TE'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Output Data Location__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output directories for the train/valid/test products of this notebook.\n",
    "output_train_dir, output_valid_dir, output_test_dir = (\n",
    "    os.path.join(\"./\", product['InsectWingbeat_TRAIN_MODEL_INPUT']),\n",
    "    os.path.join(\"./\", product['InsectWingbeat_VALID_MODEL_INPUT']),\n",
    "    os.path.join(\"./\", product['InsectWingbeat_TEST_MODEL_INPUT']),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output directories from Python instead of `!mkdir -p`: the paths are not\n",
    "# re-parsed by a shell (spaces and metacharacters are safe), and a failure raises here\n",
    "# instead of printing a message and letting later cells run against a missing directory.\n",
    "# exist_ok=True keeps the cell idempotent on re-runs.\n",
    "for split_output_dir in (output_train_dir, output_valid_dir, output_test_dir):\n",
    "    os.makedirs(split_output_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Train Dataset Conversion </center>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Tabular to Time-Series format conversion__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Train split: tabular -> time-series conversion.\n",
    "# The helper is not defined in this notebook; presumably it comes from the vitmtsc star imports.\n",
    "convert_from_tabular_to_timeseries_format(\n",
    "    input_dir=target_encoded_train_dir,\n",
    "    output_dir=output_train_dir,\n",
    "    all_columns=ALL_COLUMNS,\n",
    "    mtsc_column_names=MTSC_COLUMN_NAMES,\n",
    "    chunk_size_processing=50000,\n",
    "    number_of_features=NUMBER_OF_FEATURES,\n",
    "    seq_len=SEQUENCE_LENGTH,\n",
    "    chunk_size_file=10000,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Valid Dataset Conversion </center>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Tabular to Time-Series format conversion__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Validation split: tabular -> time-series conversion.\n",
    "# The helper is not defined in this notebook; presumably it comes from the vitmtsc star imports.\n",
    "convert_from_tabular_to_timeseries_format(\n",
    "    input_dir=target_encoded_valid_dir,\n",
    "    output_dir=output_valid_dir,\n",
    "    all_columns=ALL_COLUMNS,\n",
    "    mtsc_column_names=MTSC_COLUMN_NAMES,\n",
    "    chunk_size_processing=50000,\n",
    "    number_of_features=NUMBER_OF_FEATURES,\n",
    "    seq_len=SEQUENCE_LENGTH,\n",
    "    chunk_size_file=10000,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Test Dataset Conversion </center>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Tabular to Time-Series format conversion__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Test split: tabular -> time-series conversion.\n",
    "# The helper is not defined in this notebook; presumably it comes from the vitmtsc star imports.\n",
    "convert_from_tabular_to_timeseries_format(\n",
    "    input_dir=target_encoded_test_dir,\n",
    "    output_dir=output_test_dir,\n",
    "    all_columns=ALL_COLUMNS,\n",
    "    mtsc_column_names=MTSC_COLUMN_NAMES,\n",
    "    chunk_size_processing=50000,\n",
    "    number_of_features=NUMBER_OF_FEATURES,\n",
    "    seq_len=SEQUENCE_LENGTH,\n",
    "    chunk_size_file=10000,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Verify Datasets </center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "import dask_cudf\n",
    "train_gdf = dask_cudf.read_parquet(output_train_dir)\n",
    "train_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "train_gdf['case_id'].nunique().compute(), train_gdf['class_vals'].nunique().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "import dask_cudf\n",
    "valid_gdf = dask_cudf.read_parquet(output_valid_dir)\n",
    "valid_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "valid_gdf['case_id'].nunique().compute(), valid_gdf['class_vals'].nunique().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "import dask_cudf\n",
    "test_gdf = dask_cudf.read_parquet(output_test_dir)\n",
    "test_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "test_gdf['case_id'].nunique().compute(), test_gdf['class_vals'].nunique().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Shut down the Dask cluster and close the client__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "client.shutdown()\n",
    "client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nbdev import nbdev_export\n",
    "nbdev_export()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rapids-22.08_ploomber",
   "language": "python",
   "name": "rapids-22.08_ploomber"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "papermill": {
   "environment_variables": {},
   "parameters": {
    "product": {
     "InsectWingbeat_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/test",
     "InsectWingbeat_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/train",
     "InsectWingbeat_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/valid",
     "nb": "/home/ubuntu/vitmtsc_nbdev/output/302_feature_preprocessing.insect_wingbeat.tabular_to_timeseries.html"
    },
    "upstream": {
     "feature_preprocessing_insect_wingbeat": {
      "InsectWingbeat_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/test",
      "InsectWingbeat_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/train",
      "InsectWingbeat_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/valid",
      "InsectWingbeat_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/nvtabular_workflow",
      "nb": "/home/ubuntu/vitmtsc_nbdev/output/202_feature_preprocessing.insect_wingbeat.target_encoding.html"
     }
    }
   },
   "version": null
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
