{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| default_exp feature_preprocessing.insect_wingbeat.target_encoding\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# declare a list of tasks whose products you want to use as inputs\n",
    "upstream = ['parquet_conversion_insect_wingbeat']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| hide\n",
    "from nbdev.showdoc import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "from vitmtsc import *\n",
    "from vitmtsc.core import *\n",
    "from vitmtsc.data.insect_wingbeat import *\n",
    "import os\n",
    "import nvtabular as nvt\n",
    "import dask_cudf\n",
    "from nvtabular import ops"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "upstream = {\n",
    "    \"parquet_conversion_insect_wingbeat\": {\n",
    "        \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/102_data.insect_wingbeat.html\",\n",
    "        \"InsectWingbeat_TRAIN_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/train\",\n",
    "        \"InsectWingbeat_VALID_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/valid\",\n",
    "        \"InsectWingbeat_TEST_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/test\",\n",
    "    }\n",
    "}\n",
    "product = {\n",
    "    \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/202_feature_preprocessing.insect_wingbeat.target_encoding.html\",\n",
    "    \"InsectWingbeat_TRAIN_TE\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/train\",\n",
    "    \"InsectWingbeat_VALID_TE\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/valid\",\n",
    "    \"InsectWingbeat_TEST_TE\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/test\",\n",
    "    \"InsectWingbeat_workflow_dir\": \"/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/nvtabular_workflow\",\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!conda list|grep -i nvtabular"
   ]
  },
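  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Preprocessing via NVTabular\n",
    "\n",
    "> Fill missing continuous features\n",
    "\n",
    "> Normalize continuous features\n",
    "\n",
    "> Categorify categorical features (not applied in this notebook: no column is passed to a Categorify op)\n",
    "\n",
    "> Target Encoding of Categorical Variables (disabled in this notebook: `CATEGORICAL_COLUMNS_NEED_ENCODING` is empty)"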
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.distributed import Client\n",
    "from dask_cuda import LocalCUDACluster\n",
    "\n",
    "cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.5, rmm_pool_size='20GB', rmm_managed_memory=True)\n",
    "client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__COLUMNS: CATEGORICAL, CONTINUOUS and TARGET__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "import numpy as np\n",
    "CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING = ['case_id', 'case_id_seq', 'reading_id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "CATEGORICAL_COLUMNS_NEED_ENCODING = [\n",
    "\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "CONTINUOUS_COLUMNS = [\n",
    "'dim_0',\n",
    "'dim_1',\n",
    "'dim_2',\n",
    "'dim_3',\n",
    "'dim_4',\n",
    "'dim_5',\n",
    "'dim_6',\n",
    "'dim_7',\n",
    "'dim_8',\n",
    "'dim_9',\n",
    "'dim_10',\n",
    "'dim_11',\n",
    "'dim_12',\n",
    "'dim_13',\n",
    "'dim_14',\n",
    "'dim_15',\n",
    "'dim_16',\n",
    "'dim_17',\n",
    "'dim_18',\n",
    "'dim_19',\n",
    "'dim_20',\n",
    "'dim_21',\n",
    "'dim_22',\n",
    "'dim_23',\n",
    "'dim_24',\n",
    "'dim_25',\n",
    "'dim_26',\n",
    "'dim_27',\n",
    "'dim_28',\n",
    "'dim_29',\n",
    "'dim_30',\n",
    "'dim_31',\n",
    "'dim_32',\n",
    "'dim_33',\n",
    "'dim_34',\n",
    "'dim_35',\n",
    "'dim_36',\n",
    "'dim_37',\n",
    "'dim_38',\n",
    "'dim_39',\n",
    "'dim_40',\n",
    "'dim_41',\n",
    "'dim_42',\n",
    "'dim_43',\n",
    "'dim_44',\n",
    "'dim_45',\n",
    "'dim_46',\n",
    "'dim_47',\n",
    "'dim_48',\n",
    "'dim_49',\n",
    "'dim_50',\n",
    "'dim_51',\n",
    "'dim_52',\n",
    "'dim_53',\n",
    "'dim_54',\n",
    "'dim_55',\n",
    "'dim_56',\n",
    "'dim_57',\n",
    "'dim_58',\n",
    "'dim_59',\n",
    "'dim_60',\n",
    "'dim_61',\n",
    "'dim_62',\n",
    "'dim_63',\n",
    "'dim_64',\n",
    "'dim_65',\n",
    "'dim_66',\n",
    "'dim_67',\n",
    "'dim_68',\n",
    "'dim_69',\n",
    "'dim_70',\n",
    "'dim_71',\n",
    "'dim_72',\n",
    "'dim_73',\n",
    "'dim_74',\n",
    "'dim_75',\n",
    "'dim_76',\n",
    "'dim_77',\n",
    "'dim_78',\n",
    "'dim_79',\n",
    "'dim_80',\n",
    "'dim_81',\n",
    "'dim_82',\n",
    "'dim_83',\n",
    "'dim_84',\n",
    "'dim_85',\n",
    "'dim_86',\n",
    "'dim_87',\n",
    "'dim_88',\n",
    "'dim_89',\n",
    "'dim_90',\n",
    "'dim_91',\n",
    "'dim_92',\n",
    "'dim_93',\n",
    "'dim_94',\n",
    "'dim_95',\n",
    "'dim_96',\n",
    "'dim_97',\n",
    "'dim_98',\n",
    "'dim_99',\n",
    "'dim_100',\n",
    "'dim_101',\n",
    "'dim_102',\n",
    "'dim_103',\n",
    "'dim_104',\n",
    "'dim_105',\n",
    "'dim_106',\n",
    "'dim_107',\n",
    "'dim_108',\n",
    "'dim_109',\n",
    "'dim_110',\n",
    "'dim_111',\n",
    "'dim_112',\n",
    "'dim_113',\n",
    "'dim_114',\n",
    "'dim_115',\n",
    "'dim_116',\n",
    "'dim_117',\n",
    "'dim_118',\n",
    "'dim_119',\n",
    "'dim_120',\n",
    "'dim_121',\n",
    "'dim_122',\n",
    "'dim_123',\n",
    "'dim_124',\n",
    "'dim_125',\n",
    "'dim_126',\n",
    "'dim_127',\n",
    "'dim_128',\n",
    "'dim_129',\n",
    "'dim_130',\n",
    "'dim_131',\n",
    "'dim_132',\n",
    "'dim_133',\n",
    "'dim_134',\n",
    "'dim_135',\n",
    "'dim_136',\n",
    "'dim_137',\n",
    "'dim_138',\n",
    "'dim_139',\n",
    "'dim_140',\n",
    "'dim_141',\n",
    "'dim_142',\n",
    "'dim_143',\n",
    "'dim_144',\n",
    "'dim_145',\n",
    "'dim_146',\n",
    "'dim_147',\n",
    "'dim_148',\n",
    "'dim_149',\n",
    "'dim_150',\n",
    "'dim_151',\n",
    "'dim_152',\n",
    "'dim_153',\n",
    "'dim_154',\n",
    "'dim_155',\n",
    "'dim_156',\n",
    "'dim_157',\n",
    "'dim_158',\n",
    "'dim_159',\n",
    "'dim_160',\n",
    "'dim_161',\n",
    "'dim_162',\n",
    "'dim_163',\n",
    "'dim_164',\n",
    "'dim_165',\n",
    "'dim_166',\n",
    "'dim_167',\n",
    "'dim_168',\n",
    "'dim_169',\n",
    "'dim_170',\n",
    "'dim_171',\n",
    "'dim_172',\n",
    "'dim_173',\n",
    "'dim_174',\n",
    "'dim_175',\n",
    "'dim_176',\n",
    "'dim_177',\n",
    "'dim_178',\n",
    "'dim_179',\n",
    "'dim_180',\n",
    "'dim_181',\n",
    "'dim_182',\n",
    "'dim_183',\n",
    "'dim_184',\n",
    "'dim_185',\n",
    "'dim_186',\n",
    "'dim_187',\n",
    "'dim_188',\n",
    "'dim_189',\n",
    "'dim_190',\n",
    "'dim_191',\n",
    "'dim_192',\n",
    "'dim_193',\n",
    "'dim_194',\n",
    "'dim_195',\n",
    "'dim_196',\n",
    "'dim_197',\n",
    "'dim_198',\n",
    "'dim_199'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "LABEL_COLUMNS = ['class_vals']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Workflow and Operations__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cudf\n",
    "import numpy as np\n",
    "cat_features_no_encoding = nvt.ColumnGroup(CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING)\n",
    "#te_features = CATEGORICAL_COLUMNS_NEED_ENCODING >> ops.TargetEncoding(LABEL_COLUMNS, kfold=5, fold_seed=42, p_smooth=20)\n",
    "cont_features = CONTINUOUS_COLUMNS >> ops.FillMissing() >> ops.Normalize()\n",
    "label_name = LABEL_COLUMNS\n",
    "\n",
    "workflow = nvt.Workflow(\n",
    "    #cat_features_no_encoding + te_features + cont_features + label_name\n",
    "    #cat_features_no_encoding + te_features + label_name\n",
    "    cat_features_no_encoding  + cont_features + label_name\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Datasets__\n",
    "\n",
    "> Input data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pre_processed_train_dir = os.path.join(\"./\", upstream['parquet_conversion_insect_wingbeat']['InsectWingbeat_TRAIN_RAW'])\n",
    "pre_processed_valid_dir = os.path.join(\"./\", upstream['parquet_conversion_insect_wingbeat']['InsectWingbeat_VALID_RAW'])\n",
    "pre_processed_test_dir = os.path.join(\"./\", upstream['parquet_conversion_insect_wingbeat']['InsectWingbeat_TEST_RAW'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> Training, Validation and Test datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = nvt.Dataset(pre_processed_train_dir, engine='parquet')\n",
    "valid_dataset = nvt.Dataset(pre_processed_valid_dir, engine='parquet')\n",
    "test_dataset = nvt.Dataset(pre_processed_test_dir, engine='parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> Output location"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_train_dir = os.path.join(\"./\", product['InsectWingbeat_TRAIN_TE'])\n",
    "output_valid_dir = os.path.join(\"./\", product['InsectWingbeat_VALID_TE'])\n",
    "output_test_dir = os.path.join(\"./\", product['InsectWingbeat_TEST_TE'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir -p $output_train_dir\n",
    "!mkdir -p $output_valid_dir\n",
    "!mkdir -p $output_test_dir"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> Path to save the workflow to: `product['InsectWingbeat_workflow_dir']` (used by the save and load cells below)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Fit: Train Dataset </center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "workflow.fit(train_dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Save workflow__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "workflow.save(product['InsectWingbeat_workflow_dir'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Clear workflow__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "workflow = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Load workflow__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "workflow = nvt.Workflow.load(product['InsectWingbeat_workflow_dir'], client=client)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Transform: Train Dataset </center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Write the transformed data to a new Parquet dataset, shuffled within each partition\n",
    "workflow.transform(train_dataset).to_parquet(\n",
    "    output_train_dir,\n",
    "    out_files_per_proc=2,\n",
    "    shuffle=nvt.io.Shuffle.PER_PARTITION,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Transform: Valid Dataset</center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Write the transformed data to a new Parquet dataset, shuffled within each partition\n",
    "workflow.transform(valid_dataset).to_parquet(\n",
    "    output_valid_dir,\n",
    "    out_files_per_proc=2,\n",
    "    shuffle=nvt.io.Shuffle.PER_PARTITION,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Transform: Test Dataset</center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Write the transformed data to a new Parquet dataset, shuffled within each partition\n",
    "workflow.transform(test_dataset).to_parquet(\n",
    "    output_test_dir,\n",
    "    out_files_per_proc=2,\n",
    "    shuffle=nvt.io.Shuffle.PER_PARTITION,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Verify Data__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf = dask_cudf.read_parquet(output_train_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "train_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "train_gdf['case_id'].nunique().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf = dask_cudf.read_parquet(output_valid_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "valid_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "valid_gdf['case_id'].nunique().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf = dask_cudf.read_parquet(output_test_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "test_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "test_gdf['case_id'].nunique().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -lrt --block-size=M $output_train_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -lrt --block-size=M $output_valid_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -lrt --block-size=M $output_test_dir"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Shut down the Dask client and its cluster__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "client.shutdown()\n",
    "client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nbdev import nbdev_export\n",
    "nbdev_export()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rapids-22.08_ploomber",
   "language": "python",
   "name": "rapids-22.08_ploomber"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "papermill": {
   "environment_variables": {},
   "parameters": {
    "product": {
     "InsectWingbeat_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/test",
     "InsectWingbeat_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/train",
     "InsectWingbeat_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/valid",
     "InsectWingbeat_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/nvtabular_workflow",
     "nb": "/home/ubuntu/vitmtsc_nbdev/output/202_feature_preprocessing.insect_wingbeat.target_encoding.html"
    },
    "upstream": {
     "parquet_conversion_insect_wingbeat": {
      "InsectWingbeat_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/test",
      "InsectWingbeat_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/train",
      "InsectWingbeat_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/valid",
      "nb": "/home/ubuntu/vitmtsc_nbdev/output/102_data.insect_wingbeat.html"
     }
    }
   },
   "version": null
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
