{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| default_exp feature_preprocessing.spoken_arabic_digits.target_encoding\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# declare a list of tasks whose products you want to use as inputs\n",
    "upstream = ['parquet_conversion_spoken_arabic_digits']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| hide\n",
    "from nbdev.showdoc import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "from vitmtsc import *\n",
    "from vitmtsc.core import *\n",
    "from vitmtsc.data.spoken_arabic_digits import *\n",
    "import os\n",
    "import nvtabular as nvt\n",
    "import dask_cudf\n",
    "from nvtabular import ops"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "upstream = {\n",
    "    \"parquet_conversion_spoken_arabic_digits\": {\n",
    "        \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/104_data.spoken_arabic_digits.html\",\n",
    "        \"SpokenArabicDigits_TRAIN_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/train\",\n",
    "        \"SpokenArabicDigits_VALID_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/valid\",\n",
    "        \"SpokenArabicDigits_TEST_RAW\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/test\",\n",
    "    }\n",
    "}\n",
    "product = {\n",
    "    \"nb\": \"/home/ubuntu/vitmtsc_nbdev/output/204_feature_preprocessing.spoken_arabic_digits.target_encoding.html\",\n",
    "    \"SpokenArabicDigits_TRAIN_TE\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/train\",\n",
    "    \"SpokenArabicDigits_VALID_TE\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/valid\",\n",
    "    \"SpokenArabicDigits_TEST_TE\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/test\",\n",
    "    \"SpokenArabicDigits_workflow_dir\": \"/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/nvtabular_workflow\",\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!conda list|grep -i nvtabular"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Preprocessing via NVTabular\n",
    "\n",
    "> Fill missing continuous features\n",
    "\n",
    "> Normalize continuous features\n",
    "\n",
    "> Categorify categorical features (not applied in this notebook)\n",
    "\n",
    "> Target Encoding of Categorical Variables (currently disabled in this notebook; `CATEGORICAL_COLUMNS_NEED_ENCODING` is empty)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.distributed import Client\n",
    "from dask_cuda import LocalCUDACluster\n",
    "\n",
    "# Start a local dask-cuda cluster and connect a client to it.\n",
    "# NOTE(review): the memory settings below look tuned for one specific machine -- confirm before reuse.\n",
    "cluster_options = dict(\n",
    "    memory_limit='auto',\n",
    "    device_memory_limit=0.5,\n",
    "    rmm_pool_size='20GB',\n",
    "    rmm_managed_memory=True,\n",
    ")\n",
    "cluster = LocalCUDACluster(**cluster_options)\n",
    "client = Client(cluster)\n",
    "# Last expression of the cell: displays the client summary.\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__COLUMNS: CATEGORICAL, CONTINUOUS and TARGET__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "import numpy as np\n",
    "CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING = ['case_id', 'case_id_seq', 'reading_id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "CATEGORICAL_COLUMNS_NEED_ENCODING = [\n",
    "\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "CONTINUOUS_COLUMNS = [\n",
    "'dim_0',\n",
    "'dim_1',\n",
    "'dim_2',\n",
    "'dim_3',\n",
    "'dim_4',\n",
    "'dim_5',\n",
    "'dim_6',\n",
    "'dim_7',\n",
    "'dim_8',\n",
    "'dim_9',\n",
    "'dim_10',\n",
    "'dim_11',\n",
    "'dim_12'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "LABEL_COLUMNS = ['class_vals']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Workflow and Operations__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cudf\n",
    "import numpy as np\n",
    "\n",
    "# Identifier columns are selected as-is; no op is chained onto them.\n",
    "# NOTE(review): ColumnGroup appears to be a legacy alias of ColumnSelector in newer NVTabular releases -- confirm.\n",
    "cat_features_no_encoding = nvt.ColumnGroup(CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING)\n",
    "\n",
    "# Continuous columns: FillMissing is applied first, then Normalize.\n",
    "cont_features = CONTINUOUS_COLUMNS >> ops.FillMissing() >> ops.Normalize()\n",
    "\n",
    "label_name = LABEL_COLUMNS\n",
    "\n",
    "# Target encoding is currently disabled; the original definition and the graphs that used it are kept for reference:\n",
    "#te_features = CATEGORICAL_COLUMNS_NEED_ENCODING >> ops.TargetEncoding(LABEL_COLUMNS, kfold=5, fold_seed=42, p_smooth=20)\n",
    "#    cat_features_no_encoding + te_features + cont_features + label_name\n",
    "#    cat_features_no_encoding + te_features + label_name\n",
    "workflow_graph = cat_features_no_encoding + cont_features + label_name\n",
    "\n",
    "workflow = nvt.Workflow(workflow_graph)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Datasets__\n",
    "\n",
    "> Input data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw parquet locations produced by the upstream conversion task, one per split.\n",
    "# Note: os.path.join drops the \"./\" prefix whenever the upstream path is absolute.\n",
    "raw_split_dirs = upstream['parquet_conversion_spoken_arabic_digits']\n",
    "\n",
    "pre_processed_train_dir = os.path.join(\"./\", raw_split_dirs['SpokenArabicDigits_TRAIN_RAW'])\n",
    "pre_processed_valid_dir = os.path.join(\"./\", raw_split_dirs['SpokenArabicDigits_VALID_RAW'])\n",
    "pre_processed_test_dir = os.path.join(\"./\", raw_split_dirs['SpokenArabicDigits_TEST_RAW'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> Training, Validation and Test datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap each raw split directory in an NVTabular parquet Dataset (train, valid, test order).\n",
    "train_dataset, valid_dataset, test_dataset = (\n",
    "    nvt.Dataset(split_dir, engine='parquet')\n",
    "    for split_dir in (\n",
    "        pre_processed_train_dir,\n",
    "        pre_processed_valid_dir,\n",
    "        pre_processed_test_dir,\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> Output location"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Destination directories for the transformed splits, taken from this task's products.\n",
    "output_train_dir, output_valid_dir, output_test_dir = (\n",
    "    os.path.join(\"./\", product[product_key])\n",
    "    for product_key in (\n",
    "        'SpokenArabicDigits_TRAIN_TE',\n",
    "        'SpokenArabicDigits_VALID_TE',\n",
    "        'SpokenArabicDigits_TEST_TE',\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output directories (including missing parents); existing ones are left untouched.\n",
    "# Done in Python rather than via `!mkdir -p $var` so paths containing spaces or shell\n",
    "# metacharacters are handled correctly and the cell does not depend on a POSIX shell.\n",
    "for out_dir in (output_train_dir, output_valid_dir, output_test_dir):\n",
    "    os.makedirs(out_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> Path to save the workflow to: `product['SpokenArabicDigits_workflow_dir']`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Fit: Train Dataset </center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "workflow.fit(train_dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Save workflow__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "workflow.save(product['SpokenArabicDigits_workflow_dir'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Clear workflow__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "workflow = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Load workflow__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "workflow = nvt.Workflow.load(product['SpokenArabicDigits_workflow_dir'], client=client)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Transform: Train Dataset </center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Apply the fitted workflow to the training split, then write the result as a new\n",
    "# parquet dataset using NVTabular's PER_PARTITION shuffle.\n",
    "train_transformed = workflow.transform(train_dataset)\n",
    "train_transformed.to_parquet(\n",
    "    output_train_dir,\n",
    "    out_files_per_proc=2,\n",
    "    shuffle=nvt.io.Shuffle.PER_PARTITION,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Transform: Valid Dataset</center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Apply the fitted workflow to the validation split, then write the result as a new\n",
    "# parquet dataset using NVTabular's PER_PARTITION shuffle.\n",
    "valid_transformed = workflow.transform(valid_dataset)\n",
    "valid_transformed.to_parquet(\n",
    "    output_valid_dir,\n",
    "    out_files_per_proc=2,\n",
    "    shuffle=nvt.io.Shuffle.PER_PARTITION,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Transform: Test Dataset</center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Apply the fitted workflow to the test split, then write the result as a new\n",
    "# parquet dataset using NVTabular's PER_PARTITION shuffle.\n",
    "test_transformed = workflow.transform(test_dataset)\n",
    "test_transformed.to_parquet(\n",
    "    output_test_dir,\n",
    "    out_files_per_proc=2,\n",
    "    shuffle=nvt.io.Shuffle.PER_PARTITION,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Verify Data__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf = dask_cudf.read_parquet(output_train_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "train_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "train_gdf['case_id'].nunique().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf = dask_cudf.read_parquet(output_valid_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "valid_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "valid_gdf['case_id'].nunique().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf = dask_cudf.read_parquet(output_test_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "test_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "test_gdf['case_id'].nunique().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -lrt --block-size=M $output_train_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -lrt --block-size=M $output_valid_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -lrt --block-size=M $output_test_dir"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Shut down the Dask cluster and close the client (the kernel can be reset afterwards)__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "client.shutdown()\n",
    "client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nbdev import nbdev_export\n",
    "nbdev_export()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rapids-22.08_ploomber",
   "language": "python",
   "name": "rapids-22.08_ploomber"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "papermill": {
   "environment_variables": {},
   "parameters": {
    "product": {
     "SpokenArabicDigits_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/test",
     "SpokenArabicDigits_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/train",
     "SpokenArabicDigits_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/valid",
     "SpokenArabicDigits_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/nvtabular_workflow",
     "nb": "/home/ubuntu/vitmtsc_nbdev/output/204_feature_preprocessing.spoken_arabic_digits.target_encoding.html"
    },
    "upstream": {
     "parquet_conversion_spoken_arabic_digits": {
      "SpokenArabicDigits_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/test",
      "SpokenArabicDigits_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/train",
      "SpokenArabicDigits_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/valid",
      "nb": "/home/ubuntu/vitmtsc_nbdev/output/104_data.spoken_arabic_digits.html"
     }
    }
   },
   "version": null
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
