{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| default_exp feature_preprocessing.face_detection.target_encoding\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# declare a list of tasks whose products you want to use as inputs\n",
    "upstream = ['parquet_conversion_face_detection']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| hide\n",
    "from nbdev.showdoc import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "from vitmtsc import *\n",
    "from vitmtsc.core import *\n",
    "from vitmtsc.data.face_detection import *\n",
    "import os\n",
    "import nvtabular as nvt\n",
    "import dask_cudf\n",
    "from nvtabular import ops"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "# Root directory of the pipeline artifacts. The default reproduces the original\n",
    "# hard-coded location, so behavior is unchanged unless VITMTSC_OUTPUT_DIR is set\n",
    "# (e.g. to run on a machine that does not use /home/ubuntu).\n",
    "_OUTPUT_ROOT = os.environ.get(\"VITMTSC_OUTPUT_DIR\") or \"/home/ubuntu/vitmtsc_nbdev/output\"\n",
    "_RAW_DIR = f\"{_OUTPUT_ROOT}/FaceDetection/raw\"\n",
    "_TE_DIR = f\"{_OUTPUT_ROOT}/FaceDetection/target_encoding\"\n",
    "\n",
    "# Artifact paths of the upstream task, keyed by task name. This rebinds the\n",
    "# list placeholder declared in the `parameters` cell to a concrete mapping.\n",
    "upstream = {\n",
    "    \"parquet_conversion_face_detection\": {\n",
    "        \"nb\": f\"{_OUTPUT_ROOT}/101_data.face_detection.html\",\n",
    "        \"FaceDetection_TRAIN_RAW\": f\"{_RAW_DIR}/train\",\n",
    "        \"FaceDetection_VALID_RAW\": f\"{_RAW_DIR}/valid\",\n",
    "        \"FaceDetection_TEST_RAW\": f\"{_RAW_DIR}/test\",\n",
    "    }\n",
    "}\n",
    "\n",
    "# Artifact paths for this step, all under the target_encoding directory.\n",
    "# NOTE(review): presumably written by later cells of this notebook -- confirm.\n",
    "product = {\n",
    "    \"nb\": f\"{_OUTPUT_ROOT}/201_feature_preprocessing.face_detection.target_encoding.html\",\n",
    "    \"FaceDetection_TRAIN_TE\": f\"{_TE_DIR}/train\",\n",
    "    \"FaceDetection_VALID_TE\": f\"{_TE_DIR}/valid\",\n",
    "    \"FaceDetection_TEST_TE\": f\"{_TE_DIR}/test\",\n",
    "    \"FaceDetection_workflow_dir\": f\"{_TE_DIR}/nvtabular_workflow\",\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nvtabular                 1.3.3                    py38_0    nvidia\n"
     ]
    }
   ],
   "source": [
    "!conda list|grep -i nvtabular"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Preprocessing via NVTabular\n",
    "\n",
    "> Fill missing continuous features\n",
    "\n",
    "> Normalize continuous features\n",
    "\n",
    "> Categorify categorical features\n",
    "\n",
    "> Target Encoding of Categorical Variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-09-20 05:40:20,783 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize\n",
      "2022-09-20 05:40:20,783 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n",
      "2022-09-20 05:40:20,829 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize\n",
      "2022-09-20 05:40:20,829 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n",
      "2022-09-20 05:40:20,875 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize\n",
      "2022-09-20 05:40:20,875 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n",
      "2022-09-20 05:40:20,877 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize\n",
      "2022-09-20 05:40:20,877 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n",
      "2022-09-20 05:40:20,930 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize\n",
      "2022-09-20 05:40:20,931 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n",
      "2022-09-20 05:40:20,957 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize\n",
      "2022-09-20 05:40:20,957 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n",
      "2022-09-20 05:40:20,968 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize\n",
      "2022-09-20 05:40:20,968 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n",
      "2022-09-20 05:40:21,057 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize\n",
      "2022-09-20 05:40:21,057 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "    <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n",
       "    <div style=\"margin-left: 48px;\">\n",
       "        <h3 style=\"margin-bottom: 0px;\">Client</h3>\n",
       "        <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-b855524c-38a6-11ed-8223-0a0cbbade515</p>\n",
       "        <table style=\"width: 100%; text-align: left;\">\n",
       "\n",
       "        <tr>\n",
       "        \n",
       "            <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n",
       "            <td style=\"text-align: left;\"><strong>Cluster type:</strong> dask_cuda.LocalCUDACluster</td>\n",
       "        \n",
       "        </tr>\n",
       "\n",
       "        \n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\"></td>\n",
       "            </tr>\n",
       "        \n",
       "\n",
       "        </table>\n",
       "\n",
       "        \n",
       "            <details>\n",
       "            <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n",
       "            <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n",
       "    <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n",
       "    </div>\n",
       "    <div style=\"margin-left: 48px;\">\n",
       "        <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCUDACluster</h3>\n",
       "        <p style=\"color: #9D9D9D; margin-bottom: 0px;\">45e564db</p>\n",
       "        <table style=\"width: 100%; text-align: left;\">\n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Workers:</strong> 8\n",
       "                </td>\n",
       "            </tr>\n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Total threads:</strong> 8\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Total memory:</strong> 747.76 GiB\n",
       "                </td>\n",
       "            </tr>\n",
       "            \n",
       "            <tr>\n",
       "    <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n",
       "    <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n",
       "</tr>\n",
       "\n",
       "            \n",
       "        </table>\n",
       "\n",
       "        <details>\n",
       "            <summary style=\"margin-bottom: 20px;\">\n",
       "                <h3 style=\"display: inline;\">Scheduler Info</h3>\n",
       "            </summary>\n",
       "\n",
       "            <div style=\"\">\n",
       "    <div>\n",
       "        <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n",
       "        <div style=\"margin-left: 48px;\">\n",
       "            <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n",
       "            <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-d57da674-23a0-4aa7-9371-f3b2909d715f</p>\n",
       "            <table style=\"width: 100%; text-align: left;\">\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Comm:</strong> tcp://127.0.0.1:45897\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Workers:</strong> 8\n",
       "                    </td>\n",
       "                </tr>\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Total threads:</strong> 8\n",
       "                    </td>\n",
       "                </tr>\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Started:</strong> Just now\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Total memory:</strong> 747.76 GiB\n",
       "                    </td>\n",
       "                </tr>\n",
       "            </table>\n",
       "        </div>\n",
       "    </div>\n",
       "\n",
       "    <details style=\"margin-left: 48px;\">\n",
       "        <summary style=\"margin-bottom: 20px;\">\n",
       "            <h3 style=\"display: inline;\">Workers</h3>\n",
       "        </summary>\n",
       "\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:38033\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:44597/status\" target=\"_blank\">http://127.0.0.1:44597/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 93.47 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:37807\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-worker-space/worker-ejx70lhj\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>Tesla V100-SXM2-32GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 32.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 1</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:46883\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:39705/status\" target=\"_blank\">http://127.0.0.1:39705/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 93.47 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:40705\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-worker-space/worker-9yrjp7mj\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>Tesla V100-SXM2-32GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 32.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 2</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:33411\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:32943/status\" target=\"_blank\">http://127.0.0.1:32943/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 93.47 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:43801\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-worker-space/worker-u4e853jk\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>Tesla V100-SXM2-32GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 32.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 3</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:39265\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:45007/status\" target=\"_blank\">http://127.0.0.1:45007/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 93.47 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:35425\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-worker-space/worker-2duvfn0a\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>Tesla V100-SXM2-32GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 32.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 4</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:34843\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:33545/status\" target=\"_blank\">http://127.0.0.1:33545/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 93.47 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:46415\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-worker-space/worker-ix_r1pr6\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>Tesla V100-SXM2-32GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 32.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 5</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:44897\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:41859/status\" target=\"_blank\">http://127.0.0.1:41859/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 93.47 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:42357\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-worker-space/worker-b6k0uibr\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>Tesla V100-SXM2-32GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 32.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 6</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:46601\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:39621/status\" target=\"_blank\">http://127.0.0.1:39621/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 93.47 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:39303\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-worker-space/worker-ooc901zr\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>Tesla V100-SXM2-32GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 32.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 7</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:33013\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:46251/status\" target=\"_blank\">http://127.0.0.1:46251/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 93.47 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:37371\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-worker-space/worker-o3qhi8wh\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU: </strong>Tesla V100-SXM2-32GB\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>GPU memory: </strong> 32.00 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "\n",
       "    </details>\n",
       "</div>\n",
       "\n",
       "        </details>\n",
       "    </div>\n",
       "</div>\n",
       "            </details>\n",
       "        \n",
       "\n",
       "    </div>\n",
       "</div>"
      ],
      "text/plain": [
       "<Client: 'tcp://127.0.0.1:45897' processes=8 threads=8, memory=747.76 GiB>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from dask.distributed import Client\n",
    "from dask_cuda import LocalCUDACluster\n",
    "\n",
    "cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.5, rmm_pool_size='20GB', rmm_managed_memory=True)\n",
    "client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__COLUMNS: CATEGORICAL, CONTINUOUS and TARGET__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "import numpy as np\n",
    "CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING = ['case_id', 'case_id_seq', 'reading_id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "CATEGORICAL_COLUMNS_NEED_ENCODING = [\n",
    "\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "CONTINUOUS_COLUMNS = [\n",
    "'dim_0',\n",
    "'dim_1',\n",
    "'dim_2',\n",
    "'dim_3',\n",
    "'dim_4',\n",
    "'dim_5',\n",
    "'dim_6',\n",
    "'dim_7',\n",
    "'dim_8',\n",
    "'dim_9',\n",
    "'dim_10',\n",
    "'dim_11',\n",
    "'dim_12',\n",
    "'dim_13',\n",
    "'dim_14',\n",
    "'dim_15',\n",
    "'dim_16',\n",
    "'dim_17',\n",
    "'dim_18',\n",
    "'dim_19',\n",
    "'dim_20',\n",
    "'dim_21',\n",
    "'dim_22',\n",
    "'dim_23',\n",
    "'dim_24',\n",
    "'dim_25',\n",
    "'dim_26',\n",
    "'dim_27',\n",
    "'dim_28',\n",
    "'dim_29',\n",
    "'dim_30',\n",
    "'dim_31',\n",
    "'dim_32',\n",
    "'dim_33',\n",
    "'dim_34',\n",
    "'dim_35',\n",
    "'dim_36',\n",
    "'dim_37',\n",
    "'dim_38',\n",
    "'dim_39',\n",
    "'dim_40',\n",
    "'dim_41',\n",
    "'dim_42',\n",
    "'dim_43',\n",
    "'dim_44',\n",
    "'dim_45',\n",
    "'dim_46',\n",
    "'dim_47',\n",
    "'dim_48',\n",
    "'dim_49',\n",
    "'dim_50',\n",
    "'dim_51',\n",
    "'dim_52',\n",
    "'dim_53',\n",
    "'dim_54',\n",
    "'dim_55',\n",
    "'dim_56',\n",
    "'dim_57',\n",
    "'dim_58',\n",
    "'dim_59',\n",
    "'dim_60',\n",
    "'dim_61',\n",
    "'dim_62',\n",
    "'dim_63',\n",
    "'dim_64',\n",
    "'dim_65',\n",
    "'dim_66',\n",
    "'dim_67',\n",
    "'dim_68',\n",
    "'dim_69',\n",
    "'dim_70',\n",
    "'dim_71',\n",
    "'dim_72',\n",
    "'dim_73',\n",
    "'dim_74',\n",
    "'dim_75',\n",
    "'dim_76',\n",
    "'dim_77',\n",
    "'dim_78',\n",
    "'dim_79',\n",
    "'dim_80',\n",
    "'dim_81',\n",
    "'dim_82',\n",
    "'dim_83',\n",
    "'dim_84',\n",
    "'dim_85',\n",
    "'dim_86',\n",
    "'dim_87',\n",
    "'dim_88',\n",
    "'dim_89',\n",
    "'dim_90',\n",
    "'dim_91',\n",
    "'dim_92',\n",
    "'dim_93',\n",
    "'dim_94',\n",
    "'dim_95',\n",
    "'dim_96',\n",
    "'dim_97',\n",
    "'dim_98',\n",
    "'dim_99',\n",
    "'dim_100',\n",
    "'dim_101',\n",
    "'dim_102',\n",
    "'dim_103',\n",
    "'dim_104',\n",
    "'dim_105',\n",
    "'dim_106',\n",
    "'dim_107',\n",
    "'dim_108',\n",
    "'dim_109',\n",
    "'dim_110',\n",
    "'dim_111',\n",
    "'dim_112',\n",
    "'dim_113',\n",
    "'dim_114',\n",
    "'dim_115',\n",
    "'dim_116',\n",
    "'dim_117',\n",
    "'dim_118',\n",
    "'dim_119',\n",
    "'dim_120',\n",
    "'dim_121',\n",
    "'dim_122',\n",
    "'dim_123',\n",
    "'dim_124',\n",
    "'dim_125',\n",
    "'dim_126',\n",
    "'dim_127',\n",
    "'dim_128',\n",
    "'dim_129',\n",
    "'dim_130',\n",
    "'dim_131',\n",
    "'dim_132',\n",
    "'dim_133',\n",
    "'dim_134',\n",
    "'dim_135',\n",
    "'dim_136',\n",
    "'dim_137',\n",
    "'dim_138',\n",
    "'dim_139',\n",
    "'dim_140',\n",
    "'dim_141',\n",
    "'dim_142',\n",
    "'dim_143'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "LABEL_COLUMNS = ['class_vals']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Workflow and Operations__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cudf\n",
    "import numpy as np\n",
    "cat_features_no_encoding = nvt.ColumnGroup(CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING)\n",
    "#te_features = CATEGORICAL_COLUMNS_NEED_ENCODING >> ops.TargetEncoding(LABEL_COLUMNS, kfold=5, fold_seed=42, p_smooth=20)\n",
    "cont_features = CONTINUOUS_COLUMNS >> ops.FillMissing() >> ops.Normalize()\n",
    "label_name = LABEL_COLUMNS\n",
    "\n",
    "workflow = nvt.Workflow(\n",
    "    #cat_features_no_encoding + te_features + cont_features + label_name\n",
    "    #cat_features_no_encoding + te_features + label_name\n",
    "    cat_features_no_encoding  + cont_features + label_name\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Datasets__\n",
    "\n",
    "> Input data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "pre_processed_train_dir = os.path.join(\"./\", upstream['parquet_conversion_face_detection']['FaceDetection_TRAIN_RAW'])\n",
    "pre_processed_valid_dir = os.path.join(\"./\", upstream['parquet_conversion_face_detection']['FaceDetection_VALID_RAW'])\n",
    "pre_processed_test_dir = os.path.join(\"./\", upstream['parquet_conversion_face_detection']['FaceDetection_TEST_RAW'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> Training, Validation and Test datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = nvt.Dataset(pre_processed_train_dir, engine='parquet')\n",
    "valid_dataset = nvt.Dataset(pre_processed_valid_dir, engine='parquet')\n",
    "test_dataset = nvt.Dataset(pre_processed_test_dir, engine='parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> Output location"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_train_dir = os.path.join(\"./\", product['FaceDetection_TRAIN_TE'])\n",
    "output_valid_dir = os.path.join(\"./\", product['FaceDetection_VALID_TE'])\n",
    "output_test_dir = os.path.join(\"./\", product['FaceDetection_TEST_TE'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir -p $output_train_dir\n",
    "!mkdir -p $output_valid_dir\n",
    "!mkdir -p $output_test_dir"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> Path to save the workflow to"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Fit: Train Dataset </center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 486 ms, sys: 83 ms, total: 569 ms\n",
      "Wall time: 3.35 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<nvtabular.workflow.workflow.Workflow at 0x7fe95c5370a0>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "workflow.fit(train_dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Save workflow__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.56 ms, sys: 467 µs, total: 5.03 ms\n",
      "Wall time: 4.75 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "workflow.save(product['FaceDetection_workflow_dir'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Clear workflow__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs\n",
      "Wall time: 5.48 µs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "workflow = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Load workflow__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.39 ms, sys: 1.73 ms, total: 3.12 ms\n",
      "Wall time: 2.92 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "workflow = nvt.Workflow.load(product['FaceDetection_workflow_dir'], client=client)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Transform: Train Dataset </center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/merlin/io/dataset.py:862: UserWarning: Only created 8 files did not have enough partitions to create 16 files.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 495 ms, sys: 96.4 ms, total: 591 ms\n",
      "Wall time: 2.37 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Write to new \"shuffled\" and \"processed\" dataset\n",
    "workflow.transform(train_dataset).to_parquet(\n",
    "    output_train_dir,\n",
    "    out_files_per_proc=2,\n",
    "    shuffle=nvt.io.Shuffle.PER_PARTITION,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Transform: Valid Dataset</center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 282 ms, sys: 25.7 ms, total: 308 ms\n",
      "Wall time: 957 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Write to new \"shuffled\" and \"processed\" dataset\n",
    "workflow.transform(valid_dataset).to_parquet(\n",
    "    output_valid_dir,\n",
    "    out_files_per_proc=2,\n",
    "    shuffle=nvt.io.Shuffle.PER_PARTITION,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center> Transform: Test Dataset</center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 285 ms, sys: 26.3 ms, total: 311 ms\n",
      "Wall time: 1.05 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Write to new \"shuffled\" and \"processed\" dataset\n",
    "workflow.transform(test_dataset).to_parquet(\n",
    "    output_test_dir,\n",
    "    out_files_per_proc=2,\n",
    "    shuffle=nvt.io.Shuffle.PER_PARTITION,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Verify Data__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_gdf = dask_cudf.read_parquet(output_train_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 23.1 ms, sys: 6.43 ms, total: 29.6 ms\n",
      "Wall time: 61.5 ms\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dim_0</th>\n",
       "      <th>dim_1</th>\n",
       "      <th>dim_2</th>\n",
       "      <th>dim_3</th>\n",
       "      <th>dim_4</th>\n",
       "      <th>dim_5</th>\n",
       "      <th>dim_6</th>\n",
       "      <th>dim_7</th>\n",
       "      <th>dim_8</th>\n",
       "      <th>dim_9</th>\n",
       "      <th>...</th>\n",
       "      <th>dim_138</th>\n",
       "      <th>dim_139</th>\n",
       "      <th>dim_140</th>\n",
       "      <th>dim_141</th>\n",
       "      <th>dim_142</th>\n",
       "      <th>dim_143</th>\n",
       "      <th>case_id</th>\n",
       "      <th>case_id_seq</th>\n",
       "      <th>reading_id</th>\n",
       "      <th>class_vals</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.719180</td>\n",
       "      <td>0.715782</td>\n",
       "      <td>0.544206</td>\n",
       "      <td>-1.100469</td>\n",
       "      <td>0.194906</td>\n",
       "      <td>1.320790</td>\n",
       "      <td>1.412965</td>\n",
       "      <td>1.460956</td>\n",
       "      <td>1.038020</td>\n",
       "      <td>-0.191164</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.344396</td>\n",
       "      <td>0.226279</td>\n",
       "      <td>1.471979</td>\n",
       "      <td>1.686134</td>\n",
       "      <td>-0.762544</td>\n",
       "      <td>0.906672</td>\n",
       "      <td>3188</td>\n",
       "      <td>3188</td>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.822641</td>\n",
       "      <td>0.397050</td>\n",
       "      <td>-0.285200</td>\n",
       "      <td>1.152047</td>\n",
       "      <td>1.422886</td>\n",
       "      <td>-0.365725</td>\n",
       "      <td>0.271643</td>\n",
       "      <td>1.153121</td>\n",
       "      <td>-0.341057</td>\n",
       "      <td>1.170023</td>\n",
       "      <td>...</td>\n",
       "      <td>0.025819</td>\n",
       "      <td>-1.634586</td>\n",
       "      <td>1.052513</td>\n",
       "      <td>1.670757</td>\n",
       "      <td>-3.522848</td>\n",
       "      <td>1.223280</td>\n",
       "      <td>3303</td>\n",
       "      <td>3303</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.502361</td>\n",
       "      <td>1.735772</td>\n",
       "      <td>-0.280887</td>\n",
       "      <td>-0.908237</td>\n",
       "      <td>2.284322</td>\n",
       "      <td>0.074438</td>\n",
       "      <td>-0.607150</td>\n",
       "      <td>0.704954</td>\n",
       "      <td>-0.192168</td>\n",
       "      <td>0.570008</td>\n",
       "      <td>...</td>\n",
       "      <td>1.576688</td>\n",
       "      <td>-1.457373</td>\n",
       "      <td>1.175290</td>\n",
       "      <td>1.926524</td>\n",
       "      <td>-1.792511</td>\n",
       "      <td>0.950423</td>\n",
       "      <td>3069</td>\n",
       "      <td>3069</td>\n",
       "      <td>29</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.855324</td>\n",
       "      <td>0.351915</td>\n",
       "      <td>0.030595</td>\n",
       "      <td>-1.171957</td>\n",
       "      <td>1.049653</td>\n",
       "      <td>0.062817</td>\n",
       "      <td>0.288323</td>\n",
       "      <td>0.472932</td>\n",
       "      <td>0.029568</td>\n",
       "      <td>-0.016995</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.283749</td>\n",
       "      <td>-0.127383</td>\n",
       "      <td>-0.400956</td>\n",
       "      <td>-0.341851</td>\n",
       "      <td>0.463897</td>\n",
       "      <td>0.391738</td>\n",
       "      <td>3188</td>\n",
       "      <td>3188</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.571652</td>\n",
       "      <td>1.500655</td>\n",
       "      <td>-1.814388</td>\n",
       "      <td>-0.426979</td>\n",
       "      <td>2.057570</td>\n",
       "      <td>-0.350211</td>\n",
       "      <td>-0.884815</td>\n",
       "      <td>2.074110</td>\n",
       "      <td>-0.789449</td>\n",
       "      <td>0.157667</td>\n",
       "      <td>...</td>\n",
       "      <td>0.831596</td>\n",
       "      <td>0.033716</td>\n",
       "      <td>1.523205</td>\n",
       "      <td>1.418159</td>\n",
       "      <td>0.531392</td>\n",
       "      <td>0.482259</td>\n",
       "      <td>3306</td>\n",
       "      <td>3306</td>\n",
       "      <td>26</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 148 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      dim_0     dim_1     dim_2     dim_3     dim_4     dim_5     dim_6  \\\n",
       "0 -0.719180  0.715782  0.544206 -1.100469  0.194906  1.320790  1.412965   \n",
       "1  0.822641  0.397050 -0.285200  1.152047  1.422886 -0.365725  0.271643   \n",
       "2  0.502361  1.735772 -0.280887 -0.908237  2.284322  0.074438 -0.607150   \n",
       "3 -0.855324  0.351915  0.030595 -1.171957  1.049653  0.062817  0.288323   \n",
       "4  0.571652  1.500655 -1.814388 -0.426979  2.057570 -0.350211 -0.884815   \n",
       "\n",
       "      dim_7     dim_8     dim_9  ...   dim_138   dim_139   dim_140   dim_141  \\\n",
       "0  1.460956  1.038020 -0.191164  ... -0.344396  0.226279  1.471979  1.686134   \n",
       "1  1.153121 -0.341057  1.170023  ...  0.025819 -1.634586  1.052513  1.670757   \n",
       "2  0.704954 -0.192168  0.570008  ...  1.576688 -1.457373  1.175290  1.926524   \n",
       "3  0.472932  0.029568 -0.016995  ... -0.283749 -0.127383 -0.400956 -0.341851   \n",
       "4  2.074110 -0.789449  0.157667  ...  0.831596  0.033716  1.523205  1.418159   \n",
       "\n",
       "    dim_142   dim_143  case_id  case_id_seq  reading_id  class_vals  \n",
       "0 -0.762544  0.906672     3188         3188          20           0  \n",
       "1 -3.522848  1.223280     3303         3303          33           1  \n",
       "2 -1.792511  0.950423     3069         3069          29           1  \n",
       "3  0.463897  0.391738     3188         3188          29           0  \n",
       "4  0.531392  0.482259     3306         3306          26           0  \n",
       "\n",
       "[5 rows x 148 columns]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "train_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 137 ms, sys: 11.9 ms, total: 148 ms\n",
      "Wall time: 185 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "4712"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "train_gdf['case_id'].nunique().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_gdf = dask_cudf.read_parquet(output_valid_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 24.4 ms, sys: 4.19 ms, total: 28.5 ms\n",
      "Wall time: 55.2 ms\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dim_0</th>\n",
       "      <th>dim_1</th>\n",
       "      <th>dim_2</th>\n",
       "      <th>dim_3</th>\n",
       "      <th>dim_4</th>\n",
       "      <th>dim_5</th>\n",
       "      <th>dim_6</th>\n",
       "      <th>dim_7</th>\n",
       "      <th>dim_8</th>\n",
       "      <th>dim_9</th>\n",
       "      <th>...</th>\n",
       "      <th>dim_138</th>\n",
       "      <th>dim_139</th>\n",
       "      <th>dim_140</th>\n",
       "      <th>dim_141</th>\n",
       "      <th>dim_142</th>\n",
       "      <th>dim_143</th>\n",
       "      <th>case_id</th>\n",
       "      <th>case_id_seq</th>\n",
       "      <th>reading_id</th>\n",
       "      <th>class_vals</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.082990</td>\n",
       "      <td>1.563105</td>\n",
       "      <td>0.358668</td>\n",
       "      <td>-0.067843</td>\n",
       "      <td>1.818358</td>\n",
       "      <td>0.778785</td>\n",
       "      <td>-1.472940</td>\n",
       "      <td>-0.879955</td>\n",
       "      <td>0.841403</td>\n",
       "      <td>-0.775165</td>\n",
       "      <td>...</td>\n",
       "      <td>0.323710</td>\n",
       "      <td>1.250502</td>\n",
       "      <td>0.331661</td>\n",
       "      <td>0.305275</td>\n",
       "      <td>1.218022</td>\n",
       "      <td>0.128762</td>\n",
       "      <td>2996</td>\n",
       "      <td>2996</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-1.430117</td>\n",
       "      <td>0.023215</td>\n",
       "      <td>1.390516</td>\n",
       "      <td>-0.583876</td>\n",
       "      <td>-0.937619</td>\n",
       "      <td>1.192019</td>\n",
       "      <td>-2.079107</td>\n",
       "      <td>-1.520146</td>\n",
       "      <td>0.164477</td>\n",
       "      <td>-1.735753</td>\n",
       "      <td>...</td>\n",
       "      <td>0.923150</td>\n",
       "      <td>1.970994</td>\n",
       "      <td>1.750561</td>\n",
       "      <td>-0.000114</td>\n",
       "      <td>1.192631</td>\n",
       "      <td>0.782389</td>\n",
       "      <td>3213</td>\n",
       "      <td>3213</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.025047</td>\n",
       "      <td>-0.084901</td>\n",
       "      <td>-0.756202</td>\n",
       "      <td>1.633533</td>\n",
       "      <td>0.083482</td>\n",
       "      <td>-0.273219</td>\n",
       "      <td>1.707596</td>\n",
       "      <td>1.620050</td>\n",
       "      <td>0.881017</td>\n",
       "      <td>1.488393</td>\n",
       "      <td>...</td>\n",
       "      <td>0.797554</td>\n",
       "      <td>0.692285</td>\n",
       "      <td>1.162877</td>\n",
       "      <td>1.340374</td>\n",
       "      <td>1.011340</td>\n",
       "      <td>0.471440</td>\n",
       "      <td>3035</td>\n",
       "      <td>3035</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.089457</td>\n",
       "      <td>-0.446724</td>\n",
       "      <td>-1.183167</td>\n",
       "      <td>0.596871</td>\n",
       "      <td>-1.157572</td>\n",
       "      <td>-1.455812</td>\n",
       "      <td>-0.785047</td>\n",
       "      <td>0.444306</td>\n",
       "      <td>-1.316708</td>\n",
       "      <td>0.854992</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.153552</td>\n",
       "      <td>-1.532587</td>\n",
       "      <td>1.408728</td>\n",
       "      <td>-0.408405</td>\n",
       "      <td>-0.768310</td>\n",
       "      <td>0.549559</td>\n",
       "      <td>2905</td>\n",
       "      <td>2905</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.460682</td>\n",
       "      <td>0.935466</td>\n",
       "      <td>-0.035468</td>\n",
       "      <td>-0.414578</td>\n",
       "      <td>0.969504</td>\n",
       "      <td>0.612933</td>\n",
       "      <td>0.115769</td>\n",
       "      <td>1.016626</td>\n",
       "      <td>0.756099</td>\n",
       "      <td>-0.216604</td>\n",
       "      <td>...</td>\n",
       "      <td>0.740143</td>\n",
       "      <td>1.137553</td>\n",
       "      <td>0.049458</td>\n",
       "      <td>-0.254544</td>\n",
       "      <td>2.960490</td>\n",
       "      <td>0.561495</td>\n",
       "      <td>3102</td>\n",
       "      <td>3102</td>\n",
       "      <td>45</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 148 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      dim_0     dim_1     dim_2     dim_3     dim_4     dim_5     dim_6  \\\n",
       "0  1.082990  1.563105  0.358668 -0.067843  1.818358  0.778785 -1.472940   \n",
       "1 -1.430117  0.023215  1.390516 -0.583876 -0.937619  1.192019 -2.079107   \n",
       "2  1.025047 -0.084901 -0.756202  1.633533  0.083482 -0.273219  1.707596   \n",
       "3 -0.089457 -0.446724 -1.183167  0.596871 -1.157572 -1.455812 -0.785047   \n",
       "4  0.460682  0.935466 -0.035468 -0.414578  0.969504  0.612933  0.115769   \n",
       "\n",
       "      dim_7     dim_8     dim_9  ...   dim_138   dim_139   dim_140   dim_141  \\\n",
       "0 -0.879955  0.841403 -0.775165  ...  0.323710  1.250502  0.331661  0.305275   \n",
       "1 -1.520146  0.164477 -1.735753  ...  0.923150  1.970994  1.750561 -0.000114   \n",
       "2  1.620050  0.881017  1.488393  ...  0.797554  0.692285  1.162877  1.340374   \n",
       "3  0.444306 -1.316708  0.854992  ... -0.153552 -1.532587  1.408728 -0.408405   \n",
       "4  1.016626  0.756099 -0.216604  ...  0.740143  1.137553  0.049458 -0.254544   \n",
       "\n",
       "    dim_142   dim_143  case_id  case_id_seq  reading_id  class_vals  \n",
       "0  1.218022  0.128762     2996         2996          35           0  \n",
       "1  1.192631  0.782389     3213         3213           7           1  \n",
       "2  1.011340  0.471440     3035         3035          12           1  \n",
       "3 -0.768310  0.549559     2905         2905          31           0  \n",
       "4  2.960490  0.561495     3102         3102          45           1  \n",
       "\n",
       "[5 rows x 148 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "valid_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 33.1 ms, sys: 4.75 ms, total: 37.9 ms\n",
      "Wall time: 72.6 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1178"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "valid_gdf['case_id'].nunique().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_gdf = dask_cudf.read_parquet(output_test_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 27.4 ms, sys: 0 ns, total: 27.4 ms\n",
      "Wall time: 62.4 ms\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dim_0</th>\n",
       "      <th>dim_1</th>\n",
       "      <th>dim_2</th>\n",
       "      <th>dim_3</th>\n",
       "      <th>dim_4</th>\n",
       "      <th>dim_5</th>\n",
       "      <th>dim_6</th>\n",
       "      <th>dim_7</th>\n",
       "      <th>dim_8</th>\n",
       "      <th>dim_9</th>\n",
       "      <th>...</th>\n",
       "      <th>dim_138</th>\n",
       "      <th>dim_139</th>\n",
       "      <th>dim_140</th>\n",
       "      <th>dim_141</th>\n",
       "      <th>dim_142</th>\n",
       "      <th>dim_143</th>\n",
       "      <th>case_id</th>\n",
       "      <th>case_id_seq</th>\n",
       "      <th>reading_id</th>\n",
       "      <th>class_vals</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.664366</td>\n",
       "      <td>-0.927378</td>\n",
       "      <td>-0.582350</td>\n",
       "      <td>1.295654</td>\n",
       "      <td>-0.674468</td>\n",
       "      <td>-0.290503</td>\n",
       "      <td>-0.393850</td>\n",
       "      <td>0.239619</td>\n",
       "      <td>-0.023575</td>\n",
       "      <td>0.487661</td>\n",
       "      <td>...</td>\n",
       "      <td>0.663048</td>\n",
       "      <td>-0.014410</td>\n",
       "      <td>0.294813</td>\n",
       "      <td>0.800389</td>\n",
       "      <td>2.632731</td>\n",
       "      <td>0.504352</td>\n",
       "      <td>1894</td>\n",
       "      <td>1894</td>\n",
       "      <td>35</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.453768</td>\n",
       "      <td>1.003418</td>\n",
       "      <td>-1.001645</td>\n",
       "      <td>-0.484745</td>\n",
       "      <td>-0.617771</td>\n",
       "      <td>-0.522245</td>\n",
       "      <td>-0.056373</td>\n",
       "      <td>1.391316</td>\n",
       "      <td>-0.827856</td>\n",
       "      <td>-0.825778</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.050590</td>\n",
       "      <td>-0.635921</td>\n",
       "      <td>0.769618</td>\n",
       "      <td>0.118171</td>\n",
       "      <td>1.243587</td>\n",
       "      <td>-0.095690</td>\n",
       "      <td>1898</td>\n",
       "      <td>1898</td>\n",
       "      <td>54</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.026201</td>\n",
       "      <td>0.816085</td>\n",
       "      <td>-0.255020</td>\n",
       "      <td>0.587850</td>\n",
       "      <td>0.999438</td>\n",
       "      <td>0.177024</td>\n",
       "      <td>0.090801</td>\n",
       "      <td>1.298250</td>\n",
       "      <td>0.415890</td>\n",
       "      <td>-1.526477</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.550846</td>\n",
       "      <td>-0.071067</td>\n",
       "      <td>0.189794</td>\n",
       "      <td>-0.609160</td>\n",
       "      <td>-0.260229</td>\n",
       "      <td>1.501082</td>\n",
       "      <td>1819</td>\n",
       "      <td>1819</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.182159</td>\n",
       "      <td>-1.357800</td>\n",
       "      <td>0.717773</td>\n",
       "      <td>1.528579</td>\n",
       "      <td>-1.313376</td>\n",
       "      <td>0.582342</td>\n",
       "      <td>-0.545035</td>\n",
       "      <td>-0.608966</td>\n",
       "      <td>0.199264</td>\n",
       "      <td>0.024431</td>\n",
       "      <td>...</td>\n",
       "      <td>0.790421</td>\n",
       "      <td>1.201031</td>\n",
       "      <td>-0.869649</td>\n",
       "      <td>0.074254</td>\n",
       "      <td>1.416155</td>\n",
       "      <td>-1.469616</td>\n",
       "      <td>1946</td>\n",
       "      <td>1946</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-1.762208</td>\n",
       "      <td>-0.061749</td>\n",
       "      <td>-0.209632</td>\n",
       "      <td>-0.175421</td>\n",
       "      <td>0.063330</td>\n",
       "      <td>-0.012635</td>\n",
       "      <td>0.233588</td>\n",
       "      <td>0.551948</td>\n",
       "      <td>0.056605</td>\n",
       "      <td>-0.318447</td>\n",
       "      <td>...</td>\n",
       "      <td>1.496392</td>\n",
       "      <td>0.340592</td>\n",
       "      <td>0.582934</td>\n",
       "      <td>0.399857</td>\n",
       "      <td>-1.042838</td>\n",
       "      <td>-0.810789</td>\n",
       "      <td>1820</td>\n",
       "      <td>1820</td>\n",
       "      <td>24</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 148 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      dim_0     dim_1     dim_2     dim_3     dim_4     dim_5     dim_6  \\\n",
       "0  0.664366 -0.927378 -0.582350  1.295654 -0.674468 -0.290503 -0.393850   \n",
       "1  0.453768  1.003418 -1.001645 -0.484745 -0.617771 -0.522245 -0.056373   \n",
       "2  1.026201  0.816085 -0.255020  0.587850  0.999438  0.177024  0.090801   \n",
       "3  1.182159 -1.357800  0.717773  1.528579 -1.313376  0.582342 -0.545035   \n",
       "4 -1.762208 -0.061749 -0.209632 -0.175421  0.063330 -0.012635  0.233588   \n",
       "\n",
       "      dim_7     dim_8     dim_9  ...   dim_138   dim_139   dim_140   dim_141  \\\n",
       "0  0.239619 -0.023575  0.487661  ...  0.663048 -0.014410  0.294813  0.800389   \n",
       "1  1.391316 -0.827856 -0.825778  ... -0.050590 -0.635921  0.769618  0.118171   \n",
       "2  1.298250  0.415890 -1.526477  ... -0.550846 -0.071067  0.189794 -0.609160   \n",
       "3 -0.608966  0.199264  0.024431  ...  0.790421  1.201031 -0.869649  0.074254   \n",
       "4  0.551948  0.056605 -0.318447  ...  1.496392  0.340592  0.582934  0.399857   \n",
       "\n",
       "    dim_142   dim_143  case_id  case_id_seq  reading_id  class_vals  \n",
       "0  2.632731  0.504352     1894         1894          35           1  \n",
       "1  1.243587 -0.095690     1898         1898          54           0  \n",
       "2 -0.260229  1.501082     1819         1819          35           0  \n",
       "3  1.416155 -1.469616     1946         1946          11           1  \n",
       "4 -1.042838 -0.810789     1820         1820          24           1  \n",
       "\n",
       "[5 rows x 148 columns]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "test_gdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 33.7 ms, sys: 5.65 ms, total: 39.3 ms\n",
      "Wall time: 81.7 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "3524"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "test_gdf['case_id'].nunique().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['dim_0', 'dim_1', 'dim_2', 'dim_3', 'dim_4', 'dim_5', 'dim_6', 'dim_7',\n",
       "       'dim_8', 'dim_9',\n",
       "       ...\n",
       "       'dim_138', 'dim_139', 'dim_140', 'dim_141', 'dim_142', 'dim_143',\n",
       "       'case_id', 'case_id_seq', 'reading_id', 'class_vals'],\n",
       "      dtype='object', length=148)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_gdf.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 324M\n",
      "-rw-rw-r-- 1 ubuntu ubuntu  1M Sep 20 05:40 schema.pbtxt\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_15.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_14.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_13.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_12.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_11.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_10.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_7.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_6.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_5.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_4.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_3.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_2.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_9.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_8.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_1.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 21M Sep 20 05:40 part_0.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu  1M Sep 20 05:40 _metadata.json\n",
      "-rw-rw-r-- 1 ubuntu ubuntu  1M Sep 20 05:40 _file_list.txt\n",
      "-rw-rw-r-- 1 ubuntu ubuntu  1M Sep 20 05:40 _metadata\n"
     ]
    }
   ],
   "source": [
    "!ls -lrt --block-size=M $output_train_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 82M\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 1M Sep 20 05:40 schema.pbtxt\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_9.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_8.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_3.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_2.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_5.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_4.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_15.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_14.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_13.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_12.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_1.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_0.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_7.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_6.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 6M Sep 20 05:40 part_11.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 5M Sep 20 05:40 part_10.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 1M Sep 20 05:40 _metadata.json\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 1M Sep 20 05:40 _file_list.txt\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 1M Sep 20 05:40 _metadata\n"
     ]
    }
   ],
   "source": [
    "!ls -lrt --block-size=M $output_valid_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 242M\n",
      "-rw-rw-r-- 1 ubuntu ubuntu  1M Sep 20 05:40 schema.pbtxt\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_13.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_12.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_7.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_6.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_1.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_0.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_3.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_2.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_15.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_14.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_5.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_4.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 15M Sep 20 05:40 part_11.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_10.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_9.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu 16M Sep 20 05:40 part_8.parquet\n",
      "-rw-rw-r-- 1 ubuntu ubuntu  1M Sep 20 05:40 _metadata.json\n",
      "-rw-rw-r-- 1 ubuntu ubuntu  1M Sep 20 05:40 _file_list.txt\n",
      "-rw-rw-r-- 1 ubuntu ubuntu  1M Sep 20 05:40 _metadata\n"
     ]
    }
   ],
   "source": [
    "!ls -lrt --block-size=M $output_test_dir"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__We shut down the Dask cluster (scheduler and workers) and close the client; the notebook kernel itself is not restarted.__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "Traceback (most recent call last):\n",
      "  File \"/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py\", line 778, in wrapper\n",
      "    return await func(*args, **kwargs)\n",
      "  File \"/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py\", line 1211, in _reconnect\n",
      "    await self._ensure_connected(timeout=timeout)\n",
      "  File \"/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py\", line 1241, in _ensure_connected\n",
      "    comm = await connect(\n",
      "  File \"/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py\", line 315, in connect\n",
      "    await asyncio.sleep(backoff)\n",
      "  File \"/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py\", line 659, in sleep\n",
      "    return await future\n",
      "asyncio.exceptions.CancelledError\n",
      "\n",
      "Traceback (most recent call last):\n",
      "  File \"/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py\", line 778, in wrapper\n",
      "    return await func(*args, **kwargs)\n",
      "  File \"/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py\", line 1400, in _handle_report\n",
      "    await self._reconnect()\n",
      "  File \"/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py\", line 778, in wrapper\n",
      "    return await func(*args, **kwargs)\n",
      "  File \"/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py\", line 1211, in _reconnect\n",
      "    await self._ensure_connected(timeout=timeout)\n",
      "  File \"/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py\", line 1241, in _ensure_connected\n",
      "    comm = await connect(\n",
      "  File \"/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py\", line 315, in connect\n",
      "    await asyncio.sleep(backoff)\n",
      "  File \"/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py\", line 659, in sleep\n",
      "    return await future\n",
      "asyncio.exceptions.CancelledError\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 54.2 ms, sys: 27.5 ms, total: 81.7 ms\n",
      "Wall time: 1.01 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Tear down the Dask scheduler and workers, then close this client's connection.\n",
    "client.shutdown()\n",
    "client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regenerate the Python modules from the notebooks' exported cells.\n",
    "import nbdev\n",
    "nbdev.nbdev_export()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rapids-22.08_ploomber",
   "language": "python",
   "name": "rapids-22.08_ploomber"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "papermill": {
   "environment_variables": {},
   "parameters": {
    "product": {
     "FaceDetection_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding/test",
     "FaceDetection_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding/train",
     "FaceDetection_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding/valid",
     "FaceDetection_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding/nvtabular_workflow",
     "nb": "/home/ubuntu/vitmtsc_nbdev/output/201_feature_preprocessing.face_detection.target_encoding.html"
    },
    "upstream": {
     "parquet_conversion_face_detection": {
      "FaceDetection_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/test",
      "FaceDetection_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/train",
      "FaceDetection_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/valid",
      "nb": "/home/ubuntu/vitmtsc_nbdev/output/101_data.face_detection.html"
     }
    }
   },
   "version": null
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
