In [1]:
#| default_exp feature_preprocessing.spoken_arabic_digits.target_encoding
%load_ext autoreload
%autoreload 2
In [2]:
# declare a list of tasks whose products you want to use as inputs
upstream = ['parquet_conversion_spoken_arabic_digits']
In [3]:
# Parameters
upstream = {"parquet_conversion_spoken_arabic_digits": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/104_data.spoken_arabic_digits.html", "SpokenArabicDigits_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/train", "SpokenArabicDigits_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/valid", "SpokenArabicDigits_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/test"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/204_feature_preprocessing.spoken_arabic_digits.target_encoding.html", "SpokenArabicDigits_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/train", "SpokenArabicDigits_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/valid", "SpokenArabicDigits_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/test", "SpokenArabicDigits_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/nvtabular_workflow"}
In [4]:
#| hide
from nbdev.showdoc import *
In [5]:
#| export
from vitmtsc import *
from vitmtsc.core import *
from vitmtsc.data.spoken_arabic_digits import *
import os
import nvtabular as nvt
import dask_cudf
from nvtabular import ops
In [6]:
#| export
upstream = {
    "parquet_conversion_spoken_arabic_digits": {
        "nb": "/home/ubuntu/vitmtsc_nbdev/output/104_data.spoken_arabic_digits.html",
        "SpokenArabicDigits_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/train",
        "SpokenArabicDigits_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/valid",
        "SpokenArabicDigits_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/test",
    }
}
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/204_feature_preprocessing.spoken_arabic_digits.target_encoding.html",
    "SpokenArabicDigits_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/train",
    "SpokenArabicDigits_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/valid",
    "SpokenArabicDigits_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/test",
    "SpokenArabicDigits_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/nvtabular_workflow",
}
In [7]:
!conda list|grep -i nvtabular

Feature Preprocessing via NVTabular

Fill missing continuous features

Normalize continuous features

Categorify categorical features

Target Encoding of Categorical Variables
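
The operations listed above map onto NVTabular op chains roughly as in the sketch below. This is illustrative only: the Categorify and TargetEncoding chains are shown for completeness (with the same parameters as the commented-out line later in this notebook), while the workflow actually built further down applies just FillMissing and Normalize. The variable names (cont_cols, cat_cols, label_cols, sketch_workflow) are placeholders for this sketch.

# Illustrative sketch (not the executed workflow): the four preprocessing steps
# above expressed as NVTabular op chains.
import nvtabular as nvt
from nvtabular import ops

cont_cols = ['dim_0', 'dim_1']   # continuous features (subset, for illustration)
cat_cols = ['case_id']           # a categorical feature (illustration only)
label_cols = ['class_vals']      # target column

cont_chain = cont_cols >> ops.FillMissing() >> ops.Normalize()   # fill missing, then standardize
cat_chain = cat_cols >> ops.Categorify()                         # map categories to integer ids
te_chain = cat_cols >> ops.TargetEncoding(label_cols, kfold=5, fold_seed=42, p_smooth=20)  # out-of-fold target encoding

sketch_workflow = nvt.Workflow(cont_chain + cat_chain + te_chain + label_cols)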

In [8]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# One Dask-CUDA worker per GPU; spill from GPU to host once device memory use
# exceeds 50%, with a 20 GB RMM pool backed by CUDA managed memory.
cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.5, rmm_pool_size='20GB', rmm_managed_memory=True)
client = Client(cluster)
client
2022-09-23 19:01:39,406 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:01:39,406 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:01:39,441 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:01:39,441 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:01:39,484 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:01:39,484 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:01:39,503 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:01:39,503 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Out[8]:

Client: Client-27f97194-3b72-11ed-80b1-061eae6df733
Connection method: Cluster object | Cluster type: dask_cuda.LocalCUDACluster
Dashboard: http://127.0.0.1:8787/status

Cluster: LocalCUDACluster (b785cfd6)
Dashboard: http://127.0.0.1:8787/status | Workers: 4
Total threads: 4 | Total memory: 150.00 GiB
Status: running | Using processes: True

Scheduler: Scheduler-b22d4260-67de-4389-9940-53e52cf62526
Comm: tcp://127.0.0.1:41651 | Workers: 4
Dashboard: http://127.0.0.1:8787/status | Total threads: 4
Started: Just now | Total memory: 150.00 GiB

Workers:
Worker 0 | Comm: tcp://127.0.0.1:44937 | Nanny: tcp://127.0.0.1:42245 | Dashboard: http://127.0.0.1:44843/status | Threads: 1 | Memory: 37.50 GiB | Local directory: /tmp/dask-worker-space/worker-rmufjl0g | GPU: Tesla T4 (14.76 GiB)
Worker 1 | Comm: tcp://127.0.0.1:34385 | Nanny: tcp://127.0.0.1:38263 | Dashboard: http://127.0.0.1:34747/status | Threads: 1 | Memory: 37.50 GiB | Local directory: /tmp/dask-worker-space/worker-w1ft0d1p | GPU: Tesla T4 (14.76 GiB)
Worker 2 | Comm: tcp://127.0.0.1:33201 | Nanny: tcp://127.0.0.1:33295 | Dashboard: http://127.0.0.1:44653/status | Threads: 1 | Memory: 37.50 GiB | Local directory: /tmp/dask-worker-space/worker-9stpwdj1 | GPU: Tesla T4 (14.76 GiB)
Worker 3 | Comm: tcp://127.0.0.1:42711 | Nanny: tcp://127.0.0.1:39527 | Dashboard: http://127.0.0.1:35079/status | Threads: 1 | Memory: 37.50 GiB | Local directory: /tmp/dask-worker-space/worker-870nf7ec | GPU: Tesla T4 (14.76 GiB)

COLUMNS: CATEGORICAL, CONTINUOUS and TARGET

In [9]:
#| export
import numpy as np
CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING = ['case_id', 'case_id_seq', 'reading_id']
In [10]:
#| export
CATEGORICAL_COLUMNS_NEED_ENCODING = []  # no categorical features require target encoding for this dataset
In [11]:
#| export
CONTINUOUS_COLUMNS = [
    'dim_0',
    'dim_1',
    'dim_2',
    'dim_3',
    'dim_4',
    'dim_5',
    'dim_6',
    'dim_7',
    'dim_8',
    'dim_9',
    'dim_10',
    'dim_11',
    'dim_12',
]
In [12]:
#| export
LABEL_COLUMNS = ['class_vals']

Workflow and Operations

In [13]:
import cudf
import numpy as np
cat_features_no_encoding = nvt.ColumnGroup(CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING)
#te_features = CATEGORICAL_COLUMNS_NEED_ENCODING >> ops.TargetEncoding(LABEL_COLUMNS, kfold=5, fold_seed=42, p_smooth=20)
cont_features = CONTINUOUS_COLUMNS >> ops.FillMissing() >> ops.Normalize()
label_name = LABEL_COLUMNS

workflow = nvt.Workflow(
    #cat_features_no_encoding + te_features + cont_features + label_name
    #cat_features_no_encoding + te_features + label_name
    cat_features_no_encoding  + cont_features + label_name
)

Datasets

Input data

In [14]:
pre_processed_train_dir = os.path.join("./", upstream['parquet_conversion_spoken_arabic_digits']['SpokenArabicDigits_TRAIN_RAW'])
pre_processed_valid_dir = os.path.join("./", upstream['parquet_conversion_spoken_arabic_digits']['SpokenArabicDigits_VALID_RAW'])
pre_processed_test_dir = os.path.join("./", upstream['parquet_conversion_spoken_arabic_digits']['SpokenArabicDigits_TEST_RAW'])

Training, Validation and Test datasets

In [15]:
train_dataset = nvt.Dataset(pre_processed_train_dir, engine='parquet')
valid_dataset = nvt.Dataset(pre_processed_valid_dir, engine='parquet')
test_dataset = nvt.Dataset(pre_processed_test_dir, engine='parquet')

Output location

In [16]:
output_train_dir = os.path.join("./", product['SpokenArabicDigits_TRAIN_TE'])
output_valid_dir = os.path.join("./", product['SpokenArabicDigits_VALID_TE'])
output_test_dir = os.path.join("./", product['SpokenArabicDigits_TEST_TE'])
In [17]:
!mkdir -p $output_train_dir
!mkdir -p $output_valid_dir
!mkdir -p $output_test_dir

Path to save the workflow to
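
The fitted workflow is saved under the Ploomber product entry SpokenArabicDigits_workflow_dir; a minimal sketch of resolving that path is shown below (the variable name workflow_dir is illustrative; later cells pass the product entry to workflow.save directly).

# Illustrative only: resolve the directory the fitted NVTabular workflow is saved to.
workflow_dir = os.path.join("./", product['SpokenArabicDigits_workflow_dir'])
print(workflow_dir)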

Fit: Train Dataset

In [18]:
%%time
workflow.fit(train_dataset)
CPU times: user 122 ms, sys: 48.1 ms, total: 170 ms
Wall time: 2.07 s
Out[18]:
<nvtabular.workflow.workflow.Workflow at 0x7fdbc14b8ca0>

Save workflow

In [19]:
%%time
workflow.save(product['SpokenArabicDigits_workflow_dir'])
CPU times: user 1.51 ms, sys: 0 ns, total: 1.51 ms
Wall time: 1.28 ms

Clear workflow

In [20]:
%%time
workflow = None
CPU times: user 1 µs, sys: 2 µs, total: 3 µs
Wall time: 6.2 µs

Load workflow

In [21]:
%%time
workflow = nvt.Workflow.load(product['SpokenArabicDigits_workflow_dir'], client=client)
CPU times: user 946 µs, sys: 0 ns, total: 946 µs
Wall time: 825 µs

Transform: Train Dataset

In [22]:
%%time
# Write to new "shuffled" and "processed" dataset
workflow.transform(train_dataset).to_parquet(
    output_train_dir,
    out_files_per_proc=2,
    shuffle=nvt.io.Shuffle.PER_PARTITION,
)
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/merlin/io/dataset.py:862: UserWarning: Only created 4 files did not have enough partitions to create 8 files.
  warnings.warn(
CPU times: user 167 ms, sys: 8.15 ms, total: 176 ms
Wall time: 2.38 s

Transform: Valid Dataset

In [23]:
%%time
# Write to new "shuffled" and "processed" dataset
workflow.transform(valid_dataset).to_parquet(
    output_valid_dir,
    out_files_per_proc=2,
    shuffle=nvt.io.Shuffle.PER_PARTITION,
)
CPU times: user 50.4 ms, sys: 5.51 ms, total: 55.9 ms
Wall time: 182 ms

Transform: Test Dataset

In [24]:
%%time
# Write to new "shuffled" and "processed" dataset
workflow.transform(test_dataset).to_parquet(
    output_test_dir,
    out_files_per_proc=2,
    shuffle=nvt.io.Shuffle.PER_PARTITION,
)
CPU times: user 56 ms, sys: 3.72 ms, total: 59.7 ms
Wall time: 203 ms

Verify Data

In [25]:
train_gdf = dask_cudf.read_parquet(output_train_dir)
In [26]:
%%time
train_gdf.head()
CPU times: user 11.5 ms, sys: 3.49 ms, total: 15 ms
Wall time: 24.7 ms
Out[26]:
dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 dim_8 dim_9 dim_10 dim_11 dim_12 case_id case_id_seq reading_id class_vals
0 -2.156874 0.475753 0.507209 0.587834 -0.031909 0.027982 0.380251 0.267008 -0.275396 0.317955 0.892540 0.286739 0.396203 3723 3723 39 5
1 -0.754594 -0.385721 0.440070 0.558343 1.252602 0.531703 -0.742155 -1.517387 -1.903049 -1.261425 -0.853626 -0.319565 -1.791995 3800 3800 44 5
2 1.145584 -1.290091 -1.330844 -1.632132 0.777363 -0.342097 0.185460 0.631775 -1.044989 -1.171606 -1.370700 2.073556 -0.611623 3658 3658 53 5
3 1.016003 1.013697 -1.244734 -0.083767 -0.541389 0.249727 -0.341089 0.787778 0.208822 0.322054 -0.643695 -0.408454 0.409597 4209 4209 31 6
4 -0.856874 -1.610310 -0.799904 0.128338 1.958033 1.633232 -0.667427 1.546357 1.853666 1.691711 -0.715517 0.772755 0.747864 3369 3369 1 5
In [27]:
%%time
train_gdf['case_id'].nunique().compute()
CPU times: user 113 ms, sys: 14.1 ms, total: 127 ms
Wall time: 152 ms
Out[27]:
5279
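
As an extra sanity check that is not part of the original pipeline, one could confirm that ops.Normalize() left the continuous training columns roughly zero-mean and unit-variance; a sketch using the dask_cudf mean/std reductions:

# Hypothetical extra check: continuous columns should be ~N(0, 1) after Normalize.
means = train_gdf[CONTINUOUS_COLUMNS].mean().compute()
stds = train_gdf[CONTINUOUS_COLUMNS].std().compute()
print(means.round(3))
print(stds.round(3))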
In [28]:
valid_gdf = dask_cudf.read_parquet(output_valid_dir)
In [29]:
%%time
valid_gdf.head()
CPU times: user 11.6 ms, sys: 3.65 ms, total: 15.3 ms
Wall time: 23.3 ms
Out[29]:
dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 dim_8 dim_9 dim_10 dim_11 dim_12 case_id case_id_seq reading_id class_vals
0 -0.083873 1.320869 1.008252 0.642920 -0.440215 -1.128561 -0.246360 -2.029218 -1.499457 0.651216 -0.375215 -0.266346 -0.617796 3412 3412 11 5
1 -1.576518 0.228166 0.852320 1.418352 0.835680 0.479848 -0.352939 -0.943208 -0.103780 0.718712 1.765949 0.904590 -1.146879 3979 3979 20 6
2 0.202786 -0.725515 0.049635 -0.734679 0.357877 -1.388294 -1.228936 -1.711134 -0.169800 -0.477506 -1.158517 0.217679 -0.254558 3651 3651 21 5
3 -0.592673 0.701845 1.155435 0.684398 -0.091579 -0.223135 -0.352350 -2.133810 0.786475 -0.921527 -0.870610 -0.202096 -0.767339 3774 3774 17 5
4 -2.338549 0.190234 0.565265 1.174318 1.000907 0.949724 0.296178 0.092061 0.476885 0.287979 -0.217853 0.210313 0.108411 3767 3767 30 5
In [30]:
%%time
valid_gdf['case_id'].nunique().compute()
CPU times: user 21.6 ms, sys: 0 ns, total: 21.6 ms
Wall time: 47.5 ms
Out[30]:
1320
In [31]:
test_gdf = dask_cudf.read_parquet(output_test_dir)
In [32]:
%%time
test_gdf.head()
CPU times: user 12 ms, sys: 2.96 ms, total: 15 ms
Wall time: 23.5 ms
Out[32]:
dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 dim_8 dim_9 dim_10 dim_11 dim_12 case_id case_id_seq reading_id class_vals
0 1.017019 0.477907 -1.017426 -1.067772 -1.056865 0.203524 -1.083452 -0.003735 1.201706 -0.087909 -1.798079 -1.849999 0.325625 239 239 8 1
1 -0.401030 0.030405 0.473714 0.583984 -0.249840 -0.688274 -0.888310 -1.316102 -0.097323 0.729371 1.667976 0.455513 0.932301 105 105 32 0
2 0.532542 1.040478 -0.330421 -0.400506 -1.323181 1.220669 -1.789089 -1.643401 0.741550 2.250455 0.772351 -1.924102 -0.014136 257 257 10 1
3 0.094137 -0.008135 -0.914392 -1.374714 -0.010304 -0.809902 -0.221825 -0.711646 -0.323396 0.366476 1.431594 0.792409 3.116056 168 168 37 0
4 1.445775 0.081121 -0.632863 1.823147 -2.215659 -1.224102 -1.997999 0.808217 -0.875756 0.593732 1.366622 -0.685766 -0.086496 40 40 12 0
In [33]:
%%time
test_gdf['case_id'].nunique().compute()
CPU times: user 12.7 ms, sys: 11.8 ms, total: 24.5 ms
Wall time: 50.2 ms
Out[33]:
2199
In [34]:
test_gdf.columns
Out[34]:
Index(['dim_0', 'dim_1', 'dim_2', 'dim_3', 'dim_4', 'dim_5', 'dim_6', 'dim_7',
       'dim_8', 'dim_9', 'dim_10', 'dim_11', 'dim_12', 'case_id',
       'case_id_seq', 'reading_id', 'class_vals'],
      dtype='object')
In [35]:
!ls -lrt --block-size=M $output_train_dir
total 23M
-rw-r--r-- 1 root root 1M Sep 23 19:01 schema.pbtxt
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_7.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_6.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_5.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_4.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_3.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_2.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_1.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_0.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata.json
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata
-rw-r--r-- 1 root root 1M Sep 23 19:01 _file_list.txt
In [36]:
!ls -lrt --block-size=M $output_valid_dir
total 6M
-rw-r--r-- 1 root root 1M Sep 23 19:01 schema.pbtxt
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_7.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_6.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_5.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_4.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_3.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_2.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_1.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_0.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata.json
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata
-rw-r--r-- 1 root root 1M Sep 23 19:01 _file_list.txt
In [37]:
!ls -lrt --block-size=M $output_test_dir
total 10M
-rw-r--r-- 1 root root 1M Sep 23 19:01 schema.pbtxt
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_3.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_2.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_1.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_0.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_5.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_4.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_7.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_6.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata.json
-rw-r--r-- 1 root root 1M Sep 23 19:01 _file_list.txt
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata

Shut down the Dask client and cluster

In [38]:
%%time
client.shutdown()
client.close()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError

Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1400, in _handle_report
    await self._reconnect()
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError
CPU times: user 25.4 ms, sys: 24.1 ms, total: 49.5 ms
Wall time: 670 ms
In [39]:
from nbdev import nbdev_export
nbdev_export()