In [1]:
#| default_exp feature_preprocessing.character_trajectories.target_encoding
%load_ext autoreload
%autoreload 2
In [2]:
# declare a list of tasks whose products you want to use as inputs
upstream = ['parquet_conversion_character_trajectories']
In [3]:
# Parameters
upstream = {"parquet_conversion_character_trajectories": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/105_data.character_trajectories.html", "CharacterTrajectories_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/train", "CharacterTrajectories_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/valid", "CharacterTrajectories_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/test"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/205_feature_preprocessing.character_trajectories.target_encoding.html", "CharacterTrajectories_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/target_encoding/train", "CharacterTrajectories_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/target_encoding/valid", "CharacterTrajectories_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/target_encoding/test", "CharacterTrajectories_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/target_encoding/nvtabular_workflow"}
In [4]:
#| hide
from nbdev.showdoc import *
In [5]:
#| export
from vitmtsc import *
from vitmtsc.core import *
from vitmtsc.data.character_trajectories import *
import os
import nvtabular as nvt
import dask_cudf
from nvtabular import ops
In [6]:
#| export
upstream = {
    "parquet_conversion_character_trajectories": {
        "nb": "/home/ubuntu/vitmtsc_nbdev/output/105_data.character_trajectories.html",
        "CharacterTrajectories_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/train",
        "CharacterTrajectories_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/valid",
        "CharacterTrajectories_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/test",
    }
}
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/205_feature_preprocessing.character_trajectories.target_encoding.html",
    "CharacterTrajectories_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/target_encoding/train",
    "CharacterTrajectories_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/target_encoding/valid",
    "CharacterTrajectories_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/target_encoding/test",
    "CharacterTrajectories_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/target_encoding/nvtabular_workflow",
}
In [7]:
!conda list|grep -i nvtabular

Feature Preprocessing via NVTabular

Fill missing continuous features

Normalize continuous features

Categorify categorical features

Target-encode categorical features (not needed for this dataset; see the sketch below)
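The workflow fitted below only fills and normalizes the continuous channels, since this dataset has no categorical feature columns that need encoding. For reference, a minimal sketch of how the remaining two steps would be expressed with NVTabular ops; the column names here are placeholders and this cell is not part of the executed pipeline:

# placeholder column lists, for illustration only
example_cat_cols = ['some_categorical_col']
example_label_cols = ['class_vals']
categorified = example_cat_cols >> ops.Categorify()
target_encoded = example_cat_cols >> ops.TargetEncoding(example_label_cols, kfold=5, fold_seed=42, p_smooth=20)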

In [8]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

cluster = LocalCUDACluster(
    memory_limit='auto',          # per-worker host memory limit ('auto' splits system memory across workers)
    device_memory_limit=0.5,      # start spilling GPU memory to host once 50% of device memory is used
    rmm_pool_size='20GB',         # pre-allocate a 20 GB RMM memory pool per worker
    rmm_managed_memory=True,      # use CUDA managed memory so the pool can exceed physical GPU memory
)
client = Client(cluster)
client
2022-09-23 19:01:11,933 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:01:11,933 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:01:11,933 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:01:11,933 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:01:11,935 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:01:11,935 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:01:11,978 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:01:11,978 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Out[8]:

Client: Client-1799f47a-3b72-11ed-80a5-0a4f0fdf7975
Connection method: Cluster object, cluster type: dask_cuda.LocalCUDACluster
Dashboard: http://127.0.0.1:8787/status

LocalCUDACluster: 4 workers, 4 total threads, 150.00 GiB total memory, status running
Workers 0-3: 1 thread and 37.50 GiB host memory each, one Tesla T4 per worker (14.76 GiB GPU memory)

COLUMNS: CATEGORICAL, CONTINUOUS and TARGET

In [9]:
#| export
import numpy as np
CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING = ['case_id', 'case_id_seq', 'reading_id']
In [10]:
#| export
CATEGORICAL_COLUMNS_NEED_ENCODING = []  # no categorical feature columns require target encoding for this dataset
In [11]:
#| export
CONTINUOUS_COLUMNS = [
    'dim_0',
    'dim_1',
    'dim_2',
]
In [12]:
#| export
LABEL_COLUMNS = ['class_vals']

Workflow and Operations

In [13]:
cat_features_no_encoding = nvt.ColumnGroup(CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING)
#te_features = CATEGORICAL_COLUMNS_NEED_ENCODING >> ops.TargetEncoding(LABEL_COLUMNS, kfold=5, fold_seed=42, p_smooth=20)
cont_features = CONTINUOUS_COLUMNS >> ops.FillMissing() >> ops.Normalize()
label_name = LABEL_COLUMNS

workflow = nvt.Workflow(
    #cat_features_no_encoding + te_features + cont_features + label_name
    #cat_features_no_encoding + te_features + label_name
    cat_features_no_encoding + cont_features + label_name
)

Datasets

Input data

In [14]:
pre_processed_train_dir = os.path.join("./", upstream['parquet_conversion_character_trajectories']['CharacterTrajectories_TRAIN_RAW'])
pre_processed_valid_dir = os.path.join("./", upstream['parquet_conversion_character_trajectories']['CharacterTrajectories_VALID_RAW'])
pre_processed_test_dir = os.path.join("./", upstream['parquet_conversion_character_trajectories']['CharacterTrajectories_TEST_RAW'])

Training, Validation and Test datasets

In [15]:
train_dataset = nvt.Dataset(pre_processed_train_dir, engine='parquet')
valid_dataset = nvt.Dataset(pre_processed_valid_dir, engine='parquet')
test_dataset = nvt.Dataset(pre_processed_test_dir, engine='parquet')

Output location

In [16]:
output_train_dir = os.path.join("./", product['CharacterTrajectories_TRAIN_TE'])
output_valid_dir = os.path.join("./", product['CharacterTrajectories_VALID_TE'])
output_test_dir = os.path.join("./", product['CharacterTrajectories_TEST_TE'])
In [17]:
!mkdir -p $output_train_dir
!mkdir -p $output_valid_dir
!mkdir -p $output_test_dir

The fitted workflow is saved to the path given by product['CharacterTrajectories_workflow_dir'] (see Save workflow below).

Fit: Train Dataset

In [18]:
%%time
workflow.fit(train_dataset)
CPU times: user 139 ms, sys: 11.9 ms, total: 151 ms
Wall time: 1.96 s
Out[18]:
<nvtabular.workflow.workflow.Workflow at 0x7f6fc9d83ac0>
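Before saving, it can be useful to confirm what the fitted workflow will emit. A minimal sketch, assuming the output_schema property exposed by Merlin-based NVTabular releases (not executed in this run):

# columns and dtypes the fitted workflow will produce
print(workflow.output_schema)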

Save workflow

In [19]:
%%time
workflow.save(product['CharacterTrajectories_workflow_dir'])
CPU times: user 1.1 ms, sys: 132 µs, total: 1.23 ms
Wall time: 1.04 ms

Clear workflow

In [20]:
%%time
workflow = None
CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 5.72 µs

Load workflow

In [21]:
%%time
workflow = nvt.Workflow.load(product['CharacterTrajectories_workflow_dir'], client=client)
CPU times: user 282 µs, sys: 375 µs, total: 657 µs
Wall time: 503 µs

Transform: Train Dataset

In [22]:
%%time
# Write to new "shuffled" and "processed" dataset
workflow.transform(train_dataset).to_parquet(
    output_train_dir,
    out_files_per_proc=2,
    shuffle=nvt.io.Shuffle.PER_PARTITION,
)
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/merlin/io/dataset.py:862: UserWarning: Only created 4 files did not have enough partitions to create 8 files.
  warnings.warn(
CPU times: user 135 ms, sys: 23.6 ms, total: 159 ms
Wall time: 2.29 s
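The UserWarning above is expected: with out_files_per_proc=2 on 4 workers the writer asks for 8 output files, but the training dataset only has 4 partitions, so only 4 files are written. It is harmless.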

Transform: Valid Dataset

In [23]:
%%time
# Write to new "shuffled" and "processed" dataset
workflow.transform(valid_dataset).to_parquet(
    output_valid_dir,
    out_files_per_proc=2,
    shuffle=nvt.io.Shuffle.PER_PARTITION,
)
CPU times: user 41.3 ms, sys: 5.97 ms, total: 47.3 ms
Wall time: 133 ms

Transform: Test Dataset

In [24]:
%%time
# Write to new "shuffled" and "processed" dataset
workflow.transform(test_dataset).to_parquet(
    output_test_dir,
    out_files_per_proc=2,
    shuffle=nvt.io.Shuffle.PER_PARTITION,
)
CPU times: user 44.2 ms, sys: 642 µs, total: 44.8 ms
Wall time: 146 ms

Verify Data

In [25]:
train_gdf = dask_cudf.read_parquet(output_train_dir)
In [26]:
%%time
train_gdf.head()
CPU times: user 15.9 ms, sys: 17 µs, total: 15.9 ms
Wall time: 24.4 ms
Out[26]:
dim_0 dim_1 dim_2 case_id case_id_seq reading_id class_vals
0 -0.153776 0.095584 -1.352046 821 821 119 11
1 -0.147927 0.096286 1.340823 822 822 1 11
2 -1.395404 -0.386817 0.698436 769 769 24 10
3 0.659693 0.590795 0.265359 874 874 70 11
4 -0.754230 -0.941674 0.314788 800 800 34 10
In [27]:
%%time
train_gdf['case_id'].nunique().compute()
CPU times: user 121 ms, sys: 6.26 ms, total: 127 ms
Wall time: 150 ms
Out[27]:
1137
In [28]:
valid_gdf = dask_cudf.read_parquet(output_valid_dir)
In [29]:
%%time
valid_gdf.head()
CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 23.3 ms
Out[29]:
dim_0 dim_1 dim_2 case_id case_id_seq reading_id class_vals
0 -1.375186 -2.209853 0.295608 893 893 75 12
1 0.515818 2.094258 0.314136 865 865 46 11
2 -0.512440 0.047859 -1.610068 888 888 155 12
3 -0.874572 -0.413067 0.518190 869 869 88 11
4 -0.485885 0.238515 1.085228 772 772 9 10
In [30]:
%%time
valid_gdf['case_id'].nunique().compute()
CPU times: user 20.4 ms, sys: 683 µs, total: 21.1 ms
Wall time: 48.2 ms
Out[30]:
285
In [31]:
test_gdf = dask_cudf.read_parquet(output_test_dir)
In [32]:
%%time
test_gdf.head()
CPU times: user 10.7 ms, sys: 3.39 ms, total: 14.1 ms
Wall time: 22.1 ms
Out[32]:
dim_0 dim_1 dim_2 case_id case_id_seq reading_id class_vals
0 1.127216 2.960204 0.424676 835 835 47 11
1 -1.598934 0.352640 0.295608 753 753 69 10
2 1.218612 1.402855 0.459686 750 750 48 10
3 1.243141 -0.345703 -1.553693 818 818 98 10
4 0.673112 2.115456 -0.124735 852 852 52 11
In [33]:
%%time
test_gdf['case_id'].nunique().compute()
CPU times: user 18.3 ms, sys: 2.98 ms, total: 21.3 ms
Wall time: 47.9 ms
Out[33]:
1436
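The case counts are consistent with the UEA CharacterTrajectories split: 1137 + 285 = 1422 cases across train and validation, and 1436 test cases.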
In [34]:
test_gdf.columns
Out[34]:
Index(['dim_0', 'dim_1', 'dim_2', 'case_id', 'case_id_seq', 'reading_id',
       'class_vals'],
      dtype='object')
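As a quick sanity check (a hedged addition, not part of the original notebook), the output columns can be asserted against the column lists defined earlier:

# every configured column should survive the workflow unchanged in name
expected_cols = set(CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING + CONTINUOUS_COLUMNS + LABEL_COLUMNS)
assert set(test_gdf.columns) == expected_cols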
In [35]:
!ls -lrt --block-size=M $output_train_dir
total 4M
-rw-r--r-- 1 root root 1M Sep 23 19:01 schema.pbtxt
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_5.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_4.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_7.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_6.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_1.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_0.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_3.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_2.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata.json
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata
-rw-r--r-- 1 root root 1M Sep 23 19:01 _file_list.txt
In [36]:
!ls -lrt --block-size=M $output_valid_dir
total 1M
-rw-r--r-- 1 root root 1M Sep 23 19:01 schema.pbtxt
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_5.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_4.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_7.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_6.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_3.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_2.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_1.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_0.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata.json
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata
-rw-r--r-- 1 root root 1M Sep 23 19:01 _file_list.txt
In [37]:
!ls -lrt --block-size=M $output_test_dir
total 5M
-rw-r--r-- 1 root root 1M Sep 23 19:01 schema.pbtxt
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_7.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_6.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_5.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_4.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_3.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_2.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_1.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_0.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata.json
-rw-r--r-- 1 root root 1M Sep 23 19:01 _file_list.txt
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata
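Besides the parquet parts, each output directory contains the _file_list.txt, _metadata, _metadata.json, and schema.pbtxt files that NVTabular writes alongside the data so that Merlin dataloaders and later pipeline steps can consume the datasets directly.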

Shut down the Dask client and reset the kernel

In [38]:
%%time
client.shutdown()
client.close()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError

Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1400, in _handle_report
    await self._reconnect()
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError
CPU times: user 39.1 ms, sys: 10.2 ms, total: 49.4 ms
Wall time: 669 ms
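The CancelledError tracebacks above are benign: shutting down the cluster cancels the client's background reconnect loop, which surfaces here as cancelled asyncio tasks.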
In [39]:
from nbdev import nbdev_export
nbdev_export()