#| default_exp feature_preprocessing.pen_digits.target_encoding
%load_ext autoreload
%autoreload 2
# declare a list of tasks whose products you want to use as inputs
upstream = ['parquet_conversion_pen_digits']
# Parameters
upstream = {"parquet_conversion_pen_digits": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/103_data.pen_digits.html", "PenDigits_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/train", "PenDigits_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/valid", "PenDigits_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/test"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/203_feature_preprocessing.pen_digits.target_encoding.html", "PenDigits_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/target_encoding/train", "PenDigits_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/target_encoding/valid", "PenDigits_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/target_encoding/test", "PenDigits_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/target_encoding/nvtabular_workflow"}
#| hide
from nbdev.showdoc import *
#| export
from vitmtsc import *
from vitmtsc.core import *
from vitmtsc.data.pen_digits import *
import os
import nvtabular as nvt
import dask_cudf
from nvtabular import ops
#| export
upstream = {
    "parquet_conversion_pen_digits": {
        "nb": "/home/ubuntu/vitmtsc_nbdev/output/103_data.pen_digits.html",
        "PenDigits_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/train",
        "PenDigits_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/valid",
        "PenDigits_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/test",
    }
}
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/203_feature_preprocessing.pen_digits.target_encoding.html",
    "PenDigits_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/target_encoding/train",
    "PenDigits_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/target_encoding/valid",
    "PenDigits_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/target_encoding/test",
    "PenDigits_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/target_encoding/nvtabular_workflow",
}
!conda list|grep -i nvtabular
NVTabular feature preprocessing operations:

- Fill missing continuous features
- Normalize continuous features
- Categorify categorical features
- Target encoding of categorical variables

For PenDigits only the continuous-feature operations are applied below, since CATEGORICAL_COLUMNS_NEED_ENCODING is empty; a sketch of the full chain follows this list.
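As a rough, hypothetical sketch (not executed in this notebook; the categorical column name sensor_id is made up purely for illustration), a chain combining all four operations could look like:
import nvtabular as nvt
from nvtabular import ops
# Hypothetical column lists for illustration only
cat_cols = ['sensor_id']        # made-up categorical feature column
cont_cols = ['dim_0', 'dim_1']  # continuous columns used later in this notebook
target = ['class_vals']
cat_feats = cat_cols >> ops.Categorify()                                               # integer-encode categories
te_feats = cat_cols >> ops.TargetEncoding(target, kfold=5, fold_seed=42, p_smooth=20)  # out-of-fold target encoding
cont_feats = cont_cols >> ops.FillMissing() >> ops.Normalize()                         # impute, then standardize
workflow_sketch = nvt.Workflow(cat_feats + te_feats + cont_feats + target)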
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
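# One Dask worker per visible GPU; spill device memory to host at ~50% utilization
# (device_memory_limit=0.5), with a 20 GB RMM pool per worker backed by CUDA managed memory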
cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.5, rmm_pool_size='20GB', rmm_managed_memory=True)
client = Client(cluster)
client
LocalCUDACluster started (dashboard: http://127.0.0.1:8787/status): 4 workers, 1 thread each, 150.00 GiB total host memory, one Tesla T4 (14.76 GiB GPU memory) per worker.
COLUMNS: CATEGORICAL, CONTINUOUS and TARGET
#| export
import numpy as np
CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING = ['case_id', 'case_id_seq', 'reading_id']  # identifier columns, passed through unchanged
#| export
CATEGORICAL_COLUMNS_NEED_ENCODING = []  # PenDigits has no categorical feature columns to target-encode
#| export
CONTINUOUS_COLUMNS = [
'dim_0',
'dim_1'
]
#| export
LABEL_COLUMNS = ['class_vals']
Workflow and Operations
import cudf
import numpy as np
# Identifier columns are passed through without any encoding
cat_features_no_encoding = nvt.ColumnGroup(CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING)
# Target encoding is skipped for PenDigits because CATEGORICAL_COLUMNS_NEED_ENCODING is empty
#te_features = CATEGORICAL_COLUMNS_NEED_ENCODING >> ops.TargetEncoding(LABEL_COLUMNS, kfold=5, fold_seed=42, p_smooth=20)
# Continuous features: impute missing values, then standardize
cont_features = CONTINUOUS_COLUMNS >> ops.FillMissing() >> ops.Normalize()
label_name = LABEL_COLUMNS
workflow = nvt.Workflow(
#cat_features_no_encoding + te_features + cont_features + label_name
#cat_features_no_encoding + te_features + label_name
cat_features_no_encoding + cont_features + label_name
)
Datasets
Input data
pre_processed_train_dir = os.path.join("./", upstream['parquet_conversion_pen_digits']['PenDigits_TRAIN_RAW'])
pre_processed_valid_dir = os.path.join("./", upstream['parquet_conversion_pen_digits']['PenDigits_VALID_RAW'])
pre_processed_test_dir = os.path.join("./", upstream['parquet_conversion_pen_digits']['PenDigits_TEST_RAW'])
Training, Validation and Test datasets
train_dataset = nvt.Dataset(pre_processed_train_dir, engine='parquet')
valid_dataset = nvt.Dataset(pre_processed_valid_dir, engine='parquet')
test_dataset = nvt.Dataset(pre_processed_test_dir, engine='parquet')
Output location
output_train_dir = os.path.join("./", product['PenDigits_TRAIN_TE'])
output_valid_dir = os.path.join("./", product['PenDigits_VALID_TE'])
output_test_dir = os.path.join("./", product['PenDigits_TEST_TE'])
!mkdir -p $output_train_dir
!mkdir -p $output_valid_dir
!mkdir -p $output_test_dir
Fit workflow
%%time
workflow.fit(train_dataset)
CPU times: user 108 ms, sys: 32.6 ms, total: 140 ms Wall time: 1.96 s
<nvtabular.workflow.workflow.Workflow at 0x7ffa02413fa0>
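Before saving, the fitted workflow could optionally be sanity-checked by peeking at a few transformed rows; a small sketch (not executed here):
# Optional check: lazily apply the fitted workflow and inspect the first transformed rows
workflow.transform(train_dataset).to_ddf().head()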
Save workflow
%%time
workflow.save(product['PenDigits_workflow_dir'])
CPU times: user 1.17 ms, sys: 0 ns, total: 1.17 ms Wall time: 977 µs
Clear workflow
%%time
workflow = None
CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs Wall time: 5.72 µs
Load workflow
%%time
workflow = nvt.Workflow.load(product['PenDigits_workflow_dir'], client=client)
CPU times: user 255 µs, sys: 339 µs, total: 594 µs Wall time: 451 µs
%%time
# Write to new "shuffled" and "processed" dataset
workflow.transform(train_dataset).to_parquet(
output_train_dir,
out_files_per_proc=2,
shuffle=nvt.io.Shuffle.PER_PARTITION,
)
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/merlin/io/dataset.py:862: UserWarning: Only created 4 files did not have enough partitions to create 8 files. warnings.warn(
CPU times: user 114 ms, sys: 29.2 ms, total: 143 ms Wall time: 2.27 s
%%time
# Write to new "shuffled" and "processed" dataset
workflow.transform(valid_dataset).to_parquet(
output_valid_dir,
out_files_per_proc=2,
shuffle=nvt.io.Shuffle.PER_PARTITION,
)
CPU times: user 44.7 ms, sys: 1.31 ms, total: 46 ms Wall time: 110 ms
%%time
# Write to new "shuffled" and "processed" dataset
workflow.transform(test_dataset).to_parquet(
output_test_dir,
out_files_per_proc=2,
shuffle=nvt.io.Shuffle.PER_PARTITION,
)
CPU times: user 34.9 ms, sys: 5.9 ms, total: 40.8 ms Wall time: 118 ms
Verify Data
train_gdf = dask_cudf.read_parquet(output_train_dir)
%%time
train_gdf.head()
CPU times: user 10.5 ms, sys: 3.13 ms, total: 13.6 ms Wall time: 21.1 ms
|   | dim_0 | dim_1 | case_id | case_id_seq | reading_id | class_vals |
|---|---|---|---|---|---|---|
| 0 | 0.325309 | -0.466866 | 699 | 699 | 6 | 6 |
| 1 | 1.477136 | -0.495294 | 766 | 766 | 4 | 4 |
| 2 | 1.477136 | 1.238846 | 730 | 730 | 0 | 8 |
| 3 | 0.413911 | -1.206007 | 731 | 731 | 5 | 1 |
| 4 | -0.619781 | 0.584990 | 47 | 47 | 0 | 2 |
%%time
train_gdf['case_id'].nunique().compute()
CPU times: user 120 ms, sys: 5.39 ms, total: 126 ms Wall time: 149 ms
5995
valid_gdf = dask_cudf.read_parquet(output_valid_dir)
%%time
valid_gdf.head()
CPU times: user 9.66 ms, sys: 4.98 ms, total: 14.6 ms Wall time: 21.8 ms
|   | dim_0 | dim_1 | case_id | case_id_seq | reading_id | class_vals |
|---|---|---|---|---|---|---|
| 0 | 1.477136 | 0.158562 | 321 | 321 | 5 | 4 |
| 1 | -0.088168 | -0.296295 | 511 | 511 | 6 | 8 |
| 2 | -1.476268 | -1.461864 | 500 | 500 | 7 | 5 |
| 3 | -0.472110 | 1.380989 | 601 | 601 | 3 | 9 |
| 4 | 1.270398 | 1.153561 | 625 | 625 | 3 | 9 |
%%time
valid_gdf['case_id'].nunique().compute()
CPU times: user 11.7 ms, sys: 8.6 ms, total: 20.3 ms Wall time: 48.1 ms
1499
test_gdf = dask_cudf.read_parquet(output_test_dir)
%%time
test_gdf.head()
CPU times: user 11.1 ms, sys: 2.82 ms, total: 14 ms Wall time: 21.1 ms
|   | dim_0 | dim_1 | case_id | case_id_seq | reading_id | class_vals |
|---|---|---|---|---|---|---|
| 0 | 0.000434 | -0.836437 | 357 | 357 | 6 | 1 |
| 1 | -0.856053 | -1.461864 | 91 | 91 | 4 | 7 |
| 2 | -0.117702 | 0.584990 | 214 | 214 | 2 | 2 |
| 3 | 0.413911 | -1.461864 | 364 | 364 | 6 | 9 |
| 4 | -1.476268 | 0.044848 | 277 | 277 | 0 | 1 |
%%time
test_gdf['case_id'].nunique().compute()
CPU times: user 19.7 ms, sys: 732 µs, total: 20.4 ms Wall time: 46.3 ms
3498
test_gdf.columns
Index(['dim_0', 'dim_1', 'case_id', 'case_id_seq', 'reading_id', 'class_vals'], dtype='object')
!ls -lrt --block-size=M $output_train_dir
total 1M
-rw-r--r-- 1 root root 1M Sep 23 19:01 schema.pbtxt
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_7.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_6.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_4.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_5.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_3.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_2.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_1.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_0.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 _file_list.txt
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata.json
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata
!ls -lrt --block-size=M $output_valid_dir
total 1M
-rw-r--r-- 1 root root 1M Sep 23 19:01 schema.pbtxt
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_7.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_6.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_5.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_4.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_1.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_0.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_3.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_2.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata.json
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata
-rw-r--r-- 1 root root 1M Sep 23 19:01 _file_list.txt
!ls -lrt --block-size=M $output_test_dir
total 1M
-rw-r--r-- 1 root root 1M Sep 23 19:01 schema.pbtxt
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_7.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_6.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_5.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_4.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_3.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_2.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_1.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_0.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata.json
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata
-rw-r--r-- 1 root root 1M Sep 23 19:01 _file_list.txt
Shut down the Dask client and reset the kernel.
%%time
client.shutdown()
client.close()
[Output: repeated asyncio.exceptions.CancelledError tracebacks from distributed.client's reconnect loop while the cluster shuts down]
CPU times: user 31.9 ms, sys: 12.6 ms, total: 44.4 ms Wall time: 670 ms
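The CancelledError noted above comes from the client's reconnect loop racing with the cluster teardown; a gentler alternative (a sketch, not executed here) is to close the client connection first and only then tear down the cluster object:
# Alternative teardown sketch: close the client connection before closing the cluster
client.close()
cluster.close()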
from nbdev import nbdev_export
nbdev_export()