#| default_exp feature_preprocessing.spoken_arabic_digits.target_encoding
%load_ext autoreload
%autoreload 2
# declare a list of tasks whose products you want to use as inputs
upstream = ['parquet_conversion_spoken_arabic_digits']
# Parameters
upstream = {"parquet_conversion_spoken_arabic_digits": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/104_data.spoken_arabic_digits.html", "SpokenArabicDigits_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/train", "SpokenArabicDigits_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/valid", "SpokenArabicDigits_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/test"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/204_feature_preprocessing.spoken_arabic_digits.target_encoding.html", "SpokenArabicDigits_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/train", "SpokenArabicDigits_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/valid", "SpokenArabicDigits_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/test", "SpokenArabicDigits_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/nvtabular_workflow"}
#| hide
from nbdev.showdoc import *
#| export
from vitmtsc import *
from vitmtsc.core import *
from vitmtsc.data.spoken_arabic_digits import *
import os
import nvtabular as nvt
import dask_cudf
from nvtabular import ops
#| export
upstream = {
"parquet_conversion_spoken_arabic_digits": {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/104_data.spoken_arabic_digits.html",
"SpokenArabicDigits_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/train",
"SpokenArabicDigits_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/valid",
"SpokenArabicDigits_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/raw/test",
}
}
product = {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/204_feature_preprocessing.spoken_arabic_digits.target_encoding.html",
"SpokenArabicDigits_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/train",
"SpokenArabicDigits_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/valid",
"SpokenArabicDigits_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/test",
"SpokenArabicDigits_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding/nvtabular_workflow",
}
!conda list | grep -i nvtabular
This notebook prepares features with the following NVTabular operations:

- Fill missing continuous features
- Normalize continuous features
- Categorify categorical features
- Target encoding of categorical variables

For SpokenArabicDigits, all thirteen feature dimensions are continuous and no categorical columns need encoding, so only fill-missing and normalization end up in the workflow below; a sketch of the smoothed target-encoding formula follows this list.
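NVTabular's `ops.TargetEncoding` replaces each category with a smoothed mean of the target: the per-category mean is blended with the global mean, weighted by the category count against the smoothing factor `p_smooth`, and computed out-of-fold (`kfold`) to limit target leakage. Below is a minimal pandas sketch of the smoothing formula; it is an illustration with hypothetical column names, not NVTabular's actual implementation.

import pandas as pd

def smoothed_target_encode(df, cat_col, target_col, p_smooth=20):
    # Rare categories are pulled toward the global target mean; frequent
    # categories keep something close to their own per-category mean.
    global_mean = df[target_col].mean()
    stats = df.groupby(cat_col)[target_col].agg(['mean', 'count'])
    encoding = (stats['mean'] * stats['count'] + global_mean * p_smooth) / (stats['count'] + p_smooth)
    return df[cat_col].map(encoding)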
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# One Dask worker per GPU. device_memory_limit=0.5 starts spilling device
# memory to host at 50% GPU-memory utilization; rmm_pool_size pre-allocates
# a 20 GB RMM pool backed by managed (unified) memory.
cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.5, rmm_pool_size='20GB', rmm_managed_memory=True)
client = Client(cluster)
client
Client: Client-27f97194-3b72-11ed-80b1-061eae6df733 (Cluster object, dask_cuda.LocalCUDACluster, status: running)
Dashboard: http://127.0.0.1:8787/status
Scheduler: tcp://127.0.0.1:41651 | Workers: 4 | Total threads: 4 | Total memory: 150.00 GiB
Each worker: 1 thread, 37.50 GiB host memory, one Tesla T4 GPU (14.76 GiB)
Columns: Categorical, Continuous and Target
#| export
import numpy as np
# Identifier columns that pass through unchanged; no encoding required.
CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING = ['case_id', 'case_id_seq', 'reading_id']
#| export
# Empty for this dataset: no categorical feature columns require target encoding.
CATEGORICAL_COLUMNS_NEED_ENCODING = []
#| export
CONTINUOUS_COLUMNS = [
'dim_0',
'dim_1',
'dim_2',
'dim_3',
'dim_4',
'dim_5',
'dim_6',
'dim_7',
'dim_8',
'dim_9',
'dim_10',
'dim_11',
'dim_12'
]
#| export
LABEL_COLUMNS = ['class_vals']
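As a quick sanity check (not part of the original run), the upstream raw parquet can be read back to confirm that these column lists match its schema; `upstream` and `dask_cudf` are already defined above.

raw_train = dask_cudf.read_parquet(upstream['parquet_conversion_spoken_arabic_digits']['SpokenArabicDigits_TRAIN_RAW'])
expected = set(CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING + CATEGORICAL_COLUMNS_NEED_ENCODING + CONTINUOUS_COLUMNS + LABEL_COLUMNS)
# Fail loudly if any expected column is missing from the raw data.
assert expected.issubset(raw_train.columns), expected - set(raw_train.columns)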
Workflow and Operations
import cudf
import numpy as np
cat_features_no_encoding = nvt.ColumnGroup(CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING)
# Target encoding is disabled because CATEGORICAL_COLUMNS_NEED_ENCODING is
# empty for SpokenArabicDigits; see the sketch after this cell for how it
# would be wired in.
#te_features = CATEGORICAL_COLUMNS_NEED_ENCODING >> ops.TargetEncoding(LABEL_COLUMNS, kfold=5, fold_seed=42, p_smooth=20)
cont_features = CONTINUOUS_COLUMNS >> ops.FillMissing() >> ops.Normalize()
label_name = LABEL_COLUMNS
workflow = nvt.Workflow(
    #cat_features_no_encoding + te_features + cont_features + label_name
    #cat_features_no_encoding + te_features + label_name
    cat_features_no_encoding + cont_features + label_name
)
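For a dataset that does have categorical feature columns, the disabled branch would be wired in as below. This is a sketch reusing the parameters from the commented line (kfold=5, fold_seed=42, p_smooth=20), not something executed in this notebook.

# Hypothetical: only meaningful when CATEGORICAL_COLUMNS_NEED_ENCODING is non-empty.
te_features = CATEGORICAL_COLUMNS_NEED_ENCODING >> ops.TargetEncoding(
    LABEL_COLUMNS,  # target column(s) the encoding is computed against
    kfold=5,        # out-of-fold estimation to limit target leakage
    fold_seed=42,
    p_smooth=20,    # smoothing toward the global target mean
)
workflow_te = nvt.Workflow(cat_features_no_encoding + te_features + cont_features + label_name)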
Datasets
Input data
pre_processed_train_dir = os.path.join("./", upstream['parquet_conversion_spoken_arabic_digits']['SpokenArabicDigits_TRAIN_RAW'])
pre_processed_valid_dir = os.path.join("./", upstream['parquet_conversion_spoken_arabic_digits']['SpokenArabicDigits_VALID_RAW'])
pre_processed_test_dir = os.path.join("./", upstream['parquet_conversion_spoken_arabic_digits']['SpokenArabicDigits_TEST_RAW'])
Training, Validation and Test datasets
train_dataset = nvt.Dataset(pre_processed_train_dir, engine='parquet')
valid_dataset = nvt.Dataset(pre_processed_valid_dir, engine='parquet')
test_dataset = nvt.Dataset(pre_processed_test_dir, engine='parquet')
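`nvt.Dataset` is a lazy wrapper over a dask/dask_cudf dataframe, so it can be inspected cheaply before fitting (an optional peek, not in the original run):

# The partition count also bounds how many output files can be written later.
print(train_dataset.to_ddf().npartitions)
print(train_dataset.to_ddf().dtypes)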
Output location
output_train_dir = os.path.join("./", product['SpokenArabicDigits_TRAIN_TE'])
output_valid_dir = os.path.join("./", product['SpokenArabicDigits_VALID_TE'])
output_test_dir = os.path.join("./", product['SpokenArabicDigits_TEST_TE'])
!mkdir -p $output_train_dir
!mkdir -p $output_valid_dir
!mkdir -p $output_test_dir
Fit workflow
%%time
workflow.fit(train_dataset)
CPU times: user 122 ms, sys: 48.1 ms, total: 170 ms Wall time: 2.07 s
<nvtabular.workflow.workflow.Workflow at 0x7fdbc14b8ca0>
Save workflow
%%time
workflow.save(product['SpokenArabicDigits_workflow_dir'])
CPU times: user 1.51 ms, sys: 0 ns, total: 1.51 ms Wall time: 1.28 ms
Clear workflow
%%time
workflow = None
CPU times: user 1 µs, sys: 2 µs, total: 3 µs Wall time: 6.2 µs
Load workflow
%%time
workflow = nvt.Workflow.load(product['SpokenArabicDigits_workflow_dir'], client=client)
CPU times: user 946 µs, sys: 0 ns, total: 946 µs Wall time: 825 µs
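To confirm the save/load round trip, the reloaded workflow's output schema can be inspected (a quick check; the `output_schema` attribute is assumed available in this NVTabular version):

print(workflow.output_schema.column_names)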
%%time
# Write to new "shuffled" and "processed" dataset
workflow.transform(train_dataset).to_parquet(
output_train_dir,
out_files_per_proc=2,
shuffle=nvt.io.Shuffle.PER_PARTITION,
)
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/merlin/io/dataset.py:862: UserWarning: Only created 4 files did not have enough partitions to create 8 files. warnings.warn(
CPU times: user 167 ms, sys: 8.15 ms, total: 176 ms Wall time: 2.38 s
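The UserWarning above is harmless: the input dataset had fewer dask partitions than the number of output files requested (out_files_per_proc=2 across 4 workers, i.e. 8), so NVTabular created fewer files than asked for; the written data is still complete.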
%%time
# Write to new "shuffled" and "processed" dataset
workflow.transform(valid_dataset).to_parquet(
output_valid_dir,
out_files_per_proc=2,
shuffle=nvt.io.Shuffle.PER_PARTITION,
)
CPU times: user 50.4 ms, sys: 5.51 ms, total: 55.9 ms Wall time: 182 ms
%%time
# Write to new "shuffled" and "processed" dataset
workflow.transform(test_dataset).to_parquet(
output_test_dir,
out_files_per_proc=2,
shuffle=nvt.io.Shuffle.PER_PARTITION,
)
CPU times: user 56 ms, sys: 3.72 ms, total: 59.7 ms Wall time: 203 ms
Verify Data
train_gdf = dask_cudf.read_parquet(output_train_dir)
%%time
train_gdf.head()
CPU times: user 11.5 ms, sys: 3.49 ms, total: 15 ms Wall time: 24.7 ms
| | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | dim_8 | dim_9 | dim_10 | dim_11 | dim_12 | case_id | case_id_seq | reading_id | class_vals |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -2.156874 | 0.475753 | 0.507209 | 0.587834 | -0.031909 | 0.027982 | 0.380251 | 0.267008 | -0.275396 | 0.317955 | 0.892540 | 0.286739 | 0.396203 | 3723 | 3723 | 39 | 5 |
1 | -0.754594 | -0.385721 | 0.440070 | 0.558343 | 1.252602 | 0.531703 | -0.742155 | -1.517387 | -1.903049 | -1.261425 | -0.853626 | -0.319565 | -1.791995 | 3800 | 3800 | 44 | 5 |
2 | 1.145584 | -1.290091 | -1.330844 | -1.632132 | 0.777363 | -0.342097 | 0.185460 | 0.631775 | -1.044989 | -1.171606 | -1.370700 | 2.073556 | -0.611623 | 3658 | 3658 | 53 | 5 |
3 | 1.016003 | 1.013697 | -1.244734 | -0.083767 | -0.541389 | 0.249727 | -0.341089 | 0.787778 | 0.208822 | 0.322054 | -0.643695 | -0.408454 | 0.409597 | 4209 | 4209 | 31 | 6 |
4 | -0.856874 | -1.610310 | -0.799904 | 0.128338 | 1.958033 | 1.633232 | -0.667427 | 1.546357 | 1.853666 | 1.691711 | -0.715517 | 0.772755 | 0.747864 | 3369 | 3369 | 1 | 5 |
%%time
train_gdf['case_id'].nunique().compute()
CPU times: user 113 ms, sys: 14.1 ms, total: 127 ms Wall time: 152 ms
5279
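Beyond row counts, we can spot-check that Normalize behaved as expected: on the training split (which the workflow was fit on) the continuous columns should have roughly zero mean and unit standard deviation. An extra check, not part of the original run:

means = train_gdf[CONTINUOUS_COLUMNS].mean().compute()
stds = train_gdf[CONTINUOUS_COLUMNS].std().compute()
print(means.abs().max(), (stds - 1).abs().max())  # both should be near 0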
valid_gdf = dask_cudf.read_parquet(output_valid_dir)
%%time
valid_gdf.head()
CPU times: user 11.6 ms, sys: 3.65 ms, total: 15.3 ms Wall time: 23.3 ms
| | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | dim_8 | dim_9 | dim_10 | dim_11 | dim_12 | case_id | case_id_seq | reading_id | class_vals |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.083873 | 1.320869 | 1.008252 | 0.642920 | -0.440215 | -1.128561 | -0.246360 | -2.029218 | -1.499457 | 0.651216 | -0.375215 | -0.266346 | -0.617796 | 3412 | 3412 | 11 | 5 |
1 | -1.576518 | 0.228166 | 0.852320 | 1.418352 | 0.835680 | 0.479848 | -0.352939 | -0.943208 | -0.103780 | 0.718712 | 1.765949 | 0.904590 | -1.146879 | 3979 | 3979 | 20 | 6 |
2 | 0.202786 | -0.725515 | 0.049635 | -0.734679 | 0.357877 | -1.388294 | -1.228936 | -1.711134 | -0.169800 | -0.477506 | -1.158517 | 0.217679 | -0.254558 | 3651 | 3651 | 21 | 5 |
3 | -0.592673 | 0.701845 | 1.155435 | 0.684398 | -0.091579 | -0.223135 | -0.352350 | -2.133810 | 0.786475 | -0.921527 | -0.870610 | -0.202096 | -0.767339 | 3774 | 3774 | 17 | 5 |
4 | -2.338549 | 0.190234 | 0.565265 | 1.174318 | 1.000907 | 0.949724 | 0.296178 | 0.092061 | 0.476885 | 0.287979 | -0.217853 | 0.210313 | 0.108411 | 3767 | 3767 | 30 | 5 |
%%time
valid_gdf['case_id'].nunique().compute()
CPU times: user 21.6 ms, sys: 0 ns, total: 21.6 ms Wall time: 47.5 ms
1320
test_gdf = dask_cudf.read_parquet(output_test_dir)
%%time
test_gdf.head()
CPU times: user 12 ms, sys: 2.96 ms, total: 15 ms Wall time: 23.5 ms
| | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | dim_8 | dim_9 | dim_10 | dim_11 | dim_12 | case_id | case_id_seq | reading_id | class_vals |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.017019 | 0.477907 | -1.017426 | -1.067772 | -1.056865 | 0.203524 | -1.083452 | -0.003735 | 1.201706 | -0.087909 | -1.798079 | -1.849999 | 0.325625 | 239 | 239 | 8 | 1 |
1 | -0.401030 | 0.030405 | 0.473714 | 0.583984 | -0.249840 | -0.688274 | -0.888310 | -1.316102 | -0.097323 | 0.729371 | 1.667976 | 0.455513 | 0.932301 | 105 | 105 | 32 | 0 |
2 | 0.532542 | 1.040478 | -0.330421 | -0.400506 | -1.323181 | 1.220669 | -1.789089 | -1.643401 | 0.741550 | 2.250455 | 0.772351 | -1.924102 | -0.014136 | 257 | 257 | 10 | 1 |
3 | 0.094137 | -0.008135 | -0.914392 | -1.374714 | -0.010304 | -0.809902 | -0.221825 | -0.711646 | -0.323396 | 0.366476 | 1.431594 | 0.792409 | 3.116056 | 168 | 168 | 37 | 0 |
4 | 1.445775 | 0.081121 | -0.632863 | 1.823147 | -2.215659 | -1.224102 | -1.997999 | 0.808217 | -0.875756 | 0.593732 | 1.366622 | -0.685766 | -0.086496 | 40 | 40 | 12 | 0 |
%%time
test_gdf['case_id'].nunique().compute()
CPU times: user 12.7 ms, sys: 11.8 ms, total: 24.5 ms Wall time: 50.2 ms
2199
test_gdf.columns
Index(['dim_0', 'dim_1', 'dim_2', 'dim_3', 'dim_4', 'dim_5', 'dim_6', 'dim_7', 'dim_8', 'dim_9', 'dim_10', 'dim_11', 'dim_12', 'case_id', 'case_id_seq', 'reading_id', 'class_vals'], dtype='object')
!ls -lrt --block-size=M $output_train_dir
total 23M
-rw-r--r-- 1 root root 1M Sep 23 19:01 schema.pbtxt
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_7.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_6.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_5.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_4.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_3.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_2.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_1.parquet
-rw-r--r-- 1 root root 3M Sep 23 19:01 part_0.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata.json
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata
-rw-r--r-- 1 root root 1M Sep 23 19:01 _file_list.txt
!ls -lrt --block-size=M $output_valid_dir
total 6M
-rw-r--r-- 1 root root 1M Sep 23 19:01 schema.pbtxt
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_7.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_6.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_5.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_4.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_3.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_2.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_1.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 part_0.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata.json
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata
-rw-r--r-- 1 root root 1M Sep 23 19:01 _file_list.txt
!ls -lrt --block-size=M $output_test_dir
total 10M
-rw-r--r-- 1 root root 1M Sep 23 19:01 schema.pbtxt
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_3.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_2.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_1.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_0.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_5.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_4.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_7.parquet
-rw-r--r-- 1 root root 2M Sep 23 19:01 part_6.parquet
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata.json
-rw-r--r-- 1 root root 1M Sep 23 19:01 _file_list.txt
-rw-r--r-- 1 root root 1M Sep 23 19:01 _metadata
Finally, we shut down the Dask cluster and reset the kernel.
%%time
client.shutdown()
client.close()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1400, in _handle_report
    await self._reconnect()
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError
CPU times: user 25.4 ms, sys: 24.1 ms, total: 49.5 ms Wall time: 670 ms
from nbdev import nbdev_export
nbdev_export()