In [1]:
#| default_exp data.pen_digits
%load_ext autoreload
%autoreload 2
In [2]:
# declare a list of tasks whose products you want to use as inputs
upstream = ['core']
In [3]:
# Parameters
upstream = {"core": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html", "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts", "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts", "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts", "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts", "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts", "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts", "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts", "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts", "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts", "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/103_data.pen_digits.html", "PenDigits_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/train", "PenDigits_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/valid", "PenDigits_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/test"}
In [4]:
#| hide
from nbdev.showdoc import *
In [5]:
#| export
from vitmtsc import *
from vitmtsc.core import *
import dask_cudf
import gc  # garbage-collector interface
In [6]:
#| export
upstream = {
    "core": {
        "nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html",
        "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
        "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
        "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
        "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
        "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
        "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
        "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
        "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
        "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
        "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
    }
}
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/103_data.pen_digits.html",
    "PenDigits_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/train",
    "PenDigits_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/valid",
    "PenDigits_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/test",
}

Data Download and Conversion

PenDigits dataset

Convert the dataset to parquet format so that target encoding can be run on it

In [7]:
#| export
DATASET_NAME = 'PenDigits'

Download and convert the dataset to tabular format

In [8]:
%%time
train = get_mtsc_data_tabular_from_ts(upstream['core']['PenDigits_TRAIN_TS'])
train.shape
Reading dataset TS file...
Converting _x to tabular format...
Converting _y to tabular format...
Merging _x and _y...
CPU times: user 10.1 s, sys: 132 ms, total: 10.2 s
Wall time: 10.2 s
Out[8]:
(59952, 5)
In [9]:
train.head()
Out[9]:
case_id reading_id dim_0 dim_1 class_vals
0 0 0 47.0 100.0 8
1 0 1 27.0 81.0 8
2 0 2 57.0 37.0 8
3 0 3 26.0 0.0 8
4 0 4 0.0 23.0 8
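
The long format shown above (one row per case_id and reading_id, one column per dimension, plus the class label) is produced by get_mtsc_data_tabular_from_ts from vitmtsc.core. For orientation, a minimal sketch of this kind of .ts-to-long conversion, assuming sktime's load_from_tsfile_to_dataframe is available (the actual helper may differ), could look like:

# Hypothetical sketch of a .ts -> long-format conversion (not the vitmtsc.core implementation).
import pandas as pd
from sktime.datasets import load_from_tsfile_to_dataframe

def ts_to_long(ts_path):
    # X is a nested DataFrame: one row per case, one pd.Series per (case, dimension) cell.
    X, y = load_from_tsfile_to_dataframe(ts_path)
    rows = []
    for case_id, (_, case) in enumerate(X.iterrows()):
        for reading_id in range(len(case['dim_0'])):
            rows.append({
                'case_id': case_id,
                'reading_id': reading_id,
                **{dim: case[dim].iloc[reading_id] for dim in X.columns},
                'class_vals': y[case_id],
            })
    return pd.DataFrame(rows)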
In [10]:
train['reading_id'].min(), train['reading_id'].max()
Out[10]:
(0, 7)
In [11]:
train['class_vals'].unique()
Out[11]:
array(['8', '2', '1', '4', '6', '0', '5', '9', '7', '3'], dtype=object)
In [12]:
%%time
test = get_mtsc_data_tabular_from_ts(upstream['core']['PenDigits_TEST_TS'])
test.shape
Reading dataset TS file...
Converting _x to tabular format...
Converting _y to tabular format...
Merging _x and _y...
CPU times: user 4.61 s, sys: 22.7 ms, total: 4.64 s
Wall time: 4.63 s
Out[12]:
(27984, 5)
In [13]:
test.head()
Out[13]:
case_id reading_id dim_0 dim_1 class_vals
0 0 0 88.0 92.0 8
1 0 1 2.0 99.0 8
2 0 2 16.0 66.0 8
3 0 3 94.0 37.0 8
4 0 4 70.0 0.0 8
In [14]:
test['reading_id'].min(), test['reading_id'].max()
Out[14]:
(0, 7)
In [15]:
test['class_vals'].unique()
Out[15]:
array(['8', '9', '1', '4', '7', '0', '2', '5', '3', '6'], dtype=object)
In [16]:
from sklearn.model_selection import train_test_split
X = train[['case_id', 'class_vals']].drop_duplicates()
X_train, X_val, y_train, y_val = train_test_split(X, X['class_vals'], train_size=0.8, random_state = 42)
X_train.case_id.nunique(), X_val.case_id.nunique()
Out[16]:
(5995, 1499)
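
Note that the split above is purely random rather than stratified by class, so the per-class train/validation proportions can drift slightly from the 80/20 target, as the counts in the next two cells show. If an exactly proportional split were preferred, a stratified variant (hypothetical, not used in this pipeline) would be:

# Hypothetical stratified alternative: preserves each class's 80/20 proportion.
X_train, X_val, y_train, y_val = train_test_split(
    X, X['class_vals'], train_size=0.8, random_state=42, stratify=X['class_vals'])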
In [17]:
X_train.groupby(by = ['class_vals'], dropna = False).count()
Out[17]:
case_id
class_vals
0 628
1 622
2 638
3 584
4 632
5 567
6 559
7 616
8 566
9 583
In [18]:
X_val.groupby(by = ['class_vals'], dropna = False).count()
Out[18]:
case_id
class_vals
0 152
1 157
2 142
3 135
4 148
5 153
6 161
7 162
8 153
9 136
In [19]:
test[['case_id', 'class_vals']].drop_duplicates().groupby(by = ['class_vals'], dropna = False).count()
Out[19]:
case_id
class_vals
0 363
1 364
2 364
3 336
4 364
5 335
6 336
7 364
8 336
9 336
In [20]:
valid = train.merge(X_val, on=['case_id'], how='inner')
valid['class_vals'] = valid['class_vals_x']
valid = valid.drop(columns=['class_vals_x','class_vals_y'])
valid.case_id.nunique()
Out[20]:
1499
In [21]:
train = train.merge(X_train, on=['case_id'], how='inner')
train['class_vals'] = train['class_vals_x']
train = train.drop(columns=['class_vals_x','class_vals_y'])
train.case_id.nunique()
Out[21]:
5995
In [22]:
train.case_id.nunique(), valid.case_id.nunique(), test.case_id.nunique()
Out[22]:
(5995, 1499, 3498)
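
The two merge-and-drop steps above keep only the rows whose case_id belongs to the chosen split; because X_train and X_val also carry class_vals, the inner merge duplicates that column (hence the _x/_y cleanup). A hypothetical equivalent that avoids the duplicated columns is a membership filter on case_id (train_full below stands for the original long-format frame, before it was overwritten):

# Hypothetical alternative to the merges: boolean masks on case_id membership.
valid_alt = train_full[train_full['case_id'].isin(X_val['case_id'])]
train_alt = train_full[train_full['case_id'].isin(X_train['case_id'])]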
In [23]:
train
Out[23]:
case_id reading_id dim_0 dim_1 class_vals
0 1 0 0.0 89.0 2
1 1 1 27.0 100.0 2
2 1 2 42.0 75.0 2
3 1 3 29.0 45.0 2
4 1 4 15.0 15.0 2
... ... ... ... ... ...
47955 7493 3 70.0 48.0 7
47956 7493 4 42.0 11.0 7
47957 7493 5 32.0 0.0 7
47958 7493 6 25.0 36.0 7
47959 7493 7 100.0 40.0 7

47960 rows × 5 columns

Write the data in parquet format for downstream processing

In [24]:
#| export
import cudf
import dask_cudf
import pandas as pd

def write_parquet(pandas_df, output_dir, npartitions = 2):
    # Map the string class labels '0'-'9' to the integers 0-9 (modifies pandas_df in place).
    pandas_df['class_vals'].replace(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
                                    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], inplace = True)
    # Move the frame to GPU memory and duplicate case_id as case_id_seq.
    gdf = cudf.from_pandas(pandas_df)
    gdf['case_id_seq'] = gdf['case_id']
    # Partition the GPU frame across workers and write it out as parquet.
    dask_gdf = dask_cudf.from_cudf(gdf, npartitions = npartitions)
    dask_gdf.to_parquet(output_dir)
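
Because class_vals holds the digit labels as the strings '0' through '9', the element-wise replace above is effectively a string-to-integer cast. A shorter, hypothetical equivalent (not used here) would be:

# Hypothetical equivalent of the replace call for purely numeric string labels.
pandas_df['class_vals'] = pandas_df['class_vals'].astype('int64')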
In [25]:
import time

from dask.distributed import Client
from dask_cuda import LocalCUDACluster

cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.2, rmm_pool_size='20GB', rmm_managed_memory=True)
client = Client(cluster)
client
2022-09-23 19:00:35,470 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:00:35,470 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:00:35,470 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:00:35,471 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:00:35,471 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:00:35,471 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:00:35,471 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:00:35,471 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Out[25]:

Client

Client-01f207e2-3b72-11ed-80d1-02b68d644837

Connection method: Cluster object Cluster type: dask_cuda.LocalCUDACluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

LocalCUDACluster

34d63417

Dashboard: http://127.0.0.1:8787/status Workers: 4
Total threads: 4 Total memory: 150.00 GiB
Status: running Using processes: True

Scheduler Info

Scheduler

Scheduler-afbbd400-3841-4a6d-8988-c8833d96ec8c

Comm: tcp://127.0.0.1:42753 Workers: 4
Dashboard: http://127.0.0.1:8787/status Total threads: 4
Started: Just now Total memory: 150.00 GiB

Workers

Worker: 0

Comm: tcp://127.0.0.1:36167 Total threads: 1
Dashboard: http://127.0.0.1:36059/status Memory: 37.50 GiB
Nanny: tcp://127.0.0.1:39755
Local directory: /tmp/dask-worker-space/worker-_tqacl9y
GPU: Tesla T4 GPU memory: 14.76 GiB

Worker: 1

Comm: tcp://127.0.0.1:40359 Total threads: 1
Dashboard: http://127.0.0.1:39691/status Memory: 37.50 GiB
Nanny: tcp://127.0.0.1:43911
Local directory: /tmp/dask-worker-space/worker-nkrukqx4
GPU: Tesla T4 GPU memory: 14.76 GiB

Worker: 2

Comm: tcp://127.0.0.1:35605 Total threads: 1
Dashboard: http://127.0.0.1:35459/status Memory: 37.50 GiB
Nanny: tcp://127.0.0.1:42369
Local directory: /tmp/dask-worker-space/worker-0luskvyb
GPU: Tesla T4 GPU memory: 14.76 GiB

Worker: 3

Comm: tcp://127.0.0.1:46159 Total threads: 1
Dashboard: http://127.0.0.1:35087/status Memory: 37.50 GiB
Nanny: tcp://127.0.0.1:41061
Local directory: /tmp/dask-worker-space/worker-a4qrqumh
GPU: Tesla T4 GPU memory: 14.76 GiB
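
LocalCUDACluster starts one worker per visible GPU (four Tesla T4s here). Roughly, device_memory_limit=0.2 sets the fraction of GPU memory at which a worker begins spilling device data to host, rmm_pool_size='20GB' pre-allocates an RMM memory pool on each GPU, and rmm_managed_memory=True enables CUDA managed (unified) memory, which is what lets a 20 GB pool be requested on a 14.76 GiB card. A quick, hypothetical sanity check that all four workers registered with the scheduler:

# Hypothetical check: list the registered workers and their host-memory limits.
for addr, info in client.scheduler_info()['workers'].items():
    print(addr, info['memory_limit'])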

Train Dataset

In [26]:
%%time
write_parquet(train, product['PenDigits_TRAIN_RAW'])
CPU times: user 1.4 s, sys: 396 ms, total: 1.8 s
Wall time: 3.44 s
In [27]:
train_gdf = dask_cudf.read_parquet(product['PenDigits_TRAIN_RAW'])
train_gdf.head()
Out[27]:
case_id reading_id dim_0 dim_1 class_vals case_id_seq
0 1 0 0.0 89.0 2 1
1 1 1 27.0 100.0 2 1
2 1 2 42.0 75.0 2 1
3 1 3 29.0 45.0 2 1
4 1 4 15.0 15.0 2 1
In [28]:
train_gdf['reading_id'].min().compute(), train_gdf['reading_id'].max().compute()
Out[28]:
(0, 7)
In [29]:
train_gdf.case_id.nunique().compute()
Out[29]:
5995
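
As an extra, hypothetical check that write_parquet mapped the string labels to integers before writing, the stored dtype and distinct class values can be inspected:

# Hypothetical check: class_vals should now hold the integer labels 0-9.
print(train_gdf['class_vals'].dtype)
print(sorted(train_gdf['class_vals'].unique().compute().to_pandas().tolist()))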

Valid Dataset

In [30]:
%%time
write_parquet(valid, product['PenDigits_VALID_RAW'])
CPU times: user 35 ms, sys: 0 ns, total: 35 ms
Wall time: 70.7 ms
In [31]:
valid_gdf = dask_cudf.read_parquet(product['PenDigits_VALID_RAW'])
valid_gdf.head()
Out[31]:
case_id reading_id dim_0 dim_1 class_vals case_id_seq
0 0 0 47.0 100.0 8 0
1 0 1 27.0 81.0 8 0
2 0 2 57.0 37.0 8 0
3 0 3 26.0 0.0 8 0
4 0 4 0.0 23.0 8 0
In [32]:
valid_gdf['reading_id'].min().compute(), valid_gdf['reading_id'].max().compute()
Out[32]:
(0, 7)
In [33]:
valid_gdf.case_id.nunique().compute()
Out[33]:
1499

Test Dataset

In [34]:
%%time
write_parquet(test, product['PenDigits_TEST_RAW'])
CPU times: user 43 ms, sys: 3.59 ms, total: 46.6 ms
Wall time: 103 ms
In [35]:
test_gdf = dask_cudf.read_parquet(product['PenDigits_TEST_RAW'])
test_gdf.head()
Out[35]:
case_id reading_id dim_0 dim_1 class_vals case_id_seq
0 0 0 88.0 92.0 8 0
1 0 1 2.0 99.0 8 0
2 0 2 16.0 66.0 8 0
3 0 3 94.0 37.0 8 0
4 0 4 70.0 0.0 8 0
In [36]:
test_gdf['reading_id'].min().compute(), test_gdf['reading_id'].max().compute()
Out[36]:
(0, 7)
In [37]:
test_gdf.case_id.nunique().compute()
Out[37]:
3498

We shut down the Dask client and cluster to release GPU resources.

In [38]:
%%time
client.shutdown()
client.close()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError

Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1400, in _handle_report
    await self._reconnect()
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError
CPU times: user 30.8 ms, sys: 11.5 ms, total: 42.2 ms
Wall time: 620 ms
In [39]:
from nbdev import nbdev_export
nbdev_export()