#| default_exp data.pen_digits
# Jupyter autoreload: pick up edits to the vitmtsc package without kernel restarts.
%load_ext autoreload
%autoreload 2
# declare a list tasks whose products you want to use as inputs
upstream = ['core']
# Parameters
# NOTE(review): this cell is injected by Ploomber at run time; it overwrites the
# declarative `upstream = ['core']` above with the resolved product paths.
upstream = {"core": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html", "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts", "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts", "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts", "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts", "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts", "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts", "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts", "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts", "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts", "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/103_data.pen_digits.html", "PenDigits_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/train", "PenDigits_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/valid", "PenDigits_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/test"}
#| hide
from nbdev.showdoc import *
#| export
from vitmtsc import *
from vitmtsc.core import *
import dask_cudf
import gc #garbage collector interface
#| export
# Resolved upstream product paths, duplicated here under `#| export` so the
# generated vitmtsc module carries them; values mirror the injected Parameters
# cell above. Keys are <Dataset>_<SPLIT>_TS -> path of the sktime .ts file.
upstream = {
"core": {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html",
"FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
"FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
"InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
"InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
"PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
"PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
"SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
"SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
"CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
"CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
}
}
# Output locations produced by this notebook: train/valid/test parquet dirs.
product = {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/103_data.pen_digits.html",
"PenDigits_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/train",
"PenDigits_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/valid",
"PenDigits_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/raw/test",
}
#| export
# Dataset handled by this notebook; used to pick the upstream .ts files above.
DATASET_NAME = 'PenDigits'
%%time
# Load the PenDigits training split from the sktime .ts file into long/tabular
# form: one row per (case_id, reading_id) time step, one column per dimension.
train = get_mtsc_data_tabular_from_ts(upstream['core']['PenDigits_TRAIN_TS'])
train.shape
Reading dataset TS file... Converting _x to tabular format... Converting _y to tabular format... Merging _x and _y... CPU times: user 10.1 s, sys: 132 ms, total: 10.2 s Wall time: 10.2 s
(59952, 5)
train.head()
case_id | reading_id | dim_0 | dim_1 | class_vals | |
---|---|---|---|---|---|
0 | 0 | 0 | 47.0 | 100.0 | 8 |
1 | 0 | 1 | 27.0 | 81.0 | 8 |
2 | 0 | 2 | 57.0 | 37.0 | 8 |
3 | 0 | 3 | 26.0 | 0.0 | 8 |
4 | 0 | 4 | 0.0 | 23.0 | 8 |
# Sanity check: reading_id spans 0..7, i.e. 8 readings per case.
train['reading_id'].min(), train['reading_id'].max()
(0, 7)
# Ten digit classes; note class_vals is still a *string* label here.
train['class_vals'].unique()
array(['8', '2', '1', '4', '6', '0', '5', '9', '7', '3'], dtype=object)
%%time
# Load the held-out PenDigits test split in the same long/tabular layout.
test = get_mtsc_data_tabular_from_ts(upstream['core']['PenDigits_TEST_TS'])
test.shape
Reading dataset TS file... Converting _x to tabular format... Converting _y to tabular format... Merging _x and _y... CPU times: user 4.61 s, sys: 22.7 ms, total: 4.64 s Wall time: 4.63 s
(27984, 5)
test.head()
case_id | reading_id | dim_0 | dim_1 | class_vals | |
---|---|---|---|---|---|
0 | 0 | 0 | 88.0 | 92.0 | 8 |
1 | 0 | 1 | 2.0 | 99.0 | 8 |
2 | 0 | 2 | 16.0 | 66.0 | 8 |
3 | 0 | 3 | 94.0 | 37.0 | 8 |
4 | 0 | 4 | 70.0 | 0.0 | 8 |
# Same shape checks as for train: 8 readings per case, string class labels.
test['reading_id'].min(), test['reading_id'].max()
(0, 7)
test['class_vals'].unique()
array(['8', '9', '1', '4', '7', '0', '2', '5', '3', '6'], dtype=object)
from sklearn.model_selection import train_test_split
# One row per case: split at the case level so all 8 readings of a pen stroke
# stay together in the same partition.
X = train[['case_id', 'class_vals']].drop_duplicates()
# NOTE(review): split is not stratified on class_vals — the per-class counts
# below drift slightly from an exact 80/20; consider stratify=X['class_vals'].
X_train, X_val, y_train, y_val = train_test_split(X, X['class_vals'], train_size=0.8, random_state = 42)
X_train.case_id.nunique(), X_val.case_id.nunique()
(5995, 1499)
# Per-class case counts in the new training split.
X_train.groupby(by = ['class_vals'], dropna = False).count()
case_id | |
---|---|
class_vals | |
0 | 628 |
1 | 622 |
2 | 638 |
3 | 584 |
4 | 632 |
5 | 567 |
6 | 559 |
7 | 616 |
8 | 566 |
9 | 583 |
# Per-class case counts in the validation split.
X_val.groupby(by = ['class_vals'], dropna = False).count()
case_id | |
---|---|
class_vals | |
0 | 152 |
1 | 157 |
2 | 142 |
3 | 135 |
4 | 148 |
5 | 153 |
6 | 161 |
7 | 162 |
8 | 153 |
9 | 136 |
# Per-class case counts in the held-out test split, for comparison.
test[['case_id', 'class_vals']].drop_duplicates().groupby(by = ['class_vals'], dropna = False).count()
case_id | |
---|---|
class_vals | |
0 | 363 |
1 | 364 |
2 | 364 |
3 | 336 |
4 | 364 |
5 | 335 |
6 | 336 |
7 | 364 |
8 | 336 |
9 | 336 |
# Validation rows: inner-join train against the case_ids chosen for X_val.
# Project X_val down to ['case_id'] before merging so the shared class_vals
# column is not duplicated — this removes the class_vals_x/class_vals_y
# rename-and-drop dance the original needed, and yields the identical frame
# (same rows, same columns, same column order).
valid = train.merge(X_val[['case_id']], on=['case_id'], how='inner')
valid.case_id.nunique()
1499
# Training rows: same case-level inner join for the X_train case_ids. Merging
# on ['case_id'] only (X_train projected to that column) avoids the duplicate
# class_vals_x/class_vals_y columns and the subsequent rename/drop, while
# producing exactly the same frame as the original three-step version.
train = train.merge(X_train[['case_id']], on=['case_id'], how='inner')
train.case_id.nunique()
5995
# Case counts per split: 5995 train / 1499 valid / 3498 test.
train.case_id.nunique(), valid.case_id.nunique(), test.case_id.nunique()
(5995, 1499, 3498)
# Inspect the filtered training frame.
train
case_id | reading_id | dim_0 | dim_1 | class_vals | |
---|---|---|---|---|---|
0 | 1 | 0 | 0.0 | 89.0 | 2 |
1 | 1 | 1 | 27.0 | 100.0 | 2 |
2 | 1 | 2 | 42.0 | 75.0 | 2 |
3 | 1 | 3 | 29.0 | 45.0 | 2 |
4 | 1 | 4 | 15.0 | 15.0 | 2 |
... | ... | ... | ... | ... | ... |
47955 | 7493 | 3 | 70.0 | 48.0 | 7 |
47956 | 7493 | 4 | 42.0 | 11.0 | 7 |
47957 | 7493 | 5 | 32.0 | 0.0 | 7 |
47958 | 7493 | 6 | 25.0 | 36.0 | 7 |
47959 | 7493 | 7 | 100.0 | 40.0 | 7 |
47960 rows × 5 columns
#| export
import cudf
import dask_cudf
import pandas as pd
def write_parquet(pandas_df, output_dir, npartitions = 2):
    """Write a long-format MTSC DataFrame to parquet via cudf/dask_cudf.

    Parameters
    ----------
    pandas_df : pandas.DataFrame
        Long-format data with a ``class_vals`` column holding the digit
        labels as strings ``'0'``..``'9'`` and a ``case_id`` column.
    output_dir : str
        Directory the dask_cudf parquet partition files are written to.
    npartitions : int, default 2
        Number of dask_cudf partitions to write.
    """
    # Work on a copy: the original used replace(..., inplace=True) on the
    # argument, silently mutating the caller's DataFrame (and relying on a
    # deprecated chained-inplace pattern).
    df = pandas_df.copy()
    # Encode the string class labels '0'..'9' as integers 0..9.
    df['class_vals'] = df['class_vals'].replace(
        ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    gdf = cudf.from_pandas(df)
    # Duplicate case_id into case_id_seq; presumably a downstream transform
    # consumes one of the two copies — TODO confirm against later notebooks.
    gdf['case_id_seq'] = gdf['case_id']
    dask_cudf.from_cudf(gdf, npartitions=npartitions).to_parquet(output_dir)
import time
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
# One worker per visible GPU; 20 GB RMM managed-memory pool per worker, and a
# spill threshold at 20% of device memory (device_memory_limit=0.2) —
# presumably tuned for the 4x Tesla T4 box shown below; verify on other hosts.
cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.2, rmm_pool_size='20GB', rmm_managed_memory=True)
client = Client(cluster)
client
2022-09-23 19:00:35,470 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:00:35,470 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 19:00:35,470 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:00:35,471 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 19:00:35,471 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:00:35,471 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 19:00:35,471 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:00:35,471 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Client-01f207e2-3b72-11ed-80d1-02b68d644837
Connection method: Cluster object | Cluster type: dask_cuda.LocalCUDACluster |
Dashboard: http://127.0.0.1:8787/status |
34d63417
Dashboard: http://127.0.0.1:8787/status | Workers: 4 |
Total threads: 4 | Total memory: 150.00 GiB |
Status: running | Using processes: True |
Scheduler-afbbd400-3841-4a6d-8988-c8833d96ec8c
Comm: tcp://127.0.0.1:42753 | Workers: 4 |
Dashboard: http://127.0.0.1:8787/status | Total threads: 4 |
Started: Just now | Total memory: 150.00 GiB |
Comm: tcp://127.0.0.1:36167 | Total threads: 1 |
Dashboard: http://127.0.0.1:36059/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:39755 | |
Local directory: /tmp/dask-worker-space/worker-_tqacl9y | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:40359 | Total threads: 1 |
Dashboard: http://127.0.0.1:39691/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:43911 | |
Local directory: /tmp/dask-worker-space/worker-nkrukqx4 | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:35605 | Total threads: 1 |
Dashboard: http://127.0.0.1:35459/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:42369 | |
Local directory: /tmp/dask-worker-space/worker-0luskvyb | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:46159 | Total threads: 1 |
Dashboard: http://127.0.0.1:35087/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:41061 | |
Local directory: /tmp/dask-worker-space/worker-a4qrqumh | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Train Dataset
%%time
# Persist the training split (labels become ints, case_id_seq column added).
write_parquet(train, product['PenDigits_TRAIN_RAW'])
CPU times: user 1.4 s, sys: 396 ms, total: 1.8 s Wall time: 3.44 s
# Read back what was written as a sanity check.
train_gdf = dask_cudf.read_parquet(product['PenDigits_TRAIN_RAW'])
train_gdf.head()
case_id | reading_id | dim_0 | dim_1 | class_vals | case_id_seq | |
---|---|---|---|---|---|---|
0 | 1 | 0 | 0.0 | 89.0 | 2 | 1 |
1 | 1 | 1 | 27.0 | 100.0 | 2 | 1 |
2 | 1 | 2 | 42.0 | 75.0 | 2 | 1 |
3 | 1 | 3 | 29.0 | 45.0 | 2 | 1 |
4 | 1 | 4 | 15.0 | 15.0 | 2 | 1 |
# Round-trip checks: reading_id range and case count match the source frame.
train_gdf['reading_id'].min().compute(), train_gdf['reading_id'].max().compute()
(0, 7)
train_gdf.case_id.nunique().compute()
5995
Valid Dataset
%%time
# Persist the validation split.
write_parquet(valid, product['PenDigits_VALID_RAW'])
CPU times: user 35 ms, sys: 0 ns, total: 35 ms Wall time: 70.7 ms
# Read back what was written as a sanity check.
valid_gdf = dask_cudf.read_parquet(product['PenDigits_VALID_RAW'])
valid_gdf.head()
case_id | reading_id | dim_0 | dim_1 | class_vals | case_id_seq | |
---|---|---|---|---|---|---|
0 | 0 | 0 | 47.0 | 100.0 | 8 | 0 |
1 | 0 | 1 | 27.0 | 81.0 | 8 | 0 |
2 | 0 | 2 | 57.0 | 37.0 | 8 | 0 |
3 | 0 | 3 | 26.0 | 0.0 | 8 | 0 |
4 | 0 | 4 | 0.0 | 23.0 | 8 | 0 |
# Round-trip checks: reading_id range and case count match the source frame.
valid_gdf['reading_id'].min().compute(), valid_gdf['reading_id'].max().compute()
(0, 7)
valid_gdf.case_id.nunique().compute()
1499
Test Dataset
%%time
# Persist the held-out test split.
write_parquet(test, product['PenDigits_TEST_RAW'])
CPU times: user 43 ms, sys: 3.59 ms, total: 46.6 ms Wall time: 103 ms
# Read back what was written as a sanity check.
test_gdf = dask_cudf.read_parquet(product['PenDigits_TEST_RAW'])
test_gdf.head()
case_id | reading_id | dim_0 | dim_1 | class_vals | case_id_seq | |
---|---|---|---|---|---|---|
0 | 0 | 0 | 88.0 | 92.0 | 8 | 0 |
1 | 0 | 1 | 2.0 | 99.0 | 8 | 0 |
2 | 0 | 2 | 16.0 | 66.0 | 8 | 0 |
3 | 0 | 3 | 94.0 | 37.0 | 8 | 0 |
4 | 0 | 4 | 70.0 | 0.0 | 8 | 0 |
# Round-trip checks: reading_id range and case count match the source frame.
test_gdf['reading_id'].min().compute(), test_gdf['reading_id'].max().compute()
(0, 7)
test_gdf.case_id.nunique().compute()
3498
We shut down the Dask cluster and reset the kernel before exporting.
%%time
client.shutdown()
client.close()
Traceback (most recent call last): File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper return await func(*args, **kwargs) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect await self._ensure_connected(timeout=timeout) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected comm = await connect( File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect await asyncio.sleep(backoff) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep return await future asyncio.exceptions.CancelledError Traceback (most recent call last): File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper return await func(*args, **kwargs) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1400, in _handle_report await self._reconnect() File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper return await func(*args, **kwargs) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect await self._ensure_connected(timeout=timeout) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected comm = await connect( File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect await asyncio.sleep(backoff) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep 
return await future asyncio.exceptions.CancelledError
CPU times: user 30.8 ms, sys: 11.5 ms, total: 42.2 ms Wall time: 620 ms
# Export the `#| export` cells of this notebook into the vitmtsc package.
from nbdev import nbdev_export
nbdev_export()