#| default_exp data.character_trajectories
%load_ext autoreload
%autoreload 2
# declare a list of tasks whose products you want to use as inputs
upstream = ['core']
#| hide
from nbdev.showdoc import *
#| export
from vitmtsc import *
from vitmtsc.core import *
import dask_cudf
#| export
upstream = {
"core": {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html",
"FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
"FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
"InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
"InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
"PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
"PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
"SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
"SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
"CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
"CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
}
}
product = {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/105_data.character_trajectories.html",
"CharacterTrajectories_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/train",
"CharacterTrajectories_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/valid",
"CharacterTrajectories_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/test",
}
Convert the dataset to Parquet format so that target encoding can be run on it.
#| export
DATASET_NAME = 'CharacterTrajectories'
%%time
train = get_mtsc_data_tabular_from_ts(upstream['core']['CharacterTrajectories_TRAIN_TS'])
train.shape
Reading dataset TS file... Converting _x to tabular format... Converting _y to tabular format... Merging _x and _y... CPU times: user 2.4 s, sys: 45.4 ms, total: 2.45 s Wall time: 2.44 s
(170872, 6)
train.head()
|   | case_id | reading_id | dim_0 | dim_1 | dim_2 | class_vals |
|---|---|---|---|---|---|---|
| 0 | 0 | 0 | -0.130150 | 0.071323 | 0.899306 | 1 |
| 1 | 0 | 1 | -0.183121 | 0.083973 | 1.157239 | 1 |
| 2 | 0 | 2 | -0.234104 | 0.085534 | 1.320469 | 1 |
| 3 | 0 | 3 | -0.279929 | 0.078218 | 1.369472 | 1 |
| 4 | 0 | 4 | -0.321851 | 0.067093 | 1.323889 | 1 |
train['reading_id'].min(), train['reading_id'].max()
(0, 179)
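The series in this dataset are variable-length, so per-case sequence lengths differ. A quick per-case length summary over the train frame above (a minimal sketch; reading_id starts at 0 for every case):

# Per-case sequence length = last reading index + 1.
seq_lens = train.groupby('case_id')['reading_id'].max() + 1
seq_lens.describe()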
train['class_vals'].unique()
array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20'], dtype=object)
%%time
test = get_mtsc_data_tabular_from_ts(upstream['core']['CharacterTrajectories_TEST_TS'])
test.shape
Reading dataset TS file... Converting _x to tabular format... Converting _y to tabular format... Merging _x and _y... CPU times: user 2.4 s, sys: 5.77 ms, total: 2.41 s Wall time: 2.41 s
(172067, 6)
test.head()
|   | case_id | reading_id | dim_0 | dim_1 | dim_2 | class_vals |
|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0.043915 | -0.010011 | 0.536491 | 1 |
| 1 | 0 | 1 | 0.033922 | -0.004357 | 0.767180 | 1 |
| 2 | 0 | 2 | 0.007255 | 0.005951 | 0.993862 | 1 |
| 3 | 0 | 3 | -0.030362 | 0.018295 | 1.184721 | 1 |
| 4 | 0 | 4 | -0.069213 | 0.029589 | 1.312480 | 1 |
test['reading_id'].min(), test['reading_id'].max()
(0, 181)
test['class_vals'].unique()
array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20'], dtype=object)
from sklearn.model_selection import train_test_split
# One row per case: split at the case level, not the per-reading level.
X = train[['case_id', 'class_vals']].drop_duplicates()
X_train, X_val, y_train, y_val = train_test_split(X, X['class_vals'], train_size=0.8, random_state=42)
X_train.case_id.nunique(), X_val.case_id.nunique()
(1137, 285)
X_train.groupby(by = ['class_vals'], dropna = False).count()
| class_vals | case_id |
|---|---|
| 1 | 67 |
| 10 | 53 |
| 11 | 54 |
| 12 | 54 |
| 13 | 47 |
| 14 | 55 |
| 15 | 50 |
| 16 | 49 |
| 17 | 59 |
| 18 | 50 |
| 19 | 63 |
| 2 | 61 |
| 20 | 62 |
| 3 | 57 |
| 4 | 60 |
| 5 | 71 |
| 6 | 55 |
| 7 | 50 |
| 8 | 69 |
| 9 | 51 |
X_val.groupby(by = ['class_vals'], dropna = False).count()
| class_vals | case_id |
|---|---|
| 1 | 18 |
| 10 | 12 |
| 11 | 16 |
| 12 | 11 |
| 13 | 15 |
| 14 | 4 |
| 15 | 16 |
| 16 | 16 |
| 17 | 18 |
| 18 | 12 |
| 19 | 5 |
| 2 | 9 |
| 20 | 23 |
| 3 | 14 |
| 4 | 18 |
| 5 | 22 |
| 6 | 14 |
| 7 | 13 |
| 8 | 18 |
| 9 | 11 |
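The validation counts above are uneven (e.g. only 4 cases of class 14 but 23 of class 20) because the split is purely random over cases. If balanced folds matter, train_test_split accepts a stratify argument; a minimal sketch, not used in this pipeline:

# Stratified variant: per-class proportions are preserved in both splits.
X_train_s, X_val_s, y_train_s, y_val_s = train_test_split(
    X, X['class_vals'], train_size=0.8,
    stratify=X['class_vals'], random_state=42)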
test[['case_id', 'class_vals']].drop_duplicates().groupby(by = ['class_vals'], dropna = False).count()
| class_vals | case_id |
|---|---|
| 1 | 86 |
| 10 | 65 |
| 11 | 71 |
| 12 | 66 |
| 13 | 62 |
| 14 | 60 |
| 15 | 67 |
| 16 | 66 |
| 17 | 78 |
| 18 | 63 |
| 19 | 69 |
| 2 | 71 |
| 20 | 86 |
| 3 | 71 |
| 4 | 79 |
| 5 | 93 |
| 6 | 69 |
| 7 | 64 |
| 8 | 87 |
| 9 | 63 |
# Keep only validation cases: inner-join on case_id, then collapse the
# duplicated class_vals columns produced by the merge.
valid = train.merge(X_val, on=['case_id'], how='inner')
valid['class_vals'] = valid['class_vals_x']
valid = valid.drop(columns=['class_vals_x', 'class_vals_y'])
valid.case_id.nunique()
285
# Same pattern for the training cases.
train = train.merge(X_train, on=['case_id'], how='inner')
train['class_vals'] = train['class_vals_x']
train = train.drop(columns=['class_vals_x', 'class_vals_y'])
train.case_id.nunique()
1137
train.case_id.nunique(), valid.case_id.nunique(), test.case_id.nunique()
(1137, 285, 1436)
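The merge-then-drop pattern used above is equivalent to a plain isin filter on case_id, which never creates the duplicated class_vals_x/class_vals_y columns in the first place. A sketch, where train_full is a hypothetical name for the original tabular frame before the merges above:

# Equivalent of the merge/rename/drop sequence, as a row filter.
valid_alt = train_full[train_full['case_id'].isin(X_val['case_id'])]
train_alt = train_full[train_full['case_id'].isin(X_train['case_id'])]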
train
|   | case_id | reading_id | dim_0 | dim_1 | dim_2 | class_vals |
|---|---|---|---|---|---|---|
| 0 | 0 | 0 | -0.130150 | 0.071323 | 0.899306 | 1 |
| 1 | 0 | 1 | -0.183121 | 0.083973 | 1.157239 | 1 |
| 2 | 0 | 2 | -0.234104 | 0.085534 | 1.320469 | 1 |
| 3 | 0 | 3 | -0.279929 | 0.078218 | 1.369472 | 1 |
| 4 | 0 | 4 | -0.321851 | 0.067093 | 1.323889 | 1 |
| ... | ... | ... | ... | ... | ... | ... |
| 136792 | 1421 | 142 | -0.410398 | 0.015511 | -2.293325 | 20 |
| 136793 | 1421 | 143 | -0.398122 | 0.015047 | -2.224722 | 20 |
| 136794 | 1421 | 144 | -0.370147 | 0.013989 | -2.068397 | 20 |
| 136795 | 1421 | 145 | -0.319821 | 0.012087 | -1.787174 | 20 |
| 136796 | 1421 | 146 | -0.248272 | 0.009383 | -1.387357 | 20 |

136797 rows × 6 columns
#| export
import cudf
def write_parquet(pandas_df, output_dir, npartitions=2):
    # Map the string class labels '1'..'20' to zero-based integers 0..19.
    # NB: this replace mutates the caller's frame in place.
    pandas_df['class_vals'].replace(
        ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
         '11', '12', '13', '14', '15', '16', '17', '18', '19', '20'],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
        inplace=True)
    # Move the frame to GPU; keep a duplicate of case_id as case_id_seq
    # for downstream use.
    gdf = cudf.from_pandas(pandas_df)
    gdf['case_id_seq'] = gdf['case_id']
    # Partition on the GPU and write out as Parquet.
    dask_gdf = dask_cudf.from_cudf(gdf, npartitions=npartitions)
    dask_gdf.to_parquet(output_dir)
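The hard-coded label lists above could also be generated programmatically, which avoids transcription slips if the class count ever changes. A minimal sketch (hypothetical helper, not used below), assuming the labels are exactly the strings '1' through '20':

def class_vals_to_int(df):
    # Out-of-place alternative to the inplace replace in write_parquet.
    label_map = {str(i + 1): i for i in range(20)}  # {'1': 0, ..., '20': 19}
    return df.assign(class_vals=df['class_vals'].map(label_map))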
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.2, rmm_pool_size='20GB', rmm_managed_memory=True)
client = Client(cluster)
client
2022-09-23 19:00:27,121 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:00:27,121 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:00:27,139 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:00:27,139 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:00:27,175 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:00:27,175 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:00:27,221 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:00:27,221 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Client-fcee6df9-3b71-11ed-80d1-0a4f0fdf7975: connected via a Cluster object to dask_cuda.LocalCUDACluster f572ee62
Dashboard: http://127.0.0.1:8787/status
Cluster: 4 workers, 4 total threads, 150.00 GiB total memory, status running, using processes
Scheduler-0189f9ef-8a9d-437b-a912-34c8b9670951 at tcp://127.0.0.1:38791
Each worker: 1 thread, 37.50 GiB memory, one Tesla T4 GPU with 14.76 GiB GPU memory
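Before writing anything, it is worth confirming that all four GPU workers registered; Client.scheduler_info() exposes the same details as the summary above. A minimal sketch:

# Expect one worker per GPU on this 4x Tesla T4 machine.
info = client.scheduler_info()
print(f"{len(info['workers'])} workers registered")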
Train Dataset
%%time
write_parquet(train, product['CharacterTrajectories_TRAIN_RAW'])
CPU times: user 1.42 s, sys: 509 ms, total: 1.93 s Wall time: 3.57 s
train_gdf = dask_cudf.read_parquet(product['CharacterTrajectories_TRAIN_RAW'])
train_gdf.head()
|   | case_id | reading_id | dim_0 | dim_1 | dim_2 | class_vals | case_id_seq |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | -0.130150 | 0.071323 | 0.899306 | 0 | 0 |
| 1 | 0 | 1 | -0.183121 | 0.083973 | 1.157239 | 0 | 0 |
| 2 | 0 | 2 | -0.234104 | 0.085534 | 1.320469 | 0 | 0 |
| 3 | 0 | 3 | -0.279929 | 0.078218 | 1.369472 | 0 | 0 |
| 4 | 0 | 4 | -0.321851 | 0.067093 | 1.323889 | 0 | 0 |
train_gdf['reading_id'].min().compute(), train_gdf['reading_id'].max().compute()
(0, 179)
train_gdf.case_id.nunique().compute()
1137
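A cheap round-trip check: the Parquet output should hold exactly the rows of the in-memory frame (len() on a Dask collection triggers a compute). A minimal sketch:

# Row counts must survive the Parquet round trip unchanged.
assert len(train_gdf) == len(train), "row count changed during write/read"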
Valid Dataset
%%time
write_parquet(valid, product['CharacterTrajectories_VALID_RAW'])
CPU times: user 52.5 ms, sys: 5.1 ms, total: 57.6 ms Wall time: 113 ms
valid_gdf = dask_cudf.read_parquet(product['CharacterTrajectories_VALID_RAW'])
valid_gdf.head()
|   | case_id | reading_id | dim_0 | dim_1 | dim_2 | class_vals | case_id_seq |
|---|---|---|---|---|---|---|---|
| 0 | 23 | 0 | 0.038300 | 0.107932 | 0.951737 | 0 | 23 |
| 1 | 23 | 1 | 0.031047 | 0.141460 | 1.213346 | 0 | 23 |
| 2 | 23 | 2 | 0.006217 | 0.167784 | 1.379965 | 0 | 23 |
| 3 | 23 | 3 | -0.037579 | 0.186031 | 1.438267 | 0 | 23 |
| 4 | 23 | 4 | -0.099370 | 0.197670 | 1.407480 | 0 | 23 |
valid_gdf['reading_id'].min().compute(), valid_gdf['reading_id'].max().compute()
(0, 172)
valid_gdf.case_id.nunique().compute()
285
Test Dataset
%%time
write_parquet(test, product['CharacterTrajectories_TEST_RAW'])
CPU times: user 158 ms, sys: 13.1 ms, total: 171 ms Wall time: 243 ms
test_gdf = dask_cudf.read_parquet(product['CharacterTrajectories_TEST_RAW'])
test_gdf.head()
|   | case_id | reading_id | dim_0 | dim_1 | dim_2 | class_vals | case_id_seq |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0.043915 | -0.010011 | 0.536491 | 0 | 0 |
| 1 | 0 | 1 | 0.033922 | -0.004357 | 0.767180 | 0 | 0 |
| 2 | 0 | 2 | 0.007255 | 0.005951 | 0.993862 | 0 | 0 |
| 3 | 0 | 3 | -0.030362 | 0.018295 | 1.184721 | 0 | 0 |
| 4 | 0 | 4 | -0.069213 | 0.029589 | 1.312480 | 0 | 0 |
test_gdf['reading_id'].min().compute(), test_gdf['reading_id'].max().compute()
(0, 181)
test_gdf.case_id.nunique().compute()
1436
Finally, shut down the Dask client and cluster; this effectively resets the kernel.
%%time
client.shutdown()
client.close()
Two asyncio.exceptions.CancelledError tracebacks from distributed's client reconnect loop (Client._reconnect / _ensure_connected awaiting asyncio.sleep in comm/core.py connect), raised while the cluster shuts down.
CPU times: user 37.2 ms, sys: 5.67 ms, total: 42.9 ms Wall time: 618 ms
from nbdev import nbdev_export
nbdev_export()