In [1]:
#| default_exp data.character_trajectories
%load_ext autoreload
%autoreload 2
In [2]:
# declare the list of upstream tasks whose products this notebook uses as inputs
upstream = ['core']
In [3]:
# Parameters
upstream = {"core": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html", "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts", "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts", "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts", "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts", "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts", "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts", "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts", "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts", "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts", "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/105_data.character_trajectories.html", "CharacterTrajectories_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/train", "CharacterTrajectories_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/valid", "CharacterTrajectories_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/test"}
In [4]:
#| hide
from nbdev.showdoc import *
In [5]:
#| export
from vitmtsc import *
from vitmtsc.core import *
import dask_cudf
In [6]:
#| export
upstream = {
    "core": {
        "nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html",
        "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
        "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
        "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
        "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
        "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
        "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
        "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
        "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
        "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
        "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
    }
}
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/105_data.character_trajectories.html",
    "CharacterTrajectories_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/train",
    "CharacterTrajectories_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/valid",
    "CharacterTrajectories_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/raw/test",
}

Data Download and Conversion

CharacterTrajectories dataset

Convert the dataset to parquet format so that target encoding can be run on it.

In [7]:
#| export
DATASET_NAME = 'CharacterTrajectories'

Download and convert the dataset to tabular format

In [8]:
%%time
train = get_mtsc_data_tabular_from_ts(upstream['core']['CharacterTrajectories_TRAIN_TS'])
train.shape
Reading dataset TS file...
Converting _x to tabular format...
Converting _y to tabular format...
Merging _x and _y...
CPU times: user 2.4 s, sys: 45.4 ms, total: 2.45 s
Wall time: 2.44 s
Out[8]:
(170872, 6)
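
The log lines above narrate the conversion steps: the .ts file is read, the per-dimension series (_x) and the labels (_y) are each flattened to a long tabular layout, and the two are merged per case. The actual get_mtsc_data_tabular_from_ts is defined in the core notebook; purely as a hypothetical sketch, assuming sktime's load_from_tsfile_to_dataframe as the underlying reader (an assumption, not something this notebook confirms), the conversion looks roughly like this:

import pandas as pd
from sktime.datasets import load_from_tsfile_to_dataframe

def ts_to_tabular_sketch(ts_path):
    # Nested layout: one pd.Series per (case, dimension) cell, labels separate.
    X, y = load_from_tsfile_to_dataframe(ts_path)
    rows = []
    for case_id, (_, case) in enumerate(X.iterrows()):
        n_readings = len(case.iloc[0])  # dimensions of a case share one length
        for reading_id in range(n_readings):
            rows.append({
                "case_id": case_id,
                "reading_id": reading_id,
                **{f"dim_{d}": case.iloc[d].iloc[reading_id] for d in range(len(case))},
                "class_vals": y[case_id],
            })
    return pd.DataFrame(rows)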
In [9]:
train.head()
Out[9]:
case_id reading_id dim_0 dim_1 dim_2 class_vals
0 0 0 -0.130150 0.071323 0.899306 1
1 0 1 -0.183121 0.083973 1.157239 1
2 0 2 -0.234104 0.085534 1.320469 1
3 0 3 -0.279929 0.078218 1.369472 1
4 0 4 -0.321851 0.067093 1.323889 1
In [10]:
train['reading_id'].min(), train['reading_id'].max()
Out[10]:
(0, 179)
In [11]:
train['class_vals'].unique()
Out[11]:
array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20'], dtype=object)
In [12]:
%%time
test = get_mtsc_data_tabular_from_ts(upstream['core']['CharacterTrajectories_TEST_TS'])
test.shape
Reading dataset TS file...
Converting _x to tabular format...
Converting _y to tabular format...
Merging _x and _y...
CPU times: user 2.4 s, sys: 5.77 ms, total: 2.41 s
Wall time: 2.41 s
Out[12]:
(172067, 6)
In [13]:
test.head()
Out[13]:
case_id reading_id dim_0 dim_1 dim_2 class_vals
0 0 0 0.043915 -0.010011 0.536491 1
1 0 1 0.033922 -0.004357 0.767180 1
2 0 2 0.007255 0.005951 0.993862 1
3 0 3 -0.030362 0.018295 1.184721 1
4 0 4 -0.069213 0.029589 1.312480 1
In [14]:
test['reading_id'].min(), test['reading_id'].max()
Out[14]:
(0, 181)
In [15]:
test['class_vals'].unique()
Out[15]:
array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20'], dtype=object)
In [16]:
from sklearn.model_selection import train_test_split
X = train[['case_id', 'class_vals']].drop_duplicates()
X_train, X_val, y_train, y_val = train_test_split(X, X['class_vals'], train_size=0.8, random_state = 42)
X_train.case_id.nunique(), X_val.case_id.nunique()
Out[16]:
(1137, 285)
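
Note that this split is made at the case level, on the de-duplicated (case_id, class_vals) pairs, and is not stratified by class, which is why the per-class validation counts below are uneven (class 14 keeps only 4 cases). A stratified variant would be a one-argument change; the following is a sketch of that alternative, not what was run here:

from sklearn.model_selection import train_test_split

# Hypothetical stratified split: `stratify` keeps every class at roughly an
# 80/20 train/validation ratio.
X_train_s, X_val_s, y_train_s, y_val_s = train_test_split(
    X, X['class_vals'], train_size=0.8, random_state=42,
    stratify=X['class_vals'],
)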
In [17]:
X_train.groupby(by = ['class_vals'], dropna = False).count()
Out[17]:
case_id
class_vals
1 67
10 53
11 54
12 54
13 47
14 55
15 50
16 49
17 59
18 50
19 63
2 61
20 62
3 57
4 60
5 71
6 55
7 50
8 69
9 51
In [18]:
X_val.groupby(by = ['class_vals'], dropna = False).count()
Out[18]:
case_id
class_vals
1 18
10 12
11 16
12 11
13 15
14 4
15 16
16 16
17 18
18 12
19 5
2 9
20 23
3 14
4 18
5 22
6 14
7 13
8 18
9 11
In [19]:
test[['case_id', 'class_vals']].drop_duplicates().groupby(by = ['class_vals'], dropna = False).count()
Out[19]:
case_id
class_vals
1 86
10 65
11 71
12 66
13 62
14 60
15 67
16 66
17 78
18 63
19 69
2 71
20 86
3 71
4 79
5 93
6 69
7 64
8 87
9 63
In [20]:
valid = train.merge(X_val, on=['case_id'], how='inner')
valid['class_vals'] = valid['class_vals_x']
valid = valid.drop(columns=['class_vals_x','class_vals_y'])
valid.case_id.nunique()
Out[20]:
285
In [21]:
train = train.merge(X_train, on=['case_id'], how='inner')
train['class_vals'] = train['class_vals_x']
train = train.drop(columns=['class_vals_x','class_vals_y'])
train.case_id.nunique()
Out[21]:
1137
In [22]:
train.case_id.nunique(), valid.case_id.nunique(), test.case_id.nunique()
Out[22]:
(1137, 285, 1436)
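
The two merges above route every reading row to the partition of its case; an equivalent formulation filters the original frame with isin and avoids the class_vals_x/class_vals_y cleanup. A sketch against the pre-merge train frame, not what the notebook runs:

# Keep only the readings whose case belongs to the respective split.
valid_alt = train[train['case_id'].isin(X_val['case_id'])].copy()
train_alt = train[train['case_id'].isin(X_train['case_id'])].copy()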
In [23]:
train
Out[23]:
case_id reading_id dim_0 dim_1 dim_2 class_vals
0 0 0 -0.130150 0.071323 0.899306 1
1 0 1 -0.183121 0.083973 1.157239 1
2 0 2 -0.234104 0.085534 1.320469 1
3 0 3 -0.279929 0.078218 1.369472 1
4 0 4 -0.321851 0.067093 1.323889 1
... ... ... ... ... ... ...
136792 1421 142 -0.410398 0.015511 -2.293325 20
136793 1421 143 -0.398122 0.015047 -2.224722 20
136794 1421 144 -0.370147 0.013989 -2.068397 20
136795 1421 145 -0.319821 0.012087 -1.787174 20
136796 1421 146 -0.248272 0.009383 -1.387357 20

136797 rows × 6 columns

Write data in parquet format for downstream processing

In [24]:
#| export
import cudf

def write_parquet(pandas_df, output_dir, npartitions=2):
    # Recode the string class labels '1'..'20' as integers 0..19.
    # Note: inplace=True mutates the caller's DataFrame as well.
    pandas_df['class_vals'].replace(
        ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
         '13', '14', '15', '16', '17', '18', '19', '20'],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
        inplace=True,
    )
    # Move the frame onto the GPU, keeping a copy of case_id for later use.
    gdf = cudf.from_pandas(pandas_df)
    gdf['case_id_seq'] = gdf['case_id']
    # Partition across the Dask-CUDA workers and write out as parquet.
    dask_gdf = dask_cudf.from_cudf(gdf, npartitions=npartitions)
    dask_gdf.to_parquet(output_dir)
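
One caveat: because of the inplace=True replace, write_parquet recodes class_vals on the caller's DataFrame as well, which is why train, valid, and test carry integer labels from this point on. A non-mutating variant, sketched under the same 20-class assumption (not what the notebook uses), could map on a copy instead:

import cudf
import dask_cudf

def write_parquet_pure(pandas_df, output_dir, npartitions=2):
    # Map the string labels '1'..'20' to integers 0..19 on a copy, leaving
    # the caller's DataFrame untouched.
    label_map = {str(i): i - 1 for i in range(1, 21)}
    df = pandas_df.assign(class_vals=pandas_df['class_vals'].map(label_map))
    gdf = cudf.from_pandas(df)
    gdf['case_id_seq'] = gdf['case_id']
    dask_cudf.from_cudf(gdf, npartitions=npartitions).to_parquet(output_dir)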
In [25]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.2, rmm_pool_size='20GB', rmm_managed_memory=True)
client = Client(cluster)
client
2022-09-23 19:00:27,121 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:00:27,121 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:00:27,139 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:00:27,139 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:00:27,175 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:00:27,175 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:00:27,221 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:00:27,221 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Out[25]:

Client: Client-fcee6df9-3b71-11ed-80d1-0a4f0fdf7975 (connection method: Cluster object)
Cluster: LocalCUDACluster f572ee62 with 4 workers (1 thread each), 150.00 GiB total memory,
one Tesla T4 (14.76 GiB) per worker; dashboard at http://127.0.0.1:8787/status
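
A note on the cluster settings: device_memory_limit=0.2 makes each worker start spilling from device to host memory once it uses roughly 20% of its GPU, and rmm_managed_memory=True backs the RMM pool with CUDA managed memory, which is what allows a 20 GB pool on a T4 with only 14.76 GiB of physical device memory.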

Train Dataset

In [26]:
%%time
write_parquet(train, product['CharacterTrajectories_TRAIN_RAW'])
CPU times: user 1.42 s, sys: 509 ms, total: 1.93 s
Wall time: 3.57 s
In [27]:
train_gdf = dask_cudf.read_parquet(product['CharacterTrajectories_TRAIN_RAW'])
train_gdf.head()
Out[27]:
case_id reading_id dim_0 dim_1 dim_2 class_vals case_id_seq
0 0 0 -0.130150 0.071323 0.899306 0 0
1 0 1 -0.183121 0.083973 1.157239 0 0
2 0 2 -0.234104 0.085534 1.320469 0 0
3 0 3 -0.279929 0.078218 1.369472 0 0
4 0 4 -0.321851 0.067093 1.323889 0 0
In [28]:
train_gdf['reading_id'].min().compute(), train_gdf['reading_id'].max().compute()
Out[28]:
(0, 179)
In [29]:
train_gdf.case_id.nunique().compute()
Out[29]:
1137

Valid Dataset

In [30]:
%%time
write_parquet(valid, product['CharacterTrajectories_VALID_RAW'])
CPU times: user 52.5 ms, sys: 5.1 ms, total: 57.6 ms
Wall time: 113 ms
In [31]:
valid_gdf = dask_cudf.read_parquet(product['CharacterTrajectories_VALID_RAW'])
valid_gdf.head()
Out[31]:
case_id reading_id dim_0 dim_1 dim_2 class_vals case_id_seq
0 23 0 0.038300 0.107932 0.951737 0 23
1 23 1 0.031047 0.141460 1.213346 0 23
2 23 2 0.006217 0.167784 1.379965 0 23
3 23 3 -0.037579 0.186031 1.438267 0 23
4 23 4 -0.099370 0.197670 1.407480 0 23
In [32]:
valid_gdf['reading_id'].min().compute(), valid_gdf['reading_id'].max().compute()
Out[32]:
(0, 172)
In [33]:
valid_gdf.case_id.nunique().compute()
Out[33]:
285

Test Dataset

In [34]:
%%time
write_parquet(test, product['CharacterTrajectories_TEST_RAW'])
CPU times: user 158 ms, sys: 13.1 ms, total: 171 ms
Wall time: 243 ms
In [35]:
test_gdf = dask_cudf.read_parquet(product['CharacterTrajectories_TEST_RAW'])
test_gdf.head()
Out[35]:
case_id reading_id dim_0 dim_1 dim_2 class_vals case_id_seq
0 0 0 0.043915 -0.010011 0.536491 0 0
1 0 1 0.033922 -0.004357 0.767180 0 0
2 0 2 0.007255 0.005951 0.993862 0 0
3 0 3 -0.030362 0.018295 1.184721 0 0
4 0 4 -0.069213 0.029589 1.312480 0 0
In [36]:
test_gdf['reading_id'].min().compute(), test_gdf['reading_id'].max().compute()
Out[36]:
(0, 181)
In [37]:
test_gdf.case_id.nunique().compute()
Out[37]:
1436

Shut down the Dask cluster and client to release the GPUs

In [38]:
%%time
client.shutdown()
client.close()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError

Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1400, in _handle_report
    await self._reconnect()
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError
CPU times: user 37.2 ms, sys: 5.67 ms, total: 42.9 ms
Wall time: 618 ms
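
The CancelledError tracebacks above are benign shutdown noise: calling client.close() after client.shutdown() cancels the client's background reconnect coroutine mid-sleep, and distributed surfaces that cancellation as these tracebacks.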
In [39]:
from nbdev import nbdev_export
nbdev_export()