In [1]:
#| default_exp data.insect_wingbeat
%load_ext autoreload
%autoreload 2
In [2]:
# declare a list of tasks whose products you want to use as inputs
upstream = ['core']
In [3]:
# Parameters
upstream = {"core": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html", "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts", "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts", "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts", "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts", "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts", "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts", "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts", "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts", "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts", "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/102_data.insect_wingbeat.html", "InsectWingbeat_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/train", "InsectWingbeat_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/valid", "InsectWingbeat_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/test"}
In [4]:
#| hide
from nbdev.showdoc import *
In [5]:
#| export
from vitmtsc import *
from vitmtsc.core import *
import dask_cudf
import gc  # garbage-collector interface
In [6]:
#| export
upstream = {
    "core": {
        "nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html",
        "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
        "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
        "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
        "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
        "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
        "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
        "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
        "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
        "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
        "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
    }
}
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/102_data.insect_wingbeat.html",
    "InsectWingbeat_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/train",
    "InsectWingbeat_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/valid",
    "InsectWingbeat_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/test",
}

Data Download and Conversion

InsectWingbeat dataset

Convert the dataset to Parquet format so that target encoding can be run on it downstream.

In [7]:
#| export
DATASET_NAME = 'InsectWingbeat'

Download and convert the dataset to tabular format
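The cell below uses get_mtsc_data_tabular_from_ts from vitmtsc.core to turn the nested sktime representation into one row per (case, reading). As a rough illustration only, here is a minimal sketch of such a conversion, assuming sktime's load_from_tsfile_to_dataframe and the dim_* / class_vals column names seen in the outputs below; ts_to_tabular_sketch is hypothetical, and the actual implementation in vitmtsc/core.py may differ:

import pandas as pd
from sktime.datasets import load_from_tsfile_to_dataframe

def ts_to_tabular_sketch(ts_path):
    # Nested sktime format: one row per case, one column per dimension,
    # each cell holding a pd.Series of readings.
    X, y = load_from_tsfile_to_dataframe(ts_path)
    long_dims = []
    for dim in X.columns:
        # Expand each per-case series, then stack into a long series
        # indexed by (case_id, reading_id); stack() drops padding NaNs,
        # so cases keep their own (possibly unequal) number of readings.
        s = X[dim].apply(pd.Series).stack()
        s.index.names = ['case_id', 'reading_id']
        long_dims.append(s.rename(dim))
    tabular = pd.concat(long_dims, axis=1).reset_index()
    # Attach each case's class label.
    tabular['class_vals'] = tabular['case_id'].map(pd.Series(y))
    return tabular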

In [8]:
%%time
train = get_mtsc_data_tabular_from_ts(upstream['core']['InsectWingbeat_TRAIN_TS'])
train.shape
Reading dataset TS file...
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:928: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  data["dim_" + str(dim)] = instance_list[dim]
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:934: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  data["class_vals"] = pd.Series(class_val_list)
Converting _x to tabular format...
/home/ubuntu/vitmtsc_nbdev/vitmtsc/core.py:52: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  data_x_tabular.reset_index(inplace=True)
Converting _y to tabular format...
Merging _x and _y...
CPU times: user 16min 31s, sys: 10.1 s, total: 16min 41s
Wall time: 16min 41s
Out[8]:
(167974, 203)
In [9]:
train.head()
Out[9]:
case_id reading_id dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 ... dim_191 dim_192 dim_193 dim_194 dim_195 dim_196 dim_197 dim_198 dim_199 class_vals
0 0 0 -0.000526 0.000426 -0.000490 0.000469 -0.000361 0.000117 0.000013 -0.000065 ... -0.000787 0.000518 -0.000294 -0.000110 0.000469 -0.000530 0.000652 -0.000818 -0.000818 aedes_female
1 0 1 -0.000680 0.000293 -0.000443 -0.000035 -0.000393 0.000220 -0.000136 0.000325 ... -0.000274 -0.000250 -0.000033 0.000191 0.000030 -0.000092 0.000526 0.000248 0.000248 aedes_female
2 0 2 -0.010782 0.013112 -0.012667 0.009561 -0.011605 0.014131 -0.012283 0.014958 ... 0.142569 0.053184 -0.026049 0.021518 -0.021696 0.019203 -0.020744 0.026454 0.026454 aedes_female
3 0 3 0.024827 -0.035617 -0.010840 -0.342877 -0.214020 0.001512 -0.002940 -0.006419 ... 0.656476 0.237110 -0.010011 0.002020 -0.000191 0.001416 0.000318 0.000044 0.000044 aedes_female
4 0 4 0.019104 -0.000091 -0.971610 -2.116380 -0.333796 0.015550 0.009869 0.017413 ... -0.021847 0.022194 0.020843 0.020790 0.021171 0.020698 0.020344 0.020116 0.020116 aedes_female

5 rows × 203 columns

In [10]:
train['reading_id'].min(), train['reading_id'].max()
Out[10]:
(0, 21)
In [11]:
train['class_vals'].unique()
Out[11]:
array(['aedes_female', 'aedes_male', 'fruit_flies', 'house_flies',
       'quinx_female', 'quinx_male', 'stigma_female', 'stigma_male',
       'tarsalis_female', 'tarsalis_male'], dtype=object)
In [12]:
%%time
test = get_mtsc_data_tabular_from_ts(upstream['core']['InsectWingbeat_TEST_TS'])
test.shape
Reading dataset TS file...
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:928: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  data["dim_" + str(dim)] = instance_list[dim]
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:934: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  data["class_vals"] = pd.Series(class_val_list)
Converting _x to tabular format...
/home/ubuntu/vitmtsc_nbdev/vitmtsc/core.py:52: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  data_x_tabular.reset_index(inplace=True)
Converting _y to tabular format...
Merging _x and _y...
CPU times: user 16min 22s, sys: 6.39 s, total: 16min 28s
Wall time: 16min 28s
Out[12]:
(167560, 203)
In [13]:
test.head()
Out[13]:
case_id reading_id dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 ... dim_191 dim_192 dim_193 dim_194 dim_195 dim_196 dim_197 dim_198 dim_199 class_vals
0 0 0 -0.002947 0.002920 -0.002758 0.002282 -0.001377 0.000128 0.001152 -0.002060 ... -0.000081 0.000090 -0.000068 0.000031 -0.000007 0.000014 -0.000049 0.000087 0.000087 aedes_female
1 0 1 -0.053421 0.011923 -0.006867 0.027616 0.045003 -0.094597 0.014191 0.022102 ... -0.000171 0.000134 0.000284 0.000281 0.000343 0.000118 0.000088 -0.000291 -0.000291 aedes_female
2 0 2 -0.049052 -0.020504 0.000670 -0.000306 0.063919 0.024317 -0.004142 0.002782 ... -0.000002 -0.000031 -0.000027 0.000070 0.000052 0.000011 0.000190 0.000467 0.000467 aedes_female
3 0 3 0.046457 -0.555692 0.194275 0.016277 -0.024176 0.480148 -0.058053 -0.003973 ... -0.001096 0.000840 -0.001102 0.001148 -0.001079 0.001008 -0.001456 0.010817 0.010817 aedes_female
4 0 4 -0.896564 7.723570 -0.871859 -1.611516 -0.154569 -0.239838 0.765347 -0.284888 ... -0.013012 0.013401 -0.014279 0.015049 -0.014547 0.015267 -0.015893 0.017930 0.017930 aedes_female

5 rows × 203 columns

In [14]:
test['reading_id'].min(), test['reading_id'].max()
Out[14]:
(0, 21)
In [15]:
test['class_vals'].unique()
Out[15]:
array(['aedes_female', 'aedes_male', 'fruit_flies', 'house_flies',
       'quinx_female', 'quinx_male', 'stigma_female', 'stigma_male',
       'tarsalis_female', 'tarsalis_male'], dtype=object)
In [16]:
from sklearn.model_selection import train_test_split

# Split at the case level (one row per case) so that all readings of a
# case end up in the same split.
X = train[['case_id', 'class_vals']].drop_duplicates()
X_train, X_val, y_train, y_val = train_test_split(X, X['class_vals'], train_size=0.8, random_state=42)
X_train.case_id.nunique(), X_val.case_id.nunique()
Out[16]:
(20000, 5000)
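Because the split is over unique case_ids, all readings of a case land in the same split, preventing leakage between train and validation. The split is not explicitly stratified; the near-balanced per-class counts in the next two cells come from the dataset itself. If exact class proportions were required, a stratified variant (a sketch, not what this notebook ran) would be:

# Alternative: enforce per-class proportions in both splits.
X_train, X_val, y_train, y_val = train_test_split(
    X, X['class_vals'], train_size=0.8,
    stratify=X['class_vals'], random_state=42,
)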
In [17]:
X_train.groupby(by = ['class_vals'], dropna = False).count()
Out[17]:
case_id
class_vals
aedes_female 2007
aedes_male 1972
fruit_flies 2007
house_flies 1993
quinx_female 2006
quinx_male 2009
stigma_female 2015
stigma_male 1996
tarsalis_female 1997
tarsalis_male 1998
In [18]:
X_val.groupby(by = ['class_vals'], dropna = False).count()
Out[18]:
case_id
class_vals
aedes_female 493
aedes_male 528
fruit_flies 493
house_flies 507
quinx_female 494
quinx_male 491
stigma_female 485
stigma_male 504
tarsalis_female 503
tarsalis_male 502
In [19]:
test[['case_id', 'class_vals']].drop_duplicates().groupby(by = ['class_vals'], dropna = False).count()
Out[19]:
case_id
class_vals
aedes_female 2500
aedes_male 2500
fruit_flies 2500
house_flies 2500
quinx_female 2500
quinx_male 2500
stigma_female 2500
stigma_male 2500
tarsalis_female 2500
tarsalis_male 2500
In [20]:
# Keep only the validation cases; the merge duplicates class_vals
# (suffixes _x/_y), so collapse it back to a single column.
valid = train.merge(X_val, on=['case_id'], how='inner')
valid['class_vals'] = valid['class_vals_x']
valid = valid.drop(columns=['class_vals_x','class_vals_y'])
valid.case_id.nunique()
Out[20]:
5000
In [21]:
train = train.merge(X_train, on=['case_id'], how='inner')
train['class_vals'] = train['class_vals_x']
train = train.drop(columns=['class_vals_x','class_vals_y'])
train.case_id.nunique()
Out[21]:
20000
In [22]:
train.case_id.nunique(), valid.case_id.nunique(), test.case_id.nunique()
Out[22]:
(20000, 5000, 25000)
In [23]:
train
Out[23]:
case_id reading_id dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 ... dim_191 dim_192 dim_193 dim_194 dim_195 dim_196 dim_197 dim_198 dim_199 class_vals
0 0 0 -0.000526 0.000426 -0.000490 0.000469 -0.000361 0.000117 0.000013 -0.000065 ... -0.000787 0.000518 -0.000294 -0.000110 0.000469 -0.000530 0.000652 -0.000818 -0.000818 aedes_female
1 0 1 -0.000680 0.000293 -0.000443 -0.000035 -0.000393 0.000220 -0.000136 0.000325 ... -0.000274 -0.000250 -0.000033 0.000191 0.000030 -0.000092 0.000526 0.000248 0.000248 aedes_female
2 0 2 -0.010782 0.013112 -0.012667 0.009561 -0.011605 0.014131 -0.012283 0.014958 ... 0.142569 0.053184 -0.026049 0.021518 -0.021696 0.019203 -0.020744 0.026454 0.026454 aedes_female
3 0 3 0.024827 -0.035617 -0.010840 -0.342877 -0.214020 0.001512 -0.002940 -0.006419 ... 0.656476 0.237110 -0.010011 0.002020 -0.000191 0.001416 0.000318 0.000044 0.000044 aedes_female
4 0 4 0.019104 -0.000091 -0.971610 -2.116380 -0.333796 0.015550 0.009869 0.017413 ... -0.021847 0.022194 0.020843 0.020790 0.021171 0.020698 0.020344 0.020116 0.020116 aedes_female
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
134367 24998 2 -0.005761 0.001334 -0.000173 -0.000088 0.000262 -0.000555 -0.000152 -0.000119 ... 0.000115 0.000040 0.000218 0.000100 -0.000062 0.000069 0.000198 0.000027 0.000027 tarsalis_male
134368 24998 3 1.191404 -0.659416 -0.660178 0.631950 -0.168012 0.436480 0.223098 -0.251304 ... 0.001960 -0.002290 0.002210 -0.002204 0.002151 -0.002175 0.002536 -0.002290 -0.002290 tarsalis_male
134369 24998 4 2.882338 -4.776926 4.540706 -3.988603 0.156886 -1.182728 -0.801508 0.878964 ... 0.006013 -0.007888 0.010264 -0.012222 0.006145 -0.004855 0.002678 -0.006170 -0.006170 tarsalis_male
134370 24998 5 -5.569411 1.057257 8.147614 3.205241 0.742497 0.485884 0.640577 0.164989 ... -0.000266 0.014064 -0.026576 0.025557 -0.020304 0.001435 0.007384 -0.026695 -0.026695 tarsalis_male
134371 24998 6 -0.211579 -0.210664 -0.205127 -0.190088 -0.165624 -0.140203 -0.126331 -0.131375 ... -0.006157 -0.004272 -0.004563 -0.006921 -0.009405 -0.009900 -0.007943 -0.005148 -0.005148 tarsalis_male

134372 rows × 203 columns

Write data in Parquet format for downstream processing

In [24]:
#| export
import cudf
import dask_cudf
import pandas as pd

def write_parquet(pandas_df, output_dir, npartitions=2):
    # Encode the ten class labels as integers 0-9.
    # Note: this mutates the caller's DataFrame in place.
    pandas_df['class_vals'].replace(['aedes_female',
                                     'aedes_male',
                                     'fruit_flies',
                                     'house_flies',
                                     'quinx_female',
                                     'quinx_male',
                                     'stigma_female',
                                     'stigma_male',
                                     'tarsalis_female',
                                     'tarsalis_male'],
                                    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], inplace=True)
    # Move the frame to GPU and duplicate case_id as case_id_seq,
    # kept as a plain data column for downstream use.
    gdf = cudf.from_pandas(pandas_df)
    gdf['case_id_seq'] = gdf['case_id']
    # Partition and write Parquet via dask_cudf.
    dask_gdf = dask_cudf.from_cudf(gdf, npartitions=npartitions)
    dask_gdf.to_parquet(output_dir)
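Since replace(..., inplace=True) mutates the caller's DataFrame, train, valid, and test hold integer labels after the calls below. For later evaluation, a hypothetical inverse mapping (the list mirrors the encoding order above; decode_labels is not part of the exported module) could look like:

# Map the 0-9 codes used by write_parquet back to class names.
CLASS_NAMES = [
    'aedes_female', 'aedes_male', 'fruit_flies', 'house_flies',
    'quinx_female', 'quinx_male', 'stigma_female', 'stigma_male',
    'tarsalis_female', 'tarsalis_male',
]

def decode_labels(codes):
    return [CLASS_NAMES[int(c)] for c in codes]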
In [25]:
import time

from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# One Dask worker per GPU; spill device memory to host once a worker
# reaches 20% of GPU memory, with a 20 GB RMM managed-memory pool.
cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.2, rmm_pool_size='20GB', rmm_managed_memory=True)
client = Client(cluster)
client
2022-09-23 19:33:31,833 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:33:31,833 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:33:31,846 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:33:31,846 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:33:31,937 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:33:31,937 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:33:32,042 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:33:32,042 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Out[25]:
Client: Client-9bf2d02a-3b76-11ed-80d2-06c0bb745397
Connection method: Cluster object (dask_cuda.LocalCUDACluster), status running
Dashboard: http://127.0.0.1:8787/status
Cluster: 4 workers, 1 thread each, 150.00 GiB total host memory
Each worker: Tesla T4 (14.76 GiB GPU memory), 37.50 GiB host memory

Train Dataset

In [26]:
%%time
write_parquet(train, product['InsectWingbeat_TRAIN_RAW'])
CPU times: user 2.57 s, sys: 488 ms, total: 3.05 s
Wall time: 4.97 s
In [27]:
train_gdf = dask_cudf.read_parquet(product['InsectWingbeat_TRAIN_RAW'])
train_gdf.head()
Out[27]:
case_id reading_id dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 ... dim_192 dim_193 dim_194 dim_195 dim_196 dim_197 dim_198 dim_199 class_vals case_id_seq
0 0 0 -0.000526 0.000426 -0.000490 0.000469 -0.000361 0.000117 0.000013 -0.000065 ... 0.000518 -0.000294 -0.000110 0.000469 -0.000530 0.000652 -0.000818 -0.000818 0 0
1 0 1 -0.000680 0.000293 -0.000443 -0.000035 -0.000393 0.000220 -0.000136 0.000325 ... -0.000250 -0.000033 0.000191 0.000030 -0.000092 0.000526 0.000248 0.000248 0 0
2 0 2 -0.010782 0.013112 -0.012667 0.009561 -0.011605 0.014131 -0.012283 0.014958 ... 0.053184 -0.026049 0.021518 -0.021696 0.019203 -0.020744 0.026454 0.026454 0 0
3 0 3 0.024827 -0.035617 -0.010840 -0.342877 -0.214020 0.001512 -0.002940 -0.006419 ... 0.237110 -0.010011 0.002020 -0.000191 0.001416 0.000318 0.000044 0.000044 0 0
4 0 4 0.019104 -0.000091 -0.971610 -2.116380 -0.333796 0.015550 0.009869 0.017413 ... 0.022194 0.020843 0.020790 0.021171 0.020698 0.020344 0.020116 0.020116 0 0

5 rows × 204 columns

In [28]:
train_gdf['reading_id'].min().compute(), train_gdf['reading_id'].max().compute()
Out[28]:
(0, 21)
In [29]:
train_gdf.case_id.nunique().compute()
Out[29]:
20000

Valid Dataset

In [30]:
%%time
write_parquet(valid, product['InsectWingbeat_VALID_RAW'])
CPU times: user 430 ms, sys: 32.1 ms, total: 462 ms
Wall time: 605 ms
In [31]:
valid_gdf = dask_cudf.read_parquet(product['InsectWingbeat_VALID_RAW'])
valid_gdf.head()
Out[31]:
case_id reading_id dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 ... dim_192 dim_193 dim_194 dim_195 dim_196 dim_197 dim_198 dim_199 class_vals case_id_seq
0 17 0 -0.674501 -0.044123 -0.043462 0.056184 -0.058077 0.057803 -0.059285 0.061007 ... -0.003337 0.002940 -0.003611 0.004123 -0.005298 0.006879 -0.006634 -0.006634 0 17
1 17 1 0.299125 -1.862445 -0.421858 -0.241298 0.175858 -0.171911 0.204115 -0.255530 ... -0.023787 -0.007275 0.003402 0.000278 -0.010965 0.002707 0.003182 0.003182 0 17
2 17 2 -0.428115 -0.339370 1.075969 0.141863 0.087708 0.049868 0.392346 0.659165 ... -0.008609 -0.014546 -0.000611 -0.016076 0.012444 -0.017995 0.011532 0.011532 0 17
3 17 3 -0.066288 -0.915256 -0.051801 -0.236039 -0.119390 -0.280049 -0.128623 -0.341739 ... 0.013279 0.001198 0.019783 0.001476 0.004445 -0.000844 0.005537 0.005537 0 17
4 17 4 -1.573770 1.404155 -0.310456 -0.053030 -0.021343 -0.145558 0.173446 0.167817 ... 0.012897 0.013816 0.003628 -0.010986 0.013594 0.017745 0.012654 0.012654 0 17

5 rows × 204 columns

In [32]:
valid_gdf['reading_id'].min().compute(), valid_gdf['reading_id'].max().compute()
Out[32]:
(0, 20)
In [33]:
valid_gdf.case_id.nunique().compute()
Out[33]:
5000

Test Dataset

In [34]:
%%time
write_parquet(test, product['InsectWingbeat_TEST_RAW'])
CPU times: user 1.34 s, sys: 328 ms, total: 1.67 s
Wall time: 1.99 s
In [35]:
test_gdf = dask_cudf.read_parquet(product['InsectWingbeat_TEST_RAW'])
test_gdf.head()
Out[35]:
case_id reading_id dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 ... dim_192 dim_193 dim_194 dim_195 dim_196 dim_197 dim_198 dim_199 class_vals case_id_seq
0 0 0 -0.002947 0.002920 -0.002758 0.002282 -0.001377 0.000128 0.001152 -0.002060 ... 0.000090 -0.000068 0.000031 -0.000007 0.000014 -0.000049 0.000087 0.000087 0 0
1 0 1 -0.053421 0.011923 -0.006867 0.027616 0.045003 -0.094597 0.014191 0.022102 ... 0.000134 0.000284 0.000281 0.000343 0.000118 0.000088 -0.000291 -0.000291 0 0
2 0 2 -0.049052 -0.020504 0.000670 -0.000306 0.063919 0.024317 -0.004142 0.002782 ... -0.000031 -0.000027 0.000070 0.000052 0.000011 0.000190 0.000467 0.000467 0 0
3 0 3 0.046457 -0.555692 0.194275 0.016277 -0.024176 0.480148 -0.058053 -0.003973 ... 0.000840 -0.001102 0.001148 -0.001079 0.001008 -0.001456 0.010817 0.010817 0 0
4 0 4 -0.896564 7.723570 -0.871859 -1.611516 -0.154569 -0.239838 0.765347 -0.284888 ... 0.013401 -0.014279 0.015049 -0.014547 0.015267 -0.015893 0.017930 0.017930 0 0

5 rows × 204 columns

In [36]:
test_gdf['reading_id'].min().compute(), test_gdf['reading_id'].max().compute()
Out[36]:
(0, 21)
In [37]:
test_gdf.case_id.nunique().compute()
Out[37]:
25000
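As one final sanity check (a sketch, assuming the dask_cudf frames above are still in scope; it was not run in this notebook), the train and validation splits should share no case_id:

# Verify no case appears in both the train and validation splits.
train_ids = set(train_gdf['case_id'].unique().compute().to_pandas())
valid_ids = set(valid_gdf['case_id'].unique().compute().to_pandas())
assert train_ids.isdisjoint(valid_ids), 'case leakage between splits'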

We shut down the Dask client and cluster before resetting the kernel; the asyncio CancelledError tracebacks below are harmless noise from the client's reconnect loop being cancelled during shutdown.

In [38]:
%%time
client.shutdown()
client.close()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError

Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1400, in _handle_report
    await self._reconnect()
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError
CPU times: user 40 ms, sys: 8.11 ms, total: 48.1 ms
Wall time: 629 ms
In [39]:
from nbdev import nbdev_export
nbdev_export()