In [1]:
#| default_exp data.face_detection
%load_ext autoreload
%autoreload 2
In [2]:
# declare a list of tasks whose products you want to use as inputs
upstream = ['core']
In [3]:
# Parameters
upstream = {"core": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html", "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts", "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts", "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts", "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts", "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts", "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts", "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts", "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts", "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts", "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/101_data.face_detection.html", "FaceDetection_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/train", "FaceDetection_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/valid", "FaceDetection_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/test"}
In [4]:
#| hide
from nbdev.showdoc import *
In [5]:
#| export
from vitmtsc import *
from vitmtsc.core import *
import dask_cudf
import gc  # garbage collector interface
In [6]:
#| export
upstream = {
    "core": {
        "nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html",
        "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
        "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
        "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
        "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
        "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
        "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
        "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
        "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
        "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
        "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
    }
}
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/101_data.face_detection.html",
    "FaceDetection_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/train",
    "FaceDetection_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/valid",
    "FaceDetection_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/test",
}

Data Download and Conversion

FaceDetection dataset

Convert the dataset to Parquet format so we can run target encoding on it downstream.

In [7]:
#| export
DATASET_NAME = 'FaceDetection'

Download and convert the dataset to tabular format
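Under the hood, get_mtsc_data_tabular_from_ts (defined in vitmtsc.core, not shown here) reads the .ts file with sktime's nested-DataFrame loader, as the warnings below confirm, and flattens each case into one row per time step. A minimal sketch of that transformation, assuming sktime's load_from_tsfile_to_dataframe and equal-length series, might look like this:

from sktime.datasets import load_from_tsfile_to_dataframe
import pandas as pd

def ts_to_tabular_sketch(ts_path):
    # X is a nested frame: one pd.Series per (case, dimension) cell; y holds the class labels
    X, y = load_from_tsfile_to_dataframe(ts_path)
    frames = []
    for case_id, row in X.iterrows():
        # stack the per-dimension series side by side for this case
        case_df = pd.DataFrame({dim: row[dim].to_numpy() for dim in X.columns})
        case_df.insert(0, 'reading_id', list(range(len(case_df))))
        case_df.insert(0, 'case_id', case_id)
        case_df['class_vals'] = y[case_id]
        frames.append(case_df)
    return pd.concat(frames, ignore_index=True)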

In [8]:
%%time
train = get_mtsc_data_tabular_from_ts(upstream['core']['FaceDetection_TRAIN_TS'])
train.shape
Reading dataset TS file...
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:928: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  data["dim_" + str(dim)] = instance_list[dim]
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:934: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  data["class_vals"] = pd.Series(class_val_list)
Converting _x to tabular format...
/home/ubuntu/vitmtsc_nbdev/vitmtsc/core.py:52: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  data_x_tabular.reset_index(inplace=True)
Converting _y to tabular format...
Merging _x and _y...
CPU times: user 3min 3s, sys: 3.07 s, total: 3min 6s
Wall time: 3min 6s
Out[8]:
(365180, 147)
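The shape checks out: 5,890 training cases × 62 readings per case = 365,180 rows, and 147 columns = case_id + reading_id + 144 signal dimensions (dim_0 through dim_143) + class_vals.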
In [9]:
train.head()
Out[9]:
case_id reading_id dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 ... dim_135 dim_136 dim_137 dim_138 dim_139 dim_140 dim_141 dim_142 dim_143 class_vals
0 0 0 -0.075450 0.056080 -0.824537 1.220979 -0.762913 -0.588704 -1.358603 -0.435668 ... -1.231982 -0.715474 -0.120353 0.092455 0.530208 0.720140 -0.567580 -0.234040 -0.356189 0
1 0 1 -0.336703 -0.128013 -0.746068 1.253613 -1.248101 -0.699180 -1.609432 -0.340918 ... -0.971470 -0.564188 -0.168357 0.483913 0.863468 0.763068 -1.073942 0.104291 -0.511199 0
2 0 2 -0.278238 -0.323847 -0.482871 1.094015 -1.498578 -0.548781 -1.655122 -0.434355 ... -0.996615 -0.443425 -0.057533 0.711995 0.852293 0.694540 -1.136367 0.327425 -0.483072 0
3 0 3 -0.101338 -0.535844 -0.355501 1.024461 -1.582962 -0.545213 -1.607034 -0.416748 ... -1.121799 -0.308594 -0.060117 0.800073 0.833964 0.670192 -1.149557 0.414523 -0.363002 0
4 0 4 0.117664 -0.721767 -0.163015 0.873627 -1.537441 -0.492561 -1.449226 -0.431590 ... -1.137925 -0.189920 -0.147974 1.047859 0.780197 0.531566 -1.226237 0.380344 -0.310250 0

5 rows × 147 columns

In [10]:
train['reading_id'].min(), train['reading_id'].max()
Out[10]:
(0, 61)
In [11]:
train['class_vals'].unique()
Out[11]:
array(['0', '1'], dtype=object)
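Note that the class labels are read in as strings ('0' and '1'); write_parquet below maps them to integers before writing.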
In [12]:
%%time
test = get_mtsc_data_tabular_from_ts(upstream['core']['FaceDetection_TEST_TS'])
test.shape
Reading dataset TS file...
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:928: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  data["dim_" + str(dim)] = instance_list[dim]
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:934: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  data["class_vals"] = pd.Series(class_val_list)
Converting _x to tabular format...
/home/ubuntu/vitmtsc_nbdev/vitmtsc/core.py:52: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`
  data_x_tabular.reset_index(inplace=True)
Converting _y to tabular format...
Merging _x and _y...
CPU times: user 1min 47s, sys: 1.26 s, total: 1min 48s
Wall time: 1min 48s
Out[12]:
(218488, 147)
In [13]:
test.head()
Out[13]:
case_id reading_id dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 ... dim_135 dim_136 dim_137 dim_138 dim_139 dim_140 dim_141 dim_142 dim_143 class_vals
0 0 0 2.183168 -0.355553 0.072301 1.195415 -0.794009 0.153302 1.115395 -1.176578 ... 0.044418 -0.194282 0.367769 0.580047 -0.112294 -0.122612 0.161240 0.428348 0.915118 0
1 0 1 2.225861 -0.520159 0.231310 1.215142 -0.640171 0.294658 1.199156 -1.113278 ... 0.226521 -0.166871 0.339567 0.325842 0.038369 0.024042 0.273805 0.706688 1.212907 0
2 0 2 2.143762 -0.567942 0.129102 1.207573 -0.395744 0.150840 0.811009 -1.061630 ... 0.287824 -0.267554 0.195717 0.032441 0.227289 0.083741 0.178206 0.716761 1.392795 0
3 0 3 1.946997 -0.546087 0.174260 1.085086 -0.128849 0.188971 0.482807 -0.986273 ... 0.409944 -0.251355 0.168935 0.005684 0.375295 0.006296 0.140060 0.688435 1.482891 0
4 0 4 1.657394 -0.417094 0.159386 0.933640 0.253186 0.199629 0.058581 -0.892543 ... 0.548979 -0.281089 0.107426 -0.050389 0.518138 -0.060584 0.105976 0.692968 1.513194 0

5 rows × 147 columns

In [14]:
test['reading_id'].min(), test['reading_id'].max()
Out[14]:
(0, 61)
In [15]:
test['class_vals'].unique()
Out[15]:
array(['0', '1'], dtype=object)
In [16]:
from sklearn.model_selection import train_test_split

# split at the case level: one row per case here, so all 62 readings of a
# case land in the same split and no case leaks across train/validation
X = train[['case_id', 'class_vals']].drop_duplicates()
X_train, X_val, y_train, y_val = train_test_split(X, X['class_vals'], train_size=0.8, random_state=42)
X_train.case_id.nunique(), X_val.case_id.nunique()
Out[16]:
(4712, 1178)
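As a quick sanity check (a sketch using the names defined above, not part of the pipeline output), we can confirm the case-level split is disjoint, so no case contributes readings to both train and validation:

assert set(X_train.case_id).isdisjoint(set(X_val.case_id))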
In [17]:
X_train.groupby(by=['class_vals'], dropna=False).count()
Out[17]:
case_id
class_vals
0 2357
1 2355
In [18]:
X_val.groupby(by=['class_vals'], dropna=False).count()
Out[18]:
case_id
class_vals
0 588
1 590
In [19]:
test[['case_id', 'class_vals']].drop_duplicates().groupby(by=['class_vals'], dropna=False).count()
Out[19]:
case_id
class_vals
0 1762
1 1762
In [20]:
# keep only the validation cases; the merge duplicates the label column,
# so collapse class_vals_x/class_vals_y back into class_vals
valid = train.merge(X_val, on=['case_id'], how='inner')
valid['class_vals'] = valid['class_vals_x']
valid = valid.drop(columns=['class_vals_x', 'class_vals_y'])
valid.case_id.nunique()
Out[20]:
1178
In [21]:
# same for train: keep only the training cases and tidy the label column
train = train.merge(X_train, on=['case_id'], how='inner')
train['class_vals'] = train['class_vals_x']
train = train.drop(columns=['class_vals_x', 'class_vals_y'])
train.case_id.nunique()
Out[21]:
4712
In [22]:
train.case_id.nunique(), valid.case_id.nunique(), test.case_id.nunique()
Out[22]:
(4712, 1178, 3524)
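4,712 + 1,178 = 5,890, recovering all of the original training cases (an 80/20 split at the case level), while the 3,524 test cases are left untouched.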
In [23]:
train
Out[23]:
case_id reading_id dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 ... dim_135 dim_136 dim_137 dim_138 dim_139 dim_140 dim_141 dim_142 dim_143 class_vals
0 0 0 -0.075450 0.056080 -0.824537 1.220979 -0.762913 -0.588704 -1.358603 -0.435668 ... -1.231982 -0.715474 -0.120353 0.092455 0.530208 0.720140 -0.567580 -0.234040 -0.356189 0
1 0 1 -0.336703 -0.128013 -0.746068 1.253613 -1.248101 -0.699180 -1.609432 -0.340918 ... -0.971470 -0.564188 -0.168357 0.483913 0.863468 0.763068 -1.073942 0.104291 -0.511199 0
2 0 2 -0.278238 -0.323847 -0.482871 1.094015 -1.498578 -0.548781 -1.655122 -0.434355 ... -0.996615 -0.443425 -0.057533 0.711995 0.852293 0.694540 -1.136367 0.327425 -0.483072 0
3 0 3 -0.101338 -0.535844 -0.355501 1.024461 -1.582962 -0.545213 -1.607034 -0.416748 ... -1.121799 -0.308594 -0.060117 0.800073 0.833964 0.670192 -1.149557 0.414523 -0.363002 0
4 0 4 0.117664 -0.721767 -0.163015 0.873627 -1.537441 -0.492561 -1.449226 -0.431590 ... -1.137925 -0.189920 -0.147974 1.047859 0.780197 0.531566 -1.226237 0.380344 -0.310250 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
292139 5889 57 -0.230534 -0.286763 -0.387361 0.204795 0.728654 0.071708 -0.487104 -0.624672 ... -0.248141 0.225418 -0.352202 -0.494414 -0.587478 -0.297909 -0.243174 -0.198680 0.191251 1
292140 5889 58 0.012224 -0.435941 -0.804282 -0.370081 -0.367577 -0.480024 0.306681 -0.697584 ... -0.551751 0.526589 -0.710969 -1.478857 -0.256933 -0.223932 -0.536537 -0.463251 0.586670 1
292141 5889 59 -0.463263 -0.671191 -0.916895 -0.550672 -0.149643 -0.698187 0.959569 -0.951837 ... 0.687394 1.478814 0.451611 0.970563 0.304664 0.257314 0.224479 -0.125931 -0.967319 1
292142 5889 60 0.080807 0.818791 -1.227603 0.549396 -0.161739 -0.642325 -1.194296 -0.645866 ... -0.710146 0.124100 0.293746 -1.267861 -0.635904 -0.718340 -0.072564 0.782164 0.246511 1
292143 5889 61 -0.222367 -0.050930 -1.741634 -0.359941 0.814029 -2.246209 0.895925 0.722281 ... -1.879624 -2.142719 -1.271458 -2.572693 -1.884100 -1.644821 -0.991384 -1.900225 -0.182463 1

292144 rows × 147 columns

Write data in Parquet format for future processing

In [24]:
#| export
import cudf
import dask_cudf
import pandas as pd

def write_parquet(pandas_df, output_dir, npartitions=2):
    # map the string labels to integers; note this mutates the caller's frame
    pandas_df['class_vals'] = pandas_df['class_vals'].replace(['0', '1'], [0, 1])
    # move to GPU and keep a duplicate of case_id as case_id_seq
    gdf = cudf.from_pandas(pandas_df)
    gdf['case_id_seq'] = gdf['case_id']
    # partition and write with dask_cudf
    dask_gdf = dask_cudf.from_cudf(gdf, npartitions=npartitions)
    dask_gdf.to_parquet(output_dir)
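A minimal usage sketch on a hypothetical toy frame (any environment with a GPU and cudf will do; the output path is illustrative). Note again that write_parquet mutates the class_vals column of the frame passed in:

import pandas as pd
toy = pd.DataFrame({'case_id': [0, 0, 1, 1],
                    'reading_id': [0, 1, 0, 1],
                    'dim_0': [0.1, 0.2, 0.3, 0.4],
                    'class_vals': ['0', '1', '0', '1']})
write_parquet(toy, '/tmp/toy_raw', npartitions=1)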
In [25]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# one worker per GPU; device_memory_limit=0.2 starts spilling to host once a
# worker uses ~20% of its GPU memory, and rmm_pool_size gives each worker a
# 20 GB RMM pool backed by CUDA managed (unified) memory, which can
# oversubscribe the physical 14.76 GiB on each T4
cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.2, rmm_pool_size='20GB', rmm_managed_memory=True)
client = Client(cluster)
client
2022-09-23 18:54:40,704 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 18:54:40,704 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 18:54:40,767 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 18:54:40,767 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 18:54:40,799 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 18:54:40,799 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 18:54:40,800 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 18:54:40,800 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Out[25]:

Client: Client-2e7ddbd8-3b71-11ed-80bf-0a01290f6f4b
Connection method: Cluster object (dask_cuda.LocalCUDACluster)
Dashboard: http://127.0.0.1:8787/status
Cluster: 4 workers, 1 thread each, 150.00 GiB total host memory
Workers: each with 37.50 GiB host memory and one Tesla T4 GPU (14.76 GiB)

Train Dataset

In [26]:
%%time
write_parquet(train, product['FaceDetection_TRAIN_RAW'])
CPU times: user 2.08 s, sys: 574 ms, total: 2.66 s
Wall time: 4.56 s
In [27]:
train_gdf = dask_cudf.read_parquet(product['FaceDetection_TRAIN_RAW'])
train_gdf.head()
Out[27]:
case_id reading_id dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 ... dim_136 dim_137 dim_138 dim_139 dim_140 dim_141 dim_142 dim_143 class_vals case_id_seq
0 0 0 -0.075450 0.056080 -0.824537 1.220979 -0.762913 -0.588704 -1.358603 -0.435668 ... -0.715474 -0.120353 0.092455 0.530208 0.720140 -0.567580 -0.234040 -0.356189 0 0
1 0 1 -0.336703 -0.128013 -0.746068 1.253613 -1.248101 -0.699180 -1.609432 -0.340918 ... -0.564188 -0.168357 0.483913 0.863468 0.763068 -1.073942 0.104291 -0.511199 0 0
2 0 2 -0.278238 -0.323847 -0.482871 1.094015 -1.498578 -0.548781 -1.655122 -0.434355 ... -0.443425 -0.057533 0.711995 0.852293 0.694540 -1.136367 0.327425 -0.483072 0 0
3 0 3 -0.101338 -0.535844 -0.355501 1.024461 -1.582962 -0.545213 -1.607034 -0.416748 ... -0.308594 -0.060117 0.800073 0.833964 0.670192 -1.149557 0.414523 -0.363002 0 0
4 0 4 0.117664 -0.721767 -0.163015 0.873627 -1.537441 -0.492561 -1.449226 -0.431590 ... -0.189920 -0.147974 1.047859 0.780197 0.531566 -1.226237 0.380344 -0.310250 0 0

5 rows × 148 columns

In [28]:
train_gdf['reading_id'].min().compute(), train_gdf['reading_id'].max().compute()
Out[28]:
(0, 61)
In [29]:
train_gdf.case_id.nunique().compute()
Out[29]:
4712

Valid Dataset

In [30]:
%%time
write_parquet(valid, product['FaceDetection_VALID_RAW'])
CPU times: user 246 ms, sys: 89.8 ms, total: 335 ms
Wall time: 469 ms
In [31]:
valid_gdf = dask_cudf.read_parquet(product['FaceDetection_VALID_RAW'])
valid_gdf.head()
Out[31]:
case_id reading_id dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 ... dim_136 dim_137 dim_138 dim_139 dim_140 dim_141 dim_142 dim_143 class_vals case_id_seq
0 8 0 -0.514330 0.848444 0.633715 -0.386462 -2.046528 1.090028 0.755727 0.302179 ... -0.042363 1.810685 1.473524 1.049092 0.384222 0.842596 2.357036 -0.627867 0 8
1 8 1 -0.319596 0.878676 0.482996 -0.125028 -1.570902 1.055667 0.420854 1.111848 ... -0.419235 1.521585 1.195183 0.929807 0.672089 0.338519 2.089587 -0.387521 0 8
2 8 2 -0.423044 0.967854 0.495505 -0.094408 -1.337488 1.109245 0.565454 1.250807 ... -0.830331 1.549919 1.464097 0.752714 0.890977 0.300028 2.026411 -0.513880 0 8
3 8 3 -0.457570 1.125700 0.436169 -0.050398 -1.154981 1.047152 0.718827 1.402167 ... -0.950836 1.466116 1.359329 0.777893 0.967682 0.244541 2.003905 -0.466148 0 8
4 8 4 -0.476584 1.367462 0.384900 -0.086635 -0.811420 0.986001 0.987596 1.482867 ... -1.171173 1.304991 1.274834 0.611228 1.086574 0.198109 1.894594 -0.376624 0 8

5 rows × 148 columns

In [32]:
valid_gdf['reading_id'].min().compute(), valid_gdf['reading_id'].max().compute()
Out[32]:
(0, 61)
In [33]:
valid_gdf.case_id.nunique().compute()
Out[33]:
1178

Test Dataset

In [34]:
%%time
write_parquet(test, product['FaceDetection_TEST_RAW'])
CPU times: user 496 ms, sys: 208 ms, total: 704 ms
Wall time: 985 ms
In [35]:
test_gdf = dask_cudf.read_parquet(product['FaceDetection_TEST_RAW'])
test_gdf.head()
Out[35]:
case_id reading_id dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 dim_6 dim_7 ... dim_136 dim_137 dim_138 dim_139 dim_140 dim_141 dim_142 dim_143 class_vals case_id_seq
0 0 0 2.183168 -0.355553 0.072301 1.195415 -0.794009 0.153302 1.115395 -1.176578 ... -0.194282 0.367769 0.580047 -0.112294 -0.122612 0.161240 0.428348 0.915118 0 0
1 0 1 2.225861 -0.520159 0.231310 1.215142 -0.640171 0.294658 1.199156 -1.113278 ... -0.166871 0.339567 0.325842 0.038369 0.024042 0.273805 0.706688 1.212907 0 0
2 0 2 2.143762 -0.567942 0.129102 1.207573 -0.395744 0.150840 0.811009 -1.061630 ... -0.267554 0.195717 0.032441 0.227289 0.083741 0.178206 0.716761 1.392795 0 0
3 0 3 1.946997 -0.546087 0.174260 1.085086 -0.128849 0.188971 0.482807 -0.986273 ... -0.251355 0.168935 0.005684 0.375295 0.006296 0.140060 0.688435 1.482891 0 0
4 0 4 1.657394 -0.417094 0.159386 0.933640 0.253186 0.199629 0.058581 -0.892543 ... -0.281089 0.107426 -0.050389 0.518138 -0.060584 0.105976 0.692968 1.513194 0 0

5 rows × 148 columns

In [36]:
test_gdf['reading_id'].min().compute(), test_gdf['reading_id'].max().compute()
Out[36]:
(0, 61)
In [37]:
test_gdf.case_id.nunique().compute()
Out[37]:
3524

We shut down the Dask cluster and reset the kernel. The CancelledError tracebacks below are emitted by the client's reconnect task being cancelled during shutdown and are harmless.

In [38]:
%%time
client.shutdown()
client.close()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError

Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1400, in _handle_report
    await self._reconnect()
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError
CPU times: user 27.3 ms, sys: 19.6 ms, total: 46.9 ms
Wall time: 616 ms
In [39]:
from nbdev import nbdev_export
nbdev_export()