#| default_exp data.face_detection
%load_ext autoreload
%autoreload 2
# declare a list tasks whose products you want to use as inputs
upstream = ['core']
# Parameters
upstream = {"core": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html", "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts", "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts", "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts", "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts", "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts", "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts", "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts", "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts", "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts", "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/101_data.face_detection.html", "FaceDetection_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/train", "FaceDetection_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/valid", "FaceDetection_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/raw/test"}
#| hide
from nbdev.showdoc import *
#| export
from vitmtsc import *
from vitmtsc.core import *
import dask_cudf
import gc #garbage collector interface
# |export
# Pipeline wiring: inputs produced by the `core` task and the artifacts
# this task writes. All paths live under a single output root, so build
# them from the dataset names instead of spelling each one out.
_OUTPUT_ROOT = "/home/ubuntu/vitmtsc_nbdev/output"
_CORE_DATASETS = (
    "FaceDetection",
    "InsectWingbeat",
    "PenDigits",
    "SpokenArabicDigits",
    "CharacterTrajectories",
)

upstream = {"core": {"nb": f"{_OUTPUT_ROOT}/00_core.html"}}
for _ds in _CORE_DATASETS:
    upstream["core"][f"{_ds}_TRAIN_TS"] = f"{_OUTPUT_ROOT}/{_ds}/ts/train/{_ds}_TRAIN.ts"
    upstream["core"][f"{_ds}_TEST_TS"] = f"{_OUTPUT_ROOT}/{_ds}/ts/test/{_ds}_TEST.ts"

product = {
    "nb": f"{_OUTPUT_ROOT}/101_data.face_detection.html",
    "FaceDetection_TRAIN_RAW": f"{_OUTPUT_ROOT}/FaceDetection/raw/train",
    "FaceDetection_VALID_RAW": f"{_OUTPUT_ROOT}/FaceDetection/raw/valid",
    "FaceDetection_TEST_RAW": f"{_OUTPUT_ROOT}/FaceDetection/raw/test",
}
Convert the FaceDetection dataset to Parquet format so that target encoding can be run on it.
#| export
# Dataset prepared by this notebook task.
DATASET_NAME = 'FaceDetection'
# Load the sktime .ts TRAIN split and flatten it into tabular form:
# one row per (case_id, reading_id) with dim_0..dim_143 channel values
# plus the class_vals label column (147 columns total, per the output below).
%%time
train = get_mtsc_data_tabular_from_ts(upstream['core']['FaceDetection_TRAIN_TS'])
train.shape
Reading dataset TS file...
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:928: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` data["dim_" + str(dim)] = instance_list[dim] /home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:934: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` data["class_vals"] = pd.Series(class_val_list)
Converting _x to tabular format...
/home/ubuntu/vitmtsc_nbdev/vitmtsc/core.py:52: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` data_x_tabular.reset_index(inplace=True)
Converting _y to tabular format... Merging _x and _y... CPU times: user 3min 3s, sys: 3.07 s, total: 3min 6s Wall time: 3min 6s
(365180, 147)
train.head()
case_id | reading_id | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | ... | dim_135 | dim_136 | dim_137 | dim_138 | dim_139 | dim_140 | dim_141 | dim_142 | dim_143 | class_vals | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | -0.075450 | 0.056080 | -0.824537 | 1.220979 | -0.762913 | -0.588704 | -1.358603 | -0.435668 | ... | -1.231982 | -0.715474 | -0.120353 | 0.092455 | 0.530208 | 0.720140 | -0.567580 | -0.234040 | -0.356189 | 0 |
1 | 0 | 1 | -0.336703 | -0.128013 | -0.746068 | 1.253613 | -1.248101 | -0.699180 | -1.609432 | -0.340918 | ... | -0.971470 | -0.564188 | -0.168357 | 0.483913 | 0.863468 | 0.763068 | -1.073942 | 0.104291 | -0.511199 | 0 |
2 | 0 | 2 | -0.278238 | -0.323847 | -0.482871 | 1.094015 | -1.498578 | -0.548781 | -1.655122 | -0.434355 | ... | -0.996615 | -0.443425 | -0.057533 | 0.711995 | 0.852293 | 0.694540 | -1.136367 | 0.327425 | -0.483072 | 0 |
3 | 0 | 3 | -0.101338 | -0.535844 | -0.355501 | 1.024461 | -1.582962 | -0.545213 | -1.607034 | -0.416748 | ... | -1.121799 | -0.308594 | -0.060117 | 0.800073 | 0.833964 | 0.670192 | -1.149557 | 0.414523 | -0.363002 | 0 |
4 | 0 | 4 | 0.117664 | -0.721767 | -0.163015 | 0.873627 | -1.537441 | -0.492561 | -1.449226 | -0.431590 | ... | -1.137925 | -0.189920 | -0.147974 | 1.047859 | 0.780197 | 0.531566 | -1.226237 | 0.380344 | -0.310250 | 0 |
5 rows × 147 columns
train['reading_id'].min(), train['reading_id'].max()
(0, 61)
train['class_vals'].unique()
array(['0', '1'], dtype=object)
# Flatten the TEST split with the same project helper used for TRAIN.
%%time
test = get_mtsc_data_tabular_from_ts(upstream['core']['FaceDetection_TEST_TS'])
test.shape
Reading dataset TS file...
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:928: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` data["dim_" + str(dim)] = instance_list[dim] /home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:934: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` data["class_vals"] = pd.Series(class_val_list)
Converting _x to tabular format...
/home/ubuntu/vitmtsc_nbdev/vitmtsc/core.py:52: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` data_x_tabular.reset_index(inplace=True)
Converting _y to tabular format... Merging _x and _y... CPU times: user 1min 47s, sys: 1.26 s, total: 1min 48s Wall time: 1min 48s
(218488, 147)
test.head()
case_id | reading_id | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | ... | dim_135 | dim_136 | dim_137 | dim_138 | dim_139 | dim_140 | dim_141 | dim_142 | dim_143 | class_vals | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 2.183168 | -0.355553 | 0.072301 | 1.195415 | -0.794009 | 0.153302 | 1.115395 | -1.176578 | ... | 0.044418 | -0.194282 | 0.367769 | 0.580047 | -0.112294 | -0.122612 | 0.161240 | 0.428348 | 0.915118 | 0 |
1 | 0 | 1 | 2.225861 | -0.520159 | 0.231310 | 1.215142 | -0.640171 | 0.294658 | 1.199156 | -1.113278 | ... | 0.226521 | -0.166871 | 0.339567 | 0.325842 | 0.038369 | 0.024042 | 0.273805 | 0.706688 | 1.212907 | 0 |
2 | 0 | 2 | 2.143762 | -0.567942 | 0.129102 | 1.207573 | -0.395744 | 0.150840 | 0.811009 | -1.061630 | ... | 0.287824 | -0.267554 | 0.195717 | 0.032441 | 0.227289 | 0.083741 | 0.178206 | 0.716761 | 1.392795 | 0 |
3 | 0 | 3 | 1.946997 | -0.546087 | 0.174260 | 1.085086 | -0.128849 | 0.188971 | 0.482807 | -0.986273 | ... | 0.409944 | -0.251355 | 0.168935 | 0.005684 | 0.375295 | 0.006296 | 0.140060 | 0.688435 | 1.482891 | 0 |
4 | 0 | 4 | 1.657394 | -0.417094 | 0.159386 | 0.933640 | 0.253186 | 0.199629 | 0.058581 | -0.892543 | ... | 0.548979 | -0.281089 | 0.107426 | -0.050389 | 0.518138 | -0.060584 | 0.105976 | 0.692968 | 1.513194 | 0 |
5 rows × 147 columns
test['reading_id'].min(), test['reading_id'].max()
(0, 61)
test['class_vals'].unique()
array(['0', '1'], dtype=object)
from sklearn.model_selection import train_test_split
# Split at the CASE level, not the row level: one (case_id, class_vals)
# row per case, so that all readings of a case end up in the same partition.
X = train[['case_id', 'class_vals']].drop_duplicates()
# NOTE(review): the split is not stratified on class_vals (no `stratify=`);
# the class counts shown below happen to be near-balanced anyway — confirm
# this is intended. random_state pins the split for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, X['class_vals'], train_size=0.8, random_state = 42)
X_train.case_id.nunique(), X_val.case_id.nunique()
(4712, 1178)
X_train.groupby(by = ['class_vals'], dropna = False).count()
case_id | |
---|---|
class_vals | |
0 | 2357 |
1 | 2355 |
X_val.groupby(by = ['class_vals'], dropna = False).count()
case_id | |
---|---|
class_vals | |
0 | 588 |
1 | 590 |
test[['case_id', 'class_vals']].drop_duplicates().groupby(by = ['class_vals'], dropna = False).count()
case_id | |
---|---|
class_vals | |
0 | 1762 |
1 | 1762 |
# Materialize the validation and training row sets by inner-joining the
# per-reading rows against the case-level split membership. Joining on the
# `case_id` column alone (instead of the full two-column X_val / X_train
# frames) keeps train's own `class_vals` column intact, so the
# class_vals_x / class_vals_y suffix reconciliation of the original code
# is unnecessary — the resulting frames are identical.
valid = train.merge(X_val[['case_id']], on=['case_id'], how='inner')
valid.case_id.nunique()
1178
train = train.merge(X_train[['case_id']], on=['case_id'], how='inner')
train.case_id.nunique()
4712
train.case_id.nunique(), valid.case_id.nunique(), test.case_id.nunique()
(4712, 1178, 3524)
train
case_id | reading_id | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | ... | dim_135 | dim_136 | dim_137 | dim_138 | dim_139 | dim_140 | dim_141 | dim_142 | dim_143 | class_vals | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | -0.075450 | 0.056080 | -0.824537 | 1.220979 | -0.762913 | -0.588704 | -1.358603 | -0.435668 | ... | -1.231982 | -0.715474 | -0.120353 | 0.092455 | 0.530208 | 0.720140 | -0.567580 | -0.234040 | -0.356189 | 0 |
1 | 0 | 1 | -0.336703 | -0.128013 | -0.746068 | 1.253613 | -1.248101 | -0.699180 | -1.609432 | -0.340918 | ... | -0.971470 | -0.564188 | -0.168357 | 0.483913 | 0.863468 | 0.763068 | -1.073942 | 0.104291 | -0.511199 | 0 |
2 | 0 | 2 | -0.278238 | -0.323847 | -0.482871 | 1.094015 | -1.498578 | -0.548781 | -1.655122 | -0.434355 | ... | -0.996615 | -0.443425 | -0.057533 | 0.711995 | 0.852293 | 0.694540 | -1.136367 | 0.327425 | -0.483072 | 0 |
3 | 0 | 3 | -0.101338 | -0.535844 | -0.355501 | 1.024461 | -1.582962 | -0.545213 | -1.607034 | -0.416748 | ... | -1.121799 | -0.308594 | -0.060117 | 0.800073 | 0.833964 | 0.670192 | -1.149557 | 0.414523 | -0.363002 | 0 |
4 | 0 | 4 | 0.117664 | -0.721767 | -0.163015 | 0.873627 | -1.537441 | -0.492561 | -1.449226 | -0.431590 | ... | -1.137925 | -0.189920 | -0.147974 | 1.047859 | 0.780197 | 0.531566 | -1.226237 | 0.380344 | -0.310250 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
292139 | 5889 | 57 | -0.230534 | -0.286763 | -0.387361 | 0.204795 | 0.728654 | 0.071708 | -0.487104 | -0.624672 | ... | -0.248141 | 0.225418 | -0.352202 | -0.494414 | -0.587478 | -0.297909 | -0.243174 | -0.198680 | 0.191251 | 1 |
292140 | 5889 | 58 | 0.012224 | -0.435941 | -0.804282 | -0.370081 | -0.367577 | -0.480024 | 0.306681 | -0.697584 | ... | -0.551751 | 0.526589 | -0.710969 | -1.478857 | -0.256933 | -0.223932 | -0.536537 | -0.463251 | 0.586670 | 1 |
292141 | 5889 | 59 | -0.463263 | -0.671191 | -0.916895 | -0.550672 | -0.149643 | -0.698187 | 0.959569 | -0.951837 | ... | 0.687394 | 1.478814 | 0.451611 | 0.970563 | 0.304664 | 0.257314 | 0.224479 | -0.125931 | -0.967319 | 1 |
292142 | 5889 | 60 | 0.080807 | 0.818791 | -1.227603 | 0.549396 | -0.161739 | -0.642325 | -1.194296 | -0.645866 | ... | -0.710146 | 0.124100 | 0.293746 | -1.267861 | -0.635904 | -0.718340 | -0.072564 | 0.782164 | 0.246511 | 1 |
292143 | 5889 | 61 | -0.222367 | -0.050930 | -1.741634 | -0.359941 | 0.814029 | -2.246209 | 0.895925 | 0.722281 | ... | -1.879624 | -2.142719 | -1.271458 | -2.572693 | -1.884100 | -1.644821 | -0.991384 | -1.900225 | -0.182463 | 1 |
292144 rows × 147 columns
#| export
import cudf
import dask_cudf
import pandas as pd
def write_parquet(pandas_df, output_dir, npartitions = 2):
    """Persist a tabular MTSC DataFrame as partitioned Parquet via the GPU.

    Parameters
    ----------
    pandas_df : pandas.DataFrame
        Tabular data containing at least `case_id` and `class_vals` columns;
        `class_vals` may hold the string labels '0'/'1'.
    output_dir : str
        Directory the dask-cudf Parquet partitions are written to.
    npartitions : int, default 2
        Number of dask-cudf partitions (and hence output files).

    Unlike the previous revision, the caller's DataFrame is NOT mutated.
    """
    # Re-encode string class labels '0'/'1' as integers 0/1. `assign`
    # produces a new frame, fixing the original in-place `replace` that
    # silently mutated the caller's DataFrame.
    pandas_df = pandas_df.assign(
        class_vals=pandas_df['class_vals'].replace(['0', '1'], [0, 1])
    )
    gdf = cudf.from_pandas(pandas_df)
    # Keep a duplicate of case_id; presumably a downstream transform
    # consumes/encodes `case_id` and needs the raw value preserved —
    # TODO confirm against the later pipeline tasks.
    gdf['case_id_seq'] = gdf['case_id']
    dask_gdf = dask_cudf.from_cudf(gdf, npartitions = npartitions)
    dask_gdf.to_parquet(output_dir)
import time
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
# Spin up a local GPU cluster (the status output below shows 4 workers,
# one Tesla T4 each). Each worker gets a 20 GB RMM pool backed by CUDA
# managed (unified) memory, and spills device memory once usage exceeds
# the 0.2 fraction set by device_memory_limit.
cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.2, rmm_pool_size='20GB', rmm_managed_memory=True)
client = Client(cluster)
client
2022-09-23 18:54:40,704 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 18:54:40,704 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 18:54:40,767 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 18:54:40,767 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 18:54:40,799 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 18:54:40,799 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 18:54:40,800 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 18:54:40,800 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Client-2e7ddbd8-3b71-11ed-80bf-0a01290f6f4b
Connection method: Cluster object | Cluster type: dask_cuda.LocalCUDACluster |
Dashboard: http://127.0.0.1:8787/status |
282ca355
Dashboard: http://127.0.0.1:8787/status | Workers: 4 |
Total threads: 4 | Total memory: 150.00 GiB |
Status: running | Using processes: True |
Scheduler-bf8e81f3-3e39-4fec-9853-356450829554
Comm: tcp://127.0.0.1:43645 | Workers: 4 |
Dashboard: http://127.0.0.1:8787/status | Total threads: 4 |
Started: Just now | Total memory: 150.00 GiB |
Comm: tcp://127.0.0.1:34961 | Total threads: 1 |
Dashboard: http://127.0.0.1:42675/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:43715 | |
Local directory: /tmp/dask-worker-space/worker-q9i1n3yw | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:36207 | Total threads: 1 |
Dashboard: http://127.0.0.1:35503/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:37645 | |
Local directory: /tmp/dask-worker-space/worker-lhqbsyr2 | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:32989 | Total threads: 1 |
Dashboard: http://127.0.0.1:35405/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:44361 | |
Local directory: /tmp/dask-worker-space/worker-pldmuhu9 | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:35997 | Total threads: 1 |
Dashboard: http://127.0.0.1:35201/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:45039 | |
Local directory: /tmp/dask-worker-space/worker-5xcck_aw | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Train Dataset
%%time
write_parquet(train, product['FaceDetection_TRAIN_RAW'])
CPU times: user 2.08 s, sys: 574 ms, total: 2.66 s Wall time: 4.56 s
train_gdf = dask_cudf.read_parquet(product['FaceDetection_TRAIN_RAW'])
train_gdf.head()
case_id | reading_id | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | ... | dim_136 | dim_137 | dim_138 | dim_139 | dim_140 | dim_141 | dim_142 | dim_143 | class_vals | case_id_seq | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | -0.075450 | 0.056080 | -0.824537 | 1.220979 | -0.762913 | -0.588704 | -1.358603 | -0.435668 | ... | -0.715474 | -0.120353 | 0.092455 | 0.530208 | 0.720140 | -0.567580 | -0.234040 | -0.356189 | 0 | 0 |
1 | 0 | 1 | -0.336703 | -0.128013 | -0.746068 | 1.253613 | -1.248101 | -0.699180 | -1.609432 | -0.340918 | ... | -0.564188 | -0.168357 | 0.483913 | 0.863468 | 0.763068 | -1.073942 | 0.104291 | -0.511199 | 0 | 0 |
2 | 0 | 2 | -0.278238 | -0.323847 | -0.482871 | 1.094015 | -1.498578 | -0.548781 | -1.655122 | -0.434355 | ... | -0.443425 | -0.057533 | 0.711995 | 0.852293 | 0.694540 | -1.136367 | 0.327425 | -0.483072 | 0 | 0 |
3 | 0 | 3 | -0.101338 | -0.535844 | -0.355501 | 1.024461 | -1.582962 | -0.545213 | -1.607034 | -0.416748 | ... | -0.308594 | -0.060117 | 0.800073 | 0.833964 | 0.670192 | -1.149557 | 0.414523 | -0.363002 | 0 | 0 |
4 | 0 | 4 | 0.117664 | -0.721767 | -0.163015 | 0.873627 | -1.537441 | -0.492561 | -1.449226 | -0.431590 | ... | -0.189920 | -0.147974 | 1.047859 | 0.780197 | 0.531566 | -1.226237 | 0.380344 | -0.310250 | 0 | 0 |
5 rows × 148 columns
train_gdf['reading_id'].min().compute(), train_gdf['reading_id'].max().compute()
(0, 61)
train_gdf.case_id.nunique().compute()
4712
Valid Dataset
%%time
write_parquet(valid, product['FaceDetection_VALID_RAW'])
CPU times: user 246 ms, sys: 89.8 ms, total: 335 ms Wall time: 469 ms
valid_gdf = dask_cudf.read_parquet(product['FaceDetection_VALID_RAW'])
valid_gdf.head()
case_id | reading_id | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | ... | dim_136 | dim_137 | dim_138 | dim_139 | dim_140 | dim_141 | dim_142 | dim_143 | class_vals | case_id_seq | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8 | 0 | -0.514330 | 0.848444 | 0.633715 | -0.386462 | -2.046528 | 1.090028 | 0.755727 | 0.302179 | ... | -0.042363 | 1.810685 | 1.473524 | 1.049092 | 0.384222 | 0.842596 | 2.357036 | -0.627867 | 0 | 8 |
1 | 8 | 1 | -0.319596 | 0.878676 | 0.482996 | -0.125028 | -1.570902 | 1.055667 | 0.420854 | 1.111848 | ... | -0.419235 | 1.521585 | 1.195183 | 0.929807 | 0.672089 | 0.338519 | 2.089587 | -0.387521 | 0 | 8 |
2 | 8 | 2 | -0.423044 | 0.967854 | 0.495505 | -0.094408 | -1.337488 | 1.109245 | 0.565454 | 1.250807 | ... | -0.830331 | 1.549919 | 1.464097 | 0.752714 | 0.890977 | 0.300028 | 2.026411 | -0.513880 | 0 | 8 |
3 | 8 | 3 | -0.457570 | 1.125700 | 0.436169 | -0.050398 | -1.154981 | 1.047152 | 0.718827 | 1.402167 | ... | -0.950836 | 1.466116 | 1.359329 | 0.777893 | 0.967682 | 0.244541 | 2.003905 | -0.466148 | 0 | 8 |
4 | 8 | 4 | -0.476584 | 1.367462 | 0.384900 | -0.086635 | -0.811420 | 0.986001 | 0.987596 | 1.482867 | ... | -1.171173 | 1.304991 | 1.274834 | 0.611228 | 1.086574 | 0.198109 | 1.894594 | -0.376624 | 0 | 8 |
5 rows × 148 columns
valid_gdf['reading_id'].min().compute(), valid_gdf['reading_id'].max().compute()
(0, 61)
valid_gdf.case_id.nunique().compute()
1178
Test Dataset
%%time
write_parquet(test, product['FaceDetection_TEST_RAW'])
CPU times: user 496 ms, sys: 208 ms, total: 704 ms Wall time: 985 ms
test_gdf = dask_cudf.read_parquet(product['FaceDetection_TEST_RAW'])
test_gdf.head()
case_id | reading_id | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | ... | dim_136 | dim_137 | dim_138 | dim_139 | dim_140 | dim_141 | dim_142 | dim_143 | class_vals | case_id_seq | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 2.183168 | -0.355553 | 0.072301 | 1.195415 | -0.794009 | 0.153302 | 1.115395 | -1.176578 | ... | -0.194282 | 0.367769 | 0.580047 | -0.112294 | -0.122612 | 0.161240 | 0.428348 | 0.915118 | 0 | 0 |
1 | 0 | 1 | 2.225861 | -0.520159 | 0.231310 | 1.215142 | -0.640171 | 0.294658 | 1.199156 | -1.113278 | ... | -0.166871 | 0.339567 | 0.325842 | 0.038369 | 0.024042 | 0.273805 | 0.706688 | 1.212907 | 0 | 0 |
2 | 0 | 2 | 2.143762 | -0.567942 | 0.129102 | 1.207573 | -0.395744 | 0.150840 | 0.811009 | -1.061630 | ... | -0.267554 | 0.195717 | 0.032441 | 0.227289 | 0.083741 | 0.178206 | 0.716761 | 1.392795 | 0 | 0 |
3 | 0 | 3 | 1.946997 | -0.546087 | 0.174260 | 1.085086 | -0.128849 | 0.188971 | 0.482807 | -0.986273 | ... | -0.251355 | 0.168935 | 0.005684 | 0.375295 | 0.006296 | 0.140060 | 0.688435 | 1.482891 | 0 | 0 |
4 | 0 | 4 | 1.657394 | -0.417094 | 0.159386 | 0.933640 | 0.253186 | 0.199629 | 0.058581 | -0.892543 | ... | -0.281089 | 0.107426 | -0.050389 | 0.518138 | -0.060584 | 0.105976 | 0.692968 | 1.513194 | 0 | 0 |
5 rows × 148 columns
test_gdf['reading_id'].min().compute(), test_gdf['reading_id'].max().compute()
(0, 61)
test_gdf.case_id.nunique().compute()
3524
We shut down the Dask client and reset the kernel at this point to release cluster resources.
%%time
# Tear down the Dask CUDA cluster to release the GPUs. The CancelledError
# traceback in the output below is emitted while the client loses its
# scheduler connection during shutdown — expected noise, not a failure.
client.shutdown()
client.close()
Traceback (most recent call last): File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper return await func(*args, **kwargs) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect await self._ensure_connected(timeout=timeout) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected comm = await connect( File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect await asyncio.sleep(backoff) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep return await future asyncio.exceptions.CancelledError Traceback (most recent call last): File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper return await func(*args, **kwargs) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1400, in _handle_report await self._reconnect() File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper return await func(*args, **kwargs) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect await self._ensure_connected(timeout=timeout) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected comm = await connect( File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect await asyncio.sleep(backoff) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep 
return await future asyncio.exceptions.CancelledError
CPU times: user 27.3 ms, sys: 19.6 ms, total: 46.9 ms Wall time: 616 ms
# Export the `#| export`-tagged cells of this notebook into the vitmtsc package.
from nbdev import nbdev_export
nbdev_export()