#| default_exp data.insect_wingbeat
%load_ext autoreload
%autoreload 2
# Declare the list of upstream tasks whose products this notebook uses as inputs.
upstream = ['core']
# Parameters
# NOTE(review): presumably injected by the pipeline runner -- confirm. These
# assignments rebind `upstream` from the task list above to a dict of concrete
# product paths and define where this notebook's own outputs (`product`) go.
upstream = {"core": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html", "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts", "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts", "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts", "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts", "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts", "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts", "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts", "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts", "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts", "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/102_data.insect_wingbeat.html", "InsectWingbeat_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/train", "InsectWingbeat_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/valid", "InsectWingbeat_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/test"}
#| hide
from nbdev.showdoc import *
#| export
from vitmtsc import *
from vitmtsc.core import *
import dask_cudf
import gc #garbage collector interface
#| export
# Exported copy of the pipeline paths so the generated module carries them too.
# `upstream["core"]` maps product names of the `core` task to the .ts files
# (train/test) of each dataset; only the InsectWingbeat_* entries are read in
# the code visible here.
# NOTE(review): these are absolute paths under /home/ubuntu -- confirm they are
# meant to be baked into the exported module.
upstream = {
"core": {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html",
"FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
"FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
"InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
"InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
"PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
"PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
"SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
"SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
"CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
"CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
}
}
# Output locations of this notebook: the rendered notebook plus one Parquet
# directory per split (train / valid / test).
product = {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/102_data.insect_wingbeat.html",
"InsectWingbeat_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/train",
"InsectWingbeat_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/valid",
"InsectWingbeat_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/test",
}
Convert the dataset to Parquet format so that target encoding can be run on it.
#| export
# Name of the dataset handled by this module; it matches the prefix of the
# InsectWingbeat_* keys in `upstream` and `product`.
DATASET_NAME = 'InsectWingbeat'
%%time
# Parse the InsectWingbeat training .ts file into a tabular frame with one row
# per (case_id, reading_id). The recorded run below took about 16 minutes.
train = get_mtsc_data_tabular_from_ts(upstream['core']['InsectWingbeat_TRAIN_TS'])
# (rows, columns) of the parsed training frame.
train.shape
Reading dataset TS file...
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:928: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` data["dim_" + str(dim)] = instance_list[dim] /home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:934: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` data["class_vals"] = pd.Series(class_val_list)
Converting _x to tabular format...
/home/ubuntu/vitmtsc_nbdev/vitmtsc/core.py:52: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` data_x_tabular.reset_index(inplace=True)
Converting _y to tabular format... Merging _x and _y... CPU times: user 16min 31s, sys: 10.1 s, total: 16min 41s Wall time: 16min 41s
(167974, 203)
train.head()
case_id | reading_id | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | ... | dim_191 | dim_192 | dim_193 | dim_194 | dim_195 | dim_196 | dim_197 | dim_198 | dim_199 | class_vals | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | -0.000526 | 0.000426 | -0.000490 | 0.000469 | -0.000361 | 0.000117 | 0.000013 | -0.000065 | ... | -0.000787 | 0.000518 | -0.000294 | -0.000110 | 0.000469 | -0.000530 | 0.000652 | -0.000818 | -0.000818 | aedes_female |
1 | 0 | 1 | -0.000680 | 0.000293 | -0.000443 | -0.000035 | -0.000393 | 0.000220 | -0.000136 | 0.000325 | ... | -0.000274 | -0.000250 | -0.000033 | 0.000191 | 0.000030 | -0.000092 | 0.000526 | 0.000248 | 0.000248 | aedes_female |
2 | 0 | 2 | -0.010782 | 0.013112 | -0.012667 | 0.009561 | -0.011605 | 0.014131 | -0.012283 | 0.014958 | ... | 0.142569 | 0.053184 | -0.026049 | 0.021518 | -0.021696 | 0.019203 | -0.020744 | 0.026454 | 0.026454 | aedes_female |
3 | 0 | 3 | 0.024827 | -0.035617 | -0.010840 | -0.342877 | -0.214020 | 0.001512 | -0.002940 | -0.006419 | ... | 0.656476 | 0.237110 | -0.010011 | 0.002020 | -0.000191 | 0.001416 | 0.000318 | 0.000044 | 0.000044 | aedes_female |
4 | 0 | 4 | 0.019104 | -0.000091 | -0.971610 | -2.116380 | -0.333796 | 0.015550 | 0.009869 | 0.017413 | ... | -0.021847 | 0.022194 | 0.020843 | 0.020790 | 0.021171 | 0.020698 | 0.020344 | 0.020116 | 0.020116 | aedes_female |
5 rows × 203 columns
train['reading_id'].min(), train['reading_id'].max()
(0, 21)
train['class_vals'].unique()
array(['aedes_female', 'aedes_male', 'fruit_flies', 'house_flies', 'quinx_female', 'quinx_male', 'stigma_female', 'stigma_male', 'tarsalis_female', 'tarsalis_male'], dtype=object)
%%time
# Parse the InsectWingbeat test .ts file into a tabular frame with one row per
# (case_id, reading_id). The recorded run below took about 16 minutes.
test = get_mtsc_data_tabular_from_ts(upstream['core']['InsectWingbeat_TEST_TS'])
# (rows, columns) of the parsed test frame.
test.shape
Reading dataset TS file...
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:928: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` data["dim_" + str(dim)] = instance_list[dim] /home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/sktime/datasets/_data_io.py:934: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` data["class_vals"] = pd.Series(class_val_list)
Converting _x to tabular format...
/home/ubuntu/vitmtsc_nbdev/vitmtsc/core.py:52: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` data_x_tabular.reset_index(inplace=True)
Converting _y to tabular format... Merging _x and _y... CPU times: user 16min 22s, sys: 6.39 s, total: 16min 28s Wall time: 16min 28s
(167560, 203)
test.head()
case_id | reading_id | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | ... | dim_191 | dim_192 | dim_193 | dim_194 | dim_195 | dim_196 | dim_197 | dim_198 | dim_199 | class_vals | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | -0.002947 | 0.002920 | -0.002758 | 0.002282 | -0.001377 | 0.000128 | 0.001152 | -0.002060 | ... | -0.000081 | 0.000090 | -0.000068 | 0.000031 | -0.000007 | 0.000014 | -0.000049 | 0.000087 | 0.000087 | aedes_female |
1 | 0 | 1 | -0.053421 | 0.011923 | -0.006867 | 0.027616 | 0.045003 | -0.094597 | 0.014191 | 0.022102 | ... | -0.000171 | 0.000134 | 0.000284 | 0.000281 | 0.000343 | 0.000118 | 0.000088 | -0.000291 | -0.000291 | aedes_female |
2 | 0 | 2 | -0.049052 | -0.020504 | 0.000670 | -0.000306 | 0.063919 | 0.024317 | -0.004142 | 0.002782 | ... | -0.000002 | -0.000031 | -0.000027 | 0.000070 | 0.000052 | 0.000011 | 0.000190 | 0.000467 | 0.000467 | aedes_female |
3 | 0 | 3 | 0.046457 | -0.555692 | 0.194275 | 0.016277 | -0.024176 | 0.480148 | -0.058053 | -0.003973 | ... | -0.001096 | 0.000840 | -0.001102 | 0.001148 | -0.001079 | 0.001008 | -0.001456 | 0.010817 | 0.010817 | aedes_female |
4 | 0 | 4 | -0.896564 | 7.723570 | -0.871859 | -1.611516 | -0.154569 | -0.239838 | 0.765347 | -0.284888 | ... | -0.013012 | 0.013401 | -0.014279 | 0.015049 | -0.014547 | 0.015267 | -0.015893 | 0.017930 | 0.017930 | aedes_female |
5 rows × 203 columns
test['reading_id'].min(), test['reading_id'].max()
(0, 21)
test['class_vals'].unique()
array(['aedes_female', 'aedes_male', 'fruit_flies', 'house_flies', 'quinx_female', 'quinx_male', 'stigma_female', 'stigma_male', 'tarsalis_female', 'tarsalis_male'], dtype=object)
from sklearn.model_selection import train_test_split
# Split at the case level rather than the row level: every case_id spans several
# reading_id rows, so deduplicate to one (case_id, class_vals) row per case first.
X = train[['case_id', 'class_vals']].drop_duplicates()
# 80/20 train/validation split with a fixed seed so the split is reproducible.
# NOTE(review): no `stratify=` is passed, so per-class counts differ between the
# two splits (see the groupby counts below) -- confirm this is intended.
X_train, X_val, y_train, y_val = train_test_split(X, X['class_vals'], train_size=0.8, random_state = 42)
# Number of distinct cases in each split.
X_train.case_id.nunique(), X_val.case_id.nunique()
(20000, 5000)
X_train.groupby(by = ['class_vals'], dropna = False).count()
case_id | |
---|---|
class_vals | |
aedes_female | 2007 |
aedes_male | 1972 |
fruit_flies | 2007 |
house_flies | 1993 |
quinx_female | 2006 |
quinx_male | 2009 |
stigma_female | 2015 |
stigma_male | 1996 |
tarsalis_female | 1997 |
tarsalis_male | 1998 |
X_val.groupby(by = ['class_vals'], dropna = False).count()
case_id | |
---|---|
class_vals | |
aedes_female | 493 |
aedes_male | 528 |
fruit_flies | 493 |
house_flies | 507 |
quinx_female | 494 |
quinx_male | 491 |
stigma_female | 485 |
stigma_male | 504 |
tarsalis_female | 503 |
tarsalis_male | 502 |
test[['case_id', 'class_vals']].drop_duplicates().groupby(by = ['class_vals'], dropna = False).count()
case_id | |
---|---|
class_vals | |
aedes_female | 2500 |
aedes_male | 2500 |
fruit_flies | 2500 |
house_flies | 2500 |
quinx_female | 2500 |
quinx_male | 2500 |
stigma_female | 2500 |
stigma_male | 2500 |
tarsalis_female | 2500 |
tarsalis_male | 2500 |
# Keep only the readings of the cases assigned to the validation split.
# Merging against the key column alone (instead of the two-column X_val) means
# pandas never creates the suffixed class_vals_x / class_vals_y duplicates, so
# the copy-back-and-drop step is unnecessary. Row order, column order and the
# fresh 0..n-1 index are the same as with the original merge.
# This must run before `train` is narrowed to the training cases.
valid = train.merge(X_val[['case_id']], on=['case_id'], how='inner')
# Number of distinct validation cases.
valid.case_id.nunique()
5000
# Narrow `train` to the readings of the cases assigned to the training split.
# Merging against the key column alone (instead of the two-column X_train)
# means pandas never creates the suffixed class_vals_x / class_vals_y
# duplicates, so the copy-back-and-drop step is unnecessary. Row order, column
# order and the fresh 0..n-1 index are the same as with the original merge.
train = train.merge(X_train[['case_id']], on=['case_id'], how='inner')
# Number of distinct training cases.
train.case_id.nunique()
20000
train.case_id.nunique(), valid.case_id.nunique(), test.case_id.nunique()
(20000, 5000, 25000)
train
case_id | reading_id | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | ... | dim_191 | dim_192 | dim_193 | dim_194 | dim_195 | dim_196 | dim_197 | dim_198 | dim_199 | class_vals | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | -0.000526 | 0.000426 | -0.000490 | 0.000469 | -0.000361 | 0.000117 | 0.000013 | -0.000065 | ... | -0.000787 | 0.000518 | -0.000294 | -0.000110 | 0.000469 | -0.000530 | 0.000652 | -0.000818 | -0.000818 | aedes_female |
1 | 0 | 1 | -0.000680 | 0.000293 | -0.000443 | -0.000035 | -0.000393 | 0.000220 | -0.000136 | 0.000325 | ... | -0.000274 | -0.000250 | -0.000033 | 0.000191 | 0.000030 | -0.000092 | 0.000526 | 0.000248 | 0.000248 | aedes_female |
2 | 0 | 2 | -0.010782 | 0.013112 | -0.012667 | 0.009561 | -0.011605 | 0.014131 | -0.012283 | 0.014958 | ... | 0.142569 | 0.053184 | -0.026049 | 0.021518 | -0.021696 | 0.019203 | -0.020744 | 0.026454 | 0.026454 | aedes_female |
3 | 0 | 3 | 0.024827 | -0.035617 | -0.010840 | -0.342877 | -0.214020 | 0.001512 | -0.002940 | -0.006419 | ... | 0.656476 | 0.237110 | -0.010011 | 0.002020 | -0.000191 | 0.001416 | 0.000318 | 0.000044 | 0.000044 | aedes_female |
4 | 0 | 4 | 0.019104 | -0.000091 | -0.971610 | -2.116380 | -0.333796 | 0.015550 | 0.009869 | 0.017413 | ... | -0.021847 | 0.022194 | 0.020843 | 0.020790 | 0.021171 | 0.020698 | 0.020344 | 0.020116 | 0.020116 | aedes_female |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
134367 | 24998 | 2 | -0.005761 | 0.001334 | -0.000173 | -0.000088 | 0.000262 | -0.000555 | -0.000152 | -0.000119 | ... | 0.000115 | 0.000040 | 0.000218 | 0.000100 | -0.000062 | 0.000069 | 0.000198 | 0.000027 | 0.000027 | tarsalis_male |
134368 | 24998 | 3 | 1.191404 | -0.659416 | -0.660178 | 0.631950 | -0.168012 | 0.436480 | 0.223098 | -0.251304 | ... | 0.001960 | -0.002290 | 0.002210 | -0.002204 | 0.002151 | -0.002175 | 0.002536 | -0.002290 | -0.002290 | tarsalis_male |
134369 | 24998 | 4 | 2.882338 | -4.776926 | 4.540706 | -3.988603 | 0.156886 | -1.182728 | -0.801508 | 0.878964 | ... | 0.006013 | -0.007888 | 0.010264 | -0.012222 | 0.006145 | -0.004855 | 0.002678 | -0.006170 | -0.006170 | tarsalis_male |
134370 | 24998 | 5 | -5.569411 | 1.057257 | 8.147614 | 3.205241 | 0.742497 | 0.485884 | 0.640577 | 0.164989 | ... | -0.000266 | 0.014064 | -0.026576 | 0.025557 | -0.020304 | 0.001435 | 0.007384 | -0.026695 | -0.026695 | tarsalis_male |
134371 | 24998 | 6 | -0.211579 | -0.210664 | -0.205127 | -0.190088 | -0.165624 | -0.140203 | -0.126331 | -0.131375 | ... | -0.006157 | -0.004272 | -0.004563 | -0.006921 | -0.009405 | -0.009900 | -0.007943 | -0.005148 | -0.005148 | tarsalis_male |
134372 rows × 203 columns
#| export
import cudf
import dask_cudf
import pandas as pd
def write_parquet(pandas_df, output_dir, npartitions = 2):
    """Encode the class labels as integers and write the frame as Parquet.

    The ten InsectWingbeat class names found in ``class_vals`` are replaced by
    the integer codes 0-9 (in the order listed below); any other value is left
    as it is. A ``case_id_seq`` column is added as a copy of ``case_id``, and
    the result is written to ``output_dir`` through dask_cudf.

    The caller's DataFrame is left untouched: the encoding is applied to a
    copy rather than through an in-place ``replace`` on a selected column.

    Args:
        pandas_df: pandas DataFrame holding at least the ``case_id`` and
            ``class_vals`` columns.
        output_dir: Destination passed to ``to_parquet``.
        npartitions: Number of dask_cudf partitions the frame is split into.
    """
    class_names = [
        'aedes_female',
        'aedes_male',
        'fruit_flies',
        'house_flies',
        'quinx_female',
        'quinx_male',
        'stigma_female',
        'stigma_male',
        'tarsalis_female',
        'tarsalis_male',
    ]
    class_codes = list(range(len(class_names)))

    # replace() returns a new Series; infer_objects() keeps the codes integer
    # typed on pandas versions where replace() no longer downcasts object
    # columns by itself.
    encoded_labels = (
        pandas_df['class_vals']
        .replace(class_names, class_codes)
        .infer_objects()
    )
    # assign() builds a new frame, so the input keeps its string labels.
    encoded_df = pandas_df.assign(class_vals=encoded_labels)

    gdf = cudf.from_pandas(encoded_df)
    gdf['case_id_seq'] = gdf['case_id']
    dask_gdf = dask_cudf.from_cudf(gdf, npartitions = npartitions)
    dask_gdf.to_parquet(output_dir)
# NOTE(review): `time` is not used anywhere in the visible code.
import time
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
# Start a local Dask cluster with GPU workers and connect a client to it; the
# recorded run below shows four workers, each on a Tesla T4.
# NOTE(review): rmm_pool_size='20GB' is larger than the 14.76 GiB per-GPU memory
# reported below; presumably this relies on rmm_managed_memory=True -- confirm.
cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.2, rmm_pool_size='20GB', rmm_managed_memory=True)
client = Client(cluster)
# Display the client/cluster summary.
client
2022-09-23 19:33:31,833 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:33:31,833 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 19:33:31,846 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:33:31,846 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 19:33:31,937 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:33:31,937 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 19:33:32,042 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:33:32,042 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Client-9bf2d02a-3b76-11ed-80d2-06c0bb745397
Connection method: Cluster object | Cluster type: dask_cuda.LocalCUDACluster |
Dashboard: http://127.0.0.1:8787/status |
418a3880
Dashboard: http://127.0.0.1:8787/status | Workers: 4 |
Total threads: 4 | Total memory: 150.00 GiB |
Status: running | Using processes: True |
Scheduler-cf6e00c1-8680-4bf2-b303-6d78558f9eb3
Comm: tcp://127.0.0.1:42227 | Workers: 4 |
Dashboard: http://127.0.0.1:8787/status | Total threads: 4 |
Started: Just now | Total memory: 150.00 GiB |
Comm: tcp://127.0.0.1:35335 | Total threads: 1 |
Dashboard: http://127.0.0.1:42545/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:44269 | |
Local directory: /tmp/dask-worker-space/worker-mwbqkchz | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:46213 | Total threads: 1 |
Dashboard: http://127.0.0.1:46245/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:33989 | |
Local directory: /tmp/dask-worker-space/worker-iuiqwyz_ | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:35837 | Total threads: 1 |
Dashboard: http://127.0.0.1:35249/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:38933 | |
Local directory: /tmp/dask-worker-space/worker-epogeuhe | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:39187 | Total threads: 1 |
Dashboard: http://127.0.0.1:33849/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:42833 | |
Local directory: /tmp/dask-worker-space/worker-ojdxshkb | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Train Dataset
%%time
# Write the training split to the InsectWingbeat_TRAIN_RAW product directory.
write_parquet(train, product['InsectWingbeat_TRAIN_RAW'])
CPU times: user 2.57 s, sys: 488 ms, total: 3.05 s Wall time: 4.97 s
# Read the training Parquet output back with dask_cudf as a sanity check.
train_gdf = dask_cudf.read_parquet(product['InsectWingbeat_TRAIN_RAW'])
# Preview: class_vals now holds integer codes and case_id_seq mirrors case_id.
train_gdf.head()
case_id | reading_id | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | ... | dim_192 | dim_193 | dim_194 | dim_195 | dim_196 | dim_197 | dim_198 | dim_199 | class_vals | case_id_seq | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | -0.000526 | 0.000426 | -0.000490 | 0.000469 | -0.000361 | 0.000117 | 0.000013 | -0.000065 | ... | 0.000518 | -0.000294 | -0.000110 | 0.000469 | -0.000530 | 0.000652 | -0.000818 | -0.000818 | 0 | 0 |
1 | 0 | 1 | -0.000680 | 0.000293 | -0.000443 | -0.000035 | -0.000393 | 0.000220 | -0.000136 | 0.000325 | ... | -0.000250 | -0.000033 | 0.000191 | 0.000030 | -0.000092 | 0.000526 | 0.000248 | 0.000248 | 0 | 0 |
2 | 0 | 2 | -0.010782 | 0.013112 | -0.012667 | 0.009561 | -0.011605 | 0.014131 | -0.012283 | 0.014958 | ... | 0.053184 | -0.026049 | 0.021518 | -0.021696 | 0.019203 | -0.020744 | 0.026454 | 0.026454 | 0 | 0 |
3 | 0 | 3 | 0.024827 | -0.035617 | -0.010840 | -0.342877 | -0.214020 | 0.001512 | -0.002940 | -0.006419 | ... | 0.237110 | -0.010011 | 0.002020 | -0.000191 | 0.001416 | 0.000318 | 0.000044 | 0.000044 | 0 | 0 |
4 | 0 | 4 | 0.019104 | -0.000091 | -0.971610 | -2.116380 | -0.333796 | 0.015550 | 0.009869 | 0.017413 | ... | 0.022194 | 0.020843 | 0.020790 | 0.021171 | 0.020698 | 0.020344 | 0.020116 | 0.020116 | 0 | 0 |
5 rows × 204 columns
train_gdf['reading_id'].min().compute(), train_gdf['reading_id'].max().compute()
(0, 21)
train_gdf.case_id.nunique().compute()
20000
Validation Dataset
%%time
# Write the validation split to the InsectWingbeat_VALID_RAW product directory.
write_parquet(valid, product['InsectWingbeat_VALID_RAW'])
CPU times: user 430 ms, sys: 32.1 ms, total: 462 ms Wall time: 605 ms
# Read the validation Parquet output back with dask_cudf as a sanity check.
valid_gdf = dask_cudf.read_parquet(product['InsectWingbeat_VALID_RAW'])
# Preview: class_vals now holds integer codes and case_id_seq mirrors case_id.
valid_gdf.head()
case_id | reading_id | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | ... | dim_192 | dim_193 | dim_194 | dim_195 | dim_196 | dim_197 | dim_198 | dim_199 | class_vals | case_id_seq | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17 | 0 | -0.674501 | -0.044123 | -0.043462 | 0.056184 | -0.058077 | 0.057803 | -0.059285 | 0.061007 | ... | -0.003337 | 0.002940 | -0.003611 | 0.004123 | -0.005298 | 0.006879 | -0.006634 | -0.006634 | 0 | 17 |
1 | 17 | 1 | 0.299125 | -1.862445 | -0.421858 | -0.241298 | 0.175858 | -0.171911 | 0.204115 | -0.255530 | ... | -0.023787 | -0.007275 | 0.003402 | 0.000278 | -0.010965 | 0.002707 | 0.003182 | 0.003182 | 0 | 17 |
2 | 17 | 2 | -0.428115 | -0.339370 | 1.075969 | 0.141863 | 0.087708 | 0.049868 | 0.392346 | 0.659165 | ... | -0.008609 | -0.014546 | -0.000611 | -0.016076 | 0.012444 | -0.017995 | 0.011532 | 0.011532 | 0 | 17 |
3 | 17 | 3 | -0.066288 | -0.915256 | -0.051801 | -0.236039 | -0.119390 | -0.280049 | -0.128623 | -0.341739 | ... | 0.013279 | 0.001198 | 0.019783 | 0.001476 | 0.004445 | -0.000844 | 0.005537 | 0.005537 | 0 | 17 |
4 | 17 | 4 | -1.573770 | 1.404155 | -0.310456 | -0.053030 | -0.021343 | -0.145558 | 0.173446 | 0.167817 | ... | 0.012897 | 0.013816 | 0.003628 | -0.010986 | 0.013594 | 0.017745 | 0.012654 | 0.012654 | 0 | 17 |
5 rows × 204 columns
valid_gdf['reading_id'].min().compute(), valid_gdf['reading_id'].max().compute()
(0, 20)
valid_gdf.case_id.nunique().compute()
5000
Test Dataset
%%time
# Write the test split to the InsectWingbeat_TEST_RAW product directory.
write_parquet(test, product['InsectWingbeat_TEST_RAW'])
CPU times: user 1.34 s, sys: 328 ms, total: 1.67 s Wall time: 1.99 s
# Read the test Parquet output back with dask_cudf as a sanity check.
test_gdf = dask_cudf.read_parquet(product['InsectWingbeat_TEST_RAW'])
# Preview: class_vals now holds integer codes and case_id_seq mirrors case_id.
test_gdf.head()
case_id | reading_id | dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | ... | dim_192 | dim_193 | dim_194 | dim_195 | dim_196 | dim_197 | dim_198 | dim_199 | class_vals | case_id_seq | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | -0.002947 | 0.002920 | -0.002758 | 0.002282 | -0.001377 | 0.000128 | 0.001152 | -0.002060 | ... | 0.000090 | -0.000068 | 0.000031 | -0.000007 | 0.000014 | -0.000049 | 0.000087 | 0.000087 | 0 | 0 |
1 | 0 | 1 | -0.053421 | 0.011923 | -0.006867 | 0.027616 | 0.045003 | -0.094597 | 0.014191 | 0.022102 | ... | 0.000134 | 0.000284 | 0.000281 | 0.000343 | 0.000118 | 0.000088 | -0.000291 | -0.000291 | 0 | 0 |
2 | 0 | 2 | -0.049052 | -0.020504 | 0.000670 | -0.000306 | 0.063919 | 0.024317 | -0.004142 | 0.002782 | ... | -0.000031 | -0.000027 | 0.000070 | 0.000052 | 0.000011 | 0.000190 | 0.000467 | 0.000467 | 0 | 0 |
3 | 0 | 3 | 0.046457 | -0.555692 | 0.194275 | 0.016277 | -0.024176 | 0.480148 | -0.058053 | -0.003973 | ... | 0.000840 | -0.001102 | 0.001148 | -0.001079 | 0.001008 | -0.001456 | 0.010817 | 0.010817 | 0 | 0 |
4 | 0 | 4 | -0.896564 | 7.723570 | -0.871859 | -1.611516 | -0.154569 | -0.239838 | 0.765347 | -0.284888 | ... | 0.013401 | -0.014279 | 0.015049 | -0.014547 | 0.015267 | -0.015893 | 0.017930 | 0.017930 | 0 | 0 |
5 rows × 204 columns
test_gdf['reading_id'].min().compute(), test_gdf['reading_id'].max().compute()
(0, 21)
test_gdf.case_id.nunique().compute()
25000
We reset the kernel!!!
%%time
client.shutdown()
client.close()
Traceback (most recent call last): File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper return await func(*args, **kwargs) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect await self._ensure_connected(timeout=timeout) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected comm = await connect( File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect await asyncio.sleep(backoff) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep return await future asyncio.exceptions.CancelledError Traceback (most recent call last): File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper return await func(*args, **kwargs) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1400, in _handle_report await self._reconnect() File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper return await func(*args, **kwargs) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect await self._ensure_connected(timeout=timeout) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected comm = await connect( File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect await asyncio.sleep(backoff) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep 
return await future asyncio.exceptions.CancelledError
CPU times: user 40 ms, sys: 8.11 ms, total: 48.1 ms Wall time: 629 ms
from nbdev import nbdev_export
# Export the cells marked `#| export` into the module named by `default_exp`.
nbdev_export()