In [1]:
#| default_exp feature_preprocessing.face_detection.tabular_to_timeseries
# Auto-reload edited project modules so changes in vitmtsc/*.py are picked up live.
%load_ext autoreload
%autoreload 2
In [2]:
# Declare the upstream tasks whose products this notebook consumes.
# Ploomber replaces this with the injected "Parameters" cell below at run time.
upstream = ['feature_preprocessing_face_detection']
In [3]:
# Parameters
# (cell injected by ploomber at run time — do not edit by hand)
upstream = {"feature_preprocessing_face_detection": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/201_feature_preprocessing.face_detection.target_encoding.html", "FaceDetection_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding/train", "FaceDetection_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding/valid", "FaceDetection_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding/test", "FaceDetection_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding/nvtabular_workflow"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/301_feature_preprocessing.face_detection.tabular_to_timeseries.html", "FaceDetection_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/train", "FaceDetection_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/valid", "FaceDetection_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/test"}
In [4]:
#| hide
# nbdev documentation helpers; this cell is hidden from the exported module.
from nbdev.showdoc import *
In [5]:
#| export
# NOTE(review): wildcard imports follow this project's nbdev convention but
# obscure where names such as `convert_from_tabular_to_timeseries_format`
# come from — presumably one of the vitmtsc modules below; confirm before
# refactoring to explicit imports.
from vitmtsc import *
from vitmtsc.core import *
from vitmtsc.data.face_detection import *
from vitmtsc.feature_preprocessing.face_detection.target_encoding import *
import os
import glob
In [6]:
#| export
# NOTE(review): this exported cell hardcodes the same absolute paths that
# ploomber injects in the "Parameters" cell above, so the exported module can
# run outside the pipeline. Presumably these must be kept in sync with the
# pipeline definition — TODO confirm, and consider deriving them from a
# configurable base directory instead of absolute /home/ubuntu paths.
upstream = {
    "feature_preprocessing_face_detection": {
        "nb": "/home/ubuntu/vitmtsc_nbdev/output/201_feature_preprocessing.face_detection.target_encoding.html",
        "FaceDetection_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding/train",
        "FaceDetection_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding/valid",
        "FaceDetection_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding/test",
        "FaceDetection_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding/nvtabular_workflow",
    }
}
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/301_feature_preprocessing.face_detection.tabular_to_timeseries.html",
    "FaceDetection_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/train",
    "FaceDetection_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/valid",
    "FaceDetection_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/test",
}

Feature Preprocessing for Neural Networks - III¶

Convert Target Encoding data from tabular (one row per reading) to time-series (one row per case) format

In [7]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# One Dask worker per visible GPU; spill device memory past 50% utilisation
# and pre-allocate a 20 GB managed RMM pool per worker.
_cluster_kwargs = dict(
    memory_limit='auto',
    device_memory_limit=0.5,
    rmm_pool_size='20GB',
    rmm_managed_memory=True,
)
cluster = LocalCUDACluster(**_cluster_kwargs)
client = Client(cluster)
client
2022-09-23 18:56:39,926 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 18:56:39,926 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 18:56:39,926 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 18:56:39,926 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 18:56:39,926 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 18:56:39,926 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 18:56:39,947 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 18:56:39,947 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Out[7]:

Client

Client-757926df-3b71-11ed-812f-0a01290f6f4b

Connection method: Cluster object Cluster type: dask_cuda.LocalCUDACluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

LocalCUDACluster

ad3b8e5f

Dashboard: http://127.0.0.1:8787/status Workers: 4
Total threads: 4 Total memory: 150.00 GiB
Status: running Using processes: True

Scheduler Info

Scheduler

Scheduler-2cdea13b-ae78-48b9-8900-faf9610ec3b4

Comm: tcp://127.0.0.1:44691 Workers: 4
Dashboard: http://127.0.0.1:8787/status Total threads: 4
Started: Just now Total memory: 150.00 GiB

Workers

Worker: 0

Comm: tcp://127.0.0.1:42929 Total threads: 1
Dashboard: http://127.0.0.1:38577/status Memory: 37.50 GiB
Nanny: tcp://127.0.0.1:46671
Local directory: /tmp/dask-worker-space/worker-om6la5d8
GPU: Tesla T4 GPU memory: 14.76 GiB

Worker: 1

Comm: tcp://127.0.0.1:38175 Total threads: 1
Dashboard: http://127.0.0.1:36947/status Memory: 37.50 GiB
Nanny: tcp://127.0.0.1:41129
Local directory: /tmp/dask-worker-space/worker-y93igzpe
GPU: Tesla T4 GPU memory: 14.76 GiB

Worker: 2

Comm: tcp://127.0.0.1:43017 Total threads: 1
Dashboard: http://127.0.0.1:38615/status Memory: 37.50 GiB
Nanny: tcp://127.0.0.1:38925
Local directory: /tmp/dask-worker-space/worker-ho0h83c4
GPU: Tesla T4 GPU memory: 14.76 GiB

Worker: 3

Comm: tcp://127.0.0.1:45957 Total threads: 1
Dashboard: http://127.0.0.1:35493/status Memory: 37.50 GiB
Nanny: tcp://127.0.0.1:34417
Local directory: /tmp/dask-worker-space/worker-qdu8xfmu
GPU: Tesla T4 GPU memory: 14.76 GiB
In [8]:
#| export
# Dataset-level constants for the FaceDetection multivariate time-series set.
DATASET_NAME = 'FaceDetection'
SEQUENCE_LENGTH = 62        # readings (time steps) per case — reading ids run 0..61
NUMBER_OF_FEATURES = 144    # feature dimensions per time step (dim_0 .. dim_143)
NUM_TARGET = 2              # number of target classes

Convert from Tabular to Time-Series Format

In [9]:
#| export
# Per-time-step feature column names 'dim_0' ... 'dim_143'. Generated with a
# comprehension instead of hand-listing all 144 strings (eliminates typo/gap
# risk). The range must stay in sync with NUMBER_OF_FEATURES (144) above.
MTSC_COLUMN_NAMES = [f'dim_{i}' for i in range(144)]
In [10]:
#| export
# Full tabular schema: case identifiers and reading index, the 144 feature
# dimensions, then the class label column.
ALL_COLUMNS = ['case_id', 'case_id_seq', 'reading_id'] + MTSC_COLUMN_NAMES + ['class_vals']

Input Data Location

In [11]:
# Target-encoded parquet directories produced by the upstream ploomber task.
_upstream_products = upstream['feature_preprocessing_face_detection']
target_encoded_train_dir = os.path.join("./", _upstream_products['FaceDetection_TRAIN_TE'])
target_encoded_valid_dir = os.path.join("./", _upstream_products['FaceDetection_VALID_TE'])
target_encoded_test_dir = os.path.join("./", _upstream_products['FaceDetection_TEST_TE'])

Output Data Location

In [12]:
# Destination directories for the model-ready, time-series-formatted parquet files.
output_train_dir, output_valid_dir, output_test_dir = (
    os.path.join("./", product[key])
    for key in (
        'FaceDetection_TRAIN_MODEL_INPUT',
        'FaceDetection_VALID_MODEL_INPUT',
        'FaceDetection_TEST_MODEL_INPUT',
    )
)
In [13]:
# Create the output directories. Pure-Python replacement for the original
# `!mkdir -p` shell magics: portable across platforms and independent of a
# shell being available when the notebook runs programmatically (ploomber).
for _out_dir in (output_train_dir, output_valid_dir, output_test_dir):
    os.makedirs(_out_dir, exist_ok=True)

Train Dataset Conversion
¶

Tabular to Time-Series format conversion

In [14]:
%%time
# Pivot the long-format train frame (one row per case/reading) into wide
# time-series rows: one row per case with 62 * 144 = 8928 flattened feature
# columns plus class_vals and case_id (8930 total, per the shape printed below).
convert_from_tabular_to_timeseries_format(input_dir = target_encoded_train_dir, 
                                          output_dir = output_train_dir, 
                                          all_columns = ALL_COLUMNS,
                                          mtsc_column_names = MTSC_COLUMN_NAMES,
                                          chunk_size_processing = 50000,
                                          number_of_features = NUMBER_OF_FEATURES, 
                                          seq_len = SEQUENCE_LENGTH,
                                          chunk_size_file = 10000)
case_id_seq_min:  0 case_id_seq_max:  5889
Total number of chunks to be processed:  1
Started processing chunk:  0  with case_id_seq from :  0 to  5889
Before CumCount Min:  0 CumCount Max:  61
After CumCount Min:  0 CumCount Max:  61
sorted
flattened_gdf.shape:  (4712, 8930)
Total number of files to be created:  1
Writing to output file:  /home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/train/chunk_0_part_0.parquet with records from iloc:  0 to  4712
Finished processing chunk:  0  with case_id_seq from :  0 to  5889
CPU times: user 15.5 s, sys: 2.82 s, total: 18.3 s
Wall time: 19.7 s

Valid Dataset Conversion
¶

Tabular to Time-Series format conversion

In [15]:
%%time
# Same tabular -> time-series pivot as the train cell above, for the
# validation split.
convert_from_tabular_to_timeseries_format(input_dir = target_encoded_valid_dir, 
                                          output_dir = output_valid_dir, 
                                          all_columns = ALL_COLUMNS,
                                          mtsc_column_names = MTSC_COLUMN_NAMES,
                                          chunk_size_processing = 50000,
                                          number_of_features = NUMBER_OF_FEATURES, 
                                          seq_len = SEQUENCE_LENGTH,
                                          chunk_size_file = 10000)
case_id_seq_min:  8 case_id_seq_max:  5882
Total number of chunks to be processed:  1
Started processing chunk:  0  with case_id_seq from :  0 to  5882
Before CumCount Min:  0 CumCount Max:  61
After CumCount Min:  0 CumCount Max:  61
sorted
flattened_gdf.shape:  (1178, 8930)
Total number of files to be created:  1
Writing to output file:  /home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/valid/chunk_0_part_0.parquet with records from iloc:  0 to  1178
Finished processing chunk:  0  with case_id_seq from :  0 to  5882
CPU times: user 10.8 s, sys: 770 ms, total: 11.6 s
Wall time: 11.4 s

Test Dataset Conversion
¶

Tabular to Time-Series format conversion

In [16]:
%%time
# Same tabular -> time-series pivot as the train cell above, for the test split.
convert_from_tabular_to_timeseries_format(input_dir = target_encoded_test_dir, 
                                          output_dir = output_test_dir, 
                                          all_columns = ALL_COLUMNS,
                                          mtsc_column_names = MTSC_COLUMN_NAMES,
                                          chunk_size_processing = 50000,
                                          number_of_features = NUMBER_OF_FEATURES, 
                                          seq_len = SEQUENCE_LENGTH,
                                          chunk_size_file = 10000)
case_id_seq_min:  0 case_id_seq_max:  3523
Total number of chunks to be processed:  1
Started processing chunk:  0  with case_id_seq from :  0 to  3523
Before CumCount Min:  0 CumCount Max:  61
After CumCount Min:  0 CumCount Max:  61
sorted
flattened_gdf.shape:  (3524, 8930)
Total number of files to be created:  1
Writing to output file:  /home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/test/chunk_0_part_0.parquet with records from iloc:  0 to  3524
Finished processing chunk:  0  with case_id_seq from :  0 to  3523
CPU times: user 12.7 s, sys: 1.6 s, total: 14.3 s
Wall time: 14 s

Verify Datasets
¶

In [17]:
%%time
# Sanity-check the converted train set. NOTE(review): dask_cudf is imported
# here (and again in the valid/test checks) rather than in the top import
# cell — presumably to keep it out of the exported module; confirm.
import dask_cudf
train_gdf = dask_cudf.read_parquet(output_train_dir)
train_gdf.head()
CPU times: user 18 s, sys: 396 ms, total: 18.4 s
Wall time: 30.9 s
Out[17]:
dim_0_0 dim_0_1 dim_0_2 dim_0_3 dim_0_4 dim_0_5 dim_0_6 dim_0_7 dim_0_8 dim_0_9 ... dim_143_54 dim_143_55 dim_143_56 dim_143_57 dim_143_58 dim_143_59 dim_143_60 dim_143_61 class_vals case_id
0 -3.611579 -3.544451 -3.310208 -2.981573 -2.707916 -4.318740 -2.681417 -1.143323 -0.371899 -0.398802 ... 0.098988 0.111700 0.059702 -0.353686 -0.669344 -0.701798 -0.503293 0.011390 1.0 5703.0
1 1.504849 1.429360 1.372945 1.154656 0.879269 1.387374 2.662556 0.951379 1.861210 0.875889 ... 1.051566 1.135207 1.283814 -0.551966 -0.409467 -2.146049 -1.793830 -0.727230 0.0 4245.0
2 0.597107 0.689412 0.784829 0.793919 0.767928 -0.301940 1.133886 0.802337 2.655175 2.248975 ... 0.894139 0.352814 -0.889328 -0.906947 -1.318142 -0.732796 -0.446137 -0.211485 0.0 1386.0
3 0.131422 0.697414 0.952692 1.163141 1.346406 -1.309643 -1.616391 -1.709870 -1.079094 -1.121214 ... -0.869001 -0.232134 -0.238141 -0.025023 -0.333906 -0.146325 0.264351 0.253558 1.0 4911.0
4 -0.680714 -0.349697 -0.305056 -0.227676 -0.145361 -1.335729 -1.568024 -1.274372 -0.817652 0.180512 ... 0.395892 0.026665 1.564071 1.091476 2.065342 1.780000 0.580152 0.762950 1.0 2948.0

5 rows × 8930 columns

In [18]:
# Last rows of the converted train set — row count should match the 4712 cases.
train_gdf.tail()
Out[18]:
dim_0_0 dim_0_1 dim_0_2 dim_0_3 dim_0_4 dim_0_5 dim_0_6 dim_0_7 dim_0_8 dim_0_9 ... dim_143_54 dim_143_55 dim_143_56 dim_143_57 dim_143_58 dim_143_59 dim_143_60 dim_143_61 class_vals case_id
4707 -1.330701 -1.291110 -1.334657 -1.295054 -1.169601 -0.375712 0.176117 -0.824844 -0.215864 -1.207686 ... 0.250639 0.414858 0.710168 1.343488 0.567356 -1.193091 -1.807238 -1.130595 1.0 1988.0
4708 0.336049 0.747271 0.967525 1.086845 1.141900 0.723601 0.356979 1.530178 0.421692 0.884415 ... 0.441511 -0.191611 0.073925 -1.173424 -0.542870 0.317379 0.692330 0.577826 0.0 4626.0
4709 1.063434 1.186315 1.225160 1.331084 1.327598 0.170167 0.987125 1.853559 0.186814 2.500128 ... -1.672389 -1.376272 -1.362003 -1.295241 -0.243934 -0.899911 -0.228563 -0.566318 0.0 709.0
4710 0.530625 0.251153 0.030042 -0.115527 -0.133783 1.220124 -0.449545 0.319548 -0.101689 0.795397 ... 0.035429 -0.201459 -0.208152 -0.226692 0.061203 0.429655 -0.420488 -0.550902 1.0 3741.0
4711 -0.909796 -1.171535 -1.365407 -1.534692 -1.710724 1.384764 -1.055296 1.287799 0.147919 -0.362669 ... -0.290226 0.181056 0.565160 0.028504 -0.661667 -1.072682 -0.651636 -0.356642 0.0 670.0

5 rows × 8930 columns

In [19]:
%%time
# Expect 4712 unique cases and 2 distinct class labels (matches NUM_TARGET).
train_gdf['case_id'].nunique().compute(), train_gdf['class_vals'].nunique().compute()
CPU times: user 888 ms, sys: 19.4 ms, total: 907 ms
Wall time: 2.05 s
Out[19]:
(4712, 2)
In [20]:
%%time
# Sanity-check the converted validation set.
import dask_cudf
valid_gdf = dask_cudf.read_parquet(output_valid_dir)
valid_gdf.head()
CPU times: user 17.4 s, sys: 408 ms, total: 17.8 s
Wall time: 30.1 s
Out[20]:
dim_0_0 dim_0_1 dim_0_2 dim_0_3 dim_0_4 dim_0_5 dim_0_6 dim_0_7 dim_0_8 dim_0_9 ... dim_143_54 dim_143_55 dim_143_56 dim_143_57 dim_143_58 dim_143_59 dim_143_60 dim_143_61 class_vals case_id
0 -0.929148 -1.105110 -0.941891 -0.649398 -0.245340 -0.266881 -1.239319 -1.406907 -0.599465 0.242002 ... -0.343859 0.338954 1.288810 1.278083 0.619133 -0.555734 0.162212 0.371525 0.0 3784.0
1 0.381624 0.886542 0.973097 0.997448 1.014284 1.556347 1.160265 1.130376 1.257697 0.260258 ... -0.385063 -1.367192 0.094255 1.124542 0.822106 0.694545 0.525326 0.327642 1.0 408.0
2 1.926687 1.510785 1.395330 1.163647 1.046519 1.039880 0.538547 1.490433 1.576309 2.003257 ... 0.695202 0.031622 0.095980 -0.190589 0.258013 0.135613 -0.412100 -1.018441 0.0 5582.0
3 0.367114 0.651509 0.432148 0.233512 -0.000091 -0.081989 -0.074007 -1.006657 -0.841989 -1.933067 ... 0.020846 1.013557 0.071996 0.453203 1.324450 0.724536 -0.134212 0.593665 0.0 465.0
4 -1.538407 -1.339643 -1.162910 -0.972035 -0.799651 -1.043930 -0.969960 0.559300 0.073164 -0.181563 ... 0.073362 -0.110095 -0.213497 0.238001 -0.612707 0.354244 0.139645 0.560081 0.0 4169.0

5 rows × 8930 columns

In [21]:
# Last rows of the converted validation set.
valid_gdf.tail()
Out[21]:
dim_0_0 dim_0_1 dim_0_2 dim_0_3 dim_0_4 dim_0_5 dim_0_6 dim_0_7 dim_0_8 dim_0_9 ... dim_143_54 dim_143_55 dim_143_56 dim_143_57 dim_143_58 dim_143_59 dim_143_60 dim_143_61 class_vals case_id
1173 0.071042 -0.024387 0.191721 0.128114 -0.059758 0.268023 1.476222 0.959334 -1.141774 -0.423020 ... -0.365797 0.908630 1.685662 0.574225 0.349738 -0.549095 -0.288867 0.711257 1.0 1561.0
1174 0.252586 0.439646 0.425299 0.419486 0.267317 0.321941 -0.448104 1.129621 -1.374113 -0.475453 ... -0.415012 -1.096173 -1.425451 -1.266408 0.624598 0.273851 -1.239975 -0.912561 0.0 624.0
1175 -0.581236 -0.831223 -0.900761 -1.045076 -1.251502 0.073992 1.158753 1.319034 1.447029 0.329836 ... -1.761261 -2.998023 -3.599784 -2.669970 -1.778380 -0.402851 0.004090 0.624326 1.0 486.0
1176 0.561493 0.582066 0.659096 0.682265 0.696228 1.799372 1.108655 0.715644 -0.280438 0.629797 ... -0.952442 -0.806641 -1.839325 -1.666076 -0.240264 -0.200910 -0.022648 -1.224240 0.0 3880.0
1177 -0.242705 -0.013153 0.180737 0.394410 0.638280 1.529123 1.484270 0.606758 1.107706 0.441574 ... 0.104662 -1.271352 -0.533765 -2.233684 -1.331086 0.250216 1.427544 2.279614 1.0 3185.0

5 rows × 8930 columns

In [22]:
%%time
# Expect 1178 unique cases and 2 distinct class labels.
valid_gdf['case_id'].nunique().compute(), valid_gdf['class_vals'].nunique().compute()
CPU times: user 726 ms, sys: 28.9 ms, total: 754 ms
Wall time: 1.89 s
Out[22]:
(1178, 2)
In [23]:
%%time
# Sanity-check the converted test set.
import dask_cudf
test_gdf = dask_cudf.read_parquet(output_test_dir)
test_gdf.head()
CPU times: user 17.4 s, sys: 371 ms, total: 17.8 s
Wall time: 30.1 s
Out[23]:
dim_0_0 dim_0_1 dim_0_2 dim_0_3 dim_0_4 dim_0_5 dim_0_6 dim_0_7 dim_0_8 dim_0_9 ... dim_143_54 dim_143_55 dim_143_56 dim_143_57 dim_143_58 dim_143_59 dim_143_60 dim_143_61 class_vals case_id
0 0.358228 0.074760 0.015374 -0.193739 -0.368179 -0.071744 0.554529 -0.196741 -0.677700 -1.545661 ... -0.339607 0.555091 1.180807 1.288541 0.139732 0.738019 -0.025770 1.075838 1.0 2793.0
1 -1.387512 -1.752615 -1.849856 -1.727745 -1.521146 -0.242973 -1.734590 0.044742 0.785843 0.681541 ... 1.111590 2.149374 1.126483 -0.585017 -1.489166 -1.763359 0.069810 -0.063899 0.0 2157.0
2 1.273599 0.823874 0.738863 0.651619 0.682204 -0.476438 -0.353018 -0.815121 -0.947192 -0.574885 ... 1.704488 0.166966 0.080120 -0.266172 0.150620 0.891987 0.869114 1.031004 1.0 965.0
3 -0.269798 -0.370553 -0.266308 -0.204556 -0.154651 -0.892622 -0.494802 -1.803072 -0.659148 -0.408984 ... 0.626399 0.286209 -0.111414 0.044262 0.644858 1.219852 -0.716698 -0.796233 1.0 1632.0
4 1.010976 1.183456 1.082183 0.931184 0.827308 -0.039847 -0.226516 0.412190 0.006674 0.045594 ... -0.919426 -0.862778 -1.319405 -0.519904 0.079618 0.406579 0.364146 -0.687530 0.0 540.0

5 rows × 8930 columns

In [24]:
# Last rows of the converted test set.
test_gdf.tail()
Out[24]:
dim_0_0 dim_0_1 dim_0_2 dim_0_3 dim_0_4 dim_0_5 dim_0_6 dim_0_7 dim_0_8 dim_0_9 ... dim_143_54 dim_143_55 dim_143_56 dim_143_57 dim_143_58 dim_143_59 dim_143_60 dim_143_61 class_vals case_id
3519 0.879537 0.816978 0.660403 0.435763 0.061187 -0.143533 -0.494938 -0.653089 -0.428259 -1.971869 ... 0.215659 0.369807 0.352191 0.522829 1.076980 0.811910 0.887828 -0.632244 0.0 1926.0
3520 -1.016368 -0.598267 -0.361403 -0.220654 -0.074354 0.493791 0.625948 0.201896 -0.443691 -0.448058 ... -1.010442 -0.686450 -0.054556 1.387704 0.192816 -1.033519 -0.917726 -1.895749 0.0 1862.0
3521 0.123616 -0.188700 -0.440103 -0.636104 -0.794950 1.500530 1.338156 0.543613 1.295190 -0.085536 ... 0.862444 1.219391 1.160300 0.478101 0.961551 0.176353 1.199245 0.511172 1.0 2977.0
3522 0.386057 0.349258 0.125756 -0.070403 -0.297548 0.588159 -0.016820 -0.194569 0.454931 -0.762533 ... -0.526063 0.062500 -0.139767 0.778894 1.089592 -0.019971 0.732896 0.461898 0.0 1747.0
3523 -0.388013 -0.748928 -1.110766 -1.399547 -1.547126 -0.374099 0.621944 -0.450650 0.271023 -0.436169 ... 1.113185 1.324051 0.893113 0.358821 0.468017 0.255657 0.184054 0.001536 1.0 3039.0

5 rows × 8930 columns

In [25]:
%%time
# Expect 3524 unique cases and 2 distinct class labels.
test_gdf['case_id'].nunique().compute(), test_gdf['class_vals'].nunique().compute()
CPU times: user 777 ms, sys: 37.1 ms, total: 814 ms
Wall time: 1.97 s
Out[25]:
(3524, 2)

We shut down the Dask cluster to release its GPU resources before exporting the module.

In [26]:
%%time
client.shutdown()
client.close()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError

Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1400, in _handle_report
    await self._reconnect()
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper
    return await func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected
    comm = await connect(
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect
    await asyncio.sleep(backoff)
  File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await future
asyncio.exceptions.CancelledError
CPU times: user 38.5 ms, sys: 5.57 ms, total: 44 ms
Wall time: 668 ms
In [27]:
# Write the `#| export` cells of this notebook into the vitmtsc package module
# declared by `#| default_exp` at the top.
from nbdev import nbdev_export
nbdev_export()
In [ ]: