In [1]:
#| default_exp feature_preprocessing.insect_wingbeat.tabular_to_timeseries
%load_ext autoreload
%autoreload 2
In [2]:
# declare a list of tasks whose products you want to use as inputs
upstream = ['feature_preprocessing_insect_wingbeat']
In [3]:
# Parameters
upstream = {"feature_preprocessing_insect_wingbeat": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/202_feature_preprocessing.insect_wingbeat.target_encoding.html", "InsectWingbeat_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/train", "InsectWingbeat_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/valid", "InsectWingbeat_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/test", "InsectWingbeat_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/nvtabular_workflow"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/302_feature_preprocessing.insect_wingbeat.tabular_to_timeseries.html", "InsectWingbeat_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/train", "InsectWingbeat_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/valid", "InsectWingbeat_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/test"}
In [4]:
#| hide
from nbdev.showdoc import *
In [5]:
#| export
from vitmtsc import *
from vitmtsc.core import *
from vitmtsc.data.insect_wingbeat import *
from vitmtsc.feature_preprocessing.insect_wingbeat.target_encoding import *
import os
import glob
In [6]:
#| export
upstream = {
    "feature_preprocessing_insect_wingbeat": {
        "nb": "/home/ubuntu/vitmtsc_nbdev/output/202_feature_preprocessing.insect_wingbeat.target_encoding.html",
        "InsectWingbeat_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/train",
        "InsectWingbeat_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/valid",
        "InsectWingbeat_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/test",
        "InsectWingbeat_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/nvtabular_workflow",
    }
}
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/302_feature_preprocessing.insect_wingbeat.tabular_to_timeseries.html",
    "InsectWingbeat_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/train",
    "InsectWingbeat_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/valid",
    "InsectWingbeat_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/test",
}

Feature Preprocessing for Neural Networks - III

Convert Target Encoding data from tabular to time-series format
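
Conceptually, each case's 22 readings of 200 features are pivoted into a single row with one column per (feature, timestep) pair, named dim_<feature>_<timestep>. A minimal pandas sketch of the idea on toy data (illustration only; the actual conversion below runs on the GPU via convert_from_tabular_to_timeseries_format):

import pandas as pd

# Toy long-format data: 2 cases, 2 features, 3 readings per case.
long_df = pd.DataFrame({
    'case_id':    [0, 0, 0, 1, 1, 1],
    'reading_id': [0, 1, 2, 0, 1, 2],
    'dim_0':      [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
    'dim_1':      [1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
})

# Pivot to one row per case, one column per (feature, timestep) pair.
wide_df = long_df.pivot(index='case_id', columns='reading_id',
                        values=['dim_0', 'dim_1'])
wide_df.columns = [f'{feat}_{t}' for feat, t in wide_df.columns]
print(wide_df)  # columns: dim_0_0, dim_0_1, dim_0_2, dim_1_0, ...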

In [7]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# One worker per visible GPU; spill from device to host at 50% GPU memory
# utilization; pre-allocate a 20 GB RMM pool with managed (unified) memory.
cluster = LocalCUDACluster(memory_limit='auto',
                           device_memory_limit=0.5,
                           rmm_pool_size='20GB',
                           rmm_managed_memory=True)
client = Client(cluster)
client
2022-09-23 19:35:45,513 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:35:45,513 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:35:45,515 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:35:45,515 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:35:45,545 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:35:45,545 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-09-23 19:35:45,549 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-09-23 19:35:45,549 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Out[7]:

Client: Client-eb8b0f0f-3b76-11ed-8177-06c0bb745397
Connection method: Cluster object (dask_cuda.LocalCUDACluster)
Dashboard: http://127.0.0.1:8787/status

LocalCUDACluster 668855f2: status running, 4 workers, 4 total threads, 150.00 GiB total host memory
Scheduler: tcp://127.0.0.1:41023
Workers 0-3: 1 thread and 37.50 GiB host memory each, one Tesla T4 per worker (14.76 GiB GPU memory)
In [8]:
#| export
DATASET_NAME = 'InsectWingbeat'
SEQUENCE_LENGTH = 22
NUMBER_OF_FEATURES = 200
NUM_TARGET = 10

Convert from Tabular to Time-Series Format

In [9]:
#| export
MTSC_COLUMN_NAMES = [f'dim_{i}' for i in range(NUMBER_OF_FEATURES)]
In [10]:
#| export
ALL_COLUMNS = ['case_id', 'case_id_seq', 'reading_id'] + MTSC_COLUMN_NAMES + ['class_vals']
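
A quick sanity check on the input schema width: three id columns (case_id, case_id_seq, reading_id), 200 feature dimensions, and the label:

# 3 id columns + 200 feature dims + 'class_vals' = 204 input columns
assert len(ALL_COLUMNS) == 3 + NUMBER_OF_FEATURES + 1 == 204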

Input Data Location

In [11]:
target_encoded_train_dir = os.path.join("./", upstream['feature_preprocessing_insect_wingbeat']['InsectWingbeat_TRAIN_TE'])
target_encoded_valid_dir = os.path.join("./", upstream['feature_preprocessing_insect_wingbeat']['InsectWingbeat_VALID_TE'])
target_encoded_test_dir = os.path.join("./", upstream['feature_preprocessing_insect_wingbeat']['InsectWingbeat_TEST_TE'])

Output Data Location

In [12]:
output_train_dir = os.path.join("./", product['InsectWingbeat_TRAIN_MODEL_INPUT'])
output_valid_dir = os.path.join("./", product['InsectWingbeat_VALID_MODEL_INPUT'])
output_test_dir = os.path.join("./", product['InsectWingbeat_TEST_MODEL_INPUT'])
In [13]:
!mkdir -p $output_train_dir
!mkdir -p $output_valid_dir
!mkdir -p $output_test_dir

Train Dataset Conversion

Tabular to Time-Series format conversion

In [14]:
%%time
convert_from_tabular_to_timeseries_format(input_dir=target_encoded_train_dir,
                                          output_dir=output_train_dir,
                                          all_columns=ALL_COLUMNS,
                                          mtsc_column_names=MTSC_COLUMN_NAMES,
                                          chunk_size_processing=50000,
                                          number_of_features=NUMBER_OF_FEATURES,
                                          seq_len=SEQUENCE_LENGTH,
                                          chunk_size_file=10000)
case_id_seq_min:  0 case_id_seq_max:  24998
Total number of chunks to be processed:  1
Started processing chunk:  0  with case_id_seq from :  0 to  24998
Before CumCount Min:  0 CumCount Max:  21
After CumCount Min:  0 CumCount Max:  21
sorted
flattened_gdf.shape:  (20000, 4402)
Total number of files to be created:  2
Writing to output file:  /home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/train/chunk_0_part_0.parquet with records from iloc:  0 to  10000
Writing to output file:  /home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/train/chunk_0_part_1.parquet with records from iloc:  10000 to  20000
Finished processing chunk:  0  with case_id_seq from :  0 to  24998
CPU times: user 14.5 s, sys: 3.09 s, total: 17.6 s
Wall time: 19.1 s
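
The reported width of flattened_gdf checks out: 200 features × 22 timesteps gives 4,400 value columns, plus class_vals and case_id carried through:

# 200 * 22 feature/timestep columns + class_vals + case_id = 4402
assert NUMBER_OF_FEATURES * SEQUENCE_LENGTH + 2 == 4402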

Valid Dataset Conversion

Tabular to Time-Series format conversion

In [15]:
%%time
convert_from_tabular_to_timeseries_format(input_dir=target_encoded_valid_dir,
                                          output_dir=output_valid_dir,
                                          all_columns=ALL_COLUMNS,
                                          mtsc_column_names=MTSC_COLUMN_NAMES,
                                          chunk_size_processing=50000,
                                          number_of_features=NUMBER_OF_FEATURES,
                                          seq_len=SEQUENCE_LENGTH,
                                          chunk_size_file=10000)
case_id_seq_min:  17 case_id_seq_max:  24999
Total number of chunks to be processed:  1
Started processing chunk:  0  with case_id_seq from :  0 to  24999
Before CumCount Min:  0 CumCount Max:  20
After CumCount Min:  0 CumCount Max:  20
sorted
flattened_gdf.shape:  (5000, 4402)
Total number of files to be created:  1
Writing to output file:  /home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/valid/chunk_0_part_0.parquet with records from iloc:  0 to  5000
Finished processing chunk:  0  with case_id_seq from :  0 to  24999
CPU times: user 7.87 s, sys: 799 ms, total: 8.67 s
Wall time: 8.54 s

Test Dataset Conversion

Tabular to Time-Series format conversion

In [16]:
%%time
convert_from_tabular_to_timeseries_format(input_dir=target_encoded_test_dir,
                                          output_dir=output_test_dir,
                                          all_columns=ALL_COLUMNS,
                                          mtsc_column_names=MTSC_COLUMN_NAMES,
                                          chunk_size_processing=50000,
                                          number_of_features=NUMBER_OF_FEATURES,
                                          seq_len=SEQUENCE_LENGTH,
                                          chunk_size_file=10000)
case_id_seq_min:  0 case_id_seq_max:  24999
Total number of chunks to be processed:  1
Started processing chunk:  0  with case_id_seq from :  0 to  24999
Before CumCount Min:  0 CumCount Max:  21
After CumCount Min:  0 CumCount Max:  21
sorted
flattened_gdf.shape:  (25000, 4402)
Total number of files to be created:  3
Writing to output file:  /home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/test/chunk_0_part_0.parquet with records from iloc:  0 to  10000
Writing to output file:  /home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/test/chunk_0_part_1.parquet with records from iloc:  10000 to  20000
Writing to output file:  /home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/test/chunk_0_part_2.parquet with records from iloc:  20000 to  25000
Finished processing chunk:  0  with case_id_seq from :  0 to  24999
CPU times: user 14.9 s, sys: 2.94 s, total: 17.9 s
Wall time: 17.5 s

Verify Datasets

In [17]:
%%time
import dask_cudf
train_gdf = dask_cudf.read_parquet(output_train_dir)
train_gdf.head()
CPU times: user 5.79 s, sys: 194 ms, total: 5.99 s
Wall time: 9.49 s
Out[17]:
dim_0_0 dim_0_1 dim_0_2 dim_0_3 dim_0_4 dim_0_5 dim_0_6 dim_0_7 dim_0_8 dim_0_9 ... dim_199_14 dim_199_15 dim_199_16 dim_199_17 dim_199_18 dim_199_19 dim_199_20 dim_199_21 class_vals case_id
0 0.032063 -0.054971 -0.267784 -1.067128 1.216967 0.052408 0.010434 0.000000 0.000000 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0 22785.0
1 -0.062069 -0.024357 1.082939 -2.032642 0.244790 1.061763 0.076604 0.000000 0.000000 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 10048.0
2 0.013101 0.610176 -2.407612 0.008669 0.023391 0.000000 0.000000 0.000000 0.000000 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 10740.0
3 0.111349 -0.064055 0.946340 0.469756 0.022269 0.000000 0.000000 0.000000 0.000000 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 4830.0
4 0.022464 0.181676 0.277060 -0.615980 0.446005 -0.005245 -1.217014 1.282087 -0.049652 0.03522 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 11005.0

5 rows × 4402 columns

In [18]:
train_gdf.tail()
Out[18]:
dim_0_0 dim_0_1 dim_0_2 dim_0_3 dim_0_4 dim_0_5 dim_0_6 dim_0_7 dim_0_8 dim_0_9 ... dim_199_14 dim_199_15 dim_199_16 dim_199_17 dim_199_18 dim_199_19 dim_199_20 dim_199_21 class_vals case_id
19995 -0.001325 0.116420 -0.645990 0.603539 0.022499 0.000000 0.000000 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 11089.0
19996 0.023738 0.086435 0.038533 -0.058786 -0.547138 -0.325506 -0.734727 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 10406.0
19997 0.022344 0.022349 0.357544 -2.243955 -0.532616 0.000000 0.000000 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0 22963.0
19998 0.021922 0.025154 0.232804 -1.263299 0.866391 0.265158 0.020039 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 10361.0
19999 0.071430 -0.114545 0.332819 -0.149102 0.062493 0.048579 0.017992 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 8.0 20442.0

5 rows × 4402 columns

In [19]:
%%time
train_gdf['case_id'].nunique().compute(), train_gdf['class_vals'].nunique().compute()
CPU times: user 518 ms, sys: 32.7 ms, total: 551 ms
Wall time: 1.18 s
Out[19]:
(20000, 10)
In [20]:
%%time
import dask_cudf
valid_gdf = dask_cudf.read_parquet(output_valid_dir)
valid_gdf.head()
CPU times: user 5.67 s, sys: 128 ms, total: 5.8 s
Wall time: 9.2 s
Out[20]:
dim_0_0 dim_0_1 dim_0_2 dim_0_3 dim_0_4 dim_0_5 dim_0_6 dim_0_7 dim_0_8 dim_0_9 ... dim_199_14 dim_199_15 dim_199_16 dim_199_17 dim_199_18 dim_199_19 dim_199_20 dim_199_21 class_vals case_id
0 0.022414 0.022381 0.022424 -0.020581 0.187857 0.213099 -0.198727 -0.220894 0.023761 0.022388 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0 17775.0
1 0.022418 0.026991 -0.073085 0.153986 -0.390933 0.376018 0.301444 -0.007603 0.022414 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 360.0
2 0.022342 0.014775 0.151475 -1.026869 0.344332 0.079579 -0.041067 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0 24655.0
3 -0.016620 -0.782072 5.613051 0.881482 0.023386 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0 13884.0
4 0.235395 0.882627 -0.764873 0.702121 0.003729 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0 24342.0

5 rows × 4402 columns

In [21]:
valid_gdf.tail()
Out[21]:
dim_0_0 dim_0_1 dim_0_2 dim_0_3 dim_0_4 dim_0_5 dim_0_6 dim_0_7 dim_0_8 dim_0_9 ... dim_199_14 dim_199_15 dim_199_16 dim_199_17 dim_199_18 dim_199_19 dim_199_20 dim_199_21 class_vals case_id
4995 0.032163 0.764881 -0.869177 0.359423 -0.389531 0.553232 0.031750 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 4139.0
4996 0.022196 0.134991 -1.015465 0.316442 0.022520 0.000000 0.000000 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 7212.0
4997 0.022402 0.022353 0.028277 -1.655183 -0.390674 0.702128 0.022406 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 367.0
4998 0.031373 0.003649 0.007124 -0.023877 1.108508 -2.907924 0.590468 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0 22976.0
4999 0.024010 0.102044 -0.282106 1.212721 0.063672 0.000000 0.000000 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 11262.0

5 rows × 4402 columns

In [22]:
%%time
valid_gdf['case_id'].nunique().compute(), valid_gdf['class_vals'].nunique().compute()
CPU times: user 305 ms, sys: 24.8 ms, total: 330 ms
Wall time: 897 ms
Out[22]:
(5000, 10)
In [23]:
%%time
import dask_cudf
test_gdf = dask_cudf.read_parquet(output_test_dir)
test_gdf.head()
CPU times: user 5.5 s, sys: 121 ms, total: 5.62 s
Wall time: 9.03 s
Out[23]:
dim_0_0 dim_0_1 dim_0_2 dim_0_3 dim_0_4 dim_0_5 dim_0_6 dim_0_7 dim_0_8 dim_0_9 ... dim_199_14 dim_199_15 dim_199_16 dim_199_17 dim_199_18 dim_199_19 dim_199_20 dim_199_21 class_vals case_id
0 0.023226 -0.009603 0.814100 -2.280586 1.537670 0.020897 0.022441 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0 23434.0
1 0.159843 -1.216731 2.247976 -0.061670 -0.072155 -0.018768 0.020887 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 5711.0
2 0.067165 3.113498 -2.863501 3.685871 0.022752 0.000000 0.000000 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0 12870.0
3 0.054060 0.336126 -0.606661 -0.557574 0.025019 0.000000 0.000000 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 11746.0
4 0.022422 0.022329 -0.028971 0.020431 0.070435 0.022480 0.022419 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 7110.0

5 rows × 4402 columns

In [24]:
test_gdf.tail()
Out[24]:
dim_0_0 dim_0_1 dim_0_2 dim_0_3 dim_0_4 dim_0_5 dim_0_6 dim_0_7 dim_0_8 dim_0_9 ... dim_199_14 dim_199_15 dim_199_16 dim_199_17 dim_199_18 dim_199_19 dim_199_20 dim_199_21 class_vals case_id
24995 0.218689 2.559381 0.149158 0.018900 0.022415 0.000000 0.000000 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 9023.0
24996 0.190995 0.570612 -0.516594 1.190207 -0.162653 -1.432876 0.297485 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0 13974.0
24997 0.021313 0.106590 -0.789149 0.544075 0.030988 0.022411 0.000000 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1603.0
24998 0.179996 0.153285 0.825175 0.018868 0.022372 0.022516 0.022501 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 8104.0
24999 0.061483 0.253287 -0.306561 -1.298569 0.151962 0.000000 0.000000 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 3896.0

5 rows × 4402 columns

In [25]:
%%time
test_gdf['case_id'].nunique().compute(), test_gdf['class_vals'].nunique().compute()
CPU times: user 287 ms, sys: 17.1 ms, total: 304 ms
Wall time: 900 ms
Out[25]:
(25000, 10)
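
For downstream model code, each flattened row can be viewed again as a (timesteps, features) series. A minimal sketch, assuming the dim_<feature>_<timestep> column order shown in the outputs above (to_numpy copies the row to host memory; illustration only):

# Feature columns in the order they were written: dim_0_0 .. dim_199_21.
feature_cols = [f'dim_{f}_{t}'
                for f in range(NUMBER_OF_FEATURES)
                for t in range(SEQUENCE_LENGTH)]
row = test_gdf.head(1)[feature_cols].to_numpy()        # shape (1, 4400)
series = row.reshape(NUMBER_OF_FEATURES, SEQUENCE_LENGTH).T
print(series.shape)  # (22, 200): one timestep per row, one feature per column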

We shut down the Dask cluster to release the GPUs before exporting the notebook.

In [26]:
%%time
client.shutdown()  # stop the scheduler and all workers
client.close()     # close this client's connection
CPU times: user 119 ms, sys: 78.1 ms, total: 197 ms
Wall time: 30.5 s
In [27]:
from nbdev import nbdev_export
nbdev_export()