#| default_exp feature_preprocessing.insect_wingbeat.tabular_to_timeseries
%load_ext autoreload
%autoreload 2
# declare a list of tasks whose products you want to use as inputs
upstream = ['feature_preprocessing_insect_wingbeat']
# Parameters
upstream = {"feature_preprocessing_insect_wingbeat": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/202_feature_preprocessing.insect_wingbeat.target_encoding.html", "InsectWingbeat_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/train", "InsectWingbeat_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/valid", "InsectWingbeat_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/test", "InsectWingbeat_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/nvtabular_workflow"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/302_feature_preprocessing.insect_wingbeat.tabular_to_timeseries.html", "InsectWingbeat_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/train", "InsectWingbeat_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/valid", "InsectWingbeat_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/test"}
#| hide
from nbdev.showdoc import *
#| export
from vitmtsc import *
from vitmtsc.core import *
from vitmtsc.data.insect_wingbeat import *
from vitmtsc.feature_preprocessing.insect_wingbeat.target_encoding import *
import os
import glob
#| export
# Products of the upstream target-encoding task, keyed by task name.
# Same values as the `upstream` assignment in the "# Parameters" cell; repeated
# here so the exported module carries them too.
upstream = {
    "feature_preprocessing_insect_wingbeat": {
        "nb": "/home/ubuntu/vitmtsc_nbdev/output/202_feature_preprocessing.insect_wingbeat.target_encoding.html",
        "InsectWingbeat_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/train",
        "InsectWingbeat_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/valid",
        "InsectWingbeat_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/test",
        "InsectWingbeat_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/nvtabular_workflow",
    },
}
# Output locations produced by this notebook: the rendered notebook itself and
# one model-input directory per split.
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/302_feature_preprocessing.insect_wingbeat.tabular_to_timeseries.html",
    "InsectWingbeat_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/train",
    "InsectWingbeat_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/valid",
    "InsectWingbeat_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/test",
}
Convert target-encoded data from tabular to time-series format
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
# Start a local dask_cuda cluster and connect a client to it.
# NOTE(review): per the dask-cuda docs a float `device_memory_limit` is read as
# a fraction of GPU memory, and `rmm_pool_size` pre-allocates an RMM pool per
# worker — confirm against the installed dask-cuda version.
cluster = LocalCUDACluster(
    memory_limit='auto',
    device_memory_limit=0.5,
    rmm_pool_size='20GB',
    rmm_managed_memory=True,
)
client = Client(cluster)
# Bare expression so the notebook renders the client summary.
client
2022-09-23 19:35:45,513 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:35:45,513 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 19:35:45,515 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:35:45,515 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 19:35:45,545 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:35:45,545 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 19:35:45,549 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:35:45,549 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Client-eb8b0f0f-3b76-11ed-8177-06c0bb745397
Connection method: Cluster object | Cluster type: dask_cuda.LocalCUDACluster |
Dashboard: http://127.0.0.1:8787/status |
668855f2
Dashboard: http://127.0.0.1:8787/status | Workers: 4 |
Total threads: 4 | Total memory: 150.00 GiB |
Status: running | Using processes: True |
Scheduler-a9d79834-1c2a-41b3-8a06-74a5d9d24bf6
Comm: tcp://127.0.0.1:41023 | Workers: 4 |
Dashboard: http://127.0.0.1:8787/status | Total threads: 4 |
Started: Just now | Total memory: 150.00 GiB |
Comm: tcp://127.0.0.1:34573 | Total threads: 1 |
Dashboard: http://127.0.0.1:36137/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:40087 | |
Local directory: /tmp/dask-worker-space/worker-544f9lky | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:45097 | Total threads: 1 |
Dashboard: http://127.0.0.1:38797/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:41907 | |
Local directory: /tmp/dask-worker-space/worker-6ip2jpd8 | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:41941 | Total threads: 1 |
Dashboard: http://127.0.0.1:34495/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:42889 | |
Local directory: /tmp/dask-worker-space/worker-lslkuai0 | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:40577 | Total threads: 1 |
Dashboard: http://127.0.0.1:43717/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:45793 | |
Local directory: /tmp/dask-worker-space/worker-hnbfmmhd | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
#| export
# Dataset identifier (matches the directory name used in the product paths).
DATASET_NAME = 'InsectWingbeat'
# Passed as `seq_len` to convert_from_tabular_to_timeseries_format below.
SEQUENCE_LENGTH = 22
# Passed as `number_of_features` below; equals len(MTSC_COLUMN_NAMES) (dim_0 .. dim_199).
NUMBER_OF_FEATURES = 200
# presumably the number of target classes — not used in this notebook; confirm against downstream users
NUM_TARGET = 10
Convert from Tabular to Time-Series Format
#| export
# Names of the 200 feature columns: 'dim_0', 'dim_1', ..., 'dim_199'.
# Generated rather than hand-listed so an entry cannot be dropped or duplicated
# by accident; the count must stay equal to NUMBER_OF_FEATURES.
MTSC_COLUMN_NAMES = [f'dim_{i}' for i in range(200)]
#| export
# Column list passed as `all_columns` to the conversion: the three id/sequence
# columns, then every feature column, then the label column.
ALL_COLUMNS = ['case_id', 'case_id_seq', 'reading_id', *MTSC_COLUMN_NAMES, 'class_vals']
Input Data Location
# Target-encoded inputs, one directory per split, taken from the upstream task.
# os.path.join("./", p) returns p unchanged when p is absolute and anchors a
# relative p at the current working directory.
_te_products = upstream['feature_preprocessing_insect_wingbeat']
target_encoded_train_dir = os.path.join("./", _te_products['InsectWingbeat_TRAIN_TE'])
target_encoded_valid_dir = os.path.join("./", _te_products['InsectWingbeat_VALID_TE'])
target_encoded_test_dir = os.path.join("./", _te_products['InsectWingbeat_TEST_TE'])
Output Data Location
# Output directories for the time-series formatted splits, as declared in `product`.
output_train_dir = os.path.join("./", product['InsectWingbeat_TRAIN_MODEL_INPUT'])
output_valid_dir = os.path.join("./", product['InsectWingbeat_VALID_MODEL_INPUT'])
output_test_dir = os.path.join("./", product['InsectWingbeat_TEST_MODEL_INPUT'])

# Create the directories (and any missing parents) with the standard library
# instead of `!mkdir -p $var`: the shell magic only works under IPython and its
# unquoted expansion breaks on paths that contain spaces.
for _output_dir in (output_train_dir, output_valid_dir, output_test_dir):
    os.makedirs(_output_dir, exist_ok=True)
Tabular to Time-Series format conversion (train split)
%%time
# Convert the target-encoded training split and write the result as parquet
# parts under output_train_dir (file names appear in the cell's log output).
convert_from_tabular_to_timeseries_format(
    input_dir=target_encoded_train_dir,
    output_dir=output_train_dir,
    all_columns=ALL_COLUMNS,
    mtsc_column_names=MTSC_COLUMN_NAMES,
    chunk_size_processing=50000,
    number_of_features=NUMBER_OF_FEATURES,
    seq_len=SEQUENCE_LENGTH,
    chunk_size_file=10000,
)
case_id_seq_min: 0 case_id_seq_max: 24998 Total number of chunks to be processed: 1 Started processing chunk: 0 with case_id_seq from : 0 to 24998 Before CumCount Min: 0 CumCount Max: 21 After CumCount Min: 0 CumCount Max: 21 sorted flattened_gdf.shape: (20000, 4402) Total number of files to be created: 2 Writing to output file: /home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/train/chunk_0_part_0.parquet with records from iloc: 0 to 10000 Writing to output file: /home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/train/chunk_0_part_1.parquet with records from iloc: 10000 to 20000 Finished processing chunk: 0 with case_id_seq from : 0 to 24998 CPU times: user 14.5 s, sys: 3.09 s, total: 17.6 s Wall time: 19.1 s
Tabular to Time-Series format conversion (validation split)
%%time
# Convert the target-encoded validation split and write the result as parquet
# parts under output_valid_dir (file names appear in the cell's log output).
convert_from_tabular_to_timeseries_format(
    input_dir=target_encoded_valid_dir,
    output_dir=output_valid_dir,
    all_columns=ALL_COLUMNS,
    mtsc_column_names=MTSC_COLUMN_NAMES,
    chunk_size_processing=50000,
    number_of_features=NUMBER_OF_FEATURES,
    seq_len=SEQUENCE_LENGTH,
    chunk_size_file=10000,
)
case_id_seq_min: 17 case_id_seq_max: 24999 Total number of chunks to be processed: 1 Started processing chunk: 0 with case_id_seq from : 0 to 24999 Before CumCount Min: 0 CumCount Max: 20 After CumCount Min: 0 CumCount Max: 20 sorted flattened_gdf.shape: (5000, 4402) Total number of files to be created: 1 Writing to output file: /home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/valid/chunk_0_part_0.parquet with records from iloc: 0 to 5000 Finished processing chunk: 0 with case_id_seq from : 0 to 24999 CPU times: user 7.87 s, sys: 799 ms, total: 8.67 s Wall time: 8.54 s
Tabular to Time-Series format conversion (test split)
%%time
# Convert the target-encoded test split and write the result as parquet
# parts under output_test_dir (file names appear in the cell's log output).
convert_from_tabular_to_timeseries_format(
    input_dir=target_encoded_test_dir,
    output_dir=output_test_dir,
    all_columns=ALL_COLUMNS,
    mtsc_column_names=MTSC_COLUMN_NAMES,
    chunk_size_processing=50000,
    number_of_features=NUMBER_OF_FEATURES,
    seq_len=SEQUENCE_LENGTH,
    chunk_size_file=10000,
)
case_id_seq_min: 0 case_id_seq_max: 24999 Total number of chunks to be processed: 1 Started processing chunk: 0 with case_id_seq from : 0 to 24999 Before CumCount Min: 0 CumCount Max: 21 After CumCount Min: 0 CumCount Max: 21 sorted flattened_gdf.shape: (25000, 4402) Total number of files to be created: 3 Writing to output file: /home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/test/chunk_0_part_0.parquet with records from iloc: 0 to 10000 Writing to output file: /home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/test/chunk_0_part_1.parquet with records from iloc: 10000 to 20000 Writing to output file: /home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding-nn/test/chunk_0_part_2.parquet with records from iloc: 20000 to 25000 Finished processing chunk: 0 with case_id_seq from : 0 to 24999 CPU times: user 14.9 s, sys: 2.94 s, total: 17.9 s Wall time: 17.5 s
%%time
import dask_cudf
# Read the converted training split back from parquet and preview its first rows.
train_gdf = dask_cudf.read_parquet(output_train_dir)
train_gdf.head()
CPU times: user 5.79 s, sys: 194 ms, total: 5.99 s Wall time: 9.49 s
dim_0_0 | dim_0_1 | dim_0_2 | dim_0_3 | dim_0_4 | dim_0_5 | dim_0_6 | dim_0_7 | dim_0_8 | dim_0_9 | ... | dim_199_14 | dim_199_15 | dim_199_16 | dim_199_17 | dim_199_18 | dim_199_19 | dim_199_20 | dim_199_21 | class_vals | case_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.032063 | -0.054971 | -0.267784 | -1.067128 | 1.216967 | 0.052408 | 0.010434 | 0.000000 | 0.000000 | 0.00000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | 22785.0 |
1 | -0.062069 | -0.024357 | 1.082939 | -2.032642 | 0.244790 | 1.061763 | 0.076604 | 0.000000 | 0.000000 | 0.00000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 10048.0 |
2 | 0.013101 | 0.610176 | -2.407612 | 0.008669 | 0.023391 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 10740.0 |
3 | 0.111349 | -0.064055 | 0.946340 | 0.469756 | 0.022269 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 4830.0 |
4 | 0.022464 | 0.181676 | 0.277060 | -0.615980 | 0.446005 | -0.005245 | -1.217014 | 1.282087 | -0.049652 | 0.03522 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 11005.0 |
5 rows × 4402 columns
# Preview the last rows of the training split.
train_gdf.tail()
dim_0_0 | dim_0_1 | dim_0_2 | dim_0_3 | dim_0_4 | dim_0_5 | dim_0_6 | dim_0_7 | dim_0_8 | dim_0_9 | ... | dim_199_14 | dim_199_15 | dim_199_16 | dim_199_17 | dim_199_18 | dim_199_19 | dim_199_20 | dim_199_21 | class_vals | case_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
19995 | -0.001325 | 0.116420 | -0.645990 | 0.603539 | 0.022499 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 11089.0 |
19996 | 0.023738 | 0.086435 | 0.038533 | -0.058786 | -0.547138 | -0.325506 | -0.734727 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 10406.0 |
19997 | 0.022344 | 0.022349 | 0.357544 | -2.243955 | -0.532616 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | 22963.0 |
19998 | 0.021922 | 0.025154 | 0.232804 | -1.263299 | 0.866391 | 0.265158 | 0.020039 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 10361.0 |
19999 | 0.071430 | -0.114545 | 0.332819 | -0.149102 | 0.062493 | 0.048579 | 0.017992 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 8.0 | 20442.0 |
5 rows × 4402 columns
%%time
# Sanity check: number of distinct cases and distinct class labels in the training split.
train_gdf['case_id'].nunique().compute(), train_gdf['class_vals'].nunique().compute()
CPU times: user 518 ms, sys: 32.7 ms, total: 551 ms Wall time: 1.18 s
(20000, 10)
%%time
import dask_cudf
# Read the converted validation split back from parquet and preview its first rows.
valid_gdf = dask_cudf.read_parquet(output_valid_dir)
valid_gdf.head()
CPU times: user 5.67 s, sys: 128 ms, total: 5.8 s Wall time: 9.2 s
dim_0_0 | dim_0_1 | dim_0_2 | dim_0_3 | dim_0_4 | dim_0_5 | dim_0_6 | dim_0_7 | dim_0_8 | dim_0_9 | ... | dim_199_14 | dim_199_15 | dim_199_16 | dim_199_17 | dim_199_18 | dim_199_19 | dim_199_20 | dim_199_21 | class_vals | case_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.022414 | 0.022381 | 0.022424 | -0.020581 | 0.187857 | 0.213099 | -0.198727 | -0.220894 | 0.023761 | 0.022388 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7.0 | 17775.0 |
1 | 0.022418 | 0.026991 | -0.073085 | 0.153986 | -0.390933 | 0.376018 | 0.301444 | -0.007603 | 0.022414 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 360.0 |
2 | 0.022342 | 0.014775 | 0.151475 | -1.026869 | 0.344332 | 0.079579 | -0.041067 | 0.000000 | 0.000000 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | 24655.0 |
3 | -0.016620 | -0.782072 | 5.613051 | 0.881482 | 0.023386 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 13884.0 |
4 | 0.235395 | 0.882627 | -0.764873 | 0.702121 | 0.003729 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | 24342.0 |
5 rows × 4402 columns
# Preview the last rows of the validation split.
valid_gdf.tail()
dim_0_0 | dim_0_1 | dim_0_2 | dim_0_3 | dim_0_4 | dim_0_5 | dim_0_6 | dim_0_7 | dim_0_8 | dim_0_9 | ... | dim_199_14 | dim_199_15 | dim_199_16 | dim_199_17 | dim_199_18 | dim_199_19 | dim_199_20 | dim_199_21 | class_vals | case_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4995 | 0.032163 | 0.764881 | -0.869177 | 0.359423 | -0.389531 | 0.553232 | 0.031750 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 4139.0 |
4996 | 0.022196 | 0.134991 | -1.015465 | 0.316442 | 0.022520 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 7212.0 |
4997 | 0.022402 | 0.022353 | 0.028277 | -1.655183 | -0.390674 | 0.702128 | 0.022406 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 367.0 |
4998 | 0.031373 | 0.003649 | 0.007124 | -0.023877 | 1.108508 | -2.907924 | 0.590468 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | 22976.0 |
4999 | 0.024010 | 0.102044 | -0.282106 | 1.212721 | 0.063672 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 11262.0 |
5 rows × 4402 columns
%%time
# Sanity check: number of distinct cases and distinct class labels in the validation split.
valid_gdf['case_id'].nunique().compute(), valid_gdf['class_vals'].nunique().compute()
CPU times: user 305 ms, sys: 24.8 ms, total: 330 ms Wall time: 897 ms
(5000, 10)
%%time
import dask_cudf
# Read the converted test split back from parquet and preview its first rows.
test_gdf = dask_cudf.read_parquet(output_test_dir)
test_gdf.head()
CPU times: user 5.5 s, sys: 121 ms, total: 5.62 s Wall time: 9.03 s
dim_0_0 | dim_0_1 | dim_0_2 | dim_0_3 | dim_0_4 | dim_0_5 | dim_0_6 | dim_0_7 | dim_0_8 | dim_0_9 | ... | dim_199_14 | dim_199_15 | dim_199_16 | dim_199_17 | dim_199_18 | dim_199_19 | dim_199_20 | dim_199_21 | class_vals | case_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.023226 | -0.009603 | 0.814100 | -2.280586 | 1.537670 | 0.020897 | 0.022441 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | 23434.0 |
1 | 0.159843 | -1.216731 | 2.247976 | -0.061670 | -0.072155 | -0.018768 | 0.020887 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 5711.0 |
2 | 0.067165 | 3.113498 | -2.863501 | 3.685871 | 0.022752 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 12870.0 |
3 | 0.054060 | 0.336126 | -0.606661 | -0.557574 | 0.025019 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 11746.0 |
4 | 0.022422 | 0.022329 | -0.028971 | 0.020431 | 0.070435 | 0.022480 | 0.022419 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 7110.0 |
5 rows × 4402 columns
# Preview the last rows of the test split.
test_gdf.tail()
dim_0_0 | dim_0_1 | dim_0_2 | dim_0_3 | dim_0_4 | dim_0_5 | dim_0_6 | dim_0_7 | dim_0_8 | dim_0_9 | ... | dim_199_14 | dim_199_15 | dim_199_16 | dim_199_17 | dim_199_18 | dim_199_19 | dim_199_20 | dim_199_21 | class_vals | case_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
24995 | 0.218689 | 2.559381 | 0.149158 | 0.018900 | 0.022415 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 9023.0 |
24996 | 0.190995 | 0.570612 | -0.516594 | 1.190207 | -0.162653 | -1.432876 | 0.297485 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 13974.0 |
24997 | 0.021313 | 0.106590 | -0.789149 | 0.544075 | 0.030988 | 0.022411 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1603.0 |
24998 | 0.179996 | 0.153285 | 0.825175 | 0.018868 | 0.022372 | 0.022516 | 0.022501 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 8104.0 |
24999 | 0.061483 | 0.253287 | -0.306561 | -1.298569 | 0.151962 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3896.0 |
5 rows × 4402 columns
%%time
# Sanity check: number of distinct cases and distinct class labels in the test split.
test_gdf['case_id'].nunique().compute(), test_gdf['class_vals'].nunique().compute()
CPU times: user 287 ms, sys: 17.1 ms, total: 304 ms Wall time: 900 ms
(25000, 10)
We reset the kernel!!!
%%time
# Tear down the Dask cluster created earlier in this notebook, then close the
# client connection.
# NOTE(review): shutdown() is expected to stop the scheduler and workers — confirm
# against the installed distributed version.
client.shutdown()
client.close()
CPU times: user 119 ms, sys: 78.1 ms, total: 197 ms Wall time: 30.5 s
from nbdev import nbdev_export
# Export the cells marked `#| export` to the library module.
# NOTE(review): target module is presumably the one named by `#| default_exp` in the first cell — confirm.
nbdev_export()