#| default_exp feature_preprocessing.insect_wingbeat.target_encoding
# Auto-reload edited vitmtsc modules while developing this notebook.
%load_ext autoreload
%autoreload 2
# declare a list tasks whose products you want to use as inputs
upstream = ['parquet_conversion_insect_wingbeat']
# Parameters
# NOTE(review): this "Parameters" cell is injected by ploomber at runtime and
# overrides the declaration above with the upstream task's concrete product paths.
upstream = {"parquet_conversion_insect_wingbeat": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/102_data.insect_wingbeat.html", "InsectWingbeat_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/train", "InsectWingbeat_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/valid", "InsectWingbeat_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/test"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/202_feature_preprocessing.insect_wingbeat.target_encoding.html", "InsectWingbeat_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/train", "InsectWingbeat_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/valid", "InsectWingbeat_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/test", "InsectWingbeat_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/nvtabular_workflow"}
#| hide
from nbdev.showdoc import *
#| export
from vitmtsc import *
from vitmtsc.core import *
from vitmtsc.data.insect_wingbeat import *
import os
import nvtabular as nvt
import dask_cudf
from nvtabular import ops
#| export
# Re-declare the pipeline paths under `#| export` so they are baked into the
# generated vitmtsc module as well, not only available inside this notebook.
upstream = {
"parquet_conversion_insect_wingbeat": {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/102_data.insect_wingbeat.html",
"InsectWingbeat_TRAIN_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/train",
"InsectWingbeat_VALID_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/valid",
"InsectWingbeat_TEST_RAW": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/raw/test",
}
}
# Output locations this task produces: transformed train/valid/test parquet
# directories plus the saved NVTabular workflow artifact.
product = {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/202_feature_preprocessing.insect_wingbeat.target_encoding.html",
"InsectWingbeat_TRAIN_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/train",
"InsectWingbeat_VALID_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/valid",
"InsectWingbeat_TEST_TE": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/test",
"InsectWingbeat_workflow_dir": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/target_encoding/nvtabular_workflow",
}
# Record the installed nvtabular version for reproducibility.
!conda list|grep -i nvtabular
Preprocessing steps applied in this notebook:
- Fill missing continuous features
- Normalize continuous features
- Categorify categorical features
- Target-encode categorical features
# Spin up a local multi-GPU Dask cluster for NVTabular's distributed transforms.
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
# device_memory_limit=0.5 -> spill from GPU when half full;
# rmm_pool_size='20GB' + rmm_managed_memory=True -> pre-allocate an RMM pool
# backed by CUDA managed memory to reduce allocation overhead/fragmentation.
cluster = LocalCUDACluster(memory_limit='auto', device_memory_limit=0.5, rmm_pool_size='20GB', rmm_managed_memory=True)
client = Client(cluster)
client
2022-09-23 19:34:30,292 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:34:30,292 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 19:34:30,307 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:34:30,307 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 19:34:30,363 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:34:30,363 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize 2022-09-23 19:34:30,363 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize 2022-09-23 19:34:30,363 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
Client-beb64617-3b76-11ed-8119-06c0bb745397
Connection method: Cluster object | Cluster type: dask_cuda.LocalCUDACluster |
Dashboard: http://127.0.0.1:8787/status |
acd2c3b8
Dashboard: http://127.0.0.1:8787/status | Workers: 4 |
Total threads: 4 | Total memory: 150.00 GiB |
Status: running | Using processes: True |
Scheduler-ecdb14e6-5f1a-40c6-af46-223c8ee2697a
Comm: tcp://127.0.0.1:45091 | Workers: 4 |
Dashboard: http://127.0.0.1:8787/status | Total threads: 4 |
Started: Just now | Total memory: 150.00 GiB |
Comm: tcp://127.0.0.1:38929 | Total threads: 1 |
Dashboard: http://127.0.0.1:42473/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:35801 | |
Local directory: /tmp/dask-worker-space/worker-w7lc37wn | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:41259 | Total threads: 1 |
Dashboard: http://127.0.0.1:35595/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:38615 | |
Local directory: /tmp/dask-worker-space/worker-low2vw4f | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:44093 | Total threads: 1 |
Dashboard: http://127.0.0.1:39881/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:36943 | |
Local directory: /tmp/dask-worker-space/worker-4jb7ou_h | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
Comm: tcp://127.0.0.1:41869 | Total threads: 1 |
Dashboard: http://127.0.0.1:45773/status | Memory: 37.50 GiB |
Nanny: tcp://127.0.0.1:39915 | |
Local directory: /tmp/dask-worker-space/worker-rii4x320 | |
GPU: Tesla T4 | GPU memory: 14.76 GiB |
COLUMNS: CATEGORICAL, CONTINUOUS and TARGET
#| export
import numpy as np
# Identifier columns that are carried through the workflow unchanged.
CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING = ['case_id', 'case_id_seq', 'reading_id']
#| export
# InsectWingbeat has no categorical features requiring target encoding,
# so this list is intentionally empty (see the disabled te_features below).
CATEGORICAL_COLUMNS_NEED_ENCODING = [
]
#| export
# The InsectWingbeat parquet data carries 200 continuous channels per reading,
# stored as columns dim_0 .. dim_199; these are the features that receive
# FillMissing + Normalize in the workflow below.
# Generated programmatically instead of hand-listing 200 string literals,
# which is equivalent but DRY and immune to typos/omissions.
CONTINUOUS_COLUMNS = [f'dim_{i}' for i in range(200)]
#| export
# Target column: the class label for each case.
LABEL_COLUMNS = ['class_vals']
Workflow and Operations
import cudf
import numpy as np
# Pass-through column group for the identifier columns (no transformation).
cat_features_no_encoding = nvt.ColumnGroup(CATEGORICAL_COLUMNS_DONOT_NEED_ENCODING)
# Target encoding is disabled for this dataset: CATEGORICAL_COLUMNS_NEED_ENCODING is empty.
#te_features = CATEGORICAL_COLUMNS_NEED_ENCODING >> ops.TargetEncoding(LABEL_COLUMNS, kfold=5, fold_seed=42, p_smooth=20)
# Continuous features: impute missing values, then standardize.
cont_features = CONTINUOUS_COLUMNS >> ops.FillMissing() >> ops.Normalize()
label_name = LABEL_COLUMNS
# The workflow graph: identifiers + normalized continuous features + label.
workflow = nvt.Workflow(
#cat_features_no_encoding + te_features + cont_features + label_name
#cat_features_no_encoding + te_features + label_name
cat_features_no_encoding + cont_features + label_name
)
Datasets
Input data
# Raw parquet directories produced by the upstream conversion task.
# NOTE(review): os.path.join("./", p) is a no-op when p is absolute — the
# absolute path wins; the "./" prefix can be dropped. Kept as-is here.
pre_processed_train_dir = os.path.join("./", upstream['parquet_conversion_insect_wingbeat']['InsectWingbeat_TRAIN_RAW'])
pre_processed_valid_dir = os.path.join("./", upstream['parquet_conversion_insect_wingbeat']['InsectWingbeat_VALID_RAW'])
pre_processed_test_dir = os.path.join("./", upstream['parquet_conversion_insect_wingbeat']['InsectWingbeat_TEST_RAW'])
Training, Validation and Test datasets
# Wrap each split as a lazy NVTabular Dataset backed by parquet files.
train_dataset = nvt.Dataset(pre_processed_train_dir, engine='parquet')
valid_dataset = nvt.Dataset(pre_processed_valid_dir, engine='parquet')
test_dataset = nvt.Dataset(pre_processed_test_dir, engine='parquet')
Output location
# Destination directories for the transformed (target-encoding task) output.
output_train_dir = os.path.join("./", product['InsectWingbeat_TRAIN_TE'])
output_valid_dir = os.path.join("./", product['InsectWingbeat_VALID_TE'])
output_test_dir = os.path.join("./", product['InsectWingbeat_TEST_TE'])
# Ensure the output directories exist before writing parquet shards.
!mkdir -p $output_train_dir
!mkdir -p $output_valid_dir
!mkdir -p $output_test_dir
Path to save the workflow to
%%time
# Compute the transform statistics (e.g. normalization mean/std) on the
# TRAINING split only, to avoid leaking valid/test statistics.
workflow.fit(train_dataset)
CPU times: user 594 ms, sys: 17.6 ms, total: 612 ms Wall time: 3.3 s
<nvtabular.workflow.workflow.Workflow at 0x7f25ad56ce50>
Save workflow
%%time
# Persist the fitted workflow so downstream tasks reuse identical statistics.
workflow.save(product['InsectWingbeat_workflow_dir'])
CPU times: user 6.66 ms, sys: 0 ns, total: 6.66 ms Wall time: 6.26 ms
Clear workflow
%%time
# Drop the in-memory workflow to verify the saved artifact is self-contained.
workflow = None
CPU times: user 1e+03 ns, sys: 2 µs, total: 3 µs Wall time: 5.48 µs
Load workflow
%%time
# Reload the workflow from disk, attaching the Dask client for distributed transforms.
workflow = nvt.Workflow.load(product['InsectWingbeat_workflow_dir'], client=client)
CPU times: user 3.36 ms, sys: 0 ns, total: 3.36 ms Wall time: 3.03 ms
%%time
# Write to new "shuffled" and "processed" dataset
# Apply the fitted transforms to the training split and write shuffled
# parquet shards (2 files per worker process, shuffled within partitions).
workflow.transform(train_dataset).to_parquet(
output_train_dir,
out_files_per_proc=2,
shuffle=nvt.io.Shuffle.PER_PARTITION,
)
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/merlin/io/dataset.py:862: UserWarning: Only created 4 files did not have enough partitions to create 8 files. warnings.warn(
CPU times: user 498 ms, sys: 47.7 ms, total: 546 ms Wall time: 3.35 s
%%time
# Write to new "shuffled" and "processed" dataset
# Same transform as the training split, applied to validation data.
workflow.transform(valid_dataset).to_parquet(
output_valid_dir,
out_files_per_proc=2,
shuffle=nvt.io.Shuffle.PER_PARTITION,
)
CPU times: user 277 ms, sys: 23.5 ms, total: 301 ms Wall time: 968 ms
%%time
# Write to new "shuffled" and "processed" dataset
# Same transform as the training split, applied to test data.
workflow.transform(test_dataset).to_parquet(
output_test_dir,
out_files_per_proc=2,
shuffle=nvt.io.Shuffle.PER_PARTITION,
)
CPU times: user 290 ms, sys: 17.3 ms, total: 307 ms Wall time: 1.19 s
Verify Data
# Read the transformed training output back for a sanity check.
train_gdf = dask_cudf.read_parquet(output_train_dir)
%%time
# Preview the first rows: normalized dims + pass-through ids + label.
train_gdf.head()
CPU times: user 25.3 ms, sys: 5.44 ms, total: 30.8 ms Wall time: 76.1 ms
dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | dim_8 | dim_9 | ... | dim_194 | dim_195 | dim_196 | dim_197 | dim_198 | dim_199 | case_id | case_id_seq | reading_id | class_vals | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.050804 | 0.681181 | 1.013231 | 0.242954 | -0.423010 | -0.270492 | 0.262974 | 1.500984 | 0.192567 | 5.167649 | ... | -0.206104 | -0.011772 | -0.254361 | -0.005969 | -0.269680 | -0.269680 | 15353 | 15353 | 4 | 6 |
1 | -1.718440 | 0.339011 | -0.349984 | 0.425720 | -0.485362 | 0.009983 | 0.042684 | 0.227740 | -0.005105 | -0.028321 | ... | 0.337281 | 0.417158 | 0.429991 | 0.458928 | 0.421419 | 0.421419 | 13654 | 13654 | 2 | 5 |
2 | -0.117013 | -0.965658 | -0.611225 | -1.249610 | -0.698549 | -0.020280 | 0.021620 | 0.216538 | 0.081172 | -0.065212 | ... | 0.000798 | -0.005521 | 0.001085 | -0.010480 | -0.000145 | -0.000145 | 13751 | 13751 | 2 | 5 |
3 | -0.192365 | -0.071957 | -0.006036 | 0.002622 | 0.008681 | 0.007983 | 0.004411 | -0.009100 | -0.005111 | -0.001968 | ... | -0.009080 | 0.014464 | 0.031121 | 0.018526 | -0.025577 | -0.025577 | 13334 | 13334 | 7 | 5 |
4 | 0.044327 | 0.073654 | -0.158428 | 0.054091 | 0.001096 | -0.015807 | 0.002900 | -0.028916 | 0.000247 | 0.006576 | ... | 0.117479 | 0.016841 | 0.082249 | 0.021918 | 0.031442 | 0.031442 | 13873 | 13873 | 3 | 5 |
5 rows × 204 columns
%%time
# Unique training cases — expected 20000 per the dataset split.
train_gdf['case_id'].nunique().compute()
CPU times: user 128 ms, sys: 4.11 ms, total: 132 ms Wall time: 163 ms
20000
# Read the transformed validation output back for a sanity check.
valid_gdf = dask_cudf.read_parquet(output_valid_dir)
%%time
valid_gdf.head()
CPU times: user 19.7 ms, sys: 7.71 ms, total: 27.4 ms Wall time: 65.2 ms
dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | dim_8 | dim_9 | ... | dim_194 | dim_195 | dim_196 | dim_197 | dim_198 | dim_199 | case_id | case_id_seq | reading_id | class_vals | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.087206 | -0.801269 | -0.991575 | -0.116312 | 0.672066 | 0.510439 | 0.475757 | 0.319095 | 0.072277 | 0.011849 | ... | -0.164485 | 0.102474 | 0.674178 | 0.869020 | 1.230336 | 1.230336 | 14233 | 14233 | 1 | 5 |
1 | -0.158837 | 0.387723 | -1.088610 | 1.381328 | -0.943876 | 0.173133 | 0.242806 | 0.157767 | -0.320208 | 0.156779 | ... | -0.156080 | -0.803016 | 0.009903 | 0.703609 | -0.092156 | -0.092156 | 13060 | 13060 | 6 | 5 |
2 | 0.026978 | -0.035676 | 0.013953 | 0.011395 | 0.003562 | 0.003706 | 0.007913 | 0.005006 | 0.005156 | 0.003744 | ... | 0.378011 | 0.432359 | 0.469225 | 0.452600 | 0.461441 | 0.461441 | 14944 | 14944 | 4 | 5 |
3 | 0.461530 | 0.079703 | 0.053388 | -0.002319 | -0.037487 | 0.031856 | 0.084729 | -0.134924 | 0.018036 | -0.003846 | ... | 0.402160 | -0.438912 | 0.495201 | -0.486520 | 0.437103 | 0.437103 | 13811 | 13811 | 2 | 5 |
4 | 0.085716 | -2.339949 | 0.399482 | -2.581492 | -2.611039 | -1.371416 | 0.301281 | -1.423098 | -0.351432 | 0.012843 | ... | -0.512978 | -0.410945 | -0.469623 | -0.467898 | -0.730986 | -0.730986 | 13060 | 13060 | 5 | 5 |
5 rows × 204 columns
%%time
# Unique validation cases — expected 5000 per the dataset split.
valid_gdf['case_id'].nunique().compute()
CPU times: user 27.6 ms, sys: 903 µs, total: 28.5 ms Wall time: 65.2 ms
5000
# Read the transformed test output back for a sanity check.
test_gdf = dask_cudf.read_parquet(output_test_dir)
%%time
test_gdf.head()
CPU times: user 28.2 ms, sys: 0 ns, total: 28.2 ms Wall time: 75.9 ms
dim_0 | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | dim_8 | dim_9 | ... | dim_194 | dim_195 | dim_196 | dim_197 | dim_198 | dim_199 | case_id | case_id_seq | reading_id | class_vals | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.387993 | -1.048660 | 0.399718 | -0.665260 | 0.730515 | -0.285534 | -0.044396 | -0.104056 | 0.009882 | -0.024395 | ... | -0.018475 | -0.001824 | 0.039938 | 0.036966 | -0.082159 | -0.082159 | 1892 | 1892 | 5 | 0 |
1 | 0.061857 | -0.049819 | 0.009652 | 0.010162 | 0.001651 | -0.030991 | 0.017266 | -0.006341 | 0.002815 | -0.001528 | ... | 0.018218 | -0.019667 | 0.016658 | -0.011813 | 0.023287 | 0.023287 | 2304 | 2304 | 2 | 0 |
2 | 0.042386 | -0.039667 | 0.009262 | 0.028066 | -0.033792 | -0.001098 | 0.003418 | -0.000908 | 0.000556 | -0.000932 | ... | 0.006715 | 0.005337 | 0.009046 | 0.004719 | -0.003957 | -0.003957 | 2133 | 2133 | 0 | 0 |
3 | 0.197399 | -0.240693 | 0.022010 | -0.109015 | -0.050297 | -0.015284 | -0.010694 | 0.000871 | -0.001049 | 0.001703 | ... | -0.015352 | 0.018601 | -0.027396 | -0.050902 | 0.150228 | 0.150228 | 1913 | 1913 | 1 | 0 |
4 | 0.477535 | -0.007621 | -0.023894 | 0.022317 | -0.084003 | -0.071894 | 0.050038 | 0.004446 | -0.009515 | -0.004554 | ... | -0.041481 | -0.066504 | -0.068311 | -0.046724 | -0.051759 | -0.051759 | 850 | 850 | 4 | 0 |
5 rows × 204 columns
%%time
# Unique test cases — expected 25000 per the dataset split.
test_gdf['case_id'].nunique().compute()
CPU times: user 28 ms, sys: 0 ns, total: 28 ms Wall time: 66.2 ms
25000
# Confirm the expected 204 columns survived: 200 dims + 3 ids + class_vals.
test_gdf.columns
Index(['dim_0', 'dim_1', 'dim_2', 'dim_3', 'dim_4', 'dim_5', 'dim_6', 'dim_7', 'dim_8', 'dim_9', ... 'dim_194', 'dim_195', 'dim_196', 'dim_197', 'dim_198', 'dim_199', 'case_id', 'case_id_seq', 'reading_id', 'class_vals'], dtype='object', length=204)
# Inspect on-disk size of the transformed training split.
!ls -lrt --block-size=M $output_train_dir
total 199M -rw-r--r-- 1 root root 1M Sep 23 19:34 schema.pbtxt -rw-r--r-- 1 root root 25M Sep 23 19:34 part_3.parquet -rw-r--r-- 1 root root 25M Sep 23 19:34 part_2.parquet -rw-r--r-- 1 root root 26M Sep 23 19:34 part_1.parquet -rw-r--r-- 1 root root 26M Sep 23 19:34 part_0.parquet -rw-r--r-- 1 root root 25M Sep 23 19:34 part_7.parquet -rw-r--r-- 1 root root 25M Sep 23 19:34 part_6.parquet -rw-r--r-- 1 root root 25M Sep 23 19:34 part_5.parquet -rw-r--r-- 1 root root 25M Sep 23 19:34 part_4.parquet -rw-r--r-- 1 root root 1M Sep 23 19:34 _metadata.json -rw-r--r-- 1 root root 1M Sep 23 19:34 _file_list.txt -rw-r--r-- 1 root root 1M Sep 23 19:34 _metadata
# Inspect on-disk size of the transformed validation split.
!ls -lrt --block-size=M $output_valid_dir
total 53M -rw-r--r-- 1 root root 1M Sep 23 19:34 schema.pbtxt -rw-r--r-- 1 root root 7M Sep 23 19:34 part_1.parquet -rw-r--r-- 1 root root 7M Sep 23 19:34 part_0.parquet -rw-r--r-- 1 root root 7M Sep 23 19:34 part_5.parquet -rw-r--r-- 1 root root 7M Sep 23 19:34 part_4.parquet -rw-r--r-- 1 root root 7M Sep 23 19:34 part_7.parquet -rw-r--r-- 1 root root 7M Sep 23 19:34 part_6.parquet -rw-r--r-- 1 root root 7M Sep 23 19:34 part_3.parquet -rw-r--r-- 1 root root 7M Sep 23 19:34 part_2.parquet -rw-r--r-- 1 root root 1M Sep 23 19:34 _metadata.json -rw-r--r-- 1 root root 1M Sep 23 19:34 _file_list.txt -rw-r--r-- 1 root root 1M Sep 23 19:34 _metadata
# Inspect on-disk size of the transformed test split.
!ls -lrt --block-size=M $output_test_dir
total 242M -rw-r--r-- 1 root root 1M Sep 23 19:34 schema.pbtxt -rw-r--r-- 1 root root 30M Sep 23 19:34 part_7.parquet -rw-r--r-- 1 root root 30M Sep 23 19:34 part_6.parquet -rw-r--r-- 1 root root 32M Sep 23 19:34 part_5.parquet -rw-r--r-- 1 root root 31M Sep 23 19:34 part_4.parquet -rw-r--r-- 1 root root 31M Sep 23 19:34 part_3.parquet -rw-r--r-- 1 root root 31M Sep 23 19:34 part_2.parquet -rw-r--r-- 1 root root 31M Sep 23 19:34 part_1.parquet -rw-r--r-- 1 root root 31M Sep 23 19:34 part_0.parquet -rw-r--r-- 1 root root 1M Sep 23 19:34 _metadata.json -rw-r--r-- 1 root root 1M Sep 23 19:34 _file_list.txt -rw-r--r-- 1 root root 1M Sep 23 19:34 _metadata
We now shut down the Dask cluster and reset the kernel.
%%time
# Tear down the Dask cluster and client. The asyncio CancelledError traceback
# printed below appears to be a benign side effect of shutting down the
# scheduler while the client reconnect loop is sleeping — TODO confirm.
client.shutdown()
client.close()
Traceback (most recent call last): File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper return await func(*args, **kwargs) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect await self._ensure_connected(timeout=timeout) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected comm = await connect( File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect await asyncio.sleep(backoff) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep return await future asyncio.exceptions.CancelledError Traceback (most recent call last): File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper return await func(*args, **kwargs) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1400, in _handle_report await self._reconnect() File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/utils.py", line 778, in wrapper return await func(*args, **kwargs) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1211, in _reconnect await self._ensure_connected(timeout=timeout) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/client.py", line 1241, in _ensure_connected comm = await connect( File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/distributed/comm/core.py", line 315, in connect await asyncio.sleep(backoff) File "/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/asyncio/tasks.py", line 659, in sleep 
return await future asyncio.exceptions.CancelledError
CPU times: user 41.9 ms, sys: 6.1 ms, total: 48 ms Wall time: 670 ms
# Export all `#| export` cells of this notebook into the vitmtsc package.
from nbdev import nbdev_export
nbdev_export()