# | default_exp core
%load_ext autoreload
%autoreload 2
# declare a list of tasks whose products you want to use as inputs
upstream = None
# | export
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html",
    "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
    "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
    "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
    "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
    "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
    "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
    "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
    "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
    "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
    "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
}
!nvidia-smi
Fri Sep 23 18:47:32 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla T4            Off  | 00000000:00:1B.0 Off |                    0 |
| N/A   30C    P0    24W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:1C.0 Off |                    0 |
| N/A   31C    P0    25W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   2  Tesla T4            Off  | 00000000:00:1D.0 Off |                    0 |
| N/A   30C    P0    25W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   3  Tesla T4            Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   31C    P0    24W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
import torch
# select the GPU if one is available, else fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
Using device: cuda
This notebook contains common library code used across all subsequent notebooks.
# | export
import math
import os
# | export
import cudf
import dask_cudf
import numpy as np
# | hide
from nbdev.showdoc import *
# | export
from numba import cuda
from sktime.datasets import load_from_tsfile_to_dataframe
from sktime.datasets.tsc_dataset_names import multivariate
from sktime.datatypes._panel._convert import from_nested_to_multi_index
multivariate
['ArticularyWordRecognition', 'AsphaltObstaclesCoordinates', 'AsphaltPavementTypeCoordinates', 'AsphaltRegularityCoordinates', 'AtrialFibrillation', 'BasicMotions', 'CharacterTrajectories', 'Cricket', 'DuckDuckGeese', 'EigenWorms', 'Epilepsy', 'EthanolConcentration', 'ERing', 'FaceDetection', 'FingerMovements', 'HandMovementDirection', 'Handwriting', 'Heartbeat', 'InsectWingbeat', 'JapaneseVowels', 'Libras', 'LSST', 'MotorImagery', 'NATOPS', 'PenDigits', 'PEMS-SF', 'PhonemeSpectra', 'RacketSports', 'SelfRegulationSCP1', 'SelfRegulationSCP2', 'SpokenArabicDigits', 'StandWalkJump', 'UWaveGestureLibrary']
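The cells below download the full UEA multivariate archive, then stage only the five datasets used in the rest of this project: FaceDetection, InsectWingbeat, PenDigits, SpokenArabicDigits, and CharacterTrajectories.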
!rm -rf Multivariate2018_ts.zip
!rm -rf Multivariate_ts
!wget http://www.timeseriesclassification.com/Downloads/Archives/Multivariate2018_ts.zip
--2022-09-23 18:47:39--  http://www.timeseriesclassification.com/Downloads/Archives/Multivariate2018_ts.zip
Resolving www.timeseriesclassification.com (www.timeseriesclassification.com)... 109.123.71.232
Connecting to www.timeseriesclassification.com (www.timeseriesclassification.com)|109.123.71.232|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 887405508 (846M) [application/zip]
Saving to: ‘Multivariate2018_ts.zip’

100%[======================================>] 887,405,508  19.3MB/s   in 46s

2022-09-23 18:48:25 (18.5 MB/s) - ‘Multivariate2018_ts.zip’ saved [887405508/887405508]
!unzip Multivariate2018_ts.zip
Archive:  Multivariate2018_ts.zip
   creating: Multivariate_ts/
   creating: Multivariate_ts/ArticularyWordRecognition/
  inflating: Multivariate_ts/ArticularyWordRecognition/ArticularyWordRecognition_TEST.ts
  inflating: Multivariate_ts/ArticularyWordRecognition/ArticularyWordRecognition_TRAIN.ts
   creating: Multivariate_ts/AtrialFibrillation/
  inflating: Multivariate_ts/AtrialFibrillation/AtrialFibrillation_TEST.ts
  inflating: Multivariate_ts/AtrialFibrillation/AtrialFibrillation_TRAIN.ts
   creating: Multivariate_ts/BasicMotions/
  inflating: Multivariate_ts/BasicMotions/BasicMotions_TEST.ts
  inflating: Multivariate_ts/BasicMotions/BasicMotions_TRAIN.ts
   creating: Multivariate_ts/CharacterTrajectories/
  inflating: Multivariate_ts/CharacterTrajectories/CharacterTrajectories_TEST.ts
  inflating: Multivariate_ts/CharacterTrajectories/CharacterTrajectories_TRAIN.ts
   creating: Multivariate_ts/Cricket/
  inflating: Multivariate_ts/Cricket/Cricket_TEST.ts
  inflating: Multivariate_ts/Cricket/Cricket_TRAIN.ts
   creating: Multivariate_ts/DuckDuckGeese/
  inflating: Multivariate_ts/DuckDuckGeese/DuckDuckGeese_TEST.ts
  inflating: Multivariate_ts/DuckDuckGeese/DuckDuckGeese_TRAIN.ts
   creating: Multivariate_ts/EigenWorms/
  inflating: Multivariate_ts/EigenWorms/EigenWorms_TEST.ts
  inflating: Multivariate_ts/EigenWorms/EigenWorms_TRAIN.ts
   creating: Multivariate_ts/Epilepsy/
  inflating: Multivariate_ts/Epilepsy/Epilepsy_TEST.ts
  inflating: Multivariate_ts/Epilepsy/Epilepsy_TRAIN.ts
   creating: Multivariate_ts/ERing/
  inflating: Multivariate_ts/ERing/ERing_TEST.ts
  inflating: Multivariate_ts/ERing/ERing_TRAIN.ts
   creating: Multivariate_ts/EthanolConcentration/
  inflating: Multivariate_ts/EthanolConcentration/EthanolConcentration_TEST.ts
  inflating: Multivariate_ts/EthanolConcentration/EthanolConcentration_TRAIN.ts
   creating: Multivariate_ts/FaceDetection/
  inflating: Multivariate_ts/FaceDetection/FaceDetection_TEST.ts
  inflating: Multivariate_ts/FaceDetection/FaceDetection_TRAIN.ts
   creating: Multivariate_ts/FingerMovements/
  inflating: Multivariate_ts/FingerMovements/FingerMovements_TEST.ts
  inflating: Multivariate_ts/FingerMovements/FingerMovements_TRAIN.ts
   creating: Multivariate_ts/HandMovementDirection/
  inflating: Multivariate_ts/HandMovementDirection/HandMovementDirection_TEST.ts
  inflating: Multivariate_ts/HandMovementDirection/HandMovementDirection_TRAIN.ts
   creating: Multivariate_ts/Handwriting/
  inflating: Multivariate_ts/Handwriting/Handwriting_TEST.ts
  inflating: Multivariate_ts/Handwriting/Handwriting_TRAIN.ts
   creating: Multivariate_ts/Heartbeat/
  inflating: Multivariate_ts/Heartbeat/Heartbeat_TEST.ts
  inflating: Multivariate_ts/Heartbeat/Heartbeat_TRAIN.ts
   creating: Multivariate_ts/InsectWingbeat/
  inflating: Multivariate_ts/InsectWingbeat/InsectWingbeat_TEST.ts
  inflating: Multivariate_ts/InsectWingbeat/InsectWingbeat_TRAIN.ts
   creating: Multivariate_ts/JapaneseVowels/
  inflating: Multivariate_ts/JapaneseVowels/JapaneseVowels_TEST.ts
  inflating: Multivariate_ts/JapaneseVowels/JapaneseVowels_TRAIN.ts
   creating: Multivariate_ts/Libras/
  inflating: Multivariate_ts/Libras/Libras_TEST.ts
  inflating: Multivariate_ts/Libras/Libras_TRAIN.ts
   creating: Multivariate_ts/LSST/
  inflating: Multivariate_ts/LSST/LSST_TEST.ts
  inflating: Multivariate_ts/LSST/LSST_TRAIN.ts
   creating: Multivariate_ts/MotorImagery/
  inflating: Multivariate_ts/MotorImagery/MotorImagery_TEST.ts
  inflating: Multivariate_ts/MotorImagery/MotorImagery_TRAIN.ts
   creating: Multivariate_ts/NATOPS/
  inflating: Multivariate_ts/NATOPS/NATOPS_TEST.ts
  inflating: Multivariate_ts/NATOPS/NATOPS_TRAIN.ts
   creating: Multivariate_ts/PEMS-SF/
  inflating: Multivariate_ts/PEMS-SF/PEMS-SF_TEST.ts
  inflating: Multivariate_ts/PEMS-SF/PEMS-SF_TRAIN.ts
   creating: Multivariate_ts/PenDigits/
  inflating: Multivariate_ts/PenDigits/PenDigits_TEST.ts
  inflating: Multivariate_ts/PenDigits/PenDigits_TRAIN.ts
   creating: Multivariate_ts/PhonemeSpectra/
  inflating: Multivariate_ts/PhonemeSpectra/PhonemeSpectra_TEST.ts
  inflating: Multivariate_ts/PhonemeSpectra/PhonemeSpectra_TRAIN.ts
   creating: Multivariate_ts/RacketSports/
  inflating: Multivariate_ts/RacketSports/RacketSports_TEST.ts
  inflating: Multivariate_ts/RacketSports/RacketSports_TRAIN.ts
   creating: Multivariate_ts/SelfRegulationSCP1/
  inflating: Multivariate_ts/SelfRegulationSCP1/SelfRegulationSCP1_TEST.ts
  inflating: Multivariate_ts/SelfRegulationSCP1/SelfRegulationSCP1_TRAIN.ts
   creating: Multivariate_ts/SelfRegulationSCP2/
  inflating: Multivariate_ts/SelfRegulationSCP2/SelfRegulationSCP2_TEST.ts
  inflating: Multivariate_ts/SelfRegulationSCP2/SelfRegulationSCP2_TRAIN.ts
   creating: Multivariate_ts/SpokenArabicDigits/
  inflating: Multivariate_ts/SpokenArabicDigits/SpokenArabicDigits_TEST.ts
  inflating: Multivariate_ts/SpokenArabicDigits/SpokenArabicDigits_TRAIN.ts
   creating: Multivariate_ts/StandWalkJump/
  inflating: Multivariate_ts/StandWalkJump/StandWalkJump_TEST.ts
  inflating: Multivariate_ts/StandWalkJump/StandWalkJump_TRAIN.ts
   creating: Multivariate_ts/UWaveGestureLibrary/
  inflating: Multivariate_ts/UWaveGestureLibrary/UWaveGestureLibrary_TEST.ts
  inflating: Multivariate_ts/UWaveGestureLibrary/UWaveGestureLibrary_TRAIN.ts
!rm -rf Multivariate2018_ts.zip
!rm -rf output/FaceDetection
!rm -rf output/InsectWingbeat
!rm -rf output/PenDigits
!rm -rf output/SpokenArabicDigits
!rm -rf output/CharacterTrajectories
!mkdir -p output/FaceDetection/ts/train/
!mkdir -p output/FaceDetection/ts/test/
!mkdir -p output/InsectWingbeat/ts/train/
!mkdir -p output/InsectWingbeat/ts/test/
!mkdir -p output/PenDigits/ts/train/
!mkdir -p output/PenDigits/ts/test/
!mkdir -p output/SpokenArabicDigits/ts/train/
!mkdir -p output/SpokenArabicDigits/ts/test/
!mkdir -p output/CharacterTrajectories/ts/train/
!mkdir -p output/CharacterTrajectories/ts/test/
!mv ./Multivariate_ts/FaceDetection/FaceDetection_TRAIN.ts output/FaceDetection/ts/train/
!mv ./Multivariate_ts/FaceDetection/FaceDetection_TEST.ts output/FaceDetection/ts/test/
!mv ./Multivariate_ts/InsectWingbeat/InsectWingbeat_TRAIN.ts output/InsectWingbeat/ts/train/
!mv ./Multivariate_ts/InsectWingbeat/InsectWingbeat_TEST.ts output/InsectWingbeat/ts/test/
!mv ./Multivariate_ts/PenDigits/PenDigits_TRAIN.ts output/PenDigits/ts/train/
!mv ./Multivariate_ts/PenDigits/PenDigits_TEST.ts output/PenDigits/ts/test/
!mv ./Multivariate_ts/SpokenArabicDigits/SpokenArabicDigits_TRAIN.ts output/SpokenArabicDigits/ts/train/
!mv ./Multivariate_ts/SpokenArabicDigits/SpokenArabicDigits_TEST.ts output/SpokenArabicDigits/ts/test/
!mv ./Multivariate_ts/CharacterTrajectories/CharacterTrajectories_TRAIN.ts output/CharacterTrajectories/ts/train/
!mv ./Multivariate_ts/CharacterTrajectories/CharacterTrajectories_TEST.ts output/CharacterTrajectories/ts/test/
!rm -rf Multivariate_ts
# | export
def get_mtsc_data_tabular_from_ts(path):
    print("Reading dataset TS file...")
    data = load_from_tsfile_to_dataframe(
        str(path),
        return_separate_X_and_y=False,
    )
    print("Converting _x to tabular format...")
    data_x = data.loc[:, data.columns != "class_vals"]
    data_x_tabular = from_nested_to_multi_index(
        data_x, instance_index="case_id", time_index="reading_id"
    )
    data_x_tabular.reset_index(inplace=True)
    print("Converting _y to tabular format...")
    data_y_tabular = data["class_vals"].to_frame()
    data_y_tabular.reset_index(inplace=True)
    data_y_tabular = data_y_tabular.rename(columns={"index": "case_id"})
    print("Merging _x and _y...")
    return data_x_tabular.merge(data_y_tabular, how="inner")
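As a quick smoke test, the loader can be pointed at one of the files staged above. A minimal sketch (the feature columns in the result depend on the dataset; `case_id`, `reading_id`, and `class_vals` come from the conversion itself):
# Load one staged dataset and inspect its tabular shape.
fd_train_tabular = get_mtsc_data_tabular_from_ts(product["FaceDetection_TRAIN_TS"])
print(fd_train_tabular.shape)
fd_train_tabular.head()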
Tabular to Time-Series conversion routines
# | export
def cum_count(case_id_seq, cumcount):
    for i in range(cuda.threadIdx.x, len(case_id_seq), cuda.blockDim.x):
        cumcount[i] = i


def add_cum_count(gdf):
    return gdf.groupby(["case_id_seq"]).apply_grouped(
        cum_count, incols=["case_id_seq"], outcols=dict(cumcount=np.int32)
    )
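`add_cum_count` numbers the readings inside each `case_id_seq` group directly on the GPU (`apply_grouped` launches one CUDA block per group, with threads striding over the group's rows), mirroring what pandas' `groupby(...).cumcount()` does on the CPU. A toy sketch of the expected behaviour, using hypothetical data and requiring a CUDA-capable GPU:
toy = cudf.DataFrame({"case_id_seq": [1, 1, 1, 2, 2]})
# readings are numbered 0, 1, 2, ... within each case, in row order
print(add_cum_count(toy)["cumcount"].to_pandas().tolist())  # expected: [0, 1, 2, 0, 1]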
# | export
def convert_from_tabular_to_timeseries_format(
    input_dir,
    output_dir,
    all_columns,
    mtsc_column_names,
    chunk_size_processing,
    number_of_features,
    seq_len,
    chunk_size_file,
):
    # read the data into GPU memory
    data_gdf = dask_cudf.read_parquet(input_dir, columns=all_columns)
    # get the min and max case_id_seq
    case_id_seq_min, case_id_seq_max = (
        data_gdf.case_id_seq.min().compute(),
        data_gdf.case_id_seq.max().compute(),
    )
    print("case_id_seq_min: ", case_id_seq_min, "case_id_seq_max: ", case_id_seq_max)
    # total number of chunks
    total_chunks = math.ceil(case_id_seq_max / chunk_size_processing)
    print("Total number of chunks to be processed: ", total_chunks)
    # process the chunks one by one
    for chunk_num in range(total_chunks):
        min_idx = chunk_num * chunk_size_processing
        max_idx = (chunk_num + 1) * chunk_size_processing
        if max_idx > case_id_seq_max:
            max_idx = case_id_seq_max
        print(
            "Started processing chunk: ",
            chunk_num,
            " with case_id_seq from: ",
            min_idx,
            "to ",
            max_idx,
        )
        convert_from_tabular_to_timeseries_format_chunk(
            data_gdf,
            output_dir,
            chunk_num,
            min_idx,
            max_idx,
            number_of_features,
            seq_len,
            chunk_size_file,
            mtsc_column_names,
        )
        print(
            "Finished processing chunk: ",
            chunk_num,
            " with case_id_seq from: ",
            min_idx,
            "to ",
            max_idx,
        )
    del data_gdf
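A hypothetical invocation, to make the parameters concrete; the directories, column names, and sizes below are illustrative placeholders, not values fixed anywhere in this project:
convert_from_tabular_to_timeseries_format(
    input_dir="output/FaceDetection/tabular/train/",  # parquet files, one row per reading
    output_dir="output/FaceDetection/flat/train/",  # destination for the flattened chunks
    all_columns=["case_id_seq", "case_id", "reading_id", "dim_0", "dim_1", "class_vals"],
    mtsc_column_names=["dim_0", "dim_1"],  # feature columns to pivot
    chunk_size_processing=5000,  # cases per in-memory chunk
    number_of_features=2,
    seq_len=62,
    chunk_size_file=2000,  # rows per output parquet file
)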
# | export
def convert_from_tabular_to_timeseries_format_chunk(
    data_gdf,
    output_location,
    chunk_num,
    case_id_seq_min,
    case_id_seq_max,
    number_of_features,
    seq_len,
    chunk_size_file,
    mtsc_column_names,
):
    # only keep cases with case_id_seq >= case_id_seq_min
    dataset_gdf = data_gdf[data_gdf.case_id_seq >= case_id_seq_min]
    # only keep cases with case_id_seq <= case_id_seq_max
    dataset_gdf = dataset_gdf[dataset_gdf.case_id_seq <= case_id_seq_max]
    # convert to time-series format
    flattened_gdf = convert_tabular_to_fixed_length_time_series(
        dataset_gdf.compute(), seq_len, mtsc_column_names
    )
    # clean up
    del dataset_gdf
    # fix the data type
    flattened_gdf = flattened_gdf.astype("float64")
    # write to disk
    write_big_parquet_file_chunk_to_disk(
        flattened_gdf,
        chunk_num,
        output_location,
        number_of_features,
        seq_len,
        chunk_size_file,
    )
    # clean up
    del flattened_gdf
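Note that `dataset_gdf.compute()` materialises the filtered dask_cudf slice into a single cudf frame in GPU memory before the pivot; the chunking over `case_id_seq` in the caller is what keeps each materialised slice small enough to fit on the device.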
# | export
def write_big_parquet_file_chunk_to_disk(
    gdf, chunk_num, output_location, number_of_features, seq_len, chunk_size_file
):
    # each row must hold number_of_features * seq_len readings
    # plus the two bookkeeping columns (class_vals, case_id)
    if gdf.shape[1] != number_of_features * seq_len + 2:
        print(
            "Error: expected",
            number_of_features * seq_len + 2,
            "columns but got",
            gdf.shape[1],
            "- skipping!",
        )
        return
    total_rows = gdf.shape[0]
    file_count = math.ceil(total_rows / chunk_size_file)
    print("Total number of files to be created: ", file_count)
    for cnt in range(file_count):
        low = cnt * chunk_size_file
        high = (cnt + 1) * chunk_size_file
        if high > total_rows:
            high = total_rows
        output_file = (
            output_location
            + "/"
            + "chunk_"
            + str(chunk_num)
            + "_part_"
            + str(cnt)
            + ".parquet"
        )
        print(
            "Writing to output file: ",
            output_file,
            "with records from iloc: ",
            low,
            "to ",
            high,
        )
        gdf.iloc[low:high].to_pandas().to_parquet(
            output_file, engine="pyarrow", row_group_size=5000
        )
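The shape guard above enforces the invariant that each flattened row carries `number_of_features * seq_len` readings plus the two bookkeeping columns `class_vals` and `case_id`; e.g. with 2 features and a sequence length of 62, the frame must arrive with 2 * 62 + 2 = 126 columns.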
# | export
def add_missing_columns_for_uniform_length(gdf, mtsc_column_names, seq_len):
    ordered_columns = []
    for col in mtsc_column_names:
        ordered_columns += [col + "_" + str(i) for i in range(0, seq_len)]
    ordered_columns += ["class_vals", "case_id"]
    gdf = gdf.reindex(ordered_columns, axis=1)
    return gdf
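For illustration (hypothetical feature names): with `mtsc_column_names = ["dim_0", "dim_1"]` and `seq_len = 3`, the reindex yields the column order `dim_0_0, dim_0_1, dim_0_2, dim_1_0, dim_1_1, dim_1_2, class_vals, case_id`. Any feature/timestep column missing after the pivot (i.e. for series shorter than `seq_len`) comes back as nulls, which the caller then fills with zeros.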
# | export
def convert_tabular_to_fixed_length_time_series(gdf, seq_len, mtsc_column_names):
    # Step 1: Add cumcount and truncate each case to at most seq_len readings
    processed_cumcount_gdf = add_cum_count(gdf)
    print(
        "Before CumCount Min: ",
        processed_cumcount_gdf["cumcount"].min(),
        "CumCount Max: ",
        processed_cumcount_gdf["cumcount"].max(),
    )
    processed_cumcount_gdf = processed_cumcount_gdf[
        processed_cumcount_gdf.cumcount < seq_len
    ]
    print(
        "After CumCount Min: ",
        processed_cumcount_gdf["cumcount"].min(),
        "CumCount Max: ",
        processed_cumcount_gdf["cumcount"].max(),
    )
    processed_cumcount_gdf = processed_cumcount_gdf.sort_values(
        ["case_id_seq", "reading_id"], ascending=(True, True)
    )
    print("sorted")
    # Step 2: Get the label and case_id for each case
    target = processed_cumcount_gdf.groupby("case_id_seq")["class_vals"].max()
    case_id = processed_cumcount_gdf.groupby("case_id_seq")["case_id"].max()
    # Step 3: Pivot to a fixed-length time series (one row per case)
    flattened_gdf = processed_cumcount_gdf.pivot(
        index="case_id_seq", columns="reading_id", values=mtsc_column_names
    ).assign(class_vals=target, case_id=case_id)
    # Step 4: Flatten the (feature, timestep) MultiIndex into feature_timestep names
    columns_list_tuple = flattened_gdf.columns.to_list()
    columns_list_tuple.remove("case_id")
    columns_list_tuple.remove("class_vals")
    multi_index_columns = cudf.MultiIndex.from_tuples(columns_list_tuple)
    expanded_columns = (
        multi_index_columns.get_level_values(0).astype(str)
        + "_"
        + multi_index_columns.get_level_values(1).astype(str)
    )
    expanded_columns = expanded_columns.append("class_vals")
    expanded_columns = expanded_columns.append("case_id")
    flattened_gdf.columns = expanded_columns.to_pandas()
    # Step 5: Add any missing columns (as N/A) to maintain a uniform length
    flattened_gdf = add_missing_columns_for_uniform_length(
        flattened_gdf, mtsc_column_names, seq_len
    )
    # Step 6: Fill the N/A values with zeros
    flattened_gdf.fillna(0, inplace=True)
    # Step 7: Reset the index and shuffle the rows
    flattened_gdf.reset_index(drop=True, inplace=True)
    flattened_gdf = flattened_gdf.sample(frac=1).reset_index(drop=True)
    print("flattened_gdf.shape: ", flattened_gdf.shape)
    return flattened_gdf
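Putting the pieces together on a toy frame shows the intended shape change; this is a hedged sketch with hypothetical data (two cases, two readings each, one feature column) and requires a CUDA GPU with RAPIDS installed:
toy_gdf = cudf.DataFrame(
    {
        "case_id_seq": [1, 1, 2, 2],
        "case_id": [101, 101, 102, 102],
        "reading_id": [0, 1, 0, 1],
        "dim_0": [0.1, 0.2, 0.3, 0.4],
        "class_vals": [0.0, 0.0, 1.0, 1.0],
    }
)
toy_flat = convert_tabular_to_fixed_length_time_series(toy_gdf, 2, ["dim_0"])
# expected: one row per case, with columns dim_0_0, dim_0_1, class_vals, case_id
print(toy_flat.shape)  # expected: (2, 4)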
from nbdev import nbdev_export
nbdev_export()