In [1]:
# | default_exp core
%load_ext autoreload
%autoreload 2
In [2]:
# declare a list of tasks whose products you want to use as inputs
upstream = None
In [3]:
# Parameters
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html", "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts", "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts", "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts", "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts", "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts", "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts", "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts", "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts", "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts", "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts"}
In [4]:
# |export
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/00_core.html",
    "FaceDetection_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/train/FaceDetection_TRAIN.ts",
    "FaceDetection_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ts/test/FaceDetection_TEST.ts",
    "InsectWingbeat_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/train/InsectWingbeat_TRAIN.ts",
    "InsectWingbeat_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/InsectWingbeat/ts/test/InsectWingbeat_TEST.ts",
    "PenDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/train/PenDigits_TRAIN.ts",
    "PenDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/PenDigits/ts/test/PenDigits_TEST.ts",
    "SpokenArabicDigits_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/train/SpokenArabicDigits_TRAIN.ts",
    "SpokenArabicDigits_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ts/test/SpokenArabicDigits_TEST.ts",
    "CharacterTrajectories_TRAIN_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/train/CharacterTrajectories_TRAIN.ts",
    "CharacterTrajectories_TEST_TS": "/home/ubuntu/vitmtsc_nbdev/output/CharacterTrajectories/ts/test/CharacterTrajectories_TEST.ts",
}
In [5]:
!nvidia-smi
Fri Sep 23 18:47:32 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla T4            Off  | 00000000:00:1B.0 Off |                    0 |
| N/A   30C    P0    24W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:1C.0 Off |                    0 |
| N/A   31C    P0    25W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   2  Tesla T4            Off  | 00000000:00:1D.0 Off |                    0 |
| N/A   30C    P0    25W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   3  Tesla T4            Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   31C    P0    24W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
In [6]:
import torch
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
Using device: cuda

Common Code and Utility Functions for Multivariate TSC datasets
¶

This notebook contains common library code used across all subsequent notebooks.

In [7]:
# |export
import math
import os
In [8]:
# |export
import cudf
import dask_cudf
import numpy as np
In [9]:
# |hide
from nbdev.showdoc import *
In [10]:
# |export
from numba import cuda
from sktime.datasets import load_from_tsfile_to_dataframe
from sktime.datasets.tsc_dataset_names import multivariate
from sktime.datatypes._panel._convert import from_nested_to_multi_index
In [11]:
multivariate
Out[11]:
['ArticularyWordRecognition',
 'AsphaltObstaclesCoordinates',
 'AsphaltPavementTypeCoordinates',
 'AsphaltRegularityCoordinates',
 'AtrialFibrillation',
 'BasicMotions',
 'CharacterTrajectories',
 'Cricket',
 'DuckDuckGeese',
 'EigenWorms',
 'Epilepsy',
 'EthanolConcentration',
 'ERing',
 'FaceDetection',
 'FingerMovements',
 'HandMovementDirection',
 'Handwriting',
 'Heartbeat',
 'InsectWingbeat',
 'JapaneseVowels',
 'Libras',
 'LSST',
 'MotorImagery',
 'NATOPS',
 'PenDigits',
 'PEMS-SF',
 'PhonemeSpectra',
 'RacketSports',
 'SelfRegulationSCP1',
 'SelfRegulationSCP2',
 'SpokenArabicDigits',
 'StandWalkJump',
 'UWaveGestureLibrary']
In [12]:
!rm -rf Multivariate2018_ts.zip 
In [13]:
!rm -rf Multivariate_ts
In [14]:
!wget http://www.timeseriesclassification.com/Downloads/Archives/Multivariate2018_ts.zip
--2022-09-23 18:47:39--  http://www.timeseriesclassification.com/Downloads/Archives/Multivariate2018_ts.zip
Resolving www.timeseriesclassification.com (www.timeseriesclassification.com)... 109.123.71.232
Connecting to www.timeseriesclassification.com (www.timeseriesclassification.com)|109.123.71.232|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 887405508 (846M) [application/zip]
Saving to: ‘Multivariate2018_ts.zip’

100%[======================================>] 887,405,508 19.3MB/s   in 46s    

2022-09-23 18:48:25 (18.5 MB/s) - ‘Multivariate2018_ts.zip’ saved [887405508/887405508]

In [15]:
!unzip Multivariate2018_ts.zip 
Archive:  Multivariate2018_ts.zip
   creating: Multivariate_ts/
   creating: Multivariate_ts/ArticularyWordRecognition/
  inflating: Multivariate_ts/ArticularyWordRecognition/ArticularyWordRecognition_TEST.ts  
  inflating: Multivariate_ts/ArticularyWordRecognition/ArticularyWordRecognition_TRAIN.ts  
   creating: Multivariate_ts/AtrialFibrillation/
  inflating: Multivariate_ts/AtrialFibrillation/AtrialFibrillation_TEST.ts  
  inflating: Multivariate_ts/AtrialFibrillation/AtrialFibrillation_TRAIN.ts  
   creating: Multivariate_ts/BasicMotions/
  inflating: Multivariate_ts/BasicMotions/BasicMotions_TEST.ts  
  inflating: Multivariate_ts/BasicMotions/BasicMotions_TRAIN.ts  
   creating: Multivariate_ts/CharacterTrajectories/
  inflating: Multivariate_ts/CharacterTrajectories/CharacterTrajectories_TEST.ts  
  inflating: Multivariate_ts/CharacterTrajectories/CharacterTrajectories_TRAIN.ts  
   creating: Multivariate_ts/Cricket/
  inflating: Multivariate_ts/Cricket/Cricket_TEST.ts  
  inflating: Multivariate_ts/Cricket/Cricket_TRAIN.ts  
   creating: Multivariate_ts/DuckDuckGeese/
  inflating: Multivariate_ts/DuckDuckGeese/DuckDuckGeese_TEST.ts  
  inflating: Multivariate_ts/DuckDuckGeese/DuckDuckGeese_TRAIN.ts  
   creating: Multivariate_ts/EigenWorms/
  inflating: Multivariate_ts/EigenWorms/EigenWorms_TEST.ts  
  inflating: Multivariate_ts/EigenWorms/EigenWorms_TRAIN.ts  
   creating: Multivariate_ts/Epilepsy/
  inflating: Multivariate_ts/Epilepsy/Epilepsy_TEST.ts  
  inflating: Multivariate_ts/Epilepsy/Epilepsy_TRAIN.ts  
   creating: Multivariate_ts/ERing/
  inflating: Multivariate_ts/ERing/ERing_TEST.ts  
  inflating: Multivariate_ts/ERing/ERing_TRAIN.ts  
   creating: Multivariate_ts/EthanolConcentration/
  inflating: Multivariate_ts/EthanolConcentration/EthanolConcentration_TEST.ts  
  inflating: Multivariate_ts/EthanolConcentration/EthanolConcentration_TRAIN.ts  
   creating: Multivariate_ts/FaceDetection/
  inflating: Multivariate_ts/FaceDetection/FaceDetection_TEST.ts  
  inflating: Multivariate_ts/FaceDetection/FaceDetection_TRAIN.ts  
   creating: Multivariate_ts/FingerMovements/
  inflating: Multivariate_ts/FingerMovements/FingerMovements_TEST.ts  
  inflating: Multivariate_ts/FingerMovements/FingerMovements_TRAIN.ts  
   creating: Multivariate_ts/HandMovementDirection/
  inflating: Multivariate_ts/HandMovementDirection/HandMovementDirection_TEST.ts  
  inflating: Multivariate_ts/HandMovementDirection/HandMovementDirection_TRAIN.ts  
   creating: Multivariate_ts/Handwriting/
  inflating: Multivariate_ts/Handwriting/Handwriting_TEST.ts  
  inflating: Multivariate_ts/Handwriting/Handwriting_TRAIN.ts  
   creating: Multivariate_ts/Heartbeat/
  inflating: Multivariate_ts/Heartbeat/Heartbeat_TEST.ts  
  inflating: Multivariate_ts/Heartbeat/Heartbeat_TRAIN.ts  
   creating: Multivariate_ts/InsectWingbeat/
  inflating: Multivariate_ts/InsectWingbeat/InsectWingbeat_TEST.ts  
  inflating: Multivariate_ts/InsectWingbeat/InsectWingbeat_TRAIN.ts  
   creating: Multivariate_ts/JapaneseVowels/
  inflating: Multivariate_ts/JapaneseVowels/JapaneseVowels_TEST.ts  
  inflating: Multivariate_ts/JapaneseVowels/JapaneseVowels_TRAIN.ts  
   creating: Multivariate_ts/Libras/
  inflating: Multivariate_ts/Libras/Libras_TEST.ts  
  inflating: Multivariate_ts/Libras/Libras_TRAIN.ts  
   creating: Multivariate_ts/LSST/
  inflating: Multivariate_ts/LSST/LSST_TEST.ts  
  inflating: Multivariate_ts/LSST/LSST_TRAIN.ts  
   creating: Multivariate_ts/MotorImagery/
  inflating: Multivariate_ts/MotorImagery/MotorImagery_TEST.ts  
  inflating: Multivariate_ts/MotorImagery/MotorImagery_TRAIN.ts  
   creating: Multivariate_ts/NATOPS/
  inflating: Multivariate_ts/NATOPS/NATOPS_TEST.ts  
  inflating: Multivariate_ts/NATOPS/NATOPS_TRAIN.ts  
   creating: Multivariate_ts/PEMS-SF/
  inflating: Multivariate_ts/PEMS-SF/PEMS-SF_TEST.ts  
  inflating: Multivariate_ts/PEMS-SF/PEMS-SF_TRAIN.ts  
   creating: Multivariate_ts/PenDigits/
  inflating: Multivariate_ts/PenDigits/PenDigits_TEST.ts  
  inflating: Multivariate_ts/PenDigits/PenDigits_TRAIN.ts  
   creating: Multivariate_ts/PhonemeSpectra/
  inflating: Multivariate_ts/PhonemeSpectra/PhonemeSpectra_TEST.ts  
  inflating: Multivariate_ts/PhonemeSpectra/PhonemeSpectra_TRAIN.ts  
   creating: Multivariate_ts/RacketSports/
  inflating: Multivariate_ts/RacketSports/RacketSports_TEST.ts  
  inflating: Multivariate_ts/RacketSports/RacketSports_TRAIN.ts  
   creating: Multivariate_ts/SelfRegulationSCP1/
  inflating: Multivariate_ts/SelfRegulationSCP1/SelfRegulationSCP1_TEST.ts  
  inflating: Multivariate_ts/SelfRegulationSCP1/SelfRegulationSCP1_TRAIN.ts  
   creating: Multivariate_ts/SelfRegulationSCP2/
  inflating: Multivariate_ts/SelfRegulationSCP2/SelfRegulationSCP2_TEST.ts  
  inflating: Multivariate_ts/SelfRegulationSCP2/SelfRegulationSCP2_TRAIN.ts  
   creating: Multivariate_ts/SpokenArabicDigits/
  inflating: Multivariate_ts/SpokenArabicDigits/SpokenArabicDigits_TEST.ts  
  inflating: Multivariate_ts/SpokenArabicDigits/SpokenArabicDigits_TRAIN.ts  
   creating: Multivariate_ts/StandWalkJump/
  inflating: Multivariate_ts/StandWalkJump/StandWalkJump_TEST.ts  
  inflating: Multivariate_ts/StandWalkJump/StandWalkJump_TRAIN.ts  
   creating: Multivariate_ts/UWaveGestureLibrary/
  inflating: Multivariate_ts/UWaveGestureLibrary/UWaveGestureLibrary_TEST.ts  
  inflating: Multivariate_ts/UWaveGestureLibrary/UWaveGestureLibrary_TRAIN.ts  
In [16]:
!rm -rf Multivariate2018_ts.zip 

Move files to output¶

In [17]:
!rm -rf output/FaceDetection
!rm -rf output/InsectWingbeat
!rm -rf output/PenDigits
!rm -rf output/SpokenArabicDigits
!rm -rf output/CharacterTrajectories
!mkdir -p output/FaceDetection/ts/train/
!mkdir -p output/FaceDetection/ts/test/
!mkdir -p output/InsectWingbeat/ts/train/
!mkdir -p output/InsectWingbeat/ts/test/
!mkdir -p output/PenDigits/ts/train/
!mkdir -p output/PenDigits/ts/test/
!mkdir -p output/SpokenArabicDigits/ts/train/
!mkdir -p output/SpokenArabicDigits/ts/test/
!mkdir -p output/CharacterTrajectories/ts/train/
!mkdir -p output/CharacterTrajectories/ts/test/
In [18]:
!mv  ./Multivariate_ts/FaceDetection/FaceDetection_TRAIN.ts output/FaceDetection/ts/train/
!mv  ./Multivariate_ts/FaceDetection/FaceDetection_TEST.ts output/FaceDetection/ts/test/
!mv  ./Multivariate_ts/InsectWingbeat/InsectWingbeat_TRAIN.ts output/InsectWingbeat/ts/train/
!mv  ./Multivariate_ts/InsectWingbeat/InsectWingbeat_TEST.ts output/InsectWingbeat/ts/test/
!mv  ./Multivariate_ts/PenDigits/PenDigits_TRAIN.ts output/PenDigits/ts/train/
!mv  ./Multivariate_ts/PenDigits/PenDigits_TEST.ts output/PenDigits/ts/test/
!mv  ./Multivariate_ts/SpokenArabicDigits/SpokenArabicDigits_TRAIN.ts output/SpokenArabicDigits/ts/train/
!mv  ./Multivariate_ts/SpokenArabicDigits/SpokenArabicDigits_TEST.ts output/SpokenArabicDigits/ts/test/
!mv  ./Multivariate_ts/CharacterTrajectories/CharacterTrajectories_TRAIN.ts output/CharacterTrajectories/ts/train/
!mv  ./Multivariate_ts/CharacterTrajectories/CharacterTrajectories_TEST.ts output/CharacterTrajectories/ts/test/
In [19]:
!rm -rf Multivariate_ts
In [20]:
# |export
def get_mtsc_data_tabular_from_ts(
    path
):
    """Load a sktime .ts file and return it as a flat (tabular) DataFrame.

    The nested sktime panel is expanded to one row per (case_id, reading_id)
    with one column per feature, and the class label is merged back on so the
    result carries features and "class_vals" side by side.
    """
    print("Reading dataset TS file...")
    data = load_from_tsfile_to_dataframe(
        str(path),
        return_separate_X_and_y=False,
    )

    print("Converting _x to tabular format...")
    # Everything except the label column is a nested feature series.
    feature_cols = data.columns != "class_vals"
    data_x_tabular = from_nested_to_multi_index(
        data.loc[:, feature_cols], instance_index="case_id", time_index="reading_id"
    )
    data_x_tabular.reset_index(inplace=True)

    print("Converting _y to tabular format...")
    # One label per case; expose the case index as a joinable "case_id" column.
    data_y_tabular = data["class_vals"].to_frame().reset_index()
    data_y_tabular = data_y_tabular.rename(columns={"index": "case_id"})

    print("Merging _x and _y...")
    return data_x_tabular.merge(data_y_tabular, how="inner")

Tabular to Time-Series conversion routines

In [21]:
# | export
def cum_count(case_id_seq, cumcount):
    # GPU kernel body consumed by cudf's groupby().apply_grouped (see
    # add_cum_count): `case_id_seq` is the slice of the column belonging to one
    # group and `cumcount` the output slice for that same group. Each CUDA
    # thread writes positions threadIdx.x, threadIdx.x + blockDim.x, ... so
    # that cumcount[i] == i, i.e. the 0-based row position within the group.
    for i in range(cuda.threadIdx.x, len(case_id_seq), cuda.blockDim.x):
        cumcount[i] = i


def add_cum_count(gdf):
    # Returns `gdf` with an extra int32 "cumcount" column holding each row's
    # 0-based position within its case_id_seq group, computed on-GPU by the
    # cum_count kernel. NOTE(review): position follows the rows' current
    # order within each group — confirm input ordering upstream if a
    # time-ordered cumcount is required.
    return gdf.groupby(["case_id_seq"]).apply_grouped(
        cum_count, incols=["case_id_seq"], outcols=dict(cumcount=np.int32)
    )
In [22]:
# | export
def convert_from_tabular_to_timeseries_format(
    input_dir,
    output_dir,
    all_columns,
    mtsc_column_names,
    chunk_size_processing,
    number_of_features,
    seq_len,
    chunk_size_file,
):
    """Convert a tabular parquet dataset into fixed-length time-series parquet files.

    Reads `all_columns` from the parquet data under `input_dir` with dask_cudf,
    partitions the cases into chunks of `chunk_size_processing` by case_id_seq,
    and hands each chunk to convert_from_tabular_to_timeseries_format_chunk,
    which pivots it and writes the result under `output_dir` in files of at
    most `chunk_size_file` rows.
    """
    # read data in GPU
    data_gdf = dask_cudf.read_parquet(input_dir, columns=all_columns)

    # get min and max case_id_seq
    case_id_seq_min, case_id_seq_max = (
        data_gdf.case_id_seq.min().compute(),
        data_gdf.case_id_seq.max().compute(),
    )

    print("case_id_seq_min: ", case_id_seq_min, "case_id_seq_max: ", case_id_seq_max)

    # total chunks
    # NOTE(review): chunk count is derived from case_id_seq_max alone, which
    # assumes case_id_seq starts at (or near) 0 — confirm against the upstream
    # id assignment; case_id_seq_min is printed but otherwise unused.
    total_chunks = math.ceil(case_id_seq_max / chunk_size_processing)
    print("Total number of chunks to be processed: ", total_chunks)

    # process each chunk one by one
    for chunk_num in range(total_chunks):
        min_idx = chunk_num * chunk_size_processing
        max_idx = (chunk_num + 1) * chunk_size_processing
        if max_idx > case_id_seq_max:
            max_idx = case_id_seq_max
        # NOTE(review): the chunk worker filters inclusively on BOTH bounds
        # (>= min_idx and <= max_idx), so a case whose id lands exactly on a
        # chunk boundary is processed by two consecutive chunks — verify this
        # cannot duplicate rows in the written output.
        print(
            "Started processing chunk: ",
            chunk_num,
            " with case_id_seq from : ",
            min_idx,
            "to ",
            max_idx,
        )
        convert_from_tabular_to_timeseries_format_chunk(
            data_gdf,
            output_dir,
            chunk_num,
            min_idx,
            max_idx,
            number_of_features,
            seq_len,
            chunk_size_file,
            mtsc_column_names,
        )
        print(
            "Finished processing chunk: ",
            chunk_num,
            " with case_id_seq from : ",
            min_idx,
            "to ",
            max_idx,
        )
    # free GPU-backed frame explicitly; kernel memory persists across cells
    del data_gdf
In [23]:
# | export
def convert_from_tabular_to_timeseries_format_chunk(
    data_gdf,
    output_location,
    chunk_num,
    case_id_seq_min,
    case_id_seq_max,
    number_of_features,
    seq_len,
    chunk_size_file,
    mtsc_column_names,
):
    """Pivot one chunk of cases into fixed-length time-series rows and write parquet.

    Selects cases with case_id_seq in [case_id_seq_min, case_id_seq_max] from
    the lazy dask_cudf frame `data_gdf`, materialises them, pivots to one row
    per case via convert_tabular_to_fixed_length_time_series, casts to float64
    and writes chunk_<chunk_num>_part_*.parquet files under `output_location`.
    """
    # only keep cases with case_id_seq >= case_id_seq_min
    dataset_gdf = data_gdf[data_gdf.case_id_seq >= case_id_seq_min]

    # only keep cases with case_id_seq <= case_id_seq_max
    # NOTE(review): both bounds are inclusive, so the case at case_id_seq_max
    # is also selected by the next chunk (whose min equals this max) — confirm
    # this is intended and does not duplicate output rows.
    dataset_gdf = dataset_gdf[dataset_gdf.case_id_seq <= case_id_seq_max]

    # convert to time-series format (one row per case, seq_len readings per feature)
    flattened_gdf = convert_tabular_to_fixed_length_time_series(
        dataset_gdf.compute(), seq_len, mtsc_column_names
    )

    # clean-up
    del dataset_gdf

    # fix data-type (labels and ids included — everything becomes float64)
    flattened_gdf = flattened_gdf.astype("float64")

    # write to disk
    write_big_parquet_file_chunk_to_disk(
        flattened_gdf,
        chunk_num,
        output_location,
        number_of_features,
        seq_len,
        chunk_size_file,
    )

    # clean-up
    del flattened_gdf
In [24]:
# | export
def write_big_parquet_file_chunk_to_disk(
    gdf, chunk_num, output_location, number_of_features, seq_len, chunk_size_file
):
    """Write `gdf` to parquet files of at most `chunk_size_file` rows each.

    Files are named <output_location>/chunk_<chunk_num>_part_<i>.parquet.
    Skips (with a message) when the frame does not have the expected
    number_of_features * seq_len + 2 columns (features + class_vals + case_id).
    """
    if gdf.shape[1] != number_of_features * seq_len + 2:
        print("Error: Skipping!!!")
        return

    total_rows = gdf.shape[0]
    file_count = math.ceil(total_rows / chunk_size_file)
    print("Total number of files to be created: ", file_count)
    # Fix: the original kept a separate `cnt` counter that merely mirrored the
    # loop index — derive low/high directly from the index and clamp with min().
    for part in range(file_count):
        low = part * chunk_size_file
        high = min((part + 1) * chunk_size_file, total_rows)
        output_file = (
            output_location + "/chunk_" + str(chunk_num) + "_part_" + str(part) + ".parquet"
        )
        print(
            "Writing to output file: ",
            output_file,
            "with records from iloc: ",
            low,
            "to ",
            high,
        )
        # Move the slice to host memory before writing; keep small row groups
        # so downstream readers can stream the file.
        gdf.iloc[low:high].to_pandas().to_parquet(
            output_file, engine="pyarrow", row_group_size=5000
        )
In [25]:
# | export
def add_missing_columns_for_uniform_length(gdf, mtsc_column_names, seq_len):
    """Reindex `gdf` to the full fixed-length column layout.

    Produces "<feature>_<t>" columns for every feature in `mtsc_column_names`
    and every t in [0, seq_len), followed by "class_vals" and "case_id".
    Columns absent from `gdf` are created with missing values; columns are
    returned in this canonical order.
    """
    ordered_columns = [
        col + "_" + str(step)
        for col in mtsc_column_names
        for step in range(0, seq_len)
    ] + ["class_vals", "case_id"]
    return gdf.reindex(ordered_columns, axis=1)
In [26]:
# | export
def convert_tabular_to_fixed_length_time_series(gdf, seq_len, mtsc_column_names):
    """Pivot a tabular cudf frame into one fixed-length row per case.

    Input `gdf` has one row per (case_id_seq, reading_id) with one column per
    feature plus "class_vals" and "case_id". Output has one row per case with
    columns "<feature>_<t>" for t in [0, seq_len), plus "class_vals" and
    "case_id"; short series are zero-padded and long series truncated.
    """
    # Step 1: Add cumcount (0-based row position within each case_id_seq group)
    processed_cumcount_gdf = add_cum_count(gdf)

    print(
        "Before CumCount Min: ",
        processed_cumcount_gdf["cumcount"].min(),
        "CumCount Max: ",
        processed_cumcount_gdf["cumcount"].max(),
    )

    # Truncate every case to at most seq_len readings.
    processed_cumcount_gdf = processed_cumcount_gdf[
        processed_cumcount_gdf.cumcount < seq_len
    ]

    print(
        "After CumCount Min: ",
        processed_cumcount_gdf["cumcount"].min(),
        "CumCount Max: ",
        processed_cumcount_gdf["cumcount"].max(),
    )

    # NOTE(review): cumcount is computed from the rows' pre-sort order but the
    # truncation above keeps the first seq_len by that order — confirm the
    # input is already time-ordered within each case, otherwise truncation may
    # drop non-tail readings.
    processed_cumcount_gdf = processed_cumcount_gdf.sort_values(
        ["case_id_seq", "reading_id"], ascending=(True, True)
    )

    print("sorted")

    # Step 2: Get labels (class_vals/case_id are constant per case, so max()
    # is just a way to pick the single value per group)
    target = processed_cumcount_gdf.groupby("case_id_seq")["class_vals"].max()
    case_id = processed_cumcount_gdf.groupby("case_id_seq")["case_id"].max()

    # Step 3: Convert to fixed length time-series (wide format: one column per
    # (feature, reading_id) pair)
    flattened_gdf = processed_cumcount_gdf.pivot(
        index="case_id_seq", columns="reading_id", values=mtsc_column_names
    ).assign(class_vals=target, case_id=case_id)

    # Step 4: Fix column names — flatten the (feature, reading_id) MultiIndex
    # into "<feature>_<reading_id>" strings
    columns_list_tuple = flattened_gdf.columns.to_list()
    columns_list_tuple.remove("case_id")
    columns_list_tuple.remove("class_vals")

    multi_index_columns = cudf.MultiIndex.from_tuples(columns_list_tuple)
    expanded_columns = (
        multi_index_columns.get_level_values(0).astype(str)
        + "_"
        + multi_index_columns.get_level_values(1).astype(str)
    )
    expanded_columns = expanded_columns.append("class_vals")
    expanded_columns = expanded_columns.append("case_id")
    flattened_gdf.columns = expanded_columns.to_pandas()

    # Step 5: Add missing columns with N/A values to maintain uniform length
    flattened_gdf = add_missing_columns_for_uniform_length(
        flattened_gdf, mtsc_column_names, seq_len
    )

    # Step 6: fillna — zero-pad the readings added in Step 5
    flattened_gdf.fillna(0, inplace=True)

    # Step 7: reset index (case_id_seq is dropped; case_id survives as a column)
    flattened_gdf.reset_index(drop=True, inplace=True)

    # NOTE(review): unseeded shuffle — output row order is not reproducible
    # across runs; pass random_state if deterministic output is required.
    flattened_gdf = flattened_gdf.sample(frac=1).reset_index(drop=True)

    print("flattened_gdf.shape: ", flattened_gdf.shape)

    return flattened_gdf
In [27]:
from nbdev import nbdev_export
In [28]:
nbdev_export()
In [ ]: