In [1]:
#| default_exp model.optimization.nn.tsc.vittsc.spoken_arabic_digits_training_mask_tune
%load_ext autoreload
%autoreload 2
In [2]:
# declare a list of tasks whose products you want to use as inputs
upstream = ['tabular_to_timeseries_spoken_arabic_digits']
In [3]:
# Parameters
upstream = {"tabular_to_timeseries_spoken_arabic_digits": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/304_feature_preprocessing.spoken_arabic_digits.tabular_to_timeseries.html", "SpokenArabicDigits_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/train", "SpokenArabicDigits_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/valid", "SpokenArabicDigits_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/test"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/404_model.optimization.nn.tsc.vittsc.spoken_arabic_digits_training_mask_tune.html", "SpokenArabicDigits_MODEL_TUNE_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ray_results", "SpokenArabicDigits_MODEL_TRAINING_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result", "SpokenArabicDigits_MODEL_TRAINING_CHECKPOINT_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/checkpoint", "SpokenArabicDigits_BEST_MODEL": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model.ckpt", "SpokenArabicDigits_BEST_MODEL_CONFIG": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model_config.json"}
In [4]:
#| export
upstream = {
    "tabular_to_timeseries_spoken_arabic_digits": {
        "nb": "/home/ubuntu/vitmtsc_nbdev/output/304_feature_preprocessing.spoken_arabic_digits.tabular_to_timeseries.html",
        "SpokenArabicDigits_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/train",
        "SpokenArabicDigits_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/valid",
        "SpokenArabicDigits_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/test",
    }
}
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/404_model.optimization.nn.tsc.vittsc.spoken_arabic_digits_training_mask_tune.html",
    "SpokenArabicDigits_MODEL_TUNE_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ray_results",
    "SpokenArabicDigits_MODEL_TRAINING_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result",
    "SpokenArabicDigits_MODEL_TRAINING_CHECKPOINT_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/checkpoint",
    "SpokenArabicDigits_BEST_MODEL": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model.ckpt",
    "SpokenArabicDigits_BEST_MODEL_CONFIG": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model_config.json",
}
In [5]:
#| hide
from nbdev.showdoc import *
In [6]:
#| export
import sys
import pathlib as p

def is_running_from_ipython():
    from IPython import get_ipython
    return get_ipython() is not None

if not is_running_from_ipython() and __package__ is None:
    DIR = p.Path(__file__).resolve().parent
    sys.path.insert(0, str(DIR.parent))
    __package__ = DIR.name
In [7]:
#| export
import torch
import pytorch_lightning
import pandas as pd
import numpy as np
import os
import math 

from torch.nn import functional as F
from torch import nn
from torchmetrics import functional as FM
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import LearningRateMonitor
from petastorm import make_batch_reader
from petastorm.pytorch import DataLoader
from einops import rearrange, repeat
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
import pytorch_lightning as pl

Vision Transformer for Multivariate Time-Series Classification (VitMTSC) Model Training with Masking - Hyperparameter Search

Classification Task

Data Loader Module

Hyperparameter Search

In [8]:
#| export
DATASET_NAME = 'SpokenArabicDigits'
NUM_TARGET = 10
SEQUENCE_LENGTH = 93
NUMBER_OF_FEATURES = 13
NUM_WORKERS = 1
NUM_GPUS = 1
MAX_EPOCHS = 50
TUNE_EPOCHS = 5
NUM_SAMPLES = 1000
In [9]:
#| export
import dask_cudf
import numpy as np
import sklearn.utils.class_weight

def get_train_dataset_size():
    gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_TRAIN_MODEL_INPUT'], columns = ['case_id'])
    return gdf.case_id.nunique().compute()

def get_valid_dataset_size():
    gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_VALID_MODEL_INPUT'], columns = ['case_id'])
    return gdf.case_id.nunique().compute()

def get_test_dataset_size():
    gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_TEST_MODEL_INPUT'], columns = ['case_id'])
    return gdf.case_id.nunique().compute()

def get_class_weight():
    train_gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_TRAIN_MODEL_INPUT'], columns = ['case_id', 'class_vals'])
    y_train = train_gdf['class_vals'].compute().to_numpy()
    class_weight = sklearn.utils.class_weight.compute_class_weight('balanced', classes = np.unique(y_train), y = y_train)
    class_weight = class_weight/2 # halve the balanced weights; relative class ratios are unchanged
    print(f'class_weight: {class_weight}')
    return class_weight
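
For reference, scikit-learn's 'balanced' weighting is n_samples / (n_classes * bincount(y)); a tiny illustration with a hypothetical label vector (the helper above then halves these weights):

    # Illustrative only: 'balanced' class weights on a toy label vector.
    y_toy = np.array([0, 0, 0, 1])   # class 0 is three times as frequent as class 1
    w_toy = sklearn.utils.class_weight.compute_class_weight('balanced', classes=np.unique(y_toy), y=y_toy)
    print(w_toy)                     # [0.6667 2.0] = 4 / (2 * [3, 1])
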
In [10]:
get_train_dataset_size(), get_valid_dataset_size(), get_test_dataset_size(), get_class_weight()
class_weight: [0.51958661 0.48699262 0.51452242 0.48342491 0.491527   0.50372137
 0.49061338 0.50662188 0.51552734 0.49061338]
Out[10]:
(5279,
 1320,
 2199,
 array([0.51958661, 0.48699262, 0.51452242, 0.48342491, 0.491527  ,
        0.50372137, 0.49061338, 0.50662188, 0.51552734, 0.49061338]))

1. Model Definition

In [11]:
#| export
class Residual(nn.Module):
    def __init__(self, fn):
        super().__init__()
        
        self.fn = fn
        
    def forward(self, x, **kwargs):
        return self.fn(x, **kwargs) + x
In [12]:
#| export
class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        
        self.norm = nn.LayerNorm(dim)
        self.fn = fn
        
    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)
In [13]:
#| export
class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
        
    def forward(self, x):
        return self.net(x)
In [14]:
#| export
class Attention(nn.Module):
    def __init__(self, dim, heads = 10, dim_head = 32, dropout = 0.):
        super().__init__()
        
        inner_dim = dim_head *  heads
        self.heads = heads
        self.scale = dim_head ** -0.5
        
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )
        
        self.attn_gradients = None
        self.attention_map = None
        
    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def forward(self, x, mask = None, register_hook = False):
        b, n, _, h = *x.shape, self.heads
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)
    
        dots = torch.einsum('bhid,bhjd->bhij', q, k) * self.scale
        mask_value = -torch.finfo(dots.dtype).max
        
        #print('mask1.shape', mask.shape)
        if mask is not None:
            #mask = F.pad(mask, (1, 0), value = True)
            mask = F.pad(mask.flatten(1), (1, 0), value = True)
            mask = mask.unsqueeze(1).unsqueeze(2)
            
            #print('mask2.shape', mask.shape)
           # print('mask:', mask)
            assert mask.shape[-1] == dots.shape[-1], 'mask has incorrect dimensions'
            dots.masked_fill_(mask == 0.0, mask_value)
            del mask
        
        attn = dots.softmax(dim=-1)
        #print('attn.shape: ', attn.shape)
        #print('attn: ', attn)
        
        out = torch.einsum('bhij,bhjd->bhid', attn, v)
        
        if register_hook:
            self.save_attention_map(attn)
            attn.register_hook(self.save_attn_gradients)
            
        out = rearrange(out, 'b h n d -> b n (h d)')
        out =  self.to_out(out)
        #print('out.shape: ', out.shape)
        #print('out: ', out)
        return out
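
To make the mask handling above concrete, here is a small, illustrative shape walk-through. It assumes the mask arrives as a (batch, seq_len) float tensor with 0.0 marking padded timesteps, which is what the data module below produces:

    # Illustrative: the key-padding mask is prepended with True for the CLS token
    # and broadcast over heads and query positions.
    b = 2
    toy_mask = torch.ones(b, SEQUENCE_LENGTH)
    toy_mask[1, 50:] = 0.0                                     # pretend sample 1 is padded after t=50
    padded = F.pad(toy_mask.flatten(1), (1, 0), value=True)    # (b, seq_len + 1)
    padded = padded.unsqueeze(1).unsqueeze(2)                  # (b, 1, 1, seq_len + 1), matches dots' last dim
    print(padded.shape)                                        # torch.Size([2, 1, 1, 94])
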
In [15]:
#| export
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout):
        super().__init__()
        
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                Residual(PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout))),
                Residual(PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout)))
            ]))
            
    def forward(self, x, mask = None, register_hook = False):
        for attn, ff in self.layers:
            x = attn(x, mask = mask, register_hook = register_hook)
            x = ff(x)
        return x
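
As a quick, illustrative check, the Transformer stack preserves the (batch, tokens, dim) shape:

    # Illustrative only: a tiny stack maps (2, 94, 32) -> (2, 94, 32).
    toy_tf = Transformer(dim=32, depth=2, heads=4, dim_head=8, mlp_dim=16, dropout=0.0)
    print(toy_tf(torch.randn(2, 94, 32)).shape)   # torch.Size([2, 94, 32])
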
In [16]:
#| export
def petastorm_collate_fn(rows):
    # Each row is a flattened example: NUMBER_OF_FEATURES * SEQUENCE_LENGTH feature columns
    # (feature-major: each feature's full time series is contiguous), followed by class_vals and case_id.
    data_df = pd.DataFrame(rows)
    #print(f'data_df.shape: {data_df.shape}') # (batch_size, NUMBER_OF_FEATURES * SEQUENCE_LENGTH + 2) = (batch_size, 13 * 93 + 2)
    
    # last column: case_id
    case_id_df = data_df.iloc[:, NUMBER_OF_FEATURES*SEQUENCE_LENGTH+1:NUMBER_OF_FEATURES*SEQUENCE_LENGTH+2]
    case_id_tensor = torch.tensor(case_id_df.values.astype(np.float64))
    
    # second-to-last column: class label
    target_df = data_df.iloc[:, NUMBER_OF_FEATURES*SEQUENCE_LENGTH+0:NUMBER_OF_FEATURES*SEQUENCE_LENGTH+1]
    target_tensor = torch.tensor(target_df.values.astype(np.float32))
    
    # feature columns, reshaped to (batch, seq_len, n_features)
    data_tensor_df = data_df.iloc[:, 0*SEQUENCE_LENGTH:NUMBER_OF_FEATURES*SEQUENCE_LENGTH]
    data_tensor = torch.tensor(data_tensor_df.values.astype(np.float32))
    data_tensor = rearrange(data_tensor, 't (b h)-> t h b', h = SEQUENCE_LENGTH)
    
    # the first feature's SEQUENCE_LENGTH columns are reused as the padding mask (0.0 is treated as padding in Attention)
    mask_df = data_df.iloc[:, 0*SEQUENCE_LENGTH:1*SEQUENCE_LENGTH]
    mask_tensor = torch.tensor(mask_df.values.astype(np.float32))
    
    return data_tensor, target_tensor.squeeze(), case_id_tensor.squeeze(), mask_tensor.squeeze()

class VitMTSCPetastormDataModule(pl.LightningDataModule):
    def __init__(self, config,
                 #data_dir=f"file:///home/ubuntu/vitmtsc_nbdev/Multivariate_parquet/{DATASET_NAME}/target_encoding-nn/",
                 num_workers=NUM_WORKERS,
                 transform_spec = None,
                 shard_count = NUM_GPUS, 
                 num_epochs = MAX_EPOCHS):
        super().__init__()
        #self.data_dir = data_dir
        #self.train_files = os.path.join(data_dir, 'train')
        #self.valid_files = os.path.join(data_dir, 'valid')
        #self.test_files = os.path.join(data_dir, 'test')
        self.train_files = f"file://{upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_TRAIN_MODEL_INPUT']}"
        self.valid_files = f"file://{upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_VALID_MODEL_INPUT']}"
        self.test_files = f"file://{upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_TEST_MODEL_INPUT']}"
        self.batch_size =  config["batch_size"]
        self.num_workers = num_workers
        self.transform_spec = transform_spec
        self.shard_count = shard_count
        self.num_epochs = num_epochs     
    
    def train_dataloader(self):
        
        self.train_ds = make_batch_reader(self.train_files, workers_count=self.num_workers, transform_spec=self.transform_spec, 
                                          cur_shard = int(os.environ['LOCAL_RANK']), shard_count = self.shard_count, num_epochs = self.num_epochs)
        return DataLoader(self.train_ds, batch_size = self.batch_size, collate_fn= petastorm_collate_fn)

    def val_dataloader(self):
        print('val_dataloader: local rank :', int(os.environ['LOCAL_RANK']), 'shard count: ', self.shard_count)
        self.val_ds = make_batch_reader(self.valid_files, workers_count=self.num_workers, transform_spec=self.transform_spec, 
                                        cur_shard = int(os.environ['LOCAL_RANK']), shard_count = self.shard_count, num_epochs = self.num_epochs)
        return DataLoader(self.val_ds, batch_size = self.batch_size, collate_fn= petastorm_collate_fn)

    def test_dataloader(self):
        print('test_dataloader: local rank :', int(os.environ['LOCAL_RANK']), 'shard count: ', self.shard_count)
        self.test_ds = make_batch_reader(self.test_files, workers_count=self.num_workers, transform_spec=self.transform_spec, 
                                         cur_shard = int(os.environ['LOCAL_RANK']), shard_count = self.shard_count, num_epochs = self.num_epochs)
        return DataLoader(self.test_ds, batch_size = self.batch_size, collate_fn= petastorm_collate_fn) 
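
A quick, self-contained shape check of the collate function (illustrative; random values stand in for real petastorm rows, which pd.DataFrame accepts the same way):

    # Illustrative: verify the shapes produced by petastorm_collate_fn.
    fake_rows = np.random.rand(4, NUMBER_OF_FEATURES * SEQUENCE_LENGTH + 2).astype(np.float32)
    x, y, case_id, mask = petastorm_collate_fn(fake_rows)
    print(x.shape, y.shape, case_id.shape, mask.shape)
    # torch.Size([4, 93, 13]) torch.Size([4]) torch.Size([4]) torch.Size([4, 93])
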
In [17]:
#| export
class VitTimeSeriesTransformer(pl.LightningModule):
    def __init__(self, config, c_in = NUMBER_OF_FEATURES, c_out = NUM_TARGET, 
                 seq_len = SEQUENCE_LENGTH,class_weight = torch.FloatTensor(get_class_weight())):
        super(VitTimeSeriesTransformer, self).__init__()
        
        self.d_model = config["d_model"]
        self.depth = config["depth"]
        self.heads = config["heads"]
        self.mlp_dim = config["mlp_dim"]
        self.dim_head = config["dim_head"]
        self.dropout_p = config["dropout"]
        self.emb_dropout_p = config["emb_dropout"]
        self.lr = config["lr"]
        self.weight_decay = config["weight_decay"]
        self.patience = config["patience"]
        
        self.pos_embedding = nn.Parameter(torch.randn(1, seq_len + 1, self.d_model))
        self.patch_to_embedding = nn.Linear(c_in, self.d_model)
        self.cls_token = nn.Parameter(torch.randn(1, 1, self.d_model))
        self.dropout = nn.Dropout(self.emb_dropout_p)
        self.transformer = Transformer(self.d_model, self.depth, self.heads, self.dim_head, self.mlp_dim, self.dropout_p)
        self.to_cls_token = nn.Identity()
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(self.d_model),
            nn.Linear(self.d_model, self.mlp_dim),
            nn.GELU(),
            nn.Dropout(self.dropout_p),
            nn.Linear(self.mlp_dim, c_out)
        )
    
        self.c_out = c_out
        self.register_buffer('class_weight', class_weight)

    def forward(self, x, mask = None, register_hook = False):
        #x = rearrange(x, 'b v s-> b s v') # bs x nvars x seq_len ->  bs x seq_len x nvars
        x = self.patch_to_embedding(x) # bs x seq_len x nvars -> bs x seq_len x d_model
        b, n, _ = x.shape # bs, seq_len

        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b) # bs x 1 x d_model
        x = torch.cat((cls_tokens, x), dim=1) # bs x (seq_len + 1) x d_model
        x += self.pos_embedding[:, :(n + 1)] # +=  1 x (seq_len + 1) x d_model -> # bs x (seq_len + 1) x d_model
        x = self.dropout(x) # bs x (seq_len + 1) x d_model

        x = self.transformer(x, mask = mask, register_hook = register_hook) # bs x (seq_len + 1) x d_model

        x = self.to_cls_token(x[:, 0]) # bs x d_model
        return self.mlp_head(x) # bs x num_classes

    def configure_optimizers(self):
        #optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=self.step_size, gamma=self.gamma)
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=self.patience)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "train_loss"}
        
    def training_step(self, batch, batch_idx):
        x, y, _, mask = batch
        y_hat = self(x, mask)
        y = y.long()
        train_loss = F.cross_entropy(y_hat, y, weight = self.class_weight)
        train_auc =  FM.accuracy(F.softmax(y_hat, dim=1), y, num_classes = self.c_out)
        train_auroc =  FM.auroc(F.softmax(y_hat, dim=1), y, num_classes = self.c_out)
        self.log('train_loss', train_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('train_auc', train_auc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return train_loss

    def validation_step(self, batch, batch_idx):
        x, y, _, mask = batch
        y_hat = self(x, mask)
        y = y.long()
        val_loss = F.cross_entropy(y_hat, y, weight = self.class_weight)
        val_auc =  FM.accuracy(F.softmax(y_hat, dim=1), y, num_classes = self.c_out)
        val_auroc =  FM.auroc(F.softmax(y_hat, dim=1), y, num_classes = self.c_out)
        self.log('val_loss', val_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
        self.log('val_auc', val_auc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return val_loss

    def test_step(self, batch, batch_idx):
        x, y, _, mask = batch
        y_hat = self(x, mask)
        y = y.long()
        test_loss = F.cross_entropy(y_hat, y, weight = self.class_weight)
        test_auc =  FM.accuracy(F.softmax(y_hat, dim=1), y, num_classes = self.c_out)
        test_auroc =  FM.auroc(F.softmax(y_hat, dim=1), y, num_classes = self.c_out)
        self.log('test_loss', test_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
        self.log('test_auc', test_auc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return test_loss
class_weight: [0.51958661 0.48699262 0.51452242 0.48342491 0.491527   0.50372137
 0.49061338 0.50662188 0.51552734 0.49061338]
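
As an illustrative smoke test (not part of the exported pipeline), the forward pass can be exercised with random data and a hypothetical config; the real training below uses the tuned config instead:

    # Illustrative only: hypothetical config values, random inputs.
    toy_config = {"d_model": 32, "depth": 2, "heads": 4, "mlp_dim": 16, "dim_head": 8,
                  "dropout": 0.0, "emb_dropout": 0.0, "lr": 1e-3, "weight_decay": 1e-4, "patience": 1}
    toy_model = VitTimeSeriesTransformer(toy_config)
    toy_x = torch.randn(2, SEQUENCE_LENGTH, NUMBER_OF_FEATURES)   # batch x seq_len x n_features
    toy_mask = torch.ones(2, SEQUENCE_LENGTH)                     # 1.0 = observed timestep
    print(toy_model(toy_x, toy_mask).shape)                       # torch.Size([2, 10])
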

2. Routine for Single/Multi-GPU DDP Training

In [18]:
#| export
def get_model(config):
    model = VitTimeSeriesTransformer(config)
    return model

def get_datamodule(config):
    return VitMTSCPetastormDataModule(config)
In [19]:
%env LOCAL_RANK=0
env: LOCAL_RANK=0
In [20]:
#| export
import json
import ray
from ray.tune import ExperimentAnalysis
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.pytorch_lightning import TuneReportCallback

def tune_training(config, num_epochs = TUNE_EPOCHS, num_gpus = NUM_GPUS):
    pl.seed_everything(42, workers=True)
    model = get_model(config)
    dm = get_datamodule(config)
    metrics = {"loss": "val_loss", "auc": "val_auc"}
    callbacks = [TuneReportCallback(metrics, on="validation_end")]
    
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        #gpus= math.ceil(num_gpus),
        accelerator='gpu', devices=math.ceil(num_gpus),
        strategy= "dp",
        callbacks=callbacks,
        limit_train_batches= math.ceil(get_train_dataset_size()/config['batch_size']), 
        limit_val_batches= math.ceil(get_valid_dataset_size()/config['batch_size']), 
        val_check_interval= math.ceil(get_train_dataset_size()/config['batch_size']), 
        num_sanity_val_steps=0,
        reload_dataloaders_every_n_epochs=1,
        deterministic=True
    )
    
    trainer.fit(model, dm)
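
3. Hyperparameter Search
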
In [21]:
#| export
def tune_training_asha(num_samples=NUM_SAMPLES, num_epochs=TUNE_EPOCHS, num_gpus = NUM_GPUS, gpus_per_trial=0.5):
    config = {
        "d_model": tune.choice([16, 32, 48, 64]),
        "depth": tune.choice([2, 4, 6, 8]),
        "heads": tune.choice([2, 4, 6, 8]),
        "mlp_dim": tune.choice([8, 10, 12, 14, 16, 20, 24, 32]),
        "dim_head": tune.choice([8, 10, 12, 14, 16]),
        "dropout": tune.loguniform(1e-6, 1e-3),
        "emb_dropout": tune.loguniform(1e-6, 1e-3),
        "weight_decay": tune.loguniform(1e-5, 1e-1),
        "lr": tune.loguniform(1e-6, 1e-3),
        "patience": tune.choice([1, 2]),
        "batch_size": tune.choice([64, 128, 256, 512, 1024])
    }
    
    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["d_model", "depth", "heads", "mlp_dim", "dim_head", "dropout", "emb_dropout", "weight_decay", "lr", "patience", "batch_size"],
        metric_columns=["loss", "auc", "training_iteration"])

    trainable = tune.with_parameters(
            tune_training,
            num_epochs=num_epochs,
            num_gpus=num_gpus)
    analysis = tune.run(
        trainable,
        resources_per_trial={
           "cpu": 1,
            "gpu": gpus_per_trial
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        verbose = 1,
        name="SpokenArabicDigits",
        raise_on_failed_trial = False)

    print("Best hyperparameters found were: ", analysis.best_config)
In [22]:
%env LOCAL_RANK=0
env: LOCAL_RANK=0
In [23]:
!rm -rf ~/ray_results/SpokenArabicDigits/ 
!rm -rf ./output/SpokenArabicDigits/ray_results/
!rm -rf ./output/SpokenArabicDigits/experiments_result
!mkdir -p  output/SpokenArabicDigits/experiments_result
In [25]:
!cp -rf ~/ray_results/SpokenArabicDigits/ output/SpokenArabicDigits/ray_results/
In [26]:
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)

#analysis = ExperimentAnalysis(f'~/ray_results/{DATASET_NAME}')
analysis = ExperimentAnalysis(product['SpokenArabicDigits_MODEL_TUNE_OUTPUT'])
tune_result_df = analysis.results_df[['loss', 'auc', 'training_iteration', 'experiment_tag']]
tune_result_df.nsmallest(5, 'loss')
2022-09-23 21:07:38,528	INFO experiment_analysis.py:757 -- No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/ray/tune/analysis/experiment_analysis.py:303: UserWarning: Dataframes will use '/' instead of '.' to delimit nested result keys in future versions of Ray. For forward compatibility, set the environment variable TUNE_RESULT_DELIM='/'
  warnings.warn(
Out[26]:
loss auc training_iteration experiment_tag
trial_id
72401_00681 0.037081 0.993304 5 681_batch_size=64,d_model=64,depth=8,dim_head=12,dropout=0.0000,emb_dropout=0.0000,heads=4,lr=0.0009,mlp_dim=14,patience=2,weight_decay=0.0001
72401_00081 0.041933 0.987351 5 81_batch_size=64,d_model=64,depth=8,dim_head=16,dropout=0.0001,emb_dropout=0.0000,heads=6,lr=0.0006,mlp_dim=32,patience=2,weight_decay=0.0002
72401_00633 0.052483 0.988095 5 633_batch_size=64,d_model=64,depth=8,dim_head=16,dropout=0.0004,emb_dropout=0.0000,heads=4,lr=0.0006,mlp_dim=14,patience=2,weight_decay=0.0001
72401_00420 0.054474 0.983631 5 420_batch_size=64,d_model=48,depth=4,dim_head=14,dropout=0.0000,emb_dropout=0.0000,heads=6,lr=0.0010,mlp_dim=14,patience=2,weight_decay=0.0011
72401_00585 0.073108 0.984375 5 585_batch_size=64,d_model=48,depth=6,dim_head=10,dropout=0.0000,emb_dropout=0.0002,heads=4,lr=0.0006,mlp_dim=24,patience=1,weight_decay=0.0370
In [27]:
best_config = analysis.get_best_config('loss', 'min')
print(best_config)
{'d_model': 64, 'depth': 8, 'heads': 4, 'mlp_dim': 14, 'dim_head': 12, 'dropout': 1.3662397928816542e-06, 'emb_dropout': 3.0197891179318393e-06, 'weight_decay': 7.817488014752812e-05, 'lr': 0.0009217628852992138, 'patience': 2, 'batch_size': 64}
In [28]:
#| export
import json
def write_best_model_config():
    analysis = ExperimentAnalysis(product['SpokenArabicDigits_MODEL_TUNE_OUTPUT'])
    best_config = analysis.get_best_config('loss', 'min')
    with open(product['SpokenArabicDigits_BEST_MODEL_CONFIG'], 'w') as outfile:
        # Serializing json
        json_object = json.dumps(best_config, indent=4)
        outfile.write(json_object)
In [29]:
#| export
write_best_model_config()
2022-09-23 21:07:55,682	INFO experiment_analysis.py:757 -- No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.
In [30]:
#| export
def get_best_model_config():
    with open(product['SpokenArabicDigits_BEST_MODEL_CONFIG'], 'r') as json_file:
        return json.load(json_file)
        
best_config = get_best_model_config()
In [31]:
#| export
def training_loop(TB_LOG_DIR, max_epochs = MAX_EPOCHS, config = best_config):
    pl.seed_everything(42, workers=True)
    model = get_model(config)
    dm = get_datamodule(config)

    checkpoint_callback = ModelCheckpoint(dirpath=product['SpokenArabicDigits_MODEL_TRAINING_CHECKPOINT_OUTPUT'], 
                                          save_top_k = 1, #-1, 
                                          filename=f"{DATASET_NAME}" + '-vittsc-mask-{epoch:02d}')
    tb_logger = pl_loggers.TensorBoardLogger(TB_LOG_DIR)
    
    lr_monitor = LearningRateMonitor(logging_interval='step')

    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.001, patience=3, verbose=False, mode="min")
    
    trainer = pl.Trainer(
        #gpus=1,
        accelerator='gpu', devices=1,
        #track_grad_norm=2,
        #plugins='deepspeed', 
        #stochastic_weight_avg=True,
        #precision=16,
        max_epochs=max_epochs,
        strategy= 'dp',   #'ddp',
        logger=tb_logger,
        callbacks=[lr_monitor, checkpoint_callback, early_stop_callback],
        limit_train_batches= math.ceil(get_train_dataset_size()/config['batch_size']), 
        limit_val_batches= math.ceil(get_valid_dataset_size()/config['batch_size']), 
        val_check_interval= math.ceil(get_train_dataset_size()/config['batch_size']),  
        num_sanity_val_steps=0,
        reload_dataloaders_every_n_epochs=1,
        deterministic=True
    )
    
    trainer.fit(model, dm)
In [32]:
%env LOCAL_RANK=0
env: LOCAL_RANK=0

4. Model Training

In [33]:
#| export
if __name__ == "__main__":
    training_loop(TB_LOG_DIR = product['SpokenArabicDigits_MODEL_TRAINING_OUTPUT'],
                  max_epochs = MAX_EPOCHS, 
                  config = get_best_model_config())
Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name               | Type        | Params
---------------------------------------------------
0 | patch_to_embedding | Linear      | 896   
1 | dropout            | Dropout     | 0     
2 | transformer        | Transformer | 115 K 
3 | to_cls_token       | Identity    | 0     
4 | mlp_head           | Sequential  | 1.2 K 
---------------------------------------------------
123 K     Trainable params
0         Non-trainable params
123 K     Total params
0.496     Total estimated model params size (MB)
val_dataloader: local rank : 0 shard count:  1
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/fs_utils.py:88: FutureWarning: pyarrow.localfs is deprecated as of 2.0.0, please use pyarrow.fs.LocalFileSystem instead.
  self._filesystem = pyarrow.localfs
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:402: FutureWarning: Specifying the 'metadata_nthreads' argument is deprecated as of pyarrow 8.0.0, and the argument will be removed in a future version
  dataset = pq.ParquetDataset(path_or_paths, filesystem=fs, validate_schema=False, metadata_nthreads=10)
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:362: FutureWarning: 'ParquetDataset.common_metadata' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version.
  if not dataset.common_metadata:
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/reader.py:405: FutureWarning: Specifying the 'metadata_nthreads' argument is deprecated as of pyarrow 8.0.0, and the argument will be removed in a future version
  self.dataset = pq.ParquetDataset(dataset_path, filesystem=pyarrow_filesystem,
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/unischema.py:317: FutureWarning: 'ParquetDataset.pieces' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.fragments' attribute instead.
  meta = parquet_dataset.pieces[0].get_metadata()
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/unischema.py:321: FutureWarning: 'ParquetDataset.partitions' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.partitioning' attribute instead.
  for partition in (parquet_dataset.partitions or []):
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:253: FutureWarning: 'ParquetDataset.metadata' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version.
  metadata = dataset.metadata
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:254: FutureWarning: 'ParquetDataset.common_metadata' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version.
  common_metadata = dataset.common_metadata
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:350: FutureWarning: 'ParquetDataset.pieces' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.fragments' attribute instead.
  futures_list = [thread_pool.submit(_split_piece, piece, dataset.fs.open) for piece in dataset.pieces]
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:350: FutureWarning: 'ParquetDataset.fs' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.filesystem' attribute instead.
  futures_list = [thread_pool.submit(_split_piece, piece, dataset.fs.open) for piece in dataset.pieces]
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:334: FutureWarning: ParquetDatasetPiece is deprecated as of pyarrow 5.0.0 and will be removed in a future version.
  return [pq.ParquetDatasetPiece(piece.path, open_file_func=fs_open,
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/arrow_reader_worker.py:138: FutureWarning: 'ParquetDataset.fs' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.filesystem' attribute instead.
  parquet_file = ParquetFile(self._dataset.fs.open(piece.path))
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/arrow_reader_worker.py:286: FutureWarning: 'ParquetDataset.partitions' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.partitioning' attribute instead.
  partition_names = self._dataset.partitions.partition_names if self._dataset.partitions else set()
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/arrow_reader_worker.py:289: FutureWarning: 'ParquetDataset.partitions' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.partitioning' attribute instead.
  table = piece.read(columns=column_names - partition_names, partitions=self._dataset.partitions)
Training: 0it [00:00, ?it/s]
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/torchmetrics/utilities/prints.py:36: UserWarning: No positive samples in targets, true positive value should be meaningless. Returning zero tensor in true positive score
  warnings.warn(*args, **kwargs)
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]

5. Training Results

In [34]:
import glob
import shutil
source_file = glob.glob(product['SpokenArabicDigits_MODEL_TRAINING_CHECKPOINT_OUTPUT'] + '/*.ckpt')[0]
print(source_file)
shutil.copyfile(source_file, product['SpokenArabicDigits_BEST_MODEL'])
/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/checkpoint/SpokenArabicDigits-vittsc-mask-epoch=10.ckpt
Out[34]:
'/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model.ckpt'
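
The copied checkpoint can later be restored for inference; a minimal sketch (illustrative; the config must be passed back explicitly because the LightningModule does not call save_hyperparameters()):

    # Illustrative restore of the best checkpoint.
    best_model = VitTimeSeriesTransformer.load_from_checkpoint(
        product['SpokenArabicDigits_BEST_MODEL'], config=get_best_model_config())
    best_model.eval()
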
In [35]:
#%load_ext tensorboard
#%tensorboard --logdir experiments_result/SpokenArabicDigits/vittsc_mask --port 8199

We shut down the kernel.

In [36]:
from nbdev import nbdev_export
nbdev_export()
