In [1]:
# | default_exp model.optimization.nn.tsc.vittsc.face_detection_training_mask_tune
%load_ext autoreload
%autoreload 2
In [2]:
# declare a list tasks whose products you want to use as inputs
upstream = ['tabular_to_timeseries_face_detection']
In [3]:
# Parameters
upstream = {"tabular_to_timeseries_face_detection": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/301_feature_preprocessing.face_detection.tabular_to_timeseries.html", "FaceDetection_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/train", "FaceDetection_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/valid", "FaceDetection_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/test"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/401_model.optimization.nn.tsc.vittsc.face_detection_training_mask_tune.html", "FaceDetection_MODEL_TUNE_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ray_results", "FaceDetection_MODEL_TRAINING_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result", "FaceDetection_MODEL_TRAINING_CHECKPOINT_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/checkpoint", "FaceDetection_BEST_MODEL": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/best_model.ckpt", "FaceDetection_BEST_MODEL_CONFIG": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/best_model_config.json"}
In [4]:
#| export
upstream = {
    "tabular_to_timeseries_face_detection": {
        "nb": "/home/ubuntu/vitmtsc_nbdev/output/301_feature_preprocessing.face_detection.tabular_to_timeseries.html",
        "FaceDetection_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/train",
        "FaceDetection_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/valid",
        "FaceDetection_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/test",
    }
}
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/401_model.optimization.nn.tsc.vittsc.face_detection_training_mask_tune.html",
    "FaceDetection_MODEL_TUNE_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ray_results",
    "FaceDetection_MODEL_TRAINING_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result",
    "FaceDetection_MODEL_TRAINING_CHECKPOINT_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/checkpoint",
    "FaceDetection_BEST_MODEL": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/best_model.ckpt",
    "FaceDetection_BEST_MODEL_CONFIG": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/best_model_config.json",
}
In [5]:
#| hide
from nbdev.showdoc import *
In [6]:
#| export
import sys
import pathlib as p

def is_running_from_ipython():
    from IPython import get_ipython
    return get_ipython() is not None

if not is_running_from_ipython() and __package__ is None:
    DIR = p.Path(__file__).resolve().parent
    sys.path.insert(0, str(DIR.parent))
    __package__ = DIR.name
In [7]:
#| export
import torch
import pytorch_lightning
import pandas as pd
import numpy as np
import os
import math 

from torch.nn import functional as F
from torch import nn
from torchmetrics import functional as FM
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import LearningRateMonitor
from petastorm import make_batch_reader
from petastorm.pytorch import DataLoader
from einops import rearrange, repeat
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
import pytorch_lightning as pl

Vision Transformer for Multivariate Time-Series Classification (VitMTSC) Model Training with Masking - Hyperparameter search¶

Classification Task

Data Loader Module

Hyperparameter Search

In [8]:
# | export
DATASET_NAME = "FaceDetection"
NUM_TARGET = 2
SEQUENCE_LENGTH = 62
NUMBER_OF_FEATURES = 144
NUM_WORKERS = 1
NUM_GPUS = 1
MAX_EPOCHS = 50
TUNE_EPOCHS = 5
NUM_SAMPLES = 1000
In [9]:
#| export
import dask_cudf
import numpy as np
import sklearn.utils.class_weight

def get_train_dataset_size():
    gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_face_detection']['FaceDetection_TRAIN_MODEL_INPUT'], columns = ['case_id'])
    return gdf.case_id.nunique().compute()

def get_valid_dataset_size():
    gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_face_detection']['FaceDetection_VALID_MODEL_INPUT'], columns = ['case_id'])
    return gdf.case_id.nunique().compute()

def get_test_dataset_size():
    gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_face_detection']['FaceDetection_TEST_MODEL_INPUT'], columns = ['case_id'])
    return gdf.case_id.nunique().compute()

def get_class_weight():
    train_gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_face_detection']['FaceDetection_TRAIN_MODEL_INPUT'], columns = ['case_id', 'class_vals'])
    y_train = train_gdf['class_vals'].compute().to_numpy()
    class_weight = sklearn.utils.class_weight.compute_class_weight('balanced', classes = np.unique(y_train), y = y_train)
    class_weight = class_weight/2
    print(f'class_weight: {class_weight}')
    return class_weight
In [10]:
get_train_dataset_size(), get_valid_dataset_size(), get_test_dataset_size(), get_class_weight()
class_weight: [0.49978787 0.50021231]
Out[10]:
(4712, 1178, 3524, array([0.49978787, 0.50021231]))

1. Model Definition¶

In [11]:
# | export
class Residual(nn.Module):
    def __init__(self, fn):
        super().__init__()

        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(x, **kwargs) + x
In [12]:
# | export
class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()

        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)
In [13]:
# | export
class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout=0.0):
        super().__init__()

        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
In [14]:
# | export
class Attention(nn.Module):
    def __init__(self, dim, heads=10, dim_head=32, dropout=0.0):
        super().__init__()

        inner_dim = dim_head * heads
        self.heads = heads
        self.scale = dim_head**-0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.to_out = nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))

        self.attn_gradients = None
        self.attention_map = None

    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def forward(self, x, mask=None, register_hook=False):
        b, n, _, h = *x.shape, self.heads
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), qkv)

        dots = torch.einsum("bhid,bhjd->bhij", q, k) * self.scale
        mask_value = -torch.finfo(dots.dtype).max

        # print('mask1.shape', mask.shape)
        if mask is not None:
            # mask = F.pad(mask, (1, 0), value = True)
            mask = F.pad(mask.flatten(1), (1, 0), value=True)
            mask = mask.unsqueeze(1).unsqueeze(2)

            # print('mask2.shape', mask.shape)
            # print('mask:', mask)
            assert mask.shape[-1] == dots.shape[-1], "mask has incorrect dimensions"
            dots.masked_fill_(mask == 0.0, mask_value)
            del mask

        attn = dots.softmax(dim=-1)
        # print('attn.shape: ', attn.shape)
        # print('attn: ', attn)

        out = torch.einsum("bhij,bhjd->bhid", attn, v)

        if register_hook:
            self.save_attention_map(attn)
            attn.register_hook(self.save_attn_gradients)

        out = rearrange(out, "b h n d -> b n (h d)")
        out = self.to_out(out)
        # print('out.shape: ', out.shape)
        # print('out: ', out)
        return out
In [15]:
# | export
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout):
        super().__init__()

        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(
                nn.ModuleList(
                    [
                        Residual(
                            PreNorm(
                                dim,
                                Attention(
                                    dim, heads=heads, dim_head=dim_head, dropout=dropout
                                ),
                            )
                        ),
                        Residual(
                            PreNorm(dim, FeedForward(dim, mlp_dim, dropout=dropout))
                        ),
                    ]
                )
            )

    def forward(self, x, mask=None, register_hook=False):
        for attn, ff in self.layers:
            x = attn(x, mask=mask, register_hook=register_hook)
            x = ff(x)
        return x
In [16]:
# | export
def petastorm_collate_fn(rows):
    data_df = pd.DataFrame(rows)
    # print(f'data_df.shape: {data_df.shape}') # data_df.shape: (2, 4402) 22 * 200 + 2

    case_id_df = data_df.iloc[
        :,
        NUMBER_OF_FEATURES * SEQUENCE_LENGTH
        + 1 : NUMBER_OF_FEATURES * SEQUENCE_LENGTH
        + 2,
    ]  # NUMBER_OF_FEATURES*SEQUENCE_LENGTH+1:NUMBER_OF_FEATURES*SEQUENCE_LENGTH+2
    case_id_tensor = torch.tensor(case_id_df.values.astype(np.float64))

    target_df = data_df.iloc[
        :,
        NUMBER_OF_FEATURES * SEQUENCE_LENGTH
        + 0 : NUMBER_OF_FEATURES * SEQUENCE_LENGTH
        + 1,
    ]  # NUMBER_OF_FEATURES*SEQUENCE_LENGTH+0:NUMBER_OF_FEATURES*SEQUENCE_LENGTH+1
    target_tensor = torch.tensor(target_df.values.astype(np.float32))

    data_tensor_df = data_df.iloc[
        :, 0 * SEQUENCE_LENGTH : NUMBER_OF_FEATURES * SEQUENCE_LENGTH
    ]  # 0*SEQUENCE_LENGTH:NUMBER_OF_FEATURES*SEQUENCE_LENGTH
    data_tensor = torch.tensor(data_tensor_df.values.astype(np.float32))
    data_tensor = rearrange(data_tensor, "t (b h)-> t h b", h=SEQUENCE_LENGTH)

    mask_df = data_df.iloc[
        :, 0 * SEQUENCE_LENGTH : 1 * SEQUENCE_LENGTH
    ]  # 0*SEQUENCE_LENGTH:1*SEQUENCE_LENGTH
    mask_tensor = torch.tensor(mask_df.values.astype(np.float32))

    return (
        data_tensor,
        target_tensor.squeeze(),
        case_id_tensor.squeeze(),
        mask_tensor.squeeze(),
    )


class VitMTSCPetastormDataModule(pl.LightningDataModule):
    def __init__(
        self,
        config,
        #data_dir=f"file:///home/ubuntu/vitmtsc_nbdev/Multivariate_parquet/{DATASET_NAME}/target_encoding-nn/",
        num_workers=NUM_WORKERS,
        transform_spec=None,
        shard_count=NUM_GPUS,
        num_epochs=MAX_EPOCHS,
    ):
        super().__init__()
        self.train_files = f"file://{upstream['tabular_to_timeseries_face_detection']['FaceDetection_TRAIN_MODEL_INPUT']}"
        self.valid_files = f"file://{upstream['tabular_to_timeseries_face_detection']['FaceDetection_VALID_MODEL_INPUT']}"
        self.test_files = f"file://{upstream['tabular_to_timeseries_face_detection']['FaceDetection_TEST_MODEL_INPUT']}"
        self.batch_size = config["batch_size"]
        self.num_workers = num_workers
        self.transform_spec = transform_spec
        self.shard_count = shard_count
        self.num_epochs = num_epochs

    def train_dataloader(self):

        self.train_ds = make_batch_reader(
            self.train_files,
            workers_count=self.num_workers,
            transform_spec=self.transform_spec,
            cur_shard=int(os.environ["LOCAL_RANK"]),
            shard_count=self.shard_count,
            num_epochs=self.num_epochs,
        )
        return DataLoader(
            self.train_ds, batch_size=self.batch_size, collate_fn=petastorm_collate_fn
        )

    def val_dataloader(self):
        print(
            "val_dataloader: local rank :",
            int(os.environ["LOCAL_RANK"]),
            "shard count: ",
            self.shard_count,
        )
        self.val_ds = make_batch_reader(
            self.valid_files,
            workers_count=self.num_workers,
            transform_spec=self.transform_spec,
            cur_shard=int(os.environ["LOCAL_RANK"]),
            shard_count=self.shard_count,
            num_epochs=self.num_epochs,
        )
        return DataLoader(
            self.val_ds, batch_size=self.batch_size, collate_fn=petastorm_collate_fn
        )

    def test_dataloader(self):
        print(
            "test_dataloader: local rank :",
            int(os.environ["LOCAL_RANK"]),
            "shard count: ",
            self.shard_count,
        )
        self.test_ds = make_batch_reader(
            self.test_files,
            workers_count=self.num_workers,
            transform_spec=self.transform_spec,
            cur_shard=int(os.environ["LOCAL_RANK"]),
            shard_count=self.shard_count,
            num_epochs=self.num_epochs,
        )
        return DataLoader(
            self.test_ds, batch_size=self.batch_size, collate_fn=petastorm_collate_fn
        )
In [17]:
# | export
class VitTimeSeriesTransformer(pl.LightningModule):
    def __init__(
        self,
        config,
        c_in=NUMBER_OF_FEATURES,
        c_out=NUM_TARGET,
        seq_len=SEQUENCE_LENGTH,
        class_weight=torch.FloatTensor(get_class_weight()),
    ):
        super(VitTimeSeriesTransformer, self).__init__()

        self.d_model = config["d_model"]
        self.depth = config["depth"]
        self.heads = config["heads"]
        self.mlp_dim = config["mlp_dim"]
        self.dim_head = config["dim_head"]
        self.dropout_p = config["dropout"]
        self.emb_dropout_p = config["emb_dropout"]
        self.lr = config["lr"]
        self.weight_decay = config["weight_decay"]
        self.patience = config["patience"]

        self.pos_embedding = nn.Parameter(torch.randn(1, seq_len + 1, self.d_model))
        self.patch_to_embedding = nn.Linear(c_in, self.d_model)
        self.cls_token = nn.Parameter(torch.randn(1, 1, self.d_model))
        self.dropout = nn.Dropout(self.emb_dropout_p)
        self.transformer = Transformer(
            self.d_model,
            self.depth,
            self.heads,
            self.dim_head,
            self.mlp_dim,
            self.dropout_p,
        )
        self.to_cls_token = nn.Identity()
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(self.d_model),
            nn.Linear(self.d_model, self.mlp_dim),
            nn.GELU(),
            nn.Dropout(self.dropout_p),
            nn.Linear(self.mlp_dim, c_out),
        )

        self.c_out = c_out
        self.register_buffer("class_weight", class_weight)

    def forward(self, x, mask=None, register_hook=False):
        # x = rearrange(x, 'b v s-> b s v') # bs x nvars x seq_len ->  bs x seq_len x nvars
        x = self.patch_to_embedding(x)  # bs x seq_len x nvars -> bs x seq_len x d_model
        b, n, _ = x.shape  # bs, seq_len

        cls_tokens = repeat(self.cls_token, "() n d -> b n d", b=b)  # bs x 1 x d_model
        x = torch.cat((cls_tokens, x), dim=1)  # bs x (seq_len + 1) x d_model
        x += self.pos_embedding[
            :, : (n + 1)
        ]  # +=  1 x (seq_len + 1) x d_model -> # bs x (seq_len + 1) x d_model
        x = self.dropout(x)  # bs x (seq_len + 1) x d_model

        x = self.transformer(
            x, mask=mask, register_hook=register_hook
        )  # bs x (seq_len + 1) x d_model

        x = self.to_cls_token(x[:, 0])  # bs x d_model
        return self.mlp_head(x)  # bs x num_classes

    def configure_optimizers(self):
        # optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=self.step_size, gamma=self.gamma)
        optimizer = torch.optim.AdamW(
            self.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=self.patience
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": "train_loss",
        }

    def training_step(self, batch, batch_idx):
        x, y, _, mask = batch
        y_hat = self(x, mask)
        y = y.long()
        train_loss = F.cross_entropy(y_hat, y, weight=self.class_weight)
        train_auc = FM.accuracy(F.softmax(y_hat, dim=1), y, num_classes=self.c_out)
        train_auroc = FM.auroc(F.softmax(y_hat, dim=1), y, num_classes=self.c_out)
        self.log(
            "train_loss",
            train_loss,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.log(
            "train_auc",
            train_auc,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return train_loss

    def validation_step(self, batch, batch_idx):
        x, y, _, mask = batch
        y_hat = self(x, mask)
        y = y.long()
        val_loss = F.cross_entropy(y_hat, y, weight=self.class_weight)
        val_auc = FM.accuracy(F.softmax(y_hat, dim=1), y, num_classes=self.c_out)
        val_auroc = FM.auroc(F.softmax(y_hat, dim=1), y, num_classes=self.c_out)
        self.log(
            "val_loss",
            val_loss,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
            sync_dist=True,
        )
        self.log(
            "val_auc", val_auc, on_step=False, on_epoch=True, prog_bar=True, logger=True
        )
        return val_loss

    def test_step(self, batch, batch_idx):
        x, y, _, mask = batch
        y_hat = self(x, mask)
        y = y.long()
        test_loss = F.cross_entropy(y_hat, y, weight=self.class_weight)
        test_auc = FM.accuracy(F.softmax(y_hat, dim=1), y, num_classes=self.c_out)
        test_auroc = FM.auroc(F.softmax(y_hat, dim=1), y, num_classes=self.c_out)
        self.log(
            "test_loss",
            test_loss,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
            sync_dist=True,
        )
        self.log(
            "test_auc",
            test_auc,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return test_loss
class_weight: [0.49978787 0.50021231]

2. Routine for Single/Multi-GPU DDP Training¶

In [18]:
# | export
def get_model(config):
    model = VitTimeSeriesTransformer(config)
    return model


def get_datamodule(config):
    return VitMTSCPetastormDataModule(config)
In [19]:
%env LOCAL_RANK=0
env: LOCAL_RANK=0
In [20]:
#| export
import json
import ray
from ray.tune import ExperimentAnalysis
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.pytorch_lightning import TuneReportCallback

def tune_training(config, num_epochs = TUNE_EPOCHS, num_gpus = NUM_GPUS):
    pl.seed_everything(42, workers=True)
    model = get_model(config)
    dm = get_datamodule(config)
    metrics = {"loss": "val_loss", "auc": "val_auc"}
    callbacks = [TuneReportCallback(metrics, on="validation_end")]
    
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        #gpus= math.ceil(num_gpus),
        accelerator='gpu', devices=math.ceil(num_gpus),
        strategy= "dp",
        callbacks=callbacks,
        limit_train_batches= math.ceil(get_train_dataset_size()/config['batch_size']), 
        limit_val_batches= math.ceil(get_valid_dataset_size()/config['batch_size']), 
        val_check_interval= math.ceil(get_train_dataset_size()/config['batch_size']), 
        num_sanity_val_steps=0,
        reload_dataloaders_every_n_epochs=1,
        deterministic=True
    )
    
    trainer.fit(model, dm)
In [21]:
# | export
def tune_training_asha(
    num_samples=NUM_SAMPLES,
    num_epochs=TUNE_EPOCHS,
    num_gpus=NUM_GPUS,
    gpus_per_trial=0.2,
):
    config = {
        "d_model": tune.choice([16, 32, 48, 64, 128, 256, 512]),
        "depth": tune.choice([2, 4, 6, 8]),
        "heads": tune.choice([2, 4, 6, 8]),
        "mlp_dim": tune.choice([8, 10, 12, 14, 16, 20, 24, 32]),
        "dim_head": tune.choice([8, 10, 12, 14, 16]),
        "dropout": tune.loguniform(1e-6, 1e-3),
        "emb_dropout": tune.loguniform(1e-6, 1e-3),
        "weight_decay": tune.loguniform(1e-3, 1e-1),
        "lr": tune.loguniform(1e-6, 1e-1),
        "patience": tune.choice([1, 2]),
        "batch_size": tune.choice([64, 128, 256, 512, 1024]),
    }

    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=[
            "d_model",
            "depth",
            "heads",
            "mlp_dim",
            "dim_head",
            "dropout",
            "emb_dropout",
            "weight_decay",
            "lr",
            "patience",
            "batch_size",
        ],
        metric_columns=["loss", "auc", "training_iteration"],
    )

    trainable = tune.with_parameters(
        tune_training, num_epochs=num_epochs, num_gpus=num_gpus
    )
    analysis = tune.run(
        trainable,
        resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        verbose=1,
        name="FaceDetection",
        raise_on_failed_trial = False
    )

    print("Best hyperparameters found were: ", analysis.best_config)
In [22]:
%env LOCAL_RANK=0
env: LOCAL_RANK=0
In [23]:
!rm -rf ~/ray_results/FaceDetection/ 
!rm -rf ./output/FaceDetection/ray_results/
!rm -rf ./output/FaceDetection/experiments_result
!mkdir -p  output/FaceDetection/experiments_result
In [25]:
!cp -rf ~/ray_results/FaceDetection/ output/FaceDetection/ray_results/
In [26]:
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)

#analysis = ExperimentAnalysis(f'~/ray_results/{DATASET_NAME}')
analysis = ExperimentAnalysis(product['FaceDetection_MODEL_TUNE_OUTPUT'])
tune_result_df = analysis.results_df[['loss', 'auc', 'training_iteration', 'experiment_tag']]
tune_result_df.nsmallest(5, 'loss')
2022-09-24 06:19:44,215	INFO experiment_analysis.py:757 -- No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/ray/tune/analysis/experiment_analysis.py:303: UserWarning: Dataframes will use '/' instead of '.' to delimit nested result keys in future versions of Ray. For forward compatibility, set the environment variable TUNE_RESULT_DELIM='/'
  warnings.warn(
Out[26]:
loss auc training_iteration experiment_tag
trial_id
3b088_00135 0.523333 0.739062 5 135_batch_size=256,d_model=128,depth=6,dim_head=14,dropout=0.0000,emb_dropout=0.0003,heads=8,lr=0.0007,mlp_dim=8,patience=1,weight_decay=0.0092
3b088_00973 0.538751 0.741406 5 973_batch_size=128,d_model=512,depth=4,dim_head=8,dropout=0.0003,emb_dropout=0.0003,heads=8,lr=0.0001,mlp_dim=32,patience=1,weight_decay=0.0607
3b088_00218 0.539851 0.737500 5 218_batch_size=128,d_model=256,depth=8,dim_head=12,dropout=0.0002,emb_dropout=0.0000,heads=6,lr=0.0001,mlp_dim=14,patience=1,weight_decay=0.0965
3b088_00321 0.540390 0.749178 5 321_batch_size=64,d_model=512,depth=6,dim_head=12,dropout=0.0000,emb_dropout=0.0000,heads=4,lr=0.0001,mlp_dim=8,patience=2,weight_decay=0.0034
3b088_00008 0.541308 0.728906 5 8_batch_size=256,d_model=256,depth=6,dim_head=12,dropout=0.0000,emb_dropout=0.0003,heads=4,lr=0.0004,mlp_dim=20,patience=1,weight_decay=0.0025
In [27]:
best_config = analysis.get_best_config('loss', 'min')
print(best_config)
{'d_model': 128, 'depth': 6, 'heads': 8, 'mlp_dim': 8, 'dim_head': 14, 'dropout': 2.339598278520724e-05, 'emb_dropout': 0.0002715732862353105, 'weight_decay': 0.009198666704068874, 'lr': 0.0006525079632904397, 'patience': 1, 'batch_size': 256}
In [28]:
#| export
import json
def write_best_model_config():
    analysis = ExperimentAnalysis(product['FaceDetection_MODEL_TUNE_OUTPUT'])
    best_config = analysis.get_best_config('loss', 'min')
    with open(product['FaceDetection_BEST_MODEL_CONFIG'], 'w') as outfile:
        # Serializing json
        json_object = json.dumps(best_config, indent=4)
        outfile.write(json_object)
In [29]:
#| export
write_best_model_config()
2022-09-24 06:20:16,757	INFO experiment_analysis.py:757 -- No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.
In [30]:
#| export
def get_best_model_config():
    with open(product['FaceDetection_BEST_MODEL_CONFIG'], 'r') as json_file:
        return json.load(json_file)
        
best_config = get_best_model_config()
In [31]:
#| export
def training_loop(TB_LOG_DIR, max_epochs = MAX_EPOCHS, config = best_config):
    pl.seed_everything(42, workers=True)
    model = get_model(config)
    dm = get_datamodule(config)

    checkpoint_callback = ModelCheckpoint(dirpath=product['FaceDetection_MODEL_TRAINING_CHECKPOINT_OUTPUT'], 
                                          save_top_k = 1, #-1, 
                                          filename=f"{DATASET_NAME}" + '-vittsc-mask-{epoch:02d}')
    tb_logger = pl_loggers.TensorBoardLogger(TB_LOG_DIR)
    
    lr_monitor = LearningRateMonitor(logging_interval='step')

    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.001, patience=3, verbose=False, mode="min")
    
    trainer = pl.Trainer(
        #gpus=1,
        accelerator='gpu', devices=1,
        #track_grad_norm=2,
        #plugins='deepspeed', 
        #stochastic_weight_avg=True,
        #precision=16,
        max_epochs=max_epochs,
        strategy= 'dp',   #'ddp',
        logger=tb_logger,
        callbacks=[lr_monitor, checkpoint_callback, early_stop_callback],
        limit_train_batches= math.ceil(get_train_dataset_size()/config['batch_size']), 
        limit_val_batches= math.ceil(get_valid_dataset_size()/config['batch_size']), 
        val_check_interval= math.ceil(get_train_dataset_size()/config['batch_size']), 
        num_sanity_val_steps=0,
        reload_dataloaders_every_n_epochs=1,
        deterministic=True
    )
    
    trainer.fit(model, dm)
In [32]:
%env LOCAL_RANK=0
env: LOCAL_RANK=0

4. Model Training¶

In [33]:
#| export
if __name__ == "__main__":
    training_loop(TB_LOG_DIR = product['FaceDetection_MODEL_TRAINING_OUTPUT'],
                  max_epochs = MAX_EPOCHS, 
                  config = get_best_model_config())
Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name               | Type        | Params
---------------------------------------------------
0 | patch_to_embedding | Linear      | 18.6 K
1 | dropout            | Dropout     | 0     
2 | transformer        | Transformer | 361 K 
3 | to_cls_token       | Identity    | 0     
4 | mlp_head           | Sequential  | 1.3 K 
---------------------------------------------------
389 K     Trainable params
0         Non-trainable params
389 K     Total params
1.556     Total estimated model params size (MB)
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/fs_utils.py:88: FutureWarning: pyarrow.localfs is deprecated as of 2.0.0, please use pyarrow.fs.LocalFileSystem instead.
  self._filesystem = pyarrow.localfs
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:402: FutureWarning: Specifying the 'metadata_nthreads' argument is deprecated as of pyarrow 8.0.0, and the argument will be removed in a future version
  dataset = pq.ParquetDataset(path_or_paths, filesystem=fs, validate_schema=False, metadata_nthreads=10)
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:362: FutureWarning: 'ParquetDataset.common_metadata' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version.
  if not dataset.common_metadata:
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/reader.py:405: FutureWarning: Specifying the 'metadata_nthreads' argument is deprecated as of pyarrow 8.0.0, and the argument will be removed in a future version
  self.dataset = pq.ParquetDataset(dataset_path, filesystem=pyarrow_filesystem,
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/unischema.py:317: FutureWarning: 'ParquetDataset.pieces' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.fragments' attribute instead.
  meta = parquet_dataset.pieces[0].get_metadata()
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/unischema.py:321: FutureWarning: 'ParquetDataset.partitions' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.partitioning' attribute instead.
  for partition in (parquet_dataset.partitions or []):
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:253: FutureWarning: 'ParquetDataset.metadata' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version.
  metadata = dataset.metadata
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:254: FutureWarning: 'ParquetDataset.common_metadata' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version.
  common_metadata = dataset.common_metadata
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:350: FutureWarning: 'ParquetDataset.pieces' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.fragments' attribute instead.
  futures_list = [thread_pool.submit(_split_piece, piece, dataset.fs.open) for piece in dataset.pieces]
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:350: FutureWarning: 'ParquetDataset.fs' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.filesystem' attribute instead.
  futures_list = [thread_pool.submit(_split_piece, piece, dataset.fs.open) for piece in dataset.pieces]
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/etl/dataset_metadata.py:334: FutureWarning: ParquetDatasetPiece is deprecated as of pyarrow 5.0.0 and will be removed in a future version.
  return [pq.ParquetDatasetPiece(piece.path, open_file_func=fs_open,
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:1933: PossibleUserWarning: The number of training batches (19) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
  rank_zero_warn(
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/arrow_reader_worker.py:138: FutureWarning: 'ParquetDataset.fs' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.filesystem' attribute instead.
  parquet_file = ParquetFile(self._dataset.fs.open(piece.path))
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/arrow_reader_worker.py:286: FutureWarning: 'ParquetDataset.partitions' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.partitioning' attribute instead.
  partition_names = self._dataset.partitions.partition_names if self._dataset.partitions else set()
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/arrow_reader_worker.py:289: FutureWarning: 'ParquetDataset.partitions' attribute is deprecated as of pyarrow 5.0.0 and will be removed in a future version. Specify 'use_legacy_dataset=False' while constructing the ParquetDataset, and then use the '.partitioning' attribute instead.
  table = piece.read(columns=column_names - partition_names, partitions=self._dataset.partitions)
val_dataloader: local rank : 0 shard count:  1
Training: 0it [00:00, ?it/s]
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count:  1
Validation: 0it [00:00, ?it/s]

5. Training Results¶

In [34]:
import glob
import shutil
source_file = glob.glob(product['FaceDetection_MODEL_TRAINING_CHECKPOINT_OUTPUT'] + '/*.ckpt')[0]
print(source_file)
shutil.copyfile(source_file, product['FaceDetection_BEST_MODEL'])
/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/checkpoint/FaceDetection-vittsc-mask-epoch=07.ckpt
Out[34]:
'/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/best_model.ckpt'
In [35]:
#%load_ext tensorboard
#%tensorboard --logdir experiments_result/FaceDetection/vittsc_mask --port 8199

We shutdown the kernel!!!

In [36]:
from nbdev import nbdev_export
nbdev_export()

Multi-GPU Training

In [ ]: