# | default_exp model.optimization.nn.tsc.vittsc.face_detection_training_mask_tune
%load_ext autoreload
%autoreload 2
# declare a list of tasks whose products you want to use as inputs
upstream = ['tabular_to_timeseries_face_detection']
# Parameters
upstream = {"tabular_to_timeseries_face_detection": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/301_feature_preprocessing.face_detection.tabular_to_timeseries.html", "FaceDetection_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/train", "FaceDetection_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/valid", "FaceDetection_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/test"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/401_model.optimization.nn.tsc.vittsc.face_detection_training_mask_tune.html", "FaceDetection_MODEL_TUNE_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ray_results", "FaceDetection_MODEL_TRAINING_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result", "FaceDetection_MODEL_TRAINING_CHECKPOINT_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/checkpoint", "FaceDetection_BEST_MODEL": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/best_model.ckpt", "FaceDetection_BEST_MODEL_CONFIG": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/best_model_config.json"}
#| export
upstream = {
    "tabular_to_timeseries_face_detection": {
        "nb": "/home/ubuntu/vitmtsc_nbdev/output/301_feature_preprocessing.face_detection.tabular_to_timeseries.html",
        "FaceDetection_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/train",
        "FaceDetection_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/valid",
        "FaceDetection_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/target_encoding-nn/test",
    }
}
product = {
    "nb": "/home/ubuntu/vitmtsc_nbdev/output/401_model.optimization.nn.tsc.vittsc.face_detection_training_mask_tune.html",
    "FaceDetection_MODEL_TUNE_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/ray_results",
    "FaceDetection_MODEL_TRAINING_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result",
    "FaceDetection_MODEL_TRAINING_CHECKPOINT_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/checkpoint",
    "FaceDetection_BEST_MODEL": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/best_model.ckpt",
    "FaceDetection_BEST_MODEL_CONFIG": "/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/best_model_config.json",
}
#| hide
from nbdev.showdoc import *
#| export
import sys
import pathlib as p
def is_running_from_ipython():
    from IPython import get_ipython
    return get_ipython() is not None


if not is_running_from_ipython() and __package__ is None:
    DIR = p.Path(__file__).resolve().parent
    sys.path.insert(0, str(DIR.parent))
    __package__ = DIR.name
#| export
import torch
import pytorch_lightning
import pandas as pd
import numpy as np
import os
import math
from torch.nn import functional as F
from torch import nn
from torchmetrics import functional as FM
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import LearningRateMonitor
from petastorm import make_batch_reader
from petastorm.pytorch import DataLoader
from einops import rearrange, repeat
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
import pytorch_lightning as pl
Classification Task
Data Loader Module
Hyperparameter Search
# | export
DATASET_NAME = "FaceDetection"
NUM_TARGET = 2
SEQUENCE_LENGTH = 62
NUMBER_OF_FEATURES = 144
NUM_WORKERS = 1
NUM_GPUS = 1
MAX_EPOCHS = 50
TUNE_EPOCHS = 5
NUM_SAMPLES = 1000
#| export
import dask_cudf
import numpy as np
import sklearn.utils.class_weight
def get_train_dataset_size():
    gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_face_detection']['FaceDetection_TRAIN_MODEL_INPUT'], columns=['case_id'])
    return gdf.case_id.nunique().compute()


def get_valid_dataset_size():
    gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_face_detection']['FaceDetection_VALID_MODEL_INPUT'], columns=['case_id'])
    return gdf.case_id.nunique().compute()


def get_test_dataset_size():
    gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_face_detection']['FaceDetection_TEST_MODEL_INPUT'], columns=['case_id'])
    return gdf.case_id.nunique().compute()


def get_class_weight():
    train_gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_face_detection']['FaceDetection_TRAIN_MODEL_INPUT'], columns=['case_id', 'class_vals'])
    y_train = train_gdf['class_vals'].compute().to_numpy()
    class_weight = sklearn.utils.class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weight = class_weight / 2
    print(f'class_weight: {class_weight}')
    return class_weight
get_train_dataset_size(), get_valid_dataset_size(), get_test_dataset_size(), get_class_weight()
class_weight: [0.49978787 0.50021231]
(4712, 1178, 3524, array([0.49978787, 0.50021231]))
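For reference, sklearn's 'balanced' option weights each class by n_samples / (n_classes * count(class)), and the division by 2 in get_class_weight() rescales the two weights so they average roughly 0.5. A minimal sketch with a toy label vector (not the FaceDetection labels):

# illustration only (not exported): how the 'balanced' weights are derived
import numpy as np
import sklearn.utils.class_weight

y_toy = np.array([0, 0, 0, 1])                            # 3 negatives, 1 positive
w = sklearn.utils.class_weight.compute_class_weight('balanced', classes=np.unique(y_toy), y=y_toy)
manual = len(y_toy) / (2 * np.bincount(y_toy))            # n_samples / (n_classes * count)
assert np.allclose(w, manual)                             # [0.6667, 2.0]
print(w / 2)                                              # mirrors the rescaling in get_class_weight()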
# | export
class Residual(nn.Module):
    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(x, **kwargs) + x
# | export
class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)
# | export
class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout=0.0):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
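In the Transformer below these two wrappers are composed as Residual(PreNorm(dim, fn)), i.e. x + fn(LayerNorm(x)). A quick shape check using the classes defined above (toy sizes, illustration only):

# illustration only (not exported): a pre-norm residual feed-forward block preserves token shape
import torch

dim, mlp_dim = 8, 16
block = Residual(PreNorm(dim, FeedForward(dim, mlp_dim)))
x = torch.randn(2, 5, dim)            # (batch, tokens, dim)
assert block(x).shape == x.shape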
# | export
class Attention(nn.Module):
    def __init__(self, dim, heads=10, dim_head=32, dropout=0.0):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.scale = dim_head**-0.5
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.to_out = nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
        self.attn_gradients = None
        self.attention_map = None

    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def forward(self, x, mask=None, register_hook=False):
        b, n, _, h = *x.shape, self.heads
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), qkv)
        dots = torch.einsum("bhid,bhjd->bhij", q, k) * self.scale
        mask_value = -torch.finfo(dots.dtype).max
        if mask is not None:
            # pad the (b, seq_len) mask with True for the prepended CLS token,
            # then broadcast it over heads and query positions
            mask = F.pad(mask.flatten(1), (1, 0), value=True)
            mask = mask.unsqueeze(1).unsqueeze(2)
            assert mask.shape[-1] == dots.shape[-1], "mask has incorrect dimensions"
            dots.masked_fill_(mask == 0.0, mask_value)
            del mask
        attn = dots.softmax(dim=-1)
        out = torch.einsum("bhij,bhjd->bhid", attn, v)
        if register_hook:
            self.save_attention_map(attn)
            attn.register_hook(self.save_attn_gradients)
        out = rearrange(out, "b h n d -> b n (h d)")
        out = self.to_out(out)
        return out
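The masking branch expects a per-time-step mask of shape (batch, seq_len): F.pad prepends a True entry for the CLS token that the model adds in front of the sequence, and the two unsqueeze calls broadcast the mask over heads and query positions so padded key positions receive a large negative logit. A minimal sketch with toy tensors (shapes made up for illustration):

# illustration only (not exported): how the padding mask is reshaped and applied
import torch
import torch.nn.functional as F

b, seq_len, heads = 2, 4, 3
mask = torch.tensor([[1., 1., 1., 0.],     # sample 0: last step is padding
                     [1., 1., 0., 0.]])    # sample 1: last two steps are padding
mask = F.pad(mask.flatten(1), (1, 0), value=True)   # prepend True for the CLS token -> (b, seq_len + 1)
mask = mask.unsqueeze(1).unsqueeze(2)               # (b, 1, 1, seq_len + 1)
dots = torch.zeros(b, heads, seq_len + 1, seq_len + 1)
dots.masked_fill_(mask == 0.0, -torch.finfo(dots.dtype).max)
print(dots[0, 0])   # padded key columns hold a large negative value, so softmax ignores them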
# | export
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(
                nn.ModuleList(
                    [
                        Residual(
                            PreNorm(
                                dim,
                                Attention(
                                    dim, heads=heads, dim_head=dim_head, dropout=dropout
                                ),
                            )
                        ),
                        Residual(
                            PreNorm(dim, FeedForward(dim, mlp_dim, dropout=dropout))
                        ),
                    ]
                )
            )

    def forward(self, x, mask=None, register_hook=False):
        for attn, ff in self.layers:
            x = attn(x, mask=mask, register_hook=register_hook)
            x = ff(x)
        return x
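A tiny end-to-end check: each of the depth layers applies masked attention followed by the feed-forward block, the mask is threaded to every attention layer, and the token shape is preserved. Toy sizes, illustration only:

# illustration only (not exported): shape check of a small Transformer stack with a padding mask
import torch

dim, depth, heads, dim_head, mlp_dim = 16, 2, 4, 8, 32
tiny = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout=0.0)
x = torch.randn(2, 5, dim)      # (batch, CLS token + 4 time steps, dim)
mask = torch.ones(2, 4)         # (batch, seq_len) before CLS padding inside Attention
mask[1, -1] = 0.0               # mark the last step of sample 1 as padding
assert tiny(x, mask=mask).shape == x.shape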
# | export
def petastorm_collate_fn(rows):
    data_df = pd.DataFrame(rows)
    # data_df layout per row: NUMBER_OF_FEATURES * SEQUENCE_LENGTH value columns,
    # followed by class_vals and case_id (144 * 62 + 2 columns for FaceDetection)
    case_id_df = data_df.iloc[
        :,
        NUMBER_OF_FEATURES * SEQUENCE_LENGTH + 1 : NUMBER_OF_FEATURES * SEQUENCE_LENGTH + 2,
    ]
    case_id_tensor = torch.tensor(case_id_df.values.astype(np.float64))
    target_df = data_df.iloc[
        :,
        NUMBER_OF_FEATURES * SEQUENCE_LENGTH + 0 : NUMBER_OF_FEATURES * SEQUENCE_LENGTH + 1,
    ]
    target_tensor = torch.tensor(target_df.values.astype(np.float32))
    data_tensor_df = data_df.iloc[:, 0 * SEQUENCE_LENGTH : NUMBER_OF_FEATURES * SEQUENCE_LENGTH]
    data_tensor = torch.tensor(data_tensor_df.values.astype(np.float32))
    data_tensor = rearrange(data_tensor, "t (b h)-> t h b", h=SEQUENCE_LENGTH)
    # the first SEQUENCE_LENGTH columns double as the per-time-step padding mask
    mask_df = data_df.iloc[:, 0 * SEQUENCE_LENGTH : 1 * SEQUENCE_LENGTH]
    mask_tensor = torch.tensor(mask_df.values.astype(np.float32))
    return (
        data_tensor,
        target_tensor.squeeze(),
        case_id_tensor.squeeze(),
        mask_tensor.squeeze(),
    )
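The collate function assumes each petastorm row is laid out feature-major: NUMBER_OF_FEATURES blocks of SEQUENCE_LENGTH values, followed by class_vals and case_id, with the first block doubling as the padding mask. A toy-sized sketch of that layout and of the rearrange that produces (batch, seq_len, features):

# illustration only (not exported): toy-sized version of the wide-row layout assumed above
import torch
from einops import rearrange

n_feat, seq_len, batch = 3, 4, 2            # stand-ins for 144 features x 62 time steps
flat = torch.arange(batch * n_feat * seq_len, dtype=torch.float32).reshape(batch, n_feat * seq_len)
# columns are grouped feature-major: [f0_t0..f0_t3, f1_t0..f1_t3, f2_t0..f2_t3]
x = rearrange(flat, "t (b h) -> t h b", h=seq_len)   # -> (batch, seq_len, n_feat)
assert x.shape == (batch, seq_len, n_feat)
assert torch.equal(x[0, :, 0], flat[0, :seq_len])    # feature 0's series becomes channel 0
# the mask slice in the collate fn is just the first block, flat[:, :seq_len]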
class VitMTSCPetastormDataModule(pl.LightningDataModule):
    def __init__(
        self,
        config,
        # data_dir=f"file:///home/ubuntu/vitmtsc_nbdev/Multivariate_parquet/{DATASET_NAME}/target_encoding-nn/",
        num_workers=NUM_WORKERS,
        transform_spec=None,
        shard_count=NUM_GPUS,
        num_epochs=MAX_EPOCHS,
    ):
        super().__init__()
        self.train_files = f"file://{upstream['tabular_to_timeseries_face_detection']['FaceDetection_TRAIN_MODEL_INPUT']}"
        self.valid_files = f"file://{upstream['tabular_to_timeseries_face_detection']['FaceDetection_VALID_MODEL_INPUT']}"
        self.test_files = f"file://{upstream['tabular_to_timeseries_face_detection']['FaceDetection_TEST_MODEL_INPUT']}"
        self.batch_size = config["batch_size"]
        self.num_workers = num_workers
        self.transform_spec = transform_spec
        self.shard_count = shard_count
        self.num_epochs = num_epochs

    def train_dataloader(self):
        self.train_ds = make_batch_reader(
            self.train_files,
            workers_count=self.num_workers,
            transform_spec=self.transform_spec,
            cur_shard=int(os.environ["LOCAL_RANK"]),
            shard_count=self.shard_count,
            num_epochs=self.num_epochs,
        )
        return DataLoader(
            self.train_ds, batch_size=self.batch_size, collate_fn=petastorm_collate_fn
        )

    def val_dataloader(self):
        print(
            "val_dataloader: local rank :",
            int(os.environ["LOCAL_RANK"]),
            "shard count: ",
            self.shard_count,
        )
        self.val_ds = make_batch_reader(
            self.valid_files,
            workers_count=self.num_workers,
            transform_spec=self.transform_spec,
            cur_shard=int(os.environ["LOCAL_RANK"]),
            shard_count=self.shard_count,
            num_epochs=self.num_epochs,
        )
        return DataLoader(
            self.val_ds, batch_size=self.batch_size, collate_fn=petastorm_collate_fn
        )

    def test_dataloader(self):
        print(
            "test_dataloader: local rank :",
            int(os.environ["LOCAL_RANK"]),
            "shard count: ",
            self.shard_count,
        )
        self.test_ds = make_batch_reader(
            self.test_files,
            workers_count=self.num_workers,
            transform_spec=self.transform_spec,
            cur_shard=int(os.environ["LOCAL_RANK"]),
            shard_count=self.shard_count,
            num_epochs=self.num_epochs,
        )
        return DataLoader(
            self.test_ds, batch_size=self.batch_size, collate_fn=petastorm_collate_fn
        )
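For debugging it can help to pull a single batch outside the Trainer. A minimal sketch, assuming LOCAL_RANK is set (the petastorm readers shard on it) and the upstream parquet inputs exist; the exact number of rows per batch is governed by the petastorm DataLoader:

# illustration only (not exported): fetch one collated batch directly from the DataModule
import os

os.environ.setdefault("LOCAL_RANK", "0")
dm = VitMTSCPetastormDataModule({"batch_size": 8})
x, y, case_id, mask = next(iter(dm.train_dataloader()))
print(x.shape, y.shape, mask.shape)   # e.g. (8, 62, 144), (8,), (8, 62) for FaceDetection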
# | export
class VitTimeSeriesTransformer(pl.LightningModule):
    def __init__(
        self,
        config,
        c_in=NUMBER_OF_FEATURES,
        c_out=NUM_TARGET,
        seq_len=SEQUENCE_LENGTH,
        class_weight=torch.FloatTensor(get_class_weight()),
    ):
        super(VitTimeSeriesTransformer, self).__init__()
        self.d_model = config["d_model"]
        self.depth = config["depth"]
        self.heads = config["heads"]
        self.mlp_dim = config["mlp_dim"]
        self.dim_head = config["dim_head"]
        self.dropout_p = config["dropout"]
        self.emb_dropout_p = config["emb_dropout"]
        self.lr = config["lr"]
        self.weight_decay = config["weight_decay"]
        self.patience = config["patience"]
        self.pos_embedding = nn.Parameter(torch.randn(1, seq_len + 1, self.d_model))
        self.patch_to_embedding = nn.Linear(c_in, self.d_model)
        self.cls_token = nn.Parameter(torch.randn(1, 1, self.d_model))
        self.dropout = nn.Dropout(self.emb_dropout_p)
        self.transformer = Transformer(
            self.d_model,
            self.depth,
            self.heads,
            self.dim_head,
            self.mlp_dim,
            self.dropout_p,
        )
        self.to_cls_token = nn.Identity()
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(self.d_model),
            nn.Linear(self.d_model, self.mlp_dim),
            nn.GELU(),
            nn.Dropout(self.dropout_p),
            nn.Linear(self.mlp_dim, c_out),
        )
        self.c_out = c_out
        self.register_buffer("class_weight", class_weight)

    def forward(self, x, mask=None, register_hook=False):
        # x = rearrange(x, 'b v s-> b s v')  # bs x nvars x seq_len -> bs x seq_len x nvars
        x = self.patch_to_embedding(x)  # bs x seq_len x nvars -> bs x seq_len x d_model
        b, n, _ = x.shape  # bs, seq_len
        cls_tokens = repeat(self.cls_token, "() n d -> b n d", b=b)  # bs x 1 x d_model
        x = torch.cat((cls_tokens, x), dim=1)  # bs x (seq_len + 1) x d_model
        x += self.pos_embedding[:, : (n + 1)]  # += 1 x (seq_len + 1) x d_model -> bs x (seq_len + 1) x d_model
        x = self.dropout(x)  # bs x (seq_len + 1) x d_model
        x = self.transformer(x, mask=mask, register_hook=register_hook)  # bs x (seq_len + 1) x d_model
        x = self.to_cls_token(x[:, 0])  # bs x d_model
        return self.mlp_head(x)  # bs x num_classes

    def configure_optimizers(self):
        # optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=self.step_size, gamma=self.gamma)
        optimizer = torch.optim.AdamW(
            self.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=self.patience
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": "train_loss",
        }

    def training_step(self, batch, batch_idx):
        x, y, _, mask = batch
        y_hat = self(x, mask)
        y = y.long()
        train_loss = F.cross_entropy(y_hat, y, weight=self.class_weight)
        train_auc = FM.accuracy(F.softmax(y_hat, dim=1), y, num_classes=self.c_out)
        train_auroc = FM.auroc(F.softmax(y_hat, dim=1), y, num_classes=self.c_out)
        self.log(
            "train_loss",
            train_loss,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.log(
            "train_auc",
            train_auc,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return train_loss

    def validation_step(self, batch, batch_idx):
        x, y, _, mask = batch
        y_hat = self(x, mask)
        y = y.long()
        val_loss = F.cross_entropy(y_hat, y, weight=self.class_weight)
        val_auc = FM.accuracy(F.softmax(y_hat, dim=1), y, num_classes=self.c_out)
        val_auroc = FM.auroc(F.softmax(y_hat, dim=1), y, num_classes=self.c_out)
        self.log(
            "val_loss",
            val_loss,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
            sync_dist=True,
        )
        self.log(
            "val_auc", val_auc, on_step=False, on_epoch=True, prog_bar=True, logger=True
        )
        return val_loss

    def test_step(self, batch, batch_idx):
        x, y, _, mask = batch
        y_hat = self(x, mask)
        y = y.long()
        test_loss = F.cross_entropy(y_hat, y, weight=self.class_weight)
        test_auc = FM.accuracy(F.softmax(y_hat, dim=1), y, num_classes=self.c_out)
        test_auroc = FM.auroc(F.softmax(y_hat, dim=1), y, num_classes=self.c_out)
        self.log(
            "test_loss",
            test_loss,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
            sync_dist=True,
        )
        self.log(
            "test_auc",
            test_auc,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return test_loss
class_weight: [0.49978787 0.50021231]
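Before tuning, the module can be smoke-tested on random inputs; the config keys below are exactly the ones the constructor reads, and the values are arbitrary toy choices. The logits are meaningless, this is only a shape check:

# illustration only (not exported): one forward pass with a toy config and random data
import torch

toy_config = {
    "d_model": 32, "depth": 2, "heads": 4, "mlp_dim": 16, "dim_head": 8,
    "dropout": 0.0, "emb_dropout": 0.0, "lr": 1e-3, "weight_decay": 1e-2, "patience": 1,
}
model = VitTimeSeriesTransformer(toy_config)
x = torch.randn(8, SEQUENCE_LENGTH, NUMBER_OF_FEATURES)   # bs x seq_len x nvars
mask = torch.ones(8, SEQUENCE_LENGTH)                     # all time steps valid
logits = model(x, mask)
assert logits.shape == (8, NUM_TARGET)                    # bs x num_classes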
# | export
def get_model(config):
    model = VitTimeSeriesTransformer(config)
    return model


def get_datamodule(config):
    return VitMTSCPetastormDataModule(config)
%env LOCAL_RANK=0
env: LOCAL_RANK=0
#| export
import json
import ray
from ray.tune import ExperimentAnalysis
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.pytorch_lightning import TuneReportCallback
def tune_training(config, num_epochs=TUNE_EPOCHS, num_gpus=NUM_GPUS):
    pl.seed_everything(42, workers=True)
    model = get_model(config)
    dm = get_datamodule(config)
    metrics = {"loss": "val_loss", "auc": "val_auc"}
    callbacks = [TuneReportCallback(metrics, on="validation_end")]
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        # gpus=math.ceil(num_gpus),
        accelerator='gpu', devices=math.ceil(num_gpus),
        strategy="dp",
        callbacks=callbacks,
        limit_train_batches=math.ceil(get_train_dataset_size() / config['batch_size']),
        limit_val_batches=math.ceil(get_valid_dataset_size() / config['batch_size']),
        val_check_interval=math.ceil(get_train_dataset_size() / config['batch_size']),
        num_sanity_val_steps=0,
        reload_dataloaders_every_n_epochs=1,
        deterministic=True,
    )
    trainer.fit(model, dm)
# | export
def tune_training_asha(
    num_samples=NUM_SAMPLES,
    num_epochs=TUNE_EPOCHS,
    num_gpus=NUM_GPUS,
    gpus_per_trial=0.2,
):
    config = {
        "d_model": tune.choice([16, 32, 48, 64, 128, 256, 512]),
        "depth": tune.choice([2, 4, 6, 8]),
        "heads": tune.choice([2, 4, 6, 8]),
        "mlp_dim": tune.choice([8, 10, 12, 14, 16, 20, 24, 32]),
        "dim_head": tune.choice([8, 10, 12, 14, 16]),
        "dropout": tune.loguniform(1e-6, 1e-3),
        "emb_dropout": tune.loguniform(1e-6, 1e-3),
        "weight_decay": tune.loguniform(1e-3, 1e-1),
        "lr": tune.loguniform(1e-6, 1e-1),
        "patience": tune.choice([1, 2]),
        "batch_size": tune.choice([64, 128, 256, 512, 1024]),
    }
    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=[
            "d_model",
            "depth",
            "heads",
            "mlp_dim",
            "dim_head",
            "dropout",
            "emb_dropout",
            "weight_decay",
            "lr",
            "patience",
            "batch_size",
        ],
        metric_columns=["loss", "auc", "training_iteration"],
    )
    trainable = tune.with_parameters(
        tune_training, num_epochs=num_epochs, num_gpus=num_gpus
    )
    analysis = tune.run(
        trainable,
        resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        verbose=1,
        name="FaceDetection",
        raise_on_failed_trial=False,
    )
    print("Best hyperparameters found were: ", analysis.best_config)
%env LOCAL_RANK=0
env: LOCAL_RANK=0
!rm -rf ~/ray_results/FaceDetection/
!rm -rf ./output/FaceDetection/ray_results/
!rm -rf ./output/FaceDetection/experiments_result
!mkdir -p output/FaceDetection/experiments_result
!cp -rf ~/ray_results/FaceDetection/ output/FaceDetection/ray_results/
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)
#analysis = ExperimentAnalysis(f'~/ray_results/{DATASET_NAME}')
analysis = ExperimentAnalysis(product['FaceDetection_MODEL_TUNE_OUTPUT'])
tune_result_df = analysis.results_df[['loss', 'auc', 'training_iteration', 'experiment_tag']]
tune_result_df.nsmallest(5, 'loss')
2022-09-24 06:19:44,215 INFO experiment_analysis.py:757 -- No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/ray/tune/analysis/experiment_analysis.py:303: UserWarning: Dataframes will use '/' instead of '.' to delimit nested result keys in future versions of Ray. For forward compatibility, set the environment variable TUNE_RESULT_DELIM='/'
  warnings.warn(
trial_id | loss | auc | training_iteration | experiment_tag
---|---|---|---|---
3b088_00135 | 0.523333 | 0.739062 | 5 | 135_batch_size=256,d_model=128,depth=6,dim_head=14,dropout=0.0000,emb_dropout=0.0003,heads=8,lr=0.0007,mlp_dim=8,patience=1,weight_decay=0.0092
3b088_00973 | 0.538751 | 0.741406 | 5 | 973_batch_size=128,d_model=512,depth=4,dim_head=8,dropout=0.0003,emb_dropout=0.0003,heads=8,lr=0.0001,mlp_dim=32,patience=1,weight_decay=0.0607
3b088_00218 | 0.539851 | 0.737500 | 5 | 218_batch_size=128,d_model=256,depth=8,dim_head=12,dropout=0.0002,emb_dropout=0.0000,heads=6,lr=0.0001,mlp_dim=14,patience=1,weight_decay=0.0965
3b088_00321 | 0.540390 | 0.749178 | 5 | 321_batch_size=64,d_model=512,depth=6,dim_head=12,dropout=0.0000,emb_dropout=0.0000,heads=4,lr=0.0001,mlp_dim=8,patience=2,weight_decay=0.0034
3b088_00008 | 0.541308 | 0.728906 | 5 | 8_batch_size=256,d_model=256,depth=6,dim_head=12,dropout=0.0000,emb_dropout=0.0003,heads=4,lr=0.0004,mlp_dim=20,patience=1,weight_decay=0.0025
best_config = analysis.get_best_config('loss', 'min')
print(best_config)
{'d_model': 128, 'depth': 6, 'heads': 8, 'mlp_dim': 8, 'dim_head': 14, 'dropout': 2.339598278520724e-05, 'emb_dropout': 0.0002715732862353105, 'weight_decay': 0.009198666704068874, 'lr': 0.0006525079632904397, 'patience': 1, 'batch_size': 256}
#| export
import json
def write_best_model_config():
    analysis = ExperimentAnalysis(product['FaceDetection_MODEL_TUNE_OUTPUT'])
    best_config = analysis.get_best_config('loss', 'min')
    with open(product['FaceDetection_BEST_MODEL_CONFIG'], 'w') as outfile:
        # serialize the best config to JSON
        json_object = json.dumps(best_config, indent=4)
        outfile.write(json_object)
#| export
write_best_model_config()
2022-09-24 06:20:16,757 INFO experiment_analysis.py:757 -- No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.
#| export
def get_best_model_config():
    with open(product['FaceDetection_BEST_MODEL_CONFIG'], 'r') as json_file:
        return json.load(json_file)
best_config = get_best_model_config()
#| export
def training_loop(TB_LOG_DIR, max_epochs=MAX_EPOCHS, config=best_config):
    pl.seed_everything(42, workers=True)
    model = get_model(config)
    dm = get_datamodule(config)
    checkpoint_callback = ModelCheckpoint(
        dirpath=product['FaceDetection_MODEL_TRAINING_CHECKPOINT_OUTPUT'],
        save_top_k=1,  # -1,
        filename=f"{DATASET_NAME}" + '-vittsc-mask-{epoch:02d}',
    )
    tb_logger = pl_loggers.TensorBoardLogger(TB_LOG_DIR)
    lr_monitor = LearningRateMonitor(logging_interval='step')
    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.001, patience=3, verbose=False, mode="min")
    trainer = pl.Trainer(
        # gpus=1,
        accelerator='gpu', devices=1,
        # track_grad_norm=2,
        # plugins='deepspeed',
        # stochastic_weight_avg=True,
        # precision=16,
        max_epochs=max_epochs,
        strategy='dp',  # 'ddp',
        logger=tb_logger,
        callbacks=[lr_monitor, checkpoint_callback, early_stop_callback],
        limit_train_batches=math.ceil(get_train_dataset_size() / config['batch_size']),
        limit_val_batches=math.ceil(get_valid_dataset_size() / config['batch_size']),
        val_check_interval=math.ceil(get_train_dataset_size() / config['batch_size']),
        num_sanity_val_steps=0,
        reload_dataloaders_every_n_epochs=1,
        deterministic=True,
    )
    trainer.fit(model, dm)
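For reference, with the best config's batch_size of 256 and the dataset sizes computed earlier, the per-epoch batch caps passed to the Trainer work out to 19 training and 5 validation batches, which matches the "number of training batches (19)" warning in the training log below:

# illustration only (not exported): the limit_train_batches / limit_val_batches arithmetic
import math

print(math.ceil(4712 / 256), math.ceil(1178 / 256))   # 19, 5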
%env LOCAL_RANK=0
env: LOCAL_RANK=0
#| export
if __name__ == "__main__":
    training_loop(TB_LOG_DIR=product['FaceDetection_MODEL_TRAINING_OUTPUT'],
                  max_epochs=MAX_EPOCHS,
                  config=get_best_model_config())
Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name               | Type        | Params
---------------------------------------------------
0 | patch_to_embedding | Linear      | 18.6 K
1 | dropout            | Dropout     | 0
2 | transformer        | Transformer | 361 K
3 | to_cls_token       | Identity    | 0
4 | mlp_head           | Sequential  | 1.3 K
---------------------------------------------------
389 K     Trainable params
0         Non-trainable params
389 K     Total params
1.556     Total estimated model params size (MB)

(pyarrow/petastorm FutureWarning deprecation messages and a PossibleUserWarning that the number of training batches (19) is smaller than log_every_n_steps=50 are omitted here for brevity.)
Training: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count: 1
Validation: 0it [00:00, ?it/s]
(the "val_dataloader: local rank : 0 shard count: 1" / "Validation: 0it [00:00, ?it/s]" lines repeat once per validation epoch; further repetitions omitted)
import glob
import shutil
source_file = glob.glob(product['FaceDetection_MODEL_TRAINING_CHECKPOINT_OUTPUT'] + '/*.ckpt')[0]
print(source_file)
shutil.copyfile(source_file, product['FaceDetection_BEST_MODEL'])
/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/checkpoint/FaceDetection-vittsc-mask-epoch=07.ckpt
'/home/ubuntu/vitmtsc_nbdev/output/FaceDetection/experiments_result/best_model.ckpt'
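The copied best_model.ckpt can be restored later with Lightning's load_from_checkpoint. A minimal sketch, assuming the checkpoint and best-model config written above exist; because VitTimeSeriesTransformer.__init__ does not call save_hyperparameters(), the tuned config has to be passed in again explicitly. The random input is only a shape check:

# illustration only (not exported): restore the exported best model for inference
import torch

config = get_best_model_config()
model = VitTimeSeriesTransformer.load_from_checkpoint(
    product['FaceDetection_BEST_MODEL'], config=config
)
model.eval()
with torch.no_grad():
    x = torch.randn(1, SEQUENCE_LENGTH, NUMBER_OF_FEATURES)
    mask = torch.ones(1, SEQUENCE_LENGTH)
    probs = torch.softmax(model(x, mask), dim=1)
print(probs.shape)   # (1, NUM_TARGET)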
#%load_ext tensorboard
#%tensorboard --logdir experiments_result/FaceDetection/vittsc_mask --port 8199
We shut down the kernel!
from nbdev import nbdev_export
nbdev_export()
Multi-GPU Training