#| default_exp model.optimization.nn.tsc.vittsc.spoken_arabic_digits_training_mask_tune
%load_ext autoreload
%autoreload 2
# declare a list of tasks whose products you want to use as inputs
upstream = ['tabular_to_timeseries_spoken_arabic_digits']
# Parameters
upstream = {"tabular_to_timeseries_spoken_arabic_digits": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/304_feature_preprocessing.spoken_arabic_digits.tabular_to_timeseries.html", "SpokenArabicDigits_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/train", "SpokenArabicDigits_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/valid", "SpokenArabicDigits_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/test"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/404_model.optimization.nn.tsc.vittsc.spoken_arabic_digits_training_mask_tune.html", "SpokenArabicDigits_MODEL_TUNE_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ray_results", "SpokenArabicDigits_MODEL_TRAINING_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result", "SpokenArabicDigits_MODEL_TRAINING_CHECKPOINT_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/checkpoint", "SpokenArabicDigits_BEST_MODEL": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model.ckpt", "SpokenArabicDigits_BEST_MODEL_CONFIG": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model_config.json"}
#| export
upstream = {
"tabular_to_timeseries_spoken_arabic_digits": {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/304_feature_preprocessing.spoken_arabic_digits.tabular_to_timeseries.html",
"SpokenArabicDigits_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/train",
"SpokenArabicDigits_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/valid",
"SpokenArabicDigits_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/test",
}
}
product = {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/404_model.optimization.nn.tsc.vittsc.spoken_arabic_digits_training_mask_tune.html",
"SpokenArabicDigits_MODEL_TUNE_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ray_results",
"SpokenArabicDigits_MODEL_TRAINING_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result",
"SpokenArabicDigits_MODEL_TRAINING_CHECKPOINT_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/checkpoint",
"SpokenArabicDigits_BEST_MODEL": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model.ckpt",
"SpokenArabicDigits_BEST_MODEL_CONFIG": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model_config.json",
}
#| hide
from nbdev.showdoc import *
#| export
import sys
import pathlib as p
def is_running_from_ipython():
from IPython import get_ipython
return get_ipython() is not None
if not is_running_from_ipython() and __package__ is None:
DIR = p.Path(__file__).resolve().parent
sys.path.insert(0, str(DIR.parent))
__package__ = DIR.name
#| export
import torch
import pytorch_lightning
import pandas as pd
import numpy as np
import os
import math
from torch.nn import functional as F
from torch import nn
from torchmetrics import functional as FM
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import LearningRateMonitor
from petastorm import make_batch_reader
from petastorm.pytorch import DataLoader
from einops import rearrange, repeat
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
import pytorch_lightning as pl
Classification Task
Data Loader Module
Hyperparameter Search
#| export
DATASET_NAME = 'SpokenArabicDigits'
NUM_TARGET = 10
SEQUENCE_LENGTH = 93
NUMBER_OF_FEATURES = 13
NUM_WORKERS = 1
NUM_GPUS = 1
MAX_EPOCHS = 50
TUNE_EPOCHS = 5
NUM_SAMPLES = 1000
#| export
import dask_cudf
import numpy as np
import sklearn.utils.class_weight
def get_train_dataset_size():
gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_TRAIN_MODEL_INPUT'], columns = ['case_id'])
return gdf.case_id.nunique().compute()
def get_valid_dataset_size():
gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_VALID_MODEL_INPUT'], columns = ['case_id'])
return gdf.case_id.nunique().compute()
def get_test_dataset_size():
gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_TEST_MODEL_INPUT'], columns = ['case_id'])
return gdf.case_id.nunique().compute()
def get_class_weight():
train_gdf = dask_cudf.read_parquet(upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_TRAIN_MODEL_INPUT'], columns = ['case_id', 'class_vals'])
y_train = train_gdf['class_vals'].compute().to_numpy()
class_weight = sklearn.utils.class_weight.compute_class_weight('balanced', classes = np.unique(y_train), y = y_train)
class_weight = class_weight/2
print(f'class_weight: {class_weight}')
return class_weight
get_train_dataset_size(), get_valid_dataset_size(), get_test_dataset_size(), get_class_weight()
class_weight: [0.51958661 0.48699262 0.51452242 0.48342491 0.491527 0.50372137 0.49061338 0.50662188 0.51552734 0.49061338]
(5279, 1320, 2199, array([0.51958661, 0.48699262, 0.51452242, 0.48342491, 0.491527 , 0.50372137, 0.49061338, 0.50662188, 0.51552734, 0.49061338]))
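For reference, scikit-learn's 'balanced' weighting is n_samples / (n_classes * count_per_class); the additional division by 2 above is the notebook's own scaling of those weights. A toy, hand-checked sketch with hypothetical labels (not the dataset):
# Toy check of sklearn's 'balanced' class weights (hypothetical labels, not SpokenArabicDigits).
y_toy = np.array([0, 0, 0, 1, 1, 2])                      # 3 classes, counts [3, 2, 1]
manual = len(y_toy) / (3 * np.bincount(y_toy))            # n_samples / (n_classes * count_c)
sk = sklearn.utils.class_weight.compute_class_weight('balanced', classes=np.unique(y_toy), y=y_toy)
assert np.allclose(manual, sk)                            # both give [0.667, 1.0, 2.0]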
#| export
class Residual(nn.Module):
def __init__(self, fn):
super().__init__()
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(x, **kwargs) + x
#| export
class PreNorm(nn.Module):
def __init__(self, dim, fn):
super().__init__()
self.norm = nn.LayerNorm(dim)
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(self.norm(x), **kwargs)
#| export
class FeedForward(nn.Module):
def __init__(self, dim, hidden_dim, dropout = 0.):
super().__init__()
self.net = nn.Sequential(
nn.Linear(dim, hidden_dim),
nn.GELU(),
nn.Dropout(dropout),
nn.Linear(hidden_dim, dim),
nn.Dropout(dropout)
)
def forward(self, x):
return self.net(x)
#| export
class Attention(nn.Module):
def __init__(self, dim, heads = 10, dim_head = 32, dropout = 0.):
super().__init__()
inner_dim = dim_head * heads
self.heads = heads
self.scale = dim_head ** -0.5
self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
self.to_out = nn.Sequential(
nn.Linear(inner_dim, dim),
nn.Dropout(dropout)
)
self.attn_gradients = None
self.attention_map = None
def save_attn_gradients(self, attn_gradients):
self.attn_gradients = attn_gradients
def get_attn_gradients(self):
return self.attn_gradients
def save_attention_map(self, attention_map):
self.attention_map = attention_map
def get_attention_map(self):
return self.attention_map
def forward(self, x, mask = None, register_hook = False):
b, n, _, h = *x.shape, self.heads
qkv = self.to_qkv(x).chunk(3, dim = -1)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)
dots = torch.einsum('bhid,bhjd->bhij', q, k) * self.scale
mask_value = -torch.finfo(dots.dtype).max
#print('mask1.shape', mask.shape)
if mask is not None:
#mask = F.pad(mask, (1, 0), value = True)
mask = F.pad(mask.flatten(1), (1, 0), value = True)
mask = mask.unsqueeze(1).unsqueeze(2)
#print('mask2.shape', mask.shape)
# print('mask:', mask)
assert mask.shape[-1] == dots.shape[-1], 'mask has incorrect dimensions'
dots.masked_fill_(mask == 0.0, mask_value)
del mask
attn = dots.softmax(dim=-1)
#print('attn.shape: ', attn.shape)
#print('attn: ', attn)
out = torch.einsum('bhij,bhjd->bhid', attn, v)
if register_hook:
self.save_attention_map(attn)
attn.register_hook(self.save_attn_gradients)
out = rearrange(out, 'b h n d -> b n (h d)')
out = self.to_out(out)
#print('out.shape: ', out.shape)
#print('out: ', out)
return out
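A quick standalone shape check of the masked attention block with toy sizes: forward pads one extra True entry onto the mask for the CLS token, so a mask covering seq_len positions pairs with an input of seq_len + 1 tokens.
# Minimal sketch with assumed toy sizes: 2 samples, 93 time steps + 1 CLS token, d_model = 64.
toy_attn = Attention(dim=64, heads=4, dim_head=12, dropout=0.)
toy_x = torch.randn(2, SEQUENCE_LENGTH + 1, 64)    # (batch, seq_len + 1, dim)
toy_mask = torch.ones(2, SEQUENCE_LENGTH)          # 1.0 = real step, 0.0 = padding
toy_mask[1, 60:] = 0.0                             # pretend the second series is padded after step 60
print(toy_attn(toy_x, mask=toy_mask).shape)        # torch.Size([2, 94, 64])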
#| export
class Transformer(nn.Module):
def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout):
super().__init__()
self.layers = nn.ModuleList([])
for _ in range(depth):
self.layers.append(nn.ModuleList([
Residual(PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout)))
]))
def forward(self, x, mask = None, register_hook = False):
for attn, ff in self.layers:
x = attn(x, mask = mask, register_hook = register_hook)
x = ff(x)
return x
#| export
def petastorm_collate_fn(rows):
data_df = pd.DataFrame(rows)
    #print(f'data_df.shape: {data_df.shape}')  # expected (batch_size, NUMBER_OF_FEATURES*SEQUENCE_LENGTH + 2) = (batch_size, 13*93 + 2) = (batch_size, 1211)
case_id_df = data_df.iloc[:, NUMBER_OF_FEATURES*SEQUENCE_LENGTH+1:NUMBER_OF_FEATURES*SEQUENCE_LENGTH+2] # NUMBER_OF_FEATURES*SEQUENCE_LENGTH+1:NUMBER_OF_FEATURES*SEQUENCE_LENGTH+2
case_id_tensor = torch.tensor(case_id_df.values.astype(np.float64))
target_df = data_df.iloc[:, NUMBER_OF_FEATURES*SEQUENCE_LENGTH+0:NUMBER_OF_FEATURES*SEQUENCE_LENGTH+1] # NUMBER_OF_FEATURES*SEQUENCE_LENGTH+0:NUMBER_OF_FEATURES*SEQUENCE_LENGTH+1
target_tensor = torch.tensor(target_df.values.astype(np.float32))
data_tensor_df = data_df.iloc[:, 0*SEQUENCE_LENGTH:NUMBER_OF_FEATURES*SEQUENCE_LENGTH] # 0*SEQUENCE_LENGTH:NUMBER_OF_FEATURES*SEQUENCE_LENGTH
data_tensor = torch.tensor(data_tensor_df.values.astype(np.float32))
data_tensor = rearrange(data_tensor, 't (b h)-> t h b', h = SEQUENCE_LENGTH)
mask_df = data_df.iloc[:, 0*SEQUENCE_LENGTH:1*SEQUENCE_LENGTH] # 0*SEQUENCE_LENGTH:1*SEQUENCE_LENGTH
mask_tensor = torch.tensor(mask_df.values.astype(np.float32))
return data_tensor, target_tensor.squeeze(), case_id_tensor.squeeze(), mask_tensor.squeeze()
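petastorm_collate_fn assumes each flattened row holds 13 features x 93 time steps in feature-major order, followed by class_vals and case_id (13*93 + 2 = 1211 columns), and it reuses the first feature block as the padding mask (entries equal to 0.0 are masked out). A minimal sketch with a synthetic batch, where a plain NumPy array stands in for the petastorm rows:
# Hypothetical stand-in for a petastorm batch: 2 rows of 13*93 feature values + class_vals + case_id.
fake_rows = np.random.rand(2, NUMBER_OF_FEATURES * SEQUENCE_LENGTH + 2).astype(np.float32)
fake_rows[:, -2] = [3., 7.]        # class_vals column
fake_rows[:, -1] = [101., 102.]    # case_id column
data, target, case_id, mask = petastorm_collate_fn(fake_rows)
print(data.shape, target.shape, mask.shape)  # torch.Size([2, 93, 13]) torch.Size([2]) torch.Size([2, 93])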
class VitMTSCPetastormDataModule(pl.LightningDataModule):
def __init__(self, config,
#data_dir=f"file:///home/ubuntu/vitmtsc_nbdev/Multivariate_parquet/{DATASET_NAME}/target_encoding-nn/",
num_workers=NUM_WORKERS,
transform_spec = None,
shard_count = NUM_GPUS,
num_epochs = MAX_EPOCHS):
super().__init__()
#self.data_dir = data_dir
#self.train_files = os.path.join(data_dir, 'train')
#self.valid_files = os.path.join(data_dir, 'valid')
#self.test_files = os.path.join(data_dir, 'test')
self.train_files = f"file://{upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_TRAIN_MODEL_INPUT']}"
self.valid_files = f"file://{upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_VALID_MODEL_INPUT']}"
self.test_files = f"file://{upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_TEST_MODEL_INPUT']}"
self.batch_size = config["batch_size"]
self.num_workers = num_workers
self.transform_spec = transform_spec
self.shard_count = shard_count
self.num_epochs = num_epochs
def train_dataloader(self):
self.train_ds = make_batch_reader(self.train_files, workers_count=self.num_workers, transform_spec=self.transform_spec,
cur_shard = int(os.environ['LOCAL_RANK']), shard_count = self.shard_count, num_epochs = self.num_epochs)
return DataLoader(self.train_ds, batch_size = self.batch_size, collate_fn= petastorm_collate_fn)
def val_dataloader(self):
print('val_dataloader: local rank :', int(os.environ['LOCAL_RANK']), 'shard count: ', self.shard_count)
self.val_ds = make_batch_reader(self.valid_files, workers_count=self.num_workers, transform_spec=self.transform_spec,
cur_shard = int(os.environ['LOCAL_RANK']), shard_count = self.shard_count, num_epochs = self.num_epochs)
return DataLoader(self.val_ds, batch_size = self.batch_size, collate_fn= petastorm_collate_fn)
def test_dataloader(self):
print('test_dataloader: local rank :', int(os.environ['LOCAL_RANK']), 'shard count: ', self.shard_count)
self.test_ds = make_batch_reader(self.test_files, workers_count=self.num_workers, transform_spec=self.transform_spec,
cur_shard = int(os.environ['LOCAL_RANK']), shard_count = self.shard_count, num_epochs = self.num_epochs)
return DataLoader(self.test_ds, batch_size = self.batch_size, collate_fn= petastorm_collate_fn)
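The data module shards the petastorm readers by the LOCAL_RANK environment variable, so it must be set even for single-GPU runs (the %env cells below do exactly that). A minimal standalone sketch with an assumed batch size, pulling one collated batch from the training shard:
# Minimal sketch (assumed batch size of 64; one pass over the training reader).
import os
os.environ.setdefault('LOCAL_RANK', '0')           # required by the cur_shard lookup above
toy_dm = VitMTSCPetastormDataModule({'batch_size': 64}, num_epochs=1)
with toy_dm.train_dataloader() as toy_loader:      # petastorm's DataLoader supports the context-manager form
    x, y, case_id, mask = next(iter(toy_loader))
print(x.shape, y.shape, mask.shape)                # roughly (64, 93, 13), (64,), (64, 93)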
#| export
class VitTimeSeriesTransformer(pl.LightningModule):
def __init__(self, config, c_in = NUMBER_OF_FEATURES, c_out = NUM_TARGET,
seq_len = SEQUENCE_LENGTH,class_weight = torch.FloatTensor(get_class_weight())):
super(VitTimeSeriesTransformer, self).__init__()
self.d_model = config["d_model"]
self.depth = config["depth"]
self.heads = config["heads"]
self.mlp_dim = config["mlp_dim"]
self.dim_head = config["dim_head"]
self.dropout_p = config["dropout"]
self.emb_dropout_p = config["emb_dropout"]
self.lr = config["lr"]
self.weight_decay = config["weight_decay"]
self.patience = config["patience"]
self.pos_embedding = nn.Parameter(torch.randn(1, seq_len + 1, self.d_model))
self.patch_to_embedding = nn.Linear(c_in, self.d_model)
self.cls_token = nn.Parameter(torch.randn(1, 1, self.d_model))
self.dropout = nn.Dropout(self.emb_dropout_p)
self.transformer = Transformer(self.d_model, self.depth, self.heads, self.dim_head, self.mlp_dim, self.dropout_p)
self.to_cls_token = nn.Identity()
self.mlp_head = nn.Sequential(
nn.LayerNorm(self.d_model),
nn.Linear(self.d_model, self.mlp_dim),
nn.GELU(),
nn.Dropout(self.dropout_p),
nn.Linear(self.mlp_dim, c_out)
)
self.c_out = c_out
self.register_buffer('class_weight', class_weight)
def forward(self, x, mask = None, register_hook = False):
#x = rearrange(x, 'b v s-> b s v') # bs x nvars x seq_len -> bs x seq_len x nvars
x = self.patch_to_embedding(x) # bs x seq_len x nvars -> bs x seq_len x d_model
b, n, _ = x.shape # bs, seq_len
cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b) # bs x 1 x d_model
x = torch.cat((cls_tokens, x), dim=1) # bs x (seq_len + 1) x d_model
x += self.pos_embedding[:, :(n + 1)] # += 1 x (seq_len + 1) x d_model -> # bs x (seq_len + 1) x d_model
x = self.dropout(x) # bs x (seq_len + 1) x d_model
x = self.transformer(x, mask = mask, register_hook = register_hook) # bs x (seq_len + 1) x d_model
x = self.to_cls_token(x[:, 0]) # bs x d_model
return self.mlp_head(x) # bs x num_classes
def configure_optimizers(self):
#optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=self.step_size, gamma=self.gamma)
optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=self.patience)
return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "train_loss"}
def training_step(self, batch, batch_idx):
x, y, _, mask = batch
y_hat = self(x, mask)
y = y.long()
train_loss = F.cross_entropy(y_hat, y, weight = self.class_weight)
        train_auc = FM.accuracy(F.softmax(y_hat, dim=1), y, num_classes = self.c_out)  # note: named 'auc' but computed with FM.accuracy (the val/test steps mirror this)
        train_auroc = FM.auroc(F.softmax(y_hat, dim=1), y, num_classes = self.c_out)   # computed but not logged
self.log('train_loss', train_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('train_auc', train_auc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return train_loss
def validation_step(self, batch, batch_idx):
x, y, _, mask = batch
y_hat = self(x, mask)
y = y.long()
val_loss = F.cross_entropy(y_hat, y, weight = self.class_weight)
val_auc = FM.accuracy(F.softmax(y_hat, dim=1), y, num_classes = self.c_out)
val_auroc = FM.auroc(F.softmax(y_hat, dim=1), y, num_classes = self.c_out)
self.log('val_loss', val_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
self.log('val_auc', val_auc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return val_loss
def test_step(self, batch, batch_idx):
x, y, _, mask = batch
y_hat = self(x, mask)
y = y.long()
test_loss = F.cross_entropy(y_hat, y, weight = self.class_weight)
test_auc = FM.accuracy(F.softmax(y_hat, dim=1), y, num_classes = self.c_out)
test_auroc = FM.auroc(F.softmax(y_hat, dim=1), y, num_classes = self.c_out)
self.log('test_loss', test_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
self.log('test_auc', test_auc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return test_loss
class_weight: [0.51958661 0.48699262 0.51452242 0.48342491 0.491527 0.50372137 0.49061338 0.50662188 0.51552734 0.49061338]
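The class_weight line above appears because the constructor's default argument calls get_class_weight() when the class is defined. A quick forward-pass sanity check on random data (toy config; class weights passed explicitly so no data access is needed):
# Minimal sketch with an assumed toy config: 2 random series of 93 steps x 13 features -> 10 logits.
toy_config = {"d_model": 64, "depth": 2, "heads": 4, "mlp_dim": 16, "dim_head": 12,
              "dropout": 0., "emb_dropout": 0., "lr": 1e-3, "weight_decay": 1e-4, "patience": 2}
toy_model = VitTimeSeriesTransformer(toy_config, class_weight=torch.ones(NUM_TARGET))
toy_x = torch.randn(2, SEQUENCE_LENGTH, NUMBER_OF_FEATURES)
toy_mask = torch.ones(2, SEQUENCE_LENGTH)
print(toy_model(toy_x, toy_mask).shape)    # torch.Size([2, 10])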
#| export
def get_model(config):
model = VitTimeSeriesTransformer(config)
return model
def get_datamodule(config):
return VitMTSCPetastormDataModule(config)
%env LOCAL_RANK=0
env: LOCAL_RANK=0
#| export
import json
import ray
from ray.tune import ExperimentAnalysis
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.pytorch_lightning import TuneReportCallback
def tune_training(config, num_epochs = TUNE_EPOCHS, num_gpus = NUM_GPUS):
pl.seed_everything(42, workers=True)
model = get_model(config)
dm = get_datamodule(config)
metrics = {"loss": "val_loss", "auc": "val_auc"}
callbacks = [TuneReportCallback(metrics, on="validation_end")]
trainer = pl.Trainer(
max_epochs=num_epochs,
# If fractional GPUs passed in, convert to int.
#gpus= math.ceil(num_gpus),
accelerator='gpu', devices=math.ceil(num_gpus),
strategy= "dp",
callbacks=callbacks,
limit_train_batches= math.ceil(get_train_dataset_size()/config['batch_size']),
limit_val_batches= math.ceil(get_valid_dataset_size()/config['batch_size']),
val_check_interval= math.ceil(get_train_dataset_size()/config['batch_size']),
num_sanity_val_steps=0,
reload_dataloaders_every_n_epochs=1,
deterministic=True
)
trainer.fit(model, dm)
#| export
def tune_training_asha(num_samples=NUM_SAMPLES, num_epochs=TUNE_EPOCHS, num_gpus = NUM_GPUS, gpus_per_trial=0.5):
config = {
"d_model": tune.choice([16, 32, 48, 64]),
"depth": tune.choice([2, 4, 6, 8]),
"heads": tune.choice([2, 4, 6, 8]),
"mlp_dim": tune.choice([8, 10, 12, 14, 16, 20, 24, 32]),
"dim_head": tune.choice([8, 10, 12, 14, 16]),
"dropout": tune.loguniform(1e-6, 1e-3),
"emb_dropout": tune.loguniform(1e-6, 1e-3),
"weight_decay": tune.loguniform(1e-5, 1e-1),
"lr": tune.loguniform(1e-6, 1e-3),
"patience": tune.choice([1, 2]),
"batch_size": tune.choice([64, 128, 256, 512, 1024])
}
scheduler = ASHAScheduler(
max_t=num_epochs,
grace_period=1,
reduction_factor=2)
reporter = CLIReporter(
parameter_columns=["d_model", "depth", "heads", "mlp_dim", "dim_head", "dropout", "emb_dropout", "weight_decay", "lr", "patience", "batch_size"],
metric_columns=["loss", "auc", "training_iteration"])
trainable = tune.with_parameters(
tune_training,
num_epochs=num_epochs,
num_gpus=num_gpus)
analysis = tune.run(
trainable,
resources_per_trial={
"cpu": 1,
"gpu": gpus_per_trial
},
metric="loss",
mode="min",
config=config,
num_samples=num_samples,
scheduler=scheduler,
progress_reporter=reporter,
verbose = 1,
name="SpokenArabicDigits",
raise_on_failed_trial = False)
print("Best hyperparameters found were: ", analysis.best_config)
%env LOCAL_RANK=0
env: LOCAL_RANK=0
!rm -rf ~/ray_results/SpokenArabicDigits/
!rm -rf ./output/SpokenArabicDigits/ray_results/
!rm -rf ./output/SpokenArabicDigits/experiments_result
!mkdir -p output/SpokenArabicDigits/experiments_result
!cp -rf ~/ray_results/SpokenArabicDigits/ output/SpokenArabicDigits/ray_results/
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)
#analysis = ExperimentAnalysis(f'~/ray_results/{DATASET_NAME}')
analysis = ExperimentAnalysis(product['SpokenArabicDigits_MODEL_TUNE_OUTPUT'])
tune_result_df = analysis.results_df[['loss', 'auc', 'training_iteration', 'experiment_tag']]
tune_result_df.nsmallest(5, 'loss')
2022-09-23 21:07:38,528 INFO experiment_analysis.py:757 -- No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/ray/tune/analysis/experiment_analysis.py:303: UserWarning: Dataframes will use '/' instead of '.' to delimit nested result keys in future versions of Ray. For forward compatibility, set the environment variable TUNE_RESULT_DELIM='/'
  warnings.warn(
| trial_id | loss | auc | training_iteration | experiment_tag |
|---|---|---|---|---|
| 72401_00681 | 0.037081 | 0.993304 | 5 | 681_batch_size=64,d_model=64,depth=8,dim_head=12,dropout=0.0000,emb_dropout=0.0000,heads=4,lr=0.0009,mlp_dim=14,patience=2,weight_decay=0.0001 |
| 72401_00081 | 0.041933 | 0.987351 | 5 | 81_batch_size=64,d_model=64,depth=8,dim_head=16,dropout=0.0001,emb_dropout=0.0000,heads=6,lr=0.0006,mlp_dim=32,patience=2,weight_decay=0.0002 |
| 72401_00633 | 0.052483 | 0.988095 | 5 | 633_batch_size=64,d_model=64,depth=8,dim_head=16,dropout=0.0004,emb_dropout=0.0000,heads=4,lr=0.0006,mlp_dim=14,patience=2,weight_decay=0.0001 |
| 72401_00420 | 0.054474 | 0.983631 | 5 | 420_batch_size=64,d_model=48,depth=4,dim_head=14,dropout=0.0000,emb_dropout=0.0000,heads=6,lr=0.0010,mlp_dim=14,patience=2,weight_decay=0.0011 |
| 72401_00585 | 0.073108 | 0.984375 | 5 | 585_batch_size=64,d_model=48,depth=6,dim_head=10,dropout=0.0000,emb_dropout=0.0002,heads=4,lr=0.0006,mlp_dim=24,patience=1,weight_decay=0.0370 |
best_config = analysis.get_best_config('loss', 'min')
print(best_config)
{'d_model': 64, 'depth': 8, 'heads': 4, 'mlp_dim': 14, 'dim_head': 12, 'dropout': 1.3662397928816542e-06, 'emb_dropout': 3.0197891179318393e-06, 'weight_decay': 7.817488014752812e-05, 'lr': 0.0009217628852992138, 'patience': 2, 'batch_size': 64}
#| export
import json
def write_best_model_config():
analysis = ExperimentAnalysis(product['SpokenArabicDigits_MODEL_TUNE_OUTPUT'])
best_config = analysis.get_best_config('loss', 'min')
with open(product['SpokenArabicDigits_BEST_MODEL_CONFIG'], 'w') as outfile:
# Serializing json
json_object = json.dumps(best_config, indent=4)
outfile.write(json_object)
#| export
write_best_model_config()
2022-09-23 21:07:55,682 INFO experiment_analysis.py:757 -- No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.
#| export
def get_best_model_config():
with open(product['SpokenArabicDigits_BEST_MODEL_CONFIG'], 'r') as json_file:
return json.load(json_file)
best_config = get_best_model_config()
#| export
def training_loop(TB_LOG_DIR, max_epochs = MAX_EPOCHS, config = best_config):
pl.seed_everything(42, workers=True)
model = get_model(config)
dm = get_datamodule(config)
checkpoint_callback = ModelCheckpoint(dirpath=product['SpokenArabicDigits_MODEL_TRAINING_CHECKPOINT_OUTPUT'],
save_top_k = 1, #-1,
filename=f"{DATASET_NAME}" + '-vittsc-mask-{epoch:02d}')
tb_logger = pl_loggers.TensorBoardLogger(TB_LOG_DIR)
lr_monitor = LearningRateMonitor(logging_interval='step')
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.001, patience=3, verbose=False, mode="min")
trainer = pl.Trainer(
#gpus=1,
accelerator='gpu', devices=1,
#track_grad_norm=2,
#plugins='deepspeed',
#stochastic_weight_avg=True,
#precision=16,
max_epochs=max_epochs,
strategy= 'dp', #'ddp',
logger=tb_logger,
callbacks=[lr_monitor, checkpoint_callback, early_stop_callback],
limit_train_batches= math.ceil(get_train_dataset_size()/config['batch_size']),
limit_val_batches= math.ceil(get_valid_dataset_size()/config['batch_size']),
val_check_interval= math.ceil(get_train_dataset_size()/config['batch_size']),
num_sanity_val_steps=0,
reload_dataloaders_every_n_epochs=1,
deterministic=True
)
trainer.fit(model, dm)
%env LOCAL_RANK=0
env: LOCAL_RANK=0
#| export
if __name__ == "__main__":
training_loop(TB_LOG_DIR = product['SpokenArabicDigits_MODEL_TRAINING_OUTPUT'],
max_epochs = MAX_EPOCHS,
config = get_best_model_config())
Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name               | Type        | Params
---------------------------------------------------
0 | patch_to_embedding | Linear      | 896
1 | dropout            | Dropout     | 0
2 | transformer        | Transformer | 115 K
3 | to_cls_token       | Identity    | 0
4 | mlp_head           | Sequential  | 1.2 K
---------------------------------------------------
123 K     Trainable params
0         Non-trainable params
123 K     Total params
0.496     Total estimated model params size (MB)
val_dataloader: local rank : 0 shard count: 1
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/petastorm/fs_utils.py:88: FutureWarning: pyarrow.localfs is deprecated as of 2.0.0, please use pyarrow.fs.LocalFileSystem instead.
(further petastorm/pyarrow FutureWarning deprecation messages omitted)
Training: 0it [00:00, ?it/s]
/home/ubuntu/anaconda3/envs/rapids-22.08_ploomber/lib/python3.8/site-packages/torchmetrics/utilities/prints.py:36: UserWarning: No positive samples in targets, true positive value should be meaningless. Returning zero tensor in true positive score warnings.warn(*args, **kwargs)
Validation: 0it [00:00, ?it/s]
val_dataloader: local rank : 0 shard count: 1
(the Validation progress bar and the val_dataloader reload message repeat identically for each remaining training epoch)
import glob
import shutil
source_file = glob.glob(product['SpokenArabicDigits_MODEL_TRAINING_CHECKPOINT_OUTPUT'] + '/*.ckpt')[0]
print(source_file)
shutil.copyfile(source_file, product['SpokenArabicDigits_BEST_MODEL'])
/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/checkpoint/SpokenArabicDigits-vittsc-mask-epoch=10.ckpt
'/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model.ckpt'
#%load_ext tensorboard
#%tensorboard --logdir experiments_result/SpokenArabicDigits/vittsc_mask --port 8199
We shut down the kernel.
from nbdev import nbdev_export
nbdev_export()
Multi-GPU Training