#| default_exp model.optimization.nn.tsc.vittsc.spoken_arabic_digits_evaluation_mask
%load_ext autoreload
%autoreload 2
# declare a list of tasks whose products you want to use as inputs
upstream = ['tabular_to_timeseries_spoken_arabic_digits', 'model_training_spoken_arabic_digits']
# Parameters
upstream = {"tabular_to_timeseries_spoken_arabic_digits": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/304_feature_preprocessing.spoken_arabic_digits.tabular_to_timeseries.html", "SpokenArabicDigits_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/train", "SpokenArabicDigits_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/valid", "SpokenArabicDigits_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/test"}, "model_training_spoken_arabic_digits": {"nb": "/home/ubuntu/vitmtsc_nbdev/output/404_model.optimization.nn.tsc.vittsc.spoken_arabic_digits_training_mask_tune.html", "SpokenArabicDigits_MODEL_TUNE_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ray_results", "SpokenArabicDigits_MODEL_TRAINING_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result", "SpokenArabicDigits_MODEL_TRAINING_CHECKPOINT_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/checkpoint", "SpokenArabicDigits_BEST_MODEL": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model.ckpt", "SpokenArabicDigits_BEST_MODEL_CONFIG": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model_config.json"}}
product = {"nb": "/home/ubuntu/vitmtsc_nbdev/output/504_model.optimization.nn.tsc.vittsc.spoken_arabic_digits_evaluation_mask.html", "SpokenArabicDigits_MODEL_VALID_EVAL_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/evaluation/valid", "SpokenArabicDigits_MODEL_TEST_EVAL_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/evaluation/test"}
#| hide
from nbdev.showdoc import *
#| export
import sys
import pathlib as p
def is_running_from_ipython():
    from IPython import get_ipython
    return get_ipython() is not None

if not is_running_from_ipython() and __package__ is None:
    # when executed as a plain script, make the parent directory importable
    DIR = p.Path(__file__).resolve().parent
    sys.path.insert(0, str(DIR.parent))
    __package__ = DIR.name
#| export
from vitmtsc.model.optimization.nn.tsc.vittsc.spoken_arabic_digits_training_mask_tune import *
class_weight: [0.51958661 0.48699262 0.51452242 0.48342491 0.491527 0.50372137 0.49061338 0.50662188 0.51552734 0.49061338]
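The class_weight array above is printed as a side effect of the star import: the training module computes per-class weights for the ten digit classes when it is loaded, and they are roughly equal, so the classes are close to balanced.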
#| export
upstream = {
"model_training_spoken_arabic_digits": {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/404_model.optimization.nn.tsc.vittsc.spoken_arabic_digits_training_mask_tune.html",
"SpokenArabicDigits_MODEL_TUNE_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/ray_results",
"SpokenArabicDigits_MODEL_TRAINING_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result",
"SpokenArabicDigits_MODEL_TRAINING_CHECKPOINT_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/checkpoint",
"SpokenArabicDigits_BEST_MODEL": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model.ckpt",
"SpokenArabicDigits_BEST_MODEL_CONFIG": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model_config.json",
},
"tabular_to_timeseries_spoken_arabic_digits": {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/304_feature_preprocessing.spoken_arabic_digits.tabular_to_timeseries.html",
"SpokenArabicDigits_TRAIN_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/train",
"SpokenArabicDigits_VALID_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/valid",
"SpokenArabicDigits_TEST_MODEL_INPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/test",
},
}
product = {
"nb": "/home/ubuntu/vitmtsc_nbdev/output/504_model.optimization.nn.tsc.vittsc.spoken_arabic_digits_evaluation_mask.html",
"SpokenArabicDigits_MODEL_VALID_EVAL_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/evaluation/valid",
"SpokenArabicDigits_MODEL_TEST_EVAL_OUTPUT": "/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/evaluation/test",
}
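The upstream and product paths are declared again in an exported cell (mirroring the injected parameters above) so that the generated Python module carries the same input and output locations when it is imported outside this notebook.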
#| export
import json
def get_best_model_config():
    # read the hyperparameter config of the best model found by the tuning run
    with open(upstream['model_training_spoken_arabic_digits']['SpokenArabicDigits_BEST_MODEL_CONFIG'], 'r') as json_file:
        return json.load(json_file)
#| export
import pandas as pd
import os
import torch
import math
import glob
import pytorch_lightning as pl
from torch.nn import functional as F
import matplotlib.pyplot as plt
import scikitplot as skplt
from pytorch_lightning import LightningModule
from pytorch_lightning import Trainer
from petastorm import make_batch_reader
from petastorm.pytorch import DataLoader
Load Model
Model Evaluation: evaluate the model on the validation and test datasets using PR-AUC
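The evaluation runs in two stages: first the best checkpoint from the tuning run is loaded and per-example class probabilities are written to CSV for the validation and test splits; the CSVs are then read back to plot precision-recall and ROC curves and to compute macro and weighted F1 scores.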
#| export
DATASET_NAME = 'SpokenArabicDigits'
VALID_DATA_DIR = f"file://{upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_VALID_MODEL_INPUT']}"
TEST_DATA_DIR = f"file://{upstream['tabular_to_timeseries_spoken_arabic_digits']['SpokenArabicDigits_TEST_MODEL_INPUT']}"
VALID_EVAL_OUTPUT_DIR = product['SpokenArabicDigits_MODEL_VALID_EVAL_OUTPUT']
TEST_EVAL_OUTPUT_DIR = product['SpokenArabicDigits_MODEL_TEST_EVAL_OUTPUT']
BEST_MODEL_CHECKPOINT = upstream['model_training_spoken_arabic_digits']['SpokenArabicDigits_BEST_MODEL']
NUM_WORKERS = 1
SHARD_COUNT = 1
BATCH_SIZE = 64
TOTAL_VALID_BATCHES = math.ceil(get_valid_dataset_size()/BATCH_SIZE)
TOTAL_TEST_BATCHES = math.ceil(get_test_dataset_size()/BATCH_SIZE)
BEST_MODEL_CHECKPOINT, TOTAL_VALID_BATCHES, TOTAL_TEST_BATCHES, VALID_DATA_DIR, TEST_DATA_DIR, VALID_EVAL_OUTPUT_DIR, TEST_EVAL_OUTPUT_DIR
('/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/best_model.ckpt', 21, 35, 'file:///home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/valid', 'file:///home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/target_encoding-nn/test', '/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/evaluation/valid', '/home/ubuntu/vitmtsc_nbdev/output/SpokenArabicDigits/experiments_result/evaluation/test')
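The batch counts are consistent with the split sizes seen below: ceil(1320 / 64) = 21 validation batches and ceil(2199 / 64) = 35 test batches.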
!mkdir -p $VALID_EVAL_OUTPUT_DIR
!mkdir -p $TEST_EVAL_OUTPUT_DIR
#| export
class VitMTSCClassificationPredictionTask(LightningModule):
    def __init__(self,
                 model,
                 output_pred_dir,
                 input_data_dir,
                 batch_size=BATCH_SIZE,
                 num_workers=NUM_WORKERS,
                 shard_count=SHARD_COUNT):
        super().__init__()
        pl.seed_everything(42, workers=True)
        self.model = model
        self.case_id = []
        # one list of predicted probabilities per digit class (0-9)
        self.probabilities = [[] for _ in range(10)]
        self.prediction = []
        self.target = []
        self.output_pred_dir = output_pred_dir
        self.input_data_dir = input_data_dir
        self.prediction_files = input_data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.shard_count = shard_count

    def test_step(self, batch, batch_idx):
        x, y, case_id_1, mask = batch
        y_hat = self.model(x, mask)
        # softmax once per batch, then collect the per-class probabilities
        probs = F.softmax(y_hat, dim=1).to('cpu').numpy()
        self.case_id.extend(case_id_1.to('cpu').numpy())
        for cls in range(10):
            self.probabilities[cls].extend(probs[:, cls])
        self.prediction.extend(torch.max(y_hat.data, 1)[1].to('cpu').numpy())
        self.target.extend(y.to('cpu').numpy())

    def test_dataloader(self):
        print('test_dataloader: local rank :', int(os.environ['LOCAL_RANK']), 'shard count: ', self.shard_count)
        self.test_ds = make_batch_reader(self.prediction_files, workers_count=self.num_workers,
                                         cur_shard=int(os.environ['LOCAL_RANK']),
                                         shard_count=self.shard_count, num_epochs=2)
        return DataLoader(self.test_ds, batch_size=self.batch_size, collate_fn=petastorm_collate_fn)

    def test_epoch_end(self, outputs):
        print('Consolidating predictions on GPU:', os.environ['LOCAL_RANK'])
        columns = {'case_id': self.case_id}
        columns.update({f'probability_{cls}': self.probabilities[cls] for cls in range(10)})
        columns.update({'prediction': self.prediction, 'target': self.target})
        df_text_predictions = pd.DataFrame(columns)
        print('Writing predictions on GPU:', os.environ['LOCAL_RANK'])
        df_text_predictions.to_csv(self.output_pred_dir + "/" + os.environ['LOCAL_RANK'] + '_predictions.csv', index=False)
        print('Finished Writing predictions on GPU:', os.environ['LOCAL_RANK'])
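Each rank writes its own CSV, named <LOCAL_RANK>_predictions.csv, into the output directory; the evaluation cells below glob these files back into a single DataFrame.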
#| export
def get_model_for_prediction(BEST_MODEL_CHECKPOINT, config, output_pred_dir, input_data_dir, shard_count=SHARD_COUNT):
    # load the best model
    pl.seed_everything(42, workers=True)
    model = VitTimeSeriesTransformer.load_from_checkpoint(BEST_MODEL_CHECKPOINT, config=config)
    model.eval()
    return VitMTSCClassificationPredictionTask(model=model, shard_count=shard_count,
                                               output_pred_dir=output_pred_dir, input_data_dir=input_data_dir)
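load_from_checkpoint rebuilds the VitTimeSeriesTransformer from the tuned config and restores its trained weights, and model.eval() switches the network to inference mode (disabling dropout and using running statistics in any normalization layers) before predictions are generated.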
#| export
def write_prediction_for_valid_dataset(BEST_MODEL_CHECKPOINT,
                                       config,
                                       shard_count,
                                       output_pred_dir=VALID_EVAL_OUTPUT_DIR,
                                       input_data_dir=VALID_DATA_DIR):
    pl.seed_everything(42, workers=True)
    model = get_model_for_prediction(BEST_MODEL_CHECKPOINT=BEST_MODEL_CHECKPOINT,
                                     config=config,
                                     shard_count=shard_count,
                                     output_pred_dir=output_pred_dir,
                                     input_data_dir=input_data_dir)
    trainer = Trainer(gpus=[0],
                      accelerator='dp',
                      progress_bar_refresh_rate=1,
                      limit_test_batches=TOTAL_VALID_BATCHES)
    trainer.test(model)

def write_prediction_for_test_dataset(BEST_MODEL_CHECKPOINT,
                                      config,
                                      shard_count,
                                      output_pred_dir=TEST_EVAL_OUTPUT_DIR,
                                      input_data_dir=TEST_DATA_DIR):
    pl.seed_everything(42, workers=True)
    model = get_model_for_prediction(BEST_MODEL_CHECKPOINT=BEST_MODEL_CHECKPOINT,
                                     config=config,
                                     shard_count=shard_count,
                                     output_pred_dir=output_pred_dir,
                                     input_data_dir=input_data_dir)
    trainer = Trainer(gpus=[0],
                      accelerator='dp',
                      progress_bar_refresh_rate=1,
                      limit_test_batches=TOTAL_TEST_BATCHES)
    trainer.test(model)
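Note that the petastorm reader is opened with num_epochs=2 while the Trainer caps the run at limit_test_batches, so the test loop is not starved of batches before it has seen one full pass over the split; any rows that end up being read twice are removed downstream with drop_duplicates.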
%env LOCAL_RANK=0
env: LOCAL_RANK=0
#| export
if __name__ == "__main__":
    print('Processing valid dataset...\n')
    write_prediction_for_valid_dataset(BEST_MODEL_CHECKPOINT=BEST_MODEL_CHECKPOINT,
                                       config=get_best_model_config(),
                                       shard_count=SHARD_COUNT)
    print('Finished Processing valid dataset!!!\n')
    print('Processing test dataset...\n')
    write_prediction_for_test_dataset(BEST_MODEL_CHECKPOINT=BEST_MODEL_CHECKPOINT,
                                      config=get_best_model_config(),
                                      shard_count=SHARD_COUNT)
    print('Finished Processing test dataset!!!\n')
Processing valid dataset...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
test_dataloader: local rank : 0 shard count: 1
Testing: 0it [00:00, ?it/s]
Consolidating predictions on GPU: 0
Writing predictions on GPU: 0
Finished Writing predictions on GPU: 0
Finished Processing valid dataset!!!
Processing test dataset...
test_dataloader: local rank : 0 shard count: 1
Testing: 0it [00:00, ?it/s]
Consolidating predictions on GPU: 0
Writing predictions on GPU: 0
Finished Writing predictions on GPU: 0
Finished Processing test dataset!!!
import scikitplot as skplt
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
valid_gdf = pd.concat(map(pd.read_csv, glob.glob(f'{VALID_EVAL_OUTPUT_DIR}/*.csv')))
valid_gdf['target'] = valid_gdf['target'].astype('int64')
valid_gdf['case_id'] = valid_gdf['case_id'].astype('int64')
valid_gdf = valid_gdf.drop_duplicates()
valid_gdf
 | case_id | probability_0 | probability_1 | probability_2 | probability_3 | probability_4 | probability_5 | probability_6 | probability_7 | probability_8 | probability_9 | prediction | target
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 6086 | 0.000015 | 0.000102 | 2.763680e-05 | 8.355258e-07 | 1.157322e-03 | 2.742739e-04 | 7.967497e-05 | 0.001281 | 6.982987e-06 | 0.997054 | 9 | 9 |
1 | 3789 | 0.000931 | 0.000002 | 1.004550e-08 | 7.976095e-07 | 4.281655e-05 | 9.984261e-01 | 1.401553e-06 | 0.000509 | 3.157551e-06 | 0.000083 | 5 | 5 |
2 | 1611 | 0.000500 | 0.000036 | 9.992444e-01 | 5.950678e-05 | 3.001507e-08 | 1.693952e-08 | 7.108908e-06 | 0.000009 | 1.172593e-04 | 0.000026 | 2 | 2 |
3 | 3820 | 0.000696 | 0.000001 | 3.384612e-09 | 2.461169e-07 | 3.237710e-06 | 9.990751e-01 | 2.877846e-05 | 0.000048 | 5.729424e-07 | 0.000146 | 5 | 5 |
4 | 5223 | 0.000265 | 0.000010 | 3.995616e-05 | 4.252254e-06 | 8.052929e-05 | 9.899642e-05 | 1.093062e-05 | 0.999210 | 3.055935e-05 | 0.000249 | 7 | 7 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1315 | 3887 | 0.000671 | 0.000013 | 1.678639e-07 | 2.144096e-06 | 1.022787e-03 | 9.502772e-01 | 1.535875e-05 | 0.044675 | 2.264061e-05 | 0.003300 | 5 | 5 |
1316 | 6453 | 0.000004 | 0.000014 | 7.670404e-06 | 5.575985e-08 | 6.096742e-05 | 2.890944e-04 | 3.945854e-05 | 0.000119 | 6.948956e-06 | 0.999458 | 9 | 9 |
1317 | 1446 | 0.005195 | 0.000819 | 9.871879e-01 | 3.339070e-05 | 4.102937e-07 | 2.758672e-07 | 1.022102e-03 | 0.000027 | 3.300941e-06 | 0.005711 | 2 | 2 |
1318 | 414 | 0.998634 | 0.000006 | 3.500827e-04 | 1.455810e-04 | 1.505734e-07 | 5.569475e-05 | 1.729968e-04 | 0.000629 | 1.468110e-06 | 0.000005 | 0 | 0 |
1319 | 3245 | 0.000002 | 0.000794 | 1.282249e-06 | 1.069307e-04 | 9.982917e-01 | 1.648027e-04 | 6.428783e-08 | 0.000502 | 2.765111e-05 | 0.000109 | 4 | 4 |
1320 rows × 13 columns
valid_gdf[valid_gdf.prediction == valid_gdf.target].count()
case_id 1307 probability_0 1307 probability_1 1307 probability_2 1307 probability_3 1307 probability_4 1307 probability_5 1307 probability_6 1307 probability_7 1307 probability_8 1307 probability_9 1307 prediction 1307 target 1307 dtype: int64
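That is 1307 of the 1320 validation examples classified correctly, roughly 99.0% accuracy. The same figure can be computed directly (a quick check, not part of the exported module):

# fraction of validation rows where the predicted class matches the target
(valid_gdf.prediction == valid_gdf.target).mean()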
valid_gdf['target'].min(), valid_gdf['prediction'].min(), valid_gdf['target'].max(), valid_gdf['prediction'].max()
(0, 0, 9, 9)
skplt.metrics.plot_precision_recall(valid_gdf['target'].to_numpy(),
valid_gdf[['probability_0', 'probability_1', 'probability_2', 'probability_3', 'probability_4',
'probability_5', 'probability_6', 'probability_7', 'probability_8', 'probability_9']].to_numpy(),
cmap='nipy_spectral')
plt.show()
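The heading above mentions PR-AUC; scikit-plot draws the per-class precision-recall curves, and a single macro-averaged PR-AUC (average precision) number can be obtained with scikit-learn. A minimal sketch, not part of the original pipeline:

from sklearn.metrics import average_precision_score
from sklearn.preprocessing import label_binarize

prob_cols = [f'probability_{i}' for i in range(10)]
# binarize the integer targets into an (n_samples, 10) indicator matrix
y_true = label_binarize(valid_gdf['target'].to_numpy(), classes=list(range(10)))
y_score = valid_gdf[prob_cols].to_numpy()
# macro-averaged average precision over the 10 digit classes
average_precision_score(y_true, y_score, average='macro')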
skplt.metrics.plot_roc(valid_gdf['target'].to_numpy(),
valid_gdf[['probability_0', 'probability_1', 'probability_2', 'probability_3', 'probability_4',
'probability_5', 'probability_6', 'probability_7', 'probability_8', 'probability_9']].to_numpy(),
cmap='nipy_spectral')
plt.show()
f1_score(valid_gdf['target'], valid_gdf['prediction'], average='macro')
0.9902344679385061
f1_score(valid_gdf['target'], valid_gdf['prediction'], average='weighted')
0.9901544244947846
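Macro F1 averages the per-class F1 scores with equal weight, while weighted F1 weights each class by its support; the two are nearly identical here because the ten digit classes are close to balanced, as the class weights printed above suggest.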
test_gdf = pd.concat(map(pd.read_csv, glob.glob(f'{TEST_EVAL_OUTPUT_DIR}/*.csv')))
test_gdf['target'] = test_gdf['target'].astype('int64')
test_gdf['case_id'] = test_gdf['case_id'].astype('int64')
test_gdf = test_gdf.drop_duplicates()
test_gdf
 | case_id | probability_0 | probability_1 | probability_2 | probability_3 | probability_4 | probability_5 | probability_6 | probability_7 | probability_8 | probability_9 | prediction | target
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1872 | 0.000026 | 1.304874e-03 | 9.491577e-05 | 2.289573e-04 | 0.000503 | 0.000194 | 1.731295e-08 | 0.001042 | 9.965348e-01 | 0.000071 | 8 | 8 |
1 | 2009 | 0.001168 | 1.589717e-05 | 2.127727e-05 | 2.592365e-07 | 0.000005 | 0.001665 | 3.412918e-03 | 0.000210 | 7.967621e-07 | 0.993501 | 9 | 9 |
2 | 1789 | 0.000021 | 6.273520e-04 | 2.053423e-04 | 5.828706e-04 | 0.000021 | 0.000011 | 1.872509e-08 | 0.000074 | 9.984506e-01 | 0.000006 | 8 | 8 |
3 | 1361 | 0.000807 | 7.936506e-08 | 2.452141e-07 | 7.586585e-06 | 0.000002 | 0.000297 | 9.986936e-01 | 0.000020 | 3.663079e-09 | 0.000173 | 6 | 6 |
4 | 1008 | 0.000053 | 6.843553e-04 | 4.506264e-06 | 4.896078e-04 | 0.987416 | 0.000693 | 1.174814e-06 | 0.010442 | 1.520625e-05 | 0.000201 | 4 | 4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2194 | 1864 | 0.000008 | 1.710077e-04 | 1.101227e-04 | 5.423648e-04 | 0.000028 | 0.000007 | 8.676777e-09 | 0.000068 | 9.990626e-01 | 0.000003 | 8 | 8 |
2195 | 937 | 0.000091 | 1.386862e-03 | 1.984476e-05 | 1.861359e-03 | 0.976293 | 0.000283 | 5.518412e-06 | 0.019924 | 2.570462e-05 | 0.000110 | 4 | 4 |
2196 | 1763 | 0.000013 | 2.897645e-04 | 1.291768e-04 | 4.223139e-04 | 0.000034 | 0.000010 | 1.255020e-08 | 0.000092 | 9.990032e-01 | 0.000006 | 8 | 8 |
2197 | 886 | 0.000011 | 1.302320e-03 | 1.094393e-05 | 4.114340e-03 | 0.994058 | 0.000052 | 5.552887e-07 | 0.000293 | 1.196289e-04 | 0.000038 | 4 | 4 |
2198 | 1152 | 0.010388 | 4.270554e-07 | 4.940290e-09 | 1.718983e-06 | 0.000002 | 0.989509 | 1.566761e-05 | 0.000078 | 4.095562e-07 | 0.000005 | 5 | 5 |
2199 rows × 13 columns
test_gdf[test_gdf.prediction == test_gdf.target].count()
case_id 2133 probability_0 2133 probability_1 2133 probability_2 2133 probability_3 2133 probability_4 2133 probability_5 2133 probability_6 2133 probability_7 2133 probability_8 2133 probability_9 2133 prediction 2133 target 2133 dtype: int64
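That is 2133 of the 2199 test examples classified correctly, about 97.0% accuracy.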
test_gdf['target'].min(), test_gdf['prediction'].min(), test_gdf['target'].max(), test_gdf['prediction'].max()
(0, 0, 9, 9)
skplt.metrics.plot_precision_recall(test_gdf['target'].to_numpy(),
test_gdf[['probability_0', 'probability_1', 'probability_2', 'probability_3', 'probability_4',
'probability_5', 'probability_6', 'probability_7', 'probability_8', 'probability_9']].to_numpy(),
cmap='nipy_spectral')
plt.show()
skplt.metrics.plot_roc(test_gdf['target'].to_numpy(),
test_gdf[['probability_0', 'probability_1', 'probability_2', 'probability_3', 'probability_4',
'probability_5', 'probability_6', 'probability_7', 'probability_8', 'probability_9']].to_numpy(),
cmap='nipy_spectral')
plt.show()
f1_score(test_gdf['target'], test_gdf['prediction'], average='macro')
0.9699782441824517
f1_score(test_gdf['target'], test_gdf['prediction'], average='weighted')
0.9699836652143021
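So the model reaches F1 ≈ 0.970 (macro and weighted) on the held-out test split, slightly below the ≈ 0.990 obtained on the validation split.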
We shut down the kernel.
from nbdev import nbdev_export
nbdev_export()