Source code for gmmvi.experiments.target_distributions.logistic_regression

from pathlib import Path

import tensorflow as tf
import numpy as np
from .lnpdf import LNPDF
from math import pi

from gmmvi.models.gmm_wrapper import GmmWrapper


class LogisticRegression(LNPDF):
    """This class is used for implementing the logistic regression experiments based on the
    BreastCancer and GermanCredit datasets :cite:p:`UCI`, reimplementing the experiments used
    by :cite:t:`Arenz2020`.

    Parameters:
        dataset_id: a string
            Should be either "breast_cancer" or "german_credit"
    """

    def __init__(self, dataset_id):
        super(LogisticRegression, self).__init__(use_log_density_and_grad=False)
        self.const_term = tf.constant(tf.cast(0.5 * tf.math.log(2. * pi), dtype=tf.float32))
        root_path = Path(__file__).parent.resolve()
        if dataset_id == "breast_cancer":
            path = Path.joinpath(root_path, "datasets/breast_cancer.data")
            data = np.loadtxt(str(path))
            X = data[:, 2:]
            # standardize the features and prepend a constant bias feature
            X /= np.std(X, 0)[np.newaxis, :]
            X = np.hstack((np.ones((len(X), 1)), X))
            self.data = tf.cast(X, tf.float32)
            self.labels = data[:, 1]
            self.num_dimensions = self.data.shape[1]
            self._prior_std = tf.constant(10., dtype=tf.float32)
            self.prior_mean = tf.constant(0., dtype=tf.float32)
        elif dataset_id == "german_credit":
            path = Path.joinpath(root_path, "datasets/german.data-numeric")
            data = np.loadtxt(str(path))
            X = data[:, :-1]
            # standardize the features and prepend a constant bias feature
            X /= np.std(X, 0)[np.newaxis, :]
            X = np.hstack((np.ones((len(X), 1)), X))
            self.data = tf.cast(X, tf.float32)
            # map the labels from {1, 2} to {0, 1}
            self.labels = data[:, -1] - 1
            self.num_dimensions = self.data.shape[1]
            self._prior_std = tf.constant(10., dtype=tf.float32)
            self.prior_mean = tf.constant(0., dtype=tf.float32)
        self.labels = tf.Variable(tf.expand_dims(self.labels.astype(np.float32), 1))
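
    # The (unnormalized) target density implemented below is the standard Bayesian
    # logistic-regression posterior,
    #     log p(w | X, y) = sum_i log p(y_i | x_i, w) + log N(w; prior_mean, prior_std^2 * I) + const,
    # with an isotropic Gaussian prior (prior_std = 10) on the weight vector w.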

    def get_num_dimensions(self):
        return self.num_dimensions

    @property
    def prior_std(self):
        return self._prior_std

    def log_likelihood(self, x):
        features = -tf.matmul(self.data, tf.transpose(x))
        # per-datapoint log-likelihoods; the second branch equals
        # log_sigmoid(-features), written via log sigmoid(z) - z for numerical stability
        log_likelihoods = tf.where(self.labels == 1,
                                   tf.transpose(tf.math.log_sigmoid(features)),
                                   tf.transpose(tf.math.log_sigmoid(features) - features))
        return log_likelihoods

    def log_density(self, x):
        features = -tf.matmul(self.data, tf.transpose(x))
        log_likelihoods = tf.reduce_sum(tf.where(self.labels == 1,
                                                 tf.math.log_sigmoid(features),
                                                 tf.math.log_sigmoid(features) - features),
                                        axis=0)
        # log-density of the isotropic Gaussian prior on the weights
        log_prior = tf.reduce_sum(-tf.math.log(self.prior_std) - self.const_term
                                  - 0.5 * tf.math.square((x - self.prior_mean) / self.prior_std),
                                  axis=1)
        log_posterior = log_likelihoods + log_prior
        return log_posterior
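

# A minimal usage sketch (illustration only, not part of the module): x is a batch
# of candidate weight vectors with shape [num_samples, num_dimensions], and
# log_density returns one unnormalized log-posterior value per sample.
#
#     target = LogisticRegression("german_credit")
#     x = tf.random.normal((8, target.get_num_dimensions()))
#     log_post = target.log_density(x)  # shape: [8]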


class LogisticRegression_minibatch(LogisticRegression):
    """This class is used for implementing minibatch-variants of the GermanCredit and BreastCancer
    :py:class:`experiments<gmmvi.experiments.target_distributions.logistic_regression.LogisticRegression>`.

    Parameters:
        dataset_id: str
            Should be either "breast_cancer" or "german_credit"

        batchsize: int
            batchsize for evaluating the likelihood.

        size_test_set: int
            number of training data that should be held out.

        use_own_batch_per_sample: bool
            if True, a different minibatch is used for every sample for which we want to evaluate
            the target log-density, which reduces the variance (local reparameterization).
    """

    def __init__(self, dataset_id, batchsize, size_test_set, use_own_batch_per_sample):
        super(LogisticRegression_minibatch, self).__init__(dataset_id)
        self.data = tf.Variable(self.data)
        if size_test_set > 0:
            # hold out the last size_test_set data points as a test set
            self.data_test = tf.Variable(self.data[-size_test_set:])
            self.labels_test = tf.Variable(self.labels[-size_test_set:])
            self.data = tf.Variable(self.data[:-size_test_set])
            self.labels = self.labels[:-size_test_set]
        self.num_data = tf.shape(self.data)[0]
        self.labels = tf.Variable(self.labels, dtype=tf.float32)
        self.batchsize = tf.Variable(batchsize, dtype=tf.int32)
        self.use_own_batch_per_sample = use_own_batch_per_sample
        self.last_start = tf.Variable(0, dtype=tf.int32)

    def shuffle_data(self):
        # shuffle data points and labels jointly by concatenating them column-wise,
        # permuting the rows, and splitting them apart again
        data, labels = tf.split(
            tf.random.shuffle(tf.concat((self.data, tf.cast(self.labels, tf.float32)), 1)),
            [self.num_dimensions, 1], axis=1)
        self.data.assign(data)
        self.labels.assign(labels)

    def likelihood_batch(self, x, data, labels):
        features = -tf.matmul(data, tf.transpose(x))
        # mean per-datapoint log-likelihood over the given minibatch
        log_likelihoods = tf.reduce_mean(tf.where(labels == 1,
                                                  tf.math.log_sigmoid(features),
                                                  tf.math.log_sigmoid(features) - features),
                                         axis=0)
        return log_likelihoods

    def log_density_fb(self, x):
        """ Evaluate the log-density on the full data set (used for evaluation).

        If size_test_set=0, this function is equivalent to
        :py:meth:`gmmvi.experiments.target_distributions.logistic_regression.LogisticRegression.log_density`.
        """
        return LogisticRegression.log_density(self, x)

    def log_density(self, x):
        self.shuffle_data()
        if self.use_own_batch_per_sample:
            # use a different minibatch for every sample in x (local reparameterization)
            log_likelihoods = tf.TensorArray(tf.float32, size=tf.shape(x)[0])
            start = 0
            for i in tf.range(tf.shape(x)[0]):
                if start + self.batchsize > self.num_data:
                    start = 0
                indices = tf.slice(tf.range(self.num_data), [start], [self.batchsize])
                start = start + self.batchsize
                log_likelihoods = log_likelihoods.write(i, self.likelihood_batch(
                    tf.expand_dims(x[i], axis=0),
                    tf.gather(self.data, indices),
                    tf.gather(self.labels, indices)))
            log_likelihoods = log_likelihoods.concat()
        else:
            indices = tf.slice(tf.range(self.num_data), [0], [self.batchsize])
            log_likelihoods = self.likelihood_batch(x, tf.gather(self.data, indices),
                                                    tf.gather(self.labels, indices))
        log_prior = tf.reduce_sum(-tf.math.log(self.prior_std) - 0.5 * tf.math.log(2. * pi)
                                  - 0.5 * tf.math.square((x - self.prior_mean) / self.prior_std),
                                  axis=1)
        # rescale the mean minibatch log-likelihood by num_data to obtain an
        # unbiased estimate of the full-data log-likelihood
        log_posterior = tf.cast(self.num_data, tf.float32) * log_likelihoods + log_prior
        return log_posterior

    def expensive_metrics(self, model: GmmWrapper, samples: tf.Tensor) -> dict:
        """ As target-distribution specific metric, we estimate the full-batch ELBO.

        Parameters:
            model: :py:class:`GmmWrapper<gmmvi.models.gmm_wrapper.GmmWrapper>`
                The learned model that we want to evaluate for this target distribution.

            samples: tf.Tensor
                Samples that have been drawn from the model and that are used for estimating the
                full-batch ELBO.

        Returns:
            dict: a dictionary with a single item containing the full-batch ELBO.
        """
        expensive_metrics = dict()
        entropy = -tf.reduce_mean(model.log_density(samples))
        mean_reward = tf.reduce_mean(self.log_density_fb(samples))
        elbo_fb = mean_reward + entropy
        expensive_metrics.update({"elbo_fb": elbo_fb})
        return expensive_metrics
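

# The full-batch ELBO computed above is the standard Monte-Carlo estimate
#     ELBO ~ mean_s [ log p(x_s) - log q(x_s) ],  with x_s drawn from q,
# where log p is the (unnormalized) full-data log-posterior and q is the learned
# mixture model. A sketch of how it might be queried, given a learned GmmWrapper
# `model` (illustration only; parameter values are arbitrary):
#
#     target = LogisticRegression_minibatch("breast_cancer", batchsize=64,
#                                           size_test_set=100,
#                                           use_own_batch_per_sample=True)
#     samples = model.sample(100)  # assumes the wrapper exposes a sampling method
#     metrics = target.expensive_metrics(model, samples)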


def make_breast_cancer():
    return LogisticRegression("breast_cancer")


def make_german_credit():
    return LogisticRegression("german_credit")


def make_breast_cancer_mb(batch_size, size_test_set, use_own_batch_per_sample):
    return LogisticRegression_minibatch("breast_cancer", batch_size, size_test_set,
                                        use_own_batch_per_sample)


def make_german_credit_mb(batch_size, size_test_set, use_own_batch_per_sample):
    return LogisticRegression_minibatch("german_credit", batch_size, size_test_set,
                                        use_own_batch_per_sample)
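

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the published API). It assumes the
    # dataset file referenced above is available under datasets/ next to this module.
    target = make_breast_cancer()
    x = tf.random.normal((4, target.get_num_dimensions()))
    print(target.log_density(x))  # one unnormalized log-posterior value per sample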