from typing import Tuple
import tensorflow as tf
import tensorflow_probability as tfp
from gmmvi.optimization.least_squares import QuadFunc
from gmmvi.models.gmm_wrapper import GmmWrapper
class NgEstimator:
""" This class provides a common interface for estimating the natural gradient for a Gaussian component.
There are currently two options for estimating the natural gradient:
1. The :py:class:`MoreNgEstimator` uses compatible function approximation to estimate the natural gradient from a
quadratic reward surrogate :cite:p:`Pajarinen2019,Abdolmaleki2015,Peters2008,Sutton1999`.
2. The :py:class:`SteinNgEstimator` uses Stein's Lemma to estimate the natural gradient using first-order
information :cite:p:`Lin2019Stein`.
Parameters:
temperature: float
Usually temperature=1.; can be used to scale the importance of maximizing the model entropy.
model: :py:class:`GmmWrapper<gmmvi.models.gmm_wrapper.GmmWrapper>`
The wrapped model where we want to update the components.
requires_gradient: bool
Does this object require first-order information?
only_use_own_samples: bool
If true, we do not use importance sampling to update one component based on samples from a different component.
use_self_normalized_importance_weights: bool
If true, use self-normalized importance weighting (normalizing the importance weights such that they sum to one)
rather than standard importance weighting.
"""
def __init__(self, temperature, model: GmmWrapper, requires_gradient: bool,
only_use_own_samples: bool, use_self_normalized_importance_weights: bool):
self._model = model
self._temperature = temperature
self._requires_gradients = requires_gradient
self._only_use_own_samples = only_use_own_samples
self._use_self_normalized_importance_weights = use_self_normalized_importance_weights
@staticmethod
def build_from_config(config, temperature, gmm_wrapper):
"""This static method provides a convenient way to create a
:py:class:`MoreNgEstimator`, or :py:class:`SteinNgEstimator` depending on the provided config.
Parameters:
temperature: float
Usually temperature=1.; can be used to scale the importance of maximizing the model entropy.
config: dict
The dictionary is typically read from a YAML file and holds all hyperparameters.
"""
if config["ng_estimator_type"] == "Stein":
return SteinNgEstimator(temperature=temperature, model=gmm_wrapper, **config['ng_estimator_config'])
elif config["ng_estimator_type"] == "MORE":
return MoreNgEstimator(temperature=temperature, model=gmm_wrapper, **config['ng_estimator_config'])
else:
raise ValueError(f"config['ng_estimator_type'] is '{config['ng_estimator_type']}' "
f"which is an unknown type")
@property
def requires_gradients(self) -> bool:
return self._requires_gradients
def get_expected_hessian_and_grad(self, samples: tf.Tensor, mapping: tf.Tensor, background_densities: tf.Tensor,
target_lnpdfs: tf.Tensor, target_lnpdfs_grads: tf.Tensor):
""" Perform the natural gradient estimation, needs to be implemented by the deriving class.
Parameters:
samples: tf.Tensor
a tensor of shape num_samples x num_dimensions containing the samples used for the approximation
mapping: tf.Tensor
a one-dimensional tensor of integers, storing for every sample from which component it was sampled.
background_densities: tf.Tensor
the log probability density of the background distribution (which was used for sampling the provided
samples). A one-dimensional tensor of size num_samples.
target_lnpdfs: tf.Tensor
The rewards are given by the log-densities of the target-distribution,
:math:`\\log p(\\mathbf{x})`.
target_lnpdfs_grads: tf.Tensor
The gradients of the target_lnpdfs with respect to the samples,
:math:`\\nabla_{\\mathbf{x}}\\log p(\\mathbf{x})`.
Returns:
tuple(tf.Tensor, tf.Tensor):
**expected_hessian_neg** - A tensor of shape num_components x num_dimensions x num_dimensions containing
for each component an estimate of the (negated) expected Hessian
:math:`-\\mathbb{E}_{q(\\mathbf{x}|o)}\\left[ \\nabla_{\\mathbf{x}\\mathbf{x}} \\log \\frac{p(\\mathbf{x})}{q(\\mathbf{x})}\\right]`
**expected_gradient_neg** - A tensor of shape num_components x num_dimensions containing
for each component an estimate of the (negated) expected gradient
:math:`-\\mathbb{E}_{q(\\mathbf{x}|o)}\\left[ \\nabla_{\\mathbf{x}} \\log \\frac{p(\\mathbf{x})}{q(\\mathbf{x})}\\right]`
"""
raise NotImplementedError
def get_rewards_for_comp(self, index: int, samples: tf.Tensor, mapping: tf.Tensor, component_log_densities,
log_ratios: tf.Tensor, log_ratio_grads: tf.Tensor, background_densities: tf.Tensor) \
-> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
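# Selects the data used to update component `index`: if only_use_own_samples is set, only the
# samples drawn from that component are kept and its own log-density doubles as the background
# density; otherwise all samples are returned and reused via importance weighting.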
if self._only_use_own_samples:
own_sample_indices = tf.reshape(tf.where(mapping == index), [-1])
my_samples = tf.gather(samples, own_sample_indices)
my_rewards = tf.gather(log_ratios, own_sample_indices)
my_reward_grads = tf.gather(log_ratio_grads, own_sample_indices)
# my_background_densities = tf.gather(background_densities, own_sample_indices)
my_background_densities = tf.gather(component_log_densities[index], own_sample_indices)
my_component_log_densities = tf.gather(component_log_densities[index], own_sample_indices)
return my_samples, my_rewards, my_reward_grads, my_background_densities, my_component_log_densities
else:
return samples, log_ratios, log_ratio_grads, background_densities, component_log_densities[index]
class SteinNgEstimator(NgEstimator):
""" Use Stein's Lemma to estimate the natural gradient using first-order information.
See :cite:p:`Lin2019Stein`.
Parameters:
temperature: float
Usually temperature=1.; can be used to scale the importance of maximizing the model entropy.
model: :py:class:`GmmWrapper<gmmvi.models.gmm_wrapper.GmmWrapper>`
The wrapped model where we want to update the components.
only_use_own_samples: bool
If true, we do not use importance sampling to update one component based on samples from a different component.
use_self_normalized_importance_weights: bool
If true, use self-normalized importance weighting (normalizing the importance weights such that they sum to one)
rather than standard importance weighting.
"""
def __init__(self, temperature, model, only_use_own_samples: bool, use_self_normalized_importance_weights: bool):
super(SteinNgEstimator, self).__init__(temperature, model, True, only_use_own_samples,
use_self_normalized_importance_weights)
@staticmethod
def _stable_expectation(log_weights, log_values):
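# Computes (1/n) * sum_i exp(log_weights_i) * values_i in a numerically stable way: the products
# are evaluated in log-space with reduce_weighted_logsumexp, tracking the signs of `log_values`
# separately (despite its name, `log_values` holds plain values such as gradients).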
n = tf.cast(tf.shape(log_weights)[0], tf.float32)
lswe, signs = tfp.math.reduce_weighted_logsumexp(
tf.expand_dims(log_weights, 1) + tf.math.log(tf.math.abs(log_values)),
w=tf.math.sign(log_values), axis=0, return_sign=True)
return 1 / n * signs * tf.exp(lswe)
def _get_expected_gradient_and_hessian_standard_iw(self, chol_cov, mean, component_log_densities, samples,
background_mixture_densities, log_ratio_grads):
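# Standard (unnormalized) importance weighting with weights q_o(x_i)/q_bg(x_i). Following Stein's
# Lemma, the expected Hessian of the log-ratio is estimated from the importance-weighted outer
# products of Sigma^{-1}(x_i - mu) and the log-ratio gradients, so only first-order information
# is required.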
log_importance_weights = component_log_densities - background_mixture_densities
expected_gradient = self._stable_expectation(log_importance_weights, log_ratio_grads)
if self._model.diagonal_covs:
prec_times_diff = tf.expand_dims(1 / (chol_cov ** 2), 1) \
* tf.transpose(samples - mean)
prec_times_diff_times_grad = tf.transpose(prec_times_diff) * log_ratio_grads
else:
prec_times_diff = tf.linalg.cholesky_solve(chol_cov, tf.transpose(samples - mean))
prec_times_diff_times_grad = \
tf.expand_dims(tf.transpose(prec_times_diff), 1) * tf.expand_dims(log_ratio_grads, -1)
log_importance_weights = tf.expand_dims(log_importance_weights, 1)
expected_hessian = self._stable_expectation(log_importance_weights, prec_times_diff_times_grad)
return expected_gradient, expected_hessian
def _get_expected_gradient_and_hessian_self_normalized_iw(self, chol_cov, mean, component_log_densities, samples,
background_mixture_densities, log_ratio_grads):
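# Self-normalized variant: the importance weights are normalized to sum to one before averaging,
# and the resulting Hessian estimate is symmetrized.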
log_weights = component_log_densities - background_mixture_densities
log_weights -= tf.reduce_logsumexp(log_weights, axis=0, keepdims=True)
weights = tf.exp(log_weights)
importance_weights = weights / tf.reduce_sum(weights, axis=0, keepdims=True)
weighted_gradients = tf.expand_dims(importance_weights, 1) * log_ratio_grads
if self._model.diagonal_covs:
prec_times_diff = tf.expand_dims(1 / (chol_cov ** 2), 1) \
* tf.transpose(samples - mean)
expected_hessian = tf.reduce_sum(tf.transpose(prec_times_diff) * weighted_gradients, 0)
else:
prec_times_diff = tf.linalg.cholesky_solve(chol_cov, tf.transpose(samples - mean))
expected_hessian = tf.reduce_sum(
tf.expand_dims(tf.transpose(prec_times_diff), 1) * tf.expand_dims(weighted_gradients, -1), 0)
expected_hessian = 0.5 * (expected_hessian + tf.transpose(expected_hessian))
expected_gradient = tf.reduce_sum(weighted_gradients, 0)
return expected_gradient, expected_hessian
def _get_expected_gradient_and_hessian_for_comp(self, i, my_component_log_densities, my_samples,
my_background_densities, my_log_ratios_grad):
if self._use_self_normalized_importance_weights:
expected_gradient, expected_hessian = \
self._get_expected_gradient_and_hessian_self_normalized_iw(
self._model.chol_cov[i], self._model.means[i], my_component_log_densities, my_samples,
my_background_densities, my_log_ratios_grad)
else:
expected_gradient, expected_hessian = \
self._get_expected_gradient_and_hessian_standard_iw(
self._model.chol_cov[i], self._model.means[i], my_component_log_densities, my_samples,
my_background_densities, my_log_ratios_grad)
return expected_gradient, expected_hessian
def get_expected_hessian_and_grad(self, samples: tf.Tensor, mapping: tf.Tensor, background_densities: tf.Tensor,
target_lnpdfs: tf.Tensor, target_lnpdfs_grads: tf.Tensor):
""" Estimates the natural gradient using Stein's Lemma :cite:p:`Lin2019Stein`. The expected gradient is
a simple importance-weighted Monte-Carlo estimate based on the provided *target_lnpdfs_grads* and the gradients
of the component log-densities. The expected Hessians are estimated as
:math:`-\\mathbb{E}_{q(\\mathbf{x}|o)}\\left[ \\nabla_{\\mathbf{x}\\mathbf{x}} \\log \\frac{p(\\mathbf{x})}{q(\\mathbf{x})}\\right] \\approx - \\sum_{\\mathbf{x}_i} w_i \\Sigma^{-1} (\\mathbf{x}_i - \\mu) g_{\\mathbf{x}_i}^T`,
where :math:`g_{\\mathbf{x}_i} = \\nabla_{\\mathbf{x}_i} \\log \\frac{p(\\mathbf{x})}{q(\\mathbf{x})}` is the
gradient of the log-ratio with respect to the corresponding sample.
Parameters:
samples: tf.Tensor
a tensor of shape num_samples x num_dimensions containing the samples used for the approximation
mapping: tf.Tensor
a one-dimensional tensor of integers, storing for every sample from which component it was sampled.
background_densities: tf.Tensor
the log probability density of the background distribution (which was used for sampling the provided
samples). A one-dimensional tensor of size num_samples.
target_lnpdfs: tf.Tensor
The rewards are given by the log-densities of the target-distribution,
:math:`\\log p(\\mathbf{x})`.
target_lnpdfs_grads: tf.Tensor
The gradients of the target_lnpdfs with respect to the samples,
:math:`\\nabla_{\\mathbf{x}}\\log p(\\mathbf{x})`.
Returns:
tuple(tf.Tensor, tf.Tensor):
**expected_hessian_neg** - A tensor of shape num_components x num_dimensions x num_dimensions containing
for each component an estimate of the (negated) expected Hessian
:math:`-\\mathbb{E}_{q(\\mathbf{x}|o)}\\left[ \\nabla_{\\mathbf{x}\\mathbf{x}} \\log \\frac{p(\\mathbf{x})}{q(\\mathbf{x})}\\right]`
**expected_gradient_neg** - A tensor of shape num_components x num_dimensions containing
for each component an estimate of the (negated) expected gradient
:math:`-\\mathbb{E}_{q(\\mathbf{x}|o)}\\left[ \\nabla_{\\mathbf{x}} \\log \\frac{p(\\mathbf{x})}{q(\\mathbf{x})}\\right]`
"""
num_components = self._model.num_components
relative_mapping = mapping - tf.reduce_max(mapping) + num_components - 1
model_densities, model_densities_grad, component_log_densities = self._model.log_density_and_grad(samples)
log_ratios = target_lnpdfs - model_densities
log_ratio_grads = target_lnpdfs_grads - model_densities_grad
expected_hessian_neg = tf.TensorArray(tf.float32, size=num_components)
expected_gradient_neg = tf.TensorArray(tf.float32, size=num_components)
for i in tf.range(num_components):
my_samples, my_log_ratios, my_log_ratios_grad, my_background_densities, my_component_log_densities = \
self.get_rewards_for_comp(i, samples, relative_mapping, component_log_densities,
log_ratios, log_ratio_grads, background_densities)
expected_gradient, expected_hessian = \
self._get_expected_gradient_and_hessian_for_comp(i, my_component_log_densities, my_samples,
my_background_densities, my_log_ratios_grad)
expected_hessian_neg = expected_hessian_neg.write(i, -expected_hessian)
expected_gradient_neg = expected_gradient_neg.write(i, -expected_gradient)
expected_hessian_neg = tf.convert_to_tensor(expected_hessian_neg.stack())
expected_gradient_neg = tf.convert_to_tensor(expected_gradient_neg.stack())
return expected_hessian_neg, expected_gradient_neg
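# A hedged call sketch (names and tensors are illustrative; shapes follow the docstring above):
#
#   neg_hessians, neg_gradients = stein_estimator.get_expected_hessian_and_grad(
#       samples=samples,                         # [num_samples, num_dimensions]
#       mapping=mapping,                         # [num_samples], component index per sample
#       background_densities=background_lnpdfs,  # [num_samples]
#       target_lnpdfs=target_lnpdfs,             # [num_samples]
#       target_lnpdfs_grads=target_grads)        # [num_samples, num_dimensions]
#   # neg_hessians: [num_components, num_dimensions, num_dimensions]
#   # neg_gradients: [num_components, num_dimensions]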
class MoreNgEstimator(NgEstimator):
""" Use compatible function approximation to estimate the natural gradient from a quadratic reward surrogate.
See :cite:p:`Pajarinen2019,Abdolmaleki2015,Peters2008,Sutton1999`.
Parameters:
temperature: float
Usually temperature=1.; can be used to scale the importance of maximizing the model entropy.
model: :py:class:`GmmWrapper<gmmvi.models.gmm_wrapper.GmmWrapper>`
The wrapped model where we want to update the components.
only_use_own_samples: bool
If true, we do not use importance sampling to update one component based on samples from a different component.
initial_l2_regularizer: float
The l2_regularizer is used as a regularizer during weighted least squares (ridge regression) for fitting the
compatible surrogate.
use_self_normalized_importance_weights: bool
If true, use self-normalized importance weighting (normalizing the importance weights such that they sum to one)
rather than standard importance weighting.
"""
def __init__(self, temperature, model, only_use_own_samples: bool,
initial_l2_regularizer: float, use_self_normalized_importance_weights: bool):
super(MoreNgEstimator, self).__init__(temperature, model, False, only_use_own_samples,
use_self_normalized_importance_weights)
tf.assert_equal(self._model.l2_regularizers, initial_l2_regularizer)
self.least_square_fitter = QuadFunc(self._model.num_dimensions)
def get_expected_hessian_and_grad(self, samples: tf.Tensor, mapping: tf.Tensor, background_densities: tf.Tensor,
target_lnpdfs: tf.Tensor, target_lnpdfs_grads: tf.Tensor) \
-> Tuple[tf.Tensor, tf.Tensor]:
""" Estimates the natural gradient using compatible function approximation. This method does not require / make
use of the provided gradients, but only uses the function evaluations *target_lnpdfs* for estimating the
natural gradient. The method fits a quadratic reward function
:math:`\\tilde{R}(\\mathbf{x}) = \\mathbf{x}^T \\mathbf{R} \\mathbf{x} + \\mathbf{x}^T \\mathbf{r} + r_0`
to approximate the target distribution using importance-weighted least squares where the targets are given by
*target_lnpdfs*, :math:`\\log p(\\mathbf{x})`.
The natural gradient estimate can then be computed from the coefficients :math:`\\mathbf{R}`
and :math:`\\mathbf{r}`.
Parameters:
samples: tf.Tensor
a tensor of shape num_samples x num_dimensions containing the samples used for the approximation
mapping: tf.Tensor
a one-dimensional tensor of integers, storing for every sample from which component it was sampled.
background_densities: tf.Tensor
the log probability density of the background distribution (which was used for sampling the provided
samples). A one-dimensional tensor of size num_samples.
target_lnpdfs: tf.Tensor
The rewards are given by the (unnormalized) log-densities of the target-distribution,
:math:`\\log p(\\mathbf{x})`.
target_lnpdfs_grads: tf.Tensor
The gradients of the target_lnpdfs with respect to the samples (not used),
:math:`\\nabla_{\\mathbf{x}}\\log p(\\mathbf{x})`.
Returns:
tuple(tf.Tensor, tf.Tensor):
**expected_hessian_neg** - A tensor of shape num_components x num_dimensions x num_dimensions containing
for each component an estimate of the (negated) expected Hessian
:math:`-\\mathbb{E}_{q(\\mathbf{x}|o)}\\left[ \\nabla_{\\mathbf{x}\\mathbf{x}} \\log \\frac{p(\\mathbf{x})}{q(\\mathbf{x})}\\right]`
**expected_gradient_neg** - A tensor of shape num_components x num_dimensions containing
for each component an estimate of the (negated) expected gradient
:math:`-\\mathbb{E}_{q(\\mathbf{x}|o)}\\left[ \\nabla_{\\mathbf{x}} \\log \\frac{p(\\mathbf{x})}{q(\\mathbf{x})}\\right]`
"""
num_components = tf.shape(self._model.means)[0]
expected_hessian_neg = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
expected_gradient_neg = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
relative_mapping = mapping - tf.reduce_max(mapping) + num_components - 1
model_densities, component_log_densities = self._model.log_densities_also_individual(samples)
log_ratios = target_lnpdfs - model_densities
log_ratio_grads = tf.zeros(tf.shape(samples[0]))  # dummy placeholder; the MORE estimator does not use gradients
for i in tf.range(num_components):
my_samples, my_rewards, _, my_background_densities, my_component_log_densities = \
self.get_rewards_for_comp(i, samples, relative_mapping, component_log_densities, log_ratios,
log_ratio_grads, background_densities)
log_weights = my_component_log_densities - my_background_densities
if self._use_self_normalized_importance_weights:
log_weights -= tf.reduce_logsumexp(log_weights, axis=0, keepdims=True)
weights = tf.exp(log_weights)
my_importance_weights = weights / tf.reduce_sum(weights, axis=0, keepdims=True)
else:
my_importance_weights = tf.exp(log_weights)
reward_quad, reward_lin, const_term = self.least_square_fitter.fit_quadratic(self._model.l2_regularizers[i],
tf.shape(my_samples)[0],
my_samples, my_rewards,
my_importance_weights,
self._model.means[i],
self._model.chol_cov[i])
this_G_hat = reward_quad
expected_hessian_neg = expected_hessian_neg.write(i, this_G_hat)
this_g_hat = tf.reshape(reward_quad @ tf.expand_dims(self._model.means[i], axis=1)
- tf.expand_dims(reward_lin, axis=1), [self._model.num_dimensions])
expected_gradient_neg = expected_gradient_neg.write(i, this_g_hat)
expected_hessian_neg = tf.convert_to_tensor(expected_hessian_neg.stack())
expected_gradient_neg = tf.convert_to_tensor(expected_gradient_neg.stack())
return expected_hessian_neg, expected_gradient_neg
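# A minimal construction sketch (hypothetical values; `wrapped_model` is assumed to be a GmmWrapper
# whose l2_regularizers were all initialized to the same initial_l2_regularizer):
#
#   more_estimator = MoreNgEstimator(temperature=1., model=wrapped_model,
#                                    only_use_own_samples=False,
#                                    initial_l2_regularizer=1e-12,
#                                    use_self_normalized_importance_weights=True)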