import sys
import os
path = os.path.dirname(os.path.abspath("__file__"))
sys.path.insert(0, path + '/..')
import base64
import IPython.display
import importlib
import logging
logging.getLogger().setLevel(logging.ERROR)
import random
import time
from collections import namedtuple
from tf_agents.environments import suite_gym, suite_dm_control, parallel_py_environment
from tf_agents.environments import tf_py_environment, FlattenObservationsWrapper
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer, episodic_replay_buffer
from tf_agents.drivers import dynamic_episode_driver, dynamic_step_driver
from tf_agents.trajectories import time_step as ts, policy_step, trajectory
from tf_agents.utils import common
from tf_agents.policies import TFPolicy
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(3)
import tensorflow_probability as tfp
tfd = tfp.distributions
import numpy as np
import json
from reinforcement_learning import labeling_functions
import reinforcement_learning.environments
from reinforcement_learning.environments import EnvironmentLoader, perturbed_env
from reinforcement_learning.metrics import AverageDiscountedReturnMetric
from policies.saved_policy import SavedTFPolicy
from policies.epsilon_mimic import EpsilonMimicPolicy
from policies.latent_policy import LatentPolicyOverRealStateAndActionSpaces
from verification import model, local_losses, binary_latent_space
from verification.local_losses import compute_values_from_initial_distribution
from verification.value_iteration import value_iteration
from util.io.dataset_generator import ergodic_batched_labeling_function, is_reset_state
from util.io import video
import wasserstein_mdp
from typing import Callable, Optional
from tf_agents.typing.types import Float, Bool
# set seed
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
def embed_mp4(filename):
    """Embeds an mp4 file in the notebook."""
    with open(filename, 'rb') as f:
        video_data = f.read()
    b64 = base64.b64encode(video_data)
    tag = '''
    <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())
    return IPython.display.HTML(tag)
def display_state_space(py_env):
print("state space shape:", py_env.observation_spec().shape)
try:
print("state space max values:", py_env.observation_spec().maximum)
print("state space min values:", py_env.observation_spec().minimum)
    except AttributeError:
        # the observation spec does not define explicit bounds
        pass
def display_action_space(py_env):
if py_env.action_spec().dtype in [np.int64, np.int32]:
print("discrete action space")
print("number of discrete actions:", py_env.action_spec().maximum + 1)
else:
print("continuous action space")
print("action space shape:", py_env.action_spec().shape)
print("action space max values:", py_env.action_spec().maximum)
print("action space min values:", py_env.action_spec().minimum)
from util.io.dataset_generator import is_reset_state
from verification.local_losses import PolicyDecorator
@tf.function
def get_p_init(
wae_mdp,
original_state,
latent_transition_fn,
environment_name,
):
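    # Computes the initial distribution over (non-reset) latent states: an
    # all-zero observation with the same shape as original_state (used here as
    # the environment's reset state) is embedded into the latent space, the
    # latent policy's action distribution at that embedding is combined with the
    # latent transition function, and the resulting distribution over successor
    # latent states is masked so that latent reset states receive probability 0.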
latent_state_space = binary_latent_space(wae_mdp.latent_state_size)
is_reset_state_test_fn = lambda latent_state: is_reset_state(latent_state, wae_mdp.atomic_prop_dims)
original_reset_state = tf.tile(tf.zeros_like(original_state[:1, ...]), [tf.shape(latent_state_space)[0], 1])
reset_state = wae_mdp.state_embedding_function(
original_reset_state,
ergodic_batched_labeling_function(
labeling_functions[environment_name]
)(original_reset_state))
reset_state = tf.cast(reset_state, tf.float32)
latent_action_space = tf.one_hot(
indices=tf.range(wae_mdp.number_of_discrete_actions),
depth=tf.cast(wae_mdp.number_of_discrete_actions, tf.int32),
dtype=tf.float32)
return tf.reduce_sum(
tf.transpose(
PolicyDecorator(wae_mdp.get_latent_policy(action_dtype=tf.int64))(
reset_state
).probs_parameter()
) * tf.map_fn(
fn=lambda latent_action: latent_transition_fn(
reset_state,
tf.tile(tf.expand_dims(latent_action, 0), [tf.shape(latent_state_space)[0], 1]),
).prob(
tf.cast(latent_state_space, tf.float32),
full_latent_state_space=True),
elems=latent_action_space),
axis=0) * (1. - tf.cast(is_reset_state_test_fn(latent_state_space), tf.float32))
def C_until_T_values(
C_fn: Callable[[Float], Bool],
T_fn: Callable[[Float], Bool],
transition_matrix: Float,
latent_state_size: int,
A: int,
latent_policy: TFPolicy,
gamma: Float = 0.99,
transition_to_T_reward: Optional[Float] = None,
) -> Float:
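    # Computes, for every latent state, the value of the constrained reachability
    # objective "C until T" under the given latent policy: states outside C and
    # states in T are made absorbing, a reward of 1 (or transition_to_T_reward,
    # if provided) is collected when entering T, and the values are obtained by
    # discounted value iteration; states already in T are then assigned their
    # terminal reward directly.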
S = tf.pow(2, latent_state_size)
state_space = binary_latent_space(latent_state_size, dtype=tf.float32)
    # make ¬C and T absorbing
absorbing_states = lambda latent_state: tf.math.logical_or(
tf.math.logical_not(C_fn(latent_state)),
T_fn(latent_state))
# reward of 1 when transitioning to T;
# set it to the input values if provided
reward_objective = tf.ones(
shape=(S, A, S),
) * tf.cast(T_fn(state_space), tf.float32)
if transition_to_T_reward is not None:
reward_objective *= transition_to_T_reward
policy_probs = PolicyDecorator(
latent_policy
)(state_space).probs_parameter()
values = value_iteration(
latent_state_size=latent_state_size,
num_actions=A,
transition_fn=transition_matrix,
reward_fn=reward_objective,
gamma=gamma,
policy_probs=policy_probs,
epsilon=1e-6,
v_init=tf.zeros(S, dtype=tf.float32),
episodic_return=True,
is_reset_state_test_fn=absorbing_states,
error_type='absolute',
transition_matrix=transition_matrix,
reward_matrix=reward_objective,)
# set the values of the target states to either one or the input values if provided
if transition_to_T_reward is None:
values = values + tf.cast(T_fn(state_space), tf.float32)
else:
values = values + (tf.cast(T_fn(state_space), tf.float32) * transition_to_T_reward)
return values
def reach_C_then_T_values(
C_fn: Callable[[Float], Bool],
T_fn: Callable[[Float], Bool],
transition_matrix: Float,
latent_state_size: int,
A: int,
latent_policy: TFPolicy,
gamma: Float = 0.99,
) -> Float:
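    # Computes, for every latent state, the value of eventually moving from a
    # C-state directly into a T-state: the latent MDP is augmented with a fresh
    # absorbing sink state, every C-to-T transition is redirected to that sink
    # with reward 1, and the values are obtained by discounted value iteration.
    # The returned vector thus has one extra entry, corresponding to the sink.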
S = tf.pow(2, latent_state_size)
state_space = binary_latent_space(latent_state_size, dtype=tf.float32)
C = C_fn(state_space)
T = T_fn(state_space)
all_states = tf.ones(shape=(S, A, S))
    # detect when the agent transitions from a C-state to a T-state
# set C-state rows to 1
from_C = tf.transpose(all_states * tf.cast(C, tf.float32))
# set T-state columns to 1
to_T = all_states * tf.cast(T, tf.float32)
C_to_T_transitions = from_C * to_T
    # create the MDP augmented with a new absorbing state that C-states
    # transition to instead of transitioning directly to T-states
#
# get the probability of transitioning from C to T
C_to_T_probs = tf.reduce_sum(transition_matrix * C_to_T_transitions, axis=-1)
    # redirect the transitions from C to a new absorbing state
augmented_transition_matrix = tf.concat(
# set the probabilities of transitioning from C to T to 0.
[transition_matrix * (1. - C_to_T_transitions),
# set the transition probabilities to the absorbing state to those
# of transitioning to T
tf.expand_dims(C_to_T_probs, axis=-1)],
axis=-1)
# create a new sink state
sink_state_probs = tf.concat([
tf.zeros(shape=(1, A, S)),
tf.ones(shape=(1, A, 1))
], axis=-1)
# add this sink state to the transition matrix of the augmented MDP
augmented_transition_matrix = tf.concat([
augmented_transition_matrix,
sink_state_probs,
], axis=0)
    # give the sink state a uniform action distribution
policy_probs = PolicyDecorator(
latent_policy
)(state_space).probs_parameter()
policy_probs = tf.concat([
policy_probs,
tf.pow(
tf.cast(A, tf.float32), -1.
) * tf.ones(shape=(1, A))
], axis=0)
# reward of 1 when transitioning to the sink state
reward_objective = tf.concat([
tf.zeros(shape=(S, A, S)),
# add a last column full of ones
tf.ones(shape=(S, A, 1))
], axis=-1)
reward_objective = tf.concat([
# add a last row full of zeros
reward_objective,
tf.zeros(shape=(1, A, S + 1))
], axis=0)
return value_iteration(
latent_state_size=latent_state_size,
num_actions=A,
transition_fn=augmented_transition_matrix,
reward_fn=reward_objective,
gamma=gamma,
policy_probs=policy_probs,
epsilon=1e-6,
v_init=tf.zeros(S + 1, dtype=tf.float32),
episodic_return=False,
error_type='absolute',
transition_matrix=augmented_transition_matrix,
reward_matrix=reward_objective,)
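# NB: the value vector returned by reach_C_then_T_values has length S + 1 due to
# the extra sink state, so the initial latent distribution must be padded with a
# trailing zero before averaging (as done in the Pendulum property check below).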
video_path = 'policy_videos/cartpole_dqn'
with suite_gym.load('CartPole-v0') as py_env:
py_env.seed(seed)
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
display_state_space(py_env)
display_action_space(py_env)
policy_dir = '../reinforcement_learning/saves/CartPole-v0/policy'
policy = SavedTFPolicy(policy_dir)
num_episodes=30
reward_metric = tf_metrics.AverageReturnMetric()
video_observer = video.VideoEmbeddingObserver(
py_env, video_path, num_episodes=num_episodes)
dynamic_episode_driver.DynamicEpisodeDriver(
tf_env,
policy,
num_episodes=num_episodes,
observers=[
reward_metric,
video_observer,
]).run()
tf.print(f'avg. episode return: {reward_metric.result():.6g}')
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
embed_mp4(video_observer.file_name)
state space shape: (4,)
state space max values: [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
state space min values: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
discrete action space
number of discrete actions: 2
avg. episode return: 200
std: 0
wae_model_path = 'saved_models/experiments/CartPole-v0/model/'
with open(os.path.join(wae_model_path, 'model_infos.json'), 'r') as f:
wae_data = json.load(f)
print(wae_data)
wae_mdp = wasserstein_mdp.load(wae_model_path)
print("WAE-MDP loaded")
print("WAE-MDP at training step {:d}".format(eval(wae_data['training_step'])))
print("Size of the latent state space: {:d}".format(2 ** wae_mdp.latent_state_size))
WAE-MDP at training step 120000
Size of the latent state space: 512
video_path = 'policy_videos/cartpole_wae_distillation'
with suite_gym.load('CartPole-v0') as py_env:
py_env.seed(seed)
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
original_state = tf_env.current_time_step().observation
tf_env = wae_mdp.wrap_tf_environment(tf_env, labeling_functions['CartPole-v0'])
    policy = tf_env.wrap_latent_policy(wae_mdp.get_latent_policy(action_dtype=tf.int64))
num_episodes=30
reward_metric = tf_metrics.AverageReturnMetric()
discounted_reward_metric = AverageDiscountedReturnMetric(
gamma=.99, reward_scale=wae_mdp._dynamic_reward_scaling)
video_observer = video.VideoEmbeddingObserver(
py_env, video_path, num_episodes=num_episodes)
dynamic_episode_driver.DynamicEpisodeDriver(
tf_env, policy, num_episodes=num_episodes,
observers=[
reward_metric,
discounted_reward_metric,
video_observer,
]).run()
tf.print(f'avg. episode return: {reward_metric.result():.6g}')
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
tf.print('avg. discounted (scaled) return:', discounted_reward_metric.result())
embed_mp4(video_observer.file_name)
avg. episode return: 200
std: 0
avg. discounted (scaled) return: 43.3010445
# PAC bounds for local losses
# the bound computed during training can already be found in the log file (wae_data)
epsilon = 1e-2
delta = 5e-3
T = int(np.ceil(-np.log(delta / 4) / (2 * epsilon**2)))
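# Hoeffding-style sample bound: T = ceil(ln(4 / delta) / (2 * epsilon ** 2)).
# Assuming each local loss is estimated as an empirical mean of T bounded samples
# (with a union bound over the two two-sided estimates), every estimate lies
# within epsilon of its expectation with probability at least 1 - delta.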
with suite_gym.load(
'CartPole-v0',
env_wrappers=[lambda env: perturbed_env.PerturbedEnvironment(env, .75)]
) as py_env:
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
local_losses_metrics = wae_mdp.estimate_local_losses_from_samples(
tf_env,
steps=T,
labeling_function=labeling_functions['CartPole-v0'],
estimate_transition_function_from_samples=True,
reward_scaling=wae_mdp._dynamic_reward_scaling,
estimate_value_difference=False)
tf.print('Local reward loss: {:.6g}'.format(local_losses_metrics.local_reward_loss))
tf.print('Local transition loss: {:.6g}'.format(local_losses_metrics.local_transition_loss))
tf.print('Local transition loss (freq. estimation): {:.6g}'.format(
local_losses_metrics.local_transition_loss_transition_function_estimation))
local_losses_metrics.print_time_metrics()
Local reward loss: 0.00499653
Local transition loss: 0.399636
Local transition loss (freq. estimation): 0.421809
Time metrics:
Fill in the Replay Buffer (100000 frames): 119.436
Estimate the local reward loss function (from 33424 transitions): 1.631
Transition model generation (empirical frequency estimation, from 33424 transitions): 4.192
Estimate the local transition loss function (from 33424 transitions): 0.065
Estimate the local transition loss function via the frequency-estimated transition function: 27.531
_latent_reward_fn = lambda latent_state, latent_action, next_latent_state: \
wae_mdp._dynamic_reward_scaling * wae_mdp.reward_distribution(
latent_state=tf.cast(latent_state, dtype=tf.float32),
latent_action=tf.cast(latent_action, dtype=tf.float32),
next_latent_state=tf.cast(next_latent_state, dtype=tf.float32),
).mode()
# since the reward distribution is deterministic, taking the mode
# retrieves the location of its Dirac impulse
_latent_transition_fn = lambda latent_state, latent_action: \
wae_mdp.discrete_latent_transition(
tf.cast(latent_state, tf.float32),
tf.cast(latent_action, tf.float32))
print('Local reward loss: {:.2g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.2g}'.format(eval(wae_data['local_transition_loss'])))
print('Transition/reward model generation')
start = time.time()
# write the transition/reward functions to tensors,
# to formally check the values in an efficient way
latent_transition_fn = model.TransitionFunctionCopy(
num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
num_actions=wae_mdp.number_of_discrete_actions,
transition_function=_latent_transition_fn,
epsilon=1e-6)
latent_reward_fn = model.RewardFunctionCopy(
num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
num_actions=wae_mdp.number_of_discrete_actions,
reward_function=_latent_reward_fn,
transition_function=_latent_transition_fn,
epsilon=1e-6)
end = time.time() - start
print("Time to generate the model: {:.2g} sec".format(end))
Local reward loss: 0.0038
Local transition loss: 0.4
Transition/reward model generation
Time to generate the model: 2.1 sec
start = time.time()
latent_mdp_values = compute_values_from_initial_distribution(
latent_state_size=wae_mdp.latent_state_size,
atomic_prop_dims=wae_mdp.atomic_prop_dims,
original_state=original_state,
number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
latent_transition_fn=latent_transition_fn,
latent_reward_function=latent_reward_fn,
epsilon=1e-6,
gamma=.99,
stochastic_state_embedding=lambda original_state: tfd.Independent(
tfd.Deterministic(loc=wae_mdp.state_embedding_function(
original_state,
ergodic_batched_labeling_function(
labeling_functions['CartPole-v0']
)(original_state))),
reinterpreted_batch_ndims=1)
)
value_difference = tf.abs(discounted_reward_metric.result() - latent_mdp_values)
tf.print("Value difference: {:.6g}".format(value_difference))
end = time.time() - start
print("Time to compute the value difference: {:2g} sec".format(end))
Value difference: 3.71213
Time to compute the value difference: 0.861585 sec
Time-to-failure property: $\neg\mathsf{Reset} \, \mathcal{U} \, \mathsf{Unsafe}$ where $\mathsf{Unsafe} \in \ell\left(s\right)$ iff the cart position is greater than 1.5 or the pole angle is greater than 9 degrees.
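In the next cell, this property is evaluated on the latent MDP with C_until_T_values, taking $C = \neg\mathsf{Reset}$ and $T = \mathsf{Unsafe}$, where $\mathsf{Unsafe}$ is read off the first two atomic-proposition bits of the latent state (the position and angle safety flags); the reported number is the resulting value averaged over the initial latent distribution, $\sum_{\bar{s}} p_{init}(\bar{s}) \, V(\bar{s}) \, / \, \sum_{\bar{s}} p_{init}(\bar{s})$.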
start = time.time()
values = C_until_T_values(
C_fn=lambda latent_state: tf.math.logical_not(is_reset_state(latent_state, wae_mdp.atomic_prop_dims)),
T_fn=lambda latent_state: tf.logical_or(
# unsafe position
tf.cast(1. - latent_state[..., 0], tf.bool),
# unsafe angle
tf.cast(1. - latent_state[..., 1], tf.bool)),
transition_matrix=latent_transition_fn.to_dense(),
latent_state_size=wae_mdp.latent_state_size,
A=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
gamma=0.99,)
p_init = get_p_init(
wae_mdp,
tf_env.current_time_step().observation['state'],
latent_transition_fn,
'CartPole-v0',)
# get the values for the initial distribution
p_init_values = tf.reduce_sum(
p_init * values
) / tf.reduce_sum(p_init)
tf.print("property values: {:.6g}".format(p_init_values))
end = time.time() - start
print("Time to compute the values of the property: {:2g} sec".format(end))
property values: 0.031732
Time to compute the values of the property: 2.41673 sec
video_path = 'policy_videos/mountain_car_dqn'
with suite_gym.load('MountainCar-v0') as py_env:
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
display_state_space(py_env)
display_action_space(py_env)
policy_dir = '../reinforcement_learning/saves/MountainCar-v0/dqn_policy'
policy = SavedTFPolicy(policy_dir)
num_episodes=30
reward_metric = tf_metrics.AverageReturnMetric()
video_observer = video.VideoEmbeddingObserver(
py_env, video_path, num_episodes=num_episodes)
dynamic_episode_driver.DynamicEpisodeDriver(
tf_env,
policy,
num_episodes=num_episodes,
observers=[
reward_metric,
video_observer,
]).run()
tf.print(f'avg. episode return: {reward_metric.result():.6g}')
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
embed_mp4(video_observer.file_name)
state space shape: (2,)
state space max values: [0.6 0.07]
state space min values: [-1.2 -0.07]
discrete action space
number of discrete actions: 3
avg. episode return: -103.2
std: 7.18053
wae_model_path = 'saved_models/hyperparameter_search/MountainCar-v0/model/'
with open(os.path.join(wae_model_path, 'model_infos.json'), 'r') as f:
wae_data = json.load(f)
print(wae_data)
wae_mdp = wasserstein_mdp.load(wae_model_path)
print("WAE-MDP loaded")
print("WAE-MDP at training step {:d}".format(eval(wae_data['training_step'])))
print("Size of the latent state space: {:d}".format(2 ** wae_mdp.latent_state_size))
print('Local reward loss: {:.6g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.6g}'.format(eval(wae_data['local_transition_loss'])))
WAE-MDP at training step 232000
Size of the latent state space: 1024
Local reward loss: 0.0141763
Local transition loss: 0.382323
video_path = 'policy_videos/mountain_car_wae_distillation'
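# labeling function passed to the video observer: it extracts the 'goal' bit
# (the first atomic proposition) from the latent state so that, presumably, its
# value can be displayed on the recorded frames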
def latent_labeling_fn(time_step):
latent_state = time_step.observation['latent_state']
return {
'goal': latent_state.numpy()[..., 0],
}
with suite_gym.load('MountainCar-v0') as py_env:
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
original_state = tf_env.current_time_step().observation
tf_env = wae_mdp.wrap_tf_environment(tf_env, labeling_functions['MountainCar-v0'])
    policy = tf_env.wrap_latent_policy(wae_mdp.get_latent_policy(action_dtype=tf.int64))
num_episodes=30
reward_metric = tf_metrics.AverageReturnMetric()
discounted_reward_metric = AverageDiscountedReturnMetric(
gamma=.99, reward_scale=wae_mdp._dynamic_reward_scaling)
video_observer = video.VideoEmbeddingObserver(
py_env, video_path, num_episodes=num_episodes, labeling_function=latent_labeling_fn)
dynamic_episode_driver.DynamicEpisodeDriver(
tf_env, policy, num_episodes=num_episodes,
observers=[
reward_metric,
discounted_reward_metric,
video_observer,
]).run()
tf.print('avg. episode return:', reward_metric.result())
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
tf.print('avg. discounted (scaled) return:', discounted_reward_metric.result())
embed_mp4(video_observer.file_name)
avg. episode return: -101.1
std: 5.12738
avg. discounted (scaled) return: -31.8745079
frequency_estimation = True
_latent_reward_fn = lambda latent_state, latent_action, next_latent_state: \
wae_mdp._dynamic_reward_scaling * wae_mdp.reward_distribution(
latent_state=tf.cast(latent_state, dtype=tf.float32),
latent_action=tf.cast(latent_action, dtype=tf.float32),
next_latent_state=tf.cast(next_latent_state, dtype=tf.float32),
).mode()
# since the reward distribution is deterministic, taking the mode
# retrieves the location of its Dirac impulse
_latent_transition_fn = lambda latent_state, latent_action: \
wae_mdp.discrete_latent_transition(
tf.cast(latent_state, tf.float32),
tf.cast(latent_action, tf.float32))
# write the transition/reward functions to tensors,
# to formally check the values in an efficient way
print('Transition/reward model generation')
start = time.time()
if frequency_estimation:
# compute the transition tensor by frequency estimation and use the
# latent transition function learned during the WAE optimization as backup function
with suite_gym.load(
'MountainCar-v0',
env_wrappers=[lambda env: perturbed_env.PerturbedEnvironment(env, .75)]
) as py_env:
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
latent_transition_fn = model.estimate_latent_transition_function_from_samples(
environment=tf_env,
n_steps=100000,
state_embedding_function=wae_mdp.state_embedding_function,
action_embedding_function=wae_mdp.action_embedding_function,
labeling_function=labeling_functions['MountainCar-v0'],
latent_state_size=wae_mdp.latent_state_size,
number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
backup_transition_fn=_latent_transition_fn)
else:
latent_transition_fn = model.TransitionFunctionCopy(
num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
num_actions=wae_mdp.number_of_discrete_actions,
transition_function=_latent_transition_fn,
epsilon=0.)
latent_reward_fn = model.RewardFunctionCopy(
num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
num_actions=wae_mdp.number_of_discrete_actions,
reward_function=_latent_reward_fn,
transition_function=_latent_transition_fn,
epsilon=1e-6)
end = time.time() - start
print("Time to generate the model: {:.2g} sec".format(end))
Transition/reward model generation
Time to generate the model: 1.2e+02 sec
start = time.time()
latent_mdp_values = compute_values_from_initial_distribution(
latent_state_size=wae_mdp.latent_state_size,
atomic_prop_dims=wae_mdp.atomic_prop_dims,
original_state=original_state,
number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
latent_transition_fn=latent_transition_fn,
latent_reward_function=latent_reward_fn,
epsilon=1e-6,
gamma=.99,
stochastic_state_embedding=lambda original_state: tfd.Independent(
tfd.Deterministic(loc=wae_mdp.state_embedding_function(
original_state,
ergodic_batched_labeling_function(
labeling_functions['MountainCar-v0']
)(original_state))),
reinterpreted_batch_ndims=1)
)
value_difference = tf.abs(discounted_reward_metric.result() - latent_mdp_values)
tf.print("Value difference: {:.6g}".format(value_difference))
end = time.time() - start
print("Time to compute the value difference: {:2g} sec".format(end))
Value difference: 2.83714
Time to compute the value difference: 1.27584 sec
Time-to-failure property: $\neg\mathsf{Goal} \, \mathcal{U} \, \mathsf{Reset}$ where $\mathsf{Goal} \in \ell\left(s\right)$ iff the car reaches the top of the mountain, at the yellow flag position.
start = time.time()
values = C_until_T_values(
C_fn=lambda latent_state: tf.math.logical_not(tf.cast(latent_state[..., 0], tf.bool)),
T_fn=lambda latent_state: is_reset_state(latent_state, wae_mdp.atomic_prop_dims),
transition_matrix=latent_transition_fn.to_dense(),
latent_state_size=wae_mdp.latent_state_size,
A=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
gamma=0.99,)
p_init = get_p_init(
wae_mdp,
tf_env.current_time_step().observation,
latent_transition_fn,
'MountainCar-v0',)
# get the values for the initial distribution
p_init_values = tf.reduce_sum(
p_init * values
) / tf.reduce_sum(p_init)
tf.print("property values: {:.6g}".format(p_init_values))
end = time.time() - start
print("Time to compute the values of the property: {:2g} sec".format(end))
property values: 0
Time to compute the values of the property: 0.327113 sec
video_path = 'policy_videos/acrobot_dqn'
with suite_gym.load('Acrobot-v1') as py_env:
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
display_state_space(py_env)
display_action_space(py_env)
policy_dir = '../reinforcement_learning/saves/AcrobotRandomInit-v1/dqn_policy'
policy = SavedTFPolicy(policy_dir)
num_episodes=30
reward_metric = tf_metrics.AverageReturnMetric()
video_observer = video.VideoEmbeddingObserver(
py_env, video_path, num_episodes=num_episodes)
dynamic_episode_driver.DynamicEpisodeDriver(
tf_env,
policy,
num_episodes=num_episodes,
observers=[
reward_metric,
video_observer,
]).run()
tf.print('avg. episode return:', reward_metric.result())
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
embed_mp4(video_observer.file_name)
state space shape: (6,)
state space max values: [ 1. 1. 1. 1. 12.566371 28.274334]
state space min values: [ -1. -1. -1. -1. -12.566371 -28.274334]
discrete action space
number of discrete actions: 3
avg. episode return: -70.1
std: 14.8893
wae_model_path = 'saved_models/experiments/Acrobot-v1/model/'
with open(os.path.join(wae_model_path, 'model_infos.json'), 'r') as f:
wae_data = json.load(f)
print(wae_data)
wae_mdp = wasserstein_mdp.load(wae_model_path)
print("WAE-MDP loaded")
print("WAE-MDP at training step {:d}".format(eval(wae_data['training_step'])))
print("Size of the latent state space: {:d}".format(2 ** wae_mdp.latent_state_size))
print('Local reward loss: {:.6g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.6g}'.format(eval(wae_data['local_transition_loss'])))
WAE-MDP at training step 430000
Size of the latent state space: 8192
Local reward loss: 0.0347698
Local transition loss: 0.649478
video_path = 'policy_videos/acrobot_wae_distillation'
def latent_labeling_fn(time_step):
latent_state = time_step.observation['latent_state']
return {
'goal': latent_state.numpy()[..., 0],
}
with suite_gym.load('Acrobot-v1') as py_env:
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
original_state = tf_env.current_time_step().observation
tf_env = wae_mdp.wrap_tf_environment(tf_env, labeling_functions['Acrobot-v1'])
    policy = tf_env.wrap_latent_policy(wae_mdp.get_latent_policy(action_dtype=tf.int64))
num_episodes=30
reward_metric = tf_metrics.AverageReturnMetric()
discounted_reward_metric = AverageDiscountedReturnMetric(
gamma=.99, reward_scale=wae_mdp._dynamic_reward_scaling)
video_observer = video.VideoEmbeddingObserver(
py_env, video_path, num_episodes=num_episodes, labeling_function=latent_labeling_fn)
dynamic_episode_driver.DynamicEpisodeDriver(
tf_env, policy, num_episodes=num_episodes,
observers=[
reward_metric,
discounted_reward_metric,
video_observer,
]).run()
tf.print('avg. episode return:', reward_metric.result())
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
tf.print('avg. discounted (scaled) return:', discounted_reward_metric.result())
embed_mp4(video_observer.file_name)
avg. episode return: -80.5
std: 9.58384
avg. discounted (scaled) return: -27.6321716
frequency_estimation = False
_latent_reward_fn = lambda latent_state, latent_action, next_latent_state: \
wae_mdp._dynamic_reward_scaling * wae_mdp.reward_distribution(
latent_state=tf.cast(latent_state, dtype=tf.float32),
latent_action=tf.cast(latent_action, dtype=tf.float32),
next_latent_state=tf.cast(next_latent_state, dtype=tf.float32),
).mode()
# since the reward distribution is deterministic, taking the mode
# retrieves the location of its Dirac impulse
_latent_transition_fn = lambda latent_state, latent_action: \
wae_mdp.discrete_latent_transition(
tf.cast(latent_state, tf.float32),
tf.cast(latent_action, tf.float32))
print('Local reward loss: {:.2g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.2g}'.format(eval(wae_data['local_transition_loss'])))
# write the transition/reward functions to tensors,
# to formally check the values in an efficient way
print('Transition/reward model generation')
start = time.time()
if frequency_estimation:
# compute the transition tensor by frequency estimation and use the
# latent transition function learned during the WAE optimization as backup function
with suite_gym.load(
'Acrobot-v1',
env_wrappers=[lambda env: perturbed_env.PerturbedEnvironment(env, .75)]
) as py_env:
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
latent_transition_fn = model.estimate_latent_transition_function_from_samples(
environment=tf_env,
n_steps=100000,
state_embedding_function=wae_mdp.state_embedding_function,
action_embedding_function=wae_mdp.action_embedding_function,
labeling_function=labeling_functions['Acrobot-v1'],
latent_state_size=wae_mdp.latent_state_size,
number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
backup_transition_fn=_latent_transition_fn)
else:
latent_transition_fn = model.TransitionFunctionCopy(
num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
num_actions=wae_mdp.number_of_discrete_actions,
transition_function=_latent_transition_fn,
epsilon=0.)
latent_reward_fn = model.RewardFunctionCopy(
num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
num_actions=wae_mdp.number_of_discrete_actions,
reward_function=_latent_reward_fn,
transition_function=_latent_transition_fn,
epsilon=1e-6)
end = time.time() - start
print("Time to generate the model: {:.2f} sec".format(end))
Local reward loss: 0.035
Local transition loss: 0.65
Transition/reward model generation
Time to generate the model: 189.49 sec
start = time.time()
with tf.device('/CPU:0'):
latent_mdp_values = compute_values_from_initial_distribution(
latent_state_size=wae_mdp.latent_state_size,
atomic_prop_dims=wae_mdp.atomic_prop_dims,
original_state=original_state,
number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
latent_transition_fn=latent_transition_fn,
latent_reward_function=latent_reward_fn,
epsilon=1e-6,
gamma=.99,
stochastic_state_embedding=lambda original_state: tfd.Independent(
tfd.Deterministic(loc=wae_mdp.state_embedding_function(
original_state,
ergodic_batched_labeling_function(
labeling_functions['Acrobot-v1']
)(original_state))),
reinterpreted_batch_ndims=1)
)
value_difference = tf.abs(discounted_reward_metric.result() - latent_mdp_values)
tf.print("Value difference: {:.6g}".format(value_difference))
end = time.time() - start
print("Time to compute the value difference: {:2f} sec".format(end))
Value difference: 2.22006
Time to compute the value difference: 207.473 sec
Time-to-failure property: $\neg\mathsf{Goal} \, \mathcal{U} \, \mathsf{Reset}$
start = time.time()
with tf.device('/CPU:0'):
values = C_until_T_values(
C_fn=lambda latent_state: tf.math.logical_not(tf.cast(latent_state[..., 0], tf.bool)),
T_fn=lambda latent_state: is_reset_state(latent_state, wae_mdp.atomic_prop_dims),
transition_matrix=latent_transition_fn.to_dense(),
latent_state_size=wae_mdp.latent_state_size,
A=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
gamma=0.99,)
p_init = get_p_init(
wae_mdp,
tf_env.current_time_step().observation['state'],
latent_transition_fn,
'Acrobot-v1',)
# get the values for the initial distribution
p_init_values = tf.reduce_sum(
p_init * values
) / tf.reduce_sum(p_init)
tf.print("property values: {:.6g}".format(p_init_values))
end = time.time() - start
print("Time to compute the values of the property: {:2g} sec".format(end))
property values: 0.0021911
Time to compute the values of the property: 4.66126 sec
video_path = 'policy_videos/pendulum_sac'
with suite_gym.load('Pendulum-v1') as py_env:
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
display_state_space(py_env)
display_action_space(py_env)
policy_dir = '../reinforcement_learning/saves/PendulumRandomInit-v0/sac_policy'
policy = SavedTFPolicy(policy_dir)
num_episodes=30
reward_metric = tf_metrics.AverageReturnMetric()
video_observer = video.VideoEmbeddingObserver(
py_env, video_path, num_episodes=num_episodes)
dynamic_episode_driver.DynamicEpisodeDriver(
tf_env,
policy,
num_episodes=num_episodes,
observers=[
reward_metric,
video_observer,
]).run()
tf.print('avg. episode return:', reward_metric.result())
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
embed_mp4(video_observer.file_name)
state space shape: (3,)
state space max values: [1. 1. 8.]
state space min values: [-1. -1. -8.]
continuous action space
action space shape: (1,)
action space max values: 2.0
action space min values: -2.0
avg. episode return: -175.277344
std: 74.7762
wae_model_path = 'saved_models/experiments/PendulumRandomInit-v1/model/'
with open(os.path.join(wae_model_path, 'model_infos.json'), 'r') as f:
wae_data = json.load(f)
print(wae_data)
wae_mdp = wasserstein_mdp.load(wae_model_path)
print("WAE-MDP loaded")
print("WAE-MDP at training step {:d}".format(eval(wae_data['training_step'])))
print("Size of the latent state space: {:d}".format(2 ** wae_mdp.latent_state_size))
print("Size of the latent action space: {:d}".format(wae_mdp.number_of_discrete_actions))
print('Local reward loss: {:.6g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.6g}'.format(eval(wae_data['local_transition_loss'])))
WAE-MDP at training step 370000
Size of the latent state space: 8192
Size of the latent action space: 3
Local reward loss: 0.0266745
Local transition loss: 0.539508
video_path = 'policy_videos/pendulum_wae_distillation'
def latent_labeling_fn(time_step):
latent_state = time_step.observation['latent_state']
return {
'safe_region': latent_state.numpy()[..., 0],
}
with suite_gym.load('Pendulum-v1') as py_env:
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
original_state = tf_env.current_time_step().observation
tf_env = wae_mdp.wrap_tf_environment(tf_env, labeling_functions['Pendulum-v1'])
    policy = tf_env.wrap_latent_policy(wae_mdp.get_latent_policy(action_dtype=tf.int64))
num_episodes=30
reward_metric = tf_metrics.AverageReturnMetric()
discounted_reward_metric = AverageDiscountedReturnMetric(
gamma=.99, reward_scale=wae_mdp._dynamic_reward_scaling)
video_observer = video.VideoEmbeddingObserver(
py_env, video_path, num_episodes=num_episodes, labeling_function=latent_labeling_fn)
dynamic_episode_driver.DynamicEpisodeDriver(
tf_env, policy, num_episodes=num_episodes,
observers=[
reward_metric,
discounted_reward_metric,
video_observer,
]).run()
tf.print('avg. episode return:', reward_metric.result())
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
tf.print('avg. discounted (scaled) return:', discounted_reward_metric.result())
embed_mp4(video_observer.file_name)
avg. episode return: -148.118195
std: 49.4227
avg. discounted (scaled) return: -3.87636757
frequency_estimation = False
_latent_reward_fn = lambda latent_state, latent_action, next_latent_state: \
wae_mdp._dynamic_reward_scaling * wae_mdp.reward_distribution(
latent_state=tf.cast(latent_state, dtype=tf.float32),
latent_action=tf.cast(latent_action, dtype=tf.float32),
next_latent_state=tf.cast(next_latent_state, dtype=tf.float32),
).mode()
# since the reward distribution is deterministic, taking the mode
# retrieves the location of its Dirac impulse
_latent_transition_fn = lambda latent_state, latent_action: \
wae_mdp.discrete_latent_transition(
tf.cast(latent_state, tf.float32),
tf.cast(latent_action, tf.float32))
print('Local reward loss: {:.2g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.2g}'.format(eval(wae_data['local_transition_loss'])))
print('Transition/reward model generation')
# write the transition/reward functions to tensors,
# to formally check the values in an efficient way
start = time.time()
if frequency_estimation:
# compute the latent transition function by frequency estimation and use the
# latent transition function learned during the WAE optimization as backup function
with suite_gym.load(
'PendulumRandomInit-v1',
env_wrappers=[lambda env: perturbed_env.PerturbedEnvironment(env, .75)]
) as py_env:
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
latent_transition_fn = model.estimate_latent_transition_function_from_samples(
environment=tf_env,
n_steps=100000,
state_embedding_function=wae_mdp.state_embedding_function,
action_embedding_function=wae_mdp.action_embedding_function,
labeling_function=labeling_functions['Pendulum-v1'],
latent_state_size=wae_mdp.latent_state_size,
number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
backup_transition_fn=_latent_transition_fn)
else:
latent_transition_fn = model.TransitionFunctionCopy(
num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
num_actions=wae_mdp.number_of_discrete_actions,
transition_function=_latent_transition_fn,
epsilon=0.)
latent_reward_fn = model.RewardFunctionCopy(
num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
num_actions=wae_mdp.number_of_discrete_actions,
reward_function=_latent_reward_fn,
transition_function=_latent_transition_fn,
epsilon=1e-6)
end = time.time() - start
print("Time to generate the model: {:.2f} sec".format(end))
Local reward loss: 0.027
Local transition loss: 0.54
Transition/reward model generation
Time to generate the model: 145.44 sec
start = time.time()
with tf.device('/CPU:0'):
latent_mdp_values = compute_values_from_initial_distribution(
latent_state_size=wae_mdp.latent_state_size,
atomic_prop_dims=wae_mdp.atomic_prop_dims,
original_state=original_state,
number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
latent_transition_fn=latent_transition_fn,
latent_reward_function=latent_reward_fn,
epsilon=1e-6,
gamma=.99,
stochastic_state_embedding=lambda original_state: tfd.Independent(
tfd.Deterministic(loc=wae_mdp.state_embedding_function(
original_state,
ergodic_batched_labeling_function(
labeling_functions['Pendulum-v1']
)(original_state))),
reinterpreted_batch_ndims=1)
)
value_difference = tf.abs(discounted_reward_metric.result() - latent_mdp_values)
tf.print("Value difference: {:.6g}".format(value_difference))
end = time.time() - start
print("Time to compute the value difference: {:2f} sec".format(end))
Value difference: 4.33006
Time to compute the value difference: 168.700369 sec
The goal of the agent is to reach a safe region of the system (i.e., the pendulum is upright) and to remain in this region until the end of the episode. The pendulum is safe iff it eventually stays upright for the remainder of the episode, i.e., its angle with respect to the y-axis remains within a tight range of $60^{\circ} = {\pi}/{3}$ rad.
The system fails when the pendulum is still outside this safe region at the moment the episode resets: $\Diamond (\neg\mathsf{Safe} \wedge \bigcirc \mathsf{Reset})$
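Below, this failure probability is estimated with reach_C_then_T_values, taking $C = \neg\mathsf{Safe}$ and $T = \mathsf{Reset}$: transitions that leave an unsafe latent state and directly enter a reset state are redirected to an absorbing sink state yielding reward 1, so the computed value approximates (up to the discount factor $\gamma = 0.99$) $\Pr\big(\Diamond(\neg\mathsf{Safe} \wedge \bigcirc \mathsf{Reset})\big) \approx \sum_{\bar{s}} p_{init}(\bar{s}) \, V(\bar{s}) \, / \, \sum_{\bar{s}} p_{init}(\bar{s})$, where $p_{init}$ is padded with a zero entry for the sink state.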
start = time.time()
with tf.device('/CPU:0'):
values = reach_C_then_T_values(
# the first label indicates whether the pendulum is safe or not
C_fn=lambda latent_state: tf.logical_not(tf.cast(latent_state[..., 0], tf.bool)),
T_fn=lambda latent_state: is_reset_state(latent_state, wae_mdp.atomic_prop_dims),
transition_matrix=latent_transition_fn.to_dense(),
latent_state_size=wae_mdp.latent_state_size,
A=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
gamma=0.99,)
p_init = get_p_init(
wae_mdp,
tf_env.current_time_step().observation['state'],
latent_transition_fn,
'Pendulum-v1',)
# take into account the extra absorbing state
p_init = tf.concat([p_init, [0.]], axis=-1)
# get the values for the initial distribution
p_init_values = tf.reduce_sum(
p_init * values
) / tf.reduce_sum(p_init)
tf.print("property values: {:.6g}".format(p_init_values))
end = time.time() - start
print("Time to compute the values of the property: {:2g} sec".format(end))
property values: 0.104939
Time to compute the values of the property: 41.9842 sec
video_path = 'policy_videos/lunar_lander_sac'
with suite_gym.load('LunarLanderContinuous-v2') as py_env:
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
display_state_space(py_env)
display_action_space(py_env)
policy_dir = '../reinforcement_learning/saves/LunarLanderContinuous-v2/sac_policy'
policy = SavedTFPolicy(policy_dir)
num_episodes=30
reward_metric = tf_metrics.AverageReturnMetric()
video_observer = video.VideoEmbeddingObserver(
py_env, video_path, num_episodes=num_episodes)
dynamic_episode_driver.DynamicEpisodeDriver(
tf_env,
policy,
num_episodes=num_episodes,
observers=[
reward_metric,
video_observer,
]).run()
tf.print(f'avg. episode return: {reward_metric.result():.6g}')
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
embed_mp4(video_observer.file_name)
state space shape: (8,)
state space max values: 3.4028235e+38
state space min values: -3.4028235e+38
continuous action space
action space shape: (2,)
action space max values: 1.0
action space min values: -1.0
avg. episode return: 283.811
std: 22.3405
wae_model_path = 'saved_models/experiments/LunarLanderContinuous-v2/model/'
with open(os.path.join(wae_model_path, 'model_infos.json'), 'r') as f:
wae_data = json.load(f)
print(wae_data)
wae_mdp = wasserstein_mdp.load(wae_model_path)
print("WAE-MDP loaded")
print("WAE-MDP at training step {:d}".format(eval(wae_data['training_step'])))
print("Size of the latent state space: {:d}".format(2 ** wae_mdp.latent_state_size))
print("Size of the latent action space: {:d}".format(wae_mdp.number_of_discrete_actions))
print('Local reward loss: {:.6g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.6g}'.format(eval(wae_data['local_transition_loss'])))
WAE-MDP at training step 320000
Size of the latent state space: 16384
Size of the latent action space: 3
Local reward loss: 0.0207205
Local transition loss: 0.131357
video_path = 'policy_videos/lunar_lander_wae_distillation'
def latent_labeling_fn(time_step):
latent_state = time_step.observation['latent_state']
return {
'safe_angle': tf.logical_not(tf.cast(latent_state[..., 0], tf.bool)).numpy(),
'safe_landing': tf.logical_and(
tf.cast(latent_state[..., 1], tf.bool),
tf.cast(latent_state[..., 5], tf.bool)
).numpy()
}
with suite_gym.load('LunarLanderContinuous-v2') as py_env:
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
original_state = tf_env.current_time_step().observation
tf_env = wae_mdp.wrap_tf_environment(tf_env, labeling_functions['LunarLanderContinuous-v2'])
    policy = tf_env.wrap_latent_policy(wae_mdp.get_latent_policy(action_dtype=tf.int64))
num_episodes=30
reward_metric = tf_metrics.AverageReturnMetric()
discounted_reward_metric = AverageDiscountedReturnMetric(
gamma=.99, reward_scale=wae_mdp._dynamic_reward_scaling)
video_observer = video.VideoEmbeddingObserver(
py_env, video_path, num_episodes=num_episodes,
labeling_function=latent_labeling_fn, font_color='white')
dynamic_episode_driver.DynamicEpisodeDriver(
tf_env, policy, num_episodes=num_episodes,
observers=[
reward_metric,
discounted_reward_metric,
video_observer,
]).run()
tf.print('avg. episode return:', reward_metric.result())
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
tf.print('avg. discounted (scaled) return:', discounted_reward_metric.result())
embed_mp4(video_observer.file_name)
avg. episode return: 292.923981
std: 10.4602
avg. discounted (scaled) return: 0.426882565
frequency_estimation = False
_latent_reward_fn = lambda latent_state, latent_action, next_latent_state: \
wae_mdp._dynamic_reward_scaling * wae_mdp.reward_distribution(
latent_state=tf.cast(latent_state, dtype=tf.float32),
latent_action=tf.cast(latent_action, dtype=tf.float32),
next_latent_state=tf.cast(next_latent_state, dtype=tf.float32),
).mode()
# since the reward distribution is deterministic, taking the mode
# retrieves the location of its Dirac impulse
_latent_transition_fn = lambda latent_state, latent_action: \
wae_mdp.discrete_latent_transition(
tf.cast(latent_state, tf.float32),
tf.cast(latent_action, tf.float32))
print('Local reward loss: {:.2g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.2g}'.format(eval(wae_data['local_transition_loss'])))
print('Transition/reward model generation')
# write the transition/reward functions to tensors,
# to formally check the values in an efficient way
start = time.time()
if frequency_estimation:
# compute the latent transition function by frequency estimation and use the
# latent transition function learned during the WAE optimization as backup function
with suite_gym.load(
'LunarLanderContinuous-v2',
env_wrappers=[lambda env: perturbed_env.PerturbedEnvironment(env, .75)]
) as py_env:
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
latent_transition_fn = model.estimate_latent_transition_function_from_samples(
environment=tf_env,
n_steps=100000,
state_embedding_function=wae_mdp.state_embedding_function,
action_embedding_function=wae_mdp.action_embedding_function,
labeling_function=labeling_functions['LunarLanderContinuous-v2'],
latent_state_size=wae_mdp.latent_state_size,
number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
backup_transition_fn=_latent_transition_fn)
else:
with tf.device('/CPU:0'):
latent_transition_fn = model.TransitionFunctionCopy(
num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
num_actions=wae_mdp.number_of_discrete_actions,
transition_function=_latent_transition_fn,
epsilon=1e-6)
with tf.device('/CPU:0'):
latent_reward_fn = model.RewardFunctionCopy(
num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
num_actions=wae_mdp.number_of_discrete_actions,
reward_function=_latent_reward_fn,
transition_function=_latent_transition_fn,
epsilon=1e-6)
end = time.time() - start
print("Time to generate the model: {:.2f} sec".format(end))
Local reward loss: 0.021
Local transition loss: 0.13
Transition/reward model generation
Time to generate the model: 922.83 sec
start = time.time()
with tf.device('/CPU:0'):
latent_mdp_values = compute_values_from_initial_distribution(
latent_state_size=wae_mdp.latent_state_size,
atomic_prop_dims=wae_mdp.atomic_prop_dims,
original_state=original_state,
number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
latent_transition_fn=latent_transition_fn,
latent_reward_function=latent_reward_fn,
epsilon=1e-6,
gamma=.99,
stochastic_state_embedding=lambda original_state: tfd.Independent(
tfd.Deterministic(loc=wae_mdp.state_embedding_function(
original_state,
ergodic_batched_labeling_function(
labeling_functions['LunarLanderContinuous-v2']
)(original_state))),
reinterpreted_batch_ndims=1)
)
value_difference = tf.abs(discounted_reward_metric.result() - latent_mdp_values)
tf.print("Value difference: {:.6g}".format(value_difference))
end = time.time() - start
print("Time to compute the value difference: {:2f} sec".format(end))
Value difference: 0.0372883
Time to compute the value difference: 480.655583 sec
Time-to-failure property: $\neg \mathsf{SafeLanding} \, \mathcal{U} \, \mathsf{Reset}$
safe_landing = lambda latent_state: tf.logical_and(
tf.cast(latent_state[..., 1], tf.bool),
tf.cast(latent_state[..., 5], tf.bool))
start = time.time()
with tf.device('/CPU:0'):
values = C_until_T_values(
C_fn=lambda latent_state: tf.math.logical_not(safe_landing(latent_state)),
T_fn=lambda latent_state: is_reset_state(latent_state, wae_mdp.atomic_prop_dims),
transition_matrix=latent_transition_fn.to_dense(),
latent_state_size=wae_mdp.latent_state_size,
A=wae_mdp.number_of_discrete_actions,
latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
gamma=0.99,)
p_init = get_p_init(
wae_mdp,
tf_env.current_time_step().observation['state'],
latent_transition_fn,
'LunarLanderContinuous-v2',)
# get the values for the initial distribution
p_init_values = tf.reduce_sum(
p_init * values
) / tf.reduce_sum(p_init)
tf.print("property values: {:.6g}".format(p_init_values))
end = time.time() - start
print("Time to compute the values of the property: {:2g} sec".format(end))
property values: 0.0702039
Time to compute the values of the property: 137.057 sec