import sys
import os

path = os.path.dirname(os.path.abspath("__file__"))
sys.path.insert(0, path + '/..')

import base64
import IPython
import importlib
import logging
logging.getLogger().setLevel(logging.ERROR)
import random
import time
from collections import namedtuple

from tf_agents.environments import suite_gym, suite_dm_control, parallel_py_environment
from tf_agents.environments import tf_py_environment, FlattenObservationsWrapper
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer, episodic_replay_buffer
from tf_agents.drivers import dynamic_episode_driver, dynamic_step_driver
from tf_agents.trajectories import time_step as ts, policy_step, trajectory
from tf_agents.utils import common
from tf_agents.policies import TFPolicy

import tensorflow as tf
tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(3)
import tensorflow_probability as tfp
tfd = tfp.distributions

import numpy as np
import json

from reinforcement_learning import labeling_functions
import reinforcement_learning.environments
from reinforcement_learning.environments import EnvironmentLoader, perturbed_env
from reinforcement_learning.metrics import AverageDiscountedReturnMetric
from policies.saved_policy import SavedTFPolicy
from policies.epsilon_mimic import EpsilonMimicPolicy
from policies.latent_policy import LatentPolicyOverRealStateAndActionSpaces

from verification import model, local_losses, binary_latent_space
from verification.local_losses import compute_values_from_initial_distribution
from verification.value_iteration import value_iteration
from util.io.dataset_generator import ergodic_batched_labeling_function, is_reset_state

from util.io import video
import wasserstein_mdp

from typing import Callable, Optional
from tf_agents.typing.types import Float, Bool

# set seed
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

def embed_mp4(filename):
  """Embeds an mp4 file in the notebook."""
  # read the video with a context manager and avoid shadowing the `video` module
  with open(filename, 'rb') as f:
    video_data = f.read()
  b64 = base64.b64encode(video_data)
  tag = '''
  <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
  Your browser does not support the video tag.
  </video>'''.format(b64.decode())

  return IPython.display.HTML(tag)
def display_state_space(py_env):
    print("state space shape:", py_env.observation_spec().shape)
    try:
        print("state space max values:", py_env.observation_spec().maximum)
        print("state space min values:", py_env.observation_spec().minimum)
    except AttributeError:
        # the observation spec does not define explicit bounds
        pass

def display_action_space(py_env):
    if py_env.action_spec().dtype in [np.int64, np.int32]:
        print("discrete action space")
        print("number of discrete actions:", py_env.action_spec().maximum + 1)
    else:
        print("continuous action space")
        print("action space shape:", py_env.action_spec().shape)
        print("action space max values:", py_env.action_spec().maximum)
        print("action space min values:", py_env.action_spec().minimum)

Verification utils¶

from util.io.dataset_generator import is_reset_state
from verification.local_losses import PolicyDecorator

@tf.function
def get_p_init(
    wae_mdp,
    original_state,
    latent_transition_fn,
    environment_name,
):
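    """
    Initial distribution over latent states: the environment's (all-zero) reset state
    is embedded into the latent space, and p_init is obtained by taking one step of
    the latent policy and the latent transition function from this embedded reset
    state, then discarding the probability mass assigned to latent reset states.
    """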
    latent_state_space = binary_latent_space(wae_mdp.latent_state_size)
    is_reset_state_test_fn = lambda latent_state: is_reset_state(latent_state, wae_mdp.atomic_prop_dims)
    original_reset_state = tf.tile(tf.zeros_like(original_state[:1, ...]), [tf.shape(latent_state_space)[0], 1])
    reset_state = wae_mdp.state_embedding_function(
        original_reset_state,
        ergodic_batched_labeling_function(
            labeling_functions[environment_name]
        )(original_reset_state))
    reset_state = tf.cast(reset_state, tf.float32)
    
    latent_action_space = tf.one_hot(
        indices=tf.range(wae_mdp.number_of_discrete_actions),
        depth=tf.cast(wae_mdp.number_of_discrete_actions, tf.int32),
        dtype=tf.float32)

    return tf.reduce_sum(
        tf.transpose(
            PolicyDecorator(wae_mdp.get_latent_policy(action_dtype=tf.int64))(
                reset_state
            ).probs_parameter()
        ) * tf.map_fn(
            fn=lambda latent_action: latent_transition_fn(
                reset_state,
                tf.tile(tf.expand_dims(latent_action, 0), [tf.shape(latent_state_space)[0], 1]),
            ).prob(
                tf.cast(latent_state_space, tf.float32),
                full_latent_state_space=True),
            elems=latent_action_space),
        axis=0) * (1. - tf.cast(is_reset_state_test_fn(latent_state_space), tf.float32))
def C_until_T_values(
    C_fn: Callable[[Float], Bool],
    T_fn: Callable[[Float], Bool],
    transition_matrix: Float,
    latent_state_size: int,
    A: int,
    latent_policy: TFPolicy,
    gamma: Float = 0.99,
    transition_to_T_reward: Optional[Float] = None,
) -> Float:
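    """
    Value, in each latent state, of the constrained reachability property C U T
    ("C until T") under the latent policy: states violating C and states satisfying T
    are made absorbing, a reward of 1 (or `transition_to_T_reward`, if provided) is
    collected when transitioning into T, and the values are computed by discounted
    value iteration; the values of T-states are then set to 1 (or to the given reward).
    """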
    
    S = tf.pow(2, latent_state_size)
    state_space = binary_latent_space(latent_state_size, dtype=tf.float32)
    
    # make absorbing ¬C and T
    absorbing_states = lambda latent_state: tf.math.logical_or(
        tf.math.logical_not(C_fn(latent_state)),
        T_fn(latent_state))
    
    # reward of 1 when transitioning to T;
    # set it to the input values if provided
    reward_objective = tf.ones(
        shape=(S, A, S),
    ) * tf.cast(T_fn(state_space), tf.float32)
    if transition_to_T_reward is not None:
        reward_objective *= transition_to_T_reward
    
    policy_probs = PolicyDecorator(
        latent_policy
    )(state_space).probs_parameter()
    
    values = value_iteration(
        latent_state_size=latent_state_size,
        num_actions=A,
        transition_fn=transition_matrix,
        reward_fn=reward_objective,
        gamma=gamma,
        policy_probs=policy_probs,
        epsilon=1e-6,
        v_init=tf.zeros(S, dtype=tf.float32),
        episodic_return=True,
        is_reset_state_test_fn=absorbing_states,
        error_type='absolute',
        transition_matrix=transition_matrix,
        reward_matrix=reward_objective,)
    
    # set the values of the target states to either one or the input values if provided
    if transition_to_T_reward is None:
        values = values + tf.cast(T_fn(state_space), tf.float32)
    else:
        values = values + (tf.cast(T_fn(state_space), tf.float32) * transition_to_T_reward)
    
    return values
def reach_C_then_T_values(
    C_fn: Callable[[Float], Bool],
    T_fn: Callable[[Float], Bool],
    transition_matrix: Float,
    latent_state_size: int,
    A: int,
    latent_policy: TFPolicy,
    gamma: Float = 0.99,
) -> Float:
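    """
    Value, in each latent state, of eventually transitioning from a C-state directly
    to a T-state under the latent policy: the latent MDP is augmented with an
    absorbing sink state, transitions from C to T are redirected to that sink, a
    reward of 1 is collected upon entering it, and the values are computed by
    discounted value iteration on the augmented MDP (the returned vector thus has
    one extra entry, for the sink state).
    """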
    
    S = tf.pow(2, latent_state_size)
    state_space = binary_latent_space(latent_state_size, dtype=tf.float32)

    C = C_fn(state_space)
    T = T_fn(state_space)
    all_states = tf.ones(shape=(S, A, S))
    
    # detect when the agent transitions from C to T
    # set C-state rows to 1 
    from_C = tf.transpose(all_states * tf.cast(C, tf.float32))
    # set T-state columns to 1 
    to_T = all_states * tf.cast(T, tf.float32)
    C_to_T_transitions = from_C * to_T
    
    # augment the MDP with a new absorbing state to which C-states transition
    # instead of transitioning to T-states
    #
    # get the probability of transitioning from C to T
    C_to_T_probs = tf.reduce_sum(transition_matrix * C_to_T_transitions, axis=-1)

    # redirect the transitions from C to the new absorbing state
    augmented_transition_matrix = tf.concat(
        # set the probabilities of transitioning from C to T to 0.
        [transition_matrix * (1. - C_to_T_transitions),
         # set the transition probabilities to the absorbing state to those
         # of transitioning to T
         tf.expand_dims(C_to_T_probs, axis=-1)],
        axis=-1)
    # create a new sink state
    sink_state_probs = tf.concat([
            tf.zeros(shape=(1, A, S)),
            tf.ones(shape=(1, A, 1))
        ], axis=-1)
    # add this sink state to the transition matrix of the augmented MDP
    augmented_transition_matrix = tf.concat([
        augmented_transition_matrix,
        sink_state_probs,
    ], axis=0)

    # enable some random actions for the sink state
    policy_probs = PolicyDecorator(
        latent_policy
    )(state_space).probs_parameter()
    policy_probs = tf.concat([
        policy_probs,
        tf.pow(
            tf.cast(A, tf.float32), -1.
        ) * tf.ones(shape=(1, A))
    ], axis=0)
    
    # reward of 1 when transitioning to the sink state
    reward_objective = tf.concat([
        tf.zeros(shape=(S, A, S)),
        # add a last column full of ones
        tf.ones(shape=(S, A, 1))
    ], axis=-1)
    reward_objective = tf.concat([
        # add a last row full of zeros
        reward_objective,
        tf.zeros(shape=(1, A, S + 1))
    ], axis=0)
    
    return value_iteration(
        latent_state_size=latent_state_size,
        num_actions=A,
        transition_fn=augmented_transition_matrix,
        reward_fn=reward_objective,
        gamma=gamma,
        policy_probs=policy_probs,
        epsilon=1e-6,
        v_init=tf.zeros(S + 1, dtype=tf.float32),
        episodic_return=False,
        error_type='absolute',
        transition_matrix=augmented_transition_matrix,
        reward_matrix=reward_objective,)

CartPole¶

RL policy (DQN)¶

video_path = 'policy_videos/cartpole_dqn'

with suite_gym.load('CartPole-v0') as py_env:
    py_env.seed(seed)
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    
    display_state_space(py_env)
    display_action_space(py_env)

    policy_dir = '../reinforcement_learning/saves/CartPole-v0/policy'
    policy = SavedTFPolicy(policy_dir)
    num_episodes=30

    reward_metric = tf_metrics.AverageReturnMetric()
    
    video_observer = video.VideoEmbeddingObserver(
        py_env, video_path, num_episodes=num_episodes)
    dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        policy,
        num_episodes=num_episodes,
        observers=[
            reward_metric,
            video_observer,
        ]).run()

    tf.print(f'avg. episode return: {reward_metric.result():.6g}')
    tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')

embed_mp4(video_observer.file_name)
state space shape: (4,)
state space max values: [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
state space min values: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
discrete action space
number of discrete actions: 2
avg. episode return: 200
std: 0

Distilled policy¶

wae_model_path = 'saved_models/experiments/CartPole-v0/model/'

with open(os.path.join(wae_model_path, 'model_infos.json'), 'r') as f:
    wae_data = json.load(f)
    print(wae_data)

wae_mdp = wasserstein_mdp.load(wae_model_path)
print("WAE-MDP loaded")
print("WAE-MDP at training step {:d}".format(eval(wae_data['training_step'])))
print("Size of the latent state space: {:d}".format(2 ** wae_mdp.latent_state_size))
WAE-MDP at training step 120000
Size of the latent state space: 512
video_path = 'policy_videos/cartpole_wae_distillation'
with suite_gym.load('CartPole-v0') as py_env:
    py_env.seed(seed)
    py_env.reset()
    
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    original_state = tf_env.current_time_step().observation
    
    tf_env = wae_mdp.wrap_tf_environment(tf_env, labeling_functions['CartPole-v0'])
    policy = tf_env.wrap_latent_policy(wae_mdp.get_latent_policy(action_dtype=tf.int64))
    
    num_episodes=30
    reward_metric = tf_metrics.AverageReturnMetric()
    discounted_reward_metric = AverageDiscountedReturnMetric(
        gamma=.99, reward_scale=wae_mdp._dynamic_reward_scaling)
    video_observer = video.VideoEmbeddingObserver(
        py_env, video_path, num_episodes=num_episodes)
    
    dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env, policy, num_episodes=num_episodes,
        observers=[
            reward_metric,
            discounted_reward_metric,
            video_observer,
        ]).run()
    

tf.print(f'avg. episode return: {reward_metric.result():.6g}')
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
tf.print('avg. discounted (scaled) return:', discounted_reward_metric.result())
embed_mp4(video_observer.file_name)
avg. episode return: 200
std: 0
avg. discounted (scaled) return: 43.3010445
# PAC bounds for local losses
# the bound computed during training can already be found in the log file (wae_data)
epsilon = 1e-2
delta = 5e-3
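# sample size suggested by a Hoeffding-style bound: with probability at least
# 1 - delta, the empirical estimates deviate from the true losses by at most epsilon
# (the delta / 4 presumably accounts for a union bound over the estimated quantities)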
T = int(np.ceil(-np.log(delta / 4) / (2 * epsilon**2)))

with suite_gym.load(
    'CartPole-v0',
    env_wrappers=[lambda env: perturbed_env.PerturbedEnvironment(env, .75)]
) as py_env:
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    local_losses_metrics = wae_mdp.estimate_local_losses_from_samples(
        tf_env,
        steps=T,
        labeling_function=labeling_functions['CartPole-v0'],
        estimate_transition_function_from_samples=True,
        reward_scaling=wae_mdp._dynamic_reward_scaling,
        estimate_value_difference=False)

tf.print('Local reward loss: {:.6g}'.format(local_losses_metrics.local_reward_loss))
tf.print('Local transition loss: {:.6g}'.format(local_losses_metrics.local_transition_loss))
tf.print('Local transition loss (freq. estimation): {:.6g}'.format(
    local_losses_metrics.local_transition_loss_transition_function_estimation))
local_losses_metrics.print_time_metrics()
Local reward loss: 0.00499653
Local transition loss: 0.399636
Local transition loss (freq. estimation): 0.421809
Time metrics:
    Fill in the Replay Buffer (100000 frames): 119.436
    Estimate the local reward loss function (from 33424 transitions): 1.631
    Transition model generation (empirical frequency estimation, from 33424 transitions): 4.192
    Estimate the local transition loss function (from 33424 transitions): 0.065
    Estimate the local transition loss function via the frequency-estimated transition function:: 27.531
_latent_reward_fn = lambda latent_state, latent_action, next_latent_state: \
    wae_mdp._dynamic_reward_scaling * wae_mdp.reward_distribution(
        latent_state=tf.cast(latent_state, dtype=tf.float32),
        latent_action=tf.cast(latent_action, dtype=tf.float32),
        next_latent_state=tf.cast(next_latent_state, dtype=tf.float32),
    ).mode()
# since the reward distribution is deterministic, taking the mode
# recovers the value of its Dirac (point-mass) distribution

_latent_transition_fn = lambda latent_state, latent_action: \
        wae_mdp.discrete_latent_transition(
            tf.cast(latent_state, tf.float32),
            tf.cast(latent_action, tf.float32))

print('Local reward loss: {:.2g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.2g}'.format(eval(wae_data['local_transition_loss'])))

print('Transition/reward model generation')
start = time.time()

#  write the transition/reward functions to tensors,
#  to formally check the values in an efficient way
latent_transition_fn = model.TransitionFunctionCopy(
    num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
    num_actions=wae_mdp.number_of_discrete_actions,
    transition_function=_latent_transition_fn,
    epsilon=1e-6)

latent_reward_fn = model.RewardFunctionCopy(
    num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
    num_actions=wae_mdp.number_of_discrete_actions,
    reward_function=_latent_reward_fn,
    transition_function=_latent_transition_fn,
    epsilon=1e-6)

end = time.time() - start

print("Time to generate the model: {:.2g} sec".format(end))
Local reward loss: 0.0038
Local transition loss: 0.4
Transition/reward model generation
Time to generate the model: 2.1 sec
start = time.time()

latent_mdp_values = compute_values_from_initial_distribution(
    latent_state_size=wae_mdp.latent_state_size,
    atomic_prop_dims=wae_mdp.atomic_prop_dims,
    original_state=original_state,
    number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
    latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
    latent_transition_fn=latent_transition_fn,
    latent_reward_function=latent_reward_fn,
    epsilon=1e-6,
    gamma=.99,
    stochastic_state_embedding=lambda original_state: tfd.Independent(
        tfd.Deterministic(loc=wae_mdp.state_embedding_function(
            original_state,
            ergodic_batched_labeling_function(
                labeling_functions['CartPole-v0']
            )(original_state))),
        reinterpreted_batch_ndims=1)
)

value_difference = tf.abs(discounted_reward_metric.result() - latent_mdp_values)

tf.print("Value difference: {:.6g}".format(value_difference))

end = time.time() - start

print("Time to compute the value difference: {:2g} sec".format(end))
Value difference: 3.71213
Time to compute the value difference: 0.861585 sec

Time-to-failure property: $\neg\mathsf{Reset} \, \mathcal{U} \, \mathsf{Unsafe}$ where $\mathsf{Unsafe} \in \ell\left(s\right)$ iff the cart position is greater than 1.5 or the pole angle is greater than 9 degrees.
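
For reference, the $\mathsf{Unsafe}$ label used below is provided by labeling_functions['CartPole-v0'] (imported at the top of the notebook); a minimal, hypothetical sketch of such a check, assuming the standard CartPole observation layout [cart position, cart velocity, pole angle, pole angular velocity], could look like this:

import numpy as np
import tensorflow as tf

def cartpole_unsafe_label(state: tf.Tensor) -> tf.Tensor:
    # hypothetical sketch: unsafe iff |cart position| > 1.5
    # or |pole angle| > 9 degrees (the thresholds stated above)
    return tf.logical_or(
        tf.abs(state[..., 0]) > 1.5,
        tf.abs(state[..., 2]) > np.deg2rad(9.))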

start = time.time()

values = C_until_T_values(
    C_fn=lambda latent_state: tf.math.logical_not(is_reset_state(latent_state, wae_mdp.atomic_prop_dims)),
    T_fn=lambda latent_state: tf.logical_or(
        # unsafe position
        tf.cast(1. - latent_state[..., 0], tf.bool),
        # unsafe angle
        tf.cast(1. - latent_state[..., 1], tf.bool)),
    transition_matrix=latent_transition_fn.to_dense(),
    latent_state_size=wae_mdp.latent_state_size,
    A=wae_mdp.number_of_discrete_actions,
    latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
    gamma=0.99,)

p_init = get_p_init(
    wae_mdp,
    tf_env.current_time_step().observation['state'],
    latent_transition_fn,
    'CartPole-v0',)

# get the values for the initial distribution
p_init_values = tf.reduce_sum(
    p_init * values
) / tf.reduce_sum(p_init)


tf.print("property values: {:.6g}".format(p_init_values))

end = time.time() - start
print("Time to compute the values of the property: {:2g} sec".format(end))
property values: 0.031732
Time to compute the values of the property: 2.41673 sec

MountainCar¶

RL policy (DQN)¶

video_path = 'policy_videos/mountain_car_dqn'

with suite_gym.load('MountainCar-v0') as py_env:
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    
    display_state_space(py_env)
    display_action_space(py_env)

    policy_dir = '../reinforcement_learning/saves/MountainCar-v0/dqn_policy'
    policy = SavedTFPolicy(policy_dir)
    num_episodes=30

    reward_metric = tf_metrics.AverageReturnMetric()
    
    video_observer = video.VideoEmbeddingObserver(
        py_env, video_path, num_episodes=num_episodes)
    dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        policy,
        num_episodes=num_episodes,
        observers=[
            reward_metric,
            video_observer,
        ]).run()

    tf.print(f'avg. episode return: {reward_metric.result():.6g}')
    tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')

embed_mp4(video_observer.file_name)
state space shape: (2,)
state space max values: [0.6  0.07]
state space min values: [-1.2  -0.07]
discrete action space
number of discrete actions: 3
avg. episode return: -103.2
std: 7.18053

Distilled policy¶

wae_model_path = 'saved_models/hyperparameter_search/MountainCar-v0/model/'

with open(os.path.join(wae_model_path, 'model_infos.json'), 'r') as f:
    wae_data = json.load(f)
    print(wae_data)

wae_mdp = wasserstein_mdp.load(wae_model_path)

print("WAE-MDP loaded")
print("WAE-MDP at training step {:d}".format(eval(wae_data['training_step'])))
print("Size of the latent state space: {:d}".format(2 ** wae_mdp.latent_state_size))
print('Local reward loss: {:.6g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.6g}'.format(eval(wae_data['local_transition_loss'])))
WAE-MDP at training step 232000
Size of the latent state space: 1024
Local reward loss: 0.0141763
Local transition loss: 0.382323
video_path = 'policy_videos/mountain_car_wae_distillation'

def latent_labeling_fn(time_step):
    latent_state = time_step.observation['latent_state']
    return {
        'goal': latent_state.numpy()[..., 0],
    }

with suite_gym.load('MountainCar-v0') as py_env:
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    original_state = tf_env.current_time_step().observation
    
    tf_env = wae_mdp.wrap_tf_environment(tf_env, labeling_functions['MountainCar-v0'])
    policy = tf_env.wrap_latent_policy(wae_mdp.get_latent_policy(action_dtype=tf.int64))
    
    num_episodes=30
    reward_metric = tf_metrics.AverageReturnMetric()
    discounted_reward_metric = AverageDiscountedReturnMetric(
        gamma=.99, reward_scale=wae_mdp._dynamic_reward_scaling)
    video_observer = video.VideoEmbeddingObserver(
        py_env, video_path, num_episodes=num_episodes, labeling_function=latent_labeling_fn)
    
    dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env, policy, num_episodes=num_episodes,
        observers=[
            reward_metric,
            discounted_reward_metric,
            video_observer,
        ]).run()
    

tf.print('avg. episode return:', reward_metric.result())
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
tf.print('avg. discounted (scaled) return:', discounted_reward_metric.result())
embed_mp4(video_observer.file_name)
avg. episode return: -101.1
std: 5.12738
avg. discounted (scaled) return: -31.8745079
frequency_estimation = True

_latent_reward_fn = lambda latent_state, latent_action, next_latent_state: \
    wae_mdp._dynamic_reward_scaling * wae_mdp.reward_distribution(
        latent_state=tf.cast(latent_state, dtype=tf.float32),
        latent_action=tf.cast(latent_action, dtype=tf.float32),
        next_latent_state=tf.cast(next_latent_state, dtype=tf.float32),
    ).mode() 
# since the reward distribution is deterministic, taking the mode
# recovers the value of its Dirac (point-mass) distribution

_latent_transition_fn = lambda latent_state, latent_action: \
        wae_mdp.discrete_latent_transition(
            tf.cast(latent_state, tf.float32),
            tf.cast(latent_action, tf.float32))

#  write the transition/reward functions to tensors,
#  to formally check the values in an efficient way
print('Transition/reward model generation')
start = time.time()

if frequency_estimation:
    #  compute the transition tensor by frequency estimation and use the
    #  latent transition function learned during the WAE optimization as a backup
    with suite_gym.load(
        'MountainCar-v0',
        env_wrappers=[lambda env: perturbed_env.PerturbedEnvironment(env, .75)]
    ) as py_env:
        py_env.reset()
        tf_env = tf_py_environment.TFPyEnvironment(py_env)
        latent_transition_fn = model.estimate_latent_transition_function_from_samples(
            environment=tf_env,
            n_steps=100000,
            state_embedding_function=wae_mdp.state_embedding_function,
            action_embedding_function=wae_mdp.action_embedding_function,
            labeling_function=labeling_functions['MountainCar-v0'],
            latent_state_size=wae_mdp.latent_state_size,
            number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
            latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
            backup_transition_fn=_latent_transition_fn)
else:
    latent_transition_fn = model.TransitionFunctionCopy(
        num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
        num_actions=wae_mdp.number_of_discrete_actions,
        transition_function=_latent_transition_fn,
        epsilon=0.)

latent_reward_fn = model.RewardFunctionCopy(
    num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
    num_actions=wae_mdp.number_of_discrete_actions,
    reward_function=_latent_reward_fn,
    transition_function=_latent_transition_fn,
    epsilon=1e-6)

end = time.time() - start

print("Time to generate the model: {:.2g} sec".format(end))
Transition/reward model generation
Time to generate the model: 1.2e+02 sec
start = time.time()

latent_mdp_values = compute_values_from_initial_distribution(
    latent_state_size=wae_mdp.latent_state_size,
    atomic_prop_dims=wae_mdp.atomic_prop_dims,
    original_state=original_state,
    number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
    latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
    latent_transition_fn=latent_transition_fn,
    latent_reward_function=latent_reward_fn,
    epsilon=1e-6,
    gamma=.99,
    stochastic_state_embedding=lambda original_state: tfd.Independent(
        tfd.Deterministic(loc=wae_mdp.state_embedding_function(
            original_state,
            ergodic_batched_labeling_function(
                labeling_functions['MountainCar-v0']
            )(original_state))),
        reinterpreted_batch_ndims=1)
)

value_difference = tf.abs(discounted_reward_metric.result() - latent_mdp_values)

tf.print("Value difference: {:.6g}".format(value_difference))

end = time.time() - start

print("Time to compute the value difference: {:2g} sec".format(end))
Value difference: 2.83714
Time to compute the value difference: 1.27584 sec

Time-to-failure property: $\neg\mathsf{Goal} \, \mathcal{U} \, \mathsf{Reset}$ where $\mathsf{Goal} \in \ell\left(s\right)$ iff the car reaches the top of the mountain, at the yellow-flag position.
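
For reference, the $\mathsf{Goal}$ label used below comes from labeling_functions['MountainCar-v0']; a minimal, hypothetical sketch, assuming the standard MountainCar observation [position, velocity] and Gym's flag position of 0.5, could be:

import tensorflow as tf

def mountain_car_goal_label(state: tf.Tensor) -> tf.Tensor:
    # hypothetical sketch: the goal is reached iff the car's position
    # is at least the flag position (0.5 in Gym's MountainCar-v0)
    return state[..., 0] >= 0.5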

start = time.time()

values = C_until_T_values(
    C_fn=lambda latent_state: tf.math.logical_not(tf.cast(latent_state[..., 0], tf.bool)),
    T_fn=lambda latent_state: is_reset_state(latent_state, wae_mdp.atomic_prop_dims),
    transition_matrix=latent_transition_fn.to_dense(),
    latent_state_size=wae_mdp.latent_state_size,
    A=wae_mdp.number_of_discrete_actions,
    latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
    gamma=0.99,)

p_init = get_p_init(
    wae_mdp,
    tf_env.current_time_step().observation,
    latent_transition_fn,
    'MountainCar-v0',)

# get the values for the initial distribution
p_init_values = tf.reduce_sum(
    p_init * values
) / tf.reduce_sum(p_init)


tf.print("property values: {:.6g}".format(p_init_values))

end = time.time() - start
print("Time to compute the values of the property: {:2g} sec".format(end))
property values: 0
Time to compute the values of the property: 0.327113 sec

Acrobot¶

RL policy (DQN, trained in an environment with random initial states)¶

video_path = 'policy_videos/acrobot_dqn'

with suite_gym.load('Acrobot-v1') as py_env:
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    
    display_state_space(py_env)
    display_action_space(py_env)

    policy_dir = '../reinforcement_learning/saves/AcrobotRandomInit-v1/dqn_policy'
    policy = SavedTFPolicy(policy_dir)
    num_episodes=30

    reward_metric = tf_metrics.AverageReturnMetric()
    
    video_observer = video.VideoEmbeddingObserver(
        py_env, video_path, num_episodes=num_episodes)
    dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        policy,
        num_episodes=num_episodes,
        observers=[
            reward_metric,
            video_observer,
        ]).run()

    tf.print('avg. episode return:', reward_metric.result())
    tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')

embed_mp4(video_observer.file_name)
state space shape: (6,)
state space max values: [ 1.        1.        1.        1.       12.566371 28.274334]
state space min values: [ -1.        -1.        -1.        -1.       -12.566371 -28.274334]
discrete action space
number of discrete actions: 3
avg. episode return: -70.1
std: 14.8893

Distilled policy¶

wae_model_path = 'saved_models/experiments/Acrobot-v1/model/'

with open(os.path.join(wae_model_path, 'model_infos.json'), 'r') as f:
    wae_data = json.load(f)
    print(wae_data)

wae_mdp = wasserstein_mdp.load(wae_model_path)

print("WAE-MDP loaded")
print("WAE-MDP at training step {:d}".format(eval(wae_data['training_step'])))
print("Size of the latent state space: {:d}".format(2 ** wae_mdp.latent_state_size))
print('Local reward loss: {:.6g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.6g}'.format(eval(wae_data['local_transition_loss'])))
WAE-MDP at training step 430000
Size of the latent state space: 8192
Local reward loss: 0.0347698
Local transition loss: 0.649478
video_path = 'policy_videos/acrobot_wae_distillation'

def latent_labeling_fn(time_step):
    latent_state = time_step.observation['latent_state']
    return {
        'goal': latent_state.numpy()[..., 0],
    }

with suite_gym.load('Acrobot-v1') as py_env:
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    original_state = tf_env.current_time_step().observation
    
    tf_env = wae_mdp.wrap_tf_environment(tf_env, labeling_functions['Acrobot-v1'])
    policy = tf_env.wrap_latent_policy(wae_mdp.get_latent_policy(action_dtype=tf.int64))
    
    num_episodes=30
    reward_metric = tf_metrics.AverageReturnMetric()
    discounted_reward_metric = AverageDiscountedReturnMetric(
        gamma=.99, reward_scale=wae_mdp._dynamic_reward_scaling)
    video_observer = video.VideoEmbeddingObserver(
        py_env, video_path, num_episodes=num_episodes, labeling_function=latent_labeling_fn)
    
    dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env, policy, num_episodes=num_episodes,
        observers=[
            reward_metric,
            discounted_reward_metric,
            video_observer,
        ]).run()
    

tf.print('avg. episode return:', reward_metric.result())
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
tf.print('avg. discounted (scaled) return:', discounted_reward_metric.result())
embed_mp4(video_observer.file_name)
avg. episode return: -80.5
std: 9.58384
avg. discounted (scaled) return: -27.6321716
frequency_estimation = False

_latent_reward_fn = lambda latent_state, latent_action, next_latent_state: \
    wae_mdp._dynamic_reward_scaling * wae_mdp.reward_distribution(
        latent_state=tf.cast(latent_state, dtype=tf.float32),
        latent_action=tf.cast(latent_action, dtype=tf.float32),
        next_latent_state=tf.cast(next_latent_state, dtype=tf.float32),
    ).mode() 
# since the reward distribution is deterministic, taking the mode
# recovers the value of its Dirac (point-mass) distribution

_latent_transition_fn = lambda latent_state, latent_action: \
        wae_mdp.discrete_latent_transition(
            tf.cast(latent_state, tf.float32),
            tf.cast(latent_action, tf.float32))

print('Local reward loss: {:.2g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.2g}'.format(eval(wae_data['local_transition_loss'])))

#  write the transition/reward functions to tensors,
#  to formally check the values in an efficient way
print('Transition/reward model generation')

start = time.time()

if frequency_estimation:
    #  compute the transition tensor by frequency estimation and use the
    #  latent transition function learned during the WAE optimization as a backup
    with suite_gym.load(
        'Acrobot-v1',
        env_wrappers=[lambda env: perturbed_env.PerturbedEnvironment(env, .75)]
    ) as py_env:
        py_env.reset()
        tf_env = tf_py_environment.TFPyEnvironment(py_env)
        latent_transition_fn = model.estimate_latent_transition_function_from_samples(
            environment=tf_env,
            n_steps=100000,
            state_embedding_function=wae_mdp.state_embedding_function,
            action_embedding_function=wae_mdp.action_embedding_function,
            labeling_function=labeling_functions['Acrobot-v1'],
            latent_state_size=wae_mdp.latent_state_size,
            number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
            latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
            backup_transition_fn=_latent_transition_fn)
else:
    latent_transition_fn = model.TransitionFunctionCopy(
        num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
        num_actions=wae_mdp.number_of_discrete_actions,
        transition_function=_latent_transition_fn,
        epsilon=0.)

latent_reward_fn = model.RewardFunctionCopy(
    num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
    num_actions=wae_mdp.number_of_discrete_actions,
    reward_function=_latent_reward_fn,
    transition_function=_latent_transition_fn,
    epsilon=1e-6)

end = time.time() - start

print("Time to generate the model: {:.2f} sec".format(end))
Local reward loss: 0.035
Local transition loss: 0.65
Transition/reward model generation
Time to generate the model: 189.49 sec
start = time.time()

with tf.device('/CPU:0'):
    latent_mdp_values = compute_values_from_initial_distribution(
        latent_state_size=wae_mdp.latent_state_size,
        atomic_prop_dims=wae_mdp.atomic_prop_dims,
        original_state=original_state,
        number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
        latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
        latent_transition_fn=latent_transition_fn,
        latent_reward_function=latent_reward_fn,
        epsilon=1e-6,
        gamma=.99,
        stochastic_state_embedding=lambda original_state: tfd.Independent(
            tfd.Deterministic(loc=wae_mdp.state_embedding_function(
                original_state,
                ergodic_batched_labeling_function(
                    labeling_functions['Acrobot-v1']
                )(original_state))),
            reinterpreted_batch_ndims=1)
    )

value_difference = tf.abs(discounted_reward_metric.result() - latent_mdp_values)

tf.print("Value difference: {:.6g}".format(value_difference))

end = time.time() - start

print("Time to compute the value difference: {:2f} sec".format(end))
Value difference: 2.22006
Time to compute the value difference: 207.473 sec

Time-to-failure property: $\neg\mathsf{Goal} \, \mathcal{U} \, \mathsf{Reset}$

start = time.time()

with tf.device('/CPU:0'):
    values = C_until_T_values(
        C_fn=lambda latent_state: tf.math.logical_not(tf.cast(latent_state[..., 0], tf.bool)),
        T_fn=lambda latent_state: is_reset_state(latent_state, wae_mdp.atomic_prop_dims),
        transition_matrix=latent_transition_fn.to_dense(),
        latent_state_size=wae_mdp.latent_state_size,
        A=wae_mdp.number_of_discrete_actions,
        latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
        gamma=0.99,)

    p_init = get_p_init(
        wae_mdp,
        tf_env.current_time_step().observation['state'],
        latent_transition_fn,
        'Acrobot-v1',)

# get the values for the initial distribution
p_init_values = tf.reduce_sum(
    p_init * values
) / tf.reduce_sum(p_init)


tf.print("property values: {:.6g}".format(p_init_values))

end = time.time() - start
print("Time to compute the values of the property: {:2g} sec".format(end))
property values: 0.0021911
Time to compute the values of the property: 4.66126 sec

Pendulum¶

RL policy (SAC, trained in an environment with random initial states)¶

video_path = 'policy_videos/pendulum_sac'

with suite_gym.load('Pendulum-v1') as py_env:
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    
    display_state_space(py_env)
    display_action_space(py_env)

    policy_dir = '../reinforcement_learning/saves/PendulumRandomInit-v0/sac_policy'
    policy = SavedTFPolicy(policy_dir)
    num_episodes=30

    reward_metric = tf_metrics.AverageReturnMetric()
    
    video_observer = video.VideoEmbeddingObserver(
        py_env, video_path, num_episodes=num_episodes)
    dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        policy,
        num_episodes=num_episodes,
        observers=[
            reward_metric,
            video_observer,
        ]).run()

tf.print('avg. episode return:', reward_metric.result())
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')

embed_mp4(video_observer.file_name)
state space shape: (3,)
state space max values: [1. 1. 8.]
state space min values: [-1. -1. -8.]
continuous action space
action space shape: (1,)
action space max values: 2.0
action space min values: -2.0
avg. episode return: -175.277344
std: 74.7762

Distilled policy¶

wae_model_path = 'saved_models/experiments/PendulumRandomInit-v1/model/'

with open(os.path.join(wae_model_path, 'model_infos.json'), 'r') as f:
    wae_data = json.load(f)
    print(wae_data)

wae_mdp = wasserstein_mdp.load(wae_model_path)

print("WAE-MDP loaded")
print("WAE-MDP at training step {:d}".format(eval(wae_data['training_step'])))
print("Size of the latent state space: {:d}".format(2 ** wae_mdp.latent_state_size))
print("Size of the latent action space: {:d}".format(wae_mdp.number_of_discrete_actions))
print('Local reward loss: {:.6g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.6g}'.format(eval(wae_data['local_transition_loss'])))
WAE-MDP at training step 370000
Size of the latent state space: 8192
Size of the latent action space: 3
Local reward loss: 0.0266745
Local transition loss: 0.539508
video_path = 'policy_videos/pendulum_wae_distillation'

def latent_labeling_fn(time_step):
    latent_state = time_step.observation['latent_state']
    return {
        'safe_region': latent_state.numpy()[..., 0],
    }

with suite_gym.load('Pendulum-v1') as py_env:
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    original_state = tf_env.current_time_step().observation
    
    tf_env = wae_mdp.wrap_tf_environment(tf_env, labeling_functions['Pendulum-v1'])
    policy = tf_env.wrap_latent_policy(wae_mdp.get_latent_policy(action_dtype=tf.int64))
    
    num_episodes=30
    reward_metric = tf_metrics.AverageReturnMetric()
    discounted_reward_metric = AverageDiscountedReturnMetric(
        gamma=.99, reward_scale=wae_mdp._dynamic_reward_scaling)
    video_observer = video.VideoEmbeddingObserver(
        py_env, video_path, num_episodes=num_episodes, labeling_function=latent_labeling_fn)
    
    dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env, policy, num_episodes=num_episodes,
        observers=[
            reward_metric,
            discounted_reward_metric,
            video_observer,
        ]).run()
    

tf.print('avg. episode return:', reward_metric.result())
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
tf.print('avg. discounted (scaled) return:', discounted_reward_metric.result())
embed_mp4(video_observer.file_name)
avg. episode return: -148.118195
std: 49.4227
avg. discounted (scaled) return: -3.87636757
frequency_estimation = False

_latent_reward_fn = lambda latent_state, latent_action, next_latent_state: \
    wae_mdp._dynamic_reward_scaling * wae_mdp.reward_distribution(
        latent_state=tf.cast(latent_state, dtype=tf.float32),
        latent_action=tf.cast(latent_action, dtype=tf.float32),
        next_latent_state=tf.cast(next_latent_state, dtype=tf.float32),
    ).mode() 
# since the reward distribution is deterministic, taking the mode
# recovers the value of its Dirac (point-mass) distribution

_latent_transition_fn = lambda latent_state, latent_action: \
        wae_mdp.discrete_latent_transition(
            tf.cast(latent_state, tf.float32),
            tf.cast(latent_action, tf.float32))

print('Local reward loss: {:.2g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.2g}'.format(eval(wae_data['local_transition_loss'])))

print('Transition/reward model generation')
#  write the transition/reward functions to tensors,
#  to formally check the values in an efficient way
start = time.time()

if frequency_estimation:
    # compute the latent transition function by frequency estimation and use the
    # latent transition function learned during the WAE optimization as a backup
    with suite_gym.load(
        'PendulumRandomInit-v1',
        env_wrappers=[lambda env: perturbed_env.PerturbedEnvironment(env, .75)]
    ) as py_env:
        py_env.reset()
        tf_env = tf_py_environment.TFPyEnvironment(py_env)
        latent_transition_fn = model.estimate_latent_transition_function_from_samples(
            environment=tf_env,
            n_steps=100000,
            state_embedding_function=wae_mdp.state_embedding_function,
            action_embedding_function=wae_mdp.action_embedding_function,
            labeling_function=labeling_functions['Pendulum-v1'],
            latent_state_size=wae_mdp.latent_state_size,
            number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
            latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
            backup_transition_fn=_latent_transition_fn)
else:
    latent_transition_fn = model.TransitionFunctionCopy(
        num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
        num_actions=wae_mdp.number_of_discrete_actions,
        transition_function=_latent_transition_fn,
        epsilon=0.)

latent_reward_fn = model.RewardFunctionCopy(
    num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
    num_actions=wae_mdp.number_of_discrete_actions,
    reward_function=_latent_reward_fn,
    transition_function=_latent_transition_fn,
    epsilon=1e-6)

end = time.time() - start

print("Time to generate the model: {:.2f} sec".format(end))
Local reward loss: 0.027
Local transition loss: 0.54
Transition/reward model generation
Time to generate the model: 145.44 sec
start = time.time()

with tf.device('/CPU:0'):
    latent_mdp_values = compute_values_from_initial_distribution(
        latent_state_size=wae_mdp.latent_state_size,
        atomic_prop_dims=wae_mdp.atomic_prop_dims,
        original_state=original_state,
        number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
        latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
        latent_transition_fn=latent_transition_fn,
        latent_reward_function=latent_reward_fn,
        epsilon=1e-6,
        gamma=.99,
        stochastic_state_embedding=lambda original_state: tfd.Independent(
            tfd.Deterministic(loc=wae_mdp.state_embedding_function(
                original_state,
                ergodic_batched_labeling_function(
                    labeling_functions['Pendulum-v1']
                )(original_state))),
            reinterpreted_batch_ndims=1)
    )

value_difference = tf.abs(discounted_reward_metric.result() - latent_mdp_values)

tf.print("Value difference: {:.6g}".format(value_difference))

end = time.time() - start

print("Time to compute the value difference: {:2f} sec".format(end))
Value difference: 4.33006
Time to compute the value difference: 168.700369 sec

The goal of the agent is to reach a safe region of the system (i.e., the pendulum is upright) and remain in this region until the end of the episode. The pendulum is safe iff it eventually remains upright for the rest of the episode, i.e., its angle with respect to the y-axis stays within a tight range of $60^{\circ} = {\pi}/{3}$ rad.

The system fails when the pendulum is outside the safe region as the episode ends (i.e., right before the reset): $\Diamond (\neg\mathsf{Safe} \wedge \bigcirc \mathsf{Reset})$
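
For reference, the $\mathsf{Safe}$ label used below comes from labeling_functions['Pendulum-v1']; a minimal, hypothetical sketch, assuming the standard Pendulum observation $[\cos\theta, \sin\theta, \dot{\theta}]$ and the ${\pi}/{3}$ rad threshold stated above, could be:

import numpy as np
import tensorflow as tf

def pendulum_safe_label(state: tf.Tensor) -> tf.Tensor:
    # hypothetical sketch: recover the angle from its cos/sin encoding and
    # test whether the pendulum is within pi/3 rad (60 degrees) of upright
    theta = tf.atan2(state[..., 1], state[..., 0])
    return tf.abs(theta) <= np.pi / 3.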

start = time.time()
with tf.device('/CPU:0'):
    values = reach_C_then_T_values(
        # the first label indicates whether the pendulum is safe or not
        C_fn=lambda latent_state: tf.logical_not(tf.cast(latent_state[..., 0], tf.bool)),
        T_fn=lambda latent_state: is_reset_state(latent_state, wae_mdp.atomic_prop_dims),
        transition_matrix=latent_transition_fn.to_dense(),
        latent_state_size=wae_mdp.latent_state_size,
        A=wae_mdp.number_of_discrete_actions,
        latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
        gamma=0.99,)
    p_init = get_p_init(
        wae_mdp,
        tf_env.current_time_step().observation['state'],
        latent_transition_fn,
        'Pendulum-v1',)
    # take into account the extra absorbing state
    p_init = tf.concat([p_init, [0.]], axis=-1)

# get the values for the initial distribution
p_init_values = tf.reduce_sum(
    p_init * values
) / tf.reduce_sum(p_init)

tf.print("property values: {:.6g}".format(p_init_values))

end = time.time() - start
print("Time to compute the values of the property: {:2g} sec".format(end))
property values: 0.104939
Time to compute the values of the property: 41.9842 sec

LunarLander Continuous¶

RL policy (SAC)¶

video_path = 'policy_videos/lunar_lander_sac'

with suite_gym.load('LunarLanderContinuous-v2') as py_env:
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    
    display_state_space(py_env)
    display_action_space(py_env)

    policy_dir = '../reinforcement_learning/saves/LunarLanderContinuous-v2/sac_policy'
    policy = SavedTFPolicy(policy_dir)
    num_episodes=30

    reward_metric = tf_metrics.AverageReturnMetric()
    
    video_observer = video.VideoEmbeddingObserver(
        py_env, video_path, num_episodes=num_episodes)
    dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        policy,
        num_episodes=num_episodes,
        observers=[
            reward_metric,
            video_observer,
        ]).run()

    tf.print(f'avg. episode return: {reward_metric.result():.6g}')
    tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')

embed_mp4(video_observer.file_name)
state space shape: (8,)
state space max values: 3.4028235e+38
state space min values: -3.4028235e+38
continuous action space
action space shape: (2,)
action space max values: 1.0
action space min values: -1.0
avg. episode return: 283.811
std: 22.3405

Distilled policy¶

wae_model_path = 'saved_models/experiments/LunarLanderContinuous-v2/model/'

with open(os.path.join(wae_model_path, 'model_infos.json'), 'r') as f:
    wae_data = json.load(f)
    print(wae_data)

wae_mdp = wasserstein_mdp.load(wae_model_path)

print("WAE-MDP loaded")
print("WAE-MDP at training step {:d}".format(eval(wae_data['training_step'])))
print("Size of the latent state space: {:d}".format(2 ** wae_mdp.latent_state_size))
print("Size of the latent action space: {:d}".format(wae_mdp.number_of_discrete_actions))
print('Local reward loss: {:.6g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.6g}'.format(eval(wae_data['local_transition_loss'])))
WAE-MDP at training step 320000
Size of the latent state space: 16384
Size of the latent action space: 3
Local reward loss: 0.0207205
Local transition loss: 0.131357
video_path = 'policy_videos/lunar_lander_wae_distillation'

def latent_labeling_fn(time_step):
    latent_state = time_step.observation['latent_state']
    return {
        'safe_angle': tf.logical_not(tf.cast(latent_state[..., 0], tf.bool)).numpy(),
        'safe_landing': tf.logical_and(
            tf.cast(latent_state[..., 1], tf.bool),
            tf.cast(latent_state[..., 5], tf.bool)
        ).numpy()
    }

with suite_gym.load('LunarLanderContinuous-v2') as py_env:
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    original_state = tf_env.current_time_step().observation
    
    tf_env = wae_mdp.wrap_tf_environment(tf_env, labeling_functions['LunarLanderContinuous-v2'])
    policy = tf_env.wrap_latent_policy(wae_mdp.get_latent_policy(action_dtype=tf.int64))
    
    num_episodes=30
    reward_metric = tf_metrics.AverageReturnMetric()
    discounted_reward_metric = AverageDiscountedReturnMetric(
        gamma=.99, reward_scale=wae_mdp._dynamic_reward_scaling)
    video_observer = video.VideoEmbeddingObserver(
        py_env, video_path, num_episodes=num_episodes,
        labeling_function=latent_labeling_fn, font_color='white')
    
    dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env, policy, num_episodes=num_episodes,
        observers=[
            reward_metric,
            discounted_reward_metric,
            video_observer,
        ]).run()
    

tf.print('avg. episode return:', reward_metric.result())
tf.print(f'std: {tf.math.reduce_std(reward_metric._buffer.data):.6g}')
tf.print('avg. discounted (scaled) return:', discounted_reward_metric.result())
embed_mp4(video_observer.file_name)
avg. episode return: 292.923981
std: 10.4602
avg. discounted (scaled) return: 0.426882565
frequency_estimation = False

_latent_reward_fn = lambda latent_state, latent_action, next_latent_state: \
    wae_mdp._dynamic_reward_scaling * wae_mdp.reward_distribution(
        latent_state=tf.cast(latent_state, dtype=tf.float32),
        latent_action=tf.cast(latent_action, dtype=tf.float32),
        next_latent_state=tf.cast(next_latent_state, dtype=tf.float32),
    ).mode() 
# since the reward distribution is deterministic, taking the mode
# recovers the value of its Dirac (point-mass) distribution

_latent_transition_fn = lambda latent_state, latent_action: \
        wae_mdp.discrete_latent_transition(
            tf.cast(latent_state, tf.float32),
            tf.cast(latent_action, tf.float32))

print('Local reward loss: {:.2g}'.format(eval(wae_data['local_reward_loss'])))
print('Local transition loss: {:.2g}'.format(eval(wae_data['local_transition_loss'])))

print('Transition/reward model generation')
#  write the transition/reward functions to tensors,
#  to formally check the values in an efficient way
start = time.time()

if frequency_estimation:
    # compute the latent transition function by frequency estimation and use the
    # latent transition function learned during the WAE optimization as a backup
    with suite_gym.load(
        'LunarLanderContinuous-v2',
        env_wrappers=[lambda env: perturbed_env.PerturbedEnvironment(env, .75)]
    ) as py_env:
        py_env.reset()
        tf_env = tf_py_environment.TFPyEnvironment(py_env)
        latent_transition_fn = model.estimate_latent_transition_function_from_samples(
            environment=tf_env,
            n_steps=100000,
            state_embedding_function=wae_mdp.state_embedding_function,
            action_embedding_function=wae_mdp.action_embedding_function,
            labeling_function=labeling_functions['LunarLanderContinuous-v2'],
            latent_state_size=wae_mdp.latent_state_size,
            number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
            latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
            backup_transition_fn=_latent_transition_fn)
else:
    with tf.device('/CPU:0'):
        latent_transition_fn = model.TransitionFunctionCopy(
            num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
            num_actions=wae_mdp.number_of_discrete_actions,
            transition_function=_latent_transition_fn,
            epsilon=1e-6)
with tf.device('/CPU:0'):
    latent_reward_fn = model.RewardFunctionCopy(
        num_states=tf.cast(tf.pow(2, wae_mdp.latent_state_size), dtype=tf.int32),
        num_actions=wae_mdp.number_of_discrete_actions,
        reward_function=_latent_reward_fn,
        transition_function=_latent_transition_fn,
        epsilon=1e-6)

end = time.time() - start

print("Time to generate the model: {:.2f} sec".format(end))
Local reward loss: 0.021
Local transition loss: 0.13
Transition/reward model generation
Time to generate the model: 922.83 sec
start = time.time()

with tf.device('/CPU:0'):
    latent_mdp_values = compute_values_from_initial_distribution(
        latent_state_size=wae_mdp.latent_state_size,
        atomic_prop_dims=wae_mdp.atomic_prop_dims,
        original_state=original_state,
        number_of_discrete_actions=wae_mdp.number_of_discrete_actions,
        latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
        latent_transition_fn=latent_transition_fn,
        latent_reward_function=latent_reward_fn,
        epsilon=1e-6,
        gamma=.99,
        stochastic_state_embedding=lambda original_state: tfd.Independent(
            tfd.Deterministic(loc=wae_mdp.state_embedding_function(
                original_state,
                ergodic_batched_labeling_function(
                    labeling_functions['LunarLanderContinuous-v2']
                )(original_state))),
            reinterpreted_batch_ndims=1)
    )

value_difference = tf.abs(discounted_reward_metric.result() - latent_mdp_values)

tf.print("Value difference: {:.6g}".format(value_difference))

end = time.time() - start

print("Time to compute the value difference: {:2f} sec".format(end))
Value difference: 0.0372883
Time to compute the value difference: 480.655583 sec

Time-to-failure property: $\neg \mathsf{SafeLanding} \, \mathcal{U} \, \mathsf{Reset}$

safe_landing = lambda latent_state: tf.logical_and(
        tf.cast(latent_state[..., 1], tf.bool),
        tf.cast(latent_state[..., 5], tf.bool))

start = time.time()

with tf.device('/CPU:0'):
    values = C_until_T_values(
        C_fn=lambda latent_state: tf.math.logical_not(safe_landing(latent_state)),
        T_fn=lambda latent_state: is_reset_state(latent_state, wae_mdp.atomic_prop_dims),
        transition_matrix=latent_transition_fn.to_dense(),
        latent_state_size=wae_mdp.latent_state_size,
        A=wae_mdp.number_of_discrete_actions,
        latent_policy=wae_mdp.get_latent_policy(action_dtype=tf.int64),
        gamma=0.99,)

    p_init = get_p_init(
        wae_mdp,
        tf_env.current_time_step().observation['state'],
        latent_transition_fn,
        'LunarLanderContinuous-v2',)

# get the values for the initial distribution
p_init_values = tf.reduce_sum(
    p_init * values
) / tf.reduce_sum(p_init)

tf.print("property values: {:.6g}".format(p_init_values))

end = time.time() - start
print("Time to compute the values of the property: {:2g} sec".format(end))
property values: 0.0702039
Time to compute the values of the property: 137.057 sec