# Evaluation horizons, relative to the start of the evaluation time interval.
# -1 means the whole episode.
time_interval: [1, 5, 10, -1]
# How values within a horizon are reduced:
#   mean: aggregation over the time interval
#   last: last value in the time interval
evaluation_type: [mean, last]
# Metrics to compute.
#   mse: mean squared error
#   mlh: marginal log-likelihood (optional, not enabled here)
metric: [mse]
# NOTE(review): unit is not visible in this file (presumably training
# epochs or iterations between evaluations) -- confirm against the consumer.
eval_interval: 50
# NOTE(review): presumably triggers an evaluation before training starts
# -- confirm.
initial_eval: true
# NOTE(review): presumably indices of the samples/trajectories to animate
# -- confirm.
animation_indices: [0, 5, 10, 15, 20, 25]
# Result key used for early stopping. It matches the combination
# time_interval=-1 (whole episode), evaluation_type=mean, metric=mse.
early_stopping_metric: "full_rollout_mean_mse"
# NOTE(review): semantics of this fraction are not visible in this file
# -- confirm against the early-stopping implementation.
early_stopping_majority_needed: 0.3
# Context sizes are not set here; they are interpolated from the `env`
# config node.
context_test_sizes: ${env.context_test_sizes}
context_val_sizes: ${env.context_val_sizes}

# The settings above yield result-dictionary keys that combine the time
# interval, the evaluation type and the metric, for example:
#   5_step_mean_mse, full_rollout_last_mse, ...




