# Environment settings
env: "hopper"
# Path to the environment class; left null in this file.
classdef: null
# Required (??? marks a mandatory value), e.g. "medium-expert-v2".
d4rl_config: ???
# The remaining entries are interpolated from `env.*`.
# NOTE(review): presumably these resolve against the root-level `env` node of
# the composed config rather than the `env` string above -- confirm.
term_func: ${env.term_func}      # termination function of the environment
reward_func: ${env.reward_func}
# NOTE(review): an earlier comment described this as an identity function;
# the actual value is whatever env.obs_preproc resolves to -- confirm.
obs_preproc: ${env.obs_preproc}
obs_postproc: ${env.obs_postproc}
targ_proc: ${env.targ_proc}

# MPC policy settings
horizon: ???                  # required: horizon length used during trajectory optimization
use_lamb_return: true         # use the lambda return when true
plan_every: 1
population_size: ???          # required: number of trajectories
# Follows dynamics.num_elites, i.e. one particle per elite model (this is
# num_elites, not ensemble_size).
num_particles: ${overrides.dynamics.num_elites}
use_value_func: true          # learn a value function for use during trajectory optimization
use_true_reward: false        # set to true to use the environment's true reward function
# Written as a plain integer on purpose: `2e6` loads as a float under
# OmegaConf and as a string under YAML 1.1 loaders such as PyYAML.
replay_buffer_size: 2000000
collector_type: "step"        # "step": one sample at a time; "path": one path at a time
model_dir: null               # root directory searched for state_dicts; unset here
discount: 0.99

# NOTE(review): no value was given for indep_var, so it is spelled out as
# null; confirm it was not meant to hold something.
indep_var: null
# Components of the experiment name; each entry embeds the setting it
# interpolates (horizon, population size, optimizer sigma and kappa).
exp_name:
  - H_${overrides.horizon}
  - N_${overrides.population_size}
  - S_${overrides.optimizer_cfg.sigma}
  - Ka_${overrides.optimizer_cfg.kappa}

optimizer_cfg:
  # No iterations for MPPI (in MBOP).
  max_iters: 1
  # When true, mix with the previous plan using the coefficient `mu`.
  use_mu_mix: false
  # sigma and kappa are also rendered into exp_name, so their literal
  # spelling here shows up in experiment names.
  sigma: 0.01
  mu: 0.0
  beta: 1.0
  kappa: 3.0
  # CEM: soft update of the action mean (contribution of the previous action).
  polyak: 0.1
  # CEM: ratio of elite plans to use.
  elite_ratio: 0.2

value_func: learned
value_func_cfg:
  _target_: rlkit.torch.models.ensemble.FlattenEnsembleMLP
  ensemble_size: ???    # required (??? marks a mandatory value)
  is_qf: true
  # One of "min", "mean" or "lcb".
  sampling_method: mean
  # Coefficient c in "mu - c * std".
  lcb_coeff: 1
  normalize_inputs: false
  normalize_outputs: false
  num_hidden_layer: 3
  activation_func: torch.relu
  layer_size: 256
  use_decay: false

dynamics:
  _target_: rlkit.torch.models.dynamics_models.model.DynamicsModel
  ensemble_model:
    # MBOP originally uses a deterministic ensemble.
    _target_: rlkit.torch.models.ensemble.Ensemble
    ensemble_size: 7
    propagation_method: "fixed_model"
    layer_size: 200
    num_hidden_layer: 4
    # ELU can also be an option (per the MBOP author).
    activation_func: torch.nn.functional.silu
    # Open question left by the author: offline RL may not need weight decay.
    use_decay: true
  learn_reward: true             # also learn the reward model
  separate_reward_func: false    # instantiate a separate module for the reward function
  # Needs to be set to false if a deterministic ensemble is used.
  # NOTE(review): the ensemble_model comment mentions a deterministic ensemble
  # while this flag is true -- confirm which kind `Ensemble` is.
  learn_logstd_min_max: true
  num_elites: 5
  normalize_inputs: true
  normalize_outputs: true

offline_cfg:
  checkpoint_name: ???    # required (??? marks a mandatory value)

  # NOTE(review): presumably "bc" stands for behavior cloning -- confirm.
  policy_prior: "bc"
  # Ensemble used as the prior model; 5 elites out of 7 members.
  prior_model:
    _target_: rlkit.torch.models.ensemble.Ensemble
    propagation_method: "fixed_model"
    ensemble_size: 7
    num_elites: 5
    layer_size: 256
    num_hidden_layer: 3
    activation_func: torch.relu
    include_prev_action_as_input: false
    use_decay: false
    normalize_inputs: true
    normalize_outputs: true
