# Environment-related settings
# NOTE(review): the `${env.*}` interpolations below are looked up from the
# config root, where `env` has to be a mapping. This presumes the file is
# composed under a sub-package so that the string-valued `env` key here does
# not shadow that node -- TODO confirm.
env: "Hopper"
classdef: null              # The path to the environment class (unset here)
d4rl_config: ???            # Mandatory, no default. E.g., "medium-expert-v0"
term_func: ${env.term_func}  # Termination function of the environment
reward_func: ${env.reward_func}  # Reward function, read from the env config
obs_preproc: rlkit.envs.env_processor.DefaultEnvProc.obs_preproc  # Identity function. Could use ${env.obs_preproc} instead
obs_postproc: rlkit.envs.env_processor.DefaultEnvProc.obs_postproc
targ_proc: rlkit.envs.env_processor.DefaultEnvProc.targ_proc

# NOTE(review): scientific notation is not an integer literal in YAML --
# confirm the consumer casts this value to int before using it as a size.
replay_buffer_size: 2e6
collector_type: "step"      # 'step' -- one sample at a time, 'path' -- a path at a time
model_dir: null             # Root directory in which to look for state_dicts (unset here)

# Both keys are intentionally unset (null) in this file.
# NOTE(review): presumably filled in per experiment/run -- TODO confirm.
indep_var: null
exp_name: null

# Sampling settings.
# NOTE(review): `type: var` presumably selects a variance-based uncertainty
# penalty scaled by `penalty_coeff` -- TODO confirm against the consumer.
sampling_cfg:
  type: var
  penalty_coeff: ???  # Mandatory, no default; must be supplied by the user

# Top-level network sizing. These are separate from
# dynamics.ensemble_model.layer_size / num_hidden_layer, which set their own
# values (200 / 4).
# NOTE(review): presumably consumed by the policy / Q-function networks --
# TODO confirm.
layer_size: 256
num_hidden_layer: 2

# Dynamics model configuration. The `_target_` keys give the dotted paths of
# the classes to build (Hydra instantiate convention).
dynamics:
  _target_: rlkit.torch.models.dynamics_models.model.DynamicsModel
  ensemble_model:
    _target_: rlkit.torch.models.probabilistic_ensemble.ProbabilisticEnsemble  # MOPO uses a probabilistic ensemble
    ensemble_size: 7
    propagation_method: "random_model"
    layer_size: 200
    num_hidden_layer: 4
    activation_func: torch.nn.functional.silu
    use_decay: true
  learn_reward: true                  # Whether to learn the reward model as well
  separate_reward_func: false         # Whether to instantiate a separate module for the reward function
  learn_logstd_min_max: true
  learning_rate: 1e-3
  batch_size: 256
  max_epochs_since_last_update: 5
  # Unset here. NOTE(review): presumably means no fixed number of model
  # learning epochs -- TODO confirm.
  num_model_learning_epochs: null
  # NOTE(review): presumably how many of the `ensemble_size` members are kept
  # as elites -- TODO confirm.
  num_elites: 5
  normalize_inputs: true
  normalize_outputs: true             # Dynamics model learns to output a normalized target

# Trainer configuration: policy-update hyperparameters (`policy_cfg`) plus
# rollout / update scheduling settings.
trainer_cfg:
  policy_cfg:
    discount: 0.99
    soft_target_tau: 5e-3
    target_update_period: 1
    actor_update_period: 1
    policy_lr: 3e-4
    qf_lr: 3e-4
    use_automatic_entropy_tuning: true
    target_entropy: -3
    reward_scale: 1
    init_alpha: 1.0
    # Unset here. NOTE(review): presumably disables gradient-norm clipping
    # -- TODO confirm.
    max_grad_norm: null
    track_grad_norm: true
  rollout_schedule: ???  # Mandatory, no default; must be supplied by the user
  batch_size: 256
  model_train_period: 1000
  # NOTE(review): scientific notation is not an integer literal in YAML --
  # confirm the consumer casts this value to int.
  rollout_batch_size: 50e3
  # NOTE(review): the `algorithm_cfg` stanza visible in this file defines only
  # `num_epochs`, so `max_path_length` has to come from another config
  # -- TODO confirm.
  max_path_length: ${algorithm.algorithm_cfg.max_path_length}
  num_policy_updates_per_step: 1
  num_max_policy_updates_per_step: 5
  policy_updates_every_steps: 1
  num_epochs_to_retain_model_buffer: 5
  real_data_ratio: 0.05
  log_policy_training_period: 1000

# Algorithm-level settings.
# NOTE(review): trainer_cfg.max_path_length interpolates
# `${algorithm.algorithm_cfg.max_path_length}`, which is not defined in this
# stanza -- presumably provided by another config; TODO confirm.
algorithm_cfg:
  num_epochs: 1000

# Offline-training settings.
# NOTE(review): `train_at_start` presumably triggers a training phase before
# the main loop begins -- TODO confirm what is trained.
offline_cfg:
  train_at_start: true
