# Environment-related settings.
# NOTE(review): `env` is a plain string here, yet `term_func` and `reward_func`
# interpolate `${env.term_func}` / `${env.reward_func}`. That only resolves if
# `env` is replaced by a mapping before interpolation -- confirm against the loader.
env: "gym___HalfCheetah"
classdef: null              # The path to the environment class (unset by default)
d4rl_config: ???            # Must be supplied (OmegaConf `???` marker); e.g., "medium-expert-v0"
term_func: ${env.term_func} # Termination function of the environment
reward_func: ${env.reward_func} # Reward function of the environment
# Identity function; could use ${env.obs_preproc} instead.
obs_preproc: rlkit.envs.env_processor.DefaultEnvProc.obs_preproc
obs_postproc: rlkit.envs.env_processor.DefaultEnvProc.obs_postproc
targ_proc: rlkit.envs.env_processor.DefaultEnvProc.targ_proc

# Written as an integer literal: `1e6` has no decimal point, so YAML 1.1 loaders
# such as PyYAML read it as the string "1e6", and OmegaConf reads it as a float.
replay_buffer_size: 1000000
collector_type: "step"      # 'step' -- one sample at a time, 'path' -- a path at a time
model_dir: null             # Root directory to search for state_dicts (unset by default)

# Experiment bookkeeping; both are unset (null) by default.
indep_var: null
exp_name: null

layer_size: 256
num_hidden_layer: 2

# Dynamics-model settings. The `_target_` entries are Hydra-style dotted paths
# naming the class to build for that node.
dynamics:
  _target_: rlkit.torch.models.dynamics_models.model.DynamicsModel
  ensemble_model:
    # NOTE(review): the remark originally attached to this line read "MBOP uses a
    # deterministic ensemble (captures epistemic uncertainty only)", but the
    # target below is ProbabilisticEnsemble -- confirm which one is intended.
    _target_: rlkit.torch.models.probabilistic_ensemble.ProbabilisticEnsemble
    ensemble_size: 7
    propagation_method: "random_model"
    layer_size: 200
    num_hidden_layer: 4
    # Dotted path kept as a string; presumably resolved to the callable by the
    # consumer -- TODO confirm.
    activation_func: torch.nn.functional.silu
    use_decay: true
  learn_reward: true                  # Whether to learn the reward model as well
  separate_reward_func: false         # Whether to instantiate a separate module for the reward function
  learn_logstd_min_max: true          # as in mbpo-pytorch
  # Written with a decimal point: a bare `1e-3` is read as a string by YAML 1.1
  # loaders such as PyYAML, whereas `1.0e-3` is a float everywhere.
  learning_rate: 1.0e-3
  batch_size: 256
  max_epochs_since_last_update: 5
  num_model_learning_epochs: null     # Unset by default
  num_elites: 5
  normalize_inputs: true

# Trainer settings: `policy_cfg` holds the policy-optimizer hyperparameters, the
# remaining keys configure model rollouts and the update schedule.
# Floats in exponent form are written with a decimal point (`5.0e-3`, not `5e-3`)
# because YAML 1.1 loaders such as PyYAML read the dot-less form as a string.
trainer_cfg:
  policy_cfg:
    discount: 0.99
    soft_target_tau: 5.0e-3
    target_update_period: 1
    actor_update_period: 1
    policy_lr: 3.0e-4
    qf_lr: 3.0e-4
    use_automatic_entropy_tuning: false
    target_entropy: -3
    reward_scale: 1
    init_alpha: 0.2
    # Disabled `max_grad_norm` settings, kept for reference:
    # max_grad_norm:
    #   log_alpha: 5
    #   qf1: 50
    #   qf2: 50
    #   policy: 1
    track_grad_norm: true
  num_model_rollouts_per_step: 400
  # NOTE(review): presumably [start_epoch, end_epoch, start_length, end_length]
  # as in MBPO -- TODO confirm against the trainer.
  rollout_schedule: [20, 150, 1, 1]
  # Integer literal rather than `1e5`, which parses as a string (PyYAML) or a
  # float (OmegaConf) instead of an integer count.
  model_buffer_size: 100000
  num_policy_updates_per_step: 40
  policy_updates_every_steps: 1
  real_data_ratio: 0.05
  batch_size: 256
  model_train_period: 250
  log_policy_training_period: 1000
  # Interpolated from `algorithm.algorithm_cfg`, which is not defined in this
  # file and must be supplied by the composed configuration.
  max_path_length: ${algorithm.algorithm_cfg.max_path_length}
