# Environment-related settings
env: "HalfCheetah"
classdef: null              # The path to the environment class (explicitly unset here)
d4rl_config: ???            # Mandatory value (`???`), e.g., "medium-expert-v0"
# NOTE(review): `env` is a plain string in this file, so the `${env.*}` interpolations below
# can only resolve if the composed config exposes a mapping named `env` at the interpolation
# root -- confirm against the config that includes this file.
term_func: ${env.term_func} # Termination function of the environment
reward_func: ${env.reward_func} # Reward function of the environment
obs_preproc: rlkit.envs.env_processor.DefaultEnvProc.obs_preproc # Identity function. Could use ${env.obs_preproc} instead
obs_postproc: rlkit.envs.env_processor.DefaultEnvProc.obs_postproc
targ_proc: rlkit.envs.env_processor.DefaultEnvProc.targ_proc

# NOTE(review): the size is written in exponent form without a decimal point, so depending on
# the loader it may arrive as a float (or even a string) rather than an int -- confirm the
# consumer converts it before using it as a size.
replay_buffer_size: 2e6
collector_type: "step"        # 'step' -- one sample at a time, 'path' -- a path at a time
model_dir: null             # Root directory in which to look for saved state_dicts (explicitly unset here)

# Both keys are explicitly unset (null) in this file; presumably they are supplied per
# experiment via overrides -- TODO confirm.
indep_var: null
exp_name: null

# Sampling settings. NOTE(review): the truncate_* keys presumably control when a model
# rollout is cut short and which reward is assigned in that case -- confirm with the consumer.
sampling_cfg:
  type: mean
  truncate_thresh: ???    # Mandatory value (`???`): must come from another config or an override
  truncate_reward: -50
  pessimism_coeff: null   # Explicitly unset here

# Top-level network size. NOTE(review): `dynamics.ensemble_model` defines its own
# `layer_size` / `num_hidden_layer`, so these two presumably apply to the remaining
# networks (e.g. policy and Q-functions) -- confirm.
layer_size: 256
num_hidden_layer: 2

# Dynamics model configuration. Each `_target_` gives the dotted path of the class to
# build for that node; the sibling keys are that node's settings.
dynamics:
  _target_: rlkit.torch.models.dynamics_models.model.DynamicsModel
  ensemble_model:
    _target_: rlkit.torch.models.probabilistic_ensemble.ProbabilisticEnsemble
    ensemble_size: 7
    propagation_method: "random_model"
    layer_size: 200
    num_hidden_layer: 4
    activation_func: torch.nn.functional.silu   # Dotted path to the activation function
    use_decay: true
  learn_reward: true                  # Whether to learn the reward model as well
  separate_reward_func: false         # Whether to instantiate a separate module for the reward function
  learn_logstd_min_max: true
  learning_rate: 1e-3
  batch_size: 256
  max_epochs_since_last_update: 5
  num_model_learning_epochs: null     # Explicitly unset here
  num_elites: 5
  normalize_inputs: true
  normalize_outputs: false             # Whether the dynamics model learns to output a normalized target

# Trainer configuration. `policy_cfg` groups the policy-optimization hyperparameters; the
# remaining keys appear to schedule model training, model rollouts and policy updates.
trainer_cfg:
  policy_cfg:
    discount: 0.99
    soft_target_tau: 5e-3
    target_update_period: 1
    actor_update_period: 1
    policy_lr: 3e-4
    qf_lr: 3e-4
    use_automatic_entropy_tuning: true
    target_entropy: -3
    reward_scale: 1
    init_alpha: 1.0
    max_grad_norm: null           # Explicitly unset here
    track_grad_norm: true
  rollout_schedule: 150
  batch_size: 256
  model_train_period: 250
  # Note: MOReL propagates this number of trajectories through every model, which yields
  # ``rollout_batch_size * num_models`` trajectories in total. Here, only ``rollout_batch_size``
  # trajectories are selected at random instead. Hence, to be roughly equivalent to MOReL, set this
  # value to MOReL's ``rollout_batch_size`` multiplied by ``num_models``.
  rollout_batch_size: 2000
  # Interpolated from `algorithm.algorithm_cfg.max_path_length` in the composed config.
  max_path_length: ${algorithm.algorithm_cfg.max_path_length}
  # NOTE(review): `num_policy_updates_per_step` (20) is larger than
  # `num_max_policy_updates_per_step` (5) -- confirm that this combination is intended.
  num_policy_updates_per_step: 20
  num_max_policy_updates_per_step: 5
  policy_updates_every_steps: 1
  num_epochs_to_retain_model_buffer: 1
  real_data_ratio: 0.0                          # MOReL learns a policy entirely from the constructed MDP
  log_policy_training_period: 1000
  start_model_rollouts_from_init_dist: true
  clip_obs: true

# Algorithm-level settings.
# NOTE(review): `trainer_cfg.max_path_length` interpolates
# `${algorithm.algorithm_cfg.max_path_length}`, but `max_path_length` is not defined in this
# block -- it must be provided by another config or an override; confirm.
algorithm_cfg:
  num_epochs: 500

# Offline-training settings; `prior_model` groups the hyperparameters of the prior model
# (presumably the behavior-cloning model, given `use_behavior_clone` -- confirm).
offline_cfg:
  train_at_start: true
  use_behavior_clone: true
  prior_model:
    learning_rate: 1e-3
    batch_size: 256
    normalize_inputs: true
    normalize_outputs: false
    # max_num_test:         # TODO: currently disabled -- either give it a value or delete this line
    num_behavior_clone_learning_epochs: 5
