# Seed value for this configuration.
# NOTE(review): presumably used by the loader to seed random number
# generators for reproducible runs — confirm against the consumer.
seed: 42


# Models are instantiated using skrl's model instantiator utility
# https://skrl.readthedocs.io/en/latest/api/utils/model_instantiators.html
#
# obs["joint-positions"]  obs["joint-velocities"]
#           │                        │
#    ┏━━━━━━▼━━━━━┓           ┏━━━━━━▼━━━━━┓
#    ┃   net_pos  ┃           ┃   net_vel  ┃
#    ┡━━━━━━━━━━━━┩           ┡━━━━━━━━━━━━┩
#    │ linear(16) │           │ linear(16) │
#    │ elu        │           │ elu        │
#    │ linear(16) │           │ linear(16) │
#    │ elu        │           │ elu        │
#    └──────┬─────┘           └─────┬──────┘
#           │                       │
#           └─────────▶(+)◀─────────┘
#                       │
#                 ┏━━━━━▼━━━━━┓
#                 ┃    net    ┃
#                 ┡━━━━━━━━━━━┩
#                 │ identity  │
# shared          └─────┬─────┘
# ......................│.......................
# non-shared            │
#           ┏━━━━━━━━━━━▼━━━━━━━━━━━┓
#           ┃  policy|value output  ┃
#           ┡━━━━━━━━━━━━━━━━━━━━━━━┩
#           │ linear(num_actions|1) │
#           └───────────┬───────────┘
#                       ▼
models:
  # false -> policy and value are not instantiated as separate models; per the
  # diagram above they share net_pos / net_vel / net and differ only in the
  # final output layer
  separate: false

  # Policy model (see multicategorical_model parameters)
  policy:
    class: MultiCategoricalMixin
    unnormalized_log_prob: true
    network:
      # branch fed by the "joint-positions" observation entry
      - name: net_pos
        input: 'OBSERVATIONS["joint-positions"]'
        layers: [16, 16]
        activations: elu
      # branch fed by the "joint-velocities" observation entry
      - name: net_vel
        input: 'OBSERVATIONS["joint-velocities"]'
        layers: [16, 16]
        activations: elu
      # adds the two branch outputs; no layers or activations of its own
      # (drawn as "identity" in the diagram above)
      - name: net
        input: 'net_pos + net_vel'
        layers: []
        activations: []
    # output layer: linear(num_actions) in the diagram above
    output: ACTIONS

  # Value model (see deterministic_model parameters)
  value:
    class: DeterministicMixin
    clip_actions: false
    network:
      # same three containers as the policy network
      - name: net_pos
        input: 'OBSERVATIONS["joint-positions"]'
        layers: [16, 16]
        activations: elu
      - name: net_vel
        input: 'OBSERVATIONS["joint-velocities"]'
        layers: [16, 16]
        activations: elu
      - name: net
        input: 'net_pos + net_vel'
        layers: []
        activations: []
    # output layer: linear(1) in the diagram above
    output: ONE


# Rollout memory
# https://skrl.readthedocs.io/en/latest/api/memories/random.html
memory:
  # memory class used to store the collected rollouts
  class: RandomMemory
  # -1: size is automatically determined (same as agent:rollouts)
  memory_size: -1


# PPO agent configuration (field names are from PPO_DEFAULT_CONFIG)
# https://skrl.readthedocs.io/en/latest/api/agents/ppo.html
agent:
  class: PPO

  # rollout length and optimization passes per update
  rollouts: 32
  learning_epochs: 8
  mini_batches: 8

  # return / advantage estimation coefficients
  discount_factor: 0.99
  lambda: 0.95

  # learning rate and its scheduler (scheduler has its own kl_threshold)
  learning_rate: 5.0e-04
  learning_rate_scheduler: KLAdaptiveLR
  learning_rate_scheduler_kwargs:
    kl_threshold: 0.008

  # state and value preprocessors (no extra kwargs supplied)
  state_preprocessor: RunningStandardScaler
  state_preprocessor_kwargs: null
  value_preprocessor: RunningStandardScaler
  value_preprocessor_kwargs: null

  # random-exploration / learning-start step counts (both zero here)
  random_timesteps: 0
  learning_starts: 0

  # clipping coefficients
  grad_norm_clip: 1.0
  ratio_clip: 0.2
  value_clip: 0.2
  clip_predicted_values: true

  # loss term weights
  entropy_loss_scale: 0.0
  value_loss_scale: 2.0

  # agent-level KL threshold (separate key from the scheduler's kl_threshold)
  kl_threshold: 0.0

  # NOTE(review): presumably a constant multiplier applied to rewards by the
  # config loader — confirm against the runner that consumes this file
  rewards_shaper_scale: 0.1
  time_limit_bootstrap: false
  mixed_precision: false

  # logging and checkpoint
  experiment:
    directory: 'cartpole_direct_dict_multidiscrete'
    experiment_name: ''
    write_interval: auto
    checkpoint_interval: auto


# Sequential trainer
# https://skrl.readthedocs.io/en/latest/api/trainers/sequential.html
trainer:
  # trainer class that drives the agent/environment loop
  class: SequentialTrainer
  # number of timesteps handed to the trainer
  timesteps: 4800
  # NOTE(review): presumably the key under which environment info is read
  # and logged — confirm against the trainer documentation linked above
  environment_info: log
