# Hydra job-logging override: sets the format string of the `simple` formatter.
# Each record is rendered as
#   [LEVEL PID MODULE:LINENO TIMESTAMP] message
# using Python `logging` %-style LogRecord attributes.
hydra:
  job_logging:
    formatters:
      simple:
        format: "[%(levelname)s %(process)d %(module)s:%(lineno)d %(asctime)s] %(message)s"

# Experiment identity and logging.
env: MiniGrid-Empty-8x8-v0  # Gym environment id
mode: train  # Choices: train, curriculum
group: knowledge-instructed  # Experiment id
n: 0  # Run number (vary this for multirun)
# Filled in by the `uid` resolver, which is not defined in this file.
name: "${uid:}"
wandb: true  # Log to wandb
project: PAE  # NOTE(review): presumably the wandb project name - confirm
debug: false  # Use threading, among other things
verbose: false  # Log a bunch of extra timing stats

# State counter and message-reward settings.
state_counter: coordinates

# Amount of reward given for *any* message.
naive_message_reward: 0.0
# Either "standard", or "learn" for learn-style rewards.
naive_message_reward_format: standard

separate_message_state_counter: coordinates_messages

# Computed from `env` by the `is_babyai` resolver, which is not defined in
# this file.
is_babyai: "${is_babyai:${env}}"

# Partial obs
partial_obs: true

max_online_goals: 1000

# Training settings
disable_checkpoint: false  # Disable saving checkpoint
# Composed as <HOME>/checkpoint/<env>/<group>. `oc.env:HOME` reads the HOME
# environment variable; no default is given here.
savedir: "${oc.env:HOME}/checkpoint/${env}/${group}"
total_frames: 300000000  # 300 million

num_actors: 4
# NOTE(review): null here; presumably the consumer derives a value when this
# is unset - confirm against the training code.
num_buffers: null
num_threads: 4
device: cuda:0

# Loss coefficients and reward shaping.
entropy_cost: 0.0005  # Entropy cost/multiplier
generator_entropy_cost: 0.05  # Generator entropy cost/multiplier
baseline_cost: 0.5  # Baseline cost/multiplier
discounting: 0.99  # Discount factor
reward_clipping: abs_one  # Choices: abs_one, soft_asymmetric, none

# Optimizer settings
# Learning rates carry an explicit decimal point (5.0e-4, not 5e-4): YAML 1.1
# loaders such as PyYAML only resolve exponent notation to a float when the
# mantissa contains a ".", and would otherwise load the value as a string.
lr: 5.0e-4  # Actor learning rate
generator_lr: 5.0e-4  # Planner learning rate

# Other Hyperparameters
batch_size: 32  # Learner batch size
generator_batch_size: 32  # Generator batch size
unroll_length: 100  # The unroll length (in time dimension)
knowledge_dim: 64  # Size of goal embedding
state_embedding_dim: 256  # Dimension of the state embedding representation used in the actor
generator_eps: 0.00  # Epsilon for random goals
# NOTE(review): the original comment read "Coefficient for the intrinsic
# reward", the same text as on `intrinsic_reward_coef`; the key name suggests
# a negative reward given to the generator - confirm.
generator_reward_negative: -0.1
generator_maximum_rate: 0.3
generator_threshold: -0.2  # Threshold mean reward for which scheduler increases difficulty
generator_counts: 10
train_generator: true
generator_start_target: 7

# Nested option group.
# NOTE(review): `int` presumably abbreviates "intrinsic" - confirm. Its
# `baseline_cost` is a separate key from the top-level `baseline_cost`.
int:
  twoheaded: false
  baseline_cost: 0.5

# Reward composition and model toggles.
mutual_information: true
# Written as 1.0e-4 (not 1e-4) so YAML 1.1 loaders such as PyYAML resolve it
# to a float rather than a string.
mutual_information_rate: 1.0e-4
use_lstm: false  # Use LSTM in agent model, default false
num_lstm_layers: 1
no_extrinsic_rewards: false  # Only intrinsic rewards
combine_rewards: true
generator: true  # Use generator
intrinsic_reward_coef: 0.5  # Coefficient for the intrinsic reward
easy_goal_reward: 0.0  # Reward for completed goals that don't exceed difficulty threshold

# Network settings
# Boolean feature toggles, all enabled here.
# NOTE(review): exact effect of each flag is defined by the model code, not
# visible in this file - confirm before changing.
achievable_mask: true
actor_cross_attn: true
planner_cross_attn: true
attn_skip: true