# Algorithm config for PB-SAC. `_target_` holds the dotted path of the class
# this config is meant to build.
# NOTE(review): presumably consumed by Hydra-style instantiation - confirm
# against the loader.
name: pbsac
_target_: benchrl.algorithms.pbsac.PBSAC

# Network bindings for PB-SAC - requires a separate actor and twin critics.
# The actor binds to `double_headed_mlp`; both critics bind to the same `mlp2`
# entry.
# NOTE(review): these names are presumably resolved against model configs
# defined elsewhere - confirm they exist.
model_bindings:
  actor_network: double_headed_mlp
  critic1_network: mlp2
  critic2_network: mlp2

# Base SAC hyperparameters
# NOTE(review): buffer_size equals total_timesteps, so the replay buffer
# presumably never evicts within a single run - confirm this is intended.
total_timesteps: 1000000
buffer_size: 1000000
learning_starts: 5000
batch_size: 256
train_freq: 1  # Train every step
gradient_steps: 1
target_update_interval: 1
policy_frequency: 2  # Update actor every 2 steps (TD3-style delayed policy update)

# Network parameters
# NOTE(review): by SAC convention tau is presumably the soft (Polyak)
# target-update coefficient and gamma the discount factor - confirm against
# the PBSAC implementation.
tau: 0.005
gamma: 0.99

# Learning rates
# Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML only
# resolve exponent notation as a float when the mantissa contains a ".", so a
# bare `3e-4` would load as the string "3e-4" there. The numeric values are
# unchanged.
actor_lr: 3.0e-4
critic_lr: 1.0e-3

# Entropy coefficient: the string `auto` for automatic tuning, or a float value
ent_coeff: auto
target_entropy: null  # If null, will use -dim(action_space)

# PAC-Bayes specific parameters
pac_bayes_active: true
num_pb_envs: 10
pb_update_freq: 20000  # Update frequency of PAC-Bayes components
# Explicit decimal point so YAML 1.1 loaders (e.g. PyYAML) resolve this as a
# float; a bare `1e-3` loads as a string there. The numeric value is unchanged.
beta: 1.0e-3  # Learning rate for posterior optimization
reg_weight: 1.0  # Regularization weight for KL term
delta: 0.05  # Failure probability in PAC-Bayes bound
r_max_estimate: 10.0  # Conservative estimate
mixing_time: 400.0  # Conservative estimate
use_thompson_sampling: false
# NOTE(review): presumably only read when use_thompson_sampling is true - confirm
thompson_sampling_freq: 2
exploration_probability_start: 0.8  # Start value of the UCB exploration probability
exploration_probability_end: 0.1  # End value of exploration probability
pb_rollout_trajectories: 100
pb_rollout_steps: 500
pb_policy_samples: 8  # Increased for better estimation
pb_update_epochs: 50  # Increased for thorough optimization
pb_reset_prior_freq: 50  # EMA-style prior reset
# NOTE(review): unit of the duration is not stated here; 0.9 is presumably a
# fraction of training - confirm
pb_prior_decay_duration: 0.9  # Duration of prior decay
pb_prior_decay_end: 0.05  # End value of prior reset decay
pb_initial_std: 0.01  # Initial std for posterior

# Enhanced exploration parameters
exploration_bonus_coeff: 1.0  # Exploration bonus strength
exploration_samples: 16  # Samples for uncertainty estimation
# NOTE(review): "improved optimization" is not defined in this file; see the
# PBSAC implementation for what the alternating scheme does.
use_alternating_optimization: true  # Use improved optimization

# Actor freezing for stability after PAC-Bayes updates
actor_freeze_steps: 20  # Steps to freeze actor after PB update
target_samples: 1  # Samples for robust targets (higher for critic adaptation)
# NOTE(review): 512 here versus exploration_samples: 16 elsewhere in this file;
# presumably this value replaces it during critic adaptation - confirm.
adaptation_exploration_samples: 512  # Higher samples during critic adaptation

# Performance options
# NOTE(review): presumably toggles model compilation (e.g. torch.compile) -
# confirm against the PBSAC implementation.
compile: false