
# Action selection.
# NOTE(review): going by its name, the flag below applies the availability
# mask before the softmax -- confirm in the action-selector implementation.
action_selector: 'soft_policies'
mask_before_softmax: true

# Environment runner and batch sizing.
runner: 'parallel'

# All three sizes are set to the same value here.
# NOTE(review): presumably counted in episodes, with batch_size_run being the
# number of environments run side by side -- confirm against the runner.
buffer_size: 10
batch_size_run: 10
batch_size: 10

# Target-network update setting. As the key name indicates, the value is
# either an update interval (in training steps) or a soft-update coefficient tau.
# NOTE(review): 0.01 is presumably interpreted as tau (soft update) rather
# than as an interval -- confirm against the learner.
target_update_interval_or_tau: 0.01

# Learning rate.
lr: 0.0005

# Observation-related switches; all are turned off in this configuration.
obs_agent_id: false
obs_last_action: false
obs_individual_obs: false

# Controller and agent network.
mac: 'non_shared_mac'
agent: 'rnn_ns'
agent_output_type: 'pi_logits'

# Learner.
learner: 'pac_dcg_learner'
use_rnn: false
standardise_rewards: false
q_nstep: 10  # n-step return length; 1 corresponds to the usual r + gamma * V target

# Critic and state-value network types.
critic_type: 'pac_dcg_critic_ns'
state_value_type: 'cv_critic_ns'

# Entropy coefficient schedule, from the initial to the final value.
# NOTE(review): entropy_end_ratio is presumably the fraction of training at
# which the final coefficient is reached -- confirm against the learner.
initial_entropy_coef: 30.0
final_entropy_coef: 0.01
entropy_end_ratio: 0.8


# Coordination graph (CG) specification.
# Optional settings that are left unset are written as an explicit `null`
# instead of a bare empty value, so that the intent is unambiguous.
cg_edges: 'full'                # Defines the CG: {'vdn', 'line', 'cycle', 'star', 'full', n_rand_edges:int, [[int,int], .., [int,int]]}
cg_utilities_hidden_dim: null   # Hidden layers of utility functions, by default None
cg_payoffs_hidden_dim: null     # Hidden layers of payoff functions, by default None
cg_payoff_rank: null            # If > 0, payoff matrices are decomposed with this rank, otherwise full rank
duelling: false                 # If true, runs DCG-V (adds a state-dependent bias function to the value)
msg_anytime: true               # Anytime extension of greedy action selection (Kok and Vlassis, 2006)
msg_iterations: 8               # Number of message passes in greedy action selection
msg_normalized: true            # Message normalization during greedy action selection (Kok and Vlassis, 2006)


# Name of this configuration.
name: 'pac_sarsa_dcg_ns'

# NOTE(review): presumably the total number of environment timesteps to
# train for -- confirm against the training loop.
t_max: 500500
