# Top-level run selection: names the package and environment to load, plus
# the vectorization backend, policy, and recurrent wrapper.
# NOTE(review): how each name is resolved is defined by the loader, not by
# this file - confirm there before renaming anything.
[base]
package = ocean
env_name = puffer_go
vec = multiprocessing
policy_name = Go
rnn_name = Recurrent

# Environment settings, including the reward assigned to each kind of event.
# NOTE(review): the long-decimal reward values look like they were copied
# from a sweep result - confirm before hand-editing them.
[env]
# Separate from the num_envs key under [train], which holds a different value.
num_envs = 2048
reward_move_pass = -0.47713279724121094
# Unlike the other four rewards, this one has no [sweep.parameters...]
# section in the visible part of this file.
reward_move_valid = 0
reward_move_invalid = -0.47179355621337893
reward_opponent_capture = -0.5240603446960449
reward_player_capture = 0.22175729274749756
# Presumably the board edge length (7x7 board) - TODO confirm.
grid_size = 7

# Training hyperparameters.
[train]
total_timesteps = 2_000_000_000
# NOTE(review): unit of the interval is not stated here - confirm with the trainer.
checkpoint_interval = 50
# Separate from [env] num_envs (2048).
# NOTE(review): presumably the number of vectorized environment instances,
# split across num_workers - confirm with the vectorization backend.
num_envs = 2
num_workers = 2
env_batch_size = 1
# batch_size is exactly 4 x minibatch_size (524288 / 131072).
batch_size = 524288
update_epochs = 1
minibatch_size = 131072
bptt_horizon = 16
learning_rate = 0.0015
ent_coef = 0.013460194258584548
gae_lambda = 0.90
gamma = 0.95
max_grad_norm = 0.8140400052070618
vf_coef = 0.48416485817685223
anneal_lr = False
device = cpu

# Sweep search space for the [env] key reward_move_invalid: uniform
# distribution between min and max. The current [env] value lies in this range.
# NOTE(review): the dotted section name presumably maps to a nested sweep
# config - confirm against the loader.
[sweep.parameters.env.parameters.reward_move_invalid]
distribution = uniform
min = -1.0
max = 0.0

# Sweep search space for the [env] key reward_move_pass: uniform
# distribution between min and max. The current [env] value lies in this range.
[sweep.parameters.env.parameters.reward_move_pass]
distribution = uniform
min = -1.0
max = 0.0

# Sweep search space for the [env] key reward_player_capture: uniform
# distribution between min and max. This is the only swept reward with a
# non-negative range; the current [env] value lies in it.
[sweep.parameters.env.parameters.reward_player_capture]
distribution = uniform
min = 0.0
max = 1.0

# Sweep search space for the [env] key reward_opponent_capture: uniform
# distribution between min and max. The current [env] value lies in this range.
[sweep.parameters.env.parameters.reward_opponent_capture]
distribution = uniform
min = -1.0
max = 0.0
