[base]
# Identifies the package, environment name(s), vectorization setting, and
# policy/RNN names for this configuration. How each key is interpreted is
# defined by the loader that consumes this file.
package = ocean
# Two space-separated names; presumably aliases for the same environment -- TODO confirm.
env_name = trash_pickup puffer_trash_pickup 
vec = multiprocessing
policy_name = TrashPickup
rnn_name = Recurrent

[env]
# Environment parameters.
# Comments are kept on their own lines: parsers without inline-comment support
# would otherwise treat trailing "# ..." text as part of the value.
#
# Recommended num_envs: 4096 / num_agents, where 4096 is a suggested starting
# value (with num_agents = 4 this gives 1024).
num_envs = 1024
grid_size = 10
num_agents = 4
num_trash = 20
num_bins = 1
max_steps = 150
report_interval = 32
# agent_sight_range is only used with the 2D local crop observation space.
agent_sight_range = 5

[train]
# Training hyperparameters.
total_timesteps = 100_000_000
checkpoint_interval = 200
# NOTE(review): this num_envs is separate from the num_envs key under [env];
# confirm which component reads each one.
num_envs = 2
num_workers = 2
env_batch_size = 1
batch_size = 131072
update_epochs = 1
minibatch_size = 16384
bptt_horizon = 8
anneal_lr = False
device = cuda
learning_rate = 0.001
gamma = 0.95
gae_lambda = 0.85
# Key was previously misspelled "vf_ceof", so this value was not set under the
# same "*_coef" naming used by the other coefficients in this section.
vf_coef = 0.4
clip_coef = 0.1
vf_clip_coef = 0.1
ent_coef = 0.01

[sweep.metric]
# Sweep objective: the metric named below, with the goal of maximizing it.
goal = maximize
name = environment/episode_return

[sweep.parameters.train.parameters.learning_rate]
# Sweep range for learning_rate: log_uniform_values distribution between min and max.
distribution = log_uniform_values
min = 0.000001
max = 0.01

[sweep.parameters.train.parameters.gamma]
# Sweep range for gamma: uniform distribution between min and max.
# NOTE(review): the range covers the full 0..1 interval, much wider than the
# 0.95 set under [train] -- confirm this breadth is intended.
distribution = uniform
min = 0
max = 1

[sweep.parameters.train.parameters.gae_lambda]
# Sweep range for gae_lambda: uniform distribution between min and max.
# NOTE(review): the range covers the full 0..1 interval -- confirm this breadth
# is intended.
distribution = uniform
min = 0
max = 1

[sweep.parameters.train.parameters.update_epochs]
# Sweep range for update_epochs: int_uniform distribution between min and max.
distribution = int_uniform
min = 1
max = 4

[sweep.parameters.train.parameters.ent_coef]
# Sweep range for ent_coef: log_uniform_values distribution between min and max.
distribution = log_uniform_values
min = 1e-5
max = 1e-1
