# NOTE(review): presumably a Hydra-style defaults list that inherits from a
# sibling `base` config, with the keys below acting as overrides — confirm
# against the config loader.
defaults:
  - base


# Micro-batch size per GPU.
# NOTE(review): presumably overrides a value inherited from `base` — verify
# where this key is consumed.
micro_batch_size_per_gpu: 8

# Rollout (generation) settings for this experiment.
# NOTE(review): key layout suggests these override values from `base` — confirm.
actor_rollout_ref:
  rollout:
    # Cap on response length (units presumably tokens — TODO confirm).
    response_length: 500
    val_kwargs:
      # Validation-time sampling: together with the non-zero temperature
      # below, this makes evaluation stochastic.
      do_sample: true
      temperature: 0.5  # enables randomness in evaluation
 
# Trainer overrides.
# NOTE(review): experiment_name is presumably used to label logs/checkpoints —
# verify against the trainer.
trainer:
  experiment_name: bandit-base

# Agent interaction limits: a single turn with a single action per turn.
agent_proxy:
  # NOTE(review): presumably the maximum number of turns per episode — confirm.
  max_turn: 1
  max_actions_per_turn: 1  # maximum number of actions that can be output in a single turn

# Environment selection for the training and validation splits.
es_manager:
  train:
    env_configs:
      # NOTE(review): the original comment mentions
      # `BanditGeneralizationNoThink` — presumably an alternative tag; confirm.
      tags:
        - "Bandit"
  val:
    env_groups: 512
    env_configs:
      tags:
        - "Bandit"
        - "BanditTest"
      # NOTE(review): presumably one group count per tag, in the same order as
      # `tags`; the counts sum to env_groups (256 + 256 = 512).
      n_groups:
        - 256
        - 256
