# Cache directory for the transformer.
cache_dir: './cache'

# Access tokens. Both are empty strings here; avoid committing real values.
huggingface_token: ''
wandb_key: ''


# Policy language model and agent selection.
policy_lm: 'meta-llama/Meta-Llama-3-8B'
agent_type: 'reinforce'
# Booleans use the canonical lowercase spelling so every YAML loader
# resolves them as booleans.
use_baseline: false
use_lora: false
max_new_tokens: 32
# NOTE(review): presumably counted in training iterations; confirm in the
# code that reads these two keys.
save_freq: 25
eval_freq: 25

# Training hyperparameters.
capacity: 100000  # replay buffer size
rollout_size: 128  # number of rollout trajectories for each update
eval_size: 32  # number of trajectories for evaluation
batch_size: 8
iterations: 2000  # total number of iterations
actor_epochs: 3  # number of epochs for the actor each iteration
warmup_iter: 20  # number of iterations without updating the policy
grad_accum_steps: 32
do_sample: true
temperature: 1.0
# Learning rates keep an explicit decimal point in the mantissa: YAML 1.1
# loaders (e.g. PyYAML) resolve a bare `1e-5` as the string "1e-5", not a
# float, while `1.0e-5` is a float on every loader.
critic_lr: 1.0e-5
lm_lr: 1.0e-5
env_idx: null  # leave as null to avoid resetting to a specific environment
gamma: 0.95  # discount factor
tau: 0.1  # soft update parameter
max_grad_norm: 1.0

# Canonical lowercase boolean so every YAML loader resolves it as a boolean.
# NOTE(review): presumably toggles logging that uses `wandb_key`; confirm in
# the consuming code.
use_wandb: false