# Settings under the `wandb` key (presumably Weights & Biases experiment
# tracking -- confirm against the code that consumes this file).
wandb:
  # Explicit null: no entity is configured in this file.
  entity: null
  resume: "auto"

# Run-level settings: project name, entry function, step counts, save/eval
# cadence, and multi-node layout.
experiment:
  project: "alfworld_multinode_rl"
  function: "train"
  start_from_scratch: true
  total_step: 200
  save_every: 30
  eval_every: 5
  current_epoch: 1
  # NOTE(review): the file name mentions 8 nodes; presumably it has to stay
  # consistent with num_node below -- confirm.
  deepspeed_file: "8_node_8_gpus_deepspeed_zero3"
  # The number of machines you have.
  num_node: 8
  # No need to change.
  node_index: 0
  coevolve_reward: true
  coevolve_environment: false


# Only needs to be set when running on multiple nodes.
system:
  # Your proxy; can be null.
  HTTP_PROXY: null
  # HF home.
  HF_HOME: null
  # Env name.
  env_name: null
  # Env dir.
  envs_dir: null
  # Base dir.
  rl_base_dir: null
  # Shell command string; as written it appends an ALFWORLD_DATA export line
  # to ~/.bashrc.
  # NOTE(review): the exported path presumably has to match
  # dataset.environment_data_dir -- confirm.
  additional_command: "echo 'export ALFWORLD_DATA=/absolute/path/to/alfworld-data' >> ~/.bashrc"


# Model references and output names. The three *_model entries are left
# unset (null) in this file and are to be filled in by the user.
model:
  # Policy.
  policy_model: null
  # Reward model.
  reward_model: null
  # Environment model.
  environment_model: null
  optimized_name: "optimized"
  optimized_reward_name: "optimized_reward"

# Dataset / environment selection and on-disk locations.
dataset:
  environment_type: "alfworld"
  # One of: "train", "eval_in_distribution", "eval_out_of_distribution".
  alfworld_train_type: "train"
  alfworld_temp_train_type: "temp_train"
  alfworld_syn_train_type: "syn_train"
  alfworld_eval_type: "eval_in_distribution"
  # Placeholder absolute paths; replace with real locations.
  environment_file_dir: "/absolute/path/to/alfworld_master"
  environment_data_dir: "/absolute/path/to/alfworld-data"
  optimization_data: "rl_policy"
  reward_optimization_data: "rl_reward"


# Rollout (generation) settings, one sub-section per model role:
# policy, reward, environment.
rollout:
  env_max_parallel: 256
  policy:
    num_trial: 16
    num_rollout_per_trial: 8
    max_interaction_step: 40
    temperature: 0.8
    model_length: 20000
    max_gen_length: 2000
    # Two groups of four GPU indices each.
    gpu_groups:
      - [0, 1, 2, 3]
      - [4, 5, 6, 7]
    if_start_with_think: false
  reward:
    num_rollout_per_query: 3
    temperature: 0.8
    model_length: 20000
    max_gen_length: 4000
    gpu_groups:
      - [0, 1, 2, 3]
      - [4, 5, 6, 7]
    if_start_with_think: false
  environment:
    temperature: 0.8
    model_length: 20000
    max_gen_length: 4000
    gpu_groups:
      - [0, 1, 2, 3]
      - [4, 5, 6, 7]
    # Unlike policy and reward above, this role has the flag enabled.
    if_start_with_think: true


# Training settings: per-role options (policy, reward) followed by options
# that sit directly under `training`.
training:
  policy:
    gradient_checkpointing_enable: false
    update_per_step: 4
    batch_size_lm: 1
    max_gen_length: 2048
    max_prompt_len: 2048
  reward:
    # Enabled for reward, disabled for policy above.
    gradient_checkpointing_enable: true
    update_per_step: 4
    batch_size_lm: 1
    max_gen_length: 4096
    max_prompt_len: 2048
  mixed_precision: "bf16"
  enable_tf32: true
  seed: 10086
  max_grad_norm: 1.0
  use_kl_estimator_k3: true
  eps: 0.2
  beta: 0.01
  num_train_epochs: 1


# Optimizer selection and hyperparameters.
optimizer:
  name: adamw
  # Default AdamW params.
  params:
    # Written with an explicit decimal point on purpose: YAML 1.1 loaders
    # (e.g. PyYAML) resolve `1e-6` as a string rather than a float, while
    # `1.0e-6` is a float under both YAML 1.1 and 1.2 loaders.
    learning_rate: 1.0e-6
    # Scale learning rate by total batch size.
    scale_lr: false
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.0
    # Same float-notation precaution as learning_rate.
    epsilon: 1.0e-8


# Learning-rate schedule.
lr_scheduler:
  scheduler: "cosine"
  params:
    # Interpolation-style reference to optimizer.params.learning_rate; it is
    # resolved by the config loader (presumably OmegaConf -- confirm).
    learning_rate: "${optimizer.params.learning_rate}"
    warmup_steps: 0
    # NOTE(review): with a scale of 1.0 the cosine schedule presumably never
    # drops below the base learning rate -- confirm this is intended.
    min_lr_scale: 1.0


# Evaluation-time generation settings for the policy model.
evaluation:
  policy:
    num_rollout_per_trial: 3
    # Larger than rollout.policy.max_interaction_step (40).
    max_interaction_step: 60
    temperature: 0.8
    model_length: 20000
    max_gen_length: 2000
    # Two groups of four GPU indices each.
    gpu_groups:
      - [0, 1, 2, 3]
      - [4, 5, 6, 7]
    if_start_with_think: false

