# Top-level run settings.
# NOTE(review): how each value is consumed is not visible in this file.
info: 'box2d'
sub_dirname: 'skill_learn'
alt_path: ''
wandb_log: ''
run_name: 'goal'
seed: 0
cuda_id: 0
# Logging switches.
log:
    log_learn_her_keys: false
# Import the config for each env; adjust them in configs/env/env_name.yaml.
defaults:
    - env: [ac_env, mini_behavior, igibson]
env:
    # Names listed by the original comment: 'installing_printer', 'thawing',
    # 'cleaning_car', 'igibson', 'test'.
    # NOTE(review): the current value 'ac_env' is not in that list -- confirm
    # the valid names against the env configs.
    env_name: 'ac_env'
    render: false
    num_train_envs: 20
    num_test_envs: 10
    evaluate_graph: true
    goal_sampler: false
# Settings for loading previously saved artifacts.
# NOTE(review): empty strings presumably mean "nothing to load" -- confirm
# against the consumer of this config.
load:
    load_dynamics: ''
    load_policy: ''
    load_rpb: false
    load_graph_encoding: ''
    load_replay_buffer: ''
    load_dir: ''
save:
    # 0 for not saving gif.
    save_gif_num: 0
    save_replay_buffer: false
    save_freq: 100
    save_frames: ''
train:
    epoch: 500
    # Random upper and lower actions, no learning.
    init_random_step: 10000
    # If nonzero, takes the max of this and init_random_step for pretraining.
    dynamics_pretrain_step: 0
    # Only dynamics learns; no training for policy and diayn.
    # Alternative value noted by the author: 20000.
    dynamics_warmup_step: 0
    env_step_per_epoch: 100000
    env_step_per_collect: 12000
    policy_update_per_env_step: 0.05
    dynamics_update_per_env_step: 0.05
    policy_batch_size: 256
    dynamics_batch_size: 128
    test_ep_per_epoch: 10
    # Alternative value noted by the author: true.
    fpg: false
    # Alternative value noted by the author: 1200.
    n_steps_per_goal: 0
# Graph-encoding hyperparameters.
# Scientific-notation floats are written with an explicit mantissa dot
# (3.0e-4, not 3e-4) so that YAML 1.1 loaders such as PyYAML resolve them as
# floats instead of strings; the numeric values are unchanged.
graph_encoding:
    latent_size: 16
    lr: 3.0e-4
    num_edge_classes: 2
    hidden_sizes: [128, 128]
    latent_l2_reg_coef: 1.0e-5
# Dynamics-model settings.
# Scientific-notation floats are written with an explicit mantissa dot
# (3.0e-4, not 3e-4) so that YAML 1.1 loaders such as PyYAML resolve them as
# floats instead of strings; the numeric values are unchanged.
dynamics:
    # Options: gt, ac.
    type: 'gt'
    ac:
        # If using AC dynamics, this must be used.
        dynamics_config_path: ''
        # The way to infer graphs sent in.
        infer_graph_type: 'hard'
        # Degree of confidence necessary to recover graph.
        graph_threshold: 0.9
        # How masks are to be computed.
        mask_mode: 'nulls'
    grad:
        lr: 3.0e-4
        grad_clip_norm: 5
        # Options: gradient, cmi.
        local_causality_type: 'cmi'
        # Options: variable, macro_variable, factor.
        pred_granularity: 'variable'
        gradient:
            grad_reg_coef: 0
            local_causality_threshold: 2.0e-3
        # 0 for not using.
        mixup_alpha: 0.2
        classification_correct_threshold: 0.1
        regression_correct_threshold: 0.1
        use_same_network_for_each_variable_prediction: false
        feature_fc_dims: [256, 256]
        predictor_fc_dims: []
        attn:
            num_attns: 2
            residual: true
            attn_dim: 32
            num_heads: 8
            attn_out_dim: 256
            attn_use_bias: false
            share_weight_across_kqv: true
            post_fc_dims: [256, 256]
# Replay-buffer, hindsight (her) and prioritization (prio) settings.
# Booleans are written in canonical lowercase form (true/false) throughout.
data:
    buffer_len: 10000000
    # 0 for not using.
    count_threshold_for_valid_graph: 0
    her:
        # Will be overwritten to false if env is not goal-based.
        use_her: true
        separate_her: true
        her_ratio: 0.3
        her_horizon: 200
        future_k: 8
        use_count_select_goal: true
        her_traj_length: -1
        num_her_samples: 1
        # Uses the lowest x value post interaction as the hindsight goal.
        use_lowest_post: false
        filter:
            form: 'control'
            min_non_passive: 2
            target_idx: -2
            dist_test: 0.0
            vel_test: 0.0
    prio:
        prio: false
        dynamics_prio: false
        alpha: 0.6
        beta: 0.4
        weight_norm: true
        dynamics:
            priority_scale: 10.0
            update_count_scale: 0.01
            change_count_scale: 10.0
        # policy parameters
        td_error_scale: 1.0
        graph_count_scale: 0
        # Prioritized Sequence Experience Replay parameters.
        # NOTE(review): the original comment ended with "for lower" and
        # appears truncated -- confirm the intended wording.
        decay_window: 0
        decay_rate: 0.4
        max_prev_decay: 0.7
# Policy, reward and per-algorithm hyperparameters.
# - Booleans are written in canonical lowercase form (true/false).
# - Scientific-notation floats carry an explicit mantissa dot (3.0e-4, not
#   3e-4) so that YAML 1.1 loaders such as PyYAML resolve them as floats
#   instead of strings; the numeric values are unchanged.
# - The norm_form / acti_form values are quoted to make clear they are
#   strings ('none' is the string "none", not a YAML null).
policy:
    reward:
        # Possible types: 'goal'.
        reward_type: 'goal'
        timeout: 100
        # Minimum distance to give reward for reaching a goal.
        target_goal_epsilon: 0.2
        # Uses exp(- dist / target_goal_shaping) to assign reward inside of
        # epsilon, if greater than 0.
        target_goal_shaping: -1.0
        target_graph_epsilon: 0.01
        # Reduces the radius based on
        # radius * (max(1, 1-normalized_reward + adaptive_radius_rate)).
        adaptive_radius_rate: -1.0
        binary_goals: true
        use_count_reward: false
        # Reward to give goal reaching.
        goal_scale: 1
        reached_goal_negative_constant: -1.0
        # MUST be changed to match the index of the target object(s).
        reached_graph_indices: [1]
        terminate_on_goal: false
    net_config_path: ''
    net_config_name: 'network'
    # Options: 'single_graph', 'wide'.
    type: 'wide'
    scripted: false
    # Options: 'dqn', 'rainbow', 'ppo'.
    discrete_algo: 'rainbow'
    # Options: 'ddpg', 'td3', 'sac', 'ppo'.
    continuous_algo: 'ddpg'
    # Options: 'sample_from_history', 'graph_encoding', 'choose_from_history'.
    graph_action_space: 'choose_from_history'
    graph_action_n: 30
    use_reached_graph_counter: true
    # Options: 'goal', 'graph_count', 'task'.
    # NOTE(review): a reward_type key also exists under policy.reward --
    # confirm which of the two the consumer reads.
    reward_type: 'goal'
    gamma: 0.9
    n_step: 3
    rainbow:
        lr: 3.0e-4
        hidden_sizes: [512, 512]
        num_atoms: 51
        v_min: -10
        v_max: 10
        target_update_freq: 500
        eps: 0.1
        norm_form: 'none'
        acti_form: 'relu'
    ddpg:
        hidden_sizes: [512, 512]
        actor_lr: 1.0e-4
        critic_lr: 1.0e-4
        tau: 0.005
        exploration_noise: 0.1
        norm_form: 'none'
        acti_form: 'relu'
    td3:
        hidden_sizes: [512, 512]
        actor_lr: 1.0e-4
        critic_lr: 1.0e-4
        tau: 0.005
        exploration_noise: 0.1
        policy_noise: 0.2
        update_actor_freq: 2
        noise_clip: 0.5
        norm_form: 'none'
        acti_form: 'relu'
    sac:
        hidden_sizes: [512, 512]
        conditioned_sigma: true
        actor_lr: 1.0e-4
        critic_lr: 1.0e-4
        tau: 0.005
        alpha: 0.2
        auto_alpha: false
        alpha_lr: 3.0e-4
        norm_form: 'none'
        acti_form: 'relu'
    ppo:
        hidden_sizes: [512, 512]
        conditioned_sigma: true
        lr: 3.0e-4
        eps_clip: 0.2
        recompute_advantage: true
        gae_lambda: 0.95
        max_grad_norm: 0.5
        vf_coef: 1
        ent_coef: 0.3
        rew_norm: true
        repeat_per_collect: 3
        norm_form: 'none'
        acti_form: 'relu'

    fpg:
        hidden_sizes: [512, 512]
        conditioned_sigma: true
        lr: 3.0e-4
        actor_lr: 1.0e-4
        eps_clip: 0.2
        max_grad_norm: 0.5
        ent_coef: 0.3
        rew_norm: true
        repeat_per_collect: 3
        norm_form: 'none'
        acti_form: 'relu'

