{
    "MODEL_CONFIG": {
        "custom_model": "actor_critic",
        "custom_model_config": {
            "input_conv_channels": 3,
            "critic_share_layers": False,
            "conv_filters": null,
            "actor_layer_sizes": [[39, 400], [400, 400], [400, 324]],
            "critic_layer_sizes": [[39, 400], [400, 400], [400, 1]],
            "action_masking": True,
            "discretize_size": 81,
            "discretize_mode": "linear"
        },
    },

    # Environment configuration
    "ENV_CONFIG": {
        "advice_mode": "", # one of: "", "aa", "eaa", "fixed"
        # NOTE(review): looks like a Meta-World benchmark/task identifier - confirm against the env factory.
        "config": "ml10_reach-v2",
        # NOTE(review): scale 1 and loc 0 presumably leave the reward unchanged - confirm how they are applied.
        "reward_scale": 1,
        "reward_loc": 0,
        "base_penalty": 0.9,
        # NOTE(review): presumably the per-episode step limit - confirm.
        "max_steps": 500,
    },

    # Base configuration, including algorithm parameters
    "BASE_CONFIG": {
        "env": "metaworld",
        "alg": "ppo",
        # NOTE(review): presumably a device (e.g. GPU) index - confirm against the trainer.
        "device": 0,
        "lr": 0.00003,
        "gamma": 0.99,
        "lambda": 0.95,
        "kl_coeff": 0,
        "kl_target": -1, # use -1 to denote non-adaptive kl
        "clip_param": 0.2,
        "vf_loss_fn": "smooth_l1",
        "vf_clip_param": 10.0,
        "vf_loss_coeff": 1,
        "entropy_coeff": 0.00,
        # train_batch_size is exactly two sgd_minibatch_size chunks (256 = 2 * 128).
        "train_batch_size": 256,
        "sgd_minibatch_size": 128,
        "num_sgd_iter": 2,
        "num_parallel_envs": 8,
        "norm_adv": False,
        # Double-quoted, like every other string value in this file.
        "lr_scheduler": "constant",
    },

    # Hyperparameter optimization (HPO) search space.
    # null means no search space is configured. Write null, not None: this file
    # already uses null for "conv_filters", and a YAML-style loader reads a bare
    # None as the string "None" rather than as a null value.
    "HPO_CONFIG": null,
    # Example search space (commented out). To use it, replace the null above with
    # a mapping like this one. Each value is a string holding a tune.grid_search(...)
    # expression. NOTE(review): presumably the consumer evaluates these strings - confirm.
    # {
    #     "lr": "tune.grid_search([0.001, 0.0005, 0.0001])",
    #     "lambda": "tune.grid_search([0.8, 0.9, 0.99])",
    #     "kl_coeff": "tune.grid_search([0.1, 0.5, 0.9])",
    #     "clip_param": "tune.grid_search([0.1, 0.2, 0.3])",
    #     "vf_loss_coeff": "tune.grid_search([0.5, 0.75, 1.0])",
    #     "entropy_coeff": "tune.grid_search([0.1, 0.01, 0.0])",
    #     "num_sgd_iter": "tune.grid_search([3, 10, 20])",
    # }
}