# Default Gridworld configuration
# This is loaded when `-env gridworld` is specified

# environment and broad experiment params
env: Gridworld
N_experiments: 2  # reduced for quick testing
N_iterations: 3  # reduced for quick testing
episode_length: 10  # 8 for StarMDP, 10 for Gridworld
env_move_prob: 0.8  # 0.7 for StarMDP, 0.8 for Gridworld
phi_name: state_counts  # options: state_counts, id_short, id_long, final_state
do_offline_BC: true
# 2 for StarMDP, 10 for Gridworld (with more offline trajectories, BC already solves it)
N_offline_trajs: 10

# offline learning
delta_offline: 0.05
# Used for offline confset construction, as one of:
#   - the noise-matrices method,
#   - random generation in online_learning if no BC,
#   - the size of the initial sample for rejection-sampling-from-sample.
N_confset_size: 100
# options: noise-matrices, rejection-sampling-from-all, rejection-sampling-from-sample
which_confset_construction_method: rejection-sampling-from-sample
which_hellinger_calc: exact  # options: exact (bhattacharyya), approx (local-avg)
n_transition_model_epochs_offline: 5
# options:
#   full
#   ignore_bracket
#   only_alpha             (formerly ignore_beta_in_confset_radius)
#   hardcode_radius_scaled
#   hardcode_radius        (formerly done by providing a float value to override_offlineradius)
offlineradius_formula: hardcode_radius
# NOTE(review): presumably the radius used by the hardcode_radius* formulas -- TODO confirm
offlineradius_override_value: 0.96
# in the offline confset, replace pi_MLE with pi_true
replace_mle_with_optimal_policy_in_offline_confset: true

# online learning
N_rollouts: 10
delta_online: 0.05
# NOTE(review): meaning of W is not documented in this file -- TODO describe
W: 1
w_MLE_epochs: 10
w_initialization: uniform
w_sigmoid_slope: 10
xi_formula: smaller_start  # options: full, smaller_start (formerly true override)
n_transition_model_epochs_online: 5
# whether to recalculate phi or use precomputed values
# NOTE(review): presumably false = use precomputed values -- TODO confirm
online_confset_recalc_phi: false
# 0.01 for StarMDP, 0.008 for Gridworld. Leave at 1 for 'no multiplier'.
# (formerly multiply_bonus_inside_online_confset)
online_confset_bonus_multiplier: 0.008
use_true_T_in_online: false
gamma_t_hardcoded_value: 0.15  # 0.2 for StarMDP, 0.15 for Gridworld (formerly override_gamma_t)
# options:
#   "random_sample"  - of size N_confset_size
#   "all_policies"
#   "augmented_ball" - BRIDGE's ball augmented with random policies up to size N_confset_size
baseline_search_space: augmented_ball

# verbosity
# A list: either [] or any combination of
#   'full', 'loop-summary', 'radius-calc', 'offline-confset',
#   'online-confset', 'warnings', 'losses', 'online'
verbose: []

# which experiments to run
run_bridge: true  # run the BRIDGE experiment
run_baseline: true  # run the baseline experiment (see baseline_search_space)

# saving
save_results: true
# options: None (creates a unique 3-digit ID), or a string.
# If a string, checks whether the dir exists:
#   - if yes, loads it and does what is specified in 'loaded_run_behaviour';
#   - if no, runs a new experiment.
# NOTE(review): a bare `None` in YAML parses as the string "None", not null --
# confirm how the loader expects "no ID" to be written.
run_ID: debug
# options:
#   "continue"  - load metrics, simulate what's missing, re-plot
#   "redo"      - load params, re-simulate, re-plot
#   "overwrite" - don't load anything, write to dir with current params
loaded_run_behaviour: overwrite

# plotting
which_plot_subopt: cumulative_regret  # options: "suboptimality_percent", "regret", "cumulative_regret"