# environment and broad experiment params

# Gym environment id. Other ids noted for this key: 'HalfCheetah-v5', 'Reacher-v5'.
env_id: 'Ant-v5'

# Initial seed.
seed: 42

# Number of seeds to run (noted range: 3-30).
N_experiments: 2

# Online iterations per seed (noted range: 2-15).
N_iterations: 3

# Episode length. Noted value for halfcheetah: 100.
episode_length: 100

# Which trajectory embedding to use.
# Options: avg_sa, avg_s, last_s, actionenergy, psm, reacher_perf, halfcheetah_xpos
embedding_name: 'avg_sa'

# Number of offline trajectories. Noted value for halfcheetah: 10
# (with more, BC solves it).
N_offline_trajs: 30

# If true, generates new offline trajs even if they already exist.
fresh_offline_trajs: false

# Only used for HalfCheetah. Default is 0.1.
initial_pos_noise: null
# offline learning

# NOTE(review): undocumented here; presumably the number of confset_base
# policies (N_confset_dilution is the number added on top) -- TODO confirm.
N_confset_size: 20

# What the candidate set is made of:
#   'bcnoise'  - BC, +noise
#   'bignoise' - BC+10x noise, w/o BC
#   'random'   - random policies
confset_base: 'bcnoise'

# What gets added to the confset_base to augment the candidate set:
#   'None'     - nothing added
#   'random'   - add random policies
#   'bignoise' - add BC+10x noise policies
confset_dilution: 'bignoise'

# Number of policies to add to the confset_base.
N_confset_dilution: 80

# Noise added to the BC policy to generate the confset. If not provided, it is
# filled in with environment defaults: Reacher: 0.05, HalfCheetah: ???
confset_noise: 0.01

# NOTE(review): undocumented here; presumably the number of BC training
# epochs -- TODO confirm.
n_bc_epochs: 100

# BC loss. Options: 'mse', 'log-loss' (tabular BRIDGE).
bc_loss: 'log-loss'

# NOTE(review): undocumented here; presumably toggles printing of evaluations
# during BC -- TODO confirm.
bc_print_evals: true

# Radius for filtering the offline confset:
#   L2(embed(π_BC) - embed(π_candidate)) < radius
# If unspecified, hardcoded defaults per embedding are used.
radius: 0.3

# Adds the expert to the candidate set.
expert_in_candidates: true

# Adds the expert to the confset (= potentially filtered candidates).
expert_in_confset: false

# Adds the expert to the chosen eval space.
expert_in_eval: false

# Which policy set is used for evaluation:
#   'candidates' - BC+noise and maybe expert
#   'pi_zero'    - offline confset = maybe_filter_at_0(candidates)
#   'pi_t'       - online confset = maybe_filter_at_t(candidates); only for BRIDGE
# These are all the same for the baseline.
which_eval_space: 'pi_zero'

# NOTE(review): undocumented here; presumably forces recomputation of
# embeddings, analogous to fresh_offline_trajs -- TODO confirm.
fresh_embeddings: false
# online learning

# How many trajectories to sample & annotate per online loop.
N_rollouts: 1

# If true, filters the online confset according to
#   {pi in Pi_0 s.t. for all other pi': Δϕᵀw + γ * sqrt(Δϕᵀ V_inv Δϕ) >= 0}
# Alias: 'filter_online'.
filter_pi_t_yesno: false

# Used only if filter_pi_t_yesno is true. Alias: 'filter_gamma'.
filter_pi_t_gamma: 1

# NOTE(review): undocumented in this file -- meaning to be confirmed.
gamma_debug_mode: false

# NOTE(review): undocumented in this file -- meaning to be confirmed.
W: 10

# Training function for w. Options: 'rebuttals' (no batching), 'mle' (tabular BRIDGE).
w_trainfunc: 'mle'

# Regularization for w. Options: null, 'l2' (tabular BRIDGE).
w_regularization: null

# Epochs for training w. Noted values: 100, 10 (tabular BRIDGE).
w_epochs: 100

# Initialization of w. Options: 'zeros', 'uniform' (tabular BRIDGE), 'random'.
w_initialization: 'uniform'

# Sigmoid slope. Noted values: 1, 10 (tabular BRIDGE).
w_sigmoid_slope: 1

# NOTE(review): the next two switches are undocumented in this file --
# meaning to be confirmed.
project_w: false
retrain_w_from_scratch: false

# How policy pairs are selected. Options: 'ucb', 'random', 'max_uncertainty'.
which_policy_selection: 'random'

# Used when selecting policy pairs with UCB. Formula:
#   ucb_score = σ(Δϕᵀ w) + ucb_beta * sqrt(Δϕᵀ V_inv Δϕ)
ucb_beta: 1

# Initialization of V. Options: 'small', 'bounds' (BRIDGE).
V_init: 'small'

# NOTE(review): undocumented in this file -- meaning to be confirmed.
n_embedding_samples: 50
# policy model params

# Policy hidden dim. For now this isn't used; instead the SB3 expert's value is
# used, hardcoded via RLZoo hparams in training_cfg. That is reacher: 64,
# halfcheetah: 256 (all 2-layers). SB3 would default to 64 x2.
hidden_dim: null
# verbosity, run control, saving and plotting

# List: either [] or any combination of 'full', 'loop-summary', 'radius-calc',
# 'offline-confset', 'online-confset', 'warnings', 'losses'.
verbose: []

# NOTE(review): the next three switches are undocumented in this file --
# meaning to be confirmed.
run_baseline: true
run_bridge: true
save_results: true

# Options: null (creates unique 3-digit ID), or a string. If a string, checks
# whether the dir exists: if yes, loads and does what is specified in
# 'loaded_run_behaviour'; if no, runs a new experiment.
run_ID: 'debug'

# What to do when run_ID points at an existing dir:
#   null        - defaults to continue
#   'continue'  - load metrics, sim what's missing, re-plot
#   'redo'      - load params, re-sim, re-plot
#   'overwrite' - don't load anything, write to dir with current params
loaded_run_behaviour: 'overwrite'

# List containing 'suboptimality_percent' or 'regret' or 'cumulative_regret'
# or 'raw_reward'.
which_plot_subopt:
  - 'cumulative_regret'

# 'baseline' or 'bridge' for single runs.
baseline_or_bridge: null

# NOTE(review): undocumented in this file -- meaning to be confirmed.
plot_scores: false

# Exclude runs based on cumulative_regret_T outliers:
#   'worst_{bcexpertdist, cumregret}'  - exclude the worst run
#   '95conf_{bcexpertdist, cumregret}' - exclude runs outside the 95% conf
#                                        estimate (mean+1.96*std)
exclude_outliers: false
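# NOTE(review): the notes below may be stale for this file -- they mention 2 rollouts and
# embedding 'reacher_perf', but N_rollouts is 1 and embedding_name is avg_sa here. Confirm.
# difference to default: 2 rollouts; embedding 'reacher_perf' with epsilon adjusted for that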

# |Pi_0|: [70,76]
# expert R: -4.4
# BC quality: just a bit too good, in the range [-4.75, -4.4].

# compared to rebuttal:
# - both BRIDGE and baseline have 1/2 regret
# - BRIDGE a bit better than baseline

# but: still not learning well online; regret doesn't converge.
# scores don't get properly learned; it just looks linear.
