# run
# NOTE(review): presumably names the task/benchmark the consumer should load;
# valid values are not listed in this file — confirm against the loader.
setting: 'tfbind10'

# wandb
# Logging options; the key names suggest Weights & Biases — confirm in the
# code that initialises the logger.
wandb_mode: 'disabled'   # choices noted by the author: 'online', 'disabled'
wandb_project: 'sub_tfbind10'
# NOTE(review): looks like a personal account name — confirm before reusing
# this config under a different account.
wandb_entity: 'yhangchen'



# NOTE(review): the meaning of this flag is not documented in this file —
# confirm against the consumer. Written as canonical lowercase `false` so it
# is a boolean under both YAML 1.1 and 1.2 and matches `mcmc: false`.
ordering: false

# model learning
# Learning rates. NOTE(review): presumably one per parameter group
# (z, policy, critic, logF) — confirm against the optimiser setup.
lr_z: 1.0e-2
lr_policy: 1.0e-4
lr_critic: 1.0e-4
lr_logF: 1.0e-4
# Symmetric bounds [-50, 50] for policy-logit clipping.
clip_policy_logit_min: -50.0
clip_policy_logit_max: 50.0
clip_grad_norm: 10.0
clip_param: 0.2
entropy_coef: 0.01
sql_alpha: 0.01
# NOTE(review): spelled `lamda` (sic). The consumer presumably reads this
# exact key, so do not rename it here without updating the reader.
lamda: 0.9
model: 'sub'  # choices=['sub', 'maxent', 'random', 'tb']
guide: 'substructure'  # choices=['substructure', 'uniform']
# Canonical lowercase boolean, consistent with `mcmc: false`.
init_logz: true
# CAREFUL - guide_scores_back_policy_traj can have high variance in
# training losses because trajectories sampled under the back policy can have
# extremely low logp under the guide.
# choices=['guide_scores_back_policy_traj', 'guide_resamples_traj']
offline_style: 'guide_scores_back_policy_traj'
# NOTE(review): valid values for offline_select are not listed in this file —
# confirm against the consumer.
offline_select: 'prt'

# Selects between 'sa' and 'ssr'. NOTE(review): presumably decides whether
# the sa_* or the ssr_* architecture keys in this file are used — confirm.
sa_or_ssr: 'ssr'

# model architecture
# inp_dim is specified by mdp
# Keys prefixed ssr_ configure an encoder and a scorer; keys prefixed sa_
# configure a separate network. NOTE(review): presumably only the family
# chosen by `sa_or_ssr` is read — confirm.
ssr_encoder_hid_dim: 512
ssr_encoder_n_layers: 1
ssr_embed_dim: 512
ssr_scorer_hid_dim: 512
ssr_scorer_n_layers: 2
sa_hid_dim: 512
sa_n_layers: 3

# trainer
# Round count and per-batch sample counts for the online and offline batches.
num_active_learning_rounds: 20000
num_samples_per_online_batch: 16
num_samples_per_offline_batch: 16

# Steps per batch and number of online/offline batches in each round.
num_steps_per_batch: 1
num_online_batches_per_round: 1
num_offline_batches_per_round: 1
# NOTE(review): presumably the weight given to the back policy when forming
# a mixed target (0.25 here) — confirm against the trainer.
target_mix_backpolicy_weight: 0.25

# policy
# NOTE(review): presumably the probability of taking an exploratory action
# instead of following the policy — confirm against the sampler.
explore_epsilon: 0.01

# guide
# NOTE(review): presumably guide computation is spread over
# `num_guide_workers` workers when `parallelize` is true — confirm.
# Canonical lowercase boolean, consistent with `mcmc: false`.
parallelize: true
num_guide_workers: 30

# experiment
# NOTE(review): `replicate` is presumably a run/replicate index — confirm.
replicate: 0
# Relative directory path (trailing slash included) for saved models.
saved_models_dir: 'saved_models/'
# Checkpoint interval, counted in active-learning rounds per the key name.
save_every_x_active_rounds: 5000

# monitor
# Two monitoring cadences, "fast" (every 10) and "slow" (every 200).
# NOTE(review): the unit is presumably active-learning rounds — confirm.
monitor_num_samples: 128
monitor_fast_every: 10
monitor_slow_every: 200

# reward
train_inverse_temperature: 3.0
# Upper and lower bounds used when scaling rewards.
scale_reward_max: 10
scale_reward_min: 0.001
# Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
# resolve a bare `1e-3` as the string '1e-3' rather than a float.
drop_reward_min: 1.0e-3
guide_sampling_temperature: 3.0
reward_exp: 3

# experiment-specific
mdp_style: 'pa'  # choices=['insert', 'pa']

# mcmc
# MCMC is switched off in this config.
mcmc: false
# NOTE(review): `k` is undocumented here; presumably an MCMC parameter that
# is only read when `mcmc` is true — confirm against the consumer.
k: 4

# reproduction
# Seed value. NOTE(review): which RNGs it seeds is not visible here — confirm.
seed: 0
