# run
# Identifier for this run configuration.
# NOTE(review): presumably selects the task/experiment variant — confirm
# against the code that loads this file.
setting: 'utr'

# wandb
# Experiment-logging settings (mode, project name, owning entity).
wandb_mode: 'online'   # one of: 'online', 'disabled'
wandb_project: 'sub_utr'
wandb_entity: 'yhangchen'


# model learning
# Learning rates, clipping bounds, and the model/guide variants to train.
# NOTE(review): lr_z / lr_policy are presumably separate learning rates for
# the log-Z estimate and the policy network — confirm against the trainer.
lr_z: 1.0e-2
lr_policy: 1.0e-4
clip_policy_logit_min: -50.0
clip_policy_logit_max: 50.0
clip_grad_norm: 10.0
model: 'tb'  # choices: ['sub', 'maxent', 'random', 'tb']
guide: 'substructure'  # choices: ['substructure', 'uniform']
init_logz: true
# CAREFUL: 'guide_scores_back_policy_traj' can have high variance in
# training losses, because trajectories sampled under the back policy can
# have extremely low logp under the guide.
# choices: ['guide_scores_back_policy_traj', 'guide_resamples_traj']
offline_style: 'guide_scores_back_policy_traj'
offline_select: 'prt'

# NOTE(review): presumably chooses between the sa_* and ssr_* architecture
# settings in this file — confirm against the model code.
sa_or_ssr: 'ssr'

# model architecture
# inp_dim is specified by mdp
# Two groups of size settings: ssr_* (encoder, embedding, scorer) and sa_*.
# NOTE(review): "hid_dim" / "n_layers" are read as hidden width and layer
# count from the key names only — confirm against the network definitions.
ssr_encoder_hid_dim: 128
ssr_encoder_n_layers: 1
ssr_embed_dim: 128
ssr_scorer_hid_dim: 128
ssr_scorer_n_layers: 1
sa_hid_dim: 128
sa_n_layers: 2

# trainer
# Training schedule: number of rounds, samples per online/offline batch,
# steps per batch, and online/offline batches per round.
# NOTE(review): the descriptions above are read from the key names only —
# confirm against the training loop.
num_active_learning_rounds: 50000
num_samples_per_online_batch: 16
num_samples_per_offline_batch: 16

num_steps_per_batch: 3
num_online_batches_per_round: 1
num_offline_batches_per_round: 1
# NOTE(review): presumably a mixing weight in [0, 1] applied to the back
# policy when building the training target — verify.
target_mix_backpolicy_weight: 0.25

# policy
# NOTE(review): presumably the probability of taking an exploratory
# (non-policy) action while sampling — confirm against the sampler.
explore_epsilon: 0.01

# guide
# Whether guide computation is parallelized, and with how many workers.
# NOTE(review): num_guide_workers presumably only matters when parallelize
# is true — confirm against the guide implementation.
parallelize: true
num_guide_workers: 16

# experiment
# Replicate index, output directory for saved models, and save interval
# (per the key name, counted in active-learning rounds).
# NOTE(review): whether replicate also seeds the RNG is not visible here.
replicate: 0
saved_models_dir: 'saved_models/utr/'
save_every_x_active_rounds: 100

# monitor
# Monitoring sample count and the intervals for the "fast" and "slow"
# monitoring passes.
# NOTE(review): the unit of the *_every intervals (rounds vs. steps) is not
# stated here — confirm against the monitor code.
monitor_num_samples: 128
monitor_fast_every: 10
monitor_slow_every: 200

# reward
# Reward shaping: temperatures, the maximum used for scaling, and the
# minimum below which rewards are dropped (per the key names).
# NOTE(review): train_inverse_temperature, guide_sampling_temperature and
# reward_temp all share the value 3; whether they must stay in sync is not
# visible here — confirm.
train_inverse_temperature: 3.0
scale_reward_max: 1
# Keep the explicit mantissa point: YAML 1.1 loaders such as PyYAML resolve
# a bare `1e-3` as the string '1e-3', not as a float.
drop_reward_min: 1.0e-3
guide_sampling_temperature: 3.0
reward_temp: 3

# experiment-specific
# MDP variant to use.
mdp_style: 'pa'  # choices: ['insert', 'pa']

# checkpointing
# Checkpoint selector and number of samples drawn at evaluation.
# NOTE(review): -1 looks like a sentinel (e.g. "latest" or "none"); its
# exact meaning is not visible here — confirm against the loading code.
ckpt: -1
eval_num_samples: 128

# mcmc
# Toggle for the MCMC option.
mcmc: false
# NOTE(review): `k` is undocumented; presumably an MCMC parameter given its
# placement in this section — confirm and consider a more descriptive name.
k: 4