# Configuration for `models.KeyWorld`.
# NOTE(review): `_target_` follows the Hydra instantiate convention, so the
# remaining keys are presumably passed as keyword arguments to that class;
# confirm against the code that loads this file.
_target_: models.KeyWorld

# CLIP backbone settings.
# Alternative architecture noted by the original author: 'ViT-L-14'.
clip_arch: 'ViT-B-32'
# Explicit null; this was previously a bare empty value, which YAML also
# parses as null, so the loaded value is unchanged.
# NOTE(review): how an unset path is handled is decided in models.KeyWorld.
clip_path: null
# Alternative values noted by the original author: 1024 for the visual dim
# and 768 for the text dim. Presumably those pair with 'ViT-L-14', while
# 768/512 pair with 'ViT-B-32'; TODO confirm before switching clip_arch.
clip_visual_dim: 768
clip_text_dim: 512

# Remaining model hyperparameters; their exact meaning is defined by
# models.KeyWorld, which is not visible from this file.
state_num: 3
num_actions: 8
action_bins: 256
scene_transformer_depth: 9
action_transformer_depth: 3
embed_dim: 512
dropout: 0.1
