# Transformer model configuration.
# Names the model family (`type`) and the concrete architecture
# (`architecture`).
# NOTE(review): how these two strings are dispatched is not visible in this
# file — presumably looked up by the training code; confirm.
model:
  type: transformer
  architecture: TransformerForDiffusion
  
# Transformer-specific parameters.
transformer:
  # Sizes referenced from keys under `env.dataset` via `${...}` interpolation.
  # `input_dim` and `output_dim` both point at `action_dim`.
  # Quoted so YAML always reads each reference as one string scalar.
  # NOTE(review): presumably resolved by an OmegaConf/Hydra-style loader —
  # confirm.
  input_dim: '${env.dataset.action_dim}'
  output_dim: '${env.dataset.action_dim}'
  horizon: '${env.dataset.pred_horizon}'
  cond_dim: '${env.dataset.vision_feature_dim}'

  # Fixed literal settings (not derived from the environment config).
  hidden_dim: 512
  num_layers: 6
  num_heads: 8
  dropout: 0.1
  
# Conditioning encoder settings.
conditioning:
  # Intended to turn the vision encoder on only when `env.name` is one of the
  # listed environment names (currently just 'pusht').
  # NOTE(review): the body of this `${...}` is an expression (`in [...]`), not
  # a plain key path. Stock OmegaConf/Hydra interpolation does not evaluate
  # such expressions, so this only works if the loader adds expression
  # support — confirm, or expose a boolean in the env config and reference it
  # by key instead.
  use_vision_encoder: ${env.name in ['pusht']}
  # Identifier of the vision backbone to use.
  vision_encoder_type: resnet18
  # NOTE(review): presumably replaces BatchNorm layers with GroupNorm in the
  # vision encoder — inferred from the key name only; confirm in the consumer.
  replace_bn_with_gn: true
