# Model configuration written in Hydra/OmegaConf style: `_target_` values are
# dotted import paths and `${...}` values are interpolations of other keys.
# NOTE(review): presumably consumed by `hydra.utils.instantiate` - confirm
# against the loading code.
models:
  
  _target_: main_model.GroupingImgGumbel
  
  # Conditioning module. `object_dim` carries the same value (128) as
  # `perceptual_grouping.object_dim` and `object_decoder.object_dim`.
  # NOTE(review): presumably these three must stay equal - confirm.
  conditioning:
    _target_: conditioning_clean.RandomConditioning
    # Sizes.
    object_dim: 128
    n_slots: 24
    # Both learn_* switches are enabled.
    learn_std: true
    learn_mean: true
  
  # Feature extractor configured with a pretrained, frozen timm model name.
  # NOTE(review): this is the only `_target_` in the file without a module
  # prefix (compare `main_model.`, `conditioning_clean.`, `slot_attention.`).
  # Hydra resolves `_target_` as a dotted import path, so verify that this
  # bare name is resolvable by whatever loads the config.
  feature_extractor:
    _target_: TimmFeatureExtractor
    model_name: vit_base_patch16_224_dino
    # Weights are loaded pretrained and kept frozen.
    pretrained: true
    freeze: true
    # NOTE(review): presumably selects which backbone block's output is
    # used - confirm against the extractor implementation.
    feature_level: 12
  
  # Grouping module plus the three sub-networks it is constructed with.
  perceptual_grouping:
    _target_: slot_attention.SlotAttentionGroupingGumbel
    # Dimensions. `input_dim` and `feature_dim` are the values that the
    # relative interpolations under `positional_embedding` resolve to.
    # `object_decoder.output_dim` also reads `input_dim` from here.
    # NOTE(review): 768 presumably matches the backbone feature width.
    input_dim: 768
    feature_dim: 128
    object_dim: 128
    kvq_dim: null
    # Scalar hyper-parameters.
    n_heads: 1
    iters: 3
    eps: 1.0e-08
    low_bound: 0
    # Optional behaviours, all switched off in this configuration.
    use_projection_bias: false
    use_implicit_differentiation: false
    use_empty_slot_for_masked_slots: false
    # Sequential wrapper built from two positional arguments, in this order.
    positional_embedding:
      _target_: neural_networks_clean.wrappers.Sequential
      _args_:
      - _target_: neural_networks_clean.positional_embedding.DummyPositionEmbed
      # `${....key}` climbs from this mapping through `_args_` and
      # `positional_embedding` up to `perceptual_grouping.key`, so the MLP
      # below is configured as 768 -> 768 -> 128 with the values above.
      - _target_: neural_networks_clean.build_two_layer_mlp
        input_dim: ${....input_dim}
        hidden_dim: ${....input_dim}
        output_dim: ${....feature_dim}
        initial_layer_norm: true
    # Residual two-layer MLP, 128 -> 512 -> 128.
    ff_mlp:
      _target_: neural_networks_clean.build_two_layer_mlp
      input_dim: 128
      hidden_dim: 512
      output_dim: 128
      initial_layer_norm: true
      residual: true
    # Non-residual two-layer MLP, 128 -> 512 -> 2.
    # NOTE(review): the 2 outputs are presumably keep/drop logits per slot -
    # confirm against the grouping implementation.
    single_gumbel_score_network:
      _target_: neural_networks_clean.build_two_layer_mlp
      input_dim: 128
      hidden_dim: 512
      output_dim: 2
      initial_layer_norm: true
      residual: false
  
  # Decoder module. Both of its dimensions are now derived from
  # `models.perceptual_grouping`, so changing a dimension there can no
  # longer leave the decoder out of sync.
  object_decoder:
    _target_: decoding_clean.PatchDecoderGumbel
    # Previously a hard-coded 128; resolves to the same value today.
    object_dim: ${models.perceptual_grouping.object_dim}
    output_dim: ${models.perceptual_grouping.input_dim}
    # NOTE(review): 196 = 14 x 14, presumably the patch grid of a 224 px
    # image at patch size 16 (see `feature_extractor.model_name`) - confirm.
    num_patches: 196
    # `_partial_: true` defers the call to `build_mlp`.
    # NOTE(review): presumably the decoder supplies the remaining
    # arguments when it builds the network - confirm.
    decoder:
      _target_: neural_networks_clean.convenience.build_mlp
      _partial_: true
      # Three layers of width 1024 with ReLU, no final activation.
      features:
      - 1024
      - 1024
      - 1024
      activation_fn: relu
      final_activation_fn: null
      initial_layer_norm: false
      residual: false
    mask_type: mask
