# Configuration for the "ZeroShotMaskFormer" meta-architecture.
# NOTE(review): keys not set here presumably come from the file named by
# _BASE_ — confirm against the config loader.
_BASE_: maskformer_R50_bs32_60k.yaml
MODEL:
  META_ARCHITECTURE: "ZeroShotMaskFormer"
  # Settings for the semantic segmentation head.
  SEM_SEG_HEAD:
    NAME: "ZeroShotMaskFormerHead"
    # NOTE(review): presumably must match the number of classes in
    # DATASETS.TRAIN below — confirm.
    NUM_CLASSES: 171
    # NOTE(review): same value as CLIP_ADAPTER.PROMPT_DIM (512); presumably the
    # two are expected to agree — confirm before changing either.
    EMBEDDING_DIM: 512
    EMBED_LAYERS: 2
  # Settings for the CLIP adapter.
  CLIP_ADAPTER:
    PROMPT_LEARNER: "learnable"
    # Options for the learnable prompt selected by PROMPT_LEARNER above.
    PROMPT_DIM: 512
    # Unquoted, so YAML reads this as the plain string "(16, 0)".
    # NOTE(review): the consumer presumably converts it to a tuple — keep this
    # exact form and confirm.
    PROMPT_SHAPE: (16, 0)
    CLIP_MODEL_NAME: "ViT-B/16"
    # Mask-related options.
    # NOTE(review): presumably control how masked regions are prepared before
    # being passed to CLIP — confirm against the adapter implementation.
    MASK_FILL: "mean"
    MASK_EXPAND_RATIO: 1.0
    MASK_THR: 0.5
    MASK_MATTING: False
    REGION_RESIZED: True
    # NOTE(review): CLIP_ENSEMBLE_WEIGHT presumably only takes effect when
    # CLIP_ENSEMBLE is True — confirm.
    CLIP_ENSEMBLE: True
    CLIP_ENSEMBLE_WEIGHT: 0.8
DATASETS:
  # Unquoted, so YAML reads this as a plain string spelled like a one-element
  # Python tuple; the trailing comma is part of that spelling.
  # NOTE(review): presumably parsed into a tuple of dataset names — confirm.
  TRAIN: ("coco_2017_train_stuff_sem_seg",)
