# Parent config this file builds on. Presumably the keys below override the
# values inherited from it (detectron2-style `_BASE_`) — confirm with the loader.
_BASE_: ../maskformer2_R50_bs16_50ep.yaml
# Output location for this run; a relative path, presumably resolved against
# the working directory of the launching process — TODO confirm.
OUTPUT_DIR: ./output/fcclip_convnext_large_eval_cityscapes

# Test-time input size settings.
INPUT:
  # Presumably the target length of the shorter image side at test time — TODO confirm.
  MIN_SIZE_TEST: 1024
  # Presumably the upper bound on the longer image side at test time — TODO confirm.
  MAX_SIZE_TEST: 2560

# Model definition: architecture name, prompt-tuning switches, segmentation
# head, backbone, checkpoint, input normalization, FC-CLIP and MaskFormer options.
MODEL:
  # NOTE(review): spelled "FCCLIPrompt" (single "P"), whereas the head below is
  # "FCCLIPHeadPrompt" — confirm this matches the name the architecture is
  # registered under.
  META_ARCHITECTURE: "FCCLIPrompt"
  # Prompt-tuning switches. Their exact effect is defined by the consuming
  # code, which is not visible from this file.
  PROMPT_TUNING:
    L2_REG: False
    ANALYSIS: False
    ADDITIONAL_PROMPT: False
    TASK_ARITHMETIC: False
    # Presumably only consulted when TASK_ARITHMETIC is True — TODO confirm.
    TASK_ARITHMETIC_LAMBDA: 0.25
    ALL_TRAIN: True
    # Names of parameters to freeze; presumably matched against model
    # parameter names — verify, and confirm how this interacts with ALL_TRAIN.
    FREEZE_PARAM_NAMES: ['mask_embed', 'class_embed', 'decoder_norm', '_mask_pooling_proj']
    # Apparently an alternative FREEZE_PARAM_NAMES value, kept for reference:
    # ['mask_embed', 'class_embed', 'query_feat', 'query_embed']
    # Same value as MASK_FORMER.NUM_OBJECT_QUERIES below; presumably the two
    # must be kept in sync — TODO confirm.
    NUM_QUERIES: 250
  SEM_SEG_HEAD:
    NAME: "FCCLIPHeadPrompt"
    # 19 equals the number of Cityscapes evaluation classes, which is
    # presumably why it is set here — TODO confirm.
    NUM_CLASSES: 19
  # Backbone selection.
  BACKBONE:
    NAME: "CLIP"
  # Checkpoint path (relative). The file name suggests FC-CLIP weights trained
  # on COCO panoptic — presumably; verify the checkpoint's provenance.
  WEIGHTS: "./fcclip_cocopan.pth"
  # Per-channel normalization constants on a 0-255 scale:
  #   mean = 255 * (0.48145466, 0.4578275, 0.40821073)
  #   std  = 255 * (0.26862954, 0.26130258, 0.27577711)
  # These 0-1 values look like the standard CLIP image mean/std; channel order
  # is assumed to be RGB — TODO confirm.
  PIXEL_MEAN: [122.7709383, 116.7460125, 104.09373615]
  PIXEL_STD: [68.5005327, 66.6321579, 70.32316305]
  FC_CLIP:
    # Model name and pretrained-weights tag; they look like open_clip
    # identifiers — verify against the code that loads the CLIP model.
    CLIP_MODEL_NAME: "convnext_large_d_320"
    CLIP_PRETRAINED_WEIGHTS: "laion2b_s29b_b131k_ft_soup"
    # Presumably the CLIP embedding width for the model above — TODO confirm.
    EMBED_DIM: 768
    # Presumably the ensemble weights applied to in-vocabulary (ALPHA) and
    # out-of-vocabulary (BETA) classes, as in FC-CLIP — TODO confirm.
    GEOMETRIC_ENSEMBLE_ALPHA: 0.4
    GEOMETRIC_ENSEMBLE_BETA: 0.8
  MASK_FORMER:
    TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoderPrompt"
    # Same value as PROMPT_TUNING.NUM_QUERIES above.
    NUM_OBJECT_QUERIES: 250
    # Test-time switches; the semantic, instance and panoptic flags are all
    # enabled.
    TEST:
      SEMANTIC_ON: True
      INSTANCE_ON: True
      PANOPTIC_ON: True
      # Presumably a score cutoff for keeping predicted masks; at 0.0 it would
      # filter nothing with a non-negative score — TODO confirm.
      OBJECT_MASK_THRESHOLD: 0.0

# Dataset names for training and testing; presumably resolved through a
# dataset registry in the consuming code — TODO confirm.
DATASETS:
  # Both values are Python-style one-element tuples. To YAML they are plain
  # strings; presumably the config loader evaluates them into tuples, so keep
  # the parentheses and the trailing comma exactly as written.
  TRAIN: ("openvocab_cityscapes_fine_panoptic_train",)
  TEST: ("openvocab_cityscapes_fine_panoptic_val",)
  # Apparently an alternative dataset name, kept for reference (disabled):
  # "openvocab_coco_2017_val_panoptic_with_sem_seg"
