# Parent config, given relative to this file. NOTE(review): keys set in this
# file presumably override the parent's values (detectron2-style inheritance)
# — confirm with the config loader.
_BASE_: ../maskformer2_R50_bs16_50ep.yaml
# Output directory for this run; the name indicates an evaluation run of the
# ConvNeXt-Large FC-CLIP model on Cityscapes ("point" variant).
OUTPUT_DIR: ./output/fcclip_convnext_large_eval_cityscapes_point

# Input pipeline settings.
INPUT:
  # Test-time image size bounds. NOTE(review): presumably shortest-side target
  # and longest-side cap in pixels, as in detectron2 — confirm.
  MIN_SIZE_TEST: 1024
  MAX_SIZE_TEST: 2560
  # Name of the dataset mapper to use; it is resolved by project code that is
  # not part of this file.
  DATASET_MAPPER_NAME: "custom_panoptic"
  # Project-specific SAM mask options; both switches are off in this config.
  # NOTE(review): semantics are not visible here — presumably STORAGE toggles
  # reading/writing masks under STORAGE_PATH and DIRECT toggles using SAM
  # output directly; verify against the dataset mapper.
  SAM:
    STORAGE: False
    STORAGE_PATH: "./datasets/cityscapes/sam_mask"
    DIRECT: False

# Model definition and inference-time behavior.
MODEL:
  # Name of the top-level model to build; it must match a name registered by
  # project code (not shown in this file).
  META_ARCHITECTURE: "FCCLIPoint"
  SEM_SEG_HEAD:
    NAME: "FCCLIPHead"
    # NOTE(review): 19 presumably matches the Cityscapes evaluation label set
    # — confirm against the dataset registration.
    NUM_CLASSES: 19
  # Backbone selection; the concrete CLIP variant is chosen under FC_CLIP below.
  BACKBONE:
    NAME: "CLIP"
  # Checkpoint to load. The file name suggests FC-CLIP weights trained on COCO
  # panoptic — TODO confirm provenance.
  WEIGHTS: "./fcclip_cocopan.pth"
  # Per-channel normalization constants (three values each). They equal CLIP's
  # published mean/std scaled by 255; channel order is assumed RGB — confirm.
  PIXEL_MEAN: [122.7709383, 116.7460125, 104.09373615]
  PIXEL_STD: [68.5005327, 66.6321579, 70.32316305]
  FC_CLIP:
    # NOTE(review): model name and pretrained tag look like open_clip
    # identifiers; EMBED_DIM must agree with the chosen model — verify.
    CLIP_MODEL_NAME: "convnext_large_d_320"
    CLIP_PRETRAINED_WEIGHTS: "laion2b_s29b_b131k_ft_soup"
    EMBED_DIM: 768
    # Weights for the geometric ensemble of classifier scores.
    # NOTE(review): presumably ALPHA applies to classes seen in training and
    # BETA to unseen classes, as in the FC-CLIP paper — confirm in the model.
    GEOMETRIC_ENSEMBLE_ALPHA: 0.4
    GEOMETRIC_ENSEMBLE_BETA: 0.8
  MASK_FORMER:
    NUM_OBJECT_QUERIES: 250
    # Inference outputs: semantic, instance and panoptic are all enabled.
    TEST:
      SEMANTIC_ON: True
      INSTANCE_ON: True
      PANOPTIC_ON: True
      # NOTE(review): 0.0 presumably disables score-based filtering of
      # predicted masks — confirm against the inference code.
      OBJECT_MASK_THRESHOLD: 0.0


# Optimizer / schedule settings. NOTE(review): the output directory name marks
# this as an evaluation config, so these presumably only matter if the config
# is used for training — confirm.
SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.00001
  # Written as a Python-style tuple literal: YAML reads it as a plain string,
  # so the config loader is expected to evaluate it. Keep this exact spelling.
  # Both milestones fall before MAX_ITER.
  STEPS: (30000, 35000)
  MAX_ITER: 40000

# Dataset names; they must match names registered by project code (not shown).
# Values are Python-style tuple literals (plain strings to YAML) — keep the
# trailing comma so a single entry still evaluates to a tuple.
DATASETS:
  TRAIN: ("openvocab_cityscapes_fine_panoptic_train_with_point",)
  TEST: ("openvocab_cityscapes_fine_panoptic_val",)
  # Currently unused dataset name, presumably kept as an alternative TEST entry:
  # "openvocab_coco_2017_val_panoptic_with_sem_seg"
