# Experiment config: "ProposalClipClassifier" with a "pomp" prompt learner.
# Keys set in this file are layered over the file named by _BASE_
# (assumes detectron2/yacs-style config inheritance — TODO confirm with the loader).
_BASE_: ../voc-11k-20/maskformer_R50_bs16_20k.yaml
# NOTE(review): ORACLE is not defined anywhere in this file; presumably a
# project-specific switch — confirm its meaning against the code that reads it.
ORACLE: True
MODEL:
  # Empty string: no model weights file is named by this config.
  WEIGHTS: ""
  META_ARCHITECTURE: "ProposalClipClassifier"
  MASK_ON: True
  CLIP_ADAPTER:
    PROMPT_LEARNER: "pomp"
    # Settings for the learnable prompt.
    PROMPT_DIM: 512
    # Plain YAML scalar (a string at the YAML level); presumably converted to a
    # tuple by the config loader — TODO confirm.
    PROMPT_SHAPE: (16, 0)
    CLIP_MODEL_NAME: "ViT-B/16"
    # NOTE(review): absolute, machine-specific path under /home/ubuntu/efs; this
    # config is only usable as-is on a host where that file exists.
    PROMPT_CHECKPOINT: "/home/ubuntu/efs/multimodal-prompt-learning/output/imagenet_21k_o/GPT_tutorial_sampled_softmax/vit_b16_ep20_randaug2_unc1000_16shots/nctx16_cscFalse_ctpend/seed42/prompt_learner/model-best.pth.tar"
# Dataset selection: one training split and one test split, each written as a
# one-element tuple literal (presumably parsed by the config loader — TODO confirm).
DATASETS:
  TRAIN: ("voc_base_sem_seg_train_classification",)
  TEST: ("voc_sem_seg_test_classification",)
  # NOTE(review): presumably a cap on training samples drawn per class; confirm
  # against the dataset-building code.
  SAMPLE_PER_CLASS: 128
# Input sizing, channel order, and dataset-mapper selection.
INPUT:
  # NOTE(review): (224,244) looks like a possible typo for (224,224) given
  # MIN_SIZE_TEST below is 224 — confirm before changing, since it affects training.
  MIN_SIZE_TRAIN: (224,244)
  MIN_SIZE_TEST: 224
  MAX_SIZE_TEST: 2560
  # NOTE(review): -1 presumably disables padding to a size multiple — TODO confirm.
  SIZE_DIVISIBILITY: -1
  FORMAT: "RGB"
  DATASET_MAPPER_NAME: "mask_former_binary_semantic"
# Optimizer, learning-rate schedule, batch sizes, and run length.
SOLVER:
  OPTIMIZER: "SGD"
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0005
  LR_SCHEDULER_NAME: "WarmupCosineLR"
  # Warmup settings: method, factor, and number of iterations.
  WARMUP_METHOD: "constant"
  WARMUP_FACTOR: 0.005
  WARMUP_ITERS: 100
  IMS_PER_BATCH: 32
  TEST_IMS_PER_BATCH: 4
  # MAX_ITER is not a multiple of CHECKPOINT_PERIOD (3200 vs 1000).
  # NOTE(review): whether a final checkpoint is written at 3200 depends on the
  # trainer — confirm.
  MAX_ITER: 3200
  CHECKPOINT_PERIOD: 1000
# Evaluation cadence and output location.
TEST:
  EVAL_PERIOD: 1000
# NOTE(review): the directory name mentions "voc-11k-15", "bs16" and "10k", while
# this file sets IMS_PER_BATCH: 32 and MAX_ITER: 3200 and inherits from a
# "voc-11k-20" base — confirm the name is intentional before relying on it.
OUTPUT_DIR: output/voc-11k-15/zero_shot_proposal_classification_learn_prompt_pomp_bs16_10k/