# CLIPFastRCNN single-class ("car") detection config: trains on
# kitti_voc_car_train / cityscapes_voc_car_trainval and evaluates on
# cityscapes_voc_car_test (see DATASETS below).
# Settings not listed in this file are inherited from the base config named by
# _BASE_ (path is relative to this file, per the detectron2 config convention).
_BASE_: "../Base-RCNN-C4.yaml"
# Model definition: CLIPFastRCNN meta-architecture in a C4-style layout, where
# the RPN and the ROI heads both read the single "res4" feature map.
MODEL:
  META_ARCHITECTURE: "CLIPFastRCNN"
  BACKBONE:
    NAME: "build_clip_resnet_backbone"
    # NOTE(review): in stock detectron2 ResNets a value of 2 freezes the stem
    # and res2; confirm the CLIP backbone builder treats it the same way.
    FREEZE_AT: 2
  WEIGHTS: ""  # empty here; presumably supplied at launch time -- TODO confirm
  MASK_ON: False
  RESNETS:
    DEPTH: 50
    OUT_FEATURES: ["res4"]
    NORM: FrozenBN
    STEM_OUT_CHANNELS: 64
    RES2_OUT_CHANNELS: 256
  RPN:
    HEAD_NAME: StandardRPNHead
    IN_FEATURES: ["res4"]
  ROI_HEADS:
    NAME: "CLIPRes5ROIHeads"
    IN_FEATURES: ["res4"]
    # Single foreground class, matching LEARNABLE_PROMPT.CLASS ('car',).
    # The earlier value 48 was annotated as "base categories".
    NUM_CLASSES: 1
    SCORE_THRESH_TEST: 0.001
  ROI_BOX_HEAD:
    # Empty NAME with NUM_FC 0: presumably no separate box-head module is
    # built and the ROI heads class handles box features -- TODO confirm.
    NAME: ""
    NUM_FC: 0
    POOLER_RESOLUTION: 14
    CLS_AGNOSTIC_BBOX_REG: True
  # NOTE(review): MASK_ON is False above, so these mask-head settings are
  # presumably unused in this experiment -- confirm before relying on them.
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 0
    POOLER_RESOLUTION: 14
  # Per-channel statistics on a 0-1 scale (not 0-255). They match the
  # normalization constants published with OpenAI CLIP; the consuming code
  # presumably scales pixels to [0, 1] before normalizing -- TODO confirm.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
  # Project-specific CLIP options; the semantics are defined by the
  # CLIPFastRCNN / CLIPRes5ROIHeads code, not visible in this file.
  CLIP:
    CROP_REGION_TYPE: "RPN"
    USE_TEXT_EMB_CLASSIFIER: True
    # Presumably the softmax temperature applied to classification logits.
    CLSS_TEMP: 0.01
    NO_BOX_DELTA: False
    # Presumably down-weights the background class in the classification loss.
    BG_CLS_LOSS_WEIGHT: 0.2
    FOCAL_SCALED_LOSS: 0.5
# Dataset names must be registered with the training code's dataset catalog.
# NOTE(review): TRAIN_S / TRAIN_T presumably denote the source and target
# training sets of a domain-adaptation setup -- confirm against the trainer.
DATASETS:
  TRAIN_S: ("kitti_voc_car_train",)
  TRAIN_T: ("cityscapes_voc_car_trainval",)
  TEST: ("cityscapes_voc_car_test",)
  # Disabled alternative evaluation set:
  # TEST: ("foggy_cityscapes_voc_test",)
# Evaluation schedule.
TEST:
  EVAL_PERIOD: 1000  # in iterations; alternative value: 5000
# Optimization schedule.
SOLVER:
  IMS_PER_BATCH: 2  # alternative value: 16
  BASE_LR: 0.002
  # NOTE(review): both milestones (60000, 80000) lie beyond MAX_ITER (25000).
  # With a step-based LR schedule they would never be reached, so the learning
  # rate would stay at BASE_LR after warmup -- confirm this is intended.
  STEPS: (60000, 80000)
  MAX_ITER: 25000
  WARMUP_ITERS: 5000  # 20% of MAX_ITER
  CHECKPOINT_PERIOD: 2000
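# Disabled alternative INPUT settings (smaller training scales, 1333 max size),
# kept for reference; the active INPUT section follows.
# INPUT:
#   MIN_SIZE_TRAIN_SAMPLING: choice
#   MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
#   MAX_SIZE_TRAIN: 1333
#   MIN_SIZE_TEST: 800
#   MAX_SIZE_TEST: 1333
#   FORMAT: "RGB"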
# Input resizing: multi-scale training, fixed-scale testing.
INPUT:
  # One of these sizes is picked per training image ("choice" sampling below);
  # in detectron2 the value is the target length of the shorter image side.
  MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024)
  MIN_SIZE_TRAIN_SAMPLING: "choice"
  MIN_SIZE_TEST: 800
  # Upper bound on the resized image size, the same for training and testing.
  MAX_SIZE_TRAIN: 2048
  MAX_SIZE_TEST: 2048
  FORMAT: "RGB"
# Project-specific prompt settings; the semantics are defined by the consuming
# code, which is not visible in this file.
LEARNABLE_PROMPT:
  # Presumably the number of learnable context tokens -- TODO confirm.
  CTX_SIZE: 8
  # Single class name; consistent with MODEL.ROI_HEADS.NUM_CLASSES: 1.
  CLASS: ('car',)
  # Presumably enables training (tuning) of the prompt -- TODO confirm.
  TUNING: True
