# Parent config, given as a relative path. Presumably the config loader reads
# it first and applies the keys in this file on top of it — confirm.
_BASE_: '../../Base-RCNN-C4.yaml'
# Project-specific options. The exact meaning of each key is defined by the
# consuming code, which is not visible in this file; notes below are hedged.
DE:
  # Prototype files. CLASS_PROTOTYPES is a single comma-separated string naming
  # two .pkl files (the "known" and "novel" LVIS v1 splits, per the file names).
  CLASS_PROTOTYPES: 'weights/initial/open-vocabulary/prototypes/lvis/lvis_v1_known_train.vitl14.pkl,weights/initial/open-vocabulary/prototypes/lvis/lvis_v1_novel_train.vitl14.pkl'
  BG_PROTOTYPES: 'weights/initial/background/background_prototypes.vitl14.pth'

  # Classification settings.
  NUM_CLS_LAYERS: 3
  TOPK: 10
  TEMP: 0.05
  BG_CLS_LOSS_WEIGHT: 1.0

  # Presumably the RoI sampling batch size and positive fraction — confirm.
  RCNN_BATCH_SIZE: 256
  POS_RATIO: 0.5

  # RPN-related switches.
  MULTIPLY_RPN_SCORE: True
  OFFLINE_RPN_LSJ_PRETRAINED: True

  # Indices used with the large ViT; use [2, 6, 11] for small/base ViT.
  OUT_INDICES: [2, 12, 23]


# Model definition: architecture name, backbone, proposal and RoI heads, and
# input normalization constants.
MODEL:
  META_ARCHITECTURE: 'OpenSetDetectorWithExamples_refactored2'
  # Empty string: no checkpoint path is set at this level.
  # NOTE(review): presumably weights are supplied elsewhere — confirm.
  WEIGHTS: ''
  MASK_ON: False  # alternative value noted by the original author: True

  # Per-channel normalization constants (three values each, on a 0-1 scale).
  # NOTE(review): confirm the channel order agrees with INPUT.FORMAT.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]

  BACKBONE:
    NAME: 'build_dino_v2_vit'
    TYPE: 'large'

  RPN:
    HEAD_NAME: 'StandardRPNHead'
    IN_FEATURES: ['res4']

  ROI_HEADS:
    SCORE_THRESH_TEST: 0.02

  # Box head: empty NAME and zero FC layers.
  ROI_BOX_HEAD:
    NAME: ''
    NUM_FC: 0
    POOLER_RESOLUTION: 10
    CLS_AGNOSTIC_BBOX_REG: True

  # Mask head settings; MASK_ON above is False.
  # NOTE(review): presumably these only take effect when MASK_ON is True.
  ROI_MASK_HEAD:
    NAME: 'MaskRCNNConvUpsampleHead'
    NUM_CONV: 0
    POOLER_RESOLUTION: 14
    CLS_AGNOSTIC_MASK: True

# INPUT options (including MIN_SIZE_TRAIN) are defined once, in the INPUT
# section that follows DATALOADER. A second top-level INPUT mapping here would
# be a duplicate key: invalid YAML, and silently discarded by last-wins parsers.
# Dataset names for training and evaluation. The values are written as Python
# tuple literals; YAML itself reads them as plain strings.
# NOTE(review): presumably the config loader evaluates them into tuples and the
# names refer to registered datasets — confirm against the loader.
DATASETS:
  TRAIN: ("lvis_v1_train",)
  TEST: ("lvis_v1_val",)
# Evaluation settings.
TEST:
  # NOTE(review): presumably measured in training iterations — confirm.
  EVAL_PERIOD: 10000
  # LVIS allows up to 300 detections per image.
  DETECTIONS_PER_IMAGE: 300
# Optimizer and schedule settings.
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.002

  # Schedule. STEPS is written as a Python tuple literal and kept verbatim.
  WARMUP_ITERS: 5000
  STEPS: (60000, 80000)
  # 90000 * 16 / 100000 ~ 14.4 epochs (100000 being the approximate number of
  # training images assumed by the original note).
  MAX_ITER: 90000
# Training data sampling: sampler name and its repeat threshold.
DATALOADER:
  REPEAT_THRESHOLD: 0.001
  SAMPLER_TRAIN: RepeatFactorTrainingSampler
# Input image settings: channel format plus train-time and test-time sizes.
INPUT:
  FORMAT: RGB

  # Training sizes. MIN_SIZE_TRAIN is written as a Python tuple literal and is
  # kept verbatim; MIN_SIZE_TRAIN_SAMPLING is the string "choice".
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
  MIN_SIZE_TRAIN_SAMPLING: 'choice'
  MAX_SIZE_TRAIN: 1333

  # Test sizes.
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333