_BASE_: "Base_OVCOCO_C4_1x.yaml"
MODEL:
  WEIGHTS: "models/BoxSup_OVCOCO_CLIP_R50_1x.pth"
  WITH_CAPTION: True
  SYNC_CAPTION_BATCH: True
  ROI_BOX_HEAD:
    WS_NUM_PROPS: 32
    ADD_IMAGE_BOX: True # caption loss is added to the image-box
    IMAGE_LABEL_LOSS: 'max_size'

    NEG_CAP_WEIGHT: 1.0
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  STEPS: (60000, 80000)
  MAX_ITER: 90000
DATASETS:
  TRAIN: ("coco_zeroshot_train_oriorder", "coco_caption_train_tags")
INPUT:
  CUSTOM_AUG: ResizeShortestEdge
  MIN_SIZE_TRAIN_SAMPLING: range
  MIN_SIZE_TRAIN: (800, 800)
DATALOADER:
  SAMPLER_TRAIN: "MultiDatasetSampler"
  DATASET_RATIO: [1, 4]
  USE_DIFF_BS_SIZE: True
  DATASET_BS: [2, 8]
  USE_RFS: [False, False]
  DATASET_MIN_SIZES: [[800, 800], [400, 400]]
  DATASET_MAX_SIZES: [1333, 667]
  FILTER_EMPTY_ANNOTATIONS: False
  MULTI_DATASET_GROUPING: True
  DATASET_ANN: ['box', 'captiontag']
  NUM_WORKERS: 8
WITH_IMAGE_LABELS: True