# Parent config, referenced by a relative path.
# NOTE(review): presumably resolved relative to this file, with the keys
# below overriding the parent's values — confirm against the config loader.
_BASE_: '../Base-RCNN-C4.yaml'
MODEL:
  # Top-level model switches.
  META_ARCHITECTURE: 'PretrainFastRCNN'
  WEIGHTS: ''  # empty string: no checkpoint path is set in this file
  MASK_ON: False
  # Three-element normalization constants, all within the 0-1 range.
  # NOTE(review): presumably one value per image channel, matching
  # INPUT.FORMAT — confirm against the preprocessing code.
  PIXEL_MEAN:
    - 0.48145466
    - 0.4578275
    - 0.40821073
  PIXEL_STD:
    - 0.26862954
    - 0.26130258
    - 0.27577711
  BACKBONE:
    NAME: 'build_clip_resnet_backbone'
    FREEZE_AT: 2
  RESNETS:
    DEPTH: 50
    NORM: 'FrozenBN'
    STEM_OUT_CHANNELS: 64
    RES2_OUT_CHANNELS: 256
    # Only "res4" is exposed; ROI_HEADS.IN_FEATURES below uses the same name.
    OUT_FEATURES:
      - 'res4'
  ROI_HEADS:
    NAME: 'PretrainRes5ROIHeads'
    IN_FEATURES:
      - 'res4'
  # Project-specific options; exact semantics are defined by the consuming
  # code, not by this file.
  CLIP:
    CLSS_TEMP: 0.01
    CONCEPT_THRES: 0.1
    CROP_REGION_TYPE: 'RPN'
    OFFLINE_RPN_NMS_THRESH: 0.5
    GATHER_GPUS: True
    PRETRAIN_RPN_REGIONS: 300
    PRETRAIN_SAMPLE_REGIONS: 100
    PRETRAIN_IMG_TXT_LEVEL: True
    PRETRAIN_ONLY_EOT: True
    # Same depth value as RESNETS.DEPTH above.
    TEACHER_RESNETS_DEPTH: 50
    TEACHER_POOLER_RESOLUTION: 14
# INPUT is defined once, in the INPUT section further down in this file
# (it already sets the same MIN_SIZE_TRAIN value). A second top-level INPUT
# key used to sit here; duplicate mapping keys are invalid YAML and lenient
# loaders silently keep only the last one, so the shadowed copy was removed.
# Dataset selection. The parenthesised values are YAML strings, quoted here
# to make that explicit.
# NOTE(review): presumably the config loader evaluates them into Python
# tuples — confirm.
DATASETS:
  TRAIN: '("imgtxtpairs",)'
  FACTORY_TRAIN: '("CLIPImgTxtPairTSVDataset",)'
  # NOTE(review): machine-specific absolute path; override for your setup.
  # Alternative left by the original author: ("/tmp/datasets/CC3M",)
  PATH_TRAIN: '("/home/v-yiwuzhong/projects/azureblobs/vlpdatasets/coco-caption/val2017",)'
  # Empty tuple string: no test dataset is listed.
  TEST: '()'
# Data loading: 4 workers, aspect-ratio grouping switched off.
DATALOADER:
  NUM_WORKERS: 4
  ASPECT_RATIO_GROUPING: False
# Evaluation settings.
TEST:
  # This value (2500000) is larger than SOLVER.MAX_ITER (600000) in this file.
  # NOTE(review): if the period is counted in iterations, periodic evaluation
  # never fires during training — confirm that is intended.
  EVAL_PERIOD: 2500000
  DETECTIONS_PER_IMAGE: 300  # LVIS allows up to 300
# Optimisation schedule.
SOLVER:
  # Author's note: 32 GPUs (96 / 32 = 3 images per GPU).
  IMS_PER_BATCH: 96
  BASE_LR: 0.002
  WEIGHT_DECAY: 0.0001
  MAX_ITER: 600000
  # Step points sit at 50% and 87.5% of MAX_ITER. The value is a YAML string.
  # NOTE(review): presumably evaluated into a tuple by the loader — confirm.
  STEPS: '(300000, 525000)'
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: 'norm'
    CLIP_VALUE: 5.0
# Input image sizing and format.
INPUT:
  FORMAT: 'RGB'
  # Training sizes. MIN_SIZE_TRAIN is a YAML string listing six sizes.
  # NOTE(review): presumably evaluated into a tuple, with "choice" meaning
  # one of the listed sizes is picked per sample — confirm.
  MIN_SIZE_TRAIN: '(640, 672, 704, 736, 768, 800)'
  MIN_SIZE_TRAIN_SAMPLING: 'choice'
  MAX_SIZE_TRAIN: 1333
  # Test-time sizes.
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
# Data augmentation from MSR-CLIP.
AUG:
  # Train and test use the same single image size (800) and max size (1333).
  TRAIN:
    IMAGE_SIZE:
      - 800
    MAX_SIZE: 1333
  TEST:
    IMAGE_SIZE:
      - 800
    MAX_SIZE: 1333
  # NOTE(review): numeric interpolation code; presumably an image-library
  # resampling constant — confirm which mode 3 selects.
  INTERPOLATION: 3