# Model definition: a "GeneralizedRCNN" meta-architecture with ATTRIBUTE_ON
# enabled. The backbone output, the RPN input and the ROI-head input are all
# the single "res5" feature map, so these three lists must stay in sync.
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  ATTRIBUTE_ON: True
  RESNETS:
    OUT_FEATURES: ["res5"]
    RES5_DILATION: 2
    # NOTE(review): "ViT-B/32" does not look like a normalization-layer name.
    # Given the output directory name used by this file (vit_clip_outputs),
    # this key presumably carries the CLIP backbone variant in this project's
    # fork -- confirm against the code that reads RESNETS.NORM.
    NORM: "ViT-B/32"
  RPN:
    IN_FEATURES: ["res5"]
    # Test-time proposal budget: top-k of 6000 before NMS, 1000 after NMS.
    PRE_NMS_TOPK_TEST: 6000
    POST_NMS_TOPK_TEST: 1000
    # The RPN uses a different smooth-L1 beta than ROI_BOX_HEAD below.
    SMOOTH_L1_BETA: 0.1111
    BOUNDARY_THRESH: 0
  ROI_HEADS:
    # Project-specific ROI heads; not defined in this file.
    NAME: "AttributeStandardROIHeads"
    IN_FEATURES: ["res5"]
    NUM_CLASSES: 1600
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_FC: 2
    # NOTE(review): a pooler resolution of 1 is presumably intentional for
    # this backbone -- confirm it matches the feature map the head expects.
    POOLER_RESOLUTION: 1
    POOLER_SAMPLING_RATIO: 2
    # Written as "1.0" (not "1.") so every YAML parser resolves it to a float.
    SMOOTH_L1_BETA: 1.0
# Dataset splits: training draws from both "visual_genome_train" and
# "visual_genome_val"; evaluation uses "visual_genome_test" only.
# The parenthesised values are plain YAML scalars, presumably evaluated as
# Python tuple literals by the config loader -- keep the trailing comma in
# TEST so it remains a one-element tuple rather than a bare string.
DATASETS:
  TRAIN: ("visual_genome_train", "visual_genome_val")
  TEST: ("visual_genome_test",)
# Optimisation schedule: 16 images per batch at a base learning rate of 0.02,
# running for 90000 iterations with schedule milestones at 60000 and 80000.
SOLVER:
  IMS_PER_BATCH: 16
  # NOTE(review): BASE_LR and IMS_PER_BATCH are usually tuned together --
  # revisit both if the batch size changes.
  BASE_LR: 0.02
  # Both milestones fall inside MAX_ITER; keep them below it if it is lowered.
  STEPS: (60000, 80000)
  MAX_ITER: 90000
# Image sizing: training and testing use the same limits (min size 600,
# max size 1000).
INPUT:
  # Written as a one-element tuple literal, unlike the plain integer used for
  # MIN_SIZE_TEST below; presumably the training key accepts several
  # candidate sizes -- confirm before changing its form.
  MIN_SIZE_TRAIN: (600,)
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 600
  MAX_SIZE_TEST: 1000
# Config schema version declared by this file.
VERSION: 2
# Relative output path; presumably resolved against the working directory of
# the run -- confirm where artifacts actually land.
OUTPUT_DIR: vit_clip_outputs