# Parent config path (relative to this file). Presumably consumed by an
# mmcv/mmengine-style config loader, which would merge the settings below
# over the ones defined there -- confirm against the project's loader.
_base_ = './fvit_vitb16_ovlvis.py'

# Model settings declared by this file. Only the keys listed here are set;
# anything else presumably comes from the file named in `_base_`.
model = {
    'backbone': {
        # Checkpoint path supplied as the backbone's `pretrained` option.
        'pretrained': 'checkpoints/eva_vitb16_lvis_clipself_patches.pt',
    },
    'roi_head': {
        'type': 'MCTRoIHead',
        'bbox_head': {
            'type': 'MCTBBoxHead',
            'alpha': 0.0,
            'beta': 0.5,
            'embed_level': True,
            'unified_heads': True,
        },
        'vlm_roi_extractor': {
            'roi_layer': {'output_size': 7},
        },
        'feature_level': 'sa_ca_ffn',
        'num_heads': 4,
    },
}

# Logging settings: `interval` is set to 200 (units are defined by whatever
# consumes this config -- presumably training iterations; confirm).
log_config = {'interval': 200}

# Learning-rate schedule settings: step milestones at 32 and 40 (presumably
# epoch indices; the rest of the schedule is not defined in this file).
lr_config = {'step': [32, 40]}
