# Base config this file builds on. The assignments below only list the keys
# that differ from it (assumes the config loader's usual `_base_` inheritance,
# where same-named dicts are merged key by key -- confirm against the loader).
_base_ = './fvit_vitl14_ovcoco.py'

# Model overrides: point the backbone at a different pretrained checkpoint
# and configure the RoI head / bbox head as the MCT variants.
# NOTE(review): the meaning of alpha/beta, embed_level, unified_heads,
# feature_level and num_heads is defined by MCTRoIHead / MCTBBoxHead, which
# are not visible here -- check those classes before tuning these values.
model = {
    'backbone': {
        # Path is relative; presumably resolved from the working directory
        # the training script is launched in -- confirm.
        'pretrained': 'checkpoints/eva_vitl14_coco_clipself_proposals.pt',
    },
    'roi_head': {
        'type': 'MCTRoIHead',
        'bbox_head': {
            'type': 'MCTBBoxHead',
            'alpha': 0.0,
            'beta': 0.5,
            'embed_level': True,
            'unified_heads': True,
        },
        'vlm_roi_extractor': {
            'roi_layer': {'output_size': 7},
        },
        'feature_level': 'sa_ca_ffn',
        'num_heads': 4,
    },
}

# Learning-rate schedule override: only the `step` milestones are set here.
# NOTE(review): presumably the epoch index at which a step-policy scheduler
# decays the LR; the policy itself is not set in this file -- confirm in the
# base config.
lr_config = {'step': [2]}