# Relative path to the parent config file.
# NOTE(review): presumably an MMCV/MMEngine-style config loader merges the
# dicts defined in this file over the settings loaded from that path — confirm.
_base_ = './fvit_vitb16_ovcoco.py'

# Model settings defined by this file. Only the keys listed here are set;
# NOTE(review): anything else is presumably inherited from the parent config
# when the loader merges the two — confirm against the base file.
model = {
    'backbone': {
        # Relative checkpoint path assigned to the backbone's `pretrained` key.
        'pretrained': 'checkpoints/eva_vitb16_coco_clipself_patches.pt',
    },
    'roi_head': {
        'type': 'MCTRoIHead',
        'bbox_head': {
            'type': 'MCTBBoxHead',
            # NOTE(review): alpha/beta look like weighting coefficients used
            # by the bbox head — their exact meaning is not visible here.
            'alpha': 0.0,
            'beta': 0.5,
            'embed_level': True,
            'unified_heads': True,
        },
        'vlm_roi_extractor': {
            'roi_layer': {'output_size': 7},
        },
        'feature_level': 'sa_ca_ffn',
        'num_heads': 4,
    },
}

# Learning-rate schedule setting: `step` is the single-element list [2].
# NOTE(review): presumably the epoch index at which a step-policy scheduler
# from the parent config decays the learning rate — confirm.
lr_config = {'step': [2]}

# Data-loading settings: both the samples-per-GPU and workers-per-GPU
# entries are set to 16 in this file.
data = {
    'samples_per_gpu': 16,
    'workers_per_gpu': 16,
}
